pretrain
Browse files
scripts/base_datasets.py
CHANGED
|
@@ -147,15 +147,3 @@ base_datasets = [
|
|
| 147 |
# 15.6 MB, 24,926
|
| 148 |
{'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
|
| 149 |
]
|
| 150 |
-
|
| 151 |
-
base_datasets = [
|
| 152 |
-
#
|
| 153 |
-
# light instructions
|
| 154 |
-
#
|
| 155 |
-
# 44.3 MB, 51,760
|
| 156 |
-
{'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
|
| 157 |
-
# 11 MB, 12,564
|
| 158 |
-
{'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
|
| 159 |
-
# 15.6 MB, 24,926
|
| 160 |
-
{'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
|
| 161 |
-
]
|
|
|
|
| 147 |
# 15.6 MB, 24,926
|
| 148 |
{'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
|
| 149 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/base_instruct_datasets.py
CHANGED
|
@@ -13,7 +13,7 @@ You are an AI assistant.
|
|
| 13 |
Your primary directive is to provide well-reasoned, structured, and extensively detailed responses.
|
| 14 |
|
| 15 |
Formatting Requirements:
|
| 16 |
-
-
|
| 17 |
- The <think></think> block should contain at least six reasoning steps when applicable.
|
| 18 |
- If the answer requires minimal thought, the <think></think> block may be left empty.
|
| 19 |
- The user does not see the <think></think> section. Any information critical to the response must be included in the answer.
|
|
@@ -24,7 +24,10 @@ Response Guidelines:
|
|
| 24 |
- Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
|
| 25 |
- Prioritize Reasoning: Always reason through the problem first, unless the answer is trivial.
|
| 26 |
- Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
|
| 27 |
-
- Maintain a professional, intelligent, and analytical tone in all interactions.
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
base_instruct_datasets = [
|
| 30 |
# 65.7 MB, 11,578
|
|
|
|
| 13 |
Your primary directive is to provide well-reasoned, structured, and extensively detailed responses.
|
| 14 |
|
| 15 |
Formatting Requirements:
|
| 16 |
+
- Structure your replies using: <think>{reasoning}</think>{answer}
|
| 17 |
- The <think></think> block should contain at least six reasoning steps when applicable.
|
| 18 |
- If the answer requires minimal thought, the <think></think> block may be left empty.
|
| 19 |
- The user does not see the <think></think> section. Any information critical to the response must be included in the answer.
|
|
|
|
| 24 |
- Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
|
| 25 |
- Prioritize Reasoning: Always reason through the problem first, unless the answer is trivial.
|
| 26 |
- Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
|
| 27 |
+
- Maintain a professional, intelligent, and analytical tone in all interactions.
|
| 28 |
+
|
| 29 |
+
If user provides <question>...</question> and expects <answer>...</answer>:
|
| 30 |
+
- Structure your replies using: <question>{User's exact input}</question><think>{reasoning}</think><answer>{answer}</answer>'''
|
| 31 |
|
| 32 |
base_instruct_datasets = [
|
| 33 |
# 65.7 MB, 11,578
|
scripts/prepare_base_datasets.py
CHANGED
|
@@ -20,7 +20,12 @@ seqs = [
|
|
| 20 |
# (16385, 32769, 32769, 500),
|
| 21 |
# (32769, 65537, 65537, 250),
|
| 22 |
# (65537, 131073, 131073, 125),
|
|
|
|
| 23 |
(0, 1073741824, 8193, 2000),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
]
|
| 25 |
|
| 26 |
#
|
|
|
|
| 20 |
# (16385, 32769, 32769, 500),
|
| 21 |
# (32769, 65537, 65537, 250),
|
| 22 |
# (65537, 131073, 131073, 125),
|
| 23 |
+
|
| 24 |
(0, 1073741824, 8193, 2000),
|
| 25 |
+
(8193, 16385, 16385, 1000),
|
| 26 |
+
(16385, 32769, 32769, 500),
|
| 27 |
+
(32769, 65537, 65537, 250),
|
| 28 |
+
(65537, 131073, 131073, 125),
|
| 29 |
]
|
| 30 |
|
| 31 |
#
|
scripts/pretrain_base_model_0.yaml
CHANGED
|
@@ -61,7 +61,7 @@ train:
|
|
| 61 |
global_batch_size: 512
|
| 62 |
|
| 63 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
| 64 |
-
micro_batch_size:
|
| 65 |
|
| 66 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
| 67 |
lr_warmup_steps: 2000
|
|
|
|
| 61 |
global_batch_size: 512
|
| 62 |
|
| 63 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
| 64 |
+
micro_batch_size: 1
|
| 65 |
|
| 66 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
| 67 |
lr_warmup_steps: 2000
|