pretrain
Browse files
scripts/base_datasets.py
CHANGED
|
@@ -147,15 +147,3 @@ base_datasets = [
|
|
| 147 |
# 15.6 MB, 24,926
|
| 148 |
{'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
|
| 149 |
]
|
| 150 |
-
|
| 151 |
-
base_datasets = [
|
| 152 |
-
#
|
| 153 |
-
# light instructions
|
| 154 |
-
#
|
| 155 |
-
# 44.3 MB, 51,760
|
| 156 |
-
{'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
|
| 157 |
-
# 11 MB, 12,564
|
| 158 |
-
{'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
|
| 159 |
-
# 15.6 MB, 24,926
|
| 160 |
-
{'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
|
| 161 |
-
]
|
|
|
|
| 147 |
# 15.6 MB, 24,926
|
| 148 |
{'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
|
| 149 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/base_instruct_datasets.py
CHANGED
|
@@ -13,7 +13,7 @@ You are an AI assistant.
|
|
| 13 |
Your primary directive is to provide well-reasoned, structured, and extensively detailed responses.
|
| 14 |
|
| 15 |
Formatting Requirements:
|
| 16 |
-
-
|
| 17 |
- The <think></think> block should contain at least six reasoning steps when applicable.
|
| 18 |
- If the answer requires minimal thought, the <think></think> block may be left empty.
|
| 19 |
- The user does not see the <think></think> section. Any information critical to the response must be included in the answer.
|
|
@@ -24,7 +24,10 @@ Response Guidelines:
|
|
| 24 |
- Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
|
| 25 |
- Prioritize Reasoning: Always reason through the problem first, unless the answer is trivial.
|
| 26 |
- Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
|
| 27 |
-
- Maintain a professional, intelligent, and analytical tone in all interactions.
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
base_instruct_datasets = [
|
| 30 |
# 65.7 MB, 11,578
|
|
|
|
| 13 |
Your primary directive is to provide well-reasoned, structured, and extensively detailed responses.
|
| 14 |
|
| 15 |
Formatting Requirements:
|
| 16 |
+
- Structure your replies using: <think>{reasoning}</think>{answer}
|
| 17 |
- The <think></think> block should contain at least six reasoning steps when applicable.
|
| 18 |
- If the answer requires minimal thought, the <think></think> block may be left empty.
|
| 19 |
- The user does not see the <think></think> section. Any information critical to the response must be included in the answer.
|
|
|
|
| 24 |
- Scientific and Logical Approach: Your explanations should reflect the depth and precision of the greatest scientific minds.
|
| 25 |
- Prioritize Reasoning: Always reason through the problem first, unless the answer is trivial.
|
| 26 |
- Concise yet Complete: Ensure responses are informative, yet to the point without unnecessary elaboration.
|
| 27 |
+
- Maintain a professional, intelligent, and analytical tone in all interactions.
|
| 28 |
+
|
| 29 |
+
If user provides <question>...</question> and expects <answer>...</answer>:
|
| 30 |
+
- Structure your replies using: <question>{User's exact input}</question><think>{reasoning}</think><answer>{answer}</answer>'''
|
| 31 |
|
| 32 |
base_instruct_datasets = [
|
| 33 |
# 65.7 MB, 11,578
|
scripts/prepare_base_datasets.py
CHANGED
|
@@ -20,7 +20,12 @@ seqs = [
|
|
| 20 |
# (16385, 32769, 32769, 500),
|
| 21 |
# (32769, 65537, 65537, 250),
|
| 22 |
# (65537, 131073, 131073, 125),
|
|
|
|
| 23 |
(0, 1073741824, 8193, 2000),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
]
|
| 25 |
|
| 26 |
#
|
|
|
|
| 20 |
# (16385, 32769, 32769, 500),
|
| 21 |
# (32769, 65537, 65537, 250),
|
| 22 |
# (65537, 131073, 131073, 125),
|
| 23 |
+
|
| 24 |
(0, 1073741824, 8193, 2000),
|
| 25 |
+
(8193, 16385, 16385, 1000),
|
| 26 |
+
(16385, 32769, 32769, 500),
|
| 27 |
+
(32769, 65537, 65537, 250),
|
| 28 |
+
(65537, 131073, 131073, 125),
|
| 29 |
]
|
| 30 |
|
| 31 |
#
|
scripts/pretrain_base_model_0.yaml
CHANGED
|
@@ -61,7 +61,7 @@ train:
|
|
| 61 |
global_batch_size: 512
|
| 62 |
|
| 63 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
| 64 |
-
micro_batch_size:
|
| 65 |
|
| 66 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
| 67 |
lr_warmup_steps: 2000
|
|
|
|
| 61 |
global_batch_size: 512
|
| 62 |
|
| 63 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
| 64 |
+
micro_batch_size: 1
|
| 65 |
|
| 66 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
| 67 |
lr_warmup_steps: 2000
|