Spaces:
Runtime error
Runtime error
Update train.py
Browse files
train.py
CHANGED
@@ -10,7 +10,7 @@ from tokenizers import ByteLevelBPETokenizer
|
|
10 |
MAX_SEQ_LENGTH = 128
|
11 |
BATCH_SIZE = 16
|
12 |
EPOCHS = 2
|
13 |
-
LEARNING_RATE = 2e-
|
14 |
FACTOR = 1024
|
15 |
VOCAB_SIZE = 32000
|
16 |
INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
|
@@ -46,8 +46,12 @@ def create_tokenizer(training_corpus):
|
|
46 |
return fast_tokenizer
|
47 |
|
48 |
def get_training_corpus(dataset):
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
51 |
|
52 |
def format_prompts(examples, tokenizer, isinst):
|
53 |
texts = []
|
@@ -153,7 +157,7 @@ def main(push_to_hub=True):
|
|
153 |
dataset = load_data()
|
154 |
pretrain = dataset['pretrain']
|
155 |
instruct = dataset['instruct']
|
156 |
-
training_corpus = get_training_corpus(
|
157 |
tokenizer = create_tokenizer(training_corpus)
|
158 |
configure_tokenizer(tokenizer)
|
159 |
model = create_model(tokenizer)
|
|
|
10 |
MAX_SEQ_LENGTH = 128
|
11 |
BATCH_SIZE = 16
|
12 |
EPOCHS = 2
|
13 |
+
LEARNING_RATE = 2e-5
|
14 |
FACTOR = 1024
|
15 |
VOCAB_SIZE = 32000
|
16 |
INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
|
|
|
46 |
return fast_tokenizer
|
47 |
|
48 |
def get_training_corpus(dataset, batch_size=1000):
    """Yield the combined 'pretrain' and 'instruct' texts in batches.

    The 'text' column of ``dataset['pretrain']`` is followed by the 'text'
    column of ``dataset['instruct']``; the concatenated sequence is then cut
    into consecutive batches, so a batch may span both splits.

    Args:
        dataset: Mapping with 'pretrain' and 'instruct' entries, each
            indexable by 'text' to obtain an iterable of strings.
        batch_size: Maximum number of texts per yielded batch.

    Yields:
        Lists of at most ``batch_size`` texts. Only the final batch may be
        shorter; nothing is yielded when both columns are empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Local import keeps this block self-contained with respect to the
    # file's top-level imports.
    from itertools import chain, islice

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Resolve both columns up front so a missing split or column fails on the
    # first iteration instead of midway through the corpus.
    columns = [dataset[field]['text'] for field in ('pretrain', 'instruct')]

    # Chain the columns lazily rather than copying them into one big list.
    stream = chain.from_iterable(columns)
    while True:
        batch = list(islice(stream, batch_size))
        if not batch:
            return
        yield batch
|
55 |
|
56 |
def format_prompts(examples, tokenizer, isinst):
|
57 |
texts = []
|
|
|
157 |
dataset = load_data()
|
158 |
pretrain = dataset['pretrain']
|
159 |
instruct = dataset['instruct']
|
160 |
+
training_corpus = get_training_corpus(dataset)
|
161 |
tokenizer = create_tokenizer(training_corpus)
|
162 |
configure_tokenizer(tokenizer)
|
163 |
model = create_model(tokenizer)
|