nroggendorff committed
Commit b8f77cf · verified · 1 Parent(s): a1a8a8c

Update train.py

Files changed (1)
  1. train.py +8 -4
train.py CHANGED
@@ -10,7 +10,7 @@ from tokenizers import ByteLevelBPETokenizer
 MAX_SEQ_LENGTH = 128
 BATCH_SIZE = 16
 EPOCHS = 2
-LEARNING_RATE = 2e-4
+LEARNING_RATE = 2e-5
 FACTOR = 1024
 VOCAB_SIZE = 32000
 INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
@@ -46,8 +46,12 @@ def create_tokenizer(training_corpus):
     return fast_tokenizer
 
 def get_training_corpus(dataset):
-    for i in range(0, len(dataset), 1000):
-        yield dataset[i : i + 1000]["text"]
+    texts = []
+    for field in ['pretrain', 'instruct']:
+        texts.extend(dataset[field]['text'])
+
+    for i in range(0, len(texts), 1000):
+        yield texts[i : i + 1000]
 
 def format_prompts(examples, tokenizer, isinst):
     texts = []
@@ -153,7 +157,7 @@ def main(push_to_hub=True):
     dataset = load_data()
     pretrain = dataset['pretrain']
     instruct = dataset['instruct']
-    training_corpus = get_training_corpus(pretrain)
+    training_corpus = get_training_corpus(dataset)
     tokenizer = create_tokenizer(training_corpus)
     configure_tokenizer(tokenizer)
     model = create_model(tokenizer)
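
For context, a minimal, self-contained sketch of how the rewritten get_training_corpus would feed ByteLevelBPETokenizer.train_from_iterator. The tiny in-memory dataset dict and the training parameters below are stand-ins for the script's load_data() output and create_tokenizer() settings, neither of which appears in this diff:

from tokenizers import ByteLevelBPETokenizer

# Stand-in for load_data(): a plain dict of lists that mimics the
# dataset[field]['text'] access pattern used above. The real script
# loads HuggingFaceTB/smollm-corpus splits instead.
dataset = {
    "pretrain": {"text": ["the quick brown fox", "jumps over the lazy dog"]},
    "instruct": {"text": ["how are you?", "fine, thanks for asking"]},
}

def get_training_corpus(dataset):
    # Pool text from both splits so the tokenizer's vocabulary covers
    # pretraining and instruction data alike.
    texts = []
    for field in ['pretrain', 'instruct']:
        texts.extend(dataset[field]['text'])

    # Yield the pooled corpus in batches of 1000 strings.
    for i in range(0, len(texts), 1000):
        yield texts[i : i + 1000]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    get_training_corpus(dataset),
    vocab_size=1000,   # the script itself uses VOCAB_SIZE = 32000
    min_frequency=1,
    special_tokens=["<s>", "</s>", "<pad>"],  # assumed; not shown in this diff
)
print(tokenizer.get_vocab_size())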