cconsti committed on
Commit
816facc
·
verified ·
1 Parent(s): f05adaf

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +20 -2
train.py CHANGED
@@ -11,6 +11,23 @@ os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
11
  # Load dataset (Replace this with your dataset)
12
  dataset = load_dataset("tatsu-lab/alpaca") # Example alternative dataset
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Load model and tokenizer
15
  model_name = "t5-large"
16
  tokenizer = T5Tokenizer.from_pretrained(model_name)
@@ -51,10 +68,11 @@ training_args = TrainingArguments(
51
  trainer = Trainer(
52
  model=model,
53
  args=training_args,
54
- train_dataset=tokenized_datasets["train"],
55
- eval_dataset=tokenized_datasets["test"],
56
  )
57
 
 
58
  # Train the model
59
  trainer.train()
60
 
 
11
  # Load dataset (Replace this with your dataset)
12
  dataset = load_dataset("tatsu-lab/alpaca") # Example alternative dataset
13
 
14
+ # Check available dataset splits
15
+ print("Dataset splits available:", dataset)
16
+
17
+ # If "test" split is missing, use a portion of "train" split
18
+ if "test" not in dataset:
19
+ dataset = dataset["train"].train_test_split(test_size=0.1) # Split 10% for testing
20
+
21
+ # Tokenize dataset
22
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
23
+
24
+ # Assign datasets
25
+ train_dataset = tokenized_datasets["train"]
26
+ eval_dataset = tokenized_datasets["test"] # This is now safely created
27
+
28
+ # Debug output
29
+ print("Dataset successfully split into train and test sets")
30
+
31
  # Load model and tokenizer
32
  model_name = "t5-large"
33
  tokenizer = T5Tokenizer.from_pretrained(model_name)
 
68
  trainer = Trainer(
69
  model=model,
70
  args=training_args,
71
+ train_dataset=train_dataset, # Now correctly assigned
72
+ eval_dataset=eval_dataset, # No more KeyError
73
  )
74
 
75
+
76
  # Train the model
77
  trainer.train()
78