rivapereira123 committed
Commit dbbaa64 · verified · 1 Parent(s): 464fae4

Update finetune_flan_t5.py

Files changed (1):
  finetune_flan_t5.py  +23 -14
finetune_flan_t5.py CHANGED
@@ -8,22 +8,28 @@ from transformers import (
 from trl import SFTTrainer
 import torch
 
+# First check and update packages if needed
+def check_versions():
+    import subprocess
+    import sys
+    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "transformers", "accelerate", "trl"])
+
+check_versions()
+
 # 1. Load and prepare dataset
 dataset = load_dataset("json", data_files="data/med_q_n_a_converted.jsonl", split="train")
 
-# Add 'text' field containing the formatted examples
-def add_text_field(example):
-    example['text'] = f"### Instruction:\n{example['input']}\n\n### Response:\n{example['output']}"
-    return example
-
-dataset = dataset.map(add_text_field)
+# Add formatted text field
+dataset = dataset.map(lambda x: {
+    "text": f"### Instruction:\n{x['input']}\n\n### Response:\n{x['output']}"
+})
 
 # 2. Load model and tokenizer
 model_name = "google/flan-t5-base"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
-# 3. Training arguments
+# 3. Training arguments - modified to avoid deprecated parameters
 training_args = TrainingArguments(
     output_dir="./flan-t5-medical-finetuned",
     per_device_train_batch_size=4,
@@ -34,24 +40,27 @@ training_args = TrainingArguments(
     save_strategy="epoch",
     evaluation_strategy="no",
     fp16=torch.cuda.is_available(),
-    report_to="none"
+    report_to="none",
+    # Add these to avoid version conflicts
+    use_cpu=not torch.cuda.is_available(),
+    remove_unused_columns=False
 )
 
-# 4. Initialize SFTTrainer with correct configuration
+# 4. Initialize trainer with updated configuration
 trainer = SFTTrainer(
     model=model,
     tokenizer=tokenizer,
     train_dataset=dataset,
     args=training_args,
-    max_seq_length=512,
-    dataset_text_field="text",  # Field we created
+    dataset_text_field="text",
     data_collator=DataCollatorForSeq2Seq(
         tokenizer,
         model=model,
-        pad_to_multiple_of=8,
-        return_tensors="pt",
         padding=True
-    )
+    ),
+    # Remove deprecated parameters
+    max_seq_length=None,
+    formatting_func=None
 )
 
 # 5. Start training
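
A note on the new check_versions() step: pip-upgrading from inside the script cannot change modules that are already imported in the running process, so the transformers/trl imports at the top of the file keep their old versions until the interpreter restarts. A minimal read-only sketch, assuming Python 3.8+ for importlib.metadata, that reports what is installed instead of force-upgrading mid-run:

from importlib.metadata import PackageNotFoundError, version

# Report the installed versions of the packages the commit upgrades;
# after any real upgrade, restart the interpreter before training.
for pkg in ("transformers", "accelerate", "trl"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")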
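
The lambda in step 1 adds a "text" column while keeping the original input/output columns, since datasets.map only drops columns when asked to. A quick sanity check of the prompt template on a single record; the record below is made up for illustration, real ones come from data/med_q_n_a_converted.jsonl:

# Hypothetical example record, for checking the template only
example = {"input": "What are the symptoms of anemia?",
           "output": "Common symptoms include fatigue, pallor, and shortness of breath."}
text = f"### Instruction:\n{example['input']}\n\n### Response:\n{example['output']}"
print(text)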
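
Passing max_seq_length=None and formatting_func=None silences keyword mismatches on some TRL versions, but on TRL releases from roughly 0.9 onward these dataset-handling options moved out of the SFTTrainer keyword arguments and into SFTConfig. A sketch of that shape, assuming such a TRL version and reusing model/dataset from the script above; verify the exact field names against the installed release:

from trl import SFTConfig, SFTTrainer

# SFTConfig subclasses TrainingArguments, so the step-3 options can live here too
sft_config = SFTConfig(
    output_dir="./flan-t5-medical-finetuned",
    per_device_train_batch_size=4,
    dataset_text_field="text",
    max_seq_length=512,
    report_to="none",
)
trainer = SFTTrainer(model=model, train_dataset=dataset, args=sft_config)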
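
The dropped collator arguments are still valid in transformers: return_tensors="pt" is already the DataCollatorForSeq2Seq default, and pad_to_multiple_of=8 pads batches up to tensor-core-friendly shapes, which mainly pays off under fp16. If those options were not the actual source of the version conflict, a middle-ground sketch, reusing tokenizer/model from step 2, would be:

import torch
from transformers import DataCollatorForSeq2Seq

# Dynamic padding, rounded up to a multiple of 8 only when CUDA (and thus fp16) is in play
collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None,
)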