nadeen-elsayed commited on
Commit
09f0a44
·
verified ·
1 Parent(s): 790e6ce

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +10 -4
train.py CHANGED
@@ -34,16 +34,22 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
34
  if tokenizer.pad_token is None:
35
  tokenizer.pad_token = tokenizer.eos_token
36
 
37
# ✅ Tokenize Data
def preprocess_function(examples):
    """Build "Medical Q&A: <prompt> <response>" texts and tokenize them.

    Runs under ``dataset.map(..., batched=True)``, so ``examples`` maps each
    column name to a *list* of values for the batch. The previous version
    formatted those whole lists into a single f-string, producing one garbled
    example per batch; here we build one text per row instead.

    Args:
        examples: batch dict from ``datasets`` — column name -> list of values.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` lists,
        one entry per row in the batch.
    """
    prompts = examples.get("prompt", [])
    responses = examples.get("response", [])
    texts = [f"Medical Q&A: {p} {r}" for p, r in zip(prompts, responses)]

    model_inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=512)

    # Causal-LM objective: labels mirror the input ids (copied, not aliased,
    # so later in-place edits to input_ids cannot corrupt the labels).
    model_inputs["labels"] = [ids.copy() for ids in model_inputs["input_ids"]]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)
 
 
 
 
 
 
 
47
 
48
  # ✅ Load Model with LoRA (Optimized for Falcon)
49
  model = AutoModelForCausalLM.from_pretrained(
 
34
  if tokenizer.pad_token is None:
35
  tokenizer.pad_token = tokenizer.eos_token
36
 
37
# ✅ Tokenize Data (Fixed)
def preprocess_function(examples):
    """Build "Medical Q&A: <prompt> <response>" texts and tokenize them.

    Runs under ``dataset.map(..., batched=True)``, so ``examples`` maps each
    column name to a *list* of values for the batch.

    Bugs fixed relative to the previous version:
      * the whole prompt/response lists were formatted into ONE f-string per
        batch instead of one text per row;
      * ``return {key: [val] ...}`` wrapped the batch-level values in an extra
        list, collapsing every batch of N rows into a single malformed row;
      * ``labels`` aliased ``input_ids`` instead of copying them.

    Args:
        examples: batch dict from ``datasets`` — column name -> list of values.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` lists,
        one entry per row in the batch.
    """
    prompts = examples.get("prompt", [])
    responses = examples.get("response", [])
    texts = [f"Medical Q&A: {p} {r}" for p, r in zip(prompts, responses)]

    model_inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=512)

    # ✅ Labels have the same length as input_ids (copied, not aliased).
    # NOTE(review): pad positions are left as pad_token_id here; masking them
    # to -100 would exclude padding from the loss — confirm against the trainer.
    model_inputs["labels"] = [ids.copy() for ids in model_inputs["input_ids"]]
    return model_inputs

# ✅ Apply tokenization, dropping the raw text columns.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)
52
+
53
 
54
  # ✅ Load Model with LoRA (Optimized for Falcon)
55
  model = AutoModelForCausalLM.from_pretrained(