nadeen-elsayed committed on
Commit 5d20982 · verified · 1 Parent(s): 6ab3b20

Create train.py

Files changed (1)
  1. train.py +68 -0
train.py ADDED
@@ -0,0 +1,68 @@
+ import torch
+ import json
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+ from datasets import Dataset
+
+ # ✅ Load Extracted Data
+ with open("medical_dataset.json", "r", encoding="utf-8") as f:
+     data = json.load(f)
+
+ dataset = Dataset.from_list(data)
+
+ # ✅ Load Tokenizer
+ model_name = "meta-llama/Llama-2-7b-hf"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # Llama 2 ships without a pad token; reuse the EOS token for padding
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
+
+ # ✅ Tokenize Data
+ def preprocess_function(examples):
+     # With batched=True, `examples` is a dict of column lists, not a list of rows
+     inputs = [
+         f"Medical Q&A: {prompt} {response}"
+         for prompt, response in zip(examples["prompt"], examples["response"])
+     ]
+     model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
+     # Causal LM objective: labels are the input ids themselves
+     model_inputs["labels"] = model_inputs["input_ids"].copy()
+     return model_inputs
+
+ tokenized_dataset = dataset.map(preprocess_function, batched=True)
+ tokenized_dataset = tokenized_dataset.remove_columns(["prompt", "response"])
+
+ # ✅ Load Model with QLoRA (4-bit Precision)
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.float16
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     quantization_config=bnb_config,
+     device_map="auto"
+ )
+
+ # Prepare the 4-bit model for training, then attach LoRA adapters
+ model = prepare_model_for_kbit_training(model)
+
+ lora_config = LoraConfig(
+     r=16,
+     lora_alpha=32,
+     target_modules=["q_proj", "v_proj"],
+     lora_dropout=0.05,
+     bias="none",
+     task_type="CAUSAL_LM"
+ )
+
+ model = get_peft_model(model, lora_config)
+
+ # ✅ Define Training Arguments
+ training_args = TrainingArguments(
+     output_dir="./medical_llama2",
+     per_device_train_batch_size=1,
+     num_train_epochs=2,  # 2 Epochs
+     logging_dir="./logs",
+     save_steps=100,
+     evaluation_strategy="no"
+ )
+
+ # ✅ Train Model
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset
+ )
+
+ trainer.train()
+
+ # ✅ Save Model
+ trainer.save_model("fine_tuned_medical_llama2")
+ tokenizer.save_pretrained("fine_tuned_medical_llama2")
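train.py assumes medical_dataset.json is a JSON array of objects with "prompt" and "response" fields, since those are the two columns preprocess_function reads and remove_columns later drops. The dataset itself is not part of this commit; the snippet below is a minimal hypothetical example of that shape, handy as a smoke test for the pipeline:

import json

# Hypothetical sample rows in the shape train.py expects;
# the real medical_dataset.json is not included in this commit.
sample = [
    {"prompt": "What are common symptoms of anemia?",
     "response": "Fatigue, pale skin, shortness of breath, and dizziness."},
    {"prompt": "How is type 2 diabetes usually managed?",
     "response": "Lifestyle changes, blood glucose monitoring, and medications such as metformin."},
]

with open("medical_dataset.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False, indent=2)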
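Because the trained model is a PEFT wrapper, trainer.save_model("fine_tuned_medical_llama2") stores only the LoRA adapter weights, not the full Llama-2 checkpoint, so using the result later means reloading the base model and attaching the adapter. A minimal inference sketch under that assumption (the prompt and generation settings are illustrative, not part of the commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_name = "meta-llama/Llama-2-7b-hf"
adapter_dir = "fine_tuned_medical_llama2"  # directory written by train.py

tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
base_model = AutoModelForCausalLM.from_pretrained(
    base_name, torch_dtype=torch.float16, device_map="auto"
)
# Attach the LoRA adapter produced by training
model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()

prompt = "Medical Q&A: What are common symptoms of anemia?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))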