import torch
import json
import os
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from huggingface_hub import login

# Authenticate with Hugging Face
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face token not found. Add it in 'Secrets'.")
login(token=HF_TOKEN)

# Load Extracted Data
dataset_path = "medical_dataset.json"
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset file '{dataset_path}' not found!")

with open(dataset_path, "r", encoding="utf-8") as f:
    data = json.load(f)

if not isinstance(data, list):
    raise ValueError("Dataset should be a list of dictionaries.")
dataset = Dataset.from_list(data)
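# Each record is expected to carry "prompt" and "response" keys; preprocess_function below reads those fields.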

# Load Tokenizer
model_name = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Falcon's tokenizer has no pad token by default, so reuse EOS

# Tokenize Data
def preprocess_function(examples):
    # With batched=True, `examples` holds lists of prompts and responses
    texts = [
        f"Medical Q&A: {prompt} {response}"
        for prompt, response in zip(examples["prompt"], examples["response"])
    ]
    model_inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    # For causal LM training, labels are a copy of the input IDs
    model_inputs["labels"] = [ids.copy() for ids in model_inputs["input_ids"]]
    return model_inputs

# Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

# Load Model with LoRA (Optimized for Falcon)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Half precision to save memory
    device_map="auto"           # Auto-assign layers across CPU/GPU
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],  # Attention projection module used by Falcon
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"  # Mark the task so PEFT wraps the model as a causal LM
)
model = get_peft_model(model, lora_config)
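# Optional: report how many parameters LoRA actually trains (a small fraction of the base model)
model.print_trainable_parameters()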

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./medical_falcon",
    per_device_train_batch_size=1,
    num_train_epochs=3,  # Adjust epochs as needed
    logging_dir="./logs",
    save_steps=100,
    evaluation_strategy="no",
    save_total_limit=2,
    fp16=True  # Mixed precision training (requires a GPU)
)

# Train Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)
trainer.train()

# Save Model
model_path = "fine_tuned_medical_falcon"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model fine-tuned and saved at: {model_path}")
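
# To reuse the adapter later (a sketch; assumes the base model is loaded again the same way):
#   from peft import PeftModel
#   base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
#   model = PeftModel.from_pretrained(base, model_path)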