import torch
import json
import os
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from huggingface_hub import login
# Authenticate with Hugging Face
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face token not found. Add it in 'Secrets'.")
login(token=HF_TOKEN)
# Load Extracted Data
dataset_path = "medical_dataset.json"
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset file '{dataset_path}' not found!")
with open(dataset_path, "r", encoding="utf-8") as f:
    data = json.load(f)
if not isinstance(data, list):
    raise ValueError("Dataset should be a list of dictionaries.")
dataset = Dataset.from_list(data)
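# For reference, each record in medical_dataset.json is expected to provide the
# "prompt" and "response" fields used by the preprocessing below; the wording in
# this illustrative example is hypothetical:
# {"prompt": "What are common symptoms of anemia?",
#  "response": "Typical symptoms include fatigue, pale skin, and shortness of breath."}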
# Load Tokenizer
model_name = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Tokenize Data
def preprocess_function(examples):
    # `dataset.map` is called with batched=True, so `examples` holds lists of
    # values; build one input string per record.
    inputs = [
        f"Medical Q&A: {prompt} {response}"
        for prompt, response in zip(examples["prompt"], examples["response"])
    ]
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
    # For causal LM fine-tuning, labels are a copy of the input IDs.
    model_inputs["labels"] = [ids.copy() for ids in model_inputs["input_ids"]]
    return model_inputs
# Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)
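# Optional sanity check (assumes the dataset is non-empty): decode the first
# tokenized example to confirm prompt and response were joined as intended.
print(tokenizer.decode(tokenized_dataset[0]["input_ids"], skip_special_tokens=True)[:200])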
# Load Model with LoRA (Optimized for Falcon)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Save memory
    device_map="auto"  # Auto-assign to CPU/GPU
)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],  # Correct target module for Falcon
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"  # Tell PEFT this adapter is for causal language modeling
)
model = get_peft_model(model, lora_config)
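# Quick check on how much of the model is actually being trained; with LoRA only
# a small fraction of the parameters should be reported as trainable.
model.print_trainable_parameters()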
# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./medical_falcon",
    per_device_train_batch_size=1,
    num_train_epochs=3,  # Adjust epochs as needed
    logging_dir="./logs",
    save_steps=100,
    evaluation_strategy="no",
    save_total_limit=2,
    fp16=True  # Mixed precision training; requires a CUDA GPU
)
# Train Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)
trainer.train()
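# If a run is interrupted, training can optionally be resumed from the latest
# checkpoint saved in output_dir (only works once at least one checkpoint exists):
# trainer.train(resume_from_checkpoint=True)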
# Save Model
model_path = "fine_tuned_medical_falcon"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model fine-tuned and saved at: {model_path}")