import torch
import json
import os

from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from huggingface_hub import login

# ✅ Authenticate with Hugging Face
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face token not found. Add it in 'Secrets'.")
login(token=HF_TOKEN)

# ✅ Load Extracted Data
dataset_path = "medical_dataset.json"
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset file '{dataset_path}' not found!")

with open(dataset_path, "r", encoding="utf-8") as f:
    data = json.load(f)

if not isinstance(data, list):
    raise ValueError("Dataset should be a list of dictionaries.")

dataset = Dataset.from_list(data)

# ✅ Load Tokenizer
model_name = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ✅ Tokenize Data (handles batched examples)
def preprocess_function(examples):
    # With batched=True, "prompt" and "response" arrive as lists of strings.
    prompts = examples.get("prompt", [])
    responses = examples.get("response", [])
    inputs = [f"Medical Q&A: {p} {r}" for p, r in zip(prompts, responses)]
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)

    # ✅ For causal LM fine-tuning, labels mirror the input IDs
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

# ✅ Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

# ✅ Load Model with LoRA (Optimized for Falcon)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # ✅ Save memory
    device_map="auto"           # ✅ Auto-assign to CPU/GPU
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],  # ✅ Correct target module for Falcon
    lora_dropout=0.05,
    bias="none"
)

model = get_peft_model(model, lora_config)

# ✅ Define Training Arguments
training_args = TrainingArguments(
    output_dir="./medical_falcon",
    per_device_train_batch_size=1,
    num_train_epochs=3,          # ✅ Adjust epochs as needed
    logging_dir="./logs",
    save_steps=100,
    evaluation_strategy="no",
    save_total_limit=2,
    fp16=True                    # ✅ Enable mixed precision training
)

# ✅ Train Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

# ✅ Save Model
model_path = "fine_tuned_medical_falcon"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

print(f"✅ Model fine-tuned and saved at: {model_path}")
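
# --- Optional: quick inference check (a minimal sketch, not part of the training run) ---
# Since trainer.save_model() on a PEFT model stores only the LoRA adapter, one common
# pattern is to reload the base model and attach the saved adapter via PeftModel.
# The sample prompt below is purely illustrative; paths and names reuse those defined above.
# from peft import PeftModel
#
# base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
# fine_tuned = PeftModel.from_pretrained(base_model, model_path)
# fine_tuned.eval()
#
# sample_prompt = "Medical Q&A: What are common symptoms of iron deficiency?"  # hypothetical example prompt
# inputs = tokenizer(sample_prompt, return_tensors="pt").to(base_model.device)
# with torch.no_grad():
#     output_ids = fine_tuned.generate(**inputs, max_new_tokens=100)
# print(tokenizer.decode(output_ids[0], skip_special_tokens=True))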