from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

app = FastAPI()

# ✅ Load the fine-tuned model and tokenizer once at startup
model_path = "fine_tuned_medical_llama2"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()

# LLaMA-2 tokenizers ship without a pad token; reuse EOS so padding=True does not raise
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

@app.post("/predict")
def predict(query: str):
    # `query` is a plain parameter, so FastAPI reads it from the query string (?query=...)
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        # max_length counts prompt + generated tokens
        outputs = model.generate(**inputs, max_length=200)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}
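# A minimal client sketch for calling the endpoint above. It assumes the app is
# saved as app.py and served locally with `uvicorn app:app --port 8000`; the
# filename, port, and sample question are assumptions, not part of the original
# snippet. Because `query` is a plain function parameter, FastAPI expects it in
# the query string rather than the JSON body.
import requests

resp = requests.post(
    "http://localhost:8000/predict",
    params={"query": "What are common symptoms of type 2 diabetes?"},
)
print(resp.json()["response"])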