from fastapi import FastAPI
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch

app = FastAPI()

# 👇 Replace with your actual repos
BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # your adapter repo

# Load tokenizer and 4-bit base model once, at import time (heavy: downloads
# weights and places them across devices via accelerate's device_map="auto").
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the LoRA adapter on top of the frozen base weights.
model = PeftModel.from_pretrained(model, ADAPTER_REPO)
model.eval()  # inference-only service: disable dropout etc.

# FIX: do NOT pass device_map to the pipeline — the model object above is
# already dispatched by accelerate, and re-specifying device placement for an
# already-placed model is rejected by transformers.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


class GenerateRequest(BaseModel):
    """Request body for POST /generate (JSON wire format unchanged).

    Renamed from `Request` to avoid shadowing `fastapi.Request`.
    """

    prompt: str
    # Bounded so a single request cannot exhaust GPU memory; out-of-range
    # values now fail fast with a 422 instead of OOM-ing the server.
    max_new_tokens: int = Field(default=150, ge=1, le=2048)
    temperature: float = Field(default=0.7, gt=0.0, le=2.0)


@app.post("/generate")
def generate(req: GenerateRequest):
    """Generate a sampled completion for `req.prompt`.

    Returns `{"response": <generated text>}`. `return_full_text=False`
    strips the echoed prompt so the client receives only the newly
    generated tokens rather than prompt + continuation.
    """
    output = pipe(
        req.prompt,
        max_new_tokens=req.max_new_tokens,
        temperature=req.temperature,
        do_sample=True,
        return_full_text=False,
    )
    return {"response": output[0]["generated_text"]}