# Source: Hugging Face Space file (commit 516d1c9, "Made changes to app structure")
# by rayymaxx — viewer chrome ("raw / history / blame / 1.08 kB") removed.
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch
app = FastAPI()

# Model repos: a 4-bit-quantized Llama-3-8B-Instruct base plus a LoRA adapter.
# 👇 Replace with your actual repos if you fork this Space.
BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # your adapter repo

# Tokenizer comes from the base model; the adapter does not change the vocab.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load the base model in fp16, letting accelerate place layers automatically
# (GPU if available, otherwise CPU). NOTE(review): loading happens at import
# time, so app startup blocks until the weights are downloaded.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Wrap the base model with the fine-tuned LoRA adapter weights.
model = PeftModel.from_pretrained(model, ADAPTER_REPO)

# Shared text-generation pipeline used by the /generate endpoint below.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
class Request(BaseModel):
    """Request body for POST /generate."""

    prompt: str                  # text the model should continue
    max_new_tokens: int = 150    # cap on generated tokens (excludes the prompt)
    temperature: float = 0.7     # sampling temperature; higher = more random
@app.post("/generate")
def generate(req: Request):
    """Generate a completion for ``req.prompt`` with the LoRA-adapted model.

    Returns ``{"response": <completion text>}``. ``return_full_text=False``
    keeps the echoed prompt out of the response (the text-generation
    pipeline prepends it by default).
    """
    output = pipe(
        req.prompt,
        max_new_tokens=req.max_new_tokens,
        temperature=req.temperature,
        # Sampling with temperature <= 0 raises inside transformers;
        # fall back to greedy decoding in that case.
        do_sample=req.temperature > 0,
        # Return only the newly generated text, not prompt + completion.
        return_full_text=False,
    )
    return {"response": output[0]["generated_text"]}