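# Assumed runtime dependencies (not pinned anywhere in this file): fastapi,
# uvicorn, torch, transformers, pydantic, peft, accelerate (needed for
# device_map="auto"), and bitsandbytes (needed for the 4-bit quantized base model).
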
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch

app = FastAPI()

# Replace with your actual repos
BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # your adapter repo

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Load adapter
model = PeftModel.from_pretrained(model, ADAPTER_REPO)

# Pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

# Request schema for the /generate endpoint
class Request(BaseModel):
    prompt: str
    max_new_tokens: int = 150
    temperature: float = 0.7

# Generation endpoint: runs the prompt through the LoRA-adapted model
@app.post("/generate")
def generate(req: Request):
    output = pipe(
        req.prompt,
        max_new_tokens=req.max_new_tokens,
        temperature=req.temperature,
        do_sample=True,
    )
    return {"response": output[0]["generated_text"]}