# FastAPI inference server: serves a Llama-3 base model with the
# DirectEd-AI LoRA adapter applied (deployed as a Hugging Face Space).
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch

app = FastAPI()

# Model repos: 4-bit quantized Llama-3 Instruct base plus the project's
# fine-tuned LoRA adapter. Replace with your own repos if forking.
BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # LoRA adapter weights

# Everything below runs once at import time so the server is ready to
# answer requests as soon as the process starts (slow cold start, fast serving).
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",          # place layers across available devices automatically
    torch_dtype=torch.float16,  # half precision to cut memory use
)

# Attach the LoRA adapter on top of the (frozen) base model weights.
model = PeftModel.from_pretrained(model, ADAPTER_REPO)

# Text-generation pipeline wrapping the adapted model and its tokenizer.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
class Request(BaseModel):
    """Request payload for text generation.

    NOTE(review): the name shadows ``fastapi.Request``; kept as-is for
    backward compatibility, but consider renaming (e.g. ``GenerateRequest``).
    """

    # User prompt to complete (required).
    prompt: str
    # Upper bound on the number of tokens to generate.
    max_new_tokens: int = 150
    # Sampling temperature; higher values give more varied output.
    temperature: float = 0.7
@app.post("/generate")  # was missing: without a route decorator the endpoint is never registered
def generate(req: Request):
    """Generate a completion for ``req.prompt``.

    Runs the module-level text-generation pipeline with the caller's
    sampling settings and returns ``{"response": <generated text>}`` for
    the first (only) candidate produced by the pipeline.
    """
    output = pipe(
        req.prompt,
        max_new_tokens=req.max_new_tokens,
        temperature=req.temperature,
        do_sample=True,  # sampling must be on for temperature to take effect
    )
    return {"response": output[0]["generated_text"]}