from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch

app = FastAPI()

# 👇 Replace with your actual repos
BASE_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
ADAPTER_REPO = "rayymaxx/DirectEd-AI-LoRA"  # your adapter repo

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
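
# Llama 3 tokenizers ship without a pad token; falling back to EOS is a common
# convention (an addition, not in the original) and avoids padding warnings
# from the generation pipeline.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token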

# Load the base model. The unsloth bnb-4bit repo ships a bitsandbytes
# quantization config, so the weights load in 4-bit (bitsandbytes must be
# installed); float16 covers the non-quantized compute path.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, ADAPTER_REPO)
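
# Optional: merge the LoRA weights into the base model so inference skips the
# adapter indirection. A hedged option, not in the original; merging into a
# 4-bit quantized base is lossy in PEFT, so keep the adapter separate unless
# you have measured a worthwhile speedup.
# model = model.merge_and_unload()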

# Build the generation pipeline. The model was already placed by
# device_map="auto" above, so the pipeline needs no device hint of its own.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Request body for /generate (named to avoid clashing with fastapi.Request)
class GenerateRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 150
    temperature: float = 0.7

@app.post("/generate")
def generate(req: GenerateRequest):
    output = pipe(
        req.prompt,
        max_new_tokens=req.max_new_tokens,
        temperature=req.temperature,
        do_sample=True,
        return_full_text=False,  # return only the completion, not the echoed prompt
    )
    return {"response": output[0]["generated_text"]}
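
# --- Running and smoke-testing (a minimal sketch; file name and port are assumptions) ---
# Start the server, assuming this file is saved as main.py:
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
# Example request:
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain LoRA in one sentence.", "max_new_tokens": 100}'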