import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
from peft import PeftModel
import gradio as gr
import threading

# --- Load Model & Tokenizer ---
base_model_name = "unsloth/llama-3.2-3b-bnb-4bit"
adapter_model_name = "aismaanly/ai_synthetic"
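
# 4-bit quantization keeps the 3B model small enough for a single modest GPU.
# The settings below are the standard QLoRA-style recipe: NF4 weight
# quantization, double quantization of the scaling constants, and bfloat16
# compute for speed and numerical stability.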
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
print("Loading PEFT adapter...")
model = PeftModel.from_pretrained(model, adapter_model_name)
model = model.merge_and_unload()
print("Model ready!")

# --- Gradio Streaming Function ---
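# gr.ChatInterface streams whatever chat_fn yields. The pattern below runs
# model.generate() in a background thread and reads tokens incrementally from
# a TextIteratorStreamer, yielding the growing reply back to the UI.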
def chat_fn(message, history, max_tokens):
    # `history` is supplied by gr.ChatInterface but unused here: each message
    # is treated as a standalone prompt.
    prompt = message
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,  # don't echo the prompt back in the streamed reply
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        do_sample=True,  # sampling must be on for temperature to take effect
        temperature=0.7,
    )
    # Generation runs in a background thread so this function can consume the
    # streamer concurrently.
    thread = threading.Thread(
        target=model.generate,
        kwargs=generation_kwargs,
    )
    thread.start()
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

# --- Create Dropdown Component for max tokens ---
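# ChatInterface passes each additional_inputs component's value as an extra
# positional argument to chat_fn, so the dropdown value arrives as max_tokens
# (as a string, hence the int() conversion above).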
dropdown = gr.Dropdown(
    choices=["100", "200", "300"],
    value="100",
    label="Max New Tokens",
)

# --- Launch Gradio Chat Interface ---
gr.ChatInterface(
    fn=chat_fn,
    additional_inputs=[dropdown],
    title="LLM Finetuned Comment Generator",
    description="Chat with a LoRA-finetuned Llama 3.2 3B model.",
).launch(share=False)
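
# Note: share=False is the default. When running locally, passing
# share=True to launch() creates a temporary public gradio.live link.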