import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
from peft import PeftModel
import gradio as gr
import threading
# --- Load Model & Tokenizer ---
base_model_name = "unsloth/llama-3.2-3b-bnb-4bit"
adapter_model_name = "aismaanly/ai_synthetic"
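# NF4 4-bit quantization keeps the 3B model within a small GPU footprint;
# double quantization trims a bit more memory, and bfloat16 is the compute
# dtype used for matmuls during generation.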
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
print("Loading base model...") | |
model = AutoModelForCausalLM.from_pretrained( | |
base_model_name, | |
quantization_config=bnb_config, | |
device_map="auto" | |
) | |
print("Loading tokenizer...") | |
tokenizer = AutoTokenizer.from_pretrained(base_model_name) | |
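# Note: Llama tokenizers ship without a pad token; that is fine here because
# generation runs on a single sequence at a time.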
print("Loading PEFT adapter...") | |
model = PeftModel.from_pretrained(model, adapter_model_name) | |
model = model.merge_and_unload() | |
print("Model ready!") | |
# --- Gradio Streaming Function ---
def chat_fn(message, history, max_tokens):
    # The base checkpoint is not instruction-tuned, so the raw message serves
    # as the prompt; `history` is accepted by ChatInterface but unused here.
    prompt = message
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
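    # If the instruct variant of the base model were used instead, the message
    # should be wrapped with the chat template first, e.g. (hypothetical):
    #     msgs = [{"role": "user", "content": message}]
    #     prompt = tokenizer.apply_chat_template(msgs, tokenize=False,
    #                                            add_generation_prompt=True)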
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,       # don't echo the prompt back into the chat
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        do_sample=True,         # temperature only takes effect when sampling
        temperature=0.7,
    )
    # generate() blocks, so it runs in a background thread while this thread
    # drains the streamer and yields the accumulating text to Gradio
    thread = threading.Thread(
        target=model.generate,
        kwargs=generation_kwargs,
    )
    thread.start()
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    thread.join()
# --- Create Dropdown Component for max tokens ---
dropdown = gr.Dropdown(
    choices=["100", "200", "300"],
    value="100",
    label="Max New Tokens",
)
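# ChatInterface passes additional_inputs values to chat_fn after
# (message, history), so the dropdown selection arrives as max_tokens.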
# --- Launch Gradio Chat Interface ---
gr.ChatInterface(
    fn=chat_fn,
    additional_inputs=[dropdown],
    title="LLM Finetuned Comment Generator",
    description="Chat with the model.",
).launch(share=False)
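# Note: on Hugging Face Spaces the app is already served publicly, so
# share=False is appropriate. If concurrent users become an issue, Gradio's
# built-in request queue can be enabled before launch, e.g.:
#     gr.ChatInterface(...).queue().launch(share=False)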