File size: 1,899 Bytes
64eb70f
ebfe80b
64eb70f
 
ebfe80b
64eb70f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebfe80b
64eb70f
ebfe80b
64eb70f
 
ebfe80b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64eb70f
 
 
ebfe80b
64eb70f
 
45289ed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
from peft import PeftModel
import gradio as gr
import threading

# --- Load Model & Tokenizer ---

# Hub repository ids for the base checkpoint and the PEFT adapter applied to it.
base_model_name = "unsloth/llama-3.2-3b-bnb-4bit"
adapter_model_name = "aismaanly/ai_synthetic"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

print("Loading base model...")
# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name, device_map="auto", quantization_config=bnb_config
)

print("Loading tokenizer...")
# The tokenizer comes from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("Loading PEFT adapter...")
# Attach the adapter, then fold its weights into the base model so that
# `model` ends up bound to the merged model rather than the PEFT wrapper.
model = PeftModel.from_pretrained(model, adapter_model_name).merge_and_unload()
print("Model ready!")

# --- Gradio Streaming Function ---

def chat_fn(message, history, max_tokens):
    """Stream a model completion for ``message`` as a growing string.

    ``model.generate`` runs on a worker thread and pushes decoded text into
    a ``TextIteratorStreamer``; this generator drains the streamer and
    yields the accumulated text after every chunk.

    Args:
        message: Latest user message; used verbatim as the prompt.
        history: Earlier chat turns passed in by the caller. Not used: each
            message is answered on its own, without conversational context.
        max_tokens: Upper bound on newly generated tokens. Anything ``int()``
            accepts works (the dropdown supplies strings such as ``"100"``).

    Yields:
        str: All text generated so far, excluding the prompt.

    Raises:
        Exception: Whatever ``model.generate`` raised on the worker thread,
            re-raised here once streaming has stopped.
    """
    inputs = tokenizer(message, return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        # Without this the streamer replays the prompt tokens first, so the
        # reply would begin with an echo of the user's own message.
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        # Request sampling explicitly so the temperature below is honoured
        # regardless of the checkpoint's default generation config.
        do_sample=True,
        temperature=0.7,
    )

    errors = []

    def _generate():
        # Run generation; on failure, record the error and close the stream.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            errors.append(exc)
            # The consumer loop below blocks until the streamer signals the
            # end of the stream, so signal it ourselves on failure.
            streamer.end()

    thread = threading.Thread(target=_generate)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    # The stream is finished; collect the worker and surface any failure
    # instead of silently returning a truncated reply.
    thread.join()
    if errors:
        raise errors[0]

# --- Create Dropdown Component for max tokens ---

# Selector for the generation length; the chosen value reaches chat_fn as a
# string through `additional_inputs`.
dropdown = gr.Dropdown(
    label="Max New Tokens",
    choices=[str(limit) for limit in (100, 200, 300)],
    value="100",
)

# --- Launch Gradio Chat Interface ---

# Build the chat UI around chat_fn and start the server without a public
# share link.
gr.ChatInterface(
    chat_fn,
    title="LLM Finetuned Comment Generator",
    description="Chat with the model.",
    additional_inputs=[dropdown],
).launch(share=False)