import torch
import gradio as gr
from unsloth import FastLanguageModel  # FastLanguageModel ships with Unsloth, not transformers

# Load the fine-tuned LoRA adapter in 4-bit with a 512-token context window.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",
    max_seq_length=512,
    dtype=torch.float16,  # pass a torch dtype rather than the string "float16"
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # switch Unsloth's patched model into inference mode
def generate_response(user_input):
    # Wrap the raw input in the labeled prompt format the model was tuned on.
    labeled_prompt = (
        "Please provide the response with the following labels:\n"
        f"User Input: {user_input}\n"
        "Response:"
    )

    # Tokenize the prompt and move the tensors to the GPU.
    inputs = tokenizer(
        [labeled_prompt],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to("cuda")

    response = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    new_tokens = response[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
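# Quick sanity check before wiring up the UI (assumes a CUDA GPU is available):
# print(generate_response("What is the capital of France?"))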
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Chatbot Interface",
    description="Enter your message below:",
)
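# Gradio also offers a chat-specific wrapper; a rough equivalent (assuming Gradio 4+,
# where the callback receives the message and the chat history) would be:
#   iface = gr.ChatInterface(fn=lambda message, history: generate_response(message))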
iface.launch()
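# launch(share=True) would additionally create a temporary public URL, which is
# handy when running in a notebook environment such as Colab.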