Rafay17 committed
Commit 6982029 · verified · 1 Parent(s): 5f24d8f

Update app.py

Files changed (1)
  1. app.py +27 -30
app.py CHANGED
@@ -1,37 +1,34 @@
-import gradio as gr
-from transformers import AutoTokenizer, FastLanguageModel
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
 
 # Load the model and tokenizer
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name="lora_model", # Replace with your trained model name
-    max_seq_length=512,
-    dtype="float16",
-    load_in_4bit=True,
-)
-FastLanguageModel.for_inference(model)
+model_name = "Rafay17/Llama3.2_1b_customModel2"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda") # Ensure to load the model on GPU
 
-# Define the inference function
-def generate_response(user_input):
-    # Prepare the input for the model
-    labeled_prompt = (
-        "Please provide the response with the following labels:\n"
-        f"User Input: {user_input}\n"
-        "Response:"
-    )
+# Prepare the model for inference
+model.eval()
 
-    inputs = tokenizer(
-        [labeled_prompt],
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=512,
-    ).to("cuda")
+# Define a function to generate responses
+def generate_response(input_text):
+    # Prepare the input for the model
+    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
 
-    response = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
-    return tokenizer.decode(response[0], skip_special_tokens=True)
+    # Set up the text streamer to stream the generated response
+    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
 
-# Create a Gradio interface
-iface = gr.Interface(fn=generate_response, inputs="text", outputs="text", title="Chatbot Interface", description="Enter your message below:")
+    # Generate the response
+    with torch.no_grad():
+        model.generate(
+            input_ids=inputs.input_ids,
+            attention_mask=inputs.attention_mask,
+            streamer=text_streamer,
+            max_new_tokens=64, # Adjust this value as needed
+            pad_token_id=tokenizer.eos_token_id,
+        )
 
-# Launch the app
-iface.launch()
+# Example usage of the generate_response function
+input_text = "Hello, how can I help you today?"
+print("Generating response for input:")
+print(input_text)
+generate_response(input_text)
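
Note that the updated app.py streams the reply to stdout through TextStreamer and returns nothing, and the Gradio interface from the previous version is removed. If the generated text is needed as a return value again (for example, to wire generate_response back into a gr.Interface), a minimal sketch of a returning variant follows; it reuses the model, tokenizer, and max_new_tokens value from the diff, while the function name generate_response_text and the prompt-stripping step are assumptions, not part of this commit.

import torch

# Hypothetical variant (not part of this commit): decode and return the reply
# instead of streaming it, so a caller can use the string it gets back.
def generate_response_text(input_text: str) -> str:
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=64,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Skip the prompt tokens so only the newly generated text is returned.
    new_tokens = output_ids[0, inputs.input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# Example: print(generate_response_text("Hello, how can I help you today?"))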