# emotional-ai / app.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import gradio as gr
import os
import json
import gc
from huggingface_hub import hf_hub_download
import shutil
import tempfile
# Free up memory
gc.collect()
print("Setting up model loading...")
# Create a temporary directory for model modifications
temp_dir = tempfile.mkdtemp()
print(f"Created temporary directory: {temp_dir}")
# Your model name
model_name = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit"
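# The repo above ships bitsandbytes 4-bit ("bnb-4bit") weights; the steps below
# strip its quantization_config so the weights can instead be loaded in plain float16.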
try:
    # Download the config.json file
    print("Downloading configuration file...")
    config_path = hf_hub_download(repo_id=model_name, filename="config.json")

    # Load and modify the config to remove quantization
    print("Modifying configuration...")
    with open(config_path, 'r') as file:
        config_dict = json.load(file)

    # Remove any quantization configs
    if "quantization_config" in config_dict:
        del config_dict["quantization_config"]

    # Save the modified config to the temp directory
    modified_config_path = os.path.join(temp_dir, "config.json")
    with open(modified_config_path, 'w') as file:
        json.dump(config_dict, file)
    print("Modified configuration saved")

    # Now try to load with the modified config
    print("Loading model with modified configuration...")
    config = AutoConfig.from_pretrained(temp_dir)
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=config,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    )
    print("Model loaded successfully")
except Exception as e:
    print(f"Error during custom loading: {e}")
    # If the first approach fails, try a direct approach with explicit params
    print("Attempting alternative loading method...")
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            load_in_4bit=False,
            load_in_8bit=False,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        print("Model loaded with alternative method")
    except Exception as e2:
        print(f"Error with alternative loading: {e2}")
        raise RuntimeError("Failed to load model in any format")
finally:
    # Clean up temp directory
    shutil.rmtree(temp_dir)
    print("Cleaned up temporary directory")
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
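# Llama-family tokenizers often ship without a pad token; reusing the EOS token
# here is a common convention (an assumption, not part of the original setup) and
# avoids padding-related warnings during generation.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token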
# Function to generate response
def generate_response(message, history):
    # Format history for the model
    prompt = ""
    if history:
        for user_msg, assistant_msg in history:
            prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant: "

    print("Tokenizing input...")
    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)

    # Free up memory before generation
    gc.collect()

    print("Generating response...")
    with torch.no_grad():  # Disable gradient calculation to save memory
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=256,  # Reduced from 300 to conserve memory
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95
        )

    print("Decoding response...")
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the assistant's response
    if "Assistant: " in response:
        response = response.split("Assistant: ")[-1]
    return response
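# Optional local smoke test (hypothetical usage, not part of the Gradio app flow):
# print(generate_response("Hello, how are you?", []))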
# Launch Gradio UI with memory-efficient settings
print("Setting up Gradio interface...")
with gr.Blocks() as demo:
    gr.Markdown("### 🦙 Chat with Your Fine-tuned LLaMA 3.2 3B")
    chatbot = gr.ChatInterface(
        generate_response,
        chatbot=gr.Chatbot(height=400),
        textbox=gr.Textbox(placeholder="Type your message here...", container=False, scale=7),
        submit_btn="Send",
        retry_btn="Retry",
        clear_btn="Clear",
    )
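    # Note: submit_btn/retry_btn/clear_btn are gr.ChatInterface arguments in Gradio 4.x;
    # newer Gradio releases may not accept them, so pin the Gradio version accordingly.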
print("Launching interface...")
demo.launch(share=False, show_api=False) # Disable sharing and API to save resources