Update app.py

app.py CHANGED
@@ -1,82 +1,134 @@

Old version (removed lines are prefixed with "-"; several removed lines are cut off in the diff view):

 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer,
-from peft import PeftModel, PeftConfig
 import gradio as gr
 import os
 import gc

 # Free up memory
 gc.collect()

-
-
-

-print("Loading base model in float16...")
 try:
-    #
     base_model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="auto",
         torch_dtype=torch.float16,
-        low_cpu_mem_usage=True
     )
-    print("
-
-    # Load your adapter configuration
-    peft_config = PeftConfig.from_pretrained(adapter_name)
-
-    # Apply the adapter to the base model
-    print("Applying adapter to base model...")
-    model = PeftModel.from_pretrained(base_model, adapter_name)

-    print("Model with adapter loaded successfully")
 except Exception as e:
-    print(f"Error
-

 # Load tokenizer
-tokenizer

 # Function to generate response
 def generate_response(message, history):
-    # Format
-
-
-
-
-
-
-    # Convert messages to the format expected by the model
-    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

-
-    inputs = tokenizer(prompt, return_tensors="pt").to(

     # Free up memory before generation
     gc.collect()

     with torch.no_grad():  # Disable gradient calculation to save memory
-        outputs =
             **inputs,
-            max_new_tokens=300
             do_sample=True,
             temperature=0.7,
             top_k=50,
             top_p=0.95
         )

-
-

-    # Extract
-

-    return

-# Launch Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("### 🦙 Chat with Your Fine-tuned LLaMA 3.2 3B")
-    chatbot = gr.ChatInterface(

-
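
The removed loading path above is partially cut off in the diff view. For reference, the pattern it follows (load a base model in float16, then apply a PEFT/LoRA adapter) typically looks like the sketch below; model_name and adapter_name are placeholders, not values recovered from the commit.

import torch
from transformers import AutoModelForCausalLM
from peft import PeftConfig, PeftModel

# Placeholders: the real repo IDs are truncated in the diff above.
model_name = "base-model-repo"
adapter_name = "your-username/your-lora-adapter"

# Load the base model in float16, as the removed code did.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Load the adapter configuration, then wrap the base model with the adapter.
peft_config = PeftConfig.from_pretrained(adapter_name)
model = PeftModel.from_pretrained(base_model, adapter_name)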

New version (added lines are prefixed with "+"):

 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import gradio as gr
 import os
+import json
 import gc
+from huggingface_hub import hf_hub_download
+import shutil
+import tempfile

 # Free up memory
 gc.collect()

+print("Setting up model loading...")
+
+# Create a temporary directory for model modifications
+temp_dir = tempfile.mkdtemp()
+print(f"Created temporary directory: {temp_dir}")
+
+# Your model name
+model_name = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit"

 try:
+    # Download the config.json file
+    print("Downloading configuration file...")
+    config_path = hf_hub_download(repo_id=model_name, filename="config.json")
+
+    # Load and modify the config to remove quantization
+    print("Modifying configuration...")
+    with open(config_path, 'r') as file:
+        config_dict = json.load(file)
+
+    # Remove any quantization configs
+    if "quantization_config" in config_dict:
+        del config_dict["quantization_config"]
+
+    # Save the modified config to the temp directory
+    modified_config_path = os.path.join(temp_dir, "config.json")
+    with open(modified_config_path, 'w') as file:
+        json.dump(config_dict, file)
+
+    print("Modified configuration saved")
+
+    # Now try to load with the modified config
+    print("Loading model with modified configuration...")
+    config = AutoConfig.from_pretrained(temp_dir)
+
     base_model = AutoModelForCausalLM.from_pretrained(
         model_name,
+        config=config,
         device_map="auto",
         torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        trust_remote_code=True
     )
+    print("Model loaded successfully")

 except Exception as e:
+    print(f"Error during custom loading: {e}")
+    # If the first approach fails, try a direct approach with explicit params
+    print("Attempting alternative loading method...")
+    try:
+        base_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map="auto",
+            load_in_4bit=False,
+            load_in_8bit=False,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+        )
+        print("Model loaded with alternative method")
+    except Exception as e2:
+        print(f"Error with alternative loading: {e2}")
+        raise RuntimeError("Failed to load model in any format")
+finally:
+    # Clean up temp directory
+    shutil.rmtree(temp_dir)
+    print(f"Cleaned up temporary directory")

 # Load tokenizer
+print("Loading tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

 # Function to generate response
 def generate_response(message, history):
+    # Format history for the model
+    prompt = ""
+    if history:
+        for user_msg, assistant_msg in history:
+            prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
+    prompt += f"User: {message}\nAssistant: "

+    print(f"Tokenizing input...")
+    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)

     # Free up memory before generation
     gc.collect()

+    print(f"Generating response...")
     with torch.no_grad():  # Disable gradient calculation to save memory
+        outputs = base_model.generate(
             **inputs,
+            max_new_tokens=256,  # Reduced from 300 to conserve memory
             do_sample=True,
             temperature=0.7,
             top_k=50,
             top_p=0.95
         )

+    print(f"Decoding response...")
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

+    # Extract only the assistant's response
+    if "Assistant: " in response:
+        response = response.split("Assistant: ")[-1]

+    return response
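
A note on the prompt formatting in generate_response above: the removed version rendered the conversation with tokenizer.apply_chat_template, while the new version concatenates plain "User:"/"Assistant:" strings. Instruct-tuned Llama checkpoints ship with a chat template, so a small helper along these lines (a sketch only; build_prompt is not part of the commit) would keep the prompt in the format the model was tuned on:

def build_prompt(message, history, tokenizer):
    # Hypothetical helper: convert Gradio's (user, assistant) history tuples
    # into the role/content messages that apply_chat_template expects.
    messages = []
    for user_msg, assistant_msg in history or []:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    # Same call the removed code used: render the template as a string and
    # append the generation prompt for the assistant's turn.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )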

+# Launch Gradio UI with memory-efficient settings
+print("Setting up Gradio interface...")
 with gr.Blocks() as demo:
     gr.Markdown("### 🦙 Chat with Your Fine-tuned LLaMA 3.2 3B")
+    chatbot = gr.ChatInterface(
+        generate_response,
+        chatbot=gr.Chatbot(height=400),
+        textbox=gr.Textbox(placeholder="Type your message here...", container=False, scale=7),
+        submit_btn="Send",
+        retry_btn="Retry",
+        clear_btn="Clear",
+    )

+print("Launching interface...")
+demo.launch(share=False, show_api=False)  # Disable sharing and API to save resources
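
A note on the gr.ChatInterface call added above: the retry_btn and clear_btn keyword arguments were deprecated in Gradio 4.x and are no longer accepted in newer releases, so on a Space that installs a recent Gradio version this constructor can raise a TypeError at startup, which would surface as a Space runtime error. A more version-tolerant sketch that keeps only arguments from the commit and reuses the generate_response function defined in app.py:

import gradio as gr

# Sketch: same interface as in the commit, minus the button-label keyword
# arguments that newer Gradio releases no longer accept on ChatInterface.
with gr.Blocks() as demo:
    gr.Markdown("### 🦙 Chat with Your Fine-tuned LLaMA 3.2 3B")
    chatbot = gr.ChatInterface(
        generate_response,  # function defined earlier in app.py
        chatbot=gr.Chatbot(height=400),
        textbox=gr.Textbox(placeholder="Type your message here...", container=False, scale=7),
    )

demo.launch(share=False, show_api=False)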