dillibabukadati committed
Commit 138200f · verified · 1 Parent(s): 58505de

Update app.py

Files changed (1):
  app.py  +93 -41
app.py CHANGED
@@ -1,82 +1,134 @@

Before (removed lines are prefixed with "-"):

import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from peft import PeftModel, PeftConfig
import gradio as gr
import os
import gc

# Free up memory
gc.collect()

-# Define paths and model names
-model_name = "meta-llama/Meta-Llama-3.2-3B-Instruct" # Base model (not quantized)
-adapter_name = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit" # Your adapter

-print("Loading base model in float16...")
try:
-    # Load the base model first (non-quantized)
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True
    )
-    print("Base model loaded successfully")
-
-    # Load your adapter configuration
-    peft_config = PeftConfig.from_pretrained(adapter_name)
-
-    # Apply the adapter to the base model
-    print("Applying adapter to base model...")
-    model = PeftModel.from_pretrained(base_model, adapter_name)

-    print("Model with adapter loaded successfully")
except Exception as e:
-    print(f"Error loading model with adapter: {e}")
-    raise RuntimeError("Could not load model")

# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to generate response
def generate_response(message, history):
-    # Format conversation history for the model
-    messages = []
-    for user_msg, assistant_msg in history:
-        messages.append({"role": "user", "content": user_msg})
-        messages.append({"role": "assistant", "content": assistant_msg})
-    messages.append({"role": "user", "content": message})
-
-    # Convert messages to the format expected by the model
-    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

-    # Tokenize and generate
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Free up memory before generation
    gc.collect()

    with torch.no_grad(): # Disable gradient calculation to save memory
-        outputs = model.generate(
            **inputs,
-            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95
        )

-    # Decode the response
-    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

-    # Extract just the assistant's response
-    assistant_response = full_response.split("<|assistant|>")[-1].strip()

-    return assistant_response

-# Launch Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("### 🦙 Chat with Your Fine-tuned LLaMA 3.2 3B")
-    chatbot = gr.ChatInterface(generate_response)

-demo.launch(show_api=False)
After (added lines are prefixed with "+"):

import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import gradio as gr
import os
+import json
import gc
+from huggingface_hub import hf_hub_download
+import shutil
+import tempfile

# Free up memory
gc.collect()

+print("Setting up model loading...")
+
+# Create a temporary directory for model modifications
+temp_dir = tempfile.mkdtemp()
+print(f"Created temporary directory: {temp_dir}")
+
+# Your model name
+model_name = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit"

try:
+    # Download the config.json file
+    print("Downloading configuration file...")
+    config_path = hf_hub_download(repo_id=model_name, filename="config.json")
+
+    # Load and modify the config to remove quantization
+    print("Modifying configuration...")
+    with open(config_path, 'r') as file:
+        config_dict = json.load(file)
+
+    # Remove any quantization configs
+    if "quantization_config" in config_dict:
+        del config_dict["quantization_config"]
+
+    # Save the modified config to the temp directory
+    modified_config_path = os.path.join(temp_dir, "config.json")
+    with open(modified_config_path, 'w') as file:
+        json.dump(config_dict, file)
+
+    print("Modified configuration saved")
+
+    # Now try to load with the modified config
+    print("Loading model with modified configuration...")
+    config = AutoConfig.from_pretrained(temp_dir)
+
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
+        config=config,
        device_map="auto",
        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        trust_remote_code=True
    )
+    print("Model loaded successfully")

except Exception as e:
+    print(f"Error during custom loading: {e}")
+    # If the first approach fails, try a direct approach with explicit params
+    print("Attempting alternative loading method...")
+    try:
+        base_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map="auto",
+            load_in_4bit=False,
+            load_in_8bit=False,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+        )
+        print("Model loaded with alternative method")
+    except Exception as e2:
+        print(f"Error with alternative loading: {e2}")
+        raise RuntimeError("Failed to load model in any format")
+finally:
+    # Clean up temp directory
+    shutil.rmtree(temp_dir)
+    print(f"Cleaned up temporary directory")

# Load tokenizer
+print("Loading tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Function to generate response
def generate_response(message, history):
+    # Format history for the model
+    prompt = ""
+    if history:
+        for user_msg, assistant_msg in history:
+            prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
+    prompt += f"User: {message}\nAssistant: "

+    print(f"Tokenizing input...")
+    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)

    # Free up memory before generation
    gc.collect()

+    print(f"Generating response...")
    with torch.no_grad(): # Disable gradient calculation to save memory
+        outputs = base_model.generate(
            **inputs,
+            max_new_tokens=256, # Reduced from 300 to conserve memory
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95
        )

+    print(f"Decoding response...")
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

+    # Extract only the assistant's response
+    if "Assistant: " in response:
+        response = response.split("Assistant: ")[-1]

+    return response

+# Launch Gradio UI with memory-efficient settings
+print("Setting up Gradio interface...")
with gr.Blocks() as demo:
    gr.Markdown("### 🦙 Chat with Your Fine-tuned LLaMA 3.2 3B")
+    chatbot = gr.ChatInterface(
+        generate_response,
+        chatbot=gr.Chatbot(height=400),
+        textbox=gr.Textbox(placeholder="Type your message here...", container=False, scale=7),
+        submit_btn="Send",
+        retry_btn="Retry",
+        clear_btn="Clear",
+    )

+print("Launching interface...")
+demo.launch(share=False, show_api=False) # Disable sharing and API to save resources
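For reference, the rewritten generate_response drops the tokenizer chat template used in the old version in favour of plain "User:"/"Assistant:" text formatting. Below is a minimal standalone sketch of the prompt string that formatting produces; the sample history and message are made up for illustration and are not part of the commit:

# Illustration only: reproduces the prompt-building logic of the new
# generate_response without loading the model or tokenizer.
history = [("Hi", "Hello! How can I help you today?")]  # hypothetical history
message = "Summarize LLaMA 3.2 in one sentence."        # hypothetical user turn

prompt = ""
if history:
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
prompt += f"User: {message}\nAssistant: "

print(prompt)
# User: Hi
# Assistant: Hello! How can I help you today?
# User: Summarize LLaMA 3.2 in one sentence.
# Assistant:

This sketch assumes the tuple-style (user, assistant) history that gr.ChatInterface passes by default in the Gradio releases this code appears to target; the split on "Assistant: " at the end of generate_response then keeps only the final model turn.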