Reubencf committed
Commit b32d25e · verified · 1 Parent(s): 6f40086

Update app.py
Files changed (1): app.py +237 -110
app.py CHANGED
@@ -1,56 +1,95 @@
-# app.py — Optimized for Hugging Face Spaces Free Tier (CPU-only)

import os
import gc
import torch
import gradio as gr
from typing import List, Tuple

-from peft import PeftConfig, PeftModel
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ── Configuration ──────────────────────────────────────────────────────────────
-HF_TOKEN = os.environ.get("HF_TOKEN")  # set in Space → Settings → Variables & secrets
-ADAPTER_ID = "Reubencf/gemma3-goan-finetuned"  # your LoRA adapter repo

-# Free tier optimization flags
-USE_8BIT = False  # Set to True if you have access to GPU tier
-MAX_MEMORY = "15GB"  # Conservative for free tier
-DEVICE = "cpu"  # Force CPU for free tier

TITLE = "🌴 Gemma Goan Q&A Bot"
DESCRIPTION = """
-Gemma-3-4B-Instruct base + LoRA adapter fine-tuned on a Goan Q&A dataset.
Ask about Goa, Konkani culture, or general topics!

-**Adapter**: https://huggingface.co/Reubencf/gemma3-goan-finetuned
-
-⚠️ **Note**: Running on free tier (CPU). Responses may be slower. For faster inference, consider upgrading to GPU tier.
"""

-# ── Load model + tokenizer (optimized for free tier) ───────────────────────────
def load_model_and_tokenizer():
-    """Load model with memory optimizations for free tier"""

-    print("[Init] Starting model load for free tier...")

-    # Get the base model ID from adapter config
-    peft_cfg = PeftConfig.from_pretrained(ADAPTER_ID, token=HF_TOKEN)
-    base_id = peft_cfg.base_model_name_or_path
-    print(f"[Load] Base model: {base_id}")

-    # Memory cleanup before loading
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
-        # Load base model with memory optimizations
-        print("[Load] Loading base model with CPU optimizations...")

-        # Quantization config (only if GPU available and enabled)
        quantization_config = None
        if USE_8BIT and torch.cuda.is_available():
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                bnb_8bit_compute_dtype=torch.float16
@@ -58,74 +97,125 @@ def load_model_and_tokenizer():

        # Load base model
        base_model = AutoModelForCausalLM.from_pretrained(
-            base_id,
            token=HF_TOKEN,
            trust_remote_code=True,
            quantization_config=quantization_config,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float32 if DEVICE == "cpu" else torch.float16,
-            device_map=None,  # We'll move manually
-            max_memory={0: MAX_MEMORY} if torch.cuda.is_available() else None,
        )

-        # Move to device
-        if DEVICE == "cpu":
            base_model = base_model.to("cpu")
-            print("[Load] Model moved to CPU")

-        # Load and apply LoRA adapter
-        print("[Load] Loading LoRA adapter...")
-        model = PeftModel.from_pretrained(
-            base_model,
-            ADAPTER_ID,
            token=HF_TOKEN,
            trust_remote_code=True,
-            is_trainable=False,  # Inference only
        )

-        # Merge adapter with base (reduces memory overhead during inference)
-        print("[Load] Merging adapter for efficiency...")
-        model = model.merge_and_unload()

-        print("[Load] Model loaded successfully!")

    except Exception as e:
-        print(f"[Error] Failed to load model: {e}")
-        raise gr.Error(
-            f"Failed to load model. This may be due to memory constraints on free tier. "
-            f"Consider using a smaller model or upgrading to GPU tier. Error: {str(e)}"
-        )
-
-    # Load tokenizer
-    print("[Load] Loading tokenizer...")
-    tokenizer = AutoTokenizer.from_pretrained(
-        base_id,
-        token=HF_TOKEN,
-        use_fast=True,
-        trust_remote_code=True,
-    )
-
-    # Set padding token
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "left"  # Better for generation
-
-    # Set model to eval mode
-    model.eval()
-
-    # Memory cleanup
-    gc.collect()
-
-    return model, tokenizer, base_id

-# Load model globally (done once at startup)
try:
-    model, tokenizer, BASE_ID = load_model_and_tokenizer()
    MODEL_LOADED = True
except Exception as e:
    print(f"[Fatal] Could not load model: {e}")
    MODEL_LOADED = False
-    model, tokenizer, BASE_ID = None, None, None

# ── Generation function ─────────────────────────────────────────────────────────
def generate_response(
@@ -136,81 +226,108 @@ def generate_response(
    top_p: float = 0.95,
    repetition_penalty: float = 1.1,
) -> str:
-    """Generate response using the fine-tuned model"""

    if not MODEL_LOADED:
-        return "⚠️ Model failed to load. This usually happens due to memory constraints on the free tier. Please try again later or contact the space owner."

    try:
-        # Build conversation history
        conversation = []
-        for user_msg, assistant_msg in history:
-            if user_msg:
-                conversation.append({"role": "user", "content": user_msg})
-            if assistant_msg:
-                conversation.append({"role": "assistant", "content": assistant_msg})
        conversation.append({"role": "user", "content": message})

        # Apply chat template
-        prompt = tokenizer.apply_chat_template(
-            conversation,
-            add_generation_prompt=True,
-            return_tensors="pt"
-        )

-        # Move to model device
-        prompt = prompt.to(model.device)

-        # Generate with memory-efficient settings
        with torch.no_grad():
-            # Use cache for faster generation
            outputs = model.generate(
                input_ids=prompt,
-                max_new_tokens=min(int(max_new_tokens), 256),  # Cap for free tier
                temperature=float(temperature),
                top_p=float(top_p),
                repetition_penalty=float(repetition_penalty),
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
-                use_cache=True,  # Enable KV cache
            )

-        # Decode only the generated tokens
        generated_tokens = outputs[0][prompt.shape[-1]:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

-        # Memory cleanup after generation
        del outputs, prompt, generated_tokens
        gc.collect()

        return response

-    except torch.cuda.OutOfMemoryError:
-        gc.collect()
-        torch.cuda.empty_cache()
-        return "⚠️ Out of memory. Try reducing max_new_tokens or restarting the space."
    except Exception as e:
-        return f"⚠️ Error generating response: {str(e)}"

# ── Gradio Interface ────────────────────────────────────────────────────────────
examples = [
    ["What is the capital of Goa?"],
    ["Tell me about Konkani language"],
-    ["What are the famous beaches in Goa?"],
    ["Describe Goan fish curry"],
    ["What is the history of Old Goa?"],
]

-# Create the chat interface
if MODEL_LOADED:
    demo = gr.ChatInterface(
        fn=generate_response,
        title=TITLE,
        description=DESCRIPTION,
        examples=examples,
-        retry_btn=None,  # Disable retry to save memory
-        undo_btn=None,  # Disable undo to save memory
        additional_inputs=[
            gr.Slider(
                minimum=0.1,
@@ -222,7 +339,7 @@ if MODEL_LOADED:
            gr.Slider(
                minimum=32,
                maximum=256,
-                value=128,  # Reduced default for free tier
                step=16,
                label="Max new tokens"
            ),
@@ -244,21 +361,31 @@ if MODEL_LOADED:
        theme=gr.themes.Soft(),
    )
else:
-    # Fallback interface if model fails to load
    demo = gr.Interface(
-        fn=lambda x: "⚠️ Model failed to load. Please check the logs or try restarting the space.",
        inputs=gr.Textbox(label="Message"),
        outputs=gr.Textbox(label="Response"),
        title=TITLE,
-        description="**Error**: Model could not be loaded. This is likely due to memory constraints on the free tier.",
    )

-# Queue for handling multiple users
-demo.queue(
-    concurrency_count=1,  # Process one at a time to save memory
-    max_size=10,  # Reduced queue size for free tier
-)

-# Launch the app
if __name__ == "__main__":
    demo.launch()

+# app.py — Corrected for proper LoRA adapter loading

import os
import gc
import torch
import gradio as gr
from typing import List, Tuple
+import warnings
+warnings.filterwarnings('ignore')

+try:
+    from peft import PeftConfig, PeftModel
+    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+    IMPORTS_OK = True
+except ImportError as e:
+    IMPORTS_OK = False
+    print(f"Missing dependencies: {e}")
+    print("Please install: pip install transformers peft torch gradio accelerate")

# ── Configuration ──────────────────────────────────────────────────────────────
+HF_TOKEN = os.environ.get("HF_TOKEN")  # Optional for public models
+
+# Your LoRA adapter location (HuggingFace repo or local path)
+ADAPTER_ID = "Reubencf/gemma3-goan-finetuned"
+# For local adapter: ADAPTER_ID = "./path/to/your/adapter"
+
+# Base model - MUST match what you used for fine-tuning!
+# Check your adapter's config.json for "base_model_name_or_path"
+BASE_MODEL_ID = "google/gemma-2b-it"  # Change this to your actual base model
+# Common options:
+# - "google/gemma-2b-it" (2B parameters, easier on memory)
+# - "unsloth/gemma-2-2b-it-bnb-4bit" (quantized version)
+# - Your actual base model used for training

+# Settings
+USE_8BIT = False  # Set to True if you have GPU and want to use 8-bit quantization
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

TITLE = "🌴 Gemma Goan Q&A Bot"
DESCRIPTION = """
+Gemma base model + LoRA adapter fine-tuned on a Goan Q&A dataset.
Ask about Goa, Konkani culture, or general topics!

+**Status**: {}
"""

+# ── Load model + tokenizer (correct LoRA loading) ──────────────────────────────
def load_model_and_tokenizer():
+    """Load base model and apply LoRA adapter correctly"""

+    if not IMPORTS_OK:
+        raise ImportError("Required packages not installed")

+    print("[Init] Starting model load...")
+    print(f"[Config] Base model: {BASE_MODEL_ID}")
+    print(f"[Config] LoRA adapter: {ADAPTER_ID}")
+    print(f"[Config] Device: {DEVICE}")

+    # Memory cleanup
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

+    status = ""
+    model = None
+    tokenizer = None
+
    try:
+        # Step 1: Try to read adapter config to get the correct base model
+        actual_base_model = BASE_MODEL_ID
+        try:
+            print(f"[Load] Checking adapter configuration...")
+            peft_config = PeftConfig.from_pretrained(ADAPTER_ID, token=HF_TOKEN)
+            actual_base_model = peft_config.base_model_name_or_path
+            print(f"[Load] Adapter expects base model: {actual_base_model}")
+
+            # Warn if mismatch
+            if actual_base_model != BASE_MODEL_ID:
+                print(f"[Warning] BASE_MODEL_ID ({BASE_MODEL_ID}) doesn't match adapter's base ({actual_base_model})")
+                print(f"[Load] Using adapter's base model: {actual_base_model}")
+        except Exception as e:
+            print(f"[Warning] Cannot read adapter config: {e}")
+            print(f"[Load] Will try with configured base model: {BASE_MODEL_ID}")
+            actual_base_model = BASE_MODEL_ID
+
+        # Step 2: Load the BASE MODEL (not the adapter!)
+        print(f"[Load] Loading base model: {actual_base_model}")

+        # Quantization config for GPU
        quantization_config = None
        if USE_8BIT and torch.cuda.is_available():
+            print("[Load] Using 8-bit quantization")
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                bnb_8bit_compute_dtype=torch.float16

        # Load base model
        base_model = AutoModelForCausalLM.from_pretrained(
+            actual_base_model,
            token=HF_TOKEN,
            trust_remote_code=True,
            quantization_config=quantization_config,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float32 if DEVICE == "cpu" else torch.float16,
+            device_map="auto" if torch.cuda.is_available() else None,
        )

+        # Move to device if needed
+        if DEVICE == "cpu" and not torch.cuda.is_available():
            base_model = base_model.to("cpu")
+            print("[Load] Model on CPU")

+        print("[Load] Base model loaded successfully")
+
+        # Step 3: Load tokenizer from the BASE MODEL
+        print(f"[Load] Loading tokenizer from base model...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            actual_base_model,
            token=HF_TOKEN,
+            use_fast=True,
            trust_remote_code=True,
        )

+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.padding_side = "left"

+        # Step 4: Try to apply LoRA adapter
+        try:
+            print(f"[Load] Applying LoRA adapter: {ADAPTER_ID}")
+            model = PeftModel.from_pretrained(
+                base_model,
+                ADAPTER_ID,
+                token=HF_TOKEN,
+                trust_remote_code=True,
+                is_trainable=False,  # Inference only
+            )
+
+            # Optional: Merge adapter with base model for faster inference
+            # This combines the weights permanently (uses more memory initially but faster inference)
+            merge = input("\n💡 Merge adapter for faster inference? (y/n, default=y): ").strip().lower()
+            if merge != 'n':
+                print("[Load] Merging adapter with base model...")
+                model = model.merge_and_unload()
+                print("[Load] Adapter merged successfully")
+                status = f"✅ Using fine-tuned model (merged): {ADAPTER_ID}"
+            else:
+                print("[Load] Using adapter without merging")
+                status = f"✅ Using fine-tuned model: {ADAPTER_ID}"
+
+        except FileNotFoundError as e:
+            print(f"[Error] Adapter files not found: {e}")
+            print("[Fallback] Using base model without fine-tuning")
+            model = base_model
+            status = f"⚠️ Adapter not found. Using base model only: {actual_base_model}"
+
+        except Exception as e:
+            print(f"[Error] Failed to load adapter: {e}")
+            print("[Fallback] Using base model without fine-tuning")
+            model = base_model
+            status = f"⚠️ Could not load adapter. Using base model only: {actual_base_model}"
+
+        # Step 5: Final setup
+        model.eval()
+        print(f"[Load] Model ready on {DEVICE}!")
+
+        # Memory cleanup
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        return model, tokenizer, status

    except Exception as e:
+        error_msg = f"Failed to load model: {str(e)}"
+        print(f"[Fatal] {error_msg}")
+
+        # Try fallback to smallest model
+        if "gemma-2b" not in BASE_MODEL_ID.lower():
+            print("[Fallback] Trying with gemma-2b-it...")
+            try:
+                base_model = AutoModelForCausalLM.from_pretrained(
+                    "google/gemma-2b-it",
+                    token=HF_TOKEN,
+                    trust_remote_code=True,
+                    low_cpu_mem_usage=True,
+                    torch_dtype=torch.float32,
+                    device_map=None,
+                ).to("cpu")
+
+                tokenizer = AutoTokenizer.from_pretrained(
+                    "google/gemma-2b-it",
+                    token=HF_TOKEN,
+                    trust_remote_code=True,
+                )
+                if tokenizer.pad_token is None:
+                    tokenizer.pad_token = tokenizer.eos_token
+
+                base_model.eval()
+                return base_model, tokenizer, "⚠️ Using fallback model: gemma-2b-it (no fine-tuning)"
+
+            except Exception as fallback_error:
+                print(f"[Fatal] Fallback also failed: {fallback_error}")
+                raise gr.Error(f"Cannot load any model. Check your configuration.")
+        else:
+            raise gr.Error(error_msg)

+# Load model globally
try:
+    model, tokenizer, STATUS_MSG = load_model_and_tokenizer()
    MODEL_LOADED = True
+    DESCRIPTION = DESCRIPTION.format(STATUS_MSG)
except Exception as e:
    print(f"[Fatal] Could not load model: {e}")
    MODEL_LOADED = False
+    model, tokenizer = None, None
+    DESCRIPTION = DESCRIPTION.format(f"❌ Model failed to load: {str(e)[:100]}")

# ── Generation function ─────────────────────────────────────────────────────────
def generate_response(
    top_p: float = 0.95,
    repetition_penalty: float = 1.1,
) -> str:
+    """Generate response using the model"""

    if not MODEL_LOADED:
+        return "⚠️ Model failed to load. Please check the logs or restart the application."

    try:
+        # Build conversation
        conversation = []
+        if history:
+            # Keep last 3 exchanges for context
+            for user_msg, assistant_msg in history[-3:]:
+                if user_msg:
+                    conversation.append({"role": "user", "content": user_msg})
+                if assistant_msg:
+                    conversation.append({"role": "assistant", "content": assistant_msg})
        conversation.append({"role": "user", "content": message})

        # Apply chat template
+        try:
+            prompt = tokenizer.apply_chat_template(
+                conversation,
+                add_generation_prompt=True,
+                return_tensors="pt"
+            )
+        except Exception as e:
+            print(f"[Warning] Chat template failed: {e}, using fallback format")
+            # Fallback format
+            prompt_text = ""
+            for msg in conversation:
+                if msg["role"] == "user":
+                    prompt_text += f"User: {msg['content']}\n"
+                else:
+                    prompt_text += f"Assistant: {msg['content']}\n"
+            prompt_text += "Assistant: "
+
+            inputs = tokenizer(
+                prompt_text,
+                return_tensors="pt",
+                truncation=True,
+                max_length=512
+            )
+            prompt = inputs.input_ids

+        # Move to device
+        prompt = prompt.to(model.device if hasattr(model, 'device') else DEVICE)

+        # Generate
+        print(f"[Generate] Input length: {prompt.shape[-1]} tokens")
        with torch.no_grad():
            outputs = model.generate(
                input_ids=prompt,
+                max_new_tokens=min(int(max_new_tokens), 256),
                temperature=float(temperature),
                top_p=float(top_p),
                repetition_penalty=float(repetition_penalty),
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
+                use_cache=True,
            )

+        # Decode only generated tokens
        generated_tokens = outputs[0][prompt.shape[-1]:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

+        print(f"[Generate] Output length: {len(generated_tokens)} tokens")
+
+        # Cleanup
        del outputs, prompt, generated_tokens
        gc.collect()

        return response

    except Exception as e:
+        error_msg = f"⚠️ Error generating response: {str(e)}"
+        print(f"[Error] {error_msg}")
+
+        # Try to recover memory
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        return error_msg

# ── Gradio Interface ────────────────────────────────────────────────────────────
examples = [
    ["What is the capital of Goa?"],
    ["Tell me about Konkani language"],
+    ["What are famous beaches in Goa?"],
    ["Describe Goan fish curry"],
    ["What is the history of Old Goa?"],
]

+# Create interface
if MODEL_LOADED:
    demo = gr.ChatInterface(
        fn=generate_response,
        title=TITLE,
        description=DESCRIPTION,
        examples=examples,
+        retry_btn=None,
+        undo_btn=None,
        additional_inputs=[
            gr.Slider(
                minimum=0.1,
            gr.Slider(
                minimum=32,
                maximum=256,
+                value=128,
                step=16,
                label="Max new tokens"
            ),
        theme=gr.themes.Soft(),
    )
else:
    demo = gr.Interface(
+        fn=lambda x: "Model failed to load. Check console for errors.",
        inputs=gr.Textbox(label="Message"),
        outputs=gr.Textbox(label="Response"),
        title=TITLE,
+        description=DESCRIPTION,
    )

+# Queue with version compatibility
+try:
+    # Try newer Gradio syntax first (4.x)
+    demo.queue(default_concurrency_limit=1, max_size=10)
+except TypeError:
+    try:
+        # Fall back to older syntax (3.x)
+        demo.queue(concurrency_count=1, max_size=10)
+    except:
+        # If both fail, try without parameters
+        demo.queue()

if __name__ == "__main__":
+    print("\n" + "="*50)
+    print(f"🚀 Starting Gradio app on {DEVICE}...")
+    print(f"📍 Base model: {BASE_MODEL_ID}")
+    print(f"🔧 LoRA adapter: {ADAPTER_ID}")
+    print("="*50 + "\n")
+
    demo.launch()
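
The corrected load path hinges on BASE_MODEL_ID agreeing with the base model recorded in the adapter's config ("base_model_name_or_path"), which app.py resolves at runtime via PeftConfig. A minimal standalone check, assuming peft is installed and the adapter repo is reachable (set HF_TOKEN if it is private), could be run before deploying:

# Sketch: print the base model the LoRA adapter was trained against,
# i.e. the value BASE_MODEL_ID should be set to. Assumes `peft` is installed
# and the adapter repo is public or HF_TOKEN grants access.
import os
from peft import PeftConfig

ADAPTER_ID = "Reubencf/gemma3-goan-finetuned"
cfg = PeftConfig.from_pretrained(ADAPTER_ID, token=os.environ.get("HF_TOKEN"))
print("Adapter base model:", cfg.base_model_name_or_path)

If the printed value differs from the configured "google/gemma-2b-it", the app logs a mismatch warning and prefers the adapter's own base model.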
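
The queue shim near the end of the new app.py relies on Gradio 4.x raising TypeError for the removed concurrency_count keyword and accepting default_concurrency_limit instead. A small diagnostic, separate from the app itself, can report which signature the installed Gradio exposes; this is only an illustration of the same version check:

# Sketch: inspect gr.Blocks.queue to see which concurrency keyword the
# installed Gradio accepts, mirroring the try/except fallback in app.py.
import inspect
import gradio as gr

params = inspect.signature(gr.Blocks.queue).parameters
if "default_concurrency_limit" in params:
    print("Gradio 4.x: demo.queue(default_concurrency_limit=1, max_size=10)")
elif "concurrency_count" in params:
    print("Gradio 3.x: demo.queue(concurrency_count=1, max_size=10)")
else:
    print("Unrecognized signature; use demo.queue() with defaults")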