rajeshlion committed on
Commit bfed36f · verified · 1 Parent(s): dcf09e5

Update app.py

Files changed (1)
  1. app.py +137 -69
app.py CHANGED
@@ -403,86 +403,153 @@
  import os
  import gradio as gr
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
- MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
- # Load once at startup
- print(f"🔧 Loading local model: {MODEL_ID}")
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
- model = AutoModelForCausalLM.from_pretrained(
-     MODEL_ID,
-     torch_dtype=torch.float32,  # CPU-friendly
  )
- model.eval()

- def build_prompt(system_message: str, history, user_msg: str) -> str:
-     """Try to use the model's chat template if present; otherwise use a generic prompt."""
-     messages = []
-     if system_message:
-         messages.append({"role": "system", "content": system_message})
-     for u, a in (history or []):
-         if u:
-             messages.append({"role": "user", "content": u})
-         if a:
-             messages.append({"role": "assistant", "content": a})
-     messages.append({"role": "user", "content": user_msg})
-
-     # Use chat template when available
-     try:
-         if getattr(tokenizer, "chat_template", None):
-             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     except Exception:
-         pass
-
-     # Fallback generic formatting
-     parts = []
-     if system_message:
-         parts.append(f"System: {system_message}")
      for u, a in (history or []):
          if u:
-             parts.append(f"User: {u}")
          if a:
-             parts.append(f"Assistant: {a}")
-     parts.append(f"User: {user_msg}")
-     parts.append("Assistant:")
-     return "\n".join(parts)
-
- def respond(message, history, system_message, max_tokens, temperature, top_p):
-     prompt = build_prompt(system_message, history, message)
-     inputs = tokenizer(prompt, return_tensors="pt")
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=int(max_tokens),
-             do_sample=True,
-             temperature=float(temperature),
-             top_p=float(top_p),
-             pad_token_id=tokenizer.eos_token_id,
-             eos_token_id=tokenizer.eos_token_id,
-         )
-     # Decode only the newly generated portion
-     gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
-     text = tokenizer.decode(gen_ids, skip_special_tokens=True)
-
-     # Stream the text in chunks so the UI feels live
      acc = ""
-     for i in range(0, len(text), 40):
-         acc += text[i:i+40]
-         yield acc

  demo = gr.ChatInterface(
      respond,
      additional_inputs=[
-         gr.Textbox(
-             value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures: (a) Hindu, e.g., the Bhagavad Gita; (b) Jewish, e.g., the Torah; (c) Christian, e.g., the Bible. "
-                    "You will offer all three perspectives. You decline to answer other questions that do not relate to spirituality."),
-             label="System message",
-         ),
          gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
          gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
@@ -490,6 +557,7 @@ demo = gr.ChatInterface(
  )

  if __name__ == "__main__":
-     # share=True gives you a public link automatically
      demo.launch(share=True)

+ # import os
+ # import gradio as gr
+ # import torch
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # # You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
+ # MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+ # # Load once at startup
+ # print(f"🔧 Loading local model: {MODEL_ID}")
+ # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+ # model = AutoModelForCausalLM.from_pretrained(
+ #     MODEL_ID,
+ #     torch_dtype=torch.float32,  # CPU-friendly
+ # )
+ # model.eval()
+
+ # def build_prompt(system_message: str, history, user_msg: str) -> str:
+ #     """Try to use the model's chat template if present; otherwise use a generic prompt."""
+ #     messages = []
+ #     if system_message:
+ #         messages.append({"role": "system", "content": system_message})
+ #     for u, a in (history or []):
+ #         if u:
+ #             messages.append({"role": "user", "content": u})
+ #         if a:
+ #             messages.append({"role": "assistant", "content": a})
+ #     messages.append({"role": "user", "content": user_msg})
+
+ #     # Use chat template when available
+ #     try:
+ #         if getattr(tokenizer, "chat_template", None):
+ #             return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ #     except Exception:
+ #         pass
+
+ #     # Fallback generic formatting
+ #     parts = []
+ #     if system_message:
+ #         parts.append(f"System: {system_message}")
+ #     for u, a in (history or []):
+ #         if u:
+ #             parts.append(f"User: {u}")
+ #         if a:
+ #             parts.append(f"Assistant: {a}")
+ #     parts.append(f"User: {user_msg}")
+ #     parts.append("Assistant:")
+ #     return "\n".join(parts)
+
+ # def respond(message, history, system_message, max_tokens, temperature, top_p):
+ #     prompt = build_prompt(system_message, history, message)
+ #     inputs = tokenizer(prompt, return_tensors="pt")
+ #     with torch.no_grad():
+ #         outputs = model.generate(
+ #             **inputs,
+ #             max_new_tokens=int(max_tokens),
+ #             do_sample=True,
+ #             temperature=float(temperature),
+ #             top_p=float(top_p),
+ #             pad_token_id=tokenizer.eos_token_id,
+ #             eos_token_id=tokenizer.eos_token_id,
+ #         )
+ #     # Decode only the newly generated portion
+ #     gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
+ #     text = tokenizer.decode(gen_ids, skip_special_tokens=True)
+
+ #     # Stream the text in chunks so the UI feels live
+ #     acc = ""
+ #     for i in range(0, len(text), 40):
+ #         acc += text[i:i+40]
+ #         yield acc
+
+ # demo = gr.ChatInterface(
+ #     respond,
+ #     additional_inputs=[
+ #         gr.Textbox(
+ #             value=("You are a Chatbot who only answers spiritual questions based on three religious scriptures: (a) Hindu, e.g., the Bhagavad Gita; (b) Jewish, e.g., the Torah; (c) Christian, e.g., the Bible. "
+ #                    "You will offer all three perspectives. You decline to answer other questions that do not relate to spirituality."),
+ #             label="System message",
+ #         ),
+ #         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
+ #         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+ #         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+ #     ],
+ # )
+
+ # if __name__ == "__main__":
+ #     # share=True gives you a public link automatically
+ #     demo.launch(share=True)
+
+
  import os
  import gradio as gr
+ from llama_cpp import Llama
+
+ # Small, fast, chat-tuned GGUF (≈0.5B params, 4-bit quant)
+ REPO_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
+ FILENAME = os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
+
+ N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
+ CTX = int(os.getenv("CTX", "2048"))
+
+ print(f"🔧 Loading {REPO_ID}/{FILENAME} with {N_THREADS} threads, ctx={CTX}")
+ llm = Llama.from_pretrained(
+     repo_id=REPO_ID,
+     filename=FILENAME,
+     n_ctx=CTX,
+     n_threads=N_THREADS,
+     n_gpu_layers=0,  # CPU only
+     logits_all=False,
+     verbose=False,
  )

+ SYSTEM_DEFAULT = (
+     "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
+     "and politely declines other questions."
+ )
+
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
+     sysmsg = system_message or SYSTEM_DEFAULT
+     msgs = [{"role": "system", "content": sysmsg}]
      for u, a in (history or []):
          if u:
+             msgs.append({"role": "user", "content": u})
          if a:
+             msgs.append({"role": "assistant", "content": a})
+     msgs.append({"role": "user", "content": message})
+
+     stream = llm.create_chat_completion(
+         messages=msgs,
+         temperature=float(temperature),
+         top_p=float(top_p),
+         max_tokens=int(max_tokens),
+         stream=True,
+     )
      acc = ""
+     for chunk in stream:
+         delta = chunk["choices"][0]["delta"]
+         tok = delta.get("content", "")
+         if tok:
+             acc += tok
+             yield acc

  demo = gr.ChatInterface(
      respond,
      additional_inputs=[
+         gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
          gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
          gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),

  )

  if __name__ == "__main__":
+     print(f"🧵 Threads: {N_THREADS}")
      demo.launch(share=True)

+
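
The new respond() consumes llama-cpp-python's streaming chat-completion API and re-yields the accumulated text to Gradio. Below is a minimal standalone sketch of that same pattern outside Gradio, assuming llama-cpp-python and huggingface_hub are installed (e.g., via the Space's requirements.txt); the repo and filename simply mirror the defaults in the new app.py.

# Sketch: consume the streaming chat completion that app.py's respond() relies on.
# Assumes `pip install llama-cpp-python huggingface_hub`; model names mirror app.py defaults.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-0.5B-Instruct-GGUF",
    filename="qwen2.5-0.5b-instruct-q4_k_m.gguf",
    n_ctx=2048,
    verbose=False,
)

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Say hello in one sentence."},
]

# With stream=True, each chunk's choices[0]["delta"] may carry a "content" piece;
# the Gradio generator accumulates these pieces and yields the growing string.
for chunk in llm.create_chat_completion(messages=messages, max_tokens=64, stream=True):
    piece = chunk["choices"][0]["delta"].get("content", "")
    print(piece, end="", flush=True)
print()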