Praga-6000 committed
Commit 5466c6a · verified · 1 Parent(s): e7e48d3

Update app.py

Files changed (1):
  app.py +44 -15
app.py CHANGED

@@ -1,7 +1,7 @@
  # app.py
  # RAG app for chatting with research papers (optimized for Hugging Face Spaces)

- import os, sys, subprocess, re, json, uuid, gc, hashlib
+ import os, sys, subprocess, re, json, uuid, gc
  from typing import List, Dict, Tuple

  # -----------------------------
@@ -16,15 +16,18 @@ def ensure(pkg, pip_name=None):
  ensure("torch")
  ensure("transformers")
  ensure("accelerate")
- ensure("bitsandbytes")
- ensure("faiss", "faiss-cpu")
  ensure("gradio")
+ ensure("faiss", "faiss-cpu")
  ensure("sentence_transformers", "sentence-transformers")
  ensure("pypdf")
  ensure("docx", "python-docx")

  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     TextIteratorStreamer
+ )
  from sentence_transformers import SentenceTransformer
  import faiss, gradio as gr
  from pypdf import PdfReader
@@ -37,9 +40,9 @@ os.makedirs(DATA_DIR, exist_ok=True)
  INDEX_PATH = os.path.join(DATA_DIR, "faiss.index")
  DOCS_PATH = os.path.join(DATA_DIR, "docs.jsonl")

- # Models
+ # Default Models
  default_emb_model = "allenai/specter2_base"
- default_llm_model = "meta-llama/Llama-3.1-8B-Instruct"
+ default_llm_model = "microsoft/Phi-3-mini-4k-instruct"

  EMB_MODEL_ID = os.environ.get("EMB_MODEL_ID", default_emb_model)
  LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", default_llm_model)
@@ -107,11 +110,13 @@ class VectorStore:
          embs = self._embed(chunks)
          faiss.normalize_L2(embs)
          self.index.add(embs)
+         recs = []
          for c in chunks:
              rec = {"id": str(uuid.uuid4()), "source": source, "text": c}
              self.meta.append(rec)
-             with open(DOCS_PATH, "a", encoding="utf-8") as f:
-                 f.write(json.dumps(rec) + "\n")
+             recs.append(json.dumps(rec))
+         with open(DOCS_PATH, "a", encoding="utf-8") as f:
+             f.write("\n".join(recs) + "\n")
          faiss.write_index(self.index, INDEX_PATH)
          return len(chunks)

@@ -134,14 +139,22 @@ print(f"[RAG] Loading embeddings: {EMB_MODEL_ID}")
  EMB = SentenceTransformer(EMB_MODEL_ID, device=DEVICE)
  VEC = VectorStore(EMB)

- print(f"[RAG] Loading LLM 4-bit: {LLM_MODEL_ID}")
- bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")
+ print(f"[RAG] Loading LLM: {LLM_MODEL_ID}")
+ bnb_config = None
+ if DEVICE == "cuda":
+     from transformers import BitsAndBytesConfig
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_use_double_quant=True,
+         bnb_4bit_quant_type="nf4"
+     )
+
  TOKENIZER = AutoTokenizer.from_pretrained(LLM_MODEL_ID, use_fast=True, trust_remote_code=True)
  LLM = AutoModelForCausalLM.from_pretrained(
      LLM_MODEL_ID,
      device_map="auto",
      quantization_config=bnb_config,
-     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16,
+     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
      low_cpu_mem_usage=True,
      trust_remote_code=True,
  )
@@ -153,14 +166,30 @@ SYSTEM_PROMPT = "You are a helpful assistant. Use the provided context from rese

  def build_prompt(query, history, retrieved):
      ctx = "\n\n".join([f"[{i+1}] {m['text']}" for i, (_, m) in enumerate(retrieved)])
-     hist = "".join([f"<user>{u}</user><assistant>{a}</assistant>" for u, a in history[-3:]])
-     return f"<system>{SYSTEM_PROMPT}\nContext:\n{ctx}</system>{hist}<user>{query}</user><assistant>"
+     # Try to use chat template if available
+     if hasattr(TOKENIZER, "apply_chat_template"):
+         messages = [{"role": "system", "content": SYSTEM_PROMPT + "\nContext:\n" + ctx}]
+         for u, a in history[-3:]:
+             messages.append({"role": "user", "content": u})
+             messages.append({"role": "assistant", "content": a})
+         messages.append({"role": "user", "content": query})
+         return TOKENIZER.apply_chat_template(messages, tokenize=False)
+     else:
+         # Fallback manual prompt
+         hist = "".join([f"<user>{u}</user><assistant>{a}</assistant>" for u, a in history[-3:]])
+         return f"<system>{SYSTEM_PROMPT}\nContext:\n{ctx}</system>{hist}<user>{query}</user><assistant>"

  @torch.inference_mode()
  def generate_answer(prompt, temperature=0.3, max_new_tokens=512):
      streamer = TextIteratorStreamer(TOKENIZER, skip_prompt=True, skip_special_tokens=True)
      inputs = TOKENIZER([prompt], return_tensors="pt").to(LLM.device)
-     kwargs = dict(**inputs, max_new_tokens=max_new_tokens, temperature=temperature, do_sample=temperature>0, streamer=streamer)
+     kwargs = dict(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         temperature=temperature,
+         do_sample=temperature > 0,
+         streamer=streamer
+     )
      import threading
      t = threading.Thread(target=LLM.generate, kwargs=kwargs)
      t.start()
@@ -200,7 +229,7 @@ def ui_chat(msg, history, top_k, temperature, max_tokens):
      yield history + [(msg, reply)], ""

  with gr.Blocks() as demo:
-     gr.Markdown("# 🔎📚 Research Paper RAG Chat (Llama-3.1-8B 4-bit + Specter2)")
+     gr.Markdown("# 🔎📚 Research Paper RAG Chat (Phi-3-mini + Specter2)")
      with gr.Row():
          with gr.Column(scale=2):
              chatbot = gr.Chatbot(height=500)
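
VectorStore.add L2-normalises the chunk embeddings before adding them to the FAISS index; the matching search path is outside the hunks above. A minimal sketch of that query side, assuming an inner-product index and that retrieved items are (score, metadata) pairs as build_prompt expects; the search_chunks name and arguments are illustrative, not part of this commit:

# Sketch only: query side paired with the L2-normalised index built in VectorStore.add.
# With normalised vectors, an inner-product FAISS index returns cosine similarity.
import numpy as np
import faiss

def search_chunks(index, embedder, meta, query, top_k=5):
    q = embedder.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q)                     # match the normalisation used at add() time
    scores, ids = index.search(q, top_k)      # FAISS returns (scores, indices)
    # Keep valid hits only; pair each score with its stored metadata record.
    return [(float(s), meta[i]) for s, i in zip(scores[0], ids[0]) if i != -1]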
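
The @@ -153,14 +166,30 @@ hunk stops right after the generation thread is started, so the loop that drains the streamer is not shown. A sketch of how a TextIteratorStreamer started this way is typically consumed; the stream_generate wrapper and its parameters are illustrative:

# Sketch only: generate() runs on a worker thread, the caller iterates the streamer.
import threading
from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, prompt, max_new_tokens=512, temperature=0.3):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=temperature > 0,
        streamer=streamer,
    )
    t = threading.Thread(target=model.generate, kwargs=kwargs)
    t.start()
    partial = ""
    for piece in streamer:      # yields decoded text chunks as they are produced
        partial += piece
        yield partial           # a Gradio handler can re-yield this for live updates
    t.join()                    # generation is done once the streamer is exhausted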
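
The rewritten persistence in VectorStore.add buffers one JSON object per chunk and appends them to docs.jsonl in a single write per batch; the startup code that reads those records back is not part of this diff. A small reload sketch, assuming each non-empty line of docs.jsonl is one record; the load_meta helper and the data/ path are assumptions:

# Sketch only: rebuild the in-memory metadata list from the appended JSONL records.
import json, os

DATA_DIR = "data"                                   # assumption: same directory app.py uses
DOCS_PATH = os.path.join(DATA_DIR, "docs.jsonl")

def load_meta(path=DOCS_PATH):
    meta = []
    if os.path.exists(path):
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:                            # tolerate blank lines between appends
                    meta.append(json.loads(line))
    return meta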