Spaces:

edouardfoussier
/

rag-rh-assistant

Sleeping

edouardfoussier committed on Aug 25

Commit

656726e

1 Parent(s): c0df2ba

first part MVP done

Files changed (3) hide show

app.py CHANGED Viewed

@@ -111,7 +111,6 @@ with gr.Blocks(theme="soft", fill_height=True) as demo:
         )
         topk = gr.Slider(1, 10, value=5, step=1, label="Top-K passages")
         # you can wire this later; not used now
-        save_history = gr.Checkbox(label="Ajouter un modèle eranker")
     with gr.Row():
         with gr.Column(scale=4):

         )
         topk = gr.Slider(1, 10, value=5, step=1, label="Top-K passages")
         # you can wire this later; not used now
     with gr.Row():
         with gr.Column(scale=4):

helpers.py CHANGED Viewed

@@ -94,6 +94,24 @@ def _group_sources_md(passages: list[dict], used_idxs: list[int]) -> str:
             lines.append(f"{i}. {title}{suffix}")
     return "\n".join(lines)
 # def sources_markdown(passages: list[dict]) -> str:
 #     if not passages:
 #         return "### Sources\n_(aucune)_"

             lines.append(f"{i}. {title}{suffix}")
     return "\n".join(lines)
+# Sanity Check
+def stats():
+    """Return quick information about the index and payloads."""
+    _ensure()
+    if _USE_FAISS:
+        n = _index.ntotal
+        dim = _index.d
+    else:
+        n = _index.shape[0]
+        dim = _index.shape[1]
+    return {
+        "backend": "faiss" if _USE_FAISS else "numpy",
+        "vectors": n,
+        "dim": dim,
+        "payloads": len(_payloads) if _payloads is not None else 0,
+        "datasets": [f"{name}:{split}" for name, split in DATASETS],
+    }
 # def sources_markdown(passages: list[dict]) -> str:
 #     if not passages:
 #         return "### Sources\n_(aucune)_"

rag/retrieval.py CHANGED Viewed

@@ -5,6 +5,7 @@ import numpy as np
 from datasets import load_dataset
 from huggingface_hub import InferenceClient
 EMBED_COL = os.getenv("EMBED_COL", "embeddings_bge-m3")
 DATASETS = [
     ("edouardfoussier/travail-emploi-clean", "train"),
@@ -126,4 +127,5 @@ def warm_up_async():
 def ensure_ready():
     """Build the index once and warm the embedding endpoint."""
     _ensure()          # builds FAISS/NumPy index + loads payloads
-    _ = embed("warmup")  # hits HF Inference API once to avoid cold-start

 from datasets import load_dataset
 from huggingface_hub import InferenceClient
 EMBED_COL = os.getenv("EMBED_COL", "embeddings_bge-m3")
 DATASETS = [
     ("edouardfoussier/travail-emploi-clean", "train"),
 def ensure_ready():
     """Build the index once and warm the embedding endpoint."""
     _ensure()          # builds FAISS/NumPy index + loads payloads
+    _ = embed("warmup")  # hits HF Inference API once to avoid cold-start