edouardfoussier commited on
Commit
656726e
·
1 Parent(s): c0df2ba

first part MVP done

Browse files
Files changed (3) hide show
  1. app.py +0 -1
  2. helpers.py +18 -0
  3. rag/retrieval.py +3 -1
app.py CHANGED
@@ -111,7 +111,6 @@ with gr.Blocks(theme="soft", fill_height=True) as demo:
111
  )
112
  topk = gr.Slider(1, 10, value=5, step=1, label="Top-K passages")
113
  # you can wire this later; not used now
114
- save_history = gr.Checkbox(label="Ajouter un modèle reranker")
115
 
116
  with gr.Row():
117
  with gr.Column(scale=4):
 
111
  )
112
  topk = gr.Slider(1, 10, value=5, step=1, label="Top-K passages")
113
  # you can wire this later; not used now
 
114
 
115
  with gr.Row():
116
  with gr.Column(scale=4):
helpers.py CHANGED
@@ -94,6 +94,24 @@ def _group_sources_md(passages: list[dict], used_idxs: list[int]) -> str:
94
  lines.append(f"{i}. {title}{suffix}")
95
  return "\n".join(lines)
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  # def sources_markdown(passages: list[dict]) -> str:
98
  # if not passages:
99
  # return "### Sources\n_(aucune)_"
 
94
  lines.append(f"{i}. {title}{suffix}")
95
  return "\n".join(lines)
96
 
97
+ # Sanity Check
98
+ def stats():
99
+ """Return quick information about the index and payloads."""
100
+ _ensure()
101
+ if _USE_FAISS:
102
+ n = _index.ntotal
103
+ dim = _index.d
104
+ else:
105
+ n = _index.shape[0]
106
+ dim = _index.shape[1]
107
+ return {
108
+ "backend": "faiss" if _USE_FAISS else "numpy",
109
+ "vectors": n,
110
+ "dim": dim,
111
+ "payloads": len(_payloads) if _payloads is not None else 0,
112
+ "datasets": [f"{name}:{split}" for name, split in DATASETS],
113
+ }
114
+
115
  # def sources_markdown(passages: list[dict]) -> str:
116
  # if not passages:
117
  # return "### Sources\n_(aucune)_"
rag/retrieval.py CHANGED
@@ -5,6 +5,7 @@ import numpy as np
5
  from datasets import load_dataset
6
  from huggingface_hub import InferenceClient
7
 
 
8
  EMBED_COL = os.getenv("EMBED_COL", "embeddings_bge-m3")
9
  DATASETS = [
10
  ("edouardfoussier/travail-emploi-clean", "train"),
@@ -126,4 +127,5 @@ def warm_up_async():
126
  def ensure_ready():
127
  """Build the index once and warm the embedding endpoint."""
128
  _ensure() # builds FAISS/NumPy index + loads payloads
129
- _ = embed("warmup") # hits HF Inference API once to avoid cold-start
 
 
5
  from datasets import load_dataset
6
  from huggingface_hub import InferenceClient
7
 
8
+
9
  EMBED_COL = os.getenv("EMBED_COL", "embeddings_bge-m3")
10
  DATASETS = [
11
  ("edouardfoussier/travail-emploi-clean", "train"),
 
127
  def ensure_ready():
128
  """Build the index once and warm the embedding endpoint."""
129
  _ensure() # builds FAISS/NumPy index + loads payloads
130
+ _ = embed("warmup") # hits HF Inference API once to avoid cold-start
131
+