Spaces:

pcreem
/

brown-cafe

Sleeping

App Files Files Community

Song committed about 1 month ago

Commit

f03b053

1 Parent(s): 9cbfa48

hi

Browse files

Files changed (1) hide show

app.py +153 -61

app.py CHANGED Viewed

@@ -83,14 +83,14 @@ LM_MODEL             = os.getenv("LM_MODEL")
 # ---------- 檢索設定（固定常數） ----------
 TOP_K_SENTENCES   = 10
-BM25_WEIGHT       = 0.6
-SEM_WEIGHT        = 0.4
 EMBEDDING_MODEL_ID= "DMetaSoul/Dmeta-embedding-zh"
-RERANKER_MODEL_ID = "BAAI/bge-reranker-v2-m3"
 USE_CPU           = True  # HF 預設 CPU
 RERANK_THRESHOLD  = 0.5
 MAX_CONTEXT_CHARS = 8000
-DISCLAIMER = "*免責聲明：本資訊僅供參考，若有疑問請諮詢醫師或藥師。*"
 # 藥名映射與停用詞（擴充）
 DRUG_NAME_MAPPING = {
@@ -123,6 +123,15 @@ INTENT_KEYWORDS = {
 }
 # 章節權重
 SECTION_WEIGHTS = {
     "用法及用量": 1.0,
     "病人使用須知": 1.0,
@@ -238,6 +247,7 @@ class State:
     bm25: Optional[Any] = None
     df_csv: Optional[pd.DataFrame] = None
     user_sessions: Dict[str, Dict[str, Any]] = {}  # 簡易 session 快取
 STATE = State()
@@ -328,7 +338,7 @@ def ensure_bm25(pkl_path: str, sentences: List[str]) -> Optional[Any]:
 def extract_drug_candidates_from_query(query: str) -> List[str]:
     parts = re.split(r"[:：]", query, maxsplit=1)
     drug_part = parts[0].strip() if len(parts) > 1 else query.strip()
-    tokens = tokenize_zh(drug_part)
     candidates = [t.lower() for t in tokens if len(t) > 2 and t not in DRUG_STOPWORDS]
     return [DRUG_NAME_MAPPING.get(c, c) for c in candidates]
@@ -336,6 +346,10 @@ def find_drug_ids_from_name(candidates: List[str], df: pd.DataFrame) -> List[str
     if df is None or df.empty: return []
     drug_ids = set()
     for cand in candidates:
         bucket = []
         for _, row in df.iterrows():
             name_joined = f"{(row.get('drug_name_zh') or '').lower()} {(row.get('drug_name_en') or '').lower()} {(row.get('drug_name_norm') or '').lower()}".strip()
@@ -346,12 +360,15 @@ def find_drug_ids_from_name(candidates: List[str], df: pd.DataFrame) -> List[str
                 )
             else:
                 raw = 100 if cand in name_joined else 0
-            if raw >= 72:
                 score = raw * (2.0 if re.search(r'[a-zA-Z]', cand) else 1.5) * (1 + len(cand)/20)
-                bucket.append((score, row["drug_id"]))
-        # 每個 cand 保留前 3 高分（或依你資料量調 1~5）
-        for _, did in sorted(bucket, reverse=True)[:3]:
-            drug_ids.add(did)
     return list(drug_ids)
 # ---------- 意圖偵測 ----------
@@ -364,26 +381,34 @@ def detect_intent(query: str) -> List[str]:
 # ---------- 檢索 ----------
 def rerank_results(query: str, candidates: List[Tuple[int, float, float, float]], sentences: List[str], reranker: Optional[Any], top_k: int, threshold: float) -> List[Dict[str, Any]]:
-    if not candidates:
-        return []
-    valid_indices = [i for (i, *_ ) in candidates if 0 <= i < len(sentences)]
-    if not valid_indices:
-        return []
-    if reranker is None:
-        return [{"idx": i, "score": fused} for i, fused, _, _ in sorted(candidates, key=lambda x: -x[1])[:top_k]]
-    pairs = [[query, sentences[i]] for i in valid_indices]
-    if not pairs:
-        return []
-    scores = reranker.predict(pairs)
-    reranked = [{"idx": valid_indices[j], "score": float(scores[j])} for j in range(len(scores)) if float(scores[j]) >= threshold]
-    if not reranked:
-        # 回退: 用融合分數排序前 top_k
-        sorted_candidates = sorted(candidates, key=lambda x: -x[1])[:top_k]
-        reranked = [{"idx": i, "score": fused} for i, fused, _, _ in sorted_candidates]
-    return sorted(reranked, key=lambda x: -x["score"])[:top_k]
-def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]], bm25: Optional[Any], index: Optional[Any], emb_model: Optional[Any], reranker: Optional[Any], top_k: int = 10) -> List[int]:
-    drug_ids = find_drug_ids_from_name(extract_drug_candidates_from_query(query), STATE.df_csv)
     log.info("Detected drug_ids: %s for query: %s", drug_ids, query[:50])
     if not drug_ids:
         log.warning("No drug_ids found; falling back to full corpus search.")
@@ -391,7 +416,6 @@ def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]]
     bm_results = []
     if bm25:
         scores = bm25.get_scores(tokenized_query)
-        # Min-max normalize BM25 scores
         scores_np = np.array(scores)
         if np.max(scores_np) > np.min(scores_np):
             scores_norm = (scores_np - np.min(scores_np)) / (np.max(scores_np) - np.min(scores_np))
@@ -427,6 +451,7 @@ def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]]
     candidates = [(i, sc, 0.0, 0.0) for i, sc in candidates]
     reranked = rerank_results(query, candidates, sentences, reranker, top_k, RERANK_THRESHOLD)
     idxs = [r["idx"] for r in reranked]
     return idxs
 def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, Any]]) -> str:
@@ -460,8 +485,8 @@ def build_prompt(query: str, contexts: str, intents: List[str]) -> str:
                 ts_parts.append("優先藥袋醫囑（如每日1顆，早餐後）。範圍 [Sxxx]。特殊：病人使用須知。")
         trouble_shooting = " ".join(ts_parts)
     return (
-        f"你是一位專業、有同理心的藥師。使用下列參考片段回答問題。若片段無相關資訊，請說不知道。{trouble_shooting}\n"
-        f"回答用台灣繁中，親切易懂，分2-3小段，每段<150字。末尾加'了解嗎？(回是/否)'。結尾加{DISCLAIMER}\n"
         f"問題：{query}\n"
         f"參考片段：\n{contexts}\n"
     )
@@ -520,53 +545,117 @@ def parse_user_message(query: str) -> Dict[str, str]:
 def make_clarify_message(drug_name_hint: str = "") -> str:
     msg = (
-        "抱歉，為了給您更準確的回答，請用這個格式描述（至少包含藥名及問題意圖）：\n"
-        "藥名＋劑型＋強度（例：普拿疼 500mg 錠）\n"
-        "症狀/目的（頭痛 6/10、發燒 38.5°C）\n"
-        "使用紀錄（今天 08:00 吃 1 顆、連用 2 天…）\n"
-        "對象（成人/兒童、孕哺、過敏/肝腎問題）\n"
-        "併用（其他藥/保健品/咖啡酒精/是否空腹）\n"
-        "你的問題（可否同用？多久可再吃？一天上限？）\n"
-        "例：普拿疼 500mg 錠，頭痛，今天 08:00 吃 1 顆，剛喝咖啡；想問現在可再吃嗎？一天上限多少？\n"
     )
     if drug_name_hint:
-        msg = f"目前無法識別特定藥名，我會先提供一般性建議。{drug_name_hint}請補充藥名、劑型與強度。\n" + msg
     return msg + DISCLAIMER
 async def answer_pipeline(query: str, user_id: str) -> str:
     if not query or not isinstance(query, str):
-        return "請提供有效問題。"
     if not STATE.sentences:
-        return "目前尚未載入語料，請稍後再試。"
     session = STATE.user_sessions.get(user_id, {})
     if "prev_query" in session and query.lower() in ["是", "否"]:
-        # 簡易互動
         if query.lower() == "是":
             return "太好了！若還有問題，請告訴我。" + DISCLAIMER
         else:
             return f"抱歉沒說明清楚。關於{session['prev_query']}，請再說詳細點，或直接問醫師。" + DISCLAIMER
-    # 新增: 解析與檢核
     parsed = parse_user_message(query)
     drug_name_hint = ""
     if "fentanyl" in query.lower():
         drug_name_hint = "你說的是 Fentanyl 經皮貼片（Duragesic）嗎？有寫幾 mcg/hr 嗎？"
-    drug_ids = find_drug_ids_from_name(extract_drug_candidates_from_query(query), STATE.df_csv)
-    if not drug_ids or not detect_intent(query):
         return make_clarify_message(drug_name_hint)
-    # 繼續原流程
     intents = detect_intent(query)
-    idxs = fuse_and_select(query, STATE.sentences, STATE.meta, STATE.bm25, STATE.faiss_index, STATE.emb_model, STATE.reranker_model, top_k=TOP_K_SENTENCES)
-    contexts = build_context(idxs, STATE.sentences, STATE.meta)
-    ans = None
-    if LM_MODEL and LITELLM_API_KEY and LITELLM_BASE_URL:
-        ans = call_llm(build_prompt(query, contexts, intents))
-    if not ans:
-        # 改進 fallback 格式
-        fallback_sentences = [STATE.sentences[i] for i in idxs[:3] if i >= 0]
-        ans = "以下是相關資訊：\n" + "\n".join([f"- {s}" for s in fallback_sentences]) if fallback_sentences else "抱歉，暫時找不到相關資訊。"
-        ans += "\n了解嗎？(回是/否)" + DISCLAIMER
-    STATE.user_sessions[user_id] = {"prev_query": query}
-    return ans
 # ---------- LINE 驗簽與回覆 ----------
 def verify_line_signature(body_bytes: bytes, signature: str) -> bool:
@@ -639,6 +728,9 @@ async def _startup():
     STATE.reranker_model = load_reranker_model(RERANKER_MODEL_ID)
     STATE.faiss_index = ensure_faiss(FAISS_INDEX, STATE.sentences)
     STATE.bm25 = ensure_bm25(BM25_PKL, STATE.sentences)
     if os.path.exists(CSV_PATH):
         STATE.df_csv = pd.read_csv(CSV_PATH, dtype=str)
     log.info("LLM via LiteLLM: base=%s model=%s", str(LITELLM_BASE_URL), str(LM_MODEL))

 # ---------- 檢索設定（固定常數） ----------
 TOP_K_SENTENCES   = 10
+BM25_WEIGHT       = 0.8
+SEM_WEIGHT        = 0.2
 EMBEDDING_MODEL_ID= "DMetaSoul/Dmeta-embedding-zh"
+RERANKER_MODEL_ID = "BAAI/bge-reranker-base"
 USE_CPU           = True  # HF 預設 CPU
 RERANK_THRESHOLD  = 0.5
 MAX_CONTEXT_CHARS = 8000
+DISCLAIMER = "此回覆僅供參考，請遵循醫師/藥師指示。"
 # 藥名映射與停用詞（擴充）
 DRUG_NAME_MAPPING = {
 }
 # 章節權重
+SECTION_NORMALIZE = {
+  "用法用量": "用法及用量",
+  "副作用不良反應": "不良反應",
+  "警語注意事項": "警語及注意事項",
+  "交互作用": "藥物交互作用",
+  "包裝及儲存": "儲存條件",
+  "儲存條件": "儲存條件"
+}
 SECTION_WEIGHTS = {
     "用法及用量": 1.0,
     "病人使用須知": 1.0,
     bm25: Optional[Any] = None
     df_csv: Optional[pd.DataFrame] = None
     user_sessions: Dict[str, Dict[str, Any]] = {}  # 簡易 session 快取
+    query_cache: Dict[str, Dict[str, Any]] = {}
 STATE = State()
 def extract_drug_candidates_from_query(query: str) -> List[str]:
     parts = re.split(r"[:：]", query, maxsplit=1)
     drug_part = parts[0].strip() if len(parts) > 1 else query.strip()
+    tokens = tokenize_zh(drug_part)  # 只取左側
     candidates = [t.lower() for t in tokens if len(t) > 2 and t not in DRUG_STOPWORDS]
     return [DRUG_NAME_MAPPING.get(c, c) for c in candidates]
     if df is None or df.empty: return []
     drug_ids = set()
     for cand in candidates:
+        # 先查映射
+        if cand in DRUG_NAME_MAPPING:
+            drug_ids.add(DRUG_NAME_MAPPING[cand])
+            continue
         bucket = []
         for _, row in df.iterrows():
             name_joined = f"{(row.get('drug_name_zh') or '').lower()} {(row.get('drug_name_en') or '').lower()} {(row.get('drug_name_norm') or '').lower()}".strip()
                 )
             else:
                 raw = 100 if cand in name_joined else 0
+            if raw >= 85:
                 score = raw * (2.0 if re.search(r'[a-zA-Z]', cand) else 1.5) * (1 + len(cand)/20)
+                # 劑型比對 (簡易，假設 row 有 'dosage_form' 欄，CSV無，需加邏輯或略)
+                # 假設無，扣分0
+                bucket.append((score, row["drug_id"], row.get("section"), name_joined))
+        if bucket:
+            top = sorted(bucket, reverse=True)[0]
+            drug_ids.add(top[1])
+            log.info(f"Drug candidates: {[(id, sec, score, terms) for score, id, sec, terms in bucket]}")
     return list(drug_ids)
 # ---------- 意圖偵測 ----------
 # ---------- 檢索 ----------
 def rerank_results(query: str, candidates: List[Tuple[int, float, float, float]], sentences: List[str], reranker: Optional[Any], top_k: int, threshold: float) -> List[Dict[str, Any]]:
+    try:
+        candidates = sorted(candidates, key=lambda x: -x[1])[:top_k * 2]  # 限20
+        if not candidates:
+            return []
+        valid_indices = [i for (i, *_ ) in candidates if 0 <= i < len(sentences)]
+        if not valid_indices:
+            return []
+        if reranker is None:
+            return [{"idx": i, "score": fused} for i, fused, _, _ in candidates]
+        pairs = [[query, sentences[i]] for i in valid_indices]
+        if not pairs:
+            return []
+        scores = reranker.predict(pairs, show_progress_bar=False)
+        reranked = [{"idx": valid_indices[j], "score": float(scores[j])} for j in range(len(scores)) if float(scores[j]) >= threshold]
+        if not reranked:
+            reranked = [{"idx": i, "score": fused} for i, fused, _, _ in candidates]
+        return sorted(reranked, key=lambda x: -x["score"])[:top_k]
+    except Exception as e:
+        log.warning("Rerank failed: %s", e)
+        return [{"idx": i, "score": fused} for i, fused, _, _ in sorted(candidates, key=lambda x: -x[1])[:top_k]]  # fallback
+def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]], bm25: Optional[Any], index: Optional[Any], emb_model: Optional[Any], reranker: Optional[Any], top_k: int = 10, drug_ids: List[str] = None) -> List[int]:
+    clean_query = query.strip().lower()  # 清理
+    cache_key = clean_query + str(drug_ids)  # per drug cache
+    if cache_key in STATE.query_cache and time.time() - STATE.query_cache[cache_key]['time'] < 180:  # 3min
+        log.info("Cache hit for query: %s", query[:50])
+        return STATE.query_cache[cache_key]['idxs']
+    drug_ids = drug_ids or []
     log.info("Detected drug_ids: %s for query: %s", drug_ids, query[:50])
     if not drug_ids:
         log.warning("No drug_ids found; falling back to full corpus search.")
     bm_results = []
     if bm25:
         scores = bm25.get_scores(tokenized_query)
         scores_np = np.array(scores)
         if np.max(scores_np) > np.min(scores_np):
             scores_norm = (scores_np - np.min(scores_np)) / (np.max(scores_np) - np.min(scores_np))
     candidates = [(i, sc, 0.0, 0.0) for i, sc in candidates]
     reranked = rerank_results(query, candidates, sentences, reranker, top_k, RERANK_THRESHOLD)
     idxs = [r["idx"] for r in reranked]
+    STATE.query_cache[cache_key] = {'idxs': idxs, 'time': time.time()}
     return idxs
 def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, Any]]) -> str:
                 ts_parts.append("優先藥袋醫囑（如每日1顆，早餐後）。範圍 [Sxxx]。特殊：病人使用須知。")
         trouble_shooting = " ".join(ts_parts)
     return (
+        f"請根據提供內容，回答使用者問題。\n"
+        f"限制：- 回覆 ≤100字。- 條列 2–4 點，語氣精簡。- 不要附加「了解嗎」。若無關鍵依據，回答：「查無充分資料，建議詢問醫師/藥師」（≤60字）。\n"
         f"問題：{query}\n"
         f"參考片段：\n{contexts}\n"
     )
 def make_clarify_message(drug_name_hint: str = "") -> str:
     msg = (
+        "請提供：\n"
+        "1. 藥名＋劑型＋劑量（例：普拿疼 500mg 錠）\n"
+        "2. 使用情境（症狀、時間、對象）\n"
+        "3. 你的問題（可否同用？多久可再吃？）\n"
     )
     if drug_name_hint:
+        msg = (
+            "目前無法識別特定藥名，我會先提供一般性建議。"
+            "請補充藥名、劑型與強度。\n" + msg
+        )
     return msg + DISCLAIMER
+# ---------- 新增: 藥名處理 ----------
def process_drug_names(drug_ids: List[str]) -> List[str]:
    """Log whether zero, one, or several drug ids were resolved.

    Returns an empty list when ``drug_ids`` is empty or falsy; otherwise
    returns ``drug_ids`` itself, unchanged.
    """
    if not drug_ids:
        log.warning("NO_DRUG_ID")
        return []
    # Single- and multi-drug inputs differ only in the log line emitted.
    if len(drug_ids) > 1:
        log.info(f"多藥名模式: {drug_ids}")
    else:
        log.info(f"單藥名模式: {drug_ids[0]}")
    return drug_ids
+# ---------- 新增: 回覆壓縮 ----------
def compress_reply(reply: str, max_len: int = 100) -> str:
    """Shorten ``reply`` so that it never exceeds ``max_len`` characters.

    Replies already within the limit are returned untouched. Longer replies
    have filler words removed and whitespace collapsed; if that is still not
    enough, the text is cut and ``'...'`` is appended, with the ellipsis
    counted against ``max_len``.

    Args:
        reply: The text to shorten.
        max_len: Hard upper bound on the length of the returned string.

    Returns:
        A string of at most ``max_len`` characters.
    """
    if len(reply) <= max_len:
        return reply

    # Drop filler words, then collapse every whitespace run to one space.
    reply = re.sub(r'(例如|比如|像是|可能會|有時候|通常|一般來說|另外|而且|因此|所以)', '', reply)
    reply = re.sub(r'\s+', ' ', reply).strip()
    # Exactly one space after ASCII sentence punctuation; rstrip removes the
    # trailing space this adds when the reply ends with punctuation.
    reply = re.sub(r'(\.|\?|\!)\s*', r'\1 ', reply).rstrip()

    if len(reply) > max_len:
        ellipsis = '...'
        # Reserve room for the ellipsis so the result respects max_len.
        keep = max(max_len - len(ellipsis), 0)
        reply = (reply[:keep].rstrip() + ellipsis)[:max(max_len, 0)]
    log.info("回覆超長，自動壓縮")
    return reply
+# ---------- 新增: 異常處理 ----------
def handle_error(code: str) -> str:
    """Log ``code`` as a pipeline error and return the generic fallback reply."""
    fallback_reply = "查無充分資料，建議詢問醫師/藥師"
    log.error(f"Pipeline error: {code}")
    return fallback_reply
+# ---------- 新增: 全鏈路log ----------
def log_pipeline(user_id: str, query: str, parsed: Dict, candidates: List, drug_choices: List,
                 retrieval: Dict, sections: List, context: str, prompt: str, reply: str, error_code: str = None):
    """Emit the numbered trace lines for one pipeline run.

    Eight info-level lines are written in order (request, parse result,
    candidates, chosen drug ids, retrieval counters, sections, size figures,
    LLM figures), followed by one error-level line when ``error_code`` is
    truthy. Missing ``retrieval`` keys are reported with default values.
    """
    # Queries longer than 50 characters are logged as a 50-character prefix.
    query_preview = query if len(query) <= 50 else query[:50] + '...'
    log.info(f"1. user_id: {user_id}, query: {query_preview} (desensitized)")
    log.info(f"2. parsed={parsed}")
    log.info(f"3. candidates={candidates}")
    log.info(f"4. drug_id_pick: {drug_choices} (每個候選的分數、淘汰原因 in find_drug_ids)")

    bm_top = retrieval.get('bm_top', 0)
    sem_top = retrieval.get('sem_top', 0)
    fused_top10 = retrieval.get('fused_top10', [])
    log.info(f"5. retrieval: BM25_topN={bm_top}, SEM_topN={sem_top}, fused_top10={fused_top10}")
    log.info(f"6. Injected sections: {sections}")

    # Token count is a rough estimate: prompt characters divided by four.
    prompt_chars = len(prompt)
    log.info(f"7. contexts_chars: {len(context)}, prompt_chars: {prompt_chars}, tokens: ~{prompt_chars//4}")

    llm_time = retrieval.get('llm_time', 0)
    reply_chars = len(reply)
    log.info(f"8. LLM: model={LM_MODEL}, time={llm_time:.2f}s, truncated={reply_chars>200}, output_chars={reply_chars}")
    if error_code:
        log.error(f"error_code={error_code}")
 async def answer_pipeline(query: str, user_id: str) -> str:
+    log.info("Pipeline start for user_id: %s, query: %s", user_id, query[:50])
     if not query or not isinstance(query, str):
+        return handle_error("INVALID_QUERY")
     if not STATE.sentences:
+        return handle_error("NO_CORPUS")
     session = STATE.user_sessions.get(user_id, {})
     if "prev_query" in session and query.lower() in ["是", "否"]:
         if query.lower() == "是":
             return "太好了！若還有問題，請告訴我。" + DISCLAIMER
         else:
             return f"抱歉沒說明清楚。關於{session['prev_query']}，請再說詳細點，或直接問醫師。" + DISCLAIMER
     parsed = parse_user_message(query)
     drug_name_hint = ""
     if "fentanyl" in query.lower():
         drug_name_hint = "你說的是 Fentanyl 經皮貼片（Duragesic）嗎？有寫幾 mcg/hr 嗎？"
+    candidates = extract_drug_candidates_from_query(query)
+    drug_ids = find_drug_ids_from_name(candidates, STATE.df_csv)
+    drug_choices = process_drug_names(drug_ids)
+    if not drug_choices:
+        log_pipeline(user_id, query, parsed, candidates, drug_choices, {}, [], "", "", "", "NO_DRUG_ID")
         return make_clarify_message(drug_name_hint)
     intents = detect_intent(query)
+    answers = []
+    retrieval = {'bm_top': 0, 'sem_top': 0, 'fused_top10': [], 'llm_time': 0}
+    sections = IMPORTANT_SECTIONS
+    context = ""
+    prompt = ""
+    reply = ""
+    try:
+        for did in drug_choices:
+            idxs = fuse_and_select(query, STATE.sentences, STATE.meta, STATE.bm25, STATE.faiss_index, STATE.emb_model, STATE.reranker_model, top_k=TOP_K_SENTENCES, drug_ids=[did])
+            if not idxs:
+                answers.append(handle_error("NO_CONTEXT"))
+                continue
+            context = build_context(idxs, STATE.sentences, STATE.meta)
+            prompt = build_prompt(query, context, intents)
+            t0 = time.time()
+            ans = call_llm(prompt)
+            retrieval['llm_time'] += time.time() - t0
+            if not ans:
+                answers.append(handle_error("LLM_ERROR"))
+                continue
+            ans = compress_reply(ans)
+            answers.append(ans)
+        reply = "\n".join(answers)
+        if len(drug_choices) > 1:
+            reply = "偵測到多個藥物，以下分別提供參考：\n" + reply
+        STATE.user_sessions[user_id] = {"prev_query": query}
+    except Exception as e:
+        log.warning("Pipeline 失敗：%s", e)
+        reply = handle_error("UNEXPECTED_ERROR")
+    log_pipeline(user_id, query, parsed, candidates, drug_choices, retrieval, sections, context, prompt, reply)
+    return reply
 # ---------- LINE 驗簽與回覆 ----------
 def verify_line_signature(body_bytes: bytes, signature: str) -> bool:
     STATE.reranker_model = load_reranker_model(RERANKER_MODEL_ID)
     STATE.faiss_index = ensure_faiss(FAISS_INDEX, STATE.sentences)
     STATE.bm25 = ensure_bm25(BM25_PKL, STATE.sentences)
+    for m in STATE.meta:
+        sec = m.get("section", "其他")
+        m["section"] = SECTION_NORMALIZE.get(sec, sec)
     if os.path.exists(CSV_PATH):
         STATE.df_csv = pd.read_csv(CSV_PATH, dtype=str)
     log.info("LLM via LiteLLM: base=%s model=%s", str(LITELLM_BASE_URL), str(LM_MODEL))