Spaces:

pcreem
/

brown-cafe

Sleeping

App Files Files Community

Song committed 27 days ago

Commit

c40d5cc

1 Parent(s): f03b053

hi

Browse files

Files changed (1) hide show

app.py +282 -230

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ for d in (os.getenv("HF_HOME"), os.getenv("SENTENCE_TRANSFORMERS_HOME"), os.gete
 # ---------- Imports ----------
 import re, hmac, base64, hashlib, pickle, logging, time, json
-from typing import List, Dict, Any, Optional, Tuple
 import numpy as np
 import pandas as pd
@@ -146,6 +146,7 @@ SECTION_WEIGHTS = {
 }
 IMPORTANT_SECTIONS = ["用法及用量", "病人使用須知", "包裝及儲存", "不良反應", "警語及注意事項"]
 # ---------- 路徑工具 ----------
 def pick_existing_or_tmp(candidates: List[str]) -> str:
@@ -246,7 +247,7 @@ class State:
     faiss_index: Optional[Any] = None
     bm25: Optional[Any] = None
     df_csv: Optional[pd.DataFrame] = None
-    user_sessions: Dict[str, Dict[str, Any]] = {}  # 簡易 session 快取
     query_cache: Dict[str, Dict[str, Any]] = {}
 STATE = State()
@@ -320,7 +321,8 @@ def ensure_bm25(pkl_path: str, sentences: List[str]) -> Optional[Any]:
         try:
             with open(pkl_path, "rb") as f:
                 bm = pickle.load(f)
-            n_bm = len(bm.corpus)
             if n_bm == len(sentences):
                 log.info("Loaded BM25: %s (n=%d)", pkl_path, n_bm)
                 return bm
@@ -334,42 +336,141 @@ def ensure_bm25(pkl_path: str, sentences: List[str]) -> Optional[Any]:
     safe_pickle_dump(bm, pkl_path)
     return bm
-# ---------- 藥名抽取 ----------
-def extract_drug_candidates_from_query(query: str) -> List[str]:
-    parts = re.split(r"[:：]", query, maxsplit=1)
-    drug_part = parts[0].strip() if len(parts) > 1 else query.strip()
-    tokens = tokenize_zh(drug_part)  # 只取左側
-    candidates = [t.lower() for t in tokens if len(t) > 2 and t not in DRUG_STOPWORDS]
-    return [DRUG_NAME_MAPPING.get(c, c) for c in candidates]
-def find_drug_ids_from_name(candidates: List[str], df: pd.DataFrame) -> List[str]:
-    if df is None or df.empty: return []
-    drug_ids = set()
     for cand in candidates:
-        # 先查映射
-        if cand in DRUG_NAME_MAPPING:
-            drug_ids.add(DRUG_NAME_MAPPING[cand])
-            continue
-        bucket = []
-        for _, row in df.iterrows():
             name_joined = f"{(row.get('drug_name_zh') or '').lower()} {(row.get('drug_name_en') or '').lower()} {(row.get('drug_name_norm') or '').lower()}".strip()
-            if fuzz:
-                raw = max(
                     fuzz.token_set_ratio(cand, name_joined),
                     fuzz.partial_ratio(cand, name_joined)
                 )
-            else:
-                raw = 100 if cand in name_joined else 0
-            if raw >= 85:
-                score = raw * (2.0 if re.search(r'[a-zA-Z]', cand) else 1.5) * (1 + len(cand)/20)
-                # 劑型比對 (簡易，假設 row 有 'dosage_form' 欄，CSV無，需加邏輯或略)
-                # 假設無，扣分0
-                bucket.append((score, row["drug_id"], row.get("section"), name_joined))
-        if bucket:
-            top = sorted(bucket, reverse=True)[0]
-            drug_ids.add(top[1])
-            log.info(f"Drug candidates: {[(id, sec, score, terms) for score, id, sec, terms in bucket]}")
-    return list(drug_ids)
 # ---------- 意圖偵測 ----------
 def detect_intent(query: str) -> List[str]:
@@ -402,58 +503,71 @@ def rerank_results(query: str, candidates: List[Tuple[int, float, float, float]]
         log.warning("Rerank failed: %s", e)
         return [{"idx": i, "score": fused} for i, fused, _, _ in sorted(candidates, key=lambda x: -x[1])[:top_k]]  # fallback
-def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]], bm25: Optional[Any], index: Optional[Any], emb_model: Optional[Any], reranker: Optional[Any], top_k: int = 10, drug_ids: List[str] = None) -> List[int]:
-    clean_query = query.strip().lower()  # 清理
-    cache_key = clean_query + str(drug_ids)  # per drug cache
-    if cache_key in STATE.query_cache and time.time() - STATE.query_cache[cache_key]['time'] < 180:  # 3min
-        log.info("Cache hit for query: %s", query[:50])
         return STATE.query_cache[cache_key]['idxs']
-    drug_ids = drug_ids or []
-    log.info("Detected drug_ids: %s for query: %s", drug_ids, query[:50])
-    if not drug_ids:
-        log.warning("No drug_ids found; falling back to full corpus search.")
-    tokenized_query = tokenize_zh(query)
-    bm_results = []
     if bm25:
-        scores = bm25.get_scores(tokenized_query)
-        scores_np = np.array(scores)
-        if np.max(scores_np) > np.min(scores_np):
-            scores_norm = (scores_np - np.min(scores_np)) / (np.max(scores_np) - np.min(scores_np))
         else:
-            scores_norm = scores_np
-        ranked = sorted(enumerate(scores_norm), key=lambda x: -x[1])[:top_k * 4]
-        for rank, (global_i, s_norm) in enumerate(ranked):
-            if 0 <= global_i < len(meta) and (not drug_ids or meta[global_i].get("drug_id") in drug_ids):
-                sec = meta[global_i].get("section", "其他")
-                bm_results.append((global_i, s_norm, SECTION_WEIGHTS.get(sec, 1.0)))
-    log.info("BM25 candidates after filter: %d", len(bm_results))
-    sem_results = []
     if emb_model and index:
-        q_emb = emb_model.encode([query], normalize_embeddings=True).astype(np.float32)
         _, idxs = index.search(q_emb, top_k * 8)
-        for rank, global_i in enumerate(idxs[0].tolist()):
-            if 0 <= global_i < len(meta) and (not drug_ids or meta[global_i].get("drug_id") in drug_ids):
-                sec = meta[global_i].get("section", "其他")
-                sem_results.append((global_i, 1.0 / (1 + rank), SECTION_WEIGHTS.get(sec, 1.0)))
-    log.info("Semantic candidates after filter: %d", len(sem_results))
-    scores = {}
-    for global_i, s_norm, w in bm_results:
-        scores[global_i] = scores.get(global_i, 0.0) + BM25_WEIGHT * s_norm * w
-    for global_i, invrank, w in sem_results:
-        scores[global_i] = scores.get(global_i, 0.0) + SEM_WEIGHT * invrank * w
-    inject_sections = bool(drug_ids)  # 只有鎖定藥物時才注入
-    if inject_sections:
-        for sec in IMPORTANT_SECTIONS:
-            sec_idx = next((i for i, m in enumerate(meta) if (not drug_ids or m.get("drug_id") in drug_ids) and m.get("section") == sec), None)
-            if sec_idx is not None and sec_idx not in scores:
-                scores[sec_idx] = 1.0
-    candidates = sorted(scores.items(), key=lambda x: -x[1])[:top_k * 3]
-    candidates = [(i, sc, 0.0, 0.0) for i, sc in candidates]
-    reranked = rerank_results(query, candidates, sentences, reranker, top_k, RERANK_THRESHOLD)
     idxs = [r["idx"] for r in reranked]
     STATE.query_cache[cache_key] = {'idxs': idxs, 'time': time.time()}
     return idxs
 def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, Any]]) -> str:
     ctx_lines, total_len, seen = [], 0, set()
     for i in idxs:
@@ -461,34 +575,40 @@ def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, An
         text = sentences[i]
         if text in seen: continue
         chunk_id = meta[i].get("chunk_id", "None")
-        line = f"[S{chunk_id}]: {text}"
         if total_len + len(line) > MAX_CONTEXT_CHARS: break
         ctx_lines.append(line)
         total_len += len(line) + 1
         seen.add(text)
-    return "\n".join(ctx_lines) or "[SNone]: 沒有找到相關資料，請諮詢醫師或藥師。"
-def build_prompt(query: str, contexts: str, intents: List[str]) -> str:
-    trouble_shooting = ""
-    intent_order = ["操作 (Administration)", "劑型相關 (Dosage Form Concerns)", "保存/攜帶 (Storage & Handling)", "時間/併用 (Timing & Interaction)", "副作用/異常 (Side Effects / Issues)", "劑量調整 (Dosage Adjustment)"]
-    matched_intents = [i for i in intent_order if i in intents]
-    if matched_intents:
-        ts_parts = []
-        for intent in matched_intents:
-            if intent in ["操作 (Administration)", "劑型相關 (Dosage Form Concerns)"]:
-                ts_parts.append("檢查組裝：問用戶平時怎麼用，有什麼問題。示範步驟。若不會組裝，建議示範或諮醫。優先藥袋醫囑，其次用法用量/病人使用須知。")
-            elif intent == "保存/攜帶 (Storage & Handling)":
-                ts_parts.append("檢查保存：問怎麼存，避免水/熱/潮濕，否則失效。標準：室溫<30°C或冷藏2-8°C [Sxxx]。旅遊：用原瓶避熱。")
-            elif intent == "副作用/異常 (Side Effects / Issues)":
-                ts_parts.append("常見：頭痛等 [Sxxx]；嚴重：立即停藥諮醫 [Sxxx]。合併不良反應/警語。")
-            elif intent in ["時間/併用 (Timing & Interaction)", "劑量調整 (Dosage Adjustment)"]:
-                ts_parts.append("優先藥袋醫囑（如每日1顆，早餐後）。範圍 [Sxxx]。特殊：病人使用須知。")
-        trouble_shooting = " ".join(ts_parts)
     return (
-        f"請根據提供內容，回答使用者問題。\n"
-        f"限制：- 回覆 ≤100字。- 條列 2–4 點，語氣精簡。- 不要附加「了解嗎」。若無關鍵依據，回答：「查無充分資料，建議詢問醫師/藥師」（≤60字）。\n"
-        f"問題：{query}\n"
-        f"參考片段：\n{contexts}\n"
     )
 def call_llm(prompt: str, max_tokens: int = 2048) -> Optional[str]:
@@ -505,12 +625,9 @@ def call_llm(prompt: str, max_tokens: int = 2048) -> Optional[str]:
         t0 = time.time()
         resp = client.chat.completions.create(
             model=LM_MODEL,
-            messages=[
-                {"role": "system", "content": "你是一位專業、有同理心的藥師。回答忠於資料，不可捏造。語言親切，用台灣繁中+英文藥名。"},
-                {"role": "user", "content": prompt},
-            ],
-            temperature=0.2,
-            timeout=10,
             max_tokens=max_tokens,
         )
         used = time.time() - t0
@@ -520,142 +637,77 @@ def call_llm(prompt: str, max_tokens: int = 2048) -> Optional[str]:
         log.warning("LLM 失敗：%s", e)
         return None
-# ---------- 新增: 輸入解析與引導 ----------
-def parse_user_message(query: str) -> Dict[str, str]:
-    parsed = {"drug_name": "", "dosage_form": "", "strength": "", "symptom": "", "usage_record": "", "patient_info": "", "concomitant": "", "question": query}
-    # 改進正則：支援中英、單位如 mcg/hr
-    drug_match = re.search(r"([\w\s]+?)(?:\s*(\d+(?:\.\d+)?\s*(?:mg|mcg|µg|g|mcg/hr|mg/hr))?\s*(\w+))?", query, re.I)
-    if drug_match:
-        parsed["drug_name"] = drug_match.group(1).strip()
-        parsed["strength"] = drug_match.group(2) or ""
-        parsed["dosage_form"] = drug_match.group(3) or ""
-    symptom_match = re.search(r"(頭痛|發燒|拉肚子|症狀|目的)\s*([^，；]*)", query)
-    if symptom_match:
-        parsed["symptom"] = symptom_match.group(2).strip()
-    usage_match = re.search(r"(吃|用|貼|喝|注射)\s*(\d+)\s*(\w+)", query)
-    if usage_match:
-        parsed["usage_record"] = f"{usage_match.group(2)} {usage_match.group(3)}"
-    patient_match = re.search(r"(成人|兒童|孕|哺|過敏|肝|腎)", query)
-    if patient_match:
-        parsed["patient_info"] = patient_match.group(1)
-    concomitant_match = re.search(r"(併用|其他藥|保健|咖啡|酒|空腹)\s*([^，；]*)", query)
-    if concomitant_match:
-        parsed["concomitant"] = concomitant_match.group(2).strip()
-    return parsed
-def make_clarify_message(drug_name_hint: str = "") -> str:
     msg = (
-        "請提供：\n"
-        "1. 藥名＋劑型＋劑量（例：普拿疼 500mg 錠）\n"
-        "2. 使用情境（症狀、時間、對象）\n"
-        "3. 你的問題（可否同用？多久可再吃？）\n"
     )
-    if drug_name_hint:
-        msg = (
-            "目前無法識別特定藥名，我會先提供一般性建議。"
-            "請補充藥名、劑型與強度。\n" + msg
-        )
-    return msg + DISCLAIMER
-# ---------- 新增: 藥名處理 ----------
-def process_drug_names(drug_ids: List[str]) -> List[str]:
-    if not drug_ids:
-        log.warning("NO_DRUG_ID")
-        return []
-    if len(drug_ids) == 1:
-        log.info(f"單藥名模式: {drug_ids[0]}")
-        return drug_ids
-    else:
-        log.info(f"多藥名模式: {drug_ids}")
-        return drug_ids
-# ---------- 新增: 回覆壓縮 ----------
-def compress_reply(reply: str, max_len: int = 100) -> str:
-    if len(reply) <= max_len:
-        return reply
-    # 刪除修飾詞、合併句
-    reply = re.sub(r'(例如|比如|像是|可能會|有時候|通常|一般來說|另外|而且|因此|所以)', '', reply)
-    reply = re.sub(r'\s+', ' ', reply).strip()
-    reply = re.sub(r'(\.|\?|\!)\s*', r'\1 ', reply)  # 合併句
-    compressed = reply[:max_len] + '...' if len(reply) > max_len else reply
-    log.info("回覆超長，自動壓縮")
-    return compressed
-# ---------- 新增: 異常處理 ----------
 def handle_error(code: str) -> str:
     log.error(f"Pipeline error: {code}")
-    return "查無充分資料，建議詢問醫師/藥師"
-# ---------- 新增: 全鏈路log ----------
-def log_pipeline(user_id: str, query: str, parsed: Dict, candidates: List, drug_choices: List,
-                 retrieval: Dict, sections: List, context: str, prompt: str, reply: str, error_code: str = None):
-    log.info(f"1. user_id: {user_id}, query: {query[:50] + '...' if len(query)>50 else query} (desensitized)")
-    log.info(f"2. parsed={parsed}")
-    log.info(f"3. candidates={candidates}")
-    log.info(f"4. drug_id_pick: {drug_choices} (每個候選的分數、淘汰原因 in find_drug_ids)")
-    log.info(f"5. retrieval: BM25_topN={retrieval.get('bm_top', 0)}, SEM_topN={retrieval.get('sem_top', 0)}, fused_top10={retrieval.get('fused_top10', [])}")
-    log.info(f"6. Injected sections: {sections}")
-    log.info(f"7. contexts_chars: {len(context)}, prompt_chars: {len(prompt)}, tokens: ~{len(prompt)//4}")
-    log.info(f"8. LLM: model={LM_MODEL}, time={retrieval.get('llm_time', 0):.2f}s, truncated={len(reply)>200}, output_chars={len(reply)}")
-    if error_code:
-        log.error(f"error_code={error_code}")
 async def answer_pipeline(query: str, user_id: str) -> str:
     log.info("Pipeline start for user_id: %s, query: %s", user_id, query[:50])
     if not query or not isinstance(query, str):
         return handle_error("INVALID_QUERY")
     if not STATE.sentences:
         return handle_error("NO_CORPUS")
-    session = STATE.user_sessions.get(user_id, {})
-    if "prev_query" in session and query.lower() in ["是", "否"]:
-        if query.lower() == "是":
-            return "太好了！若還有問題，請告訴我。" + DISCLAIMER
-        else:
-            return f"抱歉沒說明清楚。關於{session['prev_query']}，請再說詳細點，或直接問醫師。" + DISCLAIMER
-    parsed = parse_user_message(query)
-    drug_name_hint = ""
-    if "fentanyl" in query.lower():
-        drug_name_hint = "你說的是 Fentanyl 經皮貼片（Duragesic）嗎？有寫幾 mcg/hr 嗎？"
-    candidates = extract_drug_candidates_from_query(query)
-    drug_ids = find_drug_ids_from_name(candidates, STATE.df_csv)
-    drug_choices = process_drug_names(drug_ids)
-    if not drug_choices:
-        log_pipeline(user_id, query, parsed, candidates, drug_choices, {}, [], "", "", "", "NO_DRUG_ID")
-        return make_clarify_message(drug_name_hint)
-    intents = detect_intent(query)
-    answers = []
-    retrieval = {'bm_top': 0, 'sem_top': 0, 'fused_top10': [], 'llm_time': 0}
-    sections = IMPORTANT_SECTIONS
-    context = ""
-    prompt = ""
-    reply = ""
-    try:
-        for did in drug_choices:
-            idxs = fuse_and_select(query, STATE.sentences, STATE.meta, STATE.bm25, STATE.faiss_index, STATE.emb_model, STATE.reranker_model, top_k=TOP_K_SENTENCES, drug_ids=[did])
-            if not idxs:
-                answers.append(handle_error("NO_CONTEXT"))
-                continue
-            context = build_context(idxs, STATE.sentences, STATE.meta)
-            prompt = build_prompt(query, context, intents)
-            t0 = time.time()
-            ans = call_llm(prompt)
-            retrieval['llm_time'] += time.time() - t0
-            if not ans:
-                answers.append(handle_error("LLM_ERROR"))
-                continue
-            ans = compress_reply(ans)
-            answers.append(ans)
-        reply = "\n".join(answers)
-        if len(drug_choices) > 1:
-            reply = "偵測到多個藥物，以下分別提供參考：\n" + reply
-        STATE.user_sessions[user_id] = {"prev_query": query}
-    except Exception as e:
-        log.warning("Pipeline 失敗：%s", e)
-        reply = handle_error("UNEXPECTED_ERROR")
-    log_pipeline(user_id, query, parsed, candidates, drug_choices, retrieval, sections, context, prompt, reply)
-    return reply
 # ---------- LINE 驗簽與回覆 ----------
 def verify_line_signature(body_bytes: bytes, signature: str) -> bool:

 # ---------- Imports ----------
 import re, hmac, base64, hashlib, pickle, logging, time, json
+from typing import List, Dict, Any, Optional, Tuple, Union
 import numpy as np
 import pandas as pd
 }
 IMPORTANT_SECTIONS = ["用法及用量", "病人使用須知", "包裝及儲存", "不良反應", "警語及注意事項"]
+DOSAGE_FORM_BOOST = 1.2 # NEW: 劑型匹配的權重提升
 # ---------- 路徑工具 ----------
 def pick_existing_or_tmp(candidates: List[str]) -> str:
     faiss_index: Optional[Any] = None
     bm25: Optional[Any] = None
     df_csv: Optional[pd.DataFrame] = None
+    user_sessions: Dict[str, Dict[str, Any]] = {}
     query_cache: Dict[str, Dict[str, Any]] = {}
 STATE = State()
         try:
             with open(pkl_path, "rb") as f:
                 bm = pickle.load(f)
+            # MODIFIED: BM25 has corpus, not corpus_size attribute
+            n_bm = len(bm.corpus) if hasattr(bm, 'corpus') else 0
             if n_bm == len(sentences):
                 log.info("Loaded BM25: %s (n=%d)", pkl_path, n_bm)
                 return bm
     safe_pickle_dump(bm, pkl_path)
     return bm
+# ---------- 資訊解析與藥名處理 (MODIFIED & NEW) ----------
def parse_user_message(query: str) -> Dict[str, Any]:
    """Split a free-text question into drug name, strength, dosage form and question.

    The previous single-regex approach put a lazy name group in front of
    purely optional groups, so the name group always stopped after one
    character. Strength and dosage form are now located with their own
    searches and the drug name is taken as the text in front of them (or in
    front of an explicit ``:`` / ``：`` separator).

    Examples (values of drug_name / strength / dosage_form / question):
        "Panadol 500mg 錠劑 怎麼吃" -> "Panadol" / "500mg" / "錠劑" / "怎麼吃"
        "普拿疼錠怎麼用"            -> "普拿疼" / "" / "錠" / "怎麼用"
        "普拿疼：可以空腹吃嗎"      -> "普拿疼" / "" / "" / "可以空腹吃嗎"

    Args:
        query: Raw user message.

    Returns:
        Dict with keys ``drug_name``, ``strength``, ``dosage_form``,
        ``question`` and ``raw_query``. Fields that cannot be found are empty
        strings; ``question`` falls back to the whole query.
    """
    parsed = {
        "drug_name": "", "strength": "", "dosage_form": "", "question": query, "raw_query": query
    }

    # Longest units first so "25 mcg/hr" is not truncated to "25 mcg"; the
    # lookahead keeps a bare "g" from matching the start of a Latin word.
    strength_match = re.search(
        r"\d+(?:\.\d+)?\s*(?:mcg/hr|mg/hr|mcg|mg|µg|iu|國際單位|g)(?![a-zA-Z])",
        query,
        re.I,
    )
    # Specific forms first; a bare "劑" is ignored inside "劑量" / "藥劑師".
    form_match = re.search(
        r"吸入劑|噴劑|錠劑|膠囊|糖漿|乳膏|貼片|噴霧|錠|劑(?![量師])", query
    )
    markers = [m for m in (strength_match, form_match) if m]

    separator = re.search(r"[:：]", query)
    if separator:
        header, tail = query[:separator.start()], query[separator.end():]
    elif markers:
        header, tail = query, query[max(m.end() for m in markers):]
    else:
        header, tail = query, ""

    # The drug name ends where the first strength / dosage-form marker starts.
    marker_starts = [m.start() for m in markers if m.start() < len(header)]
    if marker_starts:
        header = header[:min(marker_starts)]

    def _strip_question_suffix(text: str) -> str:
        # Remove common trailing question phrases that stuck to the name.
        return re.sub(r"(怎麼用|怎麼吃|副作用|的用法)$", "", text.strip()).strip()

    drug_name = _strip_question_suffix(header)
    if not drug_name and tail:
        # Query started with the strength / form, e.g. "500mg 普拿疼 怎麼吃".
        drug_name = _strip_question_suffix(tail)

    parsed.update({
        "drug_name": drug_name,
        "strength": strength_match.group(0).strip() if strength_match else "",
        "dosage_form": form_match.group(0) if form_match else "",
    })
    # Fall back to the whole query when nothing is left after the header.
    parsed["question"] = tail.strip(" \t\r\n,，。:：") or query
    log.info("Parsed user message: %s", parsed)
    return parsed
def find_drug_candidates(parsed_info: Dict[str, Any], df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Rank drugs in ``df`` against the drug name parsed from the user query.

    Args:
        parsed_info: Output of ``parse_user_message``; ``drug_name`` is
            required, ``dosage_form`` is optional.
        df: Drug table with a ``drug_id`` column and the name columns
            ``drug_name_zh`` / ``drug_name_en`` / ``drug_name_norm``
            (missing columns or cells are treated as empty).

    Returns:
        Up to five dicts (``score``, ``drug_id``, ``drug_name_zh``,
        ``drug_name_en``, ``matched_term``), best score first, one per
        ``drug_id``. Empty list when nothing scores at least 85.
    """
    if df is None or df.empty or not parsed_info.get("drug_name"):
        return []

    def _lower_text(value: Any) -> str:
        # Missing CSV cells arrive as float NaN, which is truthy and has no
        # .lower(); treat anything that is not a string as empty.
        return value.lower() if isinstance(value, str) else ""

    def _display_name(value: Any) -> Optional[str]:
        # Normalise missing names to None so callers can chain with `or`.
        return value if isinstance(value, str) and value else None

    query_drug_name = parsed_info["drug_name"].lower()
    query_dosage_form = parsed_info.get("dosage_form") or ""

    # Search both the tokenised core terms and the raw name.
    tokens = tokenize_zh(query_drug_name)
    terms = [t.lower() for t in tokens if len(t) > 1 and t not in DRUG_STOPWORDS]
    terms.append(query_drug_name)
    # dict.fromkeys de-duplicates while keeping order, so ties between equal
    # scores are broken the same way on every run.
    candidates = list(dict.fromkeys(DRUG_NAME_MAPPING.get(c, c) for c in terms))

    # One row per drug; build the searchable name once instead of once per
    # candidate term.
    unique_drugs = df.drop_duplicates(subset=['drug_id'])
    drug_rows = []
    for _, row in unique_drugs.iterrows():
        name_joined = (
            f"{_lower_text(row.get('drug_name_zh'))} "
            f"{_lower_text(row.get('drug_name_en'))} "
            f"{_lower_text(row.get('drug_name_norm'))}"
        ).strip()
        drug_rows.append((
            row["drug_id"],
            _display_name(row.get('drug_name_zh')),
            _display_name(row.get('drug_name_en')),
            name_joined,
        ))

    drug_bucket = []
    for cand in candidates:
        has_latin = bool(re.search(r'[a-zA-Z]', cand))
        for drug_id, name_zh, name_en, name_joined in drug_rows:
            if not fuzz:
                # No fuzzy matcher available: plain substring match.
                raw_score = 100 if cand in name_joined else 0
            else:
                raw_score = max(
                    fuzz.token_set_ratio(cand, name_joined),
                    fuzz.partial_ratio(cand, name_joined)
                )
            if raw_score < 85:
                continue
            # Boost English-name matches and dosage-form matches.
            score = raw_score
            if has_latin:
                score *= 1.2
            if query_dosage_form and query_dosage_form in name_joined:
                score *= 1.1
            drug_bucket.append({
                "score": score,
                "drug_id": drug_id,
                "drug_name_zh": name_zh,
                "drug_name_en": name_en,
                "matched_term": cand
            })
    if not drug_bucket:
        return []

    # Keep the best-scoring entry per drug_id (sorted() is stable).
    seen_ids = set()
    unique_top = []
    for item in sorted(drug_bucket, key=lambda x: x['score'], reverse=True):
        if item['drug_id'] not in seen_ids:
            unique_top.append(item)
            seen_ids.add(item['drug_id'])
    top_five = unique_top[:5]
    log.info("Found drug candidates: %s", top_five)
    return top_five
def select_best_drug_candidate(candidates: List[Dict[str, Any]]) -> Union[Dict[str, Any], str, None]:
    """Pick a drug from ranked candidates, or decide that clarification is needed.

    Args:
        candidates: Candidate dicts sorted best-first, each with a ``score``
            and optionally ``drug_name_zh`` / ``drug_name_en``.

    Returns:
        * the top candidate dict when it is a confident match,
        * a clarification question (str) listing up to three options when
          the top two scores are within 10 points of each other,
        * ``None`` when there are no candidates or the top score is below 90.
    """
    if not candidates:
        return None

    best = candidates[0]

    # Two or more candidates that are too close to call: ask the user.
    if len(candidates) > 1 and (best['score'] - candidates[1]['score']) <= 10:
        labels = []
        for entry in candidates[:3]:
            shown = entry.get('drug_name_zh') or entry.get('drug_name_en')
            labels.append(f"「{shown}」")
        return "請問您指的是以下哪一種藥物？\n- " + "\n- ".join(labels)

    # A clear leader (or the only candidate) still has to be confident enough.
    if best['score'] < 90:
        return None
    return best
 # ---------- 意圖偵測 ----------
 def detect_intent(query: str) -> List[str]:
         log.warning("Rerank failed: %s", e)
         return [{"idx": i, "score": fused} for i, fused, _, _ in sorted(candidates, key=lambda x: -x[1])[:top_k]]  # fallback
def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]], bm25: Optional[Any], index: Optional[Any], emb_model: Optional[Any], reranker: Optional[Any], top_k: int = 10, drug_id: str = None, parsed_info: Dict[str, Any] = None) -> List[int]:
    """Retrieve and rerank sentence indices for a query, optionally locked to one drug.

    BM25 (min-max normalised) and FAISS (inverse-rank) scores are fused,
    weighted by section and dosage-form match, topped up with the important
    sections of the selected drug, and passed to ``rerank_results``. Results
    are cached in ``STATE.query_cache`` for 180 seconds per (question, drug_id).

    Args:
        query: Raw user query; used when ``parsed_info`` has no question.
        sentences: Corpus sentences, index-aligned with ``meta``.
        meta: Per-sentence metadata (``drug_id``, ``section``, ...).
        bm25: BM25 model exposing ``get_scores``, or None to skip lexical search.
        index: FAISS index, or None to skip semantic search.
        emb_model: Embedding model exposing ``encode``; used with ``index``.
        reranker: Reranker passed through to ``rerank_results``.
        top_k: Number of sentences requested from the reranker.
        drug_id: Restrict results to this drug; falsy searches the whole corpus.
        parsed_info: Output of ``parse_user_message``; may be None.

    Returns:
        Sentence indices chosen by the reranker.
    """
    # parsed_info defaults to None; normalise once so every lookup is safe.
    parsed_info = parsed_info or {}
    clean_query = (parsed_info.get("question") or query).strip().lower()
    cache_key = clean_query + str(drug_id)
    cached = STATE.query_cache.get(cache_key)
    if cached is not None and time.time() - cached['time'] < 180:
        log.info("Cache hit for query: %s", clean_query[:50])
        return cached['idxs']

    log.info("Searching for drug_id: %s with query: %s", drug_id, clean_query[:50])
    if not drug_id:
        log.warning("No drug_id provided; falling back to full corpus search.")

    def _in_scope(i: int) -> bool:
        # FAISS pads missing hits with -1; also enforce the drug filter.
        return 0 <= i < len(meta) and (not drug_id or meta[i].get("drug_id") == drug_id)

    tokenized_query = tokenize_zh(clean_query)
    scores = {}

    # BM25 lexical search
    if bm25:
        bm_scores_np = np.asarray(bm25.get_scores(tokenized_query), dtype=float)
        if bm_scores_np.size:  # np.min / np.max raise on an empty array
            lowest, highest = np.min(bm_scores_np), np.max(bm_scores_np)
            if highest > lowest:
                scores_norm = (bm_scores_np - lowest) / (highest - lowest)
            else:
                # All scores equal: no lexical signal, so contribute nothing
                # rather than leaking unnormalised raw scores into the fusion.
                scores_norm = np.zeros_like(bm_scores_np)
            for i, s_norm in enumerate(scores_norm):
                if _in_scope(i):
                    scores[i] = scores.get(i, 0.0) + BM25_WEIGHT * s_norm

    # FAISS semantic search
    if emb_model and index:
        q_emb = emb_model.encode([clean_query], normalize_embeddings=True).astype(np.float32)
        _, idxs = index.search(q_emb, top_k * 8)
        for rank, i in enumerate(idxs[0].tolist()):
            if _in_scope(i):
                scores[i] = scores.get(i, 0.0) + SEM_WEIGHT * (1.0 / (1 + rank))

    # Apply boosts
    query_dosage_form = parsed_info.get("dosage_form") or ""
    for i in list(scores):
        # Section weight boost
        sec = meta[i].get("section", "其他")
        scores[i] *= SECTION_WEIGHTS.get(sec, 1.0)
        # Dosage form boost
        if query_dosage_form and query_dosage_form in sentences[i]:
            scores[i] *= DOSAGE_FORM_BOOST

    # Inject the important sections of the selected drug if they are missing.
    # Only meaningful when a drug is locked; with no drug_id this would pull
    # in arbitrary entries that merely lack a drug_id.
    if drug_id:
        for sec in IMPORTANT_SECTIONS:
            sec_idx = next(
                (i for i, m in enumerate(meta)
                 if m.get("drug_id") == drug_id and m.get("section") == sec),
                None,
            )
            if sec_idx is not None and sec_idx not in scores:
                scores[sec_idx] = 1.0  # moderate score so it reaches the reranker

    # Prepare for reranking
    candidates = [(i, sc, 0.0, 0.0) for i, sc in scores.items()]
    reranked = rerank_results(clean_query, candidates, sentences, reranker, top_k, RERANK_THRESHOLD)
    idxs = [r["idx"] for r in reranked]
    STATE.query_cache[cache_key] = {'idxs': idxs, 'time': time.time()}
    return idxs
 def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, Any]]) -> str:
     ctx_lines, total_len, seen = [], 0, set()
     for i in idxs:
         text = sentences[i]
         if text in seen: continue
         chunk_id = meta[i].get("chunk_id", "None")
+        section = meta[i].get("section", "未知章節")
+        line = f"[{section}]: {text}" # MODIFIED: Show section name for better context
         if total_len + len(line) > MAX_CONTEXT_CHARS: break
         ctx_lines.append(line)
         total_len += len(line) + 1
         seen.add(text)
+    return "\n".join(ctx_lines) or "[未知章節]: 沒有找到相關資料，請諮詢醫師或藥師。"
def build_prompt(parsed_info: Dict[str, Any], contexts: str, drug_choice: Dict[str, Any]) -> str:
    """Assemble the LLM prompt from the parsed query, retrieved context and chosen drug.

    Args:
        parsed_info: Parsed user message; ``strength``, ``dosage_form`` and
            ``raw_query`` are read when present.
        contexts: Retrieved reference snippets, already formatted as text.
        drug_choice: Selected drug; the Chinese name is preferred over the
            English name.

    Returns:
        The full prompt string, ending with the answer marker.
    """
    # Known patient details, rendered as "label: value" pairs.
    labelled_fields = (("劑量", "strength"), ("劑型", "dosage_form"))
    known_details = [
        f"{label}: {parsed_info[key]}"
        for label, key in labelled_fields
        if parsed_info.get(key)
    ]
    patient_summary = " ".join(known_details) or "未提供"
    drug_label = drug_choice.get('drug_name_zh') or drug_choice.get('drug_name_en')

    prompt_parts = [
        "你是一位專業、有同理心的藥師。請根據提供的「參考片段」，並考量「病患已知資訊」，簡潔地回答使用者的「問題」。\n",
        "---限制---\n",
        "- 絕對忠於「參考片段」，不可捏造或過度推論。你的知識僅限於提供的片段。\n",
        "- 回覆少於 120 字，並使用繁體中文條列式 2-4 點說明。\n",
        "- 語氣親切、精簡、專業。\n",
        "- 若片段中無足夠資訊回答，必須回覆：「根據提供的資料，我無法找到關於您問題的明確答案，建議您諮詢醫師或藥師。」\n",
        "---輸入資訊---\n",
        f"藥物名稱: {drug_label}\n",
        f"病患已知資訊: {patient_summary}\n",
        f"問題: {parsed_info.get('raw_query')}\n\n",
        f"參考片段:\n{contexts}\n",
        "---你的回答---",
    ]
    return "".join(prompt_parts)
 def call_llm(prompt: str, max_tokens: int = 2048) -> Optional[str]:
         t0 = time.time()
         resp = client.chat.completions.create(
             model=LM_MODEL,
+            messages=[{"role": "user", "content": prompt}], # MODIFIED: Simplified to user role only, as system prompt is now part of the main prompt
+            temperature=0.1, # MODIFIED: slightly lower temperature for more deterministic answers
+            timeout=15, # MODIFIED: slightly longer timeout
             max_tokens=max_tokens,
         )
         used = time.time() - t0
         log.warning("LLM 失敗：%s", e)
         return None
def make_clarify_message() -> str:
    """Return the generic message asking the user for drug name, strength/form and question.

    The module-level ``DISCLAIMER`` is appended after a blank line.
    """
    request_lines = [
        "我需要更多資訊才能準確回答，請您提供：",
        "1. 完整的藥物名稱",
        "2. 劑量和劑型（例如：普拿疼 500mg 錠劑）",
        "3. 您的具體問題",
    ]
    request_text = "\n".join(request_lines)
    return f"{request_text}\n\n{DISCLAIMER}"
 def handle_error(code: str) -> str:
     log.error(f"Pipeline error: {code}")
+    return f"抱歉，系統暫時無法回覆 ({code})。請諮詢醫師或藥師。{DISCLAIMER}"
+# ---------- 主流程 (MODIFIED) ----------
 async def answer_pipeline(query: str, user_id: str) -> str:
     log.info("Pipeline start for user_id: %s, query: %s", user_id, query[:50])
     if not query or not isinstance(query, str):
         return handle_error("INVALID_QUERY")
     if not STATE.sentences:
         return handle_error("NO_CORPUS")
+    # 1. 解析使用者輸入
+    parsed_info = parse_user_message(query)
+    # 2. 尋找藥物候選
+    drug_candidates = find_drug_candidates(parsed_info, STATE.df_csv)
+    # 3. 選擇最佳藥物或要求澄清
+    drug_choice_or_clarification = select_best_drug_candidate(drug_candidates)
+    if drug_choice_or_clarification is None:
+        log.warning("No confident drug match found.")
+        return make_clarify_message()
+    if isinstance(drug_choice_or_clarification, str): # It's a clarification message
+        log.info("Requesting clarification from user.")
+        return drug_choice_or_clarification + f"\n\n{DISCLAIMER}"
+    drug_choice = drug_choice_or_clarification
+    log.info("Selected drug: %s", drug_choice)
+    # 4. 檢索相關內文
+    idxs = fuse_and_select(
+        query=parsed_info["raw_query"],
+        sentences=STATE.sentences,
+        meta=STATE.meta,
+        bm25=STATE.bm25,
+        index=STATE.faiss_index,
+        emb_model=STATE.emb_model,
+        reranker=STATE.reranker_model,
+        top_k=TOP_K_SENTENCES,
+        drug_id=drug_choice['drug_id'],
+        parsed_info=parsed_info
+    )
+    if not idxs:
+        return handle_error("NO_CONTEXT")
+    # 5. 建立上下文和 Prompt
+    context = build_context(idxs, STATE.sentences, STATE.meta)
+    prompt = build_prompt(parsed_info, context, drug_choice)
+    log.info("Generated Prompt:\n%s", prompt)
+    # 6. 呼叫 LLM 生成答案
+    answer = call_llm(prompt)
+    if not answer:
+        return handle_error("LLM_ERROR")
+    return f"{answer}\n\n{DISCLAIMER}"
 # ---------- LINE 驗簽與回覆 ----------
 def verify_line_signature(body_bytes: bytes, signature: str) -> bool: