Song committed on
Commit
032fccb
·
1 Parent(s): 81f1ba7
Files changed (1) hide show
  1. app.py +194 -277
app.py CHANGED
@@ -92,7 +92,7 @@ RERANK_THRESHOLD = 0.5
92
  MAX_CONTEXT_CHARS = 8000
93
  DISCLAIMER = "此回覆僅供參考,請遵循醫師/藥師指示。"
94
 
95
- # 藥名映射與停用詞(擴充)
96
  DRUG_NAME_MAPPING = {
97
  "fentanyl patch": "fentanyl",
98
  "spiriva respimat": "spiriva",
@@ -108,6 +108,7 @@ DRUG_NAME_MAPPING = {
108
  "芬太尼貼片": "fentanyl",
109
  "透皮止痛貼片": "fentanyl",
110
  }
 
111
  DRUG_STOPWORDS = {"藥", "劑", "錠", "膠囊", "糖漿", "乳膏", "貼片", "含錠", "膜衣錠", "緩釋錠", "滴劑", "懸液", "注射液",
112
  "吸入劑", "噴霧", "噴霧劑", "吸入器", "注射筆", "藥水", "小袋", "條", "包", "瓶", "外用", "口服"}
113
 
@@ -146,7 +147,7 @@ SECTION_WEIGHTS = {
146
  }
147
 
148
  IMPORTANT_SECTIONS = ["用法及用量", "病人使用須知", "包裝及儲存", "不良反應", "警語及注意事項"]
149
- DOSAGE_FORM_BOOST = 1.2 # NEW: 劑型匹配的權重提升
150
 
151
  # ---------- 路徑工具 ----------
152
  def pick_existing_or_tmp(candidates: List[str]) -> str:
@@ -321,7 +322,7 @@ def ensure_bm25(pkl_path: str, sentences: List[str]) -> Optional[Any]:
321
  try:
322
  with open(pkl_path, "rb") as f:
323
  bm = pickle.load(f)
324
- # MODIFIED: BM25 has corpus, not corpus_size attribute
325
  n_bm = len(bm.corpus) if hasattr(bm, 'corpus') else 0
326
  if n_bm == len(sentences):
327
  log.info("Loaded BM25: %s (n=%d)", pkl_path, n_bm)
@@ -336,168 +337,216 @@ def ensure_bm25(pkl_path: str, sentences: List[str]) -> Optional[Any]:
336
  safe_pickle_dump(bm, pkl_path)
337
  return bm
338
 
339
- # ---------- 資訊解析與藥名處理 (MODIFIED & NEW) ----------
340
 
341
- def parse_user_message(query: str) -> Dict[str, Any]:
 
342
  """
343
- NEW: 使用更強的正規表示式從使用者問題中提取結構化資訊
344
  """
345
- parsed = {
346
- "drug_name": "", "strength": "", "dosage_form": "", "question": query, "raw_query": query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  }
348
 
349
- # Regex to find drug name, strength (e.g., 500mg, 25 mcg/hr), and dosage form
350
- # It tries to find a noun-like part followed by numbers/units and another noun-like part
351
- drug_pattern = re.compile(
352
- r"([\u4e00-\u9fa5a-zA-Z\s\d\.\-]+?)" # Drug name (non-greedy)
353
- r"[::\s]*?" # Optional separator
354
- r"(\d+(?:\.\d+)?\s*(?:mg|mcg|µg|g|mcg/hr|mg/hr|iu|國際單位))?" # Strength (optional)
355
- r"[\s]*?" # Optional space
356
- r"([\u4e00-\u9fa5a-zA-Z]+(?:劑|錠|膠囊|糖漿|乳膏|貼片|噴劑|噴霧|吸入劑))?" # Dosage form (optional)
357
- , re.I)
358
-
359
- match = drug_pattern.search(query)
360
- question_part = query
361
-
362
- if match:
363
- drug_name = (match.group(1) or "").strip()
364
- strength = (match.group(2) or "").strip()
365
- dosage_form = (match.group(3) or "").strip()
366
 
367
- # Clean up drug name from common question words if they are at the end
368
- drug_name = re.sub(r"(怎麼用|怎麼吃|副作用|的用法)$", "", drug_name).strip()
369
-
370
- parsed.update({"drug_name": drug_name, "strength": strength, "dosage_form": dosage_form})
 
 
 
 
 
371
 
372
- # The rest of the query is the question
373
- question_part = query[match.end():].strip()
374
- if not question_part:
375
- question_part = query # fallback if parsing consumes whole string
 
 
 
 
376
 
377
- parsed["question"] = question_part
378
- log.info("Parsed user message: %s", parsed)
379
- return parsed
380
 
 
 
 
 
 
 
 
381
 
382
- def find_drug_candidates(parsed_info: Dict[str, Any], df: pd.DataFrame) -> List[Dict[str, Any]]:
383
- """
384
- MODIFIED: Find drug candidates based on parsed info and return a ranked list of dicts.
385
- """
386
- if df is None or df.empty or not parsed_info.get("drug_name"):
387
- return []
388
-
389
- query_drug_name = parsed_info["drug_name"].lower()
390
- query_dosage_form = parsed_info["dosage_form"]
391
 
392
- # Use jieba to get core drug name
393
- tokens = tokenize_zh(query_drug_name)
394
- candidates = [t.lower() for t in tokens if len(t) > 1 and t not in DRUG_STOPWORDS]
395
- candidates.append(query_drug_name) # also search the raw name
396
- candidates = list(set([DRUG_NAME_MAPPING.get(c, c) for c in candidates]))
397
 
398
- drug_bucket = []
 
399
 
400
- # Get all unique drug_ids and their names to avoid iterating the whole dataframe repeatedly
401
- unique_drugs = df.drop_duplicates(subset=['drug_id'])
 
 
 
 
 
 
 
 
 
402
 
403
- for cand in candidates:
404
- for _, row in unique_drugs.iterrows():
405
- name_joined = f"{(row.get('drug_name_zh') or '').lower()} {(row.get('drug_name_en') or '').lower()} {(row.get('drug_name_norm') or '').lower()}".strip()
406
-
407
- if not fuzz:
408
- raw_score = 100 if cand in name_joined else 0
409
- else:
410
- raw_score = max(
411
- fuzz.token_set_ratio(cand, name_joined),
412
- fuzz.partial_ratio(cand, name_joined)
413
- )
414
-
415
- if raw_score >= 85:
416
- # Boost score for English name match, length, and dosage form match
417
- score = raw_score
418
- if re.search(r'[a-zA-Z]', cand):
419
- score *= 1.2
420
- if query_dosage_form and query_dosage_form in name_joined:
421
- score *= 1.1
422
-
423
- drug_bucket.append({
424
- "score": score,
425
- "drug_id": row["drug_id"],
426
- "drug_name_zh": row.get('drug_name_zh'),
427
- "drug_name_en": row.get('drug_name_en'),
428
- "matched_term": cand
429
- })
430
-
431
- if not drug_bucket:
432
- return []
433
 
434
- # Sort and deduplicate results
435
- sorted_bucket = sorted(drug_bucket, key=lambda x: x['score'], reverse=True)
436
- seen_ids = set()
437
- unique_top = []
438
- for item in sorted_bucket:
439
- if item['drug_id'] not in seen_ids:
440
- unique_top.append(item)
441
- seen_ids.add(item['drug_id'])
442
 
443
- log.info(f"Found drug candidates: {unique_top[:5]}")
444
- return unique_top[:5] # Return top 5 candidates
445
 
446
- def select_best_drug_candidate(candidates: List[Dict[str, Any]]) -> Union[Dict[str, Any], str, None]:
 
447
  """
448
- NEW: Logic to decide if we have a clear winner or need clarification.
449
  """
450
- if not candidates:
451
- return None
 
 
 
 
 
 
 
 
 
 
 
452
 
453
- # Case 1: The top candidate has a very high score
454
- if candidates[0]['score'] >= 95:
455
- # Check if score difference is big enough or if it's the only one.
456
- if len(candidates) == 1 or (candidates[0]['score'] - candidates[1]['score'] > 10):
457
- return candidates[0]
458
-
459
- # Case 2: One candidate is clearly the best, but score isn't super high
460
- if len(candidates) > 1 and (candidates[0]['score'] - candidates[1]['score']) > 10:
461
- return candidates[0]
462
-
463
- # Case 3: Multiple candidates are close in score, ask for clarification
464
- options = [f"「{c.get('drug_name_zh') or c.get('drug_name_en')}」" for c in candidates[:3]]
465
- return f"請問您指的是以下哪一種藥物?\n- " + "\n- ".join(options)
466
-
467
- # ---------- 意圖偵測 ----------
468
- def detect_intent(query: str) -> List[str]:
469
- intents = []
470
- for cat, kws in INTENT_KEYWORDS.items():
471
- if any(kw in query for kw in kws):
472
- intents.append(cat)
473
- return intents or ["其他"]
474
-
475
- # ---------- 檢索 ----------
476
- def rerank_results(query: str, candidates: List[Tuple[int, float, float, float]], sentences: List[str], reranker: Optional[Any], top_k: int, threshold: float) -> List[Dict[str, Any]]:
477
  try:
478
- candidates = sorted(candidates, key=lambda x: -x[1])[:top_k * 2] # 限20
479
- if not candidates:
480
- return []
481
- valid_indices = [i for (i, *_ ) in candidates if 0 <= i < len(sentences)]
482
- if not valid_indices:
483
- return []
484
- if reranker is None:
485
- return [{"idx": i, "score": fused} for i, fused, _, _ in candidates]
486
- pairs = [[query, sentences[i]] for i in valid_indices]
487
- if not pairs:
488
- return []
489
- scores = reranker.predict(pairs, show_progress_bar=False)
490
- reranked = [{"idx": valid_indices[j], "score": float(scores[j])} for j in range(len(scores)) if float(scores[j]) >= threshold]
491
- if not reranked:
492
- reranked = [{"idx": i, "score": fused} for i, fused, _, _ in candidates]
493
- return sorted(reranked, key=lambda x: -x["score"])[:top_k]
494
  except Exception as e:
495
- log.warning("Rerank failed: %s", e)
496
- return [{"idx": i, "score": fused} for i, fused, _, _ in sorted(candidates, key=lambda x: -x[1])[:top_k]] # fallback
 
 
 
 
 
 
 
 
 
497
 
498
- # MODIFIED: fuse_and_select now accepts parsed_info to boost scores based on dosage form
499
- def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]], bm25: Optional[Any], index: Optional[Any], emb_model: Optional[Any], reranker: Optional[Any], top_k: int = 10, drug_id: str = None, parsed_info: Dict[str, Any] = None) -> List[int]:
500
- clean_query = parsed_info.get("question", query).strip().lower()
 
 
 
 
 
 
 
501
  cache_key = clean_query + str(drug_id)
502
  if cache_key in STATE.query_cache and time.time() - STATE.query_cache[cache_key]['time'] < 180:
503
  log.info("Cache hit for query: %s", clean_query[:50])
@@ -532,19 +581,14 @@ def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]]
532
  scores[i] = scores.get(i, 0.0) + SEM_WEIGHT * (1.0 / (1 + rank))
533
 
534
  # Apply boosts
535
- query_dosage_form = parsed_info.get("dosage_form") if parsed_info else ""
536
  for i in list(scores.keys()): # Iterate over a copy of keys
537
  meta_item = meta[i]
538
 
539
  # Section weight boost
540
  sec = meta_item.get("section", "其他")
541
  scores[i] *= SECTION_WEIGHTS.get(sec, 1.0)
542
-
543
- # NEW: Dosage form boost
544
- if query_dosage_form and query_dosage_form in sentences[i]:
545
- scores[i] *= DOSAGE_FORM_BOOST
546
 
547
- # NEW: Boost based on detected intent
548
  detected_intents = detect_intent(clean_query)
549
  for i in list(scores.keys()):
550
  meta_item = meta[i]
@@ -573,7 +617,6 @@ def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]]
573
  STATE.query_cache[cache_key] = {'idxs': idxs, 'time': time.time()}
574
  return idxs
575
 
576
-
577
  def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, Any]]) -> str:
578
  ctx_lines, total_len, seen = [], 0, set()
579
  for i in idxs:
@@ -582,139 +625,13 @@ def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, An
582
  if text in seen: continue
583
  chunk_id = meta[i].get("chunk_id", "None")
584
  section = meta[i].get("section", "未知章節")
585
- line = f"[{section}]: {text}" # MODIFIED: Show section name for better context
586
  if total_len + len(line) > MAX_CONTEXT_CHARS: break
587
  ctx_lines.append(line)
588
  total_len += len(line) + 1
589
  seen.add(text)
590
  return "\n".join(ctx_lines) or "[未知章節]: 沒有找到相關資料,請諮詢醫師或藥師。"
591
 
592
- # MODIFIED: Prompt now includes structured patient info
593
- def build_prompt(parsed_info: Dict[str, Any], contexts: str, drug_choice: Dict[str, Any]) -> str:
594
-
595
- patient_context_parts = []
596
- if parsed_info.get('strength'):
597
- patient_context_parts.append(f"劑量: {parsed_info['strength']}")
598
- if parsed_info.get('dosage_form'):
599
- patient_context_parts.append(f"劑型: {parsed_info['dosage_form']}")
600
-
601
- patient_context_str = " ".join(patient_context_parts)
602
- if not patient_context_str:
603
- patient_context_str = "未提供"
604
-
605
- return (
606
- "你是一位專業、有同理心的藥師。請根據提供的「參考片段」,並考量「病患已知資訊」,簡潔地回答使用者的「問題」。\n"
607
- "---限制---\n"
608
- "- 絕對忠於「參考片段」,不可捏造或過度推論。你的知識僅限於提供的片段。\n"
609
- "- 回覆少於 120 字,並使用繁體中文條列式 2-4 點說明。\n"
610
- "- 語氣親切、精簡、專業。\n"
611
- "- 若片段中無足夠資訊回答,必須回覆:「根據提供的資料,我無法找到關於您問題的明確答案,建議您諮詢醫師或藥師。」\n"
612
- "---輸入資訊---\n"
613
- f"藥物名稱: {drug_choice.get('drug_name_zh') or drug_choice.get('drug_name_en')}\n"
614
- f"病患已知資訊: {patient_context_str}\n"
615
- f"問題: {parsed_info.get('raw_query')}\n\n"
616
- f"參考片段:\n{contexts}\n"
617
- "---你的回答---"
618
- )
619
-
620
- def call_llm(prompt: str, max_tokens: int = 2048) -> Optional[str]:
621
- try:
622
- from openai import OpenAI
623
- except Exception as e:
624
- log.warning("openai client 不可用:%s", e)
625
- return None
626
- if not (LITELLM_API_KEY and LM_MODEL and LITELLM_BASE_URL):
627
- log.warning("LLM 未完整設定;略過生成。")
628
- return None
629
- client = OpenAI(base_url=LITELLM_BASE_URL, api_key=LITELLM_API_KEY)
630
- try:
631
- t0 = time.time()
632
- resp = client.chat.completions.create(
633
- model=LM_MODEL,
634
- messages=[{"role": "user", "content": prompt}], # MODIFIED: Simplified to user role only, as system prompt is now part of the main prompt
635
- temperature=0.1, # MODIFIED: slightly lower temperature for more deterministic answers
636
- timeout=15, # MODIFIED: slightly longer timeout
637
- max_tokens=max_tokens,
638
- )
639
- used = time.time() - t0
640
- log.info("LLM ok (%.2fs)", used)
641
- return (resp.choices[0].message.content or "").strip()
642
- except Exception as e:
643
- log.warning("LLM 失敗:%s", e)
644
- return None
645
-
646
- def make_clarify_message() -> str:
647
- # MODIFIED: More generic clarification message
648
- msg = (
649
- "我需要更多資訊才能準確回答,請您提供:\n"
650
- "1. 完整的藥物名稱\n"
651
- "2. 劑量和劑型(例如:普拿疼 500mg 錠劑)\n"
652
- "3. 您的具體問題\n\n"
653
- f"{DISCLAIMER}"
654
- )
655
- return msg
656
-
657
- def handle_error(code: str) -> str:
658
- log.error(f"Pipeline error: {code}")
659
- return f"抱歉,系統暫時無法回覆 ({code})。請諮詢醫師或藥師。{DISCLAIMER}"
660
-
661
- # ---------- 主流程 (MODIFIED) ----------
662
- async def answer_pipeline(query: str, user_id: str) -> str:
663
- log.info("Pipeline start for user_id: %s, query: %s", user_id, query[:50])
664
- if not query or not isinstance(query, str):
665
- return handle_error("INVALID_QUERY")
666
- if not STATE.sentences:
667
- return handle_error("NO_CORPUS")
668
-
669
- # 1. 解析使用者輸入
670
- parsed_info = parse_user_message(query)
671
-
672
- # 2. 尋找藥物候選
673
- drug_candidates = find_drug_candidates(parsed_info, STATE.df_csv)
674
-
675
- # 3. 選擇最佳藥物或要求澄清
676
- drug_choice_or_clarification = select_best_drug_candidate(drug_candidates)
677
-
678
- if drug_choice_or_clarification is None:
679
- log.warning("No confident drug match found.")
680
- return make_clarify_message()
681
-
682
- if isinstance(drug_choice_or_clarification, str): # It's a clarification message
683
- log.info("Requesting clarification from user.")
684
- return drug_choice_or_clarification + f"\n\n{DISCLAIMER}"
685
-
686
- drug_choice = drug_choice_or_clarification
687
- log.info("Selected drug: %s", drug_choice)
688
-
689
- # 4. 檢索相關內文
690
- idxs = fuse_and_select(
691
- query=parsed_info["raw_query"],
692
- sentences=STATE.sentences,
693
- meta=STATE.meta,
694
- bm25=STATE.bm25,
695
- index=STATE.faiss_index,
696
- emb_model=STATE.emb_model,
697
- reranker=STATE.reranker_model,
698
- top_k=TOP_K_SENTENCES,
699
- drug_id=drug_choice['drug_id'],
700
- parsed_info=parsed_info
701
- )
702
-
703
- if not idxs:
704
- return handle_error("NO_CONTEXT")
705
-
706
- # 5. 建立上下文和 Prompt
707
- context = build_context(idxs, STATE.sentences, STATE.meta)
708
- prompt = build_prompt(parsed_info, context, drug_choice)
709
- log.info("Generated Prompt:\n%s", prompt)
710
-
711
- # 6. 呼叫 LLM 生成答案
712
- answer = call_llm(prompt)
713
- if not answer:
714
- return handle_error("LLM_ERROR")
715
-
716
- return f"{answer}\n\n{DISCLAIMER}"
717
-
718
  # ---------- LINE 驗簽與回覆 ----------
719
  def verify_line_signature(body_bytes: bytes, signature: str) -> bool:
720
  if not CHANNEL_SECRET:
@@ -801,4 +718,4 @@ async def health():
801
  if __name__ == "__main__":
802
  import uvicorn
803
  port = int(os.getenv("PORT", "7860"))
804
- uvicorn.run("app:app", host="0.0.0.0", port=port, log_level=LOG_LEVEL.lower(), reload=False)
 
92
  MAX_CONTEXT_CHARS = 8000
93
  DISCLAIMER = "此回覆僅供參考,請遵循醫師/藥師指示。"
94
 
95
+ # 藥名映射與停用詞
96
  DRUG_NAME_MAPPING = {
97
  "fentanyl patch": "fentanyl",
98
  "spiriva respimat": "spiriva",
 
108
  "芬太尼貼片": "fentanyl",
109
  "透皮止痛貼片": "fentanyl",
110
  }
111
+
112
  DRUG_STOPWORDS = {"藥", "劑", "錠", "膠囊", "糖漿", "乳膏", "貼片", "含錠", "膜衣錠", "緩釋錠", "滴劑", "懸液", "注射液",
113
  "吸入劑", "噴霧", "噴霧劑", "吸入器", "注射筆", "藥水", "小袋", "條", "包", "瓶", "外用", "口服"}
114
 
 
147
  }
148
 
149
  IMPORTANT_SECTIONS = ["用法及用量", "病人使用須知", "包裝及儲存", "不良反應", "警語及注意事項"]
150
+ # 移除 DOSAGE_FORM_BOOST
151
 
152
  # ---------- 路徑工具 ----------
153
  def pick_existing_or_tmp(candidates: List[str]) -> str:
 
322
  try:
323
  with open(pkl_path, "rb") as f:
324
  bm = pickle.load(f)
325
+ # BM25 has corpus, not corpus_size attribute
326
  n_bm = len(bm.corpus) if hasattr(bm, 'corpus') else 0
327
  if n_bm == len(sentences):
328
  log.info("Loaded BM25: %s (n=%d)", pkl_path, n_bm)
 
337
  safe_pickle_dump(bm, pkl_path)
338
  return bm
339
 
340
+ # ---------- 資訊解析與藥名處理 (簡化) ----------
341
 
342
+ # 1. parse_user_message: 簡化為只比對藥名
343
def parse_user_message(query: str, df: pd.DataFrame, min_score: int = 80) -> Dict[str, Any]:
    """Resolve which drug a user message refers to by matching ``drug_name_norm``.

    The whole message is first compared, case-insensitively, against every
    ``drug_name_norm`` value (one row per distinct ``drug_id``). If nothing
    matches exactly, each name is scored with ``fuzz.token_set_ratio`` and the
    highest-scoring drug is accepted only when its score reaches ``min_score``.

    Args:
        query: Raw user message.
        df: Drug table; the columns read here are ``drug_id`` and
            ``drug_name_norm``.
        min_score: Minimum fuzzy score (0-100 scale) required to accept the
            best fuzzy match. Defaults to 80.

    Returns:
        A dict with keys ``drug_name``, ``drug_id`` and ``question``.
        ``drug_name`` and ``drug_id`` are ``None`` when no confident match
        exists; ``question`` is always the unmodified ``query``.
    """
    best_drug = None
    best_id = None
    max_score = 0

    if not fuzz:
        log.warning("fuzzywuzzy not available; skipping fuzzy match.")
        return {
            "drug_name": None,
            "drug_id": None,
            "question": query,
        }

    query_lower = query.lower().strip()

    # A missing/empty table or a blank query can never match; fall through to
    # the shared "no confident match" exit instead of raising.
    if df is not None and not df.empty and query_lower:
        unique_drugs = df.drop_duplicates(subset=['drug_id'])

        # Missing names come back from pandas as NaN (a truthy float), so
        # normalise them to "" before any string operation.
        names_lower = unique_drugs['drug_name_norm'].fillna("").astype(str).str.lower()

        direct_match = unique_drugs[names_lower == query_lower]
        if not direct_match.empty:
            best_row = direct_match.iloc[0]
            best_drug = best_row["drug_name_norm"]
            best_id = best_row["drug_id"]
            # An exact match is the best possible result; without this the
            # threshold check below would reject it (max_score stayed at 0).
            max_score = 100
            log.info(f"Direct match found: {best_drug}")
        else:
            for drug_norm, drug_id in zip(names_lower, unique_drugs['drug_id']):
                if not drug_norm:
                    continue  # nameless rows cannot be matched
                score = fuzz.token_set_ratio(query_lower, drug_norm)
                if score > max_score:
                    max_score = score
                    best_drug = drug_norm
                    best_id = drug_id

    # Threshold guards against unrelated names being picked up.
    if best_drug is None or max_score < min_score:
        log.warning(f"No confident drug match found (score: {max_score})")
        return {
            "drug_name": None,
            "drug_id": None,
            "question": query,
        }

    log.info(f"Parsed user message (best match): {best_drug}, score: {max_score}")

    return {
        "drug_name": best_drug,
        "drug_id": best_id,
        "question": query
    }
395
 
396
+ # 2. find_drug_candidates: 簡化為單純 fuzzy 比對
397
def find_drug_candidates(parsed_info: Dict[str, Any], df: pd.DataFrame, top_k: int = 5) -> List[Dict[str, Any]]:
    """Rank drugs by fuzzy similarity between the user question and ``drug_name_norm``.

    Every distinct ``drug_id`` row that has a name is scored with
    ``fuzz.token_set_ratio`` against the lower-cased ``question`` text, and
    the ``top_k`` highest-scoring entries are returned.

    Args:
        parsed_info: Dict whose ``question`` entry holds the user text.
        df: Drug table; the columns read here are ``drug_id`` and
            ``drug_name_norm``.
        top_k: Maximum number of candidates to return.

    Returns:
        A list of dicts with keys ``drug_id``, ``drug_name`` (lower-cased)
        and ``score``, sorted by descending score. Empty when the table or
        question is missing, ``fuzz`` is unavailable, or ``top_k`` is not
        positive.
    """
    if df is None or df.empty or top_k <= 0:
        return []

    # ``question`` may be absent or explicitly None; treat both as empty.
    query_text = (parsed_info.get("question") or "").lower()
    if not query_text:
        return []

    if not fuzz:
        return []

    candidates_list = []
    unique_drugs = df.drop_duplicates(subset=['drug_id'])

    for _, row in unique_drugs.iterrows():
        raw_name = row.get('drug_name_norm')
        # pandas represents missing names as NaN (a truthy float); skip
        # anything that is not a non-empty string rather than crashing or
        # producing a blank option for the user.
        if not isinstance(raw_name, str) or not raw_name:
            continue
        drug_norm = raw_name.lower()
        score = fuzz.token_set_ratio(query_text, drug_norm)

        candidates_list.append({
            "drug_id": row["drug_id"],
            "drug_name": drug_norm,
            "score": score
        })

    # Sort by score (stable, so table order breaks ties) and keep the top_k.
    sorted_candidates = sorted(candidates_list, key=lambda x: x['score'], reverse=True)

    log.info(f"Found drug candidates: {sorted_candidates[:top_k]}")
    return sorted_candidates[:top_k]
 
426
 
427
+ # 3. answer_pipeline: 簡化流程
428
async def answer_pipeline(query: str, user_id: str) -> str:
    """Answer a user's drug question end to end.

    Steps: validate the input, resolve the drug with ``parse_user_message``,
    confirm the choice is unambiguous using ``find_drug_candidates``,
    retrieve passages with ``fuse_and_select``, build the prompt and call the
    LLM.

    Args:
        query: Raw user message.
        user_id: Identifier of the requesting user (used for logging only).

    Returns:
        The text to send back to the user: the LLM answer followed by the
        disclaimer, a clarification request, or an error message produced by
        ``handle_error``.
    """
    # Validate before logging: slicing a non-string query would raise.
    if not query or not isinstance(query, str):
        return handle_error("INVALID_QUERY")
    log.info("Pipeline start for user_id: %s, query: %s", user_id, query[:50])

    # A DataFrame has no unambiguous truth value (``not df`` raises
    # ValueError), so test for None / emptiness explicitly.
    drug_table = STATE.df_csv
    if not STATE.sentences or drug_table is None or drug_table.empty:
        return handle_error("NO_CORPUS")

    # 1. Parse the user input and find the best-matching drug.
    best_drug_info = parse_user_message(query, drug_table)

    # Compare against None so that a legitimate drug_id of 0 is not rejected.
    if best_drug_info.get("drug_id") is None:
        log.warning("No confident drug match found.")
        return make_clarify_message()

    # 2. Build the ranked candidate list.
    drug_candidates = find_drug_candidates(best_drug_info, drug_table)
    if not drug_candidates:
        # Nothing to offer as options; ask for more detail instead of
        # sending a clarification with an empty list.
        log.warning("No drug candidates available for clarification.")
        return make_clarify_message()

    # 3. Accept the drug when the top score is >= 95 or it leads the
    #    runner-up by more than 10 points; otherwise ask the user.
    top_score = drug_candidates[0]['score']
    second_score = drug_candidates[1]['score'] if len(drug_candidates) > 1 else 0

    if top_score >= 95 or (top_score - second_score) > 10:
        log.info("Confidently selected drug: %s", best_drug_info['drug_name'])
        drug_choice = best_drug_info
    else:
        log.info("Scores are too close, requesting clarification.")
        options = [f"「{c.get('drug_name')}」" for c in drug_candidates[:3]]
        return f"請問您指的是以下哪一種藥物?\n- " + "\n- ".join(options) + f"\n\n{DISCLAIMER}"

    # 4. Retrieve relevant passages for the selected drug.
    idxs = fuse_and_select(
        query=best_drug_info["question"],
        sentences=STATE.sentences,
        meta=STATE.meta,
        bm25=STATE.bm25,
        index=STATE.faiss_index,
        emb_model=STATE.emb_model,
        reranker=STATE.reranker_model,
        top_k=TOP_K_SENTENCES,
        drug_id=drug_choice['drug_id'],
    )

    if not idxs:
        return handle_error("NO_CONTEXT")

    # 5. Build the context and the prompt.
    context = build_context(idxs, STATE.sentences, STATE.meta)
    prompt = build_prompt(best_drug_info, context, drug_choice)
    log.info("Generated Prompt:\n%s", prompt)

    # 6. Ask the LLM for the answer.
    # NOTE(review): call_llm is invoked synchronously from this coroutine;
    # confirm whether blocking the event loop here is acceptable.
    answer = call_llm(prompt)
    if not answer:
        return handle_error("LLM_ERROR")

    return f"{answer}\n\n{DISCLAIMER}"
 
485
 
486
+ # 4. build_prompt: 簡化提示詞
487
def build_prompt(parsed_info: Dict[str, Any], contexts: str, drug_choice: Dict[str, Any]) -> str:
    """Assemble the LLM prompt from the drug name, the user question and the passages.

    Args:
        parsed_info: Dict whose ``question`` entry is the user's question.
        contexts: Reference passages, already joined into one string.
        drug_choice: Dict whose ``drug_name`` entry names the selected drug.

    Returns:
        The full prompt: role and constraints, then the drug name, question
        and reference passages, ending with the answer marker.
    """
    drug_label = drug_choice.get('drug_name')
    user_question = parsed_info.get('question')

    # One entry per output line; joining with "\n" reproduces the layout,
    # including the blank line between the question and the passages.
    prompt_lines = [
        "你是一位專業、有同理心的藥師。請根據提供的「參考片段」,簡潔地回答使用者的「問題」。",
        "---限制---",
        "- 絕對忠於「參考片段」,不可捏造或過度推論。你的知識僅限於提供的片段。",
        "- 回覆少於 120 字,並使用繁體中文條列式 2-4 點說明。",
        "- 語氣親切、精簡、專業。",
        "- 若片段中無足夠資訊回答,必須回覆:「根據提供的資料,我無法找到關於您問題的明確答案,建議您諮詢醫師或藥師。」",
        "---輸入資訊---",
        f"藥物名稱: {drug_label}",
        f"問題: {user_question}",
        "",
        "參考片段:",
        f"{contexts}",
        "---你的回答---",
    ]
    return "\n".join(prompt_lines)
504
 
505
def call_llm(prompt: str, max_tokens: int = 2048) -> Optional[str]:
    """Send ``prompt`` as a single user message to the configured chat model.

    Args:
        prompt: Complete prompt text.
        max_tokens: Upper bound on generated tokens passed to the API.

    Returns:
        The stripped reply text, or ``None`` when the ``openai`` client cannot
        be imported, the LLM settings are incomplete, or the request fails.
    """
    # Imported lazily so that a missing client only disables generation.
    try:
        from openai import OpenAI
    except Exception as exc:
        log.warning("openai client 不可用:%s", exc)
        return None

    settings_complete = LITELLM_API_KEY and LM_MODEL and LITELLM_BASE_URL
    if not settings_complete:
        log.warning("LLM 未完整設定;略過生成。")
        return None

    client = OpenAI(base_url=LITELLM_BASE_URL, api_key=LITELLM_API_KEY)
    chat_messages = [{"role": "user", "content": prompt}]

    try:
        started = time.time()
        completion = client.chat.completions.create(
            model=LM_MODEL,
            messages=chat_messages,
            temperature=0.1,
            timeout=15,
            max_tokens=max_tokens,
        )
        elapsed = time.time() - started
        log.info("LLM ok (%.2fs)", elapsed)
        reply = completion.choices[0].message.content or ""
        return reply.strip()
    except Exception as exc:
        # Any failure in the request or in reading the reply is logged and
        # reported to the caller as "no answer".
        log.warning("LLM 失敗:%s", exc)
        return None
530
+
531
def make_clarify_message() -> str:
    """Return the message asking the user for the drug name and their question.

    The text ends with the module-level ``DISCLAIMER``.
    """
    # One entry per output line; the empty entry yields the blank line that
    # separates the numbered list from the disclaimer.
    message_lines = [
        "我需要更多資訊才能準確回答,請您提供:",
        "1. 完整的藥物名稱",
        "2. 您的具體問題",
        "",
        f"{DISCLAIMER}",
    ]
    return "\n".join(message_lines)
539
 
540
def handle_error(code: str) -> str:
    """Log a pipeline error code and return the user-facing apology text.

    Args:
        code: Short error identifier; it is logged and embedded in the reply.

    Returns:
        The apology message containing ``code``, followed by ``DISCLAIMER``.
    """
    log_line = f"Pipeline error: {code}"
    log.error(log_line)
    reply = f"抱歉,系統暫時無法回覆 ({code})。請諮詢醫師或藥師。{DISCLAIMER}"
    return reply
543
+
544
+ # 5. fuse_and_select: 移除劑型加權
545
+ def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]], bm25: Optional[Any], index: Optional[Any], emb_model: Optional[Any], reranker: Optional[Any], top_k: int = 10, drug_id: str = None) -> List[int]:
546
+ """
547
+ MODIFIED: 移除劑型加權。只保留 BM25/FAISS 融合 + 章節加權 + 意圖加權。
548
+ """
549
+ clean_query = query.strip().lower()
550
  cache_key = clean_query + str(drug_id)
551
  if cache_key in STATE.query_cache and time.time() - STATE.query_cache[cache_key]['time'] < 180:
552
  log.info("Cache hit for query: %s", clean_query[:50])
 
581
  scores[i] = scores.get(i, 0.0) + SEM_WEIGHT * (1.0 / (1 + rank))
582
 
583
  # Apply boosts
 
584
  for i in list(scores.keys()): # Iterate over a copy of keys
585
  meta_item = meta[i]
586
 
587
  # Section weight boost
588
  sec = meta_item.get("section", "其他")
589
  scores[i] *= SECTION_WEIGHTS.get(sec, 1.0)
 
 
 
 
590
 
591
+ # Boost based on detected intent
592
  detected_intents = detect_intent(clean_query)
593
  for i in list(scores.keys()):
594
  meta_item = meta[i]
 
617
  STATE.query_cache[cache_key] = {'idxs': idxs, 'time': time.time()}
618
  return idxs
619
 
 
620
  def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, Any]]) -> str:
621
  ctx_lines, total_len, seen = [], 0, set()
622
  for i in idxs:
 
625
  if text in seen: continue
626
  chunk_id = meta[i].get("chunk_id", "None")
627
  section = meta[i].get("section", "未知章節")
628
+ line = f"[{section}]: {text}"
629
  if total_len + len(line) > MAX_CONTEXT_CHARS: break
630
  ctx_lines.append(line)
631
  total_len += len(line) + 1
632
  seen.add(text)
633
  return "\n".join(ctx_lines) or "[未知章節]: 沒有找到相關資料,請諮詢醫師或藥師。"
634
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  # ---------- LINE 驗簽與回覆 ----------
636
  def verify_line_signature(body_bytes: bytes, signature: str) -> bool:
637
  if not CHANNEL_SECRET:
 
718
  if __name__ == "__main__":
719
  import uvicorn
720
  port = int(os.getenv("PORT", "7860"))
721
+ uvicorn.run("app:app", host="0.0.0.0", port=port, log_level=LOG_LEVEL.lower(), reload=False)