Spaces:
Sleeping
Sleeping
Song
committed on
Commit
·
032fccb
1
Parent(s):
81f1ba7
hi
Browse files
app.py
CHANGED
@@ -92,7 +92,7 @@ RERANK_THRESHOLD = 0.5
|
|
92 |
MAX_CONTEXT_CHARS = 8000
|
93 |
DISCLAIMER = "此回覆僅供參考,請遵循醫師/藥師指示。"
|
94 |
|
95 |
-
#
|
96 |
DRUG_NAME_MAPPING = {
|
97 |
"fentanyl patch": "fentanyl",
|
98 |
"spiriva respimat": "spiriva",
|
@@ -108,6 +108,7 @@ DRUG_NAME_MAPPING = {
|
|
108 |
"芬太尼貼片": "fentanyl",
|
109 |
"透皮止痛貼片": "fentanyl",
|
110 |
}
|
|
|
111 |
DRUG_STOPWORDS = {"藥", "劑", "錠", "膠囊", "糖漿", "乳膏", "貼片", "含錠", "膜衣錠", "緩釋錠", "滴劑", "懸液", "注射液",
|
112 |
"吸入劑", "噴霧", "噴霧劑", "吸入器", "注射筆", "藥水", "小袋", "條", "包", "瓶", "外用", "口服"}
|
113 |
|
@@ -146,7 +147,7 @@ SECTION_WEIGHTS = {
|
|
146 |
}
|
147 |
|
148 |
IMPORTANT_SECTIONS = ["用法及用量", "病人使用須知", "包裝及儲存", "不良反應", "警語及注意事項"]
|
149 |
-
|
150 |
|
151 |
# ---------- 路徑工具 ----------
|
152 |
def pick_existing_or_tmp(candidates: List[str]) -> str:
|
@@ -321,7 +322,7 @@ def ensure_bm25(pkl_path: str, sentences: List[str]) -> Optional[Any]:
|
|
321 |
try:
|
322 |
with open(pkl_path, "rb") as f:
|
323 |
bm = pickle.load(f)
|
324 |
-
#
|
325 |
n_bm = len(bm.corpus) if hasattr(bm, 'corpus') else 0
|
326 |
if n_bm == len(sentences):
|
327 |
log.info("Loaded BM25: %s (n=%d)", pkl_path, n_bm)
|
@@ -336,168 +337,216 @@ def ensure_bm25(pkl_path: str, sentences: List[str]) -> Optional[Any]:
|
|
336 |
safe_pickle_dump(bm, pkl_path)
|
337 |
return bm
|
338 |
|
339 |
-
# ---------- 資訊解析與藥名處理 (
|
340 |
|
341 |
-
|
|
|
342 |
"""
|
343 |
-
|
344 |
"""
|
345 |
-
|
346 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
}
|
348 |
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
, re.I)
|
358 |
-
|
359 |
-
match = drug_pattern.search(query)
|
360 |
-
question_part = query
|
361 |
-
|
362 |
-
if match:
|
363 |
-
drug_name = (match.group(1) or "").strip()
|
364 |
-
strength = (match.group(2) or "").strip()
|
365 |
-
dosage_form = (match.group(3) or "").strip()
|
366 |
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
|
|
|
|
|
|
|
|
|
|
371 |
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
|
|
|
|
|
|
|
|
376 |
|
377 |
-
|
378 |
-
|
379 |
-
return parsed
|
380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
381 |
|
382 |
-
|
383 |
-
|
384 |
-
MODIFIED: Find drug candidates based on parsed info and return a ranked list of dicts.
|
385 |
-
"""
|
386 |
-
if df is None or df.empty or not parsed_info.get("drug_name"):
|
387 |
-
return []
|
388 |
-
|
389 |
-
query_drug_name = parsed_info["drug_name"].lower()
|
390 |
-
query_dosage_form = parsed_info["dosage_form"]
|
391 |
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
candidates.append(query_drug_name) # also search the raw name
|
396 |
-
candidates = list(set([DRUG_NAME_MAPPING.get(c, c) for c in candidates]))
|
397 |
|
398 |
-
|
|
|
399 |
|
400 |
-
#
|
401 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
402 |
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
"score": score,
|
425 |
-
"drug_id": row["drug_id"],
|
426 |
-
"drug_name_zh": row.get('drug_name_zh'),
|
427 |
-
"drug_name_en": row.get('drug_name_en'),
|
428 |
-
"matched_term": cand
|
429 |
-
})
|
430 |
-
|
431 |
-
if not drug_bucket:
|
432 |
-
return []
|
433 |
|
434 |
-
#
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
for item in sorted_bucket:
|
439 |
-
if item['drug_id'] not in seen_ids:
|
440 |
-
unique_top.append(item)
|
441 |
-
seen_ids.add(item['drug_id'])
|
442 |
|
443 |
-
|
444 |
-
return unique_top[:5] # Return top 5 candidates
|
445 |
|
446 |
-
|
|
|
447 |
"""
|
448 |
-
|
449 |
"""
|
450 |
-
|
451 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
452 |
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
return
|
462 |
-
|
463 |
-
# Case 3: Multiple candidates are close in score, ask for clarification
|
464 |
-
options = [f"「{c.get('drug_name_zh') or c.get('drug_name_en')}」" for c in candidates[:3]]
|
465 |
-
return f"請問您指的是以下哪一種藥物?\n- " + "\n- ".join(options)
|
466 |
-
|
467 |
-
# ---------- 意圖偵測 ----------
|
468 |
-
def detect_intent(query: str) -> List[str]:
|
469 |
-
intents = []
|
470 |
-
for cat, kws in INTENT_KEYWORDS.items():
|
471 |
-
if any(kw in query for kw in kws):
|
472 |
-
intents.append(cat)
|
473 |
-
return intents or ["其他"]
|
474 |
-
|
475 |
-
# ---------- 檢索 ----------
|
476 |
-
def rerank_results(query: str, candidates: List[Tuple[int, float, float, float]], sentences: List[str], reranker: Optional[Any], top_k: int, threshold: float) -> List[Dict[str, Any]]:
|
477 |
try:
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
scores = reranker.predict(pairs, show_progress_bar=False)
|
490 |
-
reranked = [{"idx": valid_indices[j], "score": float(scores[j])} for j in range(len(scores)) if float(scores[j]) >= threshold]
|
491 |
-
if not reranked:
|
492 |
-
reranked = [{"idx": i, "score": fused} for i, fused, _, _ in candidates]
|
493 |
-
return sorted(reranked, key=lambda x: -x["score"])[:top_k]
|
494 |
except Exception as e:
|
495 |
-
log.warning("
|
496 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
497 |
|
498 |
-
|
499 |
-
|
500 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
501 |
cache_key = clean_query + str(drug_id)
|
502 |
if cache_key in STATE.query_cache and time.time() - STATE.query_cache[cache_key]['time'] < 180:
|
503 |
log.info("Cache hit for query: %s", clean_query[:50])
|
@@ -532,19 +581,14 @@ def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]]
|
|
532 |
scores[i] = scores.get(i, 0.0) + SEM_WEIGHT * (1.0 / (1 + rank))
|
533 |
|
534 |
# Apply boosts
|
535 |
-
query_dosage_form = parsed_info.get("dosage_form") if parsed_info else ""
|
536 |
for i in list(scores.keys()): # Iterate over a copy of keys
|
537 |
meta_item = meta[i]
|
538 |
|
539 |
# Section weight boost
|
540 |
sec = meta_item.get("section", "其他")
|
541 |
scores[i] *= SECTION_WEIGHTS.get(sec, 1.0)
|
542 |
-
|
543 |
-
# NEW: Dosage form boost
|
544 |
-
if query_dosage_form and query_dosage_form in sentences[i]:
|
545 |
-
scores[i] *= DOSAGE_FORM_BOOST
|
546 |
|
547 |
-
#
|
548 |
detected_intents = detect_intent(clean_query)
|
549 |
for i in list(scores.keys()):
|
550 |
meta_item = meta[i]
|
@@ -573,7 +617,6 @@ def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]]
|
|
573 |
STATE.query_cache[cache_key] = {'idxs': idxs, 'time': time.time()}
|
574 |
return idxs
|
575 |
|
576 |
-
|
577 |
def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, Any]]) -> str:
|
578 |
ctx_lines, total_len, seen = [], 0, set()
|
579 |
for i in idxs:
|
@@ -582,139 +625,13 @@ def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, An
|
|
582 |
if text in seen: continue
|
583 |
chunk_id = meta[i].get("chunk_id", "None")
|
584 |
section = meta[i].get("section", "未知章節")
|
585 |
-
line = f"[{section}]: {text}"
|
586 |
if total_len + len(line) > MAX_CONTEXT_CHARS: break
|
587 |
ctx_lines.append(line)
|
588 |
total_len += len(line) + 1
|
589 |
seen.add(text)
|
590 |
return "\n".join(ctx_lines) or "[未知章節]: 沒有找到相關資料,請諮詢醫師或藥師。"
|
591 |
|
592 |
-
# MODIFIED: Prompt now includes structured patient info
|
593 |
-
def build_prompt(parsed_info: Dict[str, Any], contexts: str, drug_choice: Dict[str, Any]) -> str:
|
594 |
-
|
595 |
-
patient_context_parts = []
|
596 |
-
if parsed_info.get('strength'):
|
597 |
-
patient_context_parts.append(f"劑量: {parsed_info['strength']}")
|
598 |
-
if parsed_info.get('dosage_form'):
|
599 |
-
patient_context_parts.append(f"劑型: {parsed_info['dosage_form']}")
|
600 |
-
|
601 |
-
patient_context_str = " ".join(patient_context_parts)
|
602 |
-
if not patient_context_str:
|
603 |
-
patient_context_str = "未提供"
|
604 |
-
|
605 |
-
return (
|
606 |
-
"你是一位專業、有同理心的藥師。請根據提供的「參考片段」,並考量「病患已知資訊」,簡潔地回答使用者的「問題」。\n"
|
607 |
-
"---限制---\n"
|
608 |
-
"- 絕對忠於「參考片段」,不可捏造或過度推論。你的知識僅限於提供的片段。\n"
|
609 |
-
"- 回覆少於 120 字,並使用繁體中文條列式 2-4 點說明。\n"
|
610 |
-
"- 語氣親切、精簡、專業。\n"
|
611 |
-
"- 若片段中無足夠資訊回答,必須回覆:「根據提供的資料,我無法找到關於您問題的明確答案,建議您諮詢醫師或藥師。」\n"
|
612 |
-
"---輸入資訊---\n"
|
613 |
-
f"藥物名稱: {drug_choice.get('drug_name_zh') or drug_choice.get('drug_name_en')}\n"
|
614 |
-
f"病患已知資訊: {patient_context_str}\n"
|
615 |
-
f"問題: {parsed_info.get('raw_query')}\n\n"
|
616 |
-
f"參考片段:\n{contexts}\n"
|
617 |
-
"---你的回答---"
|
618 |
-
)
|
619 |
-
|
620 |
-
def call_llm(prompt: str, max_tokens: int = 2048) -> Optional[str]:
|
621 |
-
try:
|
622 |
-
from openai import OpenAI
|
623 |
-
except Exception as e:
|
624 |
-
log.warning("openai client 不可用:%s", e)
|
625 |
-
return None
|
626 |
-
if not (LITELLM_API_KEY and LM_MODEL and LITELLM_BASE_URL):
|
627 |
-
log.warning("LLM 未完整設定;略過生成。")
|
628 |
-
return None
|
629 |
-
client = OpenAI(base_url=LITELLM_BASE_URL, api_key=LITELLM_API_KEY)
|
630 |
-
try:
|
631 |
-
t0 = time.time()
|
632 |
-
resp = client.chat.completions.create(
|
633 |
-
model=LM_MODEL,
|
634 |
-
messages=[{"role": "user", "content": prompt}], # MODIFIED: Simplified to user role only, as system prompt is now part of the main prompt
|
635 |
-
temperature=0.1, # MODIFIED: slightly lower temperature for more deterministic answers
|
636 |
-
timeout=15, # MODIFIED: slightly longer timeout
|
637 |
-
max_tokens=max_tokens,
|
638 |
-
)
|
639 |
-
used = time.time() - t0
|
640 |
-
log.info("LLM ok (%.2fs)", used)
|
641 |
-
return (resp.choices[0].message.content or "").strip()
|
642 |
-
except Exception as e:
|
643 |
-
log.warning("LLM 失敗:%s", e)
|
644 |
-
return None
|
645 |
-
|
646 |
-
def make_clarify_message() -> str:
|
647 |
-
# MODIFIED: More generic clarification message
|
648 |
-
msg = (
|
649 |
-
"我需要更多資訊才能準確回答,請您提供:\n"
|
650 |
-
"1. 完整的藥物名稱\n"
|
651 |
-
"2. 劑量和劑型(例如:普拿疼 500mg 錠劑)\n"
|
652 |
-
"3. 您的具體問題\n\n"
|
653 |
-
f"{DISCLAIMER}"
|
654 |
-
)
|
655 |
-
return msg
|
656 |
-
|
657 |
-
def handle_error(code: str) -> str:
|
658 |
-
log.error(f"Pipeline error: {code}")
|
659 |
-
return f"抱歉,系統暫時無法回覆 ({code})。請諮詢醫師或藥師。{DISCLAIMER}"
|
660 |
-
|
661 |
-
# ---------- 主流程 (MODIFIED) ----------
|
662 |
-
async def answer_pipeline(query: str, user_id: str) -> str:
|
663 |
-
log.info("Pipeline start for user_id: %s, query: %s", user_id, query[:50])
|
664 |
-
if not query or not isinstance(query, str):
|
665 |
-
return handle_error("INVALID_QUERY")
|
666 |
-
if not STATE.sentences:
|
667 |
-
return handle_error("NO_CORPUS")
|
668 |
-
|
669 |
-
# 1. 解析使用者輸入
|
670 |
-
parsed_info = parse_user_message(query)
|
671 |
-
|
672 |
-
# 2. 尋找藥物候選
|
673 |
-
drug_candidates = find_drug_candidates(parsed_info, STATE.df_csv)
|
674 |
-
|
675 |
-
# 3. 選擇最佳藥物或要求澄清
|
676 |
-
drug_choice_or_clarification = select_best_drug_candidate(drug_candidates)
|
677 |
-
|
678 |
-
if drug_choice_or_clarification is None:
|
679 |
-
log.warning("No confident drug match found.")
|
680 |
-
return make_clarify_message()
|
681 |
-
|
682 |
-
if isinstance(drug_choice_or_clarification, str): # It's a clarification message
|
683 |
-
log.info("Requesting clarification from user.")
|
684 |
-
return drug_choice_or_clarification + f"\n\n{DISCLAIMER}"
|
685 |
-
|
686 |
-
drug_choice = drug_choice_or_clarification
|
687 |
-
log.info("Selected drug: %s", drug_choice)
|
688 |
-
|
689 |
-
# 4. 檢索相關內文
|
690 |
-
idxs = fuse_and_select(
|
691 |
-
query=parsed_info["raw_query"],
|
692 |
-
sentences=STATE.sentences,
|
693 |
-
meta=STATE.meta,
|
694 |
-
bm25=STATE.bm25,
|
695 |
-
index=STATE.faiss_index,
|
696 |
-
emb_model=STATE.emb_model,
|
697 |
-
reranker=STATE.reranker_model,
|
698 |
-
top_k=TOP_K_SENTENCES,
|
699 |
-
drug_id=drug_choice['drug_id'],
|
700 |
-
parsed_info=parsed_info
|
701 |
-
)
|
702 |
-
|
703 |
-
if not idxs:
|
704 |
-
return handle_error("NO_CONTEXT")
|
705 |
-
|
706 |
-
# 5. 建立上下文和 Prompt
|
707 |
-
context = build_context(idxs, STATE.sentences, STATE.meta)
|
708 |
-
prompt = build_prompt(parsed_info, context, drug_choice)
|
709 |
-
log.info("Generated Prompt:\n%s", prompt)
|
710 |
-
|
711 |
-
# 6. 呼叫 LLM 生成答案
|
712 |
-
answer = call_llm(prompt)
|
713 |
-
if not answer:
|
714 |
-
return handle_error("LLM_ERROR")
|
715 |
-
|
716 |
-
return f"{answer}\n\n{DISCLAIMER}"
|
717 |
-
|
718 |
# ---------- LINE 驗簽與回覆 ----------
|
719 |
def verify_line_signature(body_bytes: bytes, signature: str) -> bool:
|
720 |
if not CHANNEL_SECRET:
|
@@ -801,4 +718,4 @@ async def health():
|
|
801 |
if __name__ == "__main__":
|
802 |
import uvicorn
|
803 |
port = int(os.getenv("PORT", "7860"))
|
804 |
-
uvicorn.run("app:app", host="0.0.0.0", port=port, log_level=LOG_LEVEL.lower(), reload=False)
|
|
|
92 |
MAX_CONTEXT_CHARS = 8000
|
93 |
DISCLAIMER = "此回覆僅供參考,請遵循醫師/藥師指示。"
|
94 |
|
95 |
+
# 藥名映射與停用詞
|
96 |
DRUG_NAME_MAPPING = {
|
97 |
"fentanyl patch": "fentanyl",
|
98 |
"spiriva respimat": "spiriva",
|
|
|
108 |
"芬太尼貼片": "fentanyl",
|
109 |
"透皮止痛貼片": "fentanyl",
|
110 |
}
|
111 |
+
|
112 |
DRUG_STOPWORDS = {"藥", "劑", "錠", "膠囊", "糖漿", "乳膏", "貼片", "含錠", "膜衣錠", "緩釋錠", "滴劑", "懸液", "注射液",
|
113 |
"吸入劑", "噴霧", "噴霧劑", "吸入器", "注射筆", "藥水", "小袋", "條", "包", "瓶", "外用", "口服"}
|
114 |
|
|
|
147 |
}
|
148 |
|
149 |
IMPORTANT_SECTIONS = ["用法及用量", "病人使用須知", "包裝及儲存", "不良反應", "警語及注意事項"]
|
150 |
+
# 移除 DOSAGE_FORM_BOOST
|
151 |
|
152 |
# ---------- 路徑工具 ----------
|
153 |
def pick_existing_or_tmp(candidates: List[str]) -> str:
|
|
|
322 |
try:
|
323 |
with open(pkl_path, "rb") as f:
|
324 |
bm = pickle.load(f)
|
325 |
+
# BM25 has corpus, not corpus_size attribute
|
326 |
n_bm = len(bm.corpus) if hasattr(bm, 'corpus') else 0
|
327 |
if n_bm == len(sentences):
|
328 |
log.info("Loaded BM25: %s (n=%d)", pkl_path, n_bm)
|
|
|
337 |
safe_pickle_dump(bm, pkl_path)
|
338 |
return bm
|
339 |
|
340 |
+
# ---------- 資訊解析與藥名處理 (簡化) ----------
|
341 |
|
342 |
+
# 1. parse_user_message: 簡化為只比對藥名
|
343 |
+
def parse_user_message(query: str, df: pd.DataFrame) -> Dict[str, Any]:
    """Resolve the drug a user message refers to by matching ``drug_name_norm``.

    An exact, case-insensitive match of the whole message against
    ``drug_name_norm`` wins outright. Otherwise every distinct drug is scored
    with ``fuzz.token_set_ratio`` and the best one is accepted only when it
    scores at least 80.

    Args:
        query: Raw user message.
        df: Drug table; this function reads its ``drug_id`` and
            ``drug_name_norm`` columns.

    Returns:
        A dict with the keys ``drug_name``, ``drug_id`` and ``question`` (the
        unmodified ``query``). ``drug_name`` and ``drug_id`` are ``None`` when
        the table is missing or empty, the fuzzy matcher is unavailable, or no
        match reaches the threshold.
    """
    no_match = {
        "drug_name": None,
        "drug_id": None,
        "question": query,
    }

    # Nothing to match against: report "no match" instead of raising.
    if df is None or df.empty:
        return no_match

    if not fuzz:
        log.warning("fuzzywuzzy not available; skipping fuzzy match.")
        return no_match

    best_drug = None
    best_row = None
    max_score = 0

    # One row per drug. NOTE: recomputed on every call; precomputing this at
    # load time would avoid repeated work.
    unique_drugs = df.drop_duplicates(subset=['drug_id'])

    # Check for a direct match first.
    query_lower = query.lower().strip()
    direct_match = unique_drugs[unique_drugs['drug_name_norm'].str.lower() == query_lower]
    if not direct_match.empty:
        best_row = direct_match.iloc[0]
        best_drug = best_row["drug_name_norm"]
        # An exact match is the strongest possible evidence. Without this the
        # score stays at 0 and the threshold check below would discard it.
        max_score = 100
        log.info("Direct match found: %s", best_drug)
    else:
        for _, row in unique_drugs.iterrows():
            name = row.get('drug_name_norm')
            # Missing names show up as NaN (a truthy float), so test the type
            # rather than relying on `or ""`.
            if not isinstance(name, str) or not name:
                continue
            drug_norm = name.lower()
            score = fuzz.token_set_ratio(query_lower, drug_norm)

            if score > max_score:
                max_score = score
                best_drug = drug_norm
                best_row = row

    # Threshold guards against unrelated drugs being picked up.
    if best_drug is None or max_score < 80:
        log.warning("No confident drug match found (score: %s)", max_score)
        return no_match

    log.info("Parsed user message (best match): %s, score: %s", best_drug, max_score)

    return {
        "drug_name": best_drug,
        "drug_id": best_row["drug_id"],
        "question": query,
    }
|
395 |
|
396 |
+
# 2. find_drug_candidates: 簡化為單純 fuzzy 比對
|
397 |
+
def find_drug_candidates(parsed_info: Dict[str, Any], df: pd.DataFrame, top_k: int = 5) -> List[Dict[str, Any]]:
    """Rank drugs by fuzzy similarity between the question and ``drug_name_norm``.

    Args:
        parsed_info: Dict whose ``question`` entry holds the user's text.
        df: Drug table; this function reads its ``drug_id`` and
            ``drug_name_norm`` columns.
        top_k: Maximum number of candidates to return.

    Returns:
        Up to ``top_k`` dicts with the keys ``drug_id``, ``drug_name``
        (lower-cased) and ``score``, sorted by descending score. Returns an
        empty list when the table is missing or empty, the question is
        missing or empty, or the fuzzy matcher is unavailable.
    """
    if df is None or df.empty:
        return []

    # `question` may be present but None; treat that like an empty question.
    query_text = (parsed_info.get("question") or "").lower()
    if not query_text:
        return []

    if not fuzz:
        return []

    candidates_list = []
    unique_drugs = df.drop_duplicates(subset=['drug_id'])

    for _, row in unique_drugs.iterrows():
        name = row.get('drug_name_norm')
        # Missing names show up as NaN (a truthy float); normalise them to "".
        drug_norm = name.lower() if isinstance(name, str) else ""
        candidates_list.append({
            "drug_id": row["drug_id"],
            "drug_name": drug_norm,
            "score": fuzz.token_set_ratio(query_text, drug_norm),
        })

    # Sort by score and keep the best top_k.
    sorted_candidates = sorted(candidates_list, key=lambda x: x['score'], reverse=True)
    top_candidates = sorted_candidates[:top_k]

    log.info("Found drug candidates: %s", top_candidates)
    return top_candidates
|
|
|
426 |
|
427 |
+
# 3. answer_pipeline: 簡化流程
|
428 |
+
async def answer_pipeline(query: str, user_id: str) -> str:
    """Answer a user's drug question, or ask for clarification.

    Steps: validate the input, resolve the drug with ``parse_user_message``,
    build a ranked candidate list with ``find_drug_candidates``, retrieve
    passages with ``fuse_and_select``, build the prompt and call the LLM.

    Args:
        query: Raw user message.
        user_id: Identifier of the requesting user; only used for logging.

    Returns:
        The reply text: the LLM answer followed by ``DISCLAIMER``, a
        clarification request, or the message produced by ``handle_error``.
    """
    # Validate before touching `query`; slicing a non-string for the log line
    # would raise before the check could reject it.
    if not query or not isinstance(query, str):
        return handle_error("INVALID_QUERY")
    log.info("Pipeline start for user_id: %s, query: %s", user_id, query[:50])

    # A DataFrame has no unambiguous truth value (`not df` raises ValueError),
    # so test for None / empty explicitly.
    df_csv = STATE.df_csv
    if not STATE.sentences or df_csv is None or df_csv.empty:
        return handle_error("NO_CORPUS")

    # 1. Parse the user input and find the best-matching drug.
    best_drug_info = parse_user_message(query, df_csv)
    # Compare against None so a legitimate falsy id (e.g. 0) is not rejected.
    if best_drug_info.get("drug_id") is None:
        log.warning("No confident drug match found.")
        return make_clarify_message()

    # 2. Build the ranked candidate list.
    drug_candidates = find_drug_candidates(best_drug_info, df_csv)
    if not drug_candidates:
        # Nothing to offer as options; fall back to the generic request
        # instead of sending a clarification prompt with an empty list.
        log.warning("No drug candidates available for clarification.")
        return make_clarify_message()

    # 3. Accept the best drug when its score is >= 95 or it leads the
    #    runner-up by more than 10 points; otherwise ask the user to choose.
    top_score = drug_candidates[0]['score']
    second_score = drug_candidates[1]['score'] if len(drug_candidates) > 1 else 0

    if not (top_score >= 95 or (top_score - second_score) > 10):
        log.info("Scores are too close, requesting clarification.")
        options = [f"「{c.get('drug_name')}」" for c in drug_candidates[:3]]
        return f"請問您指的是以下哪一種藥物?\n- " + "\n- ".join(options) + f"\n\n{DISCLAIMER}"

    log.info("Confidently selected drug: %s", best_drug_info['drug_name'])
    drug_choice = best_drug_info

    # 4. Retrieve the relevant passages.
    idxs = fuse_and_select(
        query=best_drug_info["question"],
        sentences=STATE.sentences,
        meta=STATE.meta,
        bm25=STATE.bm25,
        index=STATE.faiss_index,
        emb_model=STATE.emb_model,
        reranker=STATE.reranker_model,
        top_k=TOP_K_SENTENCES,
        drug_id=drug_choice['drug_id'],
    )
    if not idxs:
        return handle_error("NO_CONTEXT")

    # 5. Build the context and the prompt.
    context = build_context(idxs, STATE.sentences, STATE.meta)
    prompt = build_prompt(best_drug_info, context, drug_choice)
    log.info("Generated Prompt:\n%s", prompt)

    # 6. Ask the LLM for the answer.
    # NOTE(review): call_llm is a plain synchronous function, so this call
    # blocks the event loop while the request is in flight.
    answer = call_llm(prompt)
    if not answer:
        return handle_error("LLM_ERROR")

    return f"{answer}\n\n{DISCLAIMER}"
|
|
|
485 |
|
486 |
+
# 4. build_prompt: 簡化提示詞
|
487 |
+
def build_prompt(parsed_info: Dict[str, Any], contexts: str, drug_choice: Dict[str, Any]) -> str:
    """Assemble the LLM prompt from the drug name, the question and the passages.

    Args:
        parsed_info: Dict whose ``question`` entry is the user's question;
            its ``drug_name`` entry is used as a fallback drug name.
        contexts: Reference passages, already formatted as text.
        drug_choice: Dict whose ``drug_name`` entry names the selected drug.

    Returns:
        The full prompt string: instructions, constraints, the input fields
        and a trailing answer marker.
    """
    # Avoid leaking the literal text "None" into the prompt when the selected
    # drug carries no name: fall back to the parsed name, then a placeholder.
    drug_name = drug_choice.get('drug_name') or parsed_info.get('drug_name') or "未提供"

    return (
        "你是一位專業、有同理心的藥師。請根據提供的「參考片段」,簡潔地回答使用者的「問題」。\n"
        "---限制---\n"
        "- 絕對忠於「參考片段」,不可捏造或過度推論。你的知識僅限於提供的片段。\n"
        "- 回覆少於 120 字,並使用繁體中文條列式 2-4 點說明。\n"
        "- 語氣親切、精簡、專業。\n"
        "- 若片段中無足夠資訊回答,必須回覆:「根據提供的資料,我無法找到關於您問題的明確答案,建議您諮詢醫師或藥師。」\n"
        "---輸入資訊---\n"
        f"藥物名稱: {drug_name}\n"
        f"問題: {parsed_info.get('question')}\n\n"
        f"參考片段:\n{contexts}\n"
        "---你的回答---"
    )
|
504 |
|
505 |
+
def call_llm(prompt: str, max_tokens: int = 2048) -> Optional[str]:
    """Send ``prompt`` as a single user message to the configured chat model.

    Args:
        prompt: Full prompt text.
        max_tokens: Completion token limit passed to the API.

    Returns:
        The stripped completion text (possibly empty), or ``None`` when the
        ``openai`` package cannot be imported, the LLM settings are
        incomplete, or the request fails.
    """
    try:
        from openai import OpenAI
    except Exception as import_err:
        log.warning("openai client 不可用:%s", import_err)
        return None

    # All three settings are required before a request is attempted.
    if not LITELLM_API_KEY or not LM_MODEL or not LITELLM_BASE_URL:
        log.warning("LLM 未完整設定;略過生成。")
        return None

    llm_client = OpenAI(base_url=LITELLM_BASE_URL, api_key=LITELLM_API_KEY)
    try:
        started = time.time()
        response = llm_client.chat.completions.create(
            model=LM_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            timeout=15,
            max_tokens=max_tokens,
        )
        log.info("LLM ok (%.2fs)", time.time() - started)
        # Stays inside the try so a malformed response is reported as a
        # failure rather than raised to the caller.
        content = response.choices[0].message.content
        return (content or "").strip()
    except Exception as call_err:
        log.warning("LLM 失敗:%s", call_err)
        return None
|
530 |
+
|
531 |
+
def make_clarify_message() -> str:
    """Return the generic request for a full drug name and a specific question.

    The message ends with ``DISCLAIMER``.
    """
    request_lines = (
        "我需要更多資訊才能準確回答,請您提供:",
        "1. 完整的藥物名稱",
        "2. 您的具體問題",
    )
    # A blank line separates the request from the disclaimer.
    return "\n".join(request_lines) + "\n\n" + f"{DISCLAIMER}"
|
539 |
|
540 |
+
def handle_error(code: str) -> str:
    """Log a pipeline error code and return the user-facing apology message.

    Args:
        code: Short error identifier; it is embedded in both the log line and
            the reply.

    Returns:
        The apology text containing ``code``, followed by ``DISCLAIMER``.
    """
    log.error("Pipeline error: %s", code)
    apology = "抱歉,系統暫時無法回覆 ({})。請諮詢醫師或藥師。{}"
    return apology.format(code, DISCLAIMER)
|
543 |
+
|
544 |
+
# 5. fuse_and_select: 移除劑型加權
|
545 |
+
def fuse_and_select(query: str, sentences: List[str], meta: List[Dict[str, Any]], bm25: Optional[Any], index: Optional[Any], emb_model: Optional[Any], reranker: Optional[Any], top_k: int = 10, drug_id: str = None) -> List[int]:
|
546 |
+
"""
|
547 |
+
MODIFIED: 移除劑型加權。只保留 BM25/FAISS 融合 + 章節加權 + 意圖加權。
|
548 |
+
"""
|
549 |
+
clean_query = query.strip().lower()
|
550 |
cache_key = clean_query + str(drug_id)
|
551 |
if cache_key in STATE.query_cache and time.time() - STATE.query_cache[cache_key]['time'] < 180:
|
552 |
log.info("Cache hit for query: %s", clean_query[:50])
|
|
|
581 |
scores[i] = scores.get(i, 0.0) + SEM_WEIGHT * (1.0 / (1 + rank))
|
582 |
|
583 |
# Apply boosts
|
|
|
584 |
for i in list(scores.keys()): # Iterate over a copy of keys
|
585 |
meta_item = meta[i]
|
586 |
|
587 |
# Section weight boost
|
588 |
sec = meta_item.get("section", "其他")
|
589 |
scores[i] *= SECTION_WEIGHTS.get(sec, 1.0)
|
|
|
|
|
|
|
|
|
590 |
|
591 |
+
# Boost based on detected intent
|
592 |
detected_intents = detect_intent(clean_query)
|
593 |
for i in list(scores.keys()):
|
594 |
meta_item = meta[i]
|
|
|
617 |
STATE.query_cache[cache_key] = {'idxs': idxs, 'time': time.time()}
|
618 |
return idxs
|
619 |
|
|
|
620 |
def build_context(idxs: List[int], sentences: List[str], meta: List[Dict[str, Any]]) -> str:
|
621 |
ctx_lines, total_len, seen = [], 0, set()
|
622 |
for i in idxs:
|
|
|
625 |
if text in seen: continue
|
626 |
chunk_id = meta[i].get("chunk_id", "None")
|
627 |
section = meta[i].get("section", "未知章節")
|
628 |
+
line = f"[{section}]: {text}"
|
629 |
if total_len + len(line) > MAX_CONTEXT_CHARS: break
|
630 |
ctx_lines.append(line)
|
631 |
total_len += len(line) + 1
|
632 |
seen.add(text)
|
633 |
return "\n".join(ctx_lines) or "[未知章節]: 沒有找到相關資料,請諮詢醫師或藥師。"
|
634 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
635 |
# ---------- LINE 驗簽與回覆 ----------
|
636 |
def verify_line_signature(body_bytes: bytes, signature: str) -> bool:
|
637 |
if not CHANNEL_SECRET:
|
|
|
718 |
if __name__ == "__main__":
|
719 |
import uvicorn
|
720 |
port = int(os.getenv("PORT", "7860"))
|
721 |
+
uvicorn.run("app:app", host="0.0.0.0", port=port, log_level=LOG_LEVEL.lower(), reload=False)
|