Spaces:

pcreem
/

brown-cafe

Sleeping

App Files Files Community

Song committed 25 days ago

Commit

ee31585

1 Parent(s): 5f83bd3

hi

Browse files

Files changed (1) hide show

app.py +75 -33

app.py CHANGED Viewed

@@ -59,8 +59,6 @@ def _require_llm_config():
     for k in ("LITELLM_BASE_URL", "LITELLM_API_KEY", "LM_MODEL"):
         _require_env(k)
-_require_llm_config()
 CSV_PATH = os.getenv("CSV_PATH", "cleaned_combined.csv")
 FAISS_INDEX = os.getenv("FAISS_INDEX", "drug_sentences.index")
 SENTENCES_PKL = os.getenv("SENTENCES_PKL", "drug_sentences.pkl")
@@ -192,10 +190,19 @@ class RagPipeline:
                 self.df_csv['drug_name_norm'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()
             )
             self.drug_name_to_ids = self.df_csv.groupby('drug_name_norm_normalized')['drug_id'].unique().apply(list).to_dict()
             self._load_drug_name_vocabulary()
             log.info("載入 FAISS 索引與句子資料...")
             self.state.index = faiss.read_index(FAISS_INDEX)
             with open(SENTENCES_PKL, "rb") as f:
                 data = pickle.load(f)
                 self.state.sentences = data["sentences"]
@@ -204,6 +211,8 @@ class RagPipeline:
             log.info("載入 BM25 索引...")
             with open(BM25_PKL, "rb") as f:
                 self.state.bm25 = pickle.load(f)
         except (FileNotFoundError, KeyError) as e:
             log.exception(f"資料或索引檔案載入失敗: {e}")
             raise RuntimeError(f"資料初始化失敗，請檢查檔案路徑與內容: {e}")
@@ -217,10 +226,19 @@ class RagPipeline:
             for part in parts:
                 if re.search(r'[\u4e00-\u9fff]', part):
                     self.drug_vocab["zh"].add(part)
                 else:
                     self.drug_vocab["en"].add(part)
         for alias in DRUG_NAME_MAPPING:
             self.drug_vocab["en"].add(alias.lower())
     @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
     def _llm_call(self, messages, **kwargs) -> str:
@@ -273,9 +291,20 @@ class RagPipeline:
     @lru_cache(maxsize=128)
     def _find_drug_ids_from_name(self, query: str) -> List[str]:
-        candidates = extract_drug_candidates_from_query(query.lower(), self.drug_vocab)
         drug_ids = set()
         for alias in candidates:
             # [MODIFIED] 英文藥名比對使用詞邊界，避免子字串誤判
             is_english = not re.search(r'[\u4e00-\u9fff]', alias)
@@ -291,7 +320,6 @@ class RagPipeline:
                     drug_ids.update(ids)
         return list(drug_ids)
     def _analyze_query(self, query: str) -> Dict[str, Any]:
         prompt = PROMPT_TEMPLATES["analyze_query"].format(
             options="\n".join(f"- {c}" for c in INTENT_CATEGORIES),
@@ -311,25 +339,31 @@ class RagPipeline:
             expanded_q = self._expand_query_with_llm(sub_q, tuple(intents))
             q_emb = self.embedding_model.encode([expanded_q], convert_to_numpy=True).astype("float32")
-            faiss.normalize_L2(q_emb)
             distances, sim_indices = self.state.index.search(q_emb, PRE_RERANK_K)
             tokenized_query = list(jieba.cut(expanded_q))
-            # [MODIFIED] 改為獲取真實 BM25 分數，而非使用排名
             bm25_scores = self.state.bm25.get_scores(tokenized_query)
-            top_bm25_indices = np.argsort(bm25_scores)[::-1][:PRE_RERANK_K]
-            doc_to_bm25_score = {int(i): float(bm25_scores[i]) for i in top_bm25_indices}
             candidate_scores: Dict[int, Dict[str, float]] = {}
-            # [MODIFIED] FAISS L2 距離轉為相似度 (分數越高越好)
-            def dist_to_sim(d: float) -> float:
-                return 1.0 / (1.0 + d)
             for i, dist in zip(sim_indices[0], distances[0]):
                 if i in relevant_indices:
-                    similarity = dist_to_sim(dist)
                     candidate_scores[int(i)] = {"sem": float(similarity), "bm": 0.0}
             for i, score in doc_to_bm25_score.items():
@@ -408,16 +442,14 @@ class RagPipeline:
         )
     # [MODIFIED] 增強 JSON 解析的穩健性，從字串中提取 JSON 物件
-    def _safe_json_parse(self, json_str: str, default: Any = None) -> Any:
-        # Find the JSON object within the string
-        match = re.search(r'\{.*\}', json_str, re.DOTALL)
-        if match:
-            json_str = match.group(0)
         try:
-            return json.loads(json_str)
-        except json.JSONDecodeError:
-            log.warning(f"無法解析 LLM 回傳的 JSON: {json_str}")
             return default
 # ---------- FastAPI 事件與路由 ----------
@@ -432,6 +464,7 @@ class AppConfig:
 @app.on_event("startup")
 async def startup_event():
     global rag_pipeline
     rag_pipeline = RagPipeline(AppConfig)
     rag_pipeline.load_data()
     log.info("啟動完成，服務準備就緒。")
@@ -457,35 +490,41 @@ async def handle_webhook(request: Request, background_tasks: BackgroundTasks):
     if not hmac.compare_digest(expected_signature, signature):
         raise HTTPException(status_code=403, detail="Invalid signature")
-    data = json.loads(body.decode('utf-8'))
     for event in data.get("events", []):
         if event.get("type") == "message" and event.get("message", {}).get("type") == "text":
             reply_token = event.get("replyToken")
             user_text = event.get("message", {}).get("text", "").strip()
-            # [MODIFIED] 安全地獲取 userId，應對群組/聊天室中可能不存在的情況
             source = event.get("source", {})
-            user_id = source.get("userId")
-            if reply_token and user_id and user_text:
                 # [MODIFIED] 更改回覆策略：立即回覆處理中訊息，避免 replyToken 逾時
                 line_reply(reply_token, "收到您的問題，正在查詢資料庫，請稍候...")
                 # 將耗時的任務交給背景處理，使用 push message 回覆最終答案
-                background_tasks.add_task(process_user_query, user_id, user_text)
     return Response(status_code=status.HTTP_200_OK)
 # [MODIFIED] 調整函式簽名，只接收 user_id 和 text，並使用 push message
-def process_user_query(user_id: str, user_text: str):
     try:
         if rag_pipeline:
             answer = rag_pipeline.answer_question(user_text)
         else:
             answer = "系統正在啟動中，請稍後再試。"
-        line_push(user_id, answer)
     except Exception as e:
-        log.error(f"背景處理 user_id={user_id} 發生錯誤: {e}", exc_info=True)
-        line_push(user_id, f"抱歉，處理時發生未預期的錯誤。{DISCLAIMER}")
 def line_api_call(endpoint: str, data: Dict):
     headers = {
         "Content-Type": "application/json",
@@ -496,14 +535,17 @@ def line_api_call(endpoint: str, data: Dict):
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
         log.error(f"LINE API ({endpoint}) 呼叫失敗: {e} | Response: {e.response.text if e.response else 'N/A'}")
 def line_reply(reply_token: str, text: str):
     messages = [{"type": "text", "text": chunk} for chunk in textwrap.wrap(text, 4800, replace_whitespace=False)[:5]]
     line_api_call("reply", {"replyToken": reply_token, "messages": messages})
-def line_push(user_id: str, text: str):
     messages = [{"type": "text", "text": chunk} for chunk in textwrap.wrap(text, 4800, replace_whitespace=False)[:5]]
-    line_api_call("push", {"to": user_id, "messages": messages})
 # [MODIFIED] 改善藥名提取的正則表達式
 def extract_drug_candidates_from_query(query: str, drug_vocab: dict) -> list:

     for k in ("LITELLM_BASE_URL", "LITELLM_API_KEY", "LM_MODEL"):
         _require_env(k)
 CSV_PATH = os.getenv("CSV_PATH", "cleaned_combined.csv")
 FAISS_INDEX = os.getenv("FAISS_INDEX", "drug_sentences.index")
 SENTENCES_PKL = os.getenv("SENTENCES_PKL", "drug_sentences.pkl")
                 self.df_csv['drug_name_norm'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()
             )
             self.drug_name_to_ids = self.df_csv.groupby('drug_name_norm_normalized')['drug_id'].unique().apply(list).to_dict()
+            # [MODIFIED] 把別名也變成可查鍵
+            for alias, canonical in DRUG_NAME_MAPPING.items():
+                alias_key = re.sub(r'[^\w\s]', '', alias.lower()).strip()
+                canonical_key = re.sub(r'[^\w\s]', '', canonical.lower()).strip()
+                if canonical_key in self.drug_name_to_ids:
+                    self.drug_name_to_ids[alias_key] = self.drug_name_to_ids[canonical_key]
             self._load_drug_name_vocabulary()
             log.info("載入 FAISS 索引與句子資料...")
             self.state.index = faiss.read_index(FAISS_INDEX)
+            self.state.faiss_metric = getattr(self.state.index, "metric_type", faiss.METRIC_L2)
+            if hasattr(self.state.index, "nprobe"):
+                self.state.index.nprobe = int(os.getenv("FAISS_NPROBE", "16"))
             with open(SENTENCES_PKL, "rb") as f:
                 data = pickle.load(f)
                 self.state.sentences = data["sentences"]
             log.info("載入 BM25 索引...")
             with open(BM25_PKL, "rb") as f:
                 self.state.bm25 = pickle.load(f)
+            if not isinstance(self.state.bm25, BM25Okapi):
+                raise ValueError("Loaded BM25 is not a BM25Okapi instance.")
         except (FileNotFoundError, KeyError) as e:
             log.exception(f"資料或索引檔案載入失敗: {e}")
             raise RuntimeError(f"資料初始化失敗，請檢查檔案路徑與內容: {e}")
             for part in parts:
                 if re.search(r'[\u4e00-\u9fff]', part):
                     self.drug_vocab["zh"].add(part)
+                    try:
+                        jieba.add_word(part, freq=2_000_000)
+                    except Exception:
+                        pass
                 else:
                     self.drug_vocab["en"].add(part)
         for alias in DRUG_NAME_MAPPING:
             self.drug_vocab["en"].add(alias.lower())
+            if re.search(r'[\u4e00-\u9fff]', alias):
+                try:
+                    jieba.add_word(alias, freq=2_000_000)
+                except Exception:
+                    pass
     @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
     def _llm_call(self, messages, **kwargs) -> str:
     @lru_cache(maxsize=128)
     def _find_drug_ids_from_name(self, query: str) -> List[str]:
+        q = query.lower()
+        candidates = extract_drug_candidates_from_query(q, self.drug_vocab)
         drug_ids = set()
+        # 英文：詞邊界；中文：也做子字串掃描
+        for k, ids in self.drug_name_to_ids.items():
+            if re.search(r'[\u4e00-\u9fff]', k):
+                if k in q:
+                    drug_ids.update(ids)
+            else:
+                if re.search(rf"\b{re.escape(k)}\b", q):
+                    drug_ids.update(ids)
+        # 仍保留舊的候選詞路徑（補強）
         for alias in candidates:
             # [MODIFIED] 英文藥名比對使用詞邊界，避免子字串誤判
             is_english = not re.search(r'[\u4e00-\u9fff]', alias)
                     drug_ids.update(ids)
         return list(drug_ids)
     def _analyze_query(self, query: str) -> Dict[str, Any]:
         prompt = PROMPT_TEMPLATES["analyze_query"].format(
             options="\n".join(f"- {c}" for c in INTENT_CATEGORIES),
             expanded_q = self._expand_query_with_llm(sub_q, tuple(intents))
             q_emb = self.embedding_model.encode([expanded_q], convert_to_numpy=True).astype("float32")
+            if self.state.faiss_metric == faiss.METRIC_INNER_PRODUCT:
+                faiss.normalize_L2(q_emb)
             distances, sim_indices = self.state.index.search(q_emb, PRE_RERANK_K)
             tokenized_query = list(jieba.cut(expanded_q))
+            # [MODIFIED] 先過濾 relevant_indices 再取 TopK
             bm25_scores = self.state.bm25.get_scores(tokenized_query)
+            rel_idx = np.fromiter(relevant_indices, dtype=int)
+            rel_scores = bm25_scores[rel_idx]
+            top_rel = rel_idx[np.argsort(rel_scores)[::-1][:PRE_RERANK_K]]
+            doc_to_bm25_score = {int(i): float(bm25_scores[i]) for i in top_rel}
             candidate_scores: Dict[int, Dict[str, float]] = {}
+            # [MODIFIED] 把 distance 轉成「越大越好的相似度」
+            def to_similarity(d: float) -> float:
+                if self.state.faiss_metric == faiss.METRIC_INNER_PRODUCT:
+                    return float(d)                   # IP 越大越好
+                else:  # METRIC_L2（多半是平方 L2）
+                    return 1.0 / (1.0 + float(d))
             for i, dist in zip(sim_indices[0], distances[0]):
                 if i in relevant_indices:
+                    similarity = to_similarity(dist)
                     candidate_scores[int(i)] = {"sem": float(similarity), "bm": 0.0}
             for i, score in doc_to_bm25_score.items():
         )
     # [MODIFIED] 增強 JSON 解析的穩健性，從字串中提取 JSON 物件
+    def _safe_json_parse(self, s: str, default: Any = None) -> Any:
+        m = re.search(r'\{.*?\}', s, re.DOTALL)  # 非貪婪
+        if m:
+            s = m.group(0)
         try:
+            return json.loads(s)
+        except Exception:
+            log.warning(f"無法解析 LLM 回傳的 JSON: {s[:200]}...")
             return default
 # ---------- FastAPI 事件與路由 ----------
 @app.on_event("startup")
 async def startup_event():
     global rag_pipeline
+    _require_llm_config()
     rag_pipeline = RagPipeline(AppConfig)
     rag_pipeline.load_data()
     log.info("啟動完成，服務準備就緒。")
     if not hmac.compare_digest(expected_signature, signature):
         raise HTTPException(status_code=403, detail="Invalid signature")
+    try:
+        data = json.loads(body.decode('utf-8'))
+    except json.JSONDecodeError:
+        raise HTTPException(status_code=400, detail="Invalid JSON body")
     for event in data.get("events", []):
         if event.get("type") == "message" and event.get("message", {}).get("type") == "text":
             reply_token = event.get("replyToken")
             user_text = event.get("message", {}).get("text", "").strip()
+            # [MODIFIED] 擷取 target
             source = event.get("source", {})
+            stype = source.get("type")  # "user" | "group" | "room"
+            target_id = source.get("userId") or source.get("groupId") or source.get("roomId")
+            if reply_token and user_text and target_id:
                 # [MODIFIED] 更改回覆策略：立即回覆處理中訊息，避免 replyToken 逾時
                 line_reply(reply_token, "收到您的問題，正在查詢資料庫，請稍候...")
                 # 將耗時的任務交給背景處理，使用 push message 回覆最終答案
+                background_tasks.add_task(process_user_query, stype, target_id, user_text)
     return Response(status_code=status.HTTP_200_OK)
 # [MODIFIED] Signature takes the source type, the push target id (userId, groupId or roomId) and the text; the final answer is sent with a push message
+def process_user_query(source_type: str, target_id: str, user_text: str):
     try:
         if rag_pipeline:
             answer = rag_pipeline.answer_question(user_text)
         else:
             answer = "系統正在啟動中，請稍後再試。"
+        line_push_generic(source_type, target_id, answer)
     except Exception as e:
+        log.error(f"背景處理 target_id={target_id} 發生錯誤: {e}", exc_info=True)
+        line_push_generic(source_type, target_id, f"抱歉，處理時發生未預期的錯誤。{DISCLAIMER}")
+@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
 def line_api_call(endpoint: str, data: Dict):
     headers = {
         "Content-Type": "application/json",
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
         log.error(f"LINE API ({endpoint}) 呼叫失敗: {e} | Response: {e.response.text if e.response else 'N/A'}")
+        raise
 def line_reply(reply_token: str, text: str):
     messages = [{"type": "text", "text": chunk} for chunk in textwrap.wrap(text, 4800, replace_whitespace=False)[:5]]
     line_api_call("reply", {"replyToken": reply_token, "messages": messages})
+def line_push_generic(source_type: str, target_id: str, text: str):
     messages = [{"type": "text", "text": chunk} for chunk in textwrap.wrap(text, 4800, replace_whitespace=False)[:5]]
+    endpoint = "push"
+    data = {"to": target_id, "messages": messages}
+    line_api_call(endpoint, data)
 # [MODIFIED] 改善藥名提取的正則表達式
 def extract_drug_candidates_from_query(query: str, drug_vocab: dict) -> list: