Commit 0f8b8aa · Parent(s): da1563e
Song committed: hi
app.py CHANGED
@@ -25,6 +25,7 @@ import json
 from typing import List, Dict, Any, Optional, Tuple, Union
 from functools import lru_cache
 import time
+import textwrap
 
 # ---------- Third-party libraries ----------
 import numpy as np
@@ -39,6 +40,7 @@ import torch
 from openai import OpenAI
 from tenacity import retry, stop_after_attempt, wait_fixed
 import requests
+from starlette.concurrency import run_in_threadpool
 
 # ==== CONFIG (loaded from environment variables, with defaults) ====
 # Default paths point to the current directory, per the provided file list
@@ -46,7 +48,6 @@ CSV_PATH = os.getenv("CSV_PATH", "cleaned_combined.csv")
 FAISS_INDEX = os.getenv("FAISS_INDEX", "drug_sentences.index")
 SENTENCES_PKL = os.getenv("SENTENCES_PKL", "drug_sentences.pkl")
 BM25_PKL = os.getenv("BM25_PKL", "bm25.pkl")
-META_PKL = "/tmp/drug_meta.pkl"  # No longer needed, but kept to avoid load-time errors
 
 TOP_K_SENTENCES = int(os.getenv("TOP_K_SENTENCES", 30))
 PRE_RERANK_K = int(os.getenv("PRE_RERANK_K", 30))
@@ -124,7 +125,6 @@ class RagPipeline:
         self.embedding_model = self._load_embedding_model()
         self.reranker = self._load_reranker_model()
         self.csv_path = self._ensure_csv_path(CSV_PATH)
-        self.app = FastAPI()
 
     def _load_embedding_model(self):
         device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -156,18 +156,31 @@ class RagPipeline:
     def _load_data(self):
         """Load all required models and data at startup"""
         log.info("Starting to load data and models...")
-        # Load CSV
-        if os.path.exists(self.csv_path):
-            self.df_csv = pd.read_csv(self.csv_path, dtype=str).fillna('')
-            self.df_csv['drug_name_norm_normalized'] = self.df_csv['drug_name_norm'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()
-            log.info(f"Loaded CSV: {self.csv_path} (rows={len(self.df_csv)})")
-        else:
-            log.error(f"Error: CSV file not found at {self.csv_path}")
-            self.df_csv = None
 
-        #
+        # Load CSV and check for required columns
+        if not os.path.exists(self.csv_path):
+            raise FileNotFoundError(f"CSV file not found at {self.csv_path}")
+
+        self.df_csv = pd.read_csv(self.csv_path, dtype=str).fillna('')
+        required_cols = {"drug_id", "drug_name_norm", "section"}
+        missing_cols = required_cols - set(self.df_csv.columns)
+        if missing_cols:
+            raise ValueError(f"CSV is missing required columns: {missing_cols}")
+
+        self.df_csv['drug_name_norm_normalized'] = (
+            self.df_csv['drug_name_norm'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()
+        )
+        log.info(f"Loaded CSV: {self.csv_path} (rows={len(self.df_csv)})")
+
+        # Load corpus and index
         self.state.index, self.state.sentences, self.state.meta = self._load_or_build_sentence_index()
         self.state.bm25 = self._ensure_bm25_index()
+
+        # Check BM25 alignment
+        bm_n = len(self.state.bm25.corpus)
+        sent_n = len(self.state.sentences)
+        if bm_n != sent_n:
+            raise RuntimeError(f"BM25 doc count ({bm_n}) does not match sentences ({sent_n}); regenerate the index.")
         log.info("All models and data loaded.")
 
     def _load_or_build_sentence_index(self):
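The rewritten _load_data fails fast instead of continuing with df_csv = None. The sketch below isolates the two invariants it now enforces: the CSV must carry the columns the pipeline reads, and the BM25 corpus must line up one-to-one with the sentence list, because BM25 scores are positional. The column names follow this diff; everything else is illustrative.

import pandas as pd

REQUIRED_COLS = {"drug_id", "drug_name_norm", "section"}

def load_csv_strict(path: str) -> pd.DataFrame:
    # Reject a CSV that cannot feed the retrieval pipeline
    df = pd.read_csv(path, dtype=str).fillna("")
    missing = REQUIRED_COLS - set(df.columns)
    if missing:
        raise ValueError(f"CSV is missing required columns: {missing}")
    return df

def check_alignment(bm25_doc_count: int, sentence_count: int) -> None:
    # get_scores()[i] must refer to sentences[i], so lengths must match
    if bm25_doc_count != sentence_count:
        raise RuntimeError(f"BM25 docs ({bm25_doc_count}) != sentences ({sentence_count})")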
@@ -181,7 +194,7 @@ class RagPipeline:
             return index, sentences, meta
 
         log.info("Index files not found; rebuilding from CSV...")
-        #
+        # Index building should be done by a separate script, not here.
         raise RuntimeError("FAISS and sentence PKL files not found; run the index generation script first.")
 
     def _ensure_bm25_index(self):
@@ -190,11 +203,10 @@ class RagPipeline:
         try:
             with open(BM25_PKL, "rb") as f:
                 data = pickle.load(f)
-
-            bm25 = data.get("bm25")
+            bm25 = data.get("bm25") if isinstance(data, dict) else data
             if not hasattr(bm25, 'get_scores'):
                 raise ValueError("The loaded BM25 index is invalid.")
-            log.info(f"Loaded BM25 index with {len(
+            log.info(f"Loaded BM25 index with {len(bm25.corpus)} documents.")
             return bm25
         except Exception as e:
             log.error(f"Failed to load BM25 index ({e}); check the file format.")
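The loader above now accepts either a bare BM25 object or a {"bm25": ...} dict, and logs len(bm25.corpus). Below is a sketch of a matching generation script, assuming the rank_bm25 package; note that BM25Okapi does not retain the raw corpus itself, so the sketch attaches it explicitly to make the corpus-length checks work at load time. The two sentences are hypothetical.

import pickle
from rank_bm25 import BM25Okapi

sentences = ["example sentence one", "example sentence two"]  # hypothetical corpus
tokenized = [s.split() for s in sentences]

bm25 = BM25Okapi(tokenized)
bm25.corpus = tokenized  # attached so len(bm25.corpus) works in _load_data

with open("bm25.pkl", "wb") as f:
    pickle.dump({"bm25": bm25}, f)  # the dict form this loader expects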
@@ -211,14 +223,15 @@ class RagPipeline:
                 messages=messages,
                 temperature=temperature,
                 max_tokens=max_tokens,
+                stop=LLM_MODEL_CONFIG.get("stop_tokens") or None,
             )
             return response.choices[0].message.content
         except Exception as e:
             log.error(f"LLM API call failed: {e}")
             raise
 
-    async def answer_question(self, q_orig: str) -> str:
-        """Full pipeline for handling a user question"""
+    def answer_question(self, q_orig: str) -> str:
+        """Full pipeline for handling a user question (synchronous version)"""
         start_time = time.time()
         log.info(f"===== Handling new query: '{q_orig}' =====")
 
@@ -239,8 +252,7 @@ class RagPipeline:
         log.info(f"Analysis result - intents: {intents}")
 
         all_reranked_results = []
-
-
+
         log.info("Step 3/5: retrieval and reranking...")
         relevant_indices = {i for i, m in enumerate(self.state.meta) if m.get("drug_id") in drug_ids}
         if not relevant_indices:
@@ -275,26 +287,41 @@ class RagPipeline:
                 else:
                     candidate_dict[i] = {"sem": 0.0, "bm": bm_score}
 
-
+            candidates_list = []
             for i, scores in candidate_dict.items():
+                candidates_list.append((i, scores["sem"], scores["bm"]))
+
+            if not candidates_list:
+                continue
+
+            # Normalize scores
+            sem_vals = np.array([s for _, s, _ in candidates_list], dtype=np.float32)
+            bm_vals = np.array([b for _, _, b in candidates_list], dtype=np.float32)
+
+            def norm(x):
+                rng = x.max() - x.min()
+                return (x - x.min()) / (rng + 1e-8)
+
+            sem_n = norm(sem_vals)
+            bm_n = norm(bm_vals)
+
+            fused_candidates = []
+            for idx, (i, s_raw, b_raw) in enumerate(candidates_list):
                 section_name = self.state.meta[i].get("section", "其他")
                 section_weight = weights.get(section_name, 1.0)
-                fused_score = (
-                    scores["sem"] * 0.5
-                    + scores["bm"] * 0.4
-                ) * section_weight
+                fused_score = (sem_n[idx] * 0.5 + bm_n[idx] * 0.4) * section_weight
+                fused_candidates.append((i, fused_score, s_raw, b_raw))
+
+            fused_candidates.sort(key=lambda x: x[1], reverse=True)
 
-            sub_reranked = self._rerank_with_crossencoder(q_orig,
+            sub_reranked = self._rerank_with_crossencoder(q_orig, fused_candidates, self.state.sentences, self.reranker, TOP_K_SENTENCES, self.state.meta, RERANK_THRESHOLD)
 
+            # De-duplicate by sentence index
+            processed_indices = {res['idx'] for res in all_reranked_results}
             for r in sub_reranked:
-                if r.get('chunk_id') and r['chunk_id'] in processed_chunk_ids:
-                    continue
-                elif not r.get('chunk_id') and r['idx'] in {res['idx'] for res in all_reranked_results}:
+                if r['idx'] in processed_indices:
                     continue
-
                 all_reranked_results.append(r)
-                if r.get('chunk_id'):
-                    processed_chunk_ids.add(r['chunk_id'])
 
         all_reranked_results.sort(key=lambda x: x['rerank_score'], reverse=True)
         log.info(f"Reranker selected {len(all_reranked_results)} final high-quality candidates.")
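The new fusion step min-max normalises each score channel before combining, so unbounded BM25 scores cannot drown out cosine similarities that live roughly in [0, 1]. A self-contained sketch of the same arithmetic with hypothetical inputs:

import numpy as np

def min_max(x: np.ndarray) -> np.ndarray:
    rng = x.max() - x.min()
    return (x - x.min()) / (rng + 1e-8)  # epsilon guards the all-equal case

sem = np.array([0.82, 0.40, 0.10], dtype=np.float32)     # cosine similarities
bm = np.array([12.0, 3.5, 0.0], dtype=np.float32)        # raw BM25 scores
section_w = np.array([1.2, 1.0, 0.8], dtype=np.float32)  # per-section weights

fused = (min_max(sem) * 0.5 + min_max(bm) * 0.4) * section_w
print(fused.argsort()[::-1])  # candidate order, best first

The 0.5 and 0.4 weights need not sum to 1: only the relative order of fused scores matters, since they pick candidates for the cross-encoder rather than serve as calibrated probabilities.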
@@ -358,12 +385,21 @@ class RagPipeline:
         return {"sub_queries": [query], "intents": []}
 
     def _find_drug_ids_from_name(self, query: str, df: pd.DataFrame) -> List[str]:
+        if df is None:
+            return []
+
         candidates = extract_drug_candidates_from_query(query)
         expanded = expand_aliases(candidates)
+
         drug_ids = set()
         for alias in expanded:
-            mask = df['drug_name_norm_normalized'].str.contains(alias.lower(), case=False, na=False)
-            drug_ids.update(df.loc[mask, 'drug_id'].dropna().unique().tolist())
+            try:
+                # Use regex=False for literal matching, which is safer
+                mask = df['drug_name_norm_normalized'].str.contains(alias.lower(), case=False, regex=False, na=False)
+                matches = df.loc[mask, 'drug_id'].dropna().unique().tolist()
+                drug_ids.update(matches)
+            except Exception as e:
+                log.warning(f"Failed to match '{alias}': {e}. Skipping this alias.")
         return list(drug_ids)
 
     def _expand_query_with_llm(self, query: str, intents: List[str]) -> str:
@@ -407,8 +443,23 @@ class RagPipeline:
             return [], []
         q_emb = embedding_model.encode([query], convert_to_numpy=True).astype("float32")
         faiss.normalize_L2(q_emb)
+
         distances, indices = index.search(q_emb, top_k)
-        return indices[0].tolist(), distances[0].tolist()
+
+        # Check the metric type so scores are consistently "higher is better"
+        metric = getattr(index, "metric_type", None)
+        try:
+            import faiss
+            METRIC_L2 = faiss.METRIC_L2
+        except Exception:
+            METRIC_L2 = 1
+
+        if metric == METRIC_L2:
+            scores = (-distances[0]).tolist()  # L2 distance is smaller for closer points
+        else:
+            scores = distances[0].tolist()  # Inner product (cosine) is larger for closer points
+
+        return indices[0].tolist(), scores
 
     def _rerank_with_crossencoder(self, query: str, candidates: List[Tuple], sentences: List[str], reranker, top_k: int, meta: List[Dict], threshold: float) -> List[Dict]:
         if not candidates:
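The added metric check keeps retrieval scores on one "higher is better" scale. A small sketch of why, using faiss directly: an inner-product index over L2-normalised vectors returns cosine similarity, while an L2 index returns a distance that must be negated. The dimensions and data here are arbitrary.

import numpy as np
import faiss

d = 8
xb = np.random.rand(16, d).astype("float32")
faiss.normalize_L2(xb)

ip = faiss.IndexFlatIP(d)  # metric_type == faiss.METRIC_INNER_PRODUCT
l2 = faiss.IndexFlatL2(d)  # metric_type == faiss.METRIC_L2
ip.add(xb)
l2.add(xb)

q = xb[:1]
sim, _ = ip.search(q, 3)   # usable as-is: higher means closer
dist, _ = l2.search(q, 3)  # negate so higher still means closer
print(sim[0], -dist[0])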
@@ -420,13 +471,14 @@ class RagPipeline:
         scores = reranker.predict(pairs)
 
         reranked = []
-        for (i, sem_score, bm_score), score in zip(limited_candidates, scores):
-            if score >= threshold:
+        for (i, fused_score, sem_score, bm_score), rerank_score in zip(limited_candidates, scores):
+            if rerank_score >= threshold:
                 reranked.append({
                     "idx": i,
-                    "rerank_score": score,
-                    "sem_score": sem_score,
-                    "bm_score": bm_score,
+                    "rerank_score": rerank_score,
+                    "fused_score": fused_score,
+                    "sem_score": sem_score,
+                    "bm_score": bm_score,
                     "meta": meta[i],
                     "text": sentences[i]
                 })
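The rerank threshold is applied to raw cross-encoder scores. A minimal sketch of that scoring pattern, assuming the sentence-transformers CrossEncoder API; the model name, texts, and threshold here are illustrative, not necessarily what this Space deploys.

from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
query = "aspirin dosage"
texts = ["Take one 100 mg tablet daily.", "Store below 25 degrees C."]

pairs = [(query, t) for t in texts]
scores = reranker.predict(pairs)  # one relevance score per (query, text) pair

threshold = 0.0
kept = [t for t, s in zip(texts, scores) if s >= threshold]
print(kept)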
@@ -474,6 +526,7 @@ class RagPipeline:
 # ---------- FastAPI events and routes ----------
 app = FastAPI()
 rag_pipeline = None
+
 class AppConfig:
     CHANNEL_ACCESS_TOKEN = os.getenv("LINE_CHANNEL_ACCESS_TOKEN")
     CHANNEL_SECRET = os.getenv("LINE_CHANNEL_SECRET")
@@ -482,12 +535,21 @@ class AppConfig:
 async def startup_event():
     """Tasks to run at application startup"""
     log.info("===== Application Startup =====")
+    missing = []
+    if not AppConfig.CHANNEL_ACCESS_TOKEN: missing.append("LINE_CHANNEL_ACCESS_TOKEN")
+    if not AppConfig.CHANNEL_SECRET: missing.append("LINE_CHANNEL_SECRET")
+    if not LLM_API_CONFIG.get("api_key"): missing.append("LITELLM_API_KEY")
+    if not LLM_API_CONFIG.get("base_url"): missing.append("LITELLM_BASE_URL")
+    if not LLM_API_CONFIG.get("model"): missing.append("LM_MODEL")
+    if missing:
+        log.error(f"Missing required environment variables: {missing}")
+        raise RuntimeError(f"Missing required environment variables: {missing}")
+
 global rag_pipeline
 rag_pipeline = RagPipeline(AppConfig)
 rag_pipeline._load_data()
-
-    if not AppConfig.CHANNEL_ACCESS_TOKEN or not AppConfig.CHANNEL_SECRET:
-        log.error("Error: LINE_CHANNEL_ACCESS_TOKEN or LINE_CHANNEL_SECRET is not set!")
+    log.info("Startup checks complete.")
+
 
 @app.get("/health", status_code=status.HTTP_200_OK)
 async def health_check():
@@ -503,12 +565,15 @@ async def handle_webhook(request: Request, response: Response):
 
     body = await request.body()
     try:
-        expected = base64.b64encode(hmac.new(AppConfig.CHANNEL_SECRET.encode("utf-8"), body, hashlib.sha256).digest()).decode()
-        if expected != signature:
+        digest = hmac.new(AppConfig.CHANNEL_SECRET.encode("utf-8"), body, hashlib.sha256).digest()
+        expected = base64.b64encode(digest).decode()
+        if not hmac.compare_digest(expected, signature):
             raise HTTPException(status_code=403, detail="Invalid signature")
+    except HTTPException:
+        raise
     except Exception as e:
         log.error(f"Signature verification failed: {e}")
-        raise HTTPException(status_code=403, detail="Invalid signature")
+        raise HTTPException(status_code=500, detail="Signature verification error")
 
     data = json.loads(body.decode('utf-8'))
     for event in data.get("events", []):
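The handler now verifies X-Line-Signature itself. Per LINE's documentation, the signature is the base64 encoding of an HMAC-SHA256 over the raw request body, keyed with the channel secret; compare_digest gives a constant-time comparison. A standalone sketch with hypothetical values:

import base64
import hashlib
import hmac

def verify_line_signature(channel_secret: str, body: bytes, signature: str) -> bool:
    digest = hmac.new(channel_secret.encode("utf-8"), body, hashlib.sha256).digest()
    expected = base64.b64encode(digest).decode()
    return hmac.compare_digest(expected, signature)

# Round-trip check with a made-up secret and body
sig = base64.b64encode(hmac.new(b"secret", b"{}", hashlib.sha256).digest()).decode()
assert verify_line_signature("secret", b"{}", sig)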
@@ -518,7 +583,8 @@ async def handle_webhook(request: Request, response: Response):
 
         if not user_text: continue
 
-        answer = await rag_pipeline.answer_question(user_text)
+        # Offload the heavy pipeline work to a thread pool
+        answer = await run_in_threadpool(rag_pipeline.answer_question, user_text)
 
         if reply_token:
             line_reply(reply_token, answer)
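Since answer_question is now synchronous, calling it directly from the async handler would block the event loop for the whole retrieve-rerank-generate cycle. run_in_threadpool moves that work onto Starlette's thread pool. A sketch of the pattern; slow_pipeline is a hypothetical stand-in for rag_pipeline.answer_question.

import time
from fastapi import FastAPI
from starlette.concurrency import run_in_threadpool

app = FastAPI()

def slow_pipeline(question: str) -> str:
    time.sleep(2)  # stands in for retrieval, reranking, and the LLM call
    return f"answer to {question!r}"

@app.get("/ask")
async def ask(q: str):
    answer = await run_in_threadpool(slow_pipeline, q)  # event loop stays free
    return {"answer": answer}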
@@ -526,21 +592,23 @@ async def handle_webhook(request: Request, response: Response):
     return {"status": "ok"}
 
 def line_reply(reply_token: str, text: str):
-    """Reply via the LINE Messaging API"""
+    """Reply via the LINE Messaging API, chunking the text to respect the length limit"""
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {AppConfig.CHANNEL_ACCESS_TOKEN}"
     }
-    data = {
-        "replyToken": reply_token,
-        "messages": [{"type": "text", "text": text}]
-    }
+    # A LINE text message is capped at roughly 5000 characters
+    chunks = textwrap.wrap(text, 4900)
+    messages = [{"type": "text", "text": c} for c in chunks] or [{"type": "text", "text": text[:4900]}]
+    data = {"replyToken": reply_token, "messages": messages}
     try:
-        requests.post("https://api.line.me/v2/bot/message/reply", headers=headers, json=data)
+        r = requests.post("https://api.line.me/v2/bot/message/reply", headers=headers, json=data, timeout=10)
+        if r.status_code >= 300:
+            log.error(f"LINE API reply failed: {r.status_code} {r.text}")
     except Exception as e:
         log.error(f"LINE API reply failed: {e}")
 
-# ---- Utility functions ----
+# ---- Additional utility functions ----
 def extract_drug_candidates_from_query(query: str) -> list:
     query = re.sub(r"[A-Za-z]+", lambda m: m.group(0).lower(), query)
     candidates = set()
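One behaviour worth knowing about the new chunking: textwrap.wrap splits on whitespace and collapses newlines, so formatting inside a long answer is not preserved, and it returns an empty list for whitespace-only input, which is what the "or" fallback covers. A small sketch with a hypothetical long answer:

import textwrap

text = "word " * 2000  # roughly 10,000 characters
chunks = textwrap.wrap(text, 4900)
messages = [{"type": "text", "text": c} for c in chunks]
print(len(chunks), max(len(c) for c in chunks))  # every chunk fits the cap

LINE also caps a single reply at five message objects, so extremely long answers may still need truncation on top of chunking.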
@@ -552,8 +620,11 @@ def extract_drug_candidates_from_query(query: str) -> list:
         clean_token = re.sub(r'[a-zA-Z0-9\s]+', '', token).strip()
         if clean_token and clean_token.lower() not in DRUG_STOPWORDS:
             candidates.add(clean_token)
-    if drug_part.strip():
-        candidates.add(drug_part.strip())
+
+    # Avoid adding the whole drug_part to prevent regex errors
+    # if drug_part.strip():
+    #     candidates.add(drug_part.strip())
+
     for query_name, dataset_name in DRUG_NAME_MAPPING.items():
         if query_name in query.lower():
             candidates.add(dataset_name)
@@ -575,5 +646,3 @@ def expand_aliases(candidates: list) -> list:
 if __name__ == "__main__":
     port = int(os.getenv("PORT", 7860))
     uvicorn.run(app, host="0.0.0.0", port=port)
-
-