Commit 34b2ab3 by KevinHuSh
Parent(s): 484e5ab

Test APIs and fix bugs (#41)

api/apps/chunk_app.py CHANGED
@@ -214,7 +214,7 @@ def retrieval_test():
     question = req["question"]
     kb_id = req["kb_id"]
     doc_ids = req.get("doc_ids", [])
-    similarity_threshold = float(req.get("similarity_threshold", 0.4))
+    similarity_threshold = float(req.get("similarity_threshold", 0.2))
     vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
     top = int(req.get("top", 1024))
     try:
api/apps/conversation_app.py CHANGED
@@ -170,7 +170,7 @@ def chat(dialog, messages, **kwargs):
         if p["key"] not in kwargs:
             prompt_config["system"] = prompt_config["system"].replace("{%s}"%p["key"], " ")
 
-    model_config = TenantLLMService.get_api_key(dialog.tenant_id, LLMType.CHAT.value, dialog.llm_id)
+    model_config = TenantLLMService.get_api_key(dialog.tenant_id, dialog.llm_id)
     if not model_config: raise LookupError("LLM({}) API key not found".format(dialog.llm_id))
 
     question = messages[-1]["content"]
@@ -186,10 +186,10 @@ def chat(dialog, messages, **kwargs):
     kwargs["knowledge"] = "\n".join(knowledges)
     gen_conf = dialog.llm_setting[dialog.llm_setting_type]
     msg = [{"role": m["role"], "content": m["content"]} for m in messages if m["role"] != "system"]
-    used_token_count = message_fit_in(msg, int(llm.max_tokens * 0.97))
+    used_token_count, msg = message_fit_in(msg, int(llm.max_tokens * 0.97))
     if "max_tokens" in gen_conf:
         gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count)
-    mdl = ChatModel[model_config.llm_factory](model_config["api_key"], dialog.llm_id)
+    mdl = ChatModel[model_config.llm_factory](model_config.api_key, dialog.llm_id)
     answer = mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf)
 
     answer = retrievaler.insert_citations(answer,
@@ -198,4 +198,6 @@ def chat(dialog, messages, **kwargs):
                                           embd_mdl,
                                           tkweight=1-dialog.vector_similarity_weight,
                                           vtweight=dialog.vector_similarity_weight)
+    for c in kbinfos["chunks"]:
+        if c.get("vector"): del c["vector"]
     return {"answer": answer, "retrieval": kbinfos}
api/apps/document_app.py CHANGED
@@ -11,7 +11,8 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License
+#
 #
 import base64
 import pathlib
@@ -65,7 +66,7 @@ def upload():
     while MINIO.obj_exist(kb_id, location):
         location += "_"
     blob = request.files['file'].read()
-    MINIO.put(kb_id, filename, blob)
+    MINIO.put(kb_id, location, blob)
     doc = DocumentService.insert({
         "id": get_uuid(),
         "kb_id": kb.id,
@@ -188,7 +189,10 @@ def rm():
     e, doc = DocumentService.get_by_id(req["doc_id"])
     if not e:
         return get_data_error_result(retmsg="Document not found!")
-    ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(doc.kb_id))
+    tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+    if not tenant_id:
+        return get_data_error_result(retmsg="Tenant not found!")
+    ELASTICSEARCH.deleteByQuery(Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
 
     DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, 0)
     if not DocumentService.delete_by_id(req["doc_id"]):
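The `upload()` fix matters because the loop above it appends "_" to `location` until the key is free; storing the blob under `filename` meant the deduplicated key was never actually written. A minimal sketch of the corrected pattern, with an in-memory dict standing in for MINIO (the `store` and `put_unique` names are illustrative):

    store = {}  # stand-in for the object store: {(bucket, key): blob}

    def put_unique(bucket, filename, blob):
        location = filename
        while (bucket, location) in store:   # mirrors the obj_exist loop in upload()
            location += "_"
        store[(bucket, location)] = blob     # store under the deduplicated key
        return location

    put_unique("kb1", "a.pdf", b"v1")   # stored as "a.pdf"
    put_unique("kb1", "a.pdf", b"v2")   # stored as "a.pdf_", nothing overwritten

The `rm()` hunk is related plumbing: the Elasticsearch index is named per tenant, not per knowledge base, so the document's tenant has to be resolved before `deleteByQuery` can target the right index.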
api/apps/llm_app.py CHANGED
@@ -75,7 +75,7 @@ def list():
     llms = LLMService.get_all()
     llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value]
     for m in llms:
-        m["available"] = m.llm_name in mdlnms
+        m["available"] = m["llm_name"] in mdlnms
 
     res = {}
     for m in llms:
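The one-line change fixes an `AttributeError`: after the `to_dict()` comprehension each `m` is a plain dict, which supports only subscript access. A two-line reproduction of the bug:

    m = {"llm_name": "gpt-3.5-turbo"}
    m["llm_name"]   # OK
    m.llm_name      # AttributeError: 'dict' object has no attribute 'llm_name'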
api/db/db_models.py CHANGED
@@ -469,7 +469,7 @@ class Knowledgebase(DataBaseModel):
     doc_num = IntegerField(default=0)
     token_num = IntegerField(default=0)
     chunk_num = IntegerField(default=0)
-    similarity_threshold = FloatField(default=0.4)
+    similarity_threshold = FloatField(default=0.2)
     vector_similarity_weight = FloatField(default=0.3)
 
     parser_id = CharField(max_length=32, null=False, help_text="default parser ID")
@@ -521,7 +521,7 @@ class Dialog(DataBaseModel):
     prompt_config = JSONField(null=False, default={"system": "", "prologue": "您好,我是您的助手小樱,长得可爱又善良,can I help you?",
                                                    "parameters": [], "empty_response": "Sorry! 知识库中未找到相关内容!"})
 
-    similarity_threshold = FloatField(default=0.4)
+    similarity_threshold = FloatField(default=0.2)
     vector_similarity_weight = FloatField(default=0.3)
     top_n = IntegerField(default=6)
 
api/db/services/llm_service.py CHANGED
@@ -63,7 +63,7 @@ class TenantLLMService(CommonService):
 
         model_config = cls.get_api_key(tenant_id, mdlnm)
         if not model_config: raise LookupError("Model({}) not found".format(mdlnm))
-        model_config = model_config[0].to_dict()
+        model_config = model_config.to_dict()
         if llm_type == LLMType.EMBEDDING.value:
             if model_config["llm_factory"] not in EmbeddingModel: return
             return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"])
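Taken together with the `conversation_app.py` hunk, `get_api_key` is now treated as returning a single ORM record rather than a list, so both the `[0]` index here and the `model_config["api_key"]` subscript at the call site had to go. A sketch of the assumed return shape (`SimpleNamespace` stands in for the Peewee record):

    from types import SimpleNamespace

    # Illustrative stand-in for the single record get_api_key now returns.
    model_config = SimpleNamespace(llm_factory="OpenAI",
                                   api_key="YOUR_KEY",
                                   llm_name="gpt-3.5-turbo")

    model_config.api_key    # attribute access, as used in conversation_app.py
    vars(model_config)      # dict form, roughly what .to_dict() yields
    # model_config[0]       # the old list-style indexing would raise TypeError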
api/utils/file_utils.py CHANGED
@@ -143,7 +143,7 @@ def filename_type(filename):
     if re.match(r".*\.pdf$", filename):
         return FileType.PDF.value
 
-    if re.match(r".*\.(doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key|md)$", filename):
+    if re.match(r".*\.(docx|doc|ppt|yml|xml|htm|json|csv|txt|ini|xsl|wps|rtf|hlp|pages|numbers|key|md)$", filename):
         return FileType.DOC.value
 
     if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
rag/llm/chat_model.py CHANGED
@@ -19,31 +19,39 @@ import os
 
 
 class Base(ABC):
+    def __init__(self, key, model_name):
+        pass
+
     def chat(self, system, history, gen_conf):
         raise NotImplementedError("Please implement encode method!")
 
 
 class GptTurbo(Base):
-    def __init__(self):
-        self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
+    def __init__(self, key, model_name="gpt-3.5-turbo"):
+        self.client = OpenAI(api_key=key)
+        self.model_name = model_name
 
     def chat(self, system, history, gen_conf):
         history.insert(0, {"role": "system", "content": system})
         res = self.client.chat.completions.create(
-            model="gpt-3.5-turbo",
+            model=self.model_name,
             messages=history,
             **gen_conf)
         return res.choices[0].message.content.strip()
 
 
+from dashscope import Generation
 class QWenChat(Base):
+    def __init__(self, key, model_name=Generation.Models.qwen_turbo):
+        import dashscope
+        dashscope.api_key = key
+        self.model_name = model_name
+
     def chat(self, system, history, gen_conf):
         from http import HTTPStatus
-        from dashscope import Generation
-        # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
         history.insert(0, {"role": "system", "content": system})
         response = Generation.call(
-            Generation.Models.qwen_turbo,
+            self.model_name,
             messages=history,
             result_format='message'
         )
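With every subclass now accepting `(key, model_name)`, the call site in `conversation_app.py` can instantiate any backend uniformly through the factory table. A minimal sketch of that dispatch, assuming a registry mapping factory names to these classes (the key strings and `make_chat_model` helper are illustrative):

    # Assumed registry shape; the real mapping lives in rag.llm.
    ChatModel = {"OpenAI": GptTurbo, "QWen": QWenChat}

    def make_chat_model(llm_factory, api_key, llm_id):
        # Uniform construction works because every subclass takes (key, model_name).
        return ChatModel[llm_factory](api_key, llm_id)

    mdl = make_chat_model("OpenAI", "YOUR_KEY", "gpt-3.5-turbo")
    answer = mdl.chat("You are a helpful assistant.",
                      [{"role": "user", "content": "hello"}],
                      {"temperature": 0.7})

Moving `dashscope.api_key = key` into `QWenChat.__init__` also removes the old reliance on the DASHSCOPE_API_KEY environment variable, consistent with keys now coming from per-tenant configuration.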
rag/llm/cv_model.py CHANGED
@@ -28,6 +28,8 @@ class Base(ABC):
         raise NotImplementedError("Please implement encode method!")
 
     def image2base64(self, image):
+        if isinstance(image, bytes):
+            return base64.b64encode(image).decode("utf-8")
         if isinstance(image, BytesIO):
             return base64.b64encode(image.getvalue()).decode("utf-8")
         buffered = BytesIO()
@@ -59,7 +61,7 @@ class Base(ABC):
 
 class GptV4(Base):
     def __init__(self, key, model_name="gpt-4-vision-preview"):
-        self.client = OpenAI(key)
+        self.client = OpenAI(api_key=key)
         self.model_name = model_name
 
     def describe(self, image, max_tokens=300):
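The new `bytes` branch lets callers pass raw blobs (e.g., what `request.files['file'].read()` or an object-store fetch yields) without wrapping them in `BytesIO` first. A standalone check that the two fast paths agree:

    import base64
    from io import BytesIO

    blob = b"\x89PNG..."   # stand-in for a raw image blob

    raw = base64.b64encode(blob).decode("utf-8")                          # bytes path
    wrapped = base64.b64encode(BytesIO(blob).getvalue()).decode("utf-8")  # BytesIO path
    assert raw == wrapped

The `GptV4` fix is the same keyword-argument correction as in `chat_model.py`: `OpenAI()` takes `api_key` by keyword, and passing the key positionally does not set it.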
rag/nlp/search.py CHANGED
@@ -187,9 +187,10 @@ class Dealer:
             if len(t) < 5: continue
             idx.append(i)
             pieces_.append(t)
+        es_logger.info("{} => {}".format(answer, pieces_))
         if not pieces_: return answer
 
-        ans_v = embd_mdl.encode(pieces_)
+        ans_v, c = embd_mdl.encode(pieces_)
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))
 
@@ -219,7 +220,7 @@ class Dealer:
             Dealer.trans2floats(
                 sres.field[i]["q_%d_vec" % len(sres.query_vector)]) for i in sres.ids]
         if not ins_embd:
-            return []
+            return [], [], []
         ins_tw = [huqie.qie(sres.field[i][cfield]).split(" ") for i in sres.ids]
         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
                                                         ins_embd,
@@ -235,6 +236,8 @@ class Dealer:
 
     def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
                   vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
+        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
+        if not question: return ranks
         req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": top,
                "question": question, "vector": True,
                "similarity": similarity_threshold}
@@ -243,7 +246,7 @@ class Dealer:
         sim, tsim, vsim = self.rerank(
             sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
         idx = np.argsort(sim * -1)
-        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
+
         dim = len(sres.query_vector)
         start_idx = (page - 1) * page_size
         for i in idx:
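Both similarity changes here are tuple-shape fixes: `embd_mdl.encode` evidently returns a `(vectors, token_count)` pair, and `rerank` is consumed as `sim, tsim, vsim = self.rerank(...)`, so the empty case must mirror the three-value shape. A minimal reproduction of the crash that `return [], [], []` prevents:

    def rerank_old(ids):
        if not ids: return []

    def rerank_new(ids):
        if not ids: return [], [], []

    sim, tsim, vsim = rerank_new([])      # fine: three empty sequences
    # sim, tsim, vsim = rerank_old([])    # ValueError: not enough values to unpack

Hoisting `ranks` to the top of `retrieval` and adding `if not question: return ranks` likewise keeps an empty query from ever reaching Elasticsearch, returning the same empty structure callers already expect.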
rag/svr/parse_user_docs.py CHANGED
@@ -78,6 +78,7 @@ def chuck_doc(name, binary, cvmdl=None):
         field = TextChunker.Fields()
         field.text_chunks = [(txt, binary)]
         field.table_chunks = []
+        return field
 
     return TextChunker()(binary)
 
@@ -161,9 +162,9 @@ def build(row, cvmdl):
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     output_buffer = BytesIO()
     docs = []
-    md5 = hashlib.md5()
     for txt, img in obj.text_chunks:
         d = copy.deepcopy(doc)
+        md5 = hashlib.md5()
         md5.update((txt + str(d["doc_id"])).encode("utf-8"))
         d["_id"] = md5.hexdigest()
         d["content_ltks"] = huqie.qie(txt)
@@ -186,6 +187,7 @@ def build(row, cvmdl):
         for i, txt in enumerate(arr):
             d = copy.deepcopy(doc)
             d["content_ltks"] = huqie.qie(txt)
+            md5 = hashlib.md5()
             md5.update((txt + str(d["doc_id"])).encode("utf-8"))
             d["_id"] = md5.hexdigest()
             if not img:
@@ -226,9 +228,6 @@ def embedding(docs, mdl):
 
 
 def main(comm, mod):
-    global model
-    from rag.llm import HuEmbedding
-    model = HuEmbedding()
     tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"{comm}-{mod}.tm")
     tm = findMaxTm(tm_fnm)
     rows = collect(comm, mod, tm)
@@ -260,13 +259,14 @@ def main(comm, mod):
             set_progress(r["id"], random.randint(70, 95) / 100.,
                          "Finished embedding! Start to build index!")
             init_kb(r)
+            chunk_count = len(set([c["_id"] for c in cks]))
             es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
             if es_r:
                 set_progress(r["id"], -1, "Index failure!")
                 cron_logger.error(str(es_r))
             else:
                 set_progress(r["id"], 1., "Done!")
-            DocumentService.increment_chunk_num(r["id"], r["kb_id"], tk_count, len(cks), timer()-st_tm)
+            DocumentService.increment_chunk_num(r["id"], r["kb_id"], tk_count, chunk_count, timer()-st_tm)
             cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))
 
             tmf.write(str(r["update_time"]) + "\n")
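The md5 relocation is the substantive fix in `build()`: a `hashlib.md5()` object accumulates every `update()` call, so one hasher shared across the loop made each chunk's `_id` depend on all chunks hashed before it, and re-parsing the same document yielded different IDs. A standalone demonstration:

    import hashlib

    chunks = ["alpha", "beta"]

    # Old: one shared hasher; the second digest covers "alpha" + "beta".
    md5 = hashlib.md5()
    shared = []
    for txt in chunks:
        md5.update(txt.encode("utf-8"))
        shared.append(md5.hexdigest())

    # New: a fresh hasher per chunk; each digest covers only its own text.
    fresh = [hashlib.md5(txt.encode("utf-8")).hexdigest() for txt in chunks]

    assert fresh[1] == hashlib.md5(b"beta").hexdigest()
    assert shared[1] != fresh[1]   # the shared hasher chained the inputs

It also motivates the new `chunk_count = len(set(...))`: chunks sharing an `_id` collapse into one Elasticsearch document on bulk indexing, so counting unique IDs keeps `increment_chunk_num` consistent with what was actually indexed.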