zhichyu committed on
Commit
4ba2b4f
·
1 Parent(s): 47ea26c

Replaced md5 with xxhash64 for chunk id (#4009)

Browse files

### What problem does this PR solve?

Replaced md5 with xxhash64 for chunk id

### Type of change

- [x] Refactoring

api/apps/chunk_app.py CHANGED
@@ -31,7 +31,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, va
31
  from api.db.services.document_service import DocumentService
32
  from api import settings
33
  from api.utils.api_utils import get_json_result
34
- import hashlib
35
  import re
36
 
37
 
@@ -208,9 +208,7 @@ def rm():
208
  @validate_request("doc_id", "content_with_weight")
209
  def create():
210
  req = request.json
211
- md5 = hashlib.md5()
212
- md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
213
- chunck_id = md5.hexdigest()
214
  d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
215
  "content_with_weight": req["content_with_weight"]}
216
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 
31
  from api.db.services.document_service import DocumentService
32
  from api import settings
33
  from api.utils.api_utils import get_json_result
34
+ import xxhash
35
  import re
36
 
37
 
 
208
  @validate_request("doc_id", "content_with_weight")
209
  def create():
210
  req = request.json
211
+ chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest()
 
 
212
  d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
213
  "content_with_weight": req["content_with_weight"]}
214
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
api/apps/sdk/doc.py CHANGED
@@ -22,7 +22,7 @@ from rag.nlp import rag_tokenizer
22
  from api.db import LLMType, ParserType
23
  from api.db.services.llm_service import TenantLLMService
24
  from api import settings
25
- import hashlib
26
  import re
27
  from api.utils.api_utils import token_required
28
  from api.db.db_models import Task
@@ -984,10 +984,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
984
  return get_error_data_result(
985
  "`questions` is required to be a list"
986
  )
987
- md5 = hashlib.md5()
988
- md5.update((req["content"] + document_id).encode("utf-8"))
989
-
990
- chunk_id = md5.hexdigest()
991
  d = {
992
  "id": chunk_id,
993
  "content_ltks": rag_tokenizer.tokenize(req["content"]),
 
22
  from api.db import LLMType, ParserType
23
  from api.db.services.llm_service import TenantLLMService
24
  from api import settings
25
+ import xxhash
26
  import re
27
  from api.utils.api_utils import token_required
28
  from api.db.db_models import Task
 
984
  return get_error_data_result(
985
  "`questions` is required to be a list"
986
  )
987
+ chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
 
 
 
988
  d = {
989
  "id": chunk_id,
990
  "content_ltks": rag_tokenizer.tokenize(req["content"]),
api/db/services/document_service.py CHANGED
@@ -14,7 +14,7 @@
14
  # limitations under the License.
15
  #
16
  import logging
17
- import hashlib
18
  import json
19
  import random
20
  import re
@@ -508,10 +508,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
508
  for ck in th.result():
509
  d = deepcopy(doc)
510
  d.update(ck)
511
- md5 = hashlib.md5()
512
- md5.update((ck["content_with_weight"] +
513
- str(d["doc_id"])).encode("utf-8"))
514
- d["id"] = md5.hexdigest()
515
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
516
  d["create_timestamp_flt"] = datetime.now().timestamp()
517
  if not d.get("image"):
 
14
  # limitations under the License.
15
  #
16
  import logging
17
+ import xxhash
18
  import json
19
  import random
20
  import re
 
508
  for ck in th.result():
509
  d = deepcopy(doc)
510
  d.update(ck)
511
+ d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
 
 
 
512
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
513
  d["create_timestamp_flt"] = datetime.now().timestamp()
514
  if not d.get("image"):
api/db/services/task_service.py CHANGED
@@ -35,17 +35,13 @@ from api import settings
35
  from rag.nlp import search
36
 
37
  def trim_header_by_lines(text: str, max_length) -> str:
38
- if len(text) <= max_length:
 
39
  return text
40
- lines = text.split("\n")
41
- total = 0
42
- idx = len(lines) - 1
43
- for i in range(len(lines)-1, -1, -1):
44
- if total + len(lines[i]) > max_length:
45
- break
46
- idx = i
47
- text2 = "\n".join(lines[idx:])
48
- return text2
49
 
50
  class TaskService(CommonService):
51
  model = Task
@@ -183,7 +179,7 @@ class TaskService(CommonService):
183
  if os.environ.get("MACOS"):
184
  if info["progress_msg"]:
185
  task = cls.model.get_by_id(id)
186
- progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
187
  cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
188
  if "progress" in info:
189
  cls.model.update(progress=info["progress"]).where(
@@ -194,7 +190,7 @@ class TaskService(CommonService):
194
  with DB.lock("update_progress", -1):
195
  if info["progress_msg"]:
196
  task = cls.model.get_by_id(id)
197
- progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
198
  cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
199
  if "progress" in info:
200
  cls.model.update(progress=info["progress"]).where(
 
35
  from rag.nlp import search
36
 
37
def trim_header_by_lines(text: str, max_length) -> str:
    """Drop leading lines from *text* so the result fits in *max_length* characters.

    The cut always happens immediately after a newline, so the returned text
    starts at a line boundary. If *text* already fits, or no line boundary
    yields a short-enough suffix (e.g. the final line alone exceeds
    *max_length*), *text* is returned unchanged.

    :param text: the text to trim from the front.
    :param max_length: maximum allowed length of the result (int).
    :return: the trimmed suffix of *text*, or *text* itself if no trim applies.
    """
    len_text = len(text)
    if len_text <= max_length:
        return text
    # We need the first index i with text[i] == '\n' and len_text - i <= max_length,
    # i.e. the first newline at position >= len_text - max_length. str.find does
    # that search at C speed instead of a per-character Python loop.
    cut = text.find('\n', len_text - max_length)
    if cut != -1:
        return text[cut + 1:]
    return text
 
 
 
 
 
45
 
46
  class TaskService(CommonService):
47
  model = Task
 
179
  if os.environ.get("MACOS"):
180
  if info["progress_msg"]:
181
  task = cls.model.get_by_id(id)
182
+ progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
183
  cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
184
  if "progress" in info:
185
  cls.model.update(progress=info["progress"]).where(
 
190
  with DB.lock("update_progress", -1):
191
  if info["progress_msg"]:
192
  task = cls.model.get_by_id(id)
193
+ progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
194
  cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
195
  if "progress" in info:
196
  cls.model.update(progress=info["progress"]).where(
rag/svr/task_executor.py CHANGED
@@ -27,7 +27,7 @@ import logging
27
  import os
28
  from datetime import datetime
29
  import json
30
- import hashlib
31
  import copy
32
  import re
33
  import time
@@ -226,10 +226,7 @@ def build_chunks(task, progress_callback):
226
  for ck in cks:
227
  d = copy.deepcopy(doc)
228
  d.update(ck)
229
- md5 = hashlib.md5()
230
- md5.update((ck["content_with_weight"] +
231
- str(d["doc_id"])).encode("utf-8"))
232
- d["id"] = md5.hexdigest()
233
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
234
  d["create_timestamp_flt"] = datetime.now().timestamp()
235
  if not d.get("image"):
@@ -368,9 +365,7 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
368
  tk_count = 0
369
  for content, vctr in chunks[original_length:]:
370
  d = copy.deepcopy(doc)
371
- md5 = hashlib.md5()
372
- md5.update((content + str(d["doc_id"])).encode("utf-8"))
373
- d["id"] = md5.hexdigest()
374
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
375
  d["create_timestamp_flt"] = datetime.now().timestamp()
376
  d[vctr_nm] = vctr.tolist()
 
27
  import os
28
  from datetime import datetime
29
  import json
30
+ import xxhash
31
  import copy
32
  import re
33
  import time
 
226
  for ck in cks:
227
  d = copy.deepcopy(doc)
228
  d.update(ck)
229
+ d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
 
 
 
230
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
231
  d["create_timestamp_flt"] = datetime.now().timestamp()
232
  if not d.get("image"):
 
365
  tk_count = 0
366
  for content, vctr in chunks[original_length:]:
367
  d = copy.deepcopy(doc)
368
+ d["id"] = xxhash.xxh64((content + str(d["doc_id"])).encode("utf-8")).hexdigest()
 
 
369
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
370
  d["create_timestamp_flt"] = datetime.now().timestamp()
371
  d[vctr_nm] = vctr.tolist()