zhichyu committed on
Commit
4ba2b4f
·
1 Parent(s): 47ea26c

Replaced md5 with xxhash64 for chunk id (#4009)

Browse files

### What problem does this PR solve?

Replaced md5 with xxhash64 for chunk id

### Type of change

- [x] Refactoring

api/apps/chunk_app.py CHANGED
@@ -31,7 +31,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, va
31
  from api.db.services.document_service import DocumentService
32
  from api import settings
33
  from api.utils.api_utils import get_json_result
34
- import hashlib
35
  import re
36
 
37
 
@@ -208,9 +208,7 @@ def rm():
208
  @validate_request("doc_id", "content_with_weight")
209
  def create():
210
  req = request.json
211
- md5 = hashlib.md5()
212
- md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
213
- chunck_id = md5.hexdigest()
214
  d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
215
  "content_with_weight": req["content_with_weight"]}
216
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 
31
  from api.db.services.document_service import DocumentService
32
  from api import settings
33
  from api.utils.api_utils import get_json_result
34
+ import xxhash
35
  import re
36
 
37
 
 
208
  @validate_request("doc_id", "content_with_weight")
209
  def create():
210
  req = request.json
211
+ chunck_id = xxhash.xxh64((req["content_with_weight"] + req["doc_id"]).encode("utf-8")).hexdigest()
 
 
212
  d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
213
  "content_with_weight": req["content_with_weight"]}
214
  d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
api/apps/sdk/doc.py CHANGED
@@ -22,7 +22,7 @@ from rag.nlp import rag_tokenizer
22
  from api.db import LLMType, ParserType
23
  from api.db.services.llm_service import TenantLLMService
24
  from api import settings
25
- import hashlib
26
  import re
27
  from api.utils.api_utils import token_required
28
  from api.db.db_models import Task
@@ -984,10 +984,7 @@ def add_chunk(tenant_id, dataset_id, document_id):
984
  return get_error_data_result(
985
  "`questions` is required to be a list"
986
  )
987
- md5 = hashlib.md5()
988
- md5.update((req["content"] + document_id).encode("utf-8"))
989
-
990
- chunk_id = md5.hexdigest()
991
  d = {
992
  "id": chunk_id,
993
  "content_ltks": rag_tokenizer.tokenize(req["content"]),
 
22
  from api.db import LLMType, ParserType
23
  from api.db.services.llm_service import TenantLLMService
24
  from api import settings
25
+ import xxhash
26
  import re
27
  from api.utils.api_utils import token_required
28
  from api.db.db_models import Task
 
984
  return get_error_data_result(
985
  "`questions` is required to be a list"
986
  )
987
+ chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
 
 
 
988
  d = {
989
  "id": chunk_id,
990
  "content_ltks": rag_tokenizer.tokenize(req["content"]),
api/db/services/document_service.py CHANGED
@@ -14,7 +14,7 @@
14
  # limitations under the License.
15
  #
16
  import logging
17
- import hashlib
18
  import json
19
  import random
20
  import re
@@ -508,10 +508,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
508
  for ck in th.result():
509
  d = deepcopy(doc)
510
  d.update(ck)
511
- md5 = hashlib.md5()
512
- md5.update((ck["content_with_weight"] +
513
- str(d["doc_id"])).encode("utf-8"))
514
- d["id"] = md5.hexdigest()
515
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
516
  d["create_timestamp_flt"] = datetime.now().timestamp()
517
  if not d.get("image"):
 
14
  # limitations under the License.
15
  #
16
  import logging
17
+ import xxhash
18
  import json
19
  import random
20
  import re
 
508
  for ck in th.result():
509
  d = deepcopy(doc)
510
  d.update(ck)
511
+ d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
 
 
 
512
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
513
  d["create_timestamp_flt"] = datetime.now().timestamp()
514
  if not d.get("image"):
api/db/services/task_service.py CHANGED
@@ -35,17 +35,13 @@ from api import settings
35
  from rag.nlp import search
36
 
37
  def trim_header_by_lines(text: str, max_length) -> str:
38
- if len(text) <= max_length:
 
39
  return text
40
- lines = text.split("\n")
41
- total = 0
42
- idx = len(lines) - 1
43
- for i in range(len(lines)-1, -1, -1):
44
- if total + len(lines[i]) > max_length:
45
- break
46
- idx = i
47
- text2 = "\n".join(lines[idx:])
48
- return text2
49
 
50
  class TaskService(CommonService):
51
  model = Task
@@ -183,7 +179,7 @@ class TaskService(CommonService):
183
  if os.environ.get("MACOS"):
184
  if info["progress_msg"]:
185
  task = cls.model.get_by_id(id)
186
- progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
187
  cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
188
  if "progress" in info:
189
  cls.model.update(progress=info["progress"]).where(
@@ -194,7 +190,7 @@ class TaskService(CommonService):
194
  with DB.lock("update_progress", -1):
195
  if info["progress_msg"]:
196
  task = cls.model.get_by_id(id)
197
- progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 10000)
198
  cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
199
  if "progress" in info:
200
  cls.model.update(progress=info["progress"]).where(
 
35
  from rag.nlp import search
36
 
37
def trim_header_by_lines(text: str, max_length) -> str:
    """Drop leading lines from *text* so the result fits in *max_length* characters.

    The cut always happens immediately after a newline, so the returned text
    starts at a line boundary. If *text* already fits, or no line boundary
    yields a short-enough suffix (e.g. the final line alone exceeds
    *max_length*), *text* is returned unchanged.

    :param text: the text to trim from the front.
    :param max_length: maximum allowed length of the result (int).
    :return: the trimmed suffix of *text*, or *text* itself if no trim applies.
    """
    len_text = len(text)
    if len_text <= max_length:
        return text
    # We need the first index i with text[i] == '\n' and len_text - i <= max_length,
    # i.e. the first newline at position >= len_text - max_length. str.find does
    # that search at C speed instead of a per-character Python loop.
    cut = text.find('\n', len_text - max_length)
    if cut != -1:
        return text[cut + 1:]
    return text
 
 
 
 
 
45
 
46
  class TaskService(CommonService):
47
  model = Task
 
179
  if os.environ.get("MACOS"):
180
  if info["progress_msg"]:
181
  task = cls.model.get_by_id(id)
182
+ progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
183
  cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
184
  if "progress" in info:
185
  cls.model.update(progress=info["progress"]).where(
 
190
  with DB.lock("update_progress", -1):
191
  if info["progress_msg"]:
192
  task = cls.model.get_by_id(id)
193
+ progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 1000)
194
  cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
195
  if "progress" in info:
196
  cls.model.update(progress=info["progress"]).where(
rag/svr/task_executor.py CHANGED
@@ -27,7 +27,7 @@ import logging
27
  import os
28
  from datetime import datetime
29
  import json
30
- import hashlib
31
  import copy
32
  import re
33
  import time
@@ -226,10 +226,7 @@ def build_chunks(task, progress_callback):
226
  for ck in cks:
227
  d = copy.deepcopy(doc)
228
  d.update(ck)
229
- md5 = hashlib.md5()
230
- md5.update((ck["content_with_weight"] +
231
- str(d["doc_id"])).encode("utf-8"))
232
- d["id"] = md5.hexdigest()
233
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
234
  d["create_timestamp_flt"] = datetime.now().timestamp()
235
  if not d.get("image"):
@@ -368,9 +365,7 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
368
  tk_count = 0
369
  for content, vctr in chunks[original_length:]:
370
  d = copy.deepcopy(doc)
371
- md5 = hashlib.md5()
372
- md5.update((content + str(d["doc_id"])).encode("utf-8"))
373
- d["id"] = md5.hexdigest()
374
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
375
  d["create_timestamp_flt"] = datetime.now().timestamp()
376
  d[vctr_nm] = vctr.tolist()
 
27
  import os
28
  from datetime import datetime
29
  import json
30
+ import xxhash
31
  import copy
32
  import re
33
  import time
 
226
  for ck in cks:
227
  d = copy.deepcopy(doc)
228
  d.update(ck)
229
+ d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
 
 
 
230
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
231
  d["create_timestamp_flt"] = datetime.now().timestamp()
232
  if not d.get("image"):
 
365
  tk_count = 0
366
  for content, vctr in chunks[original_length:]:
367
  d = copy.deepcopy(doc)
368
+ d["id"] = xxhash.xxh64((content + str(d["doc_id"])).encode("utf-8")).hexdigest()
 
 
369
  d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
370
  d["create_timestamp_flt"] = datetime.now().timestamp()
371
  d[vctr_nm] = vctr.tolist()