KevinHuSh committed
Commit 6224edc · 1 Parent(s): af3ef26

Add task module, and pipeline the task and every parser (#49)

api/apps/document_app.py CHANGED
@@ -22,6 +22,8 @@ from elasticsearch_dsl import Q
 from flask import request
 from flask_login import login_required, current_user

+from api.db.db_models import Task
+from api.db.services.task_service import TaskService
 from rag.nlp import search
 from rag.utils import ELASTICSEARCH
 from api.db.services import duplicate_name
@@ -205,6 +207,26 @@ def rm():
         return server_error_response(e)


+@manager.route('/run', methods=['POST'])
+@login_required
+@validate_request("doc_ids", "run")
+def run():
+    req = request.json
+    try:
+        for id in req["doc_ids"]:
+            DocumentService.update_by_id(id, {"run": str(req["run"])})
+            if req["run"] == "2":
+                TaskService.filter_delete([Task.doc_id == id])
+                tenant_id = DocumentService.get_tenant_id(id)
+                if not tenant_id:
+                    return get_data_error_result(retmsg="Tenant not found!")
+                ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+
+        return get_json_result(data=True)
+    except Exception as e:
+        return server_error_response(e)
+
+
 @manager.route('/rename', methods=['POST'])
 @login_required
 @validate_request("doc_id", "name", "old_name")
@@ -262,7 +284,7 @@ def change_parser():
         if doc.parser_id.lower() == req["parser_id"].lower():
             return get_json_result(data=True)

-        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": ""})
+        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": 1})
        if not e:
             return get_data_error_result(retmsg="Document not found!")
         e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num*-1, doc.chunk_num*-1, doc.process_duation*-1)
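A minimal sketch of how the new /run endpoint might be invoked from a client. The host, port, URL prefix, and session cookie below are assumptions, not part of this commit; "run": "1" queues the listed documents for processing, while "run": "2" cancels them, deletes their pending Task rows, and purges their chunks from Elasticsearch.

    import requests

    # Hypothetical base URL and login cookie; adjust to your deployment.
    BASE = "http://127.0.0.1:9380/v1/document"
    session = requests.Session()
    session.cookies.set("session", "<login-session-cookie>")

    # Start processing two documents ("1" = run); pass "2" to cancel them instead.
    resp = session.post(f"{BASE}/run",
                        json={"doc_ids": ["doc_id_1", "doc_id_2"], "run": "1"})
    print(resp.json())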
api/db/__init__.py CHANGED
@@ -59,3 +59,14 @@ class ChatStyle(StrEnum):
     PRECISE = 'Precise'
     EVENLY = 'Evenly'
     CUSTOM = 'Custom'
+
+
+class ParserType(StrEnum):
+    GENERAL = "general"
+    PRESENTATION = "presentation"
+    LAWS = "laws"
+    MANUAL = "manual"
+    PAPER = "paper"
+    RESUME = ""
+    BOOK = ""
+    QA = ""
api/db/db_models.py CHANGED
@@ -496,15 +496,27 @@ class Document(DataBaseModel):
     token_num = IntegerField(default=0)
     chunk_num = IntegerField(default=0)
     progress = FloatField(default=0)
-    progress_msg = CharField(max_length=255, null=True, help_text="process message", default="")
+    progress_msg = CharField(max_length=512, null=True, help_text="process message", default="")
     process_begin_at = DateTimeField(null=True)
     process_duation = FloatField(default=0)
+    run = CharField(max_length=1, null=True, help_text="start to run processing or cancel.(1: run it; 2: cancel)", default="0")
     status = CharField(max_length=1, null=True, help_text="is it validate(0: wasted,1: validate)", default="1")

     class Meta:
         db_table = "document"


+class Task(DataBaseModel):
+    id = CharField(max_length=32, primary_key=True)
+    doc_id = CharField(max_length=32, null=False, index=True)
+    from_page = IntegerField(default=0)
+    to_page = IntegerField(default=-1)
+    begin_at = DateTimeField(null=True)
+    process_duation = FloatField(default=0)
+    progress = FloatField(default=0)
+    progress_msg = CharField(max_length=255, null=True, help_text="process message", default="")
+
+
 class Dialog(DataBaseModel):
     id = CharField(max_length=32, primary_key=True)
     tenant_id = CharField(max_length=32, null=False)
@@ -553,72 +565,6 @@ class Conversation(DataBaseModel):


 """
-class Job(DataBaseModel):
-    # multi-party common configuration
-    f_user_id = CharField(max_length=25, null=True)
-    f_job_id = CharField(max_length=25, index=True)
-    f_name = CharField(max_length=500, null=True, default='')
-    f_description = TextField(null=True, default='')
-    f_tag = CharField(max_length=50, null=True, default='')
-    f_dsl = JSONField()
-    f_runtime_conf = JSONField()
-    f_runtime_conf_on_party = JSONField()
-    f_train_runtime_conf = JSONField(null=True)
-    f_roles = JSONField()
-    f_initiator_role = CharField(max_length=50)
-    f_initiator_party_id = CharField(max_length=50)
-    f_status = CharField(max_length=50)
-    f_status_code = IntegerField(null=True)
-    f_user = JSONField()
-    # this party configuration
-    f_role = CharField(max_length=50, index=True)
-    f_party_id = CharField(max_length=10, index=True)
-    f_is_initiator = BooleanField(null=True, default=False)
-    f_progress = IntegerField(null=True, default=0)
-    f_ready_signal = BooleanField(default=False)
-    f_ready_time = BigIntegerField(null=True)
-    f_cancel_signal = BooleanField(default=False)
-    f_cancel_time = BigIntegerField(null=True)
-    f_rerun_signal = BooleanField(default=False)
-    f_end_scheduling_updates = IntegerField(null=True, default=0)
-
-    f_engine_name = CharField(max_length=50, null=True)
-    f_engine_type = CharField(max_length=10, null=True)
-    f_cores = IntegerField(default=0)
-    f_memory = IntegerField(default=0)  # MB
-    f_remaining_cores = IntegerField(default=0)
-    f_remaining_memory = IntegerField(default=0)  # MB
-    f_resource_in_use = BooleanField(default=False)
-    f_apply_resource_time = BigIntegerField(null=True)
-    f_return_resource_time = BigIntegerField(null=True)
-
-    f_inheritance_info = JSONField(null=True)
-    f_inheritance_status = CharField(max_length=50, null=True)
-
-    f_start_time = BigIntegerField(null=True)
-    f_start_date = DateTimeField(null=True)
-    f_end_time = BigIntegerField(null=True)
-    f_end_date = DateTimeField(null=True)
-    f_elapsed = BigIntegerField(null=True)
-
-    class Meta:
-        db_table = "t_job"
-        primary_key = CompositeKey('f_job_id', 'f_role', 'f_party_id')
-
-
-
-class PipelineComponentMeta(DataBaseModel):
-    f_model_id = CharField(max_length=100, index=True)
-    f_model_version = CharField(max_length=100, index=True)
-    f_role = CharField(max_length=50, index=True)
-    f_party_id = CharField(max_length=10, index=True)
-    f_component_name = CharField(max_length=100, index=True)
-    f_component_module_name = CharField(max_length=100)
-    f_model_alias = CharField(max_length=100, index=True)
-    f_model_proto_index = JSONField(null=True)
-    f_run_parameters = JSONField(null=True)
-    f_archive_sha256 = CharField(max_length=100, null=True)
-    f_archive_from_ip = CharField(max_length=100, null=True)

     class Meta:
         db_table = 't_pipeline_component_meta'
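A rough illustration of how the new Task table relates to Document, with peewee queries of the kind the broker and executor below rely on. The document id is a placeholder, and the query shape is an assumption for demonstration only.

    from api.db.db_models import DB, Task, Document

    with DB.connection_context():
        # All still-pending page-range tasks for one (hypothetical) document.
        rows = (Task.select(Task.from_page, Task.to_page, Task.progress)
                    .join(Document, on=(Task.doc_id == Document.id))
                    .where(Document.id == "<some-doc-id>", Task.progress == 0))
        for t in rows:
            print(t.from_page, t.to_page, t.progress)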
api/db/db_utils.py CHANGED
@@ -32,19 +32,19 @@ LOGGER = getLogger()
 def bulk_insert_into_db(model, data_source, replace_on_conflict=False):
     DB.create_tables([model])

-    current_time = current_timestamp()
-    current_date = timestamp_to_date(current_time)

     for data in data_source:
-        if 'f_create_time' not in data:
-            data['f_create_time'] = current_time
-        data['f_create_date'] = timestamp_to_date(data['f_create_time'])
-        data['f_update_time'] = current_time
-        data['f_update_date'] = current_date
+        current_time = current_timestamp()
+        current_date = timestamp_to_date(current_time)
+        if 'create_time' not in data:
+            data['create_time'] = current_time
+        data['create_date'] = timestamp_to_date(data['create_time'])
+        data['update_time'] = current_time
+        data['update_date'] = current_date

-    preserve = tuple(data_source[0].keys() - {'f_create_time', 'f_create_date'})
+    preserve = tuple(data_source[0].keys() - {'create_time', 'create_date'})

-    batch_size = 50 if RuntimeConfig.USE_LOCAL_DATABASE else 1000
+    batch_size = 1000

     for i in range(0, len(data_source), batch_size):
         with DB.atomic():
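For reference, a small sketch of how the reworked helper might be called after this change (the field values are placeholders): the f_ prefix is gone and create_time/update_time stamps are now filled in per row inside the loop.

    from api.db.db_models import Task
    from api.db.db_utils import bulk_insert_into_db
    from api.utils import get_uuid

    # Two placeholder task rows; timestamp columns are added automatically.
    rows = [{"id": get_uuid(), "doc_id": "<doc-id>", "from_page": 0, "to_page": 10},
            {"id": get_uuid(), "doc_id": "<doc-id>", "from_page": 10, "to_page": 20}]
    bulk_insert_into_db(Task, rows, replace_on_conflict=True)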
api/db/services/common_service.py CHANGED
@@ -70,6 +70,7 @@ class CommonService:
     @DB.connection_context()
     def insert_many(cls, data_list, batch_size=100):
         with DB.atomic():
+            for d in data_list: d["create_time"] = datetime_format(datetime.now())
             for i in range(0, len(data_list), batch_size):
                 cls.model.insert_many(data_list[i:i + batch_size]).execute()
api/db/services/document_service.py CHANGED
@@ -61,8 +61,8 @@ class DocumentService(CommonService):

     @classmethod
     @DB.connection_context()
-    def get_newly_uploaded(cls, tm, mod, comm, items_per_page=64):
-        fields = [cls.model.id, cls.model.kb_id, cls.model.parser_id, cls.model.name, cls.model.location, cls.model.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, cls.model.update_time]
+    def get_newly_uploaded(cls, tm, mod=0, comm=1, items_per_page=64):
+        fields = [cls.model.id, cls.model.kb_id, cls.model.parser_id, cls.model.name, cls.model.type, cls.model.location, cls.model.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
         docs = cls.model.select(*fields) \
             .join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id)) \
             .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
@@ -76,6 +76,18 @@ class DocumentService(CommonService):
             .paginate(1, items_per_page)
         return list(docs.dicts())

+    @classmethod
+    @DB.connection_context()
+    def get_unfinished_docs(cls):
+        fields = [cls.model.id, cls.model.process_begin_at]
+        docs = cls.model.select(*fields) \
+            .where(
+                cls.model.status == StatusEnum.VALID.value,
+                ~(cls.model.type == FileType.VIRTUAL.value),
+                cls.model.progress < 1,
+                cls.model.progress > 0)
+        return list(docs.dicts())
+
     @classmethod
     @DB.connection_context()
     def increment_chunk_num(cls, doc_id, kb_id, token_num, chunk_num, duation):
api/db/services/task_service.py ADDED
@@ -0,0 +1,53 @@
+#
+#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from peewee import Expression
+from api.db.db_models import DB
+from api.db import StatusEnum, FileType
+from api.db.db_models import Task, Document, Knowledgebase, Tenant
+from api.db.services.common_service import CommonService
+
+
+class TaskService(CommonService):
+    model = Task
+
+    @classmethod
+    @DB.connection_context()
+    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
+        fields = [cls.model.id, cls.model.doc_id, cls.model.from_page, cls.model.to_page, Document.kb_id, Document.parser_id, Document.name, Document.type, Document.location, Document.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
+        docs = cls.model.select(*fields) \
+            .join(Document, on=(cls.model.doc_id == Document.id)) \
+            .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
+            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
+            .where(
+                Document.status == StatusEnum.VALID.value,
+                ~(Document.type == FileType.VIRTUAL.value),
+                cls.model.progress == 0,
+                cls.model.update_time >= tm,
+                (Expression(cls.model.create_time, "%%", comm) == mod))\
+            .order_by(cls.model.update_time.asc())\
+            .paginate(1, items_per_page)
+        return list(docs.dicts())
+
+
+    @classmethod
+    @DB.connection_context()
+    def do_cancel(cls, id):
+        try:
+            cls.model.get_by_id(id)
+            return False
+        except Exception as e:
+            pass
+        return True
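A sketch of how an executor process is expected to use this service (the watermark and sharding values below are illustrative): get_tasks pulls unassigned tasks whose create_time modulo comm equals mod, and do_cancel returns True once the task row has been deleted, which is how a "run": "2" cancellation from the /run endpoint reaches a running worker.

    from api.db.services.task_service import TaskService

    # Worker 0 of 4 polls for tasks newer than a stored time watermark (0 = from the start).
    tasks = TaskService.get_tasks(tm=0, mod=0, comm=4)
    for t in tasks:
        if TaskService.do_cancel(t["id"]):   # row is gone, so the document was cancelled
            continue
        print(t["doc_id"], t["from_page"], t["to_page"])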
rag/app/__init__.py CHANGED
@@ -67,4 +67,6 @@ def tokenize(d, t, eng):
         d["content_ltks"] = " ".join([stemmer.stem(w) for w in word_tokenize(t)])
     else:
         d["content_ltks"] = huqie.qie(t)
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+
+
rag/app/laws.py CHANGED
@@ -32,14 +32,12 @@ class Pdf(HuParser):
                             zoomin,
                             from_page,
                             to_page)
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
-                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.1, "OCR finished", callback)

         from timeit import default_timer as timer
         start = timer()
         self._layouts_paddle(zoomin)
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
-                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.77, "Layout analysis finished", callback)
         print("paddle layouts:", timer()-start)
         bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
         # is it English
@@ -77,8 +75,7 @@ class Pdf(HuParser):
                 b["x1"] = max(b["x1"], b_["x1"])
                 bxs.pop(i + 1)

-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
-                   "Page {}~{}: Text extraction finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.8, "Text extraction finished", callback)

         return [b["text"] + self._line_tag(b, zoomin) for b in bxs]

@@ -92,14 +89,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     pdf_parser = None
     sections = []
     if re.search(r"\.docx?$", filename, re.IGNORECASE):
+        callback__(0.1, "Start to parse.", callback)
         for txt in Docx()(filename, binary):
             sections.append(txt)
-    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        callback__(0.8, "Finish parsing.", callback)
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
         for txt in pdf_parser(filename if not binary else binary,
                               from_page=from_page, to_page=to_page, callback=callback):
             sections.append(txt)
-    if re.search(r"\.txt$", filename, re.IGNORECASE):
+    elif re.search(r"\.txt$", filename, re.IGNORECASE):
+        callback__(0.1, "Start to parse.", callback)
         txt = ""
         if binary:txt = binary.decode("utf-8")
         else:
@@ -110,6 +110,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
                     txt += l
         sections = txt.split("\n")
         sections = [l for l in sections if l]
+        callback__(0.8, "Finish parsing.", callback)
+    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

     # is it English
     eng = is_english(sections)
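For a rough idea of how one of these parser modules is driven once it is wired into the pipeline (the file path is a placeholder, and this assumes callback__ simply forwards the fraction and message to the supplied callable): the executor passes a partial of set_progress as the callback, but a print works for a local run.

    from rag.app import laws

    def progress(prog=None, msg=""):
        # Stand-in for the executor's bound set_progress callback.
        print(prog, msg)

    chunks = laws.chunk("some_statute.pdf", from_page=0, to_page=10, callback=progress)
    print(len(chunks))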
rag/app/manual.py CHANGED
@@ -1,12 +1,8 @@
 import copy
 import re
-from collections import Counter
-from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
-from rag.nlp import huqie, stemmer
-from rag.parser.docx_parser import HuDocxParser
+from rag.app import callback__, tokenize
+from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
-from nltk.tokenize import word_tokenize
-import numpy as np
 from rag.utils import num_tokens_from_string


@@ -18,24 +14,19 @@ class Pdf(HuParser):
                             zoomin,
                             from_page,
                             to_page)
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
-                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.2, "OCR finished.", callback)

         from timeit import default_timer as timer
         start = timer()
         self._layouts_paddle(zoomin)
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
-                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.5, "Layout analysis finished.", callback)
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
-                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.7, "Table analysis finished.", callback)
         self._text_merge()
-        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
-                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.77, "Text merging finished", callback)
         tbls = self._extract_table_figure(True, zoomin, False)

         # clean mess
@@ -71,6 +62,7 @@ class Pdf(HuParser):
                 b_["top"] = b["top"]
                 self.boxes.pop(i)

+        callback__(0.8, "Parsing finished", callback)
         for b in self.boxes: print(b["text"], b.get("layoutno"))

         print(tbls)
@@ -85,6 +77,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
         pdf_parser = Pdf()
         cks, tbls = pdf_parser(filename if not binary else binary,
                                from_page=from_page, to_page=to_page, callback=callback)
+    else: raise NotImplementedError("file type not supported yet(pdf supported)")
     doc = {
         "docnm_kwd": filename
     }
rag/app/paper.py CHANGED
@@ -18,24 +18,20 @@ class Pdf(HuParser):
                             zoomin,
                             from_page,
                             to_page)
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
-                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.2, "OCR finished.", callback)

         from timeit import default_timer as timer
         start = timer()
         self._layouts_paddle(zoomin)
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
-                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.47, "Layout analysis finished", callback)
         print("paddle layouts:", timer() - start)
         self._table_transformer_job(zoomin)
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
-                   "Page {}~{}: Table analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.68, "Table analysis finished", callback)
         self._text_merge()
         column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 4,
-                   "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.75, "Text merging finished.", callback)
         tbls = self._extract_table_figure(True, zoomin, False)

         # clean mess
@@ -105,6 +101,7 @@ class Pdf(HuParser):
                     break
             if not abstr: i = 0

+        callback__(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
         for b in self.boxes: print(b["text"], b.get("layoutno"))
         print(tbls)

@@ -126,6 +123,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
         pdf_parser = Pdf()
         paper = pdf_parser(filename if not binary else binary,
                            from_page=from_page, to_page=to_page, callback=callback)
+    else: raise NotImplementedError("file type not supported yet(pdf supported)")
     doc = {
         "docnm_kwd": paper["title"] if paper["title"] else filename,
         "authors_tks": paper["authors"]
rag/app/presentation.py CHANGED
@@ -42,10 +42,8 @@ class Ppt(object):
                     txt = self.__extract(shape)
                     if txt: texts.append(txt)
                 txts.append("\n".join(texts))
-                callback__((i+1)/self.total_page/2, "", callback)

-        callback__((min(to_page, self.total_page) - from_page) / self.total_page,
-                   "Page {}~{}: Text extraction finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.5, "Text extraction finished.", callback)
         import aspose.slides as slides
         import aspose.pydrawing as drawing
         imgs = []
@@ -55,8 +53,7 @@ class Ppt(object):
             slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
             imgs.append(buffered.getvalue())
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
-        callback__((min(to_page, self.total_page) - from_page) / self.total_page,
-                   "Page {}~{}: Image extraction finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.9, "Image extraction finished", callback)
         self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]

@@ -73,7 +70,7 @@ class Pdf(HuParser):

     def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
         self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
-        callback__((min(to_page, self.total_page)-from_page) / self.total_page, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+        callback__(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
         res = []
         #################### More precisely ###################
@@ -92,6 +89,7 @@ class Pdf(HuParser):
         for i in range(len(self.boxes)):
             lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
             res.append((lines, self.page_images[i]))
+        callback__(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)), callback)
         return res


@@ -104,13 +102,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
         ppt_parser = Ppt()
-        for txt,img in ppt_parser(filename if not binary else binary, from_page, to_page, callback):
+        for txt,img in ppt_parser(filename if not binary else binary, from_page, 1000000, callback):
             d = copy.deepcopy(doc)
             d["image"] = img
             tokenize(d, txt, ppt_parser.is_english)
             res.append(d)
         return res
-    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
         for txt,img in pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
             d = copy.deepcopy(doc)
@@ -118,7 +116,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
             tokenize(d, txt, pdf_parser.is_english)
             res.append(d)
         return res
-    callback__(-1, "This kind of presentation document did not support yet!", callback)
+
+    raise NotImplementedError("file type not supported yet(pptx, pdf supported)")


 if __name__== "__main__":
rag/parser/pdf_parser.py CHANGED
@@ -1559,6 +1559,15 @@ class HuParser:

         return "\n\n".join(res)

+    @staticmethod
+    def total_page_number(fnm, binary=None):
+        try:
+            pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
+            return len(pdf.pages)
+        except Exception as e:
+            pdf = fitz.open(fnm) if not binary else fitz.open(stream=binary, filetype="pdf")
+            return len(pdf)
+
     def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
         self.lefted_chars = []
         self.mean_height = []
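A small usage sketch for the new helper (the file name is a placeholder). It lets the broker below size page-range tasks without running the full OCR pipeline, falling back to PyMuPDF when pdfplumber cannot open the document.

    from rag.parser.pdf_parser import HuParser

    # From a path on disk...
    print(HuParser.total_page_number("manual.pdf"))

    # ...or from bytes already fetched from object storage.
    with open("manual.pdf", "rb") as f:
        print(HuParser.total_page_number("manual.pdf", f.read()))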
rag/svr/task_broker.py ADDED
@@ -0,0 +1,130 @@
+#
+#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import logging
+import os
+import time
+import random
+from timeit import default_timer as timer
+from api.db.db_models import Task
+from api.db.db_utils import bulk_insert_into_db
+from api.db.services.task_service import TaskService
+from rag.parser.pdf_parser import HuParser
+from rag.settings import cron_logger
+from rag.utils import MINIO
+from rag.utils import findMaxTm
+import pandas as pd
+from api.db import FileType
+from api.db.services.document_service import DocumentService
+from api.settings import database_logger
+from api.utils import get_format_time, get_uuid
+from api.utils.file_utils import get_project_base_directory
+
+
+def collect(tm):
+    docs = DocumentService.get_newly_uploaded(tm)
+    if len(docs) == 0:
+        return pd.DataFrame()
+    docs = pd.DataFrame(docs)
+    mtm = docs["update_time"].max()
+    cron_logger.info("TOTAL:{}, To:{}".format(len(docs), mtm))
+    return docs
+
+
+def set_dispatching(docid):
+    try:
+        DocumentService.update_by_id(
+            docid, {"progress": random.randint(0, 3) / 100.,
+                    "progress_msg": "Task dispatched...",
+                    "process_begin_at": get_format_time()
+                    })
+    except Exception as e:
+        cron_logger.error("set_dispatching:({}), {}".format(docid, str(e)))
+
+
+def dispatch():
+    tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"broker.tm")
+    tm = findMaxTm(tm_fnm)
+    rows = collect(tm)
+    if len(rows) == 0:
+        return
+
+    tmf = open(tm_fnm, "a+")
+    for _, r in rows.iterrows():
+        try:
+            tsks = TaskService.query(doc_id=r["id"])
+            if tsks:
+                for t in tsks:
+                    TaskService.delete_by_id(t.id)
+        except Exception as e:
+            cron_logger.error("delete task exception:" + str(e))
+
+        def new_task():
+            nonlocal r
+            return {
+                "id": get_uuid(),
+                "doc_id": r["id"]
+            }
+
+        tsks = []
+        if r["type"] == FileType.PDF.value:
+            pages = HuParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
+            for p in range(0, pages, 10):
+                task = new_task()
+                task["from_page"] = p
+                task["to_page"] = min(p + 10, pages)
+                tsks.append(task)
+        else:
+            tsks.append(new_task())
+        print(tsks)
+        bulk_insert_into_db(Task, tsks, True)
+        set_dispatching(r["id"])
+        tmf.write(str(r["update_time"]) + "\n")
+    tmf.close()
+
+
+def update_progress():
+    docs = DocumentService.get_unfinished_docs()
+    for d in docs:
+        try:
+            tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
+            if not tsks: continue
+            msg = []
+            prg = 0
+            finished = True
+            bad = 0
+            for t in tsks:
+                if 0 <= t.progress < 1: finished = False
+                prg += t.progress if t.progress >= 0 else 0
+                msg.append(t.progress_msg)
+                if t.progress == -1: bad += 1
+            prg /= len(tsks)
+            if finished and bad: prg = -1
+            msg = "\n".join(msg)
+            DocumentService.update_by_id(d["id"], {"progress": prg, "progress_msg": msg, "process_duation": timer() - d["process_begin_at"].timestamp()})
+        except Exception as e:
+            cron_logger.error("fetch task exception:" + str(e))
+
+
+if __name__ == "__main__":
+    peewee_logger = logging.getLogger('peewee')
+    peewee_logger.propagate = False
+    peewee_logger.addHandler(database_logger.handlers[0])
+    peewee_logger.setLevel(database_logger.level)
+
+    while True:
+        dispatch()
+        time.sleep(3)
+        update_progress()
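To illustrate the dispatch step: a PDF is split into one task per 10-page window, and every other file type becomes a single whole-document task. For a hypothetical 23-page PDF the resulting page ranges would be:

    # Same windowing as dispatch(): range(0, pages, 10) with a clamped upper bound.
    pages = 23
    ranges = [(p, min(p + 10, pages)) for p in range(0, pages, 10)]
    print(ranges)   # [(0, 10), (10, 20), (20, 23)]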
rag/svr/{parse_user_docs.py → task_executor.py} RENAMED
@@ -19,49 +19,59 @@ import logging
 import os
 import hashlib
 import copy
-import time
-import random
 import re
+import sys
+from functools import partial
 from timeit import default_timer as timer

+from api.db.services.task_service import TaskService
 from rag.llm import EmbeddingModel, CvModel
 from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
 from rag.utils import ELASTICSEARCH
 from rag.utils import MINIO
 from rag.utils import rmSpace, findMaxTm

-from rag.nlp import huchunk, huqie, search
+from rag.nlp import search
 from io import BytesIO
 import pandas as pd
-from elasticsearch_dsl import Q
-from PIL import Image
-from rag.parser import (
-    PdfParser,
-    DocxParser,
-    ExcelParser
-)
-from rag.nlp.huchunk import (
-    PdfChunker,
-    DocxChunker,
-    ExcelChunker,
-    PptChunker,
-    TextChunker
-)
-from api.db import LLMType
+
+from rag.app import laws, paper, presentation, manual
+
+from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
-from api.db.services.llm_service import TenantLLMService, LLMBundle
+from api.db.services.llm_service import LLMBundle
 from api.settings import database_logger
-from api.utils import get_format_time
 from api.utils.file_utils import get_project_base_directory

 BATCH_SIZE = 64

-PDF = PdfChunker(PdfParser())
-DOC = DocxChunker(DocxParser())
-EXC = ExcelChunker(ExcelParser())
-PPT = PptChunker()
+FACTORY = {
+    ParserType.GENERAL.value: laws,
+    ParserType.PAPER.value: paper,
+    ParserType.PRESENTATION.value: presentation,
+    ParserType.MANUAL.value: manual,
+    ParserType.LAWS.value: laws,
+}
+

+def set_progress(task_id, from_page, to_page, prog=None, msg="Processing..."):
+    cancel = TaskService.do_cancel(task_id)
+    if cancel:
+        msg = "Canceled."
+        prog = -1
+
+    if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
+    d = {"progress_msg": msg}
+    if prog is not None: d["progress"] = prog
+    try:
+        TaskService.update_by_id(task_id, d)
+    except Exception as e:
+        cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))

+    if cancel: sys.exit()
+
+
+"""
 def chuck_doc(name, binary, tenant_id, cvmdl=None):
     suff = os.path.split(name)[-1].lower().split(".")[-1]
     if suff.find("pdf") >= 0:
@@ -81,27 +91,17 @@ def chuck_doc(name, binary, tenant_id, cvmdl=None):
         return field

     return TextChunker()(binary)
+"""


 def collect(comm, mod, tm):
-    docs = DocumentService.get_newly_uploaded(tm, mod, comm)
-    if len(docs) == 0:
+    tasks = TaskService.get_tasks(tm, mod, comm)
+    if len(tasks) == 0:
         return pd.DataFrame()
-    docs = pd.DataFrame(docs)
-    mtm = docs["update_time"].max()
-    cron_logger.info("TOTAL:{}, To:{}".format(len(docs), mtm))
-    return docs
-
-
-def set_progress(docid, prog, msg="Processing...", begin=False):
-    d = {"progress": prog, "progress_msg": msg}
-    if begin:
-        d["process_begin_at"] = get_format_time()
-    try:
-        DocumentService.update_by_id(
-            docid, {"progress": prog, "progress_msg": msg})
-    except Exception as e:
-        cron_logger.error("set_progress:({}), {}".format(docid, str(e)))
+    tasks = pd.DataFrame(tasks)
+    mtm = tasks["update_time"].max()
+    cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
+    return tasks


 def build(row, cvmdl):
@@ -110,97 +110,50 @@ def build(row, cvmdl):
                               (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
         return []

-    # res = ELASTICSEARCH.search(Q("term", doc_id=row["id"]))
-    # if ELASTICSEARCH.getTotal(res) > 0:
-    #     ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=row["id"]),
-    #         scripts="""
-    #     if(!ctx._source.kb_id.contains('%s'))
-    #         ctx._source.kb_id.add('%s');
-    #         """ % (str(row["kb_id"]), str(row["kb_id"])),
-    #         idxnm=search.index_name(row["tenant_id"])
-    #     )
-    #     set_progress(row["id"], 1, "Done")
-    #     return []
-
-    random.seed(time.time())
-    set_progress(row["id"], random.randint(0, 20) /
-                 100., "Finished preparing! Start to slice file!", True)
+    callback = partial(set_progress, row["id"], row["from_page"], row["to_page"])
+    chunker = FACTORY[row["parser_id"]]
     try:
         cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
-        obj = chuck_doc(row["name"], MINIO.get(row["kb_id"], row["location"]), row["tenant_id"], cvmdl)
+        cks = chunker.chunk(row["name"], MINIO.get(row["kb_id"], row["location"]), row["from_page"], row["to_page"],
+                            callback)
     except Exception as e:
         if re.search("(No such file|not found)", str(e)):
-            set_progress(
-                row["id"], -1, "Can not find file <%s>" %
-                row["doc_name"])
+            callback(-1, "Can not find file <%s>" % row["doc_name"])
         else:
-            set_progress(
-                row["id"], -1, f"Internal server error: %s" %
-                str(e).replace(
-                    "'", ""))
+            callback(-1, f"Internal server error: %s" % str(e).replace("'", ""))

         cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))

         return []

-    if not obj.text_chunks and not obj.table_chunks:
-        set_progress(
-            row["id"],
-            1,
-            "Nothing added! Mostly, file type unsupported yet.")
-        return []
-
-    set_progress(row["id"], random.randint(20, 60) / 100.,
-                 "Finished slicing files. Start to embedding the content.")
+    callback(msg="Finished slicing files. Start to embedding the content.")

+    docs = []
     doc = {
-        "doc_id": row["id"],
-        "kb_id": [str(row["kb_id"])],
-        "docnm_kwd": os.path.split(row["location"])[-1],
-        "title_tks": huqie.qie(row["name"])
+        "doc_id": row["doc_id"],
+        "kb_id": [str(row["kb_id"])]
     }
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
-    output_buffer = BytesIO()
-    docs = []
-    for txt, img in obj.text_chunks:
+    for ck in cks:
         d = copy.deepcopy(doc)
+        d.update(ck)
         md5 = hashlib.md5()
-        md5.update((txt + str(d["doc_id"])).encode("utf-8"))
+        md5.update((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8"))
         d["_id"] = md5.hexdigest()
-        d["content_ltks"] = huqie.qie(txt)
-        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
-        if not img:
+        d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
+        if not d.get("image"):
             docs.append(d)
             continue

-        if isinstance(img, bytes):
-            output_buffer = BytesIO(img)
+        output_buffer = BytesIO()
+        if isinstance(d["image"], bytes):
+            output_buffer = BytesIO(d["image"])
         else:
-            img.save(output_buffer, format='JPEG')
+            d["image"].save(output_buffer, format='JPEG')

         MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
         d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
-        d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
         docs.append(d)

-    for arr, img in obj.table_chunks:
-        for i, txt in enumerate(arr):
-            d = copy.deepcopy(doc)
-            d["content_ltks"] = huqie.qie(txt)
-            md5 = hashlib.md5()
-            md5.update((txt + str(d["doc_id"])).encode("utf-8"))
-            d["_id"] = md5.hexdigest()
-            if not img:
-                docs.append(d)
-                continue
-            img.save(output_buffer, format='JPEG')
-            MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
-            d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
-            d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
-            docs.append(d)
-    set_progress(row["id"], random.randint(60, 70) /
-                 100., "Continue embedding the content.")
-
     return docs


@@ -213,7 +166,7 @@ def init_kb(row):


 def embedding(docs, mdl):
-    tts, cnts = [rmSpace(d["title_tks"]) for d in docs], [rmSpace(d["content_ltks"]) for d in docs]
+    tts, cnts = [d["docnm_kwd"] for d in docs], [d["content_with_weight"] for d in docs]
     tk_count = 0
     tts, c = mdl.encode(tts)
     tk_count += c
@@ -223,7 +176,7 @@ def embedding(docs, mdl):
     assert len(vects) == len(docs)
     for i, d in enumerate(docs):
         v = vects[i].tolist()
-        d["q_%d_vec"%len(v)] = v
+        d["q_%d_vec" % len(v)] = v
     return tk_count


@@ -239,11 +192,12 @@ def main(comm, mod):
         try:
             embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING)
             cv_mdl = LLMBundle(r["tenant_id"], LLMType.IMAGE2TEXT)
-            #TODO: sequence2text model
+            # TODO: sequence2text model
         except Exception as e:
             set_progress(r["id"], -1, str(e))
             continue

+        callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
         st_tm = timer()
         cks = build(r, cv_mdl)
         if not cks:
@@ -254,21 +208,20 @@ def main(comm, mod):
         try:
             tk_count = embedding(cks, embd_mdl)
         except Exception as e:
-            set_progress(r["id"], -1, "Embedding error:{}".format(str(e)))
+            callback(-1, "Embedding error:{}".format(str(e)))
             cron_logger.error(str(e))
             continue

-        set_progress(r["id"], random.randint(70, 95) / 100.,
-                     "Finished embedding! Start to build index!")
+        callback(msg="Finished embedding! Start to build index!")
         init_kb(r)
         chunk_count = len(set([c["_id"] for c in cks]))
+        callback(1., "Done!")
         es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
         if es_r:
-            set_progress(r["id"], -1, "Index failure!")
+            callback(-1, "Index failure!")
             cron_logger.error(str(es_r))
         else:
-            set_progress(r["id"], 1., "Done!")
-            DocumentService.increment_chunk_num(r["id"], r["kb_id"], tk_count, chunk_count, timer()-st_tm)
+            DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
         cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))

         tmf.write(str(r["update_time"]) + "\n")
@@ -282,5 +235,6 @@ if __name__ == "__main__":
     peewee_logger.setLevel(database_logger.level)

     from mpi4py import MPI
+
     comm = MPI.COMM_WORLD
     main(comm.Get_size(), comm.Get_rank())
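The executor's progress reporting hinges on functools.partial: build() and main() bind the task id and page range once, and every parser then only supplies a fraction and a message. A condensed sketch of that wiring, with placeholder task values and a print standing in for the database update:

    from functools import partial

    def set_progress_demo(task_id, from_page, to_page, prog=None, msg="Processing..."):
        # Stand-in for set_progress(): prefixes the page range and reports the update.
        if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
        print(task_id, prog, msg)

    callback = partial(set_progress_demo, "<task-id>", 0, 10)
    callback(0.5, "Layout analysis finished.")                   # positional, as the parsers call it
    callback(msg="Finished embedding! Start to build index!")    # message-only update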