jinhai-2012 committed on
Commit
6e567cd
·
1 Parent(s): 1427166

Update progress info and start welcome info (#3768)

Browse files

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] Refactoring

---------

Signed-off-by: jinhai <[email protected]>

api/ragflow_server.py CHANGED
@@ -47,6 +47,7 @@ from api.db.db_models import init_database_tables as init_web_db
47
  from api.db.init_data import init_web_data
48
  from api.versions import get_ragflow_version
49
  from api.utils import show_configs
 
50
 
51
 
52
  def update_progress():
@@ -75,6 +76,7 @@ if __name__ == '__main__':
75
  )
76
  show_configs()
77
  settings.init_settings()
 
78
 
79
  # init db
80
  init_web_db()
 
47
  from api.db.init_data import init_web_data
48
  from api.versions import get_ragflow_version
49
  from api.utils import show_configs
50
+ from rag.settings import print_rag_settings
51
 
52
 
53
  def update_progress():
 
76
  )
77
  show_configs()
78
  settings.init_settings()
79
+ print_rag_settings()
80
 
81
  # init db
82
  init_web_db()
rag/app/book.py CHANGED
@@ -26,30 +26,33 @@ from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
26
  class Pdf(PdfParser):
27
  def __call__(self, filename, binary=None, from_page=0,
28
  to_page=100000, zoomin=3, callback=None):
29
- callback(msg="OCR is running...")
 
 
30
  self.__images__(
31
  filename if not binary else binary,
32
  zoomin,
33
  from_page,
34
  to_page,
35
  callback)
36
- callback(msg="OCR finished")
37
 
38
- from timeit import default_timer as timer
39
  start = timer()
40
  self._layouts_rec(zoomin)
41
- callback(0.67, "Layout analysis finished")
42
  logging.debug("layouts: {}".format(timer() - start))
 
 
43
  self._table_transformer_job(zoomin)
44
- callback(0.68, "Table analysis finished")
 
 
45
  self._text_merge()
46
  tbls = self._extract_table_figure(True, zoomin, True, True)
47
  self._naive_vertical_merge()
48
  self._filter_forpages()
49
  self._merge_with_same_bullet()
50
- callback(0.75, "Text merging finished.")
51
-
52
- callback(0.8, "Text extraction finished")
53
 
54
  return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
55
  for b in self.boxes], tbls
 
26
  class Pdf(PdfParser):
27
  def __call__(self, filename, binary=None, from_page=0,
28
  to_page=100000, zoomin=3, callback=None):
29
+ from timeit import default_timer as timer
30
+ start = timer()
31
+ callback(msg="OCR started")
32
  self.__images__(
33
  filename if not binary else binary,
34
  zoomin,
35
  from_page,
36
  to_page,
37
  callback)
38
+ callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
39
 
 
40
  start = timer()
41
  self._layouts_rec(zoomin)
42
+ callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
43
  logging.debug("layouts: {}".format(timer() - start))
44
+
45
+ start = timer()
46
  self._table_transformer_job(zoomin)
47
+ callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
48
+
49
+ start = timer()
50
  self._text_merge()
51
  tbls = self._extract_table_figure(True, zoomin, True, True)
52
  self._naive_vertical_merge()
53
  self._filter_forpages()
54
  self._merge_with_same_bullet()
55
+ callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
 
 
56
 
57
  return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
58
  for b in self.boxes], tbls
rag/app/laws.py CHANGED
@@ -108,7 +108,9 @@ class Pdf(PdfParser):
108
 
109
  def __call__(self, filename, binary=None, from_page=0,
110
  to_page=100000, zoomin=3, callback=None):
111
- callback(msg="OCR is running...")
 
 
112
  self.__images__(
113
  filename if not binary else binary,
114
  zoomin,
@@ -116,17 +118,16 @@ class Pdf(PdfParser):
116
  to_page,
117
  callback
118
  )
119
- callback(msg="OCR finished")
120
 
121
- from timeit import default_timer as timer
122
  start = timer()
123
  self._layouts_rec(zoomin)
124
- callback(0.67, "Layout analysis finished")
125
  logging.debug("layouts:".format(
126
  ))
127
  self._naive_vertical_merge()
128
 
129
- callback(0.8, "Text extraction finished")
130
 
131
  return [(b["text"], self._line_tag(b, zoomin))
132
  for b in self.boxes], None
 
108
 
109
  def __call__(self, filename, binary=None, from_page=0,
110
  to_page=100000, zoomin=3, callback=None):
111
+ from timeit import default_timer as timer
112
+ start = timer()
113
+ callback(msg="OCR started")
114
  self.__images__(
115
  filename if not binary else binary,
116
  zoomin,
 
118
  to_page,
119
  callback
120
  )
121
+ callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
122
 
 
123
  start = timer()
124
  self._layouts_rec(zoomin)
125
+ callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
126
  logging.debug("layouts:".format(
127
  ))
128
  self._naive_vertical_merge()
129
 
130
+ callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
131
 
132
  return [(b["text"], self._line_tag(b, zoomin))
133
  for b in self.boxes], None
rag/app/manual.py CHANGED
@@ -36,7 +36,7 @@ class Pdf(PdfParser):
36
  to_page=100000, zoomin=3, callback=None):
37
  from timeit import default_timer as timer
38
  start = timer()
39
- callback(msg="OCR is running...")
40
  self.__images__(
41
  filename if not binary else binary,
42
  zoomin,
@@ -44,22 +44,27 @@ class Pdf(PdfParser):
44
  to_page,
45
  callback
46
  )
47
- callback(msg="OCR finished.")
48
  # for bb in self.boxes:
49
  # for b in bb:
50
  # print(b)
51
  logging.debug("OCR: {}".format(timer() - start))
52
 
 
53
  self._layouts_rec(zoomin)
54
- callback(0.65, "Layout analysis finished.")
55
  logging.debug("layouts: {}".format(timer() - start))
 
 
56
  self._table_transformer_job(zoomin)
57
- callback(0.67, "Table analysis finished.")
 
 
58
  self._text_merge()
59
  tbls = self._extract_table_figure(True, zoomin, True, True)
60
  self._concat_downward()
61
  self._filter_forpages()
62
- callback(0.68, "Text merging finished")
63
 
64
  # clean mess
65
  for b in self.boxes:
 
36
  to_page=100000, zoomin=3, callback=None):
37
  from timeit import default_timer as timer
38
  start = timer()
39
+ callback(msg="OCR started")
40
  self.__images__(
41
  filename if not binary else binary,
42
  zoomin,
 
44
  to_page,
45
  callback
46
  )
47
+ callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
48
  # for bb in self.boxes:
49
  # for b in bb:
50
  # print(b)
51
  logging.debug("OCR: {}".format(timer() - start))
52
 
53
+ start = timer()
54
  self._layouts_rec(zoomin)
55
+ callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start))
56
  logging.debug("layouts: {}".format(timer() - start))
57
+
58
+ start = timer()
59
  self._table_transformer_job(zoomin)
60
+ callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start))
61
+
62
+ start = timer()
63
  self._text_merge()
64
  tbls = self._extract_table_figure(True, zoomin, True, True)
65
  self._concat_downward()
66
  self._filter_forpages()
67
+ callback(0.68, "Text merged ({:.2f}s)".format(timer() - start))
68
 
69
  # clean mess
70
  for b in self.boxes:
rag/app/naive.py CHANGED
@@ -124,7 +124,8 @@ class Pdf(PdfParser):
124
  def __call__(self, filename, binary=None, from_page=0,
125
  to_page=100000, zoomin=3, callback=None):
126
  start = timer()
127
- callback(msg="OCR is running...")
 
128
  self.__images__(
129
  filename if not binary else binary,
130
  zoomin,
@@ -132,22 +133,26 @@ class Pdf(PdfParser):
132
  to_page,
133
  callback
134
  )
135
- callback(msg="OCR finished")
136
- logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
137
 
138
  start = timer()
139
  self._layouts_rec(zoomin)
140
- callback(0.63, "Layout analysis finished.")
 
 
141
  self._table_transformer_job(zoomin)
142
- callback(0.65, "Table analysis finished.")
 
 
143
  self._text_merge()
144
- callback(0.67, "Text merging finished")
145
  tbls = self._extract_table_figure(True, zoomin, True, True)
146
  # self._naive_vertical_merge()
147
  self._concat_downward()
148
  # self._filter_forpages()
149
 
150
- logging.info("layouts cost: {}s".format(timer() - start))
151
  return [(b["text"], self._line_tag(b, zoomin))
152
  for b in self.boxes], tbls
153
 
@@ -170,7 +175,7 @@ class Markdown(MarkdownParser):
170
  else:
171
  if sections and sections[-1][0].strip().find("#") == 0:
172
  sec_, _ = sections.pop(-1)
173
- sections.append((sec_+"\n"+sec, ""))
174
  else:
175
  sections.append((sec, ""))
176
 
 
124
  def __call__(self, filename, binary=None, from_page=0,
125
  to_page=100000, zoomin=3, callback=None):
126
  start = timer()
127
+ first_start = start
128
+ callback(msg="OCR started")
129
  self.__images__(
130
  filename if not binary else binary,
131
  zoomin,
 
133
  to_page,
134
  callback
135
  )
136
+ callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
137
+ logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
138
 
139
  start = timer()
140
  self._layouts_rec(zoomin)
141
+ callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
142
+
143
+ start = timer()
144
  self._table_transformer_job(zoomin)
145
+ callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
146
+
147
+ start = timer()
148
  self._text_merge()
149
+ callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
150
  tbls = self._extract_table_figure(True, zoomin, True, True)
151
  # self._naive_vertical_merge()
152
  self._concat_downward()
153
  # self._filter_forpages()
154
 
155
+ logging.info("layouts cost: {}s".format(timer() - first_start))
156
  return [(b["text"], self._line_tag(b, zoomin))
157
  for b in self.boxes], tbls
158
 
 
175
  else:
176
  if sections and sections[-1][0].strip().find("#") == 0:
177
  sec_, _ = sections.pop(-1)
178
+ sections.append((sec_ + "\n" + sec, ""))
179
  else:
180
  sections.append((sec, ""))
181
 
rag/app/one.py CHANGED
@@ -24,7 +24,9 @@ from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
24
  class Pdf(PdfParser):
25
  def __call__(self, filename, binary=None, from_page=0,
26
  to_page=100000, zoomin=3, callback=None):
27
- callback(msg="OCR is running...")
 
 
28
  self.__images__(
29
  filename if not binary else binary,
30
  zoomin,
@@ -32,17 +34,20 @@ class Pdf(PdfParser):
32
  to_page,
33
  callback
34
  )
35
- callback(msg="OCR finished")
36
 
37
- from timeit import default_timer as timer
38
  start = timer()
39
  self._layouts_rec(zoomin, drop=False)
40
- callback(0.63, "Layout analysis finished.")
41
  logging.debug("layouts cost: {}s".format(timer() - start))
 
 
42
  self._table_transformer_job(zoomin)
43
- callback(0.65, "Table analysis finished.")
 
 
44
  self._text_merge()
45
- callback(0.67, "Text merging finished")
46
  tbls = self._extract_table_figure(True, zoomin, True, True)
47
  self._concat_downward()
48
 
 
24
  class Pdf(PdfParser):
25
  def __call__(self, filename, binary=None, from_page=0,
26
  to_page=100000, zoomin=3, callback=None):
27
+ from timeit import default_timer as timer
28
+ start = timer()
29
+ callback(msg="OCR started")
30
  self.__images__(
31
  filename if not binary else binary,
32
  zoomin,
 
34
  to_page,
35
  callback
36
  )
37
+ callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
38
 
 
39
  start = timer()
40
  self._layouts_rec(zoomin, drop=False)
41
+ callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
42
  logging.debug("layouts cost: {}s".format(timer() - start))
43
+
44
+ start = timer()
45
  self._table_transformer_job(zoomin)
46
+ callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
47
+
48
+ start = timer()
49
  self._text_merge()
50
+ callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
51
  tbls = self._extract_table_figure(True, zoomin, True, True)
52
  self._concat_downward()
53
 
rag/app/paper.py CHANGED
@@ -27,7 +27,9 @@ class Pdf(PdfParser):
27
 
28
  def __call__(self, filename, binary=None, from_page=0,
29
  to_page=100000, zoomin=3, callback=None):
30
- callback(msg="OCR is running...")
 
 
31
  self.__images__(
32
  filename if not binary else binary,
33
  zoomin,
@@ -35,21 +37,24 @@ class Pdf(PdfParser):
35
  to_page,
36
  callback
37
  )
38
- callback(msg="OCR finished.")
39
 
40
- from timeit import default_timer as timer
41
  start = timer()
42
  self._layouts_rec(zoomin)
43
- callback(0.63, "Layout analysis finished")
44
  logging.debug(f"layouts cost: {timer() - start}s")
 
 
45
  self._table_transformer_job(zoomin)
46
- callback(0.68, "Table analysis finished")
 
 
47
  self._text_merge()
48
  tbls = self._extract_table_figure(True, zoomin, True, True)
49
  column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
50
  self._concat_downward()
51
  self._filter_forpages()
52
- callback(0.75, "Text merging finished.")
53
 
54
  # clean mess
55
  if column_width < self.page_images[0].size[0] / zoomin / 2:
 
27
 
28
  def __call__(self, filename, binary=None, from_page=0,
29
  to_page=100000, zoomin=3, callback=None):
30
+ from timeit import default_timer as timer
31
+ start = timer()
32
+ callback(msg="OCR started")
33
  self.__images__(
34
  filename if not binary else binary,
35
  zoomin,
 
37
  to_page,
38
  callback
39
  )
40
+ callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
41
 
 
42
  start = timer()
43
  self._layouts_rec(zoomin)
44
+ callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
45
  logging.debug(f"layouts cost: {timer() - start}s")
46
+
47
+ start = timer()
48
  self._table_transformer_job(zoomin)
49
+ callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
50
+
51
+ start = timer()
52
  self._text_merge()
53
  tbls = self._extract_table_figure(True, zoomin, True, True)
54
  column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
55
  self._concat_downward()
56
  self._filter_forpages()
57
+ callback(0.75, "Text merged ({:.2f}s)".format(timer() - start))
58
 
59
  # clean mess
60
  if column_width < self.page_images[0].size[0] / zoomin / 2:
rag/app/presentation.py CHANGED
@@ -59,11 +59,12 @@ class Pdf(PdfParser):
59
 
60
  def __call__(self, filename, binary=None, from_page=0,
61
  to_page=100000, zoomin=3, callback=None):
62
- callback(msg="OCR is running...")
 
 
63
  self.__images__(filename if not binary else binary,
64
  zoomin, from_page, to_page, callback)
65
- callback(0.8, "Page {}~{}: OCR finished".format(
66
- from_page, min(to_page, self.total_page)))
67
  assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
68
  len(self.boxes), len(self.page_images))
69
  res = []
 
59
 
60
  def __call__(self, filename, binary=None, from_page=0,
61
  to_page=100000, zoomin=3, callback=None):
62
+ from timeit import default_timer as timer
63
+ start = timer()
64
+ callback(msg="OCR started")
65
  self.__images__(filename if not binary else binary,
66
  zoomin, from_page, to_page, callback)
67
+ callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start))
 
68
  assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
69
  len(self.boxes), len(self.page_images))
70
  res = []
rag/app/qa.py CHANGED
@@ -73,7 +73,7 @@ class Pdf(PdfParser):
73
  def __call__(self, filename, binary=None, from_page=0,
74
  to_page=100000, zoomin=3, callback=None):
75
  start = timer()
76
- callback(msg="OCR is running...")
77
  self.__images__(
78
  filename if not binary else binary,
79
  zoomin,
@@ -81,15 +81,19 @@ class Pdf(PdfParser):
81
  to_page,
82
  callback
83
  )
84
- callback(msg="OCR finished")
85
- logging.debug("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
86
  start = timer()
87
  self._layouts_rec(zoomin, drop=False)
88
- callback(0.63, "Layout analysis finished.")
 
 
89
  self._table_transformer_job(zoomin)
90
- callback(0.65, "Table analysis finished.")
 
 
91
  self._text_merge()
92
- callback(0.67, "Text merging finished")
93
  tbls = self._extract_table_figure(True, zoomin, True, True)
94
  #self._naive_vertical_merge()
95
  # self._concat_downward()
@@ -226,7 +230,7 @@ class Docx(DocxParser):
226
  sum_question = '\n'.join(question_stack)
227
  if sum_question:
228
  qai_list.append((sum_question, last_answer, last_image))
229
-
230
  tbls = []
231
  for tb in self.doc.tables:
232
  html= "<table>"
 
73
  def __call__(self, filename, binary=None, from_page=0,
74
  to_page=100000, zoomin=3, callback=None):
75
  start = timer()
76
+ callback(msg="OCR started")
77
  self.__images__(
78
  filename if not binary else binary,
79
  zoomin,
 
81
  to_page,
82
  callback
83
  )
84
+ callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
85
+ logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
86
  start = timer()
87
  self._layouts_rec(zoomin, drop=False)
88
+ callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
89
+
90
+ start = timer()
91
  self._table_transformer_job(zoomin)
92
+ callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
93
+
94
+ start = timer()
95
  self._text_merge()
96
+ callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
97
  tbls = self._extract_table_figure(True, zoomin, True, True)
98
  #self._naive_vertical_merge()
99
  # self._concat_downward()
 
230
  sum_question = '\n'.join(question_stack)
231
  if sum_question:
232
  qai_list.append((sum_question, last_answer, last_image))
233
+
234
  tbls = []
235
  for tb in self.doc.tables:
236
  html= "<table>"
rag/settings.py CHANGED
@@ -14,6 +14,7 @@
14
  # limitations under the License.
15
  #
16
  import os
 
17
  from api.utils import get_base_config, decrypt_database_config
18
  from api.utils.file_utils import get_project_base_directory
19
 
@@ -37,3 +38,9 @@ SVR_QUEUE_RETENTION = 60*60
37
  SVR_QUEUE_MAX_LEN = 1024
38
  SVR_CONSUMER_NAME = "rag_flow_svr_consumer"
39
  SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group"
 
 
 
 
 
 
 
14
  # limitations under the License.
15
  #
16
  import os
17
+ import logging
18
  from api.utils import get_base_config, decrypt_database_config
19
  from api.utils.file_utils import get_project_base_directory
20
 
 
38
  SVR_QUEUE_MAX_LEN = 1024
39
  SVR_CONSUMER_NAME = "rag_flow_svr_consumer"
40
  SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group"
41
+
42
+ def print_rag_settings():
43
+ logging.info(f"MAX_CONTENT_LENGTH: {DOC_MAXIMUM_SIZE}")
44
+ logging.info(f"SERVER_QUEUE_MAX_LEN: {SVR_QUEUE_MAX_LEN}")
45
+ logging.info(f"SERVER_QUEUE_RETENTION: {SVR_QUEUE_RETENTION}")
46
+ logging.info(f"MAX_FILE_COUNT_PER_USER: {int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))}")
rag/svr/task_executor.py CHANGED
@@ -56,12 +56,13 @@ from api.db.services.llm_service import LLMBundle
56
  from api.db.services.task_service import TaskService
57
  from api.db.services.file2document_service import File2DocumentService
58
  from api import settings
 
59
  from api.db.db_models import close_connection
60
  from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, \
61
  knowledge_graph, email
62
  from rag.nlp import search, rag_tokenizer
63
  from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
64
- from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME
65
  from rag.utils import rmSpace, num_tokens_from_string
66
  from rag.utils.redis_conn import REDIS_CONN, Payload
67
  from rag.utils.storage_factory import STORAGE_IMPL
@@ -395,8 +396,7 @@ def do_handle_task(r):
395
  # TODO: exception handler
396
  ## set_progress(r["did"], -1, "ERROR: ")
397
  callback(
398
- msg="Finished slicing files ({} chunks in {:.2f}s). Start to embedding the content.".format(len(cks),
399
- timer() - st)
400
  )
401
  st = timer()
402
  try:
@@ -407,7 +407,7 @@ def do_handle_task(r):
407
  tk_count = 0
408
  raise
409
  logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
410
- callback(msg="Finished embedding (in {:.2f}s)! Start to build index!".format(timer() - st))
411
  # logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
412
  init_kb(r, vector_size)
413
  chunk_count = len(set([c["id"] for c in cks]))
@@ -420,7 +420,8 @@ def do_handle_task(r):
420
  callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
421
  logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
422
  if es_r:
423
- callback(-1, "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!")
 
424
  settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
425
  logging.error('Insert chunk error: ' + str(es_r))
426
  raise Exception('Insert chunk error: ' + str(es_r))
@@ -429,13 +430,12 @@ def do_handle_task(r):
429
  settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
430
  return
431
 
432
- callback(msg="Indexing elapsed in {:.2f}s.".format(timer() - st))
433
- callback(1., "Done!")
434
  DocumentService.increment_chunk_num(
435
  r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
436
  logging.info(
437
  "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
438
- r["id"], tk_count, len(cks), timer() - st))
439
 
440
 
441
  def handle_task():
@@ -502,7 +502,7 @@ def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapsho
502
  for stat in stats2[:10]:
503
  msg += f"{stat}\n"
504
  stats1_vs_2 = snapshot2.compare_to(snapshot1, 'lineno')
505
- msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id-1} to snapshot {snapshot_id}:\n"
506
  for stat in stats1_vs_2[:10]:
507
  msg += f"{stat}\n"
508
  msg += f"{CONSUMER_NAME} detailed traceback for the top memory consumers:\n"
@@ -512,7 +512,16 @@ def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapsho
512
 
513
 
514
  def main():
 
 
 
 
 
 
 
 
515
  settings.init_settings()
 
516
  background_thread = threading.Thread(target=report_status)
517
  background_thread.daemon = True
518
  background_thread.start()
@@ -527,11 +536,12 @@ def main():
527
  while True:
528
  handle_task()
529
  num_tasks = DONE_TASKS + FAILED_TASKS
530
- if TRACE_MALLOC_DELTA> 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0:
531
  snapshot2 = tracemalloc.take_snapshot()
532
- analyze_heap(snapshot1, snapshot2, int(num_tasks/TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0)
533
  snapshot1 = snapshot2
534
  snapshot2 = None
535
 
 
536
  if __name__ == "__main__":
537
  main()
 
56
  from api.db.services.task_service import TaskService
57
  from api.db.services.file2document_service import File2DocumentService
58
  from api import settings
59
+ from api.versions import get_ragflow_version
60
  from api.db.db_models import close_connection
61
  from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, \
62
  knowledge_graph, email
63
  from rag.nlp import search, rag_tokenizer
64
  from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
65
+ from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME, print_rag_settings
66
  from rag.utils import rmSpace, num_tokens_from_string
67
  from rag.utils.redis_conn import REDIS_CONN, Payload
68
  from rag.utils.storage_factory import STORAGE_IMPL
 
396
  # TODO: exception handler
397
  ## set_progress(r["did"], -1, "ERROR: ")
398
  callback(
399
+ msg="Generate {} chunks ({:.2f}s). Embedding chunks.".format(len(cks), timer() - st)
 
400
  )
401
  st = timer()
402
  try:
 
407
  tk_count = 0
408
  raise
409
  logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
410
+ callback(msg="Finished embedding ({:.2f}s)!".format(timer() - st))
411
  # logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
412
  init_kb(r, vector_size)
413
  chunk_count = len(set([c["id"] for c in cks]))
 
420
  callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
421
  logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
422
  if es_r:
423
+ callback(-1,
424
+ "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!")
425
  settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
426
  logging.error('Insert chunk error: ' + str(es_r))
427
  raise Exception('Insert chunk error: ' + str(es_r))
 
430
  settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
431
  return
432
 
433
+ callback(1., msg="Index cost {:.2f}s.".format(timer() - st))
 
434
  DocumentService.increment_chunk_num(
435
  r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
436
  logging.info(
437
  "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
438
+ r["id"], tk_count, len(cks), timer() - st))
439
 
440
 
441
  def handle_task():
 
502
  for stat in stats2[:10]:
503
  msg += f"{stat}\n"
504
  stats1_vs_2 = snapshot2.compare_to(snapshot1, 'lineno')
505
+ msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id - 1} to snapshot {snapshot_id}:\n"
506
  for stat in stats1_vs_2[:10]:
507
  msg += f"{stat}\n"
508
  msg += f"{CONSUMER_NAME} detailed traceback for the top memory consumers:\n"
 
512
 
513
 
514
  def main():
515
+ logging.info(r"""
516
+ ______ __ ______ __
517
+ /_ __/___ ______/ /__ / ____/ _____ _______ __/ /_____ _____
518
+ / / / __ `/ ___/ //_/ / __/ | |/_/ _ \/ ___/ / / / __/ __ \/ ___/
519
+ / / / /_/ (__ ) ,< / /____> </ __/ /__/ /_/ / /_/ /_/ / /
520
+ /_/ \__,_/____/_/|_| /_____/_/|_|\___/\___/\__,_/\__/\____/_/
521
+ """)
522
+ logging.info(f'TaskExecutor: RAGFlow version: {get_ragflow_version()}')
523
  settings.init_settings()
524
+ print_rag_settings()
525
  background_thread = threading.Thread(target=report_status)
526
  background_thread.daemon = True
527
  background_thread.start()
 
536
  while True:
537
  handle_task()
538
  num_tasks = DONE_TASKS + FAILED_TASKS
539
+ if TRACE_MALLOC_DELTA > 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0:
540
  snapshot2 = tracemalloc.take_snapshot()
541
+ analyze_heap(snapshot1, snapshot2, int(num_tasks / TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0)
542
  snapshot1 = snapshot2
543
  snapshot2 = None
544
 
545
+
546
  if __name__ == "__main__":
547
  main()