Commit 6e567cd
Parent: 1427166

Update progress info and start welcome info (#3768)

### What problem does this PR solve?

Adds elapsed-time information ("{:.2f}s") to the per-stage progress messages reported
while parsing documents (OCR, layout analysis, table analysis, text merging, embedding,
indexing), and logs a welcome banner plus the active RAG settings when the server and
the task executor start.

### Type of change

- [x] Refactoring

---------

Signed-off-by: jinhai <[email protected]>

- api/ragflow_server.py +2 -0
- rag/app/book.py +11 -8
- rag/app/laws.py +6 -5
- rag/app/manual.py +10 -5
- rag/app/naive.py +13 -8
- rag/app/one.py +11 -6
- rag/app/paper.py +11 -6
- rag/app/presentation.py +4 -3
- rag/app/qa.py +11 -7
- rag/settings.py +7 -0
- rag/svr/task_executor.py +21 -11
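
All of the parser diffs below apply the same instrumentation pattern: bracket each pipeline stage with `timeit.default_timer` and append the elapsed seconds to the progress message passed to `callback`. A minimal sketch of that pattern; `report_progress`, `run_stage`, and the `time.sleep` stages are illustrative stand-ins, not RAGFlow code:

```python
from timeit import default_timer as timer
import time


def report_progress(prog=None, msg=""):
    # Stand-in for the parsers' callback, which persists progress on the task
    # record; here we just print what the user would see.
    print(f"[{prog if prog is not None else '-'}] {msg}")


def run_stage(name, prog, stage_fn, callback):
    # Time one pipeline stage and report "<name> (<elapsed>s)", as the parsers
    # now do around OCR, layout analysis, table analysis and text merging.
    start = timer()
    stage_fn()
    callback(prog, "{} ({:.2f}s)".format(name, timer() - start))


run_stage("Layout analysis", 0.63, lambda: time.sleep(0.1), report_progress)
run_stage("Table analysis", 0.65, lambda: time.sleep(0.1), report_progress)
```
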
api/ragflow_server.py CHANGED

@@ -47,6 +47,7 @@ from api.db.db_models import init_database_tables as init_web_db
 from api.db.init_data import init_web_data
 from api.versions import get_ragflow_version
 from api.utils import show_configs
+from rag.settings import print_rag_settings
 
 
 def update_progress():
@@ -75,6 +76,7 @@ if __name__ == '__main__':
     )
     show_configs()
     settings.init_settings()
+    print_rag_settings()
 
     # init db
     init_web_db()

rag/app/book.py CHANGED

@@ -26,30 +26,33 @@ from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
             from_page,
             to_page,
             callback)
-        callback(msg="OCR finished")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
 
-        from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.67, "Layout analysis finished")
+        callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug("layouts: {}".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.68, "Table analysis finished")
+        callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._naive_vertical_merge()
         self._filter_forpages()
         self._merge_with_same_bullet()
-        callback(0.75, "Text merging finished.")
-
-        callback(0.8, "Text extraction finished")
+        callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
 
         return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                 for b in self.boxes], tbls

rag/app/laws.py CHANGED

@@ -108,7 +108,9 @@ class Pdf(PdfParser):
 
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -116,17 +118,16 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
 
-        from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.67, "Layout analysis finished")
+        callback(0.67, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug("layouts:".format(
         ))
         self._naive_vertical_merge()
 
-        callback(0.8, "Text extraction finished")
+        callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
 
         return [(b["text"], self._line_tag(b, zoomin))
                 for b in self.boxes], None

rag/app/manual.py CHANGED

@@ -36,7 +36,7 @@ class Pdf(PdfParser):
                  to_page=100000, zoomin=3, callback=None):
         from timeit import default_timer as timer
         start = timer()
-        callback(msg="OCR is running...")
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -44,22 +44,27 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished.")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
         # for bb in self.boxes:
         #     for b in bb:
         #         print(b)
         logging.debug("OCR: {}".format(timer() - start))
 
+        start = timer()
         self._layouts_rec(zoomin)
-        callback(0.65, "Layout analysis finished.")
+        callback(0.65, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug("layouts: {}".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.67, "Table analysis finished.")
+        callback(0.67, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._concat_downward()
         self._filter_forpages()
-        callback(0.68, "Text merging finished")
+        callback(0.68, "Text merged ({:.2f}s)".format(timer() - start))
 
         # clean mess
         for b in self.boxes:

rag/app/naive.py CHANGED

@@ -124,7 +124,8 @@ class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
         start = timer()
-        callback(msg="OCR is running...")
+        first_start = start
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -132,22 +133,26 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished")
-        logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
 
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.63, "Layout analysis finished.")
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.65, "Table analysis finished.")
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
-        callback(0.67, "Text merging finished")
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
         tbls = self._extract_table_figure(True, zoomin, True, True)
         # self._naive_vertical_merge()
         self._concat_downward()
         # self._filter_forpages()
 
-        logging.info("layouts cost: {}s".format(timer() - start))
+        logging.info("layouts cost: {}s".format(timer() - first_start))
         return [(b["text"], self._line_tag(b, zoomin))
                 for b in self.boxes], tbls
 
@@ -170,7 +175,7 @@ class Markdown(MarkdownParser):
         else:
             if sections and sections[-1][0].strip().find("#") == 0:
                 sec_, _ = sections.pop(-1)
-                sections.append((sec_+"\n"+sec, ""))
+                sections.append((sec_ + "\n" + sec, ""))
             else:
                 sections.append((sec, ""))
 

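
One wrinkle specific to `naive.py`: since `start` is now reset before every stage, the original value is saved in `first_start` so the closing `layouts cost` log line still reports the total elapsed time rather than the last stage only. A tiny sketch of the idiom, with `time.sleep` standing in for the real stages:

```python
from timeit import default_timer as timer
import time

start = timer()
first_start = start       # keep the overall start before `start` is reused
time.sleep(0.05)          # stand-in for OCR
print("OCR: {:.2f}s".format(timer() - start))

start = timer()           # reset per stage from here on
time.sleep(0.05)          # stand-in for layout analysis
print("Layout analysis: {:.2f}s".format(timer() - start))

print("layouts cost: {:.2f}s".format(timer() - first_start))  # total, not last stage
```
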
rag/app/one.py CHANGED

@@ -24,7 +24,9 @@ from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -32,17 +34,20 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
 
-        from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin, drop=False)
-        callback(0.63, "Layout analysis finished.")
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug("layouts cost: {}s".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.65, "Table analysis finished.")
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
-        callback(0.67, "Text merging finished")
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._concat_downward()
 

rag/app/paper.py CHANGED

@@ -27,7 +27,9 @@ class Pdf(PdfParser):
 
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -35,21 +37,24 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished.")
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
 
-        from timeit import default_timer as timer
         start = timer()
         self._layouts_rec(zoomin)
-        callback(0.63, "Layout analysis finished.")
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
         logging.debug(f"layouts cost: {timer() - start}s")
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.68, "Table analysis finished.")
+        callback(0.68, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
         tbls = self._extract_table_figure(True, zoomin, True, True)
         column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
         self._concat_downward()
         self._filter_forpages()
-        callback(0.75, "Text merging finished.")
+        callback(0.75, "Text merged ({:.2f}s)".format(timer() - start))
 
         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:

rag/app/presentation.py CHANGED

@@ -59,11 +59,12 @@ class Pdf(PdfParser):
 
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
-        callback(msg="OCR is running...")
+        from timeit import default_timer as timer
+        start = timer()
+        callback(msg="OCR started")
         self.__images__(filename if not binary else binary,
                         zoomin, from_page, to_page, callback)
-        callback(msg="Page {}~{}: OCR finished".format(
-            from_page, min(to_page, self.total_page)))
+        callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start))
         assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
             len(self.boxes), len(self.page_images))
         res = []

rag/app/qa.py CHANGED

@@ -73,7 +73,7 @@ class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
         start = timer()
-        callback(msg="OCR is running...")
+        callback(msg="OCR started")
         self.__images__(
             filename if not binary else binary,
             zoomin,
@@ -81,15 +81,19 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback(msg="OCR finished")
-        logging.debug("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
+        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
+        logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
         start = timer()
         self._layouts_rec(zoomin, drop=False)
-        callback(0.63, "Layout analysis finished.")
+        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._table_transformer_job(zoomin)
-        callback(0.65, "Table analysis finished.")
+        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
+
+        start = timer()
         self._text_merge()
-        callback(0.67, "Text merging finished")
+        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
         tbls = self._extract_table_figure(True, zoomin, True, True)
         #self._naive_vertical_merge()
         # self._concat_downward()
@@ -226,7 +230,7 @@ class Docx(DocxParser):
         sum_question = '\n'.join(question_stack)
         if sum_question:
             qai_list.append((sum_question, last_answer, last_image))
-
+
         tbls = []
         for tb in self.doc.tables:
             html= "<table>"

rag/settings.py CHANGED

@@ -14,6 +14,7 @@
 # limitations under the License.
 #
 import os
+import logging
 from api.utils import get_base_config, decrypt_database_config
 from api.utils.file_utils import get_project_base_directory
 
@@ -37,3 +38,9 @@ SVR_QUEUE_RETENTION = 60*60
 SVR_QUEUE_MAX_LEN = 1024
 SVR_CONSUMER_NAME = "rag_flow_svr_consumer"
 SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group"
+
+def print_rag_settings():
+    logging.info(f"MAX_CONTENT_LENGTH: {DOC_MAXIMUM_SIZE}")
+    logging.info(f"SERVER_QUEUE_MAX_LEN: {SVR_QUEUE_MAX_LEN}")
+    logging.info(f"SERVER_QUEUE_RETENTION: {SVR_QUEUE_RETENTION}")
+    logging.info(f"MAX_FILE_COUNT_PER_USER: {int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))}")

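
The new `print_rag_settings` only logs values already defined in this module plus one environment variable. A self-contained sketch of what a call produces; the default values below are assumptions standing in for what `rag/settings.py` actually loads from config:

```python
import logging
import os

logging.basicConfig(level=logging.INFO)

# Assumed stand-in values; the real module reads these from config/environment.
DOC_MAXIMUM_SIZE = 128 * 1024 * 1024
SVR_QUEUE_RETENTION = 60 * 60
SVR_QUEUE_MAX_LEN = 1024


def print_rag_settings():
    logging.info(f"MAX_CONTENT_LENGTH: {DOC_MAXIMUM_SIZE}")
    logging.info(f"SERVER_QUEUE_MAX_LEN: {SVR_QUEUE_MAX_LEN}")
    logging.info(f"SERVER_QUEUE_RETENTION: {SVR_QUEUE_RETENTION}")
    # Falls back to 0 (no limit) when MAX_FILE_NUM_PER_USER is unset.
    logging.info(f"MAX_FILE_COUNT_PER_USER: {int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))}")


print_rag_settings()
```
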
rag/svr/task_executor.py CHANGED

@@ -56,12 +56,13 @@ from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService
 from api.db.services.file2document_service import File2DocumentService
 from api import settings
+from api.versions import get_ragflow_version
 from api.db.db_models import close_connection
 from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, \
     knowledge_graph, email
 from rag.nlp import search, rag_tokenizer
 from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
-from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME
+from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME, print_rag_settings
 from rag.utils import rmSpace, num_tokens_from_string
 from rag.utils.redis_conn import REDIS_CONN, Payload
 from rag.utils.storage_factory import STORAGE_IMPL
@@ -395,8 +396,7 @@ def do_handle_task(r):
     # TODO: exception handler
     ## set_progress(r["did"], -1, "ERROR: ")
     callback(
-        msg="Finished slicing files ({} chunks in {:.2f}s). Start to embedding the content.".format(len(cks),
-                                                                                                    timer() - st)
+        msg="Generate {} chunks ({:.2f}s). Embedding chunks.".format(len(cks), timer() - st)
     )
     st = timer()
     try:
@@ -407,7 +407,7 @@ def do_handle_task(r):
         tk_count = 0
         raise
     logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
-    callback(msg="Finished embedding ({:.2f})! Start to build index!".format(timer() - st))
+    callback(msg="Finished embedding ({:.2f}s)!".format(timer() - st))
     # logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
     init_kb(r, vector_size)
     chunk_count = len(set([c["id"] for c in cks]))
@@ -420,7 +420,8 @@ def do_handle_task(r):
         callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
     logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
     if es_r:
-        callback(-1, "Insert chunk error, detail info please check ragflow-logs/api/cron_logger.log. Please also check ES status!")
+        callback(-1,
+                 "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!")
         settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
         logging.error('Insert chunk error: ' + str(es_r))
         raise Exception('Insert chunk error: ' + str(es_r))
@@ -429,13 +430,12 @@ def do_handle_task(r):
         settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
         return
 
-    callback(msg="Indexing elapsed in {:.2f}s.".format(timer() - st))
-    callback(1., "Done!")
+    callback(1., msg="Index cost {:.2f}s.".format(timer() - st))
     DocumentService.increment_chunk_num(
         r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
     logging.info(
         "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
-                r["id"], tk_count, len(cks), timer() - st))
+            r["id"], tk_count, len(cks), timer() - st))
 
 
 def handle_task():
@@ -502,7 +502,7 @@ def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapsho
     for stat in stats2[:10]:
         msg += f"{stat}\n"
     stats1_vs_2 = snapshot2.compare_to(snapshot1, 'lineno')
-    msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id-1} to snapshot {snapshot_id}:\n"
+    msg += f"{CONSUMER_NAME} memory usage increase from snapshot {snapshot_id - 1} to snapshot {snapshot_id}:\n"
     for stat in stats1_vs_2[:10]:
         msg += f"{stat}\n"
     msg += f"{CONSUMER_NAME} detailed traceback for the top memory consumers:\n"
@@ -512,7 +512,16 @@ def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapsho
 
 
 def main():
+    logging.info(r"""
+      ______           __      ______                     __
+     /_  __/___ ______/ /__   / ____/  _____  _______  __/ /_____  _____
+      / / / __ `/ ___/ //_/  / __/ | |/_/ _ \/ ___/ / / / __/ __ \/ ___/
+     / / / /_/ (__  ) ,<   / /____>  </  __/ /__/ /_/ / /_/ /_/ / /
+    /_/  \__,_/____/_/|_| /_____/_/|_|\___/\___/\__,_/\__/\____/_/
+    """)
+    logging.info(f'TaskExecutor: RAGFlow version: {get_ragflow_version()}')
     settings.init_settings()
+    print_rag_settings()
     background_thread = threading.Thread(target=report_status)
     background_thread.daemon = True
     background_thread.start()
@@ -527,11 +536,12 @@ def main():
     while True:
         handle_task()
         num_tasks = DONE_TASKS + FAILED_TASKS
-        if TRACE_MALLOC_DELTA> 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0:
+        if TRACE_MALLOC_DELTA > 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0:
             snapshot2 = tracemalloc.take_snapshot()
-            analyze_heap(snapshot1, snapshot2, int(num_tasks/TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0)
+            analyze_heap(snapshot1, snapshot2, int(num_tasks / TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0)
             snapshot1 = snapshot2
             snapshot2 = None
 
+
 if __name__ == "__main__":
     main()

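
For context on the `analyze_heap` lines touched above: the heap report relies on `tracemalloc` snapshot comparison, where `snapshot2.compare_to(snapshot1, 'lineno')` returns per-line allocation deltas sorted by largest increase. A self-contained sketch; the `leak` list is illustrative and not part of the PR:

```python
import tracemalloc

tracemalloc.start()
snapshot1 = tracemalloc.take_snapshot()

# Illustrative allocations made between the two snapshots.
leak = [bytes(1024) for _ in range(1000)]

snapshot2 = tracemalloc.take_snapshot()
# The same call analyze_heap uses: per-line deltas, biggest increases first.
for stat in snapshot2.compare_to(snapshot1, 'lineno')[:5]:
    print(stat)
```
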