KevinHuSh committed
Commit 6be8543 · 1 Parent(s): 1550520

refine table parser (#120)

Files changed:
- api/db/services/task_service.py +1 -0
- deepdoc/parser/ppt_parser.py +3 -1
- rag/app/presentation.py +4 -1
- rag/app/table.py +7 -7
- rag/svr/task_broker.py +2 -2
- rag/svr/task_executor.py +3 -4
api/db/services/task_service.py
CHANGED
@@ -51,6 +51,7 @@ class TaskService(CommonService):
             .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
             .where(
                 Document.status == StatusEnum.VALID.value,
+                Document.run == TaskStatus.RUNNING.value,
                 ~(Document.type == FileType.VIRTUAL.value),
                 cls.model.progress == 0,
                 cls.model.update_time >= tm,
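Note: peewee combines multiple arguments to .where() with AND, so the added predicate simply tightens the existing filter: a task is now handed out only when its parent document has been explicitly set to run. A minimal sketch of the same gating logic, with plain dicts standing in for the ORM rows and the enum values assumed, not taken from the source:

    # Hypothetical stand-ins for the ORM rows; field names mirror the query above.
    def should_dispatch(doc: dict, task: dict, tm: int) -> bool:
        return (doc["status"] == "1"          # StatusEnum.VALID (value assumed)
                and doc["run"] == "1"         # TaskStatus.RUNNING, the new check (value assumed)
                and doc["type"] != "virtual"  # ~(Document.type == FileType.VIRTUAL)
                and task["progress"] == 0
                and task["update_time"] >= tm)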
deepdoc/parser/ppt_parser.py
CHANGED
@@ -42,7 +42,9 @@ class HuPptParser(object):
             BytesIO(fnm))
         txts = []
         self.total_page = len(ppt.slides)
-        for i, slide in enumerate(ppt.slides
+        for i, slide in enumerate(ppt.slides):
+            if i < from_page: continue
+            if i >= to_page:break
             texts = []
             for shape in slide.shapes:
                 txt = self.__extract(shape)
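Iterating over every slide and filtering with continue/break, rather than slicing the slide list, keeps i equal to the absolute slide index, so the page offsets carried by split tasks line up with real slide numbers. A minimal sketch of the windowing pattern, with a plain list standing in for ppt.slides:

    def page_window(items, from_page, to_page):
        # Yield (absolute_index, item) for the half-open window [from_page, to_page).
        for i, item in enumerate(items):
            if i < from_page:
                continue   # before the window: skip, but keep counting
            if i >= to_page:
                break      # past the window: done
            yield i, item

    # list(page_window(["s0", "s1", "s2", "s3"], 1, 3)) -> [(1, "s1"), (2, "s2")]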
rag/app/presentation.py
CHANGED
@@ -13,6 +13,9 @@
 import copy
 import re
 from io import BytesIO
+
+from PIL import Image
+
 from rag.nlp import tokenize, is_english
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, PptParser
@@ -30,7 +33,7 @@ class Ppt(PptParser):
         for i, slide in enumerate(presentation.slides[from_page: to_page]):
             buffered = BytesIO()
             slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
-            imgs.append(
+            imgs.append(Image.open(buffered))
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback(0.9, "Image extraction finished")
         self.is_english = is_english(txts)
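With this change each slide thumbnail is stored as a PIL Image object instead of the raw JPEG buffer contents, so downstream consumers get a ready-to-use image. One PIL detail worth knowing: Image.open() is lazy and reads pixel data from the file object on demand, which works here because each slide gets its own BytesIO that stays referenced by the Image. A minimal sketch of the bytes-to-Image round trip; the img.load() call is a defensive extra not present in the diff:

    from io import BytesIO
    from PIL import Image

    def jpeg_bytes_to_image(jpeg_bytes: bytes) -> Image.Image:
        buffered = BytesIO(jpeg_bytes)
        img = Image.open(buffered)  # lazy: pixels are read from the buffer on demand
        img.load()                  # force the read so the buffer is no longer needed
        return img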
rag/app/table.py
CHANGED
@@ -58,12 +58,9 @@ class Excel(ExcelParser):
                 continue
             data.append(row)
             done += 1
-            if done % 999 == 0:
-                callback(done * 0.6 / total, ("Extract records: {}".format(len(res)) + (
-                    f"{len(fails)} failure({sheetname}), line: %s..." % (",".join(fails[:3])) if fails else "")))
         res.append(pd.DataFrame(np.array(data), columns=headers))
 
-        callback(0.
+        callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + (
             f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         return res
@@ -151,7 +148,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
         headers = lines[0].split(kwargs.get("delimiter", "\t"))
         rows = []
         for i, line in enumerate(lines[1:]):
-            if
+            if i < from_page:continue
             if i >= to_page: break
             row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
             if len(row) != len(headers):
@@ -191,12 +188,15 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
             df[clmns[j]] = cln
             if ty == "text":
                 txts.extend([str(c) for c in cln if c])
-        clmns_map = [(py_clmns[
+        clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i])
                      for i in range(len(clmns))]
 
         eng = lang.lower() == "english"#is_english(txts)
         for ii, row in df.iterrows():
-            d = {
+            d = {
+                "docnm_kwd": filename,
+                "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+            }
             row_txt = []
             for j in range(len(clmns)):
                 if row[clmns[j]] is None:
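Two refinements: the noisy per-999-row progress callbacks are replaced by a single summary callback reporting the absolute row window this task handled, and the csv/txt branch gains the same from_page/to_page windowing as the other parsers. A sketch of how the summary message is assembled; record_range_msg is a hypothetical helper, the diff inlines this expression:

    def record_range_msg(from_page: int, to_page: int, rn: int, fails: list) -> str:
        # rn is the number of rows read for this task's window.
        msg = "Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn))
        if fails:
            msg += "{} failure, line: {}...".format(len(fails), ",".join(fails[:3]))
        return msg

    # record_range_msg(3000, 6000, 2500, []) -> "Extract records: 3001~5500"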
rag/svr/task_broker.py
CHANGED
@@ -91,10 +91,10 @@ def dispatch():
             tsks.append(task)
         elif r["parser_id"] == "table":
             rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
-            for i in range(0, rn,
+            for i in range(0, rn, 3000):
                 task = new_task()
                 task["from_page"] = i
-                task["to_page"] = min(i +
+                task["to_page"] = min(i + 3000, rn)
                 tsks.append(task)
         else:
             tsks.append(new_task())
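The broker now splits a table document into one task per 3,000 rows, clamping the last window to the actual row count so to_page never overshoots. A minimal sketch of the same windowing, as a hypothetical helper mirroring the loop in dispatch():

    def split_into_tasks(rn: int, step: int = 3000):
        # One (from_page, to_page) row window per task, together covering all rn rows.
        return [(i, min(i + step, rn)) for i in range(0, rn, step)]

    # split_into_tasks(7200) -> [(0, 3000), (3000, 6000), (6000, 7200)]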
rag/svr/task_executor.py
CHANGED
@@ -128,8 +128,6 @@ def build(row):
 
         return
 
-    callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
-
     docs = []
     doc = {
         "doc_id": row["doc_id"],
@@ -179,8 +177,8 @@ def embedding(docs, mdl, parser_config={}, callback=None):
         tk_count += c
 
     cnts_ = np.array([])
-    for i in range(0, len(cnts),
-        vts, c = mdl.encode(cnts[i: i+
+    for i in range(0, len(cnts), 8):
+        vts, c = mdl.encode(cnts[i: i+8])
         if len(cnts_) == 0: cnts_ = vts
         else: cnts_ = np.concatenate((cnts_, vts), axis=0)
         tk_count += c
@@ -226,6 +224,7 @@ def main(comm, mod):
             continue
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
+        callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
         try:
             tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
         except Exception as e:
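Two changes here: texts are embedded in fixed batches of 8, so a large table chunk can no longer be sent to the model as one oversized request, and the "finished slicing" progress message moves from build() into main(), right before embedding actually starts, where it is accurate for every parser. A minimal sketch of the batched-encoding loop, assuming encode() returns a (vectors, token_count) pair like mdl.encode in the diff; encode_in_batches is a hypothetical wrapper, not part of the source:

    import numpy as np

    def encode_in_batches(encode, texts, batch_size=8):
        # Encode batch_size texts at a time and stack the resulting vectors.
        vecs, tk_count = np.array([]), 0
        for i in range(0, len(texts), batch_size):
            vts, c = encode(texts[i: i + batch_size])
            vecs = vts if len(vecs) == 0 else np.concatenate((vecs, vts), axis=0)
            tk_count += c
        return vecs, tk_count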