KevinHuSh committed
Commit 6be8543 · 1 Parent(s): 1550520

refine table parser (#120)

api/db/services/task_service.py CHANGED
@@ -51,6 +51,7 @@ class TaskService(CommonService):
             .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
             .where(
                 Document.status == StatusEnum.VALID.value,
+                Document.run == TaskStatus.RUNNING.value,
                 ~(Document.type == FileType.VIRTUAL.value),
                 cls.model.progress == 0,
                 cls.model.update_time >= tm,
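
The new `Document.run == TaskStatus.RUNNING.value` condition narrows the task poll to documents that are actually in a running state. A minimal, self-contained sketch of the peewee pattern this relies on (hypothetical model and status values, not the project's), showing that multiple expressions passed to .where() are AND-ed:

    from peewee import SqliteDatabase, Model, CharField, IntegerField

    db = SqliteDatabase(":memory:")

    class Document(Model):
        status = CharField()
        run = CharField()        # stand-in for the document's run state
        type = CharField()
        progress = IntegerField()

        class Meta:
            database = db

    db.create_tables([Document])
    Document.create(status="1", run="1", type="pdf", progress=0)   # running -> eligible
    Document.create(status="1", run="0", type="pdf", progress=0)   # not running -> filtered out

    q = Document.select().where(
        Document.status == "1",
        Document.run == "1",                 # the condition this commit adds
        ~(Document.type == "virtual"),
        Document.progress == 0)
    print(q.count())  # -> 1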
deepdoc/parser/ppt_parser.py CHANGED
@@ -42,7 +42,9 @@ class HuPptParser(object):
             BytesIO(fnm))
         txts = []
         self.total_page = len(ppt.slides)
-        for i, slide in enumerate(ppt.slides[from_page: to_page]):
+        for i, slide in enumerate(ppt.slides):
+            if i < from_page: continue
+            if i >= to_page: break
             texts = []
             for shape in slide.shapes:
                 txt = self.__extract(shape)
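
The parser now walks every slide and pages by index, presumably because python-pptx's Slides collection does not behave like a plain list under slice syntax. A minimal stand-alone sketch of that skip/break paging pattern:

    def paged(items, from_page, to_page):
        # Iterate the whole collection; skip until the window starts, stop once it ends.
        for i, item in enumerate(items):
            if i < from_page:
                continue
            if i >= to_page:
                break
            yield item

    print(list(paged("abcdef", 2, 4)))  # -> ['c', 'd']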
rag/app/presentation.py CHANGED
@@ -13,6 +13,9 @@
 import copy
 import re
 from io import BytesIO
+
+from PIL import Image
+
 from rag.nlp import tokenize, is_english
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, PptParser
@@ -30,7 +33,7 @@ class Ppt(PptParser):
         for i, slide in enumerate(presentation.slides[from_page: to_page]):
             buffered = BytesIO()
             slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
-            imgs.append(buffered.getvalue())
+            imgs.append(Image.open(buffered))
         assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback(0.9, "Image extraction finished")
         self.is_english = is_english(txts)
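
With the PIL import in place, imgs now holds decoded PIL.Image objects instead of raw JPEG bytes. A small stand-alone sketch of decoding an in-memory JPEG buffer the same way (the Image.new call is only a stand-in for the slide thumbnail):

    from io import BytesIO
    from PIL import Image

    buffered = BytesIO()
    Image.new("RGB", (64, 48), "white").save(buffered, format="JPEG")  # stand-in thumbnail

    buffered.seek(0)              # rewind before decoding (Image.open also seeks to 0 itself)
    img = Image.open(buffered)
    print(img.size, img.format)   # -> (64, 48) JPEG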
rag/app/table.py CHANGED
@@ -58,12 +58,9 @@ class Excel(ExcelParser):
                     continue
                 data.append(row)
                 done += 1
-                if done % 999 == 0:
-                    callback(done * 0.6 / total, ("Extract records: {}".format(len(res)) + (
-                        f"{len(fails)} failure({sheetname}), line: %s..." % (",".join(fails[:3])) if fails else "")))
             res.append(pd.DataFrame(np.array(data), columns=headers))

-        callback(0.6, ("Extract records: {}. ".format(done) + (
+        callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + (
             f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         return res

@@ -151,7 +148,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
         headers = lines[0].split(kwargs.get("delimiter", "\t"))
         rows = []
         for i, line in enumerate(lines[1:]):
-            if from_page < from_page: continue
+            if i < from_page: continue
             if i >= to_page: break
             row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
             if len(row) != len(headers):
@@ -191,12 +188,15 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
         df[clmns[j]] = cln
         if ty == "text":
             txts.extend([str(c) for c in cln if c])
-    clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j])
+    clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i])
                  for i in range(len(clmns))]

     eng = lang.lower() == "english"#is_english(txts)
     for ii, row in df.iterrows():
-        d = {}
+        d = {
+            "docnm_kwd": filename,
+            "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        }
         row_txt = []
         for j in range(len(clmns)):
             if row[clmns[j]] is None:
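
Each table row is now turned into a chunk dict pre-seeded with the document name (docnm_kwd) and a tokenized title (title_tks) with the file extension stripped, and the TXT branch pages rows with the same skip/break pattern as the PPT parser. A rough sketch of the per-row construction, with a stand-in tokenizer in place of huqie.qie and a hypothetical content field:

    import re
    import pandas as pd

    def qie(text):                         # stand-in for huqie.qie()
        return " ".join(text.split())

    filename = "sales_2023.xlsx"           # hypothetical file
    df = pd.DataFrame({"region": ["north", "south"], "amount": [120, 80]})

    chunks = []
    for _, row in df.iterrows():
        d = {
            "docnm_kwd": filename,
            "title_tks": qie(re.sub(r"\.[a-zA-Z]+$", "", filename)),  # title without the extension
        }
        row_txt = [f"{col}: {row[col]}" for col in df.columns if row[col] is not None]
        d["content"] = "; ".join(row_txt)  # hypothetical field; the project's naming differs
        chunks.append(d)

    print(chunks[0])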
rag/svr/task_broker.py CHANGED
@@ -91,10 +91,10 @@ def dispatch():
                 tsks.append(task)
             elif r["parser_id"] == "table":
                 rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
-                for i in range(0, rn, 1000):
+                for i in range(0, rn, 3000):
                     task = new_task()
                     task["from_page"] = i
-                    task["to_page"] = min(i + 1000, rn)
+                    task["to_page"] = min(i + 3000, rn)
                     tsks.append(task)
             else:
                 tsks.append(new_task())
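
The table branch now splits a spreadsheet into 3000-row tasks instead of 1000-row ones. A small sketch of the partitioning as (from_page, to_page) half-open row ranges:

    def table_tasks(rn, step=3000):
        # rn: total number of rows; each task covers at most `step` rows.
        return [(i, min(i + step, rn)) for i in range(0, rn, step)]

    print(table_tasks(7200))  # -> [(0, 3000), (3000, 6000), (6000, 7200)]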
rag/svr/task_executor.py CHANGED
@@ -128,8 +128,6 @@ def build(row):

         return

-    callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
-
     docs = []
     doc = {
         "doc_id": row["doc_id"],
@@ -179,8 +177,8 @@ def embedding(docs, mdl, parser_config={}, callback=None):
         tk_count += c

     cnts_ = np.array([])
-    for i in range(0, len(cnts), 32):
-        vts, c = mdl.encode(cnts[i: i+32])
+    for i in range(0, len(cnts), 8):
+        vts, c = mdl.encode(cnts[i: i+8])
         if len(cnts_) == 0: cnts_ = vts
         else: cnts_ = np.concatenate((cnts_, vts), axis=0)
         tk_count += c
@@ -226,6 +224,7 @@ def main(comm, mod):
             continue
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
+        callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
         try:
             tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
         except Exception as e:
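
The embedding loop now encodes content in batches of 8 (down from 32), presumably to keep each model call smaller, and the "finished slicing" progress message is reported from main() just before embedding starts. A minimal sketch of the batching-and-stacking logic with a stand-in encoder:

    import numpy as np

    def encode(batch):                     # stand-in for mdl.encode(); returns (vectors, token_count)
        return np.random.rand(len(batch), 4), sum(len(t.split()) for t in batch)

    cnts = [f"chunk {i}" for i in range(20)]
    cnts_, tk_count = np.array([]), 0
    for i in range(0, len(cnts), 8):
        vts, c = encode(cnts[i: i + 8])
        cnts_ = vts if len(cnts_) == 0 else np.concatenate((cnts_, vts), axis=0)
        tk_count += c

    print(cnts_.shape, tk_count)           # -> (20, 4) plus the summed token count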