GYH committed on
Commit
366c531
·
1 Parent(s): c008ff2

Split Excel file into different chunks (#847)

Browse files

### What problem does this PR solve?


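Split each Excel sheet into multiple HTML table chunks (at most 256 data rows per chunk by default, with the sheet caption and header row repeated in every chunk) instead of returning one string containing all tables.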
### Type of change

- [x] New Feature (non-breaking change which adds functionality)

deepdoc/parser/excel_parser.py CHANGED
@@ -7,30 +7,39 @@ from rag.nlp import find_codec
7
 
8
 
9
  class RAGFlowExcelParser:
10
- def html(self, fnm):
11
  if isinstance(fnm, str):
12
  wb = load_workbook(fnm)
13
  else:
14
  wb = load_workbook(BytesIO(fnm))
15
- tb = ""
 
16
  for sheetname in wb.sheetnames:
17
  ws = wb[sheetname]
18
  rows = list(ws.rows)
19
- if not rows:continue
20
- tb += f"<table><caption>{sheetname}</caption><tr>"
 
21
  for t in list(rows[0]):
22
- tb += f"<th>{t.value}</th>"
23
- tb += "</tr>"
24
- for r in list(rows[1:]):
25
- tb += "<tr>"
26
- for i, c in enumerate(r):
27
- if c.value is None:
28
- tb += "<td></td>"
29
- else:
30
- tb += f"<td>{c.value}</td>"
31
- tb += "</tr>"
32
- tb += "</table>\n"
33
- return tb
 
 
 
 
 
 
 
34
 
35
  def __call__(self, fnm):
36
  if isinstance(fnm, str):
 
7
 
8
 
9
  class RAGFlowExcelParser:
10
+ def html(self, fnm,chunk_rows=256):
11
  if isinstance(fnm, str):
12
  wb = load_workbook(fnm)
13
  else:
14
  wb = load_workbook(BytesIO(fnm))
15
+
16
+ tb_chunks = []
17
  for sheetname in wb.sheetnames:
18
  ws = wb[sheetname]
19
  rows = list(ws.rows)
20
+ if not rows: continue
21
+
22
+ tb_rows_0 = "<tr>"
23
  for t in list(rows[0]):
24
+ tb_rows_0 += f"<th>{t.value}</th>"
25
+ tb_rows_0 += "</tr>"
26
+
27
+ for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
28
+ tb = ""
29
+ tb += f"<table><caption>{sheetname}</caption>"
30
+ tb += tb_rows_0
31
+ for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
32
+ tb += "<tr>"
33
+ for i, c in enumerate(r):
34
+ if c.value is None:
35
+ tb += "<td></td>"
36
+ else:
37
+ tb += f"<td>{c.value}</td>"
38
+ tb += "</tr>"
39
+ tb += "</table>\n"
40
+ tb_chunks.append(tb)
41
+
42
+ return tb_chunks
43
 
44
  def __call__(self, fnm):
45
  if isinstance(fnm, str):
rag/app/naive.py CHANGED
@@ -134,7 +134,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
134
  elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
135
  callback(0.1, "Start to parse.")
136
  excel_parser = ExcelParser()
137
- sections = [(excel_parser.html(binary), "")]
138
 
139
  elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
140
  callback(0.1, "Start to parse.")
 
134
  elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
135
  callback(0.1, "Start to parse.")
136
  excel_parser = ExcelParser()
137
+ sections = [(l, "") for l in excel_parser.html(binary) if l]
138
 
139
  elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
140
  callback(0.1, "Start to parse.")
rag/app/one.py CHANGED
@@ -78,7 +78,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
78
  elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
79
  callback(0.1, "Start to parse.")
80
  excel_parser = ExcelParser()
81
- sections = [excel_parser.html(binary)]
82
 
83
  elif re.search(r"\.txt$", filename, re.IGNORECASE):
84
  callback(0.1, "Start to parse.")
 
78
  elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
79
  callback(0.1, "Start to parse.")
80
  excel_parser = ExcelParser()
81
+ sections = excel_parser.html(binary , 10000000)
82
 
83
  elif re.search(r"\.txt$", filename, re.IGNORECASE):
84
  callback(0.1, "Start to parse.")