Kevin Hu committed on
Commit
fa11d40
·
1 Parent(s): 71da872

make titles in markdown not be split from following content (#2971)

Browse files

### What problem does this PR solve?

#2970
### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)

Files changed (3) hide show
  1. rag/app/manual.py +5 -2
  2. rag/app/naive.py +6 -2
  3. rag/app/qa.py +1 -0
rag/app/manual.py CHANGED
@@ -67,9 +67,11 @@ class Pdf(PdfParser):
67
  return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
68
  for i, b in enumerate(self.boxes)], tbls
69
 
 
70
  class Docx(DocxParser):
71
  def __init__(self):
72
  pass
 
73
  def get_picture(self, document, paragraph):
74
  img = paragraph._element.xpath('.//pic:pic')
75
  if not img:
@@ -80,6 +82,7 @@ class Docx(DocxParser):
80
  image = related_part.image
81
  image = Image.open(BytesIO(image.blob))
82
  return image
 
83
  def concat_img(self, img1, img2):
84
  if img1 and not img2:
85
  return img1
@@ -160,6 +163,7 @@ class Docx(DocxParser):
160
  tbls.append(((None, html), ""))
161
  return ti_list, tbls
162
 
 
163
  def chunk(filename, binary=None, from_page=0, to_page=100000,
164
  lang="Chinese", callback=None, **kwargs):
165
  """
@@ -244,6 +248,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
244
  res = tokenize_table(tbls, doc, eng)
245
  res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
246
  return res
 
247
  if re.search(r"\.docx$", filename, re.IGNORECASE):
248
  docx_parser = Docx()
249
  ti_list, tbls = docx_parser(filename, binary,
@@ -259,8 +264,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
259
  raise NotImplementedError("file type not supported yet(pdf and docx supported)")
260
 
261
 
262
-
263
-
264
  if __name__ == "__main__":
265
  import sys
266
 
 
67
  return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
68
  for i, b in enumerate(self.boxes)], tbls
69
 
70
+
71
  class Docx(DocxParser):
72
  def __init__(self):
73
  pass
74
+
75
  def get_picture(self, document, paragraph):
76
  img = paragraph._element.xpath('.//pic:pic')
77
  if not img:
 
82
  image = related_part.image
83
  image = Image.open(BytesIO(image.blob))
84
  return image
85
+
86
  def concat_img(self, img1, img2):
87
  if img1 and not img2:
88
  return img1
 
163
  tbls.append(((None, html), ""))
164
  return ti_list, tbls
165
 
166
+
167
  def chunk(filename, binary=None, from_page=0, to_page=100000,
168
  lang="Chinese", callback=None, **kwargs):
169
  """
 
248
  res = tokenize_table(tbls, doc, eng)
249
  res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
250
  return res
251
+
252
  if re.search(r"\.docx$", filename, re.IGNORECASE):
253
  docx_parser = Docx()
254
  ti_list, tbls = docx_parser(filename, binary,
 
264
  raise NotImplementedError("file type not supported yet(pdf and docx supported)")
265
 
266
 
 
 
267
  if __name__ == "__main__":
268
  import sys
269
 
rag/app/naive.py CHANGED
@@ -168,8 +168,12 @@ class Markdown(MarkdownParser):
168
  sections.append((sec[:int(len(sec) / 2)], ""))
169
  sections.append((sec[int(len(sec) / 2):], ""))
170
  else:
171
- sections.append((sec, ""))
172
- print(tables)
 
 
 
 
173
  for table in tables:
174
  tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
175
  return sections, tbls
 
168
  sections.append((sec[:int(len(sec) / 2)], ""))
169
  sections.append((sec[int(len(sec) / 2):], ""))
170
  else:
171
+ if sections and sections[-1][0].strip().find("#") == 0:
172
+ sec_, _ = sections.pop(-1)
173
+ sections.append((sec_+"\n"+sec, ""))
174
+ else:
175
+ sections.append((sec, ""))
176
+
177
  for table in tables:
178
  tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
179
  return sections, tbls
rag/app/qa.py CHANGED
@@ -393,6 +393,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
393
  if sum_question:
394
  res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
395
  return res
 
396
  elif re.search(r"\.docx$", filename, re.IGNORECASE):
397
  docx_parser = Docx()
398
  qai_list, tbls = docx_parser(filename, binary,
 
393
  if sum_question:
394
  res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
395
  return res
396
+
397
  elif re.search(r"\.docx$", filename, re.IGNORECASE):
398
  docx_parser = Docx()
399
  qai_list, tbls = docx_parser(filename, binary,