aopstudio committed on
Commit
622e082
·
1 Parent(s): 3726b19

Support displaying tables in the chunks of pdf file when using QA parser (#1263)

Browse files

### What problem does this PR solve?

Support displaying tables in the chunks of PDF files when using the QA parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

rag/app/qa.py CHANGED
@@ -22,6 +22,7 @@ from rag.settings import cron_logger
22
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
23
  from docx import Document
24
  from PIL import Image
 
25
  class Excel(ExcelParser):
26
  def __call__(self, fnm, binary=None, callback=None):
27
  if not binary:
@@ -374,8 +375,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
374
  code_block = False
375
  level_index = [-1] * 7
376
  for index, l in enumerate(lines):
377
- if not l.strip():
378
- continue
379
  if l.strip().startswith('```'):
380
  code_block = not code_block
381
  question_level, question = 0, ''
@@ -385,10 +384,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
385
  if not question_level or question_level > 6: # not a question
386
  last_answer = f'{last_answer}\n{l}'
387
  else: # is a question
388
- if last_answer:
389
  sum_question = '\n'.join(question_stack)
390
  if sum_question:
391
- res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
392
  last_answer = ''
393
 
394
  i = question_level
@@ -397,10 +396,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
397
  level_stack.pop()
398
  question_stack.append(question)
399
  level_stack.append(question_level)
400
- if last_answer:
401
  sum_question = '\n'.join(question_stack)
402
  if sum_question:
403
- res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
404
  return res
405
  elif re.search(r"\.docx$", filename, re.IGNORECASE):
406
  docx_parser = Docx()
 
22
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
23
  from docx import Document
24
  from PIL import Image
25
+ from markdown import markdown
26
  class Excel(ExcelParser):
27
  def __call__(self, fnm, binary=None, callback=None):
28
  if not binary:
 
375
  code_block = False
376
  level_index = [-1] * 7
377
  for index, l in enumerate(lines):
 
 
378
  if l.strip().startswith('```'):
379
  code_block = not code_block
380
  question_level, question = 0, ''
 
384
  if not question_level or question_level > 6: # not a question
385
  last_answer = f'{last_answer}\n{l}'
386
  else: # is a question
387
+ if last_answer.strip():
388
  sum_question = '\n'.join(question_stack)
389
  if sum_question:
390
+ res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
391
  last_answer = ''
392
 
393
  i = question_level
 
396
  level_stack.pop()
397
  question_stack.append(question)
398
  level_stack.append(question_level)
399
+ if last_answer.strip():
400
  sum_question = '\n'.join(question_stack)
401
  if sum_question:
402
+ res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
403
  return res
404
  elif re.search(r"\.docx$", filename, re.IGNORECASE):
405
  docx_parser = Docx()
requirements.txt CHANGED
@@ -143,3 +143,4 @@ webdriver-manager==4.0.1
143
  cn2an==0.5.22
144
  roman-numbers==1.0.2
145
  word2number==1.1
 
 
143
  cn2an==0.5.22
144
  roman-numbers==1.0.2
145
  word2number==1.1
146
+ markdown==3.6
requirements_arm.txt CHANGED
@@ -143,4 +143,5 @@ selenium==4.21.0
143
  webdriver-manager==4.0.1
144
  cn2an==0.5.22
145
  roman-numbers==1.0.2
146
- word2number==1.1
 
 
143
  webdriver-manager==4.0.1
144
  cn2an==0.5.22
145
  roman-numbers==1.0.2
146
+ word2number==1.1
147
+ markdown==3.6
requirements_dev.txt CHANGED
@@ -129,3 +129,4 @@ html_text==0.6.2
129
  cn2an==0.5.22
130
  roman-numbers==1.0.2
131
  word2number==1.1
 
 
129
  cn2an==0.5.22
130
  roman-numbers==1.0.2
131
  word2number==1.1
132
+ markdown==3.6