KevinHuSh committed on
Commit
ae35e13
·
1 Parent(s): ef15d2d

Remove `.doc` from supported processing types (#488)

Browse files

### What problem does this PR solve?
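Addresses #474. The filename check in the `book`, `laws`, `naive`, and `one` chunkers is tightened from `\.docx?$` to `\.docx$`, so legacy `.doc` files no longer match the branch that invokes the DOCX parser; only `.docx` filenames do.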

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (4) hide show
  1. rag/app/book.py +1 -1
  2. rag/app/laws.py +1 -1
  3. rag/app/naive.py +1 -1
  4. rag/app/one.py +1 -1
rag/app/book.py CHANGED
@@ -67,7 +67,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
67
  doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
68
  pdf_parser = None
69
  sections, tbls = [], []
70
- if re.search(r"\.docx?$", filename, re.IGNORECASE):
71
  callback(0.1, "Start to parse.")
72
  doc_parser = DocxParser()
73
  # TODO: table of contents need to be removed
 
67
  doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
68
  pdf_parser = None
69
  sections, tbls = [], []
70
+ if re.search(r"\.docx$", filename, re.IGNORECASE):
71
  callback(0.1, "Start to parse.")
72
  doc_parser = DocxParser()
73
  # TODO: table of contents need to be removed
rag/app/laws.py CHANGED
@@ -93,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
93
  doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
94
  pdf_parser = None
95
  sections = []
96
- if re.search(r"\.docx?$", filename, re.IGNORECASE):
97
  callback(0.1, "Start to parse.")
98
  for txt in Docx()(filename, binary):
99
  sections.append(txt)
 
93
  doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
94
  pdf_parser = None
95
  sections = []
96
+ if re.search(r"\.docx$", filename, re.IGNORECASE):
97
  callback(0.1, "Start to parse.")
98
  for txt in Docx()(filename, binary):
99
  sections.append(txt)
rag/app/naive.py CHANGED
@@ -119,7 +119,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
119
  res = []
120
  pdf_parser = None
121
  sections = []
122
- if re.search(r"\.docx?$", filename, re.IGNORECASE):
123
  callback(0.1, "Start to parse.")
124
  sections, tbls = Docx()(filename, binary)
125
  res = tokenize_table(tbls, doc, eng)
 
119
  res = []
120
  pdf_parser = None
121
  sections = []
122
+ if re.search(r"\.docx$", filename, re.IGNORECASE):
123
  callback(0.1, "Start to parse.")
124
  sections, tbls = Docx()(filename, binary)
125
  res = tokenize_table(tbls, doc, eng)
rag/app/one.py CHANGED
@@ -60,7 +60,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
60
 
61
  eng = lang.lower() == "english" # is_english(cks)
62
 
63
- if re.search(r"\.docx?$", filename, re.IGNORECASE):
64
  callback(0.1, "Start to parse.")
65
  sections = [txt for txt in laws.Docx()(filename, binary) if txt]
66
  callback(0.8, "Finish parsing.")
 
60
 
61
  eng = lang.lower() == "english" # is_english(cks)
62
 
63
+ if re.search(r"\.docx$", filename, re.IGNORECASE):
64
  callback(0.1, "Start to parse.")
65
  sections = [txt for txt in laws.Docx()(filename, binary) if txt]
66
  callback(0.8, "Finish parsing.")