KevinHuSh committed
Commit c61bcde
Parent(s): d7bf446

fix #917 #915 (#946)


### What problem does this PR solve?

#917
#915

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

deepdoc/parser/pdf_parser.py CHANGED

```diff
@@ -392,7 +392,7 @@ class RAGFlowPdfParser:
             b["text"].strip()[-1] in ",;:'\",、‘“;:-",
             len(b["text"].strip()) > 1 and b["text"].strip(
             )[-2] in ",;:'\",‘“、;:",
-            b["text"].strip()[0] in "。;?!?”)),,、:",
+            b_["text"].strip()[0] in "。;?!?”)),,、:",
         ]
         # features for not concating
         feats = [
```
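The one-character change above swaps `b` for `b_` in the last concatenation feature. Reading the surrounding features, `b` appears to be the current text box and `b_` the one that follows it, so this feature should fire when the *next* box opens with closing punctuation. A minimal sketch of that reading; the helper name and the box dicts are illustrative stand-ins, not the parser's actual API:

```python
# Illustrative only: RAGFlowPdfParser combines many such features; this
# isolates the one the commit fixes. `b` is assumed to be the current text
# box and `b_` the box that follows it.
def next_box_opens_with_closer(b: dict, b_: dict) -> bool:
    """True when the following box starts with punctuation that can only
    continue a sentence, which argues for concatenating the two boxes."""
    nxt = b_["text"].strip()
    return bool(nxt) and nxt[0] in "。;?!?”)),,、:"

cur = {"text": "该方法显著提升了检索效果"}
nxt = {"text": ",且无需额外训练。"}
print(next_box_opens_with_closer(cur, nxt))  # True -> concatenate the boxes
```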
rag/app/naive.py CHANGED

```diff
@@ -19,6 +19,8 @@ from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from rag.settings import cron_logger
+from rag.utils import num_tokens_from_string
+
 
 class Docx(DocxParser):
     def __init__(self):
@@ -149,8 +151,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             if not l:
                 break
             txt += l
-        sections = txt.split("\n")
-        sections = [(l, "") for l in sections if l]
+        sections = []
+        for sec in txt.split("\n"):
+            if num_tokens_from_string(sec) > 10 * parser_config.get("chunk_token_num", 128):
+                sections.append((sec[:int(len(sec)/2)], ""))
+                sections.append((sec[int(len(sec)/2):], ""))
+            else:
+                sections.append((sec, ""))
+
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
@@ -163,7 +171,7 @@
 
     else:
         raise NotImplementedError(
-            "file type not supported yet(doc, docx, pdf, txt supported)")
+            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
 
     st = timer()
     chunks = naive_merge(
```
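The rewritten txt branch protects the downstream merger from pathological single lines: any line whose token count exceeds ten times the configured `chunk_token_num` is bisected at its character midpoint, and (unlike the old list comprehension) blank lines are no longer filtered out. A self-contained sketch of that logic, substituting a whitespace word count for `num_tokens_from_string`:

```python
# Stand-in tokenizer: the real code uses rag.utils.num_tokens_from_string.
def count_tokens(s: str) -> int:
    return len(s.split())

def split_long_lines(txt: str, chunk_token_num: int = 128) -> list[tuple[str, str]]:
    """Mirrors the new section builder: bisect any line that exceeds
    10x the configured chunk token budget, keep everything else whole."""
    sections = []
    for sec in txt.split("\n"):
        if count_tokens(sec) > 10 * chunk_token_num:
            mid = int(len(sec) / 2)  # character midpoint, not token midpoint
            sections.append((sec[:mid], ""))
            sections.append((sec[mid:], ""))
        else:
            sections.append((sec, ""))
    return sections

print(len(split_long_lines("word " * 2000)))  # 2: one oversized line, bisected
```

Bisecting by characters rather than tokens is cheap and good enough here, since the goal is only to keep any single section within a sane multiple of the chunk budget before `naive_merge` runs.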
rag/nlp/rag_tokenizer.py CHANGED

```diff
@@ -24,7 +24,7 @@ class RagTokenizer:
     def loadDict_(self, fnm):
         print("[HUQIE]:Build trie", fnm, file=sys.stderr)
         try:
-            of = open(fnm, "r")
+            of = open(fnm, "r", encoding='utf-8')
             while True:
                 line = of.readline()
                 if not line:
```
requirements.txt CHANGED

```diff
@@ -136,3 +136,4 @@ BCEmbedding
 loguru==0.7.2
 umap-learn
 fasttext==0.9.2
+volcengine
```
requirements_dev.txt CHANGED

```diff
@@ -124,3 +124,4 @@ ollama==0.1.8
 redis==5.0.4
 fasttext==0.9.2
 umap-learn
+volcengine
```