KevinHuSh committed on
Commit
7e1a9f0
·
1 Parent(s): 8c14459

fix gb2312 encoding issue (#394)

Browse files

### What problem does this PR solve?

Issue link: #384
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (2) hide show
  1. rag/app/naive.py +5 -3
  2. rag/nlp/search.py +1 -1
rag/app/naive.py CHANGED
@@ -14,8 +14,7 @@ from io import BytesIO
14
  from docx import Document
15
  import re
16
  from deepdoc.parser.pdf_parser import PlainParser
17
- from rag.app import laws
18
- from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
19
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
20
  from rag.settings import cron_logger
21
 
@@ -140,7 +139,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
140
  callback(0.1, "Start to parse.")
141
  txt = ""
142
  if binary:
143
- txt = binary.decode("utf-8")
 
 
 
144
  else:
145
  with open(filename, "r") as f:
146
  while True:
 
14
  from docx import Document
15
  import re
16
  from deepdoc.parser.pdf_parser import PlainParser
17
+ from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks
 
18
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
19
  from rag.settings import cron_logger
20
 
 
139
  callback(0.1, "Start to parse.")
140
  txt = ""
141
  if binary:
142
+ try:
143
+ txt = binary.decode("utf-8")
144
+ except Exception as e:
145
+ txt = binary.decode("gb2312")
146
  else:
147
  with open(filename, "r") as f:
148
  while True:
rag/nlp/search.py CHANGED
@@ -237,7 +237,7 @@ class Dealer:
237
  pieces_.append(t)
238
  es_logger.info("{} => {}".format(answer, pieces_))
239
  if not pieces_:
240
- return answer
241
 
242
  ans_v, _ = embd_mdl.encode(pieces_)
243
  assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
 
237
  pieces_.append(t)
238
  es_logger.info("{} => {}".format(answer, pieces_))
239
  if not pieces_:
240
+ return answer, set([])
241
 
242
  ans_v, _ = embd_mdl.encode(pieces_)
243
  assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(