Kevin Hu committed
Commit 4dd5c5e · 1 Parent(s): eba4697

Removing invisible chars before tokenization. (#4233)


### What problem does this PR solve?

#4223

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (1)
  1. rag/nlp/rag_tokenizer.py +1 -0
rag/nlp/rag_tokenizer.py CHANGED
@@ -264,6 +264,7 @@ class RagTokenizer:
         return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks]
 
     def tokenize(self, line):
+        line = re.sub(r"\W+", " ", line)
         line = self._strQ2B(line).lower()
         line = self._tradi2simp(line)
         zh_num = len([1 for c in line if is_chinese(c)])
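
To make the fix concrete, here is a minimal sketch (not part of the repository; the sample string is made up) of what the added substitution does before the rest of `tokenize()` runs. Invisible characters such as a zero-width space (U+200B) or a byte-order mark (U+FEFF) are non-word characters in Python's `re` module, so `\W+` collapses them into ordinary spaces:

```python
import re

# Hypothetical input: text scraped from a PDF or web page that carries a
# zero-width space (U+200B) and a byte-order mark (U+FEFF) between words.
raw = "invoice\u200bnumber\ufeffparser"

# The line added in this commit: replace every run of non-word characters,
# including the invisible ones, with a single space before tokenization.
cleaned = re.sub(r"\W+", " ", raw)

print(repr(raw))      # 'invoice\u200bnumber\ufeffparser'
print(repr(cleaned))  # 'invoice number parser'
```

Note that `\W` also matches visible punctuation, so hyphens, colons, and similar separators are normalized to spaces as well; the cleaned string then goes through the existing full-width-to-half-width (`_strQ2B`) and traditional-to-simplified (`_tradi2simp`) steps.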