Kevin Hu
committed on
Commit · 4dd5c5e
Parent(s): eba4697
Removing invisible chars before tokenization. (#4233)
### What problem does this PR solve?

#4223
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- rag/nlp/rag_tokenizer.py +1 -0

rag/nlp/rag_tokenizer.py CHANGED
```diff
@@ -264,6 +264,7 @@ class RagTokenizer:
         return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks]
 
     def tokenize(self, line):
+        line = re.sub(r"\W+", " ", line)
         line = self._strQ2B(line).lower()
         line = self._tradi2simp(line)
         zh_num = len([1 for c in line if is_chinese(c)])
```
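As a rough illustration of the added line (the input string below is hypothetical, not taken from the PR), `re.sub(r"\W+", " ", line)` collapses every run of non-word characters, including invisible code points such as zero-width spaces and BOMs, into a single ASCII space before the rest of `tokenize` runs:

```python
import re

# Hypothetical input containing a zero-width space (U+200B), a BOM (U+FEFF),
# and a non-breaking space (U+00A0), plus ordinary punctuation.
line = "query\u200bterm\ufeff with\u00a0punctuation!!"

# The line added in this commit: replace any run of non-word characters
# (anything outside the Unicode word class [A-Za-z0-9_ plus letters/digits])
# with a single space.
cleaned = re.sub(r"\W+", " ", line)

print(repr(cleaned))  # 'query term with punctuation '
```

Since `\W` matches any character outside the Unicode word class, visible punctuation is collapsed to spaces as well, not only invisible characters.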