Spaces:
Sleeping
Sleeping
DWizard
committed on
Commit
·
1de6702
1
Parent(s):
d438792
optimize spell check
Browse files
Former-commit-id: 26b78cae5e1d9aaf72a21fd13900a9fdb5f381d4
SRT.py
CHANGED
|
@@ -178,16 +178,20 @@ class SRT_script():
|
|
| 178 |
# TODO: variety of translation
|
| 179 |
|
| 180 |
# load term dictionary
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
# term_enzh_dict = {rows[0]:rows[1] for rows in csv_reader}
|
| 184 |
|
| 185 |
# change term
|
| 186 |
for seg in self.segments:
|
| 187 |
ready_words = seg.source_text.split(" ")
|
| 188 |
for i in range(len(ready_words)):
|
| 189 |
word = ready_words[i]
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
# if word[-2:] == ".\n":
|
| 192 |
# if word[:-2].lower() in term_enzh_dict:
|
| 193 |
# new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
|
|
@@ -206,15 +210,22 @@ class SRT_script():
|
|
| 206 |
## known bug: I've will be replaced because i've is not in the dict
|
| 207 |
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
|
| 213 |
for seg in self.segments:
|
| 214 |
ready_words = seg.source_text.split(" ")
|
| 215 |
for i in range(len(ready_words)):
|
| 216 |
word = ready_words[i]
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
# if word[-2:] == ".\n":
|
| 219 |
# real_word = word[:-2]
|
| 220 |
# if not dict.check(real_word.lower()):
|
|
@@ -265,4 +276,16 @@ class SRT_script():
|
|
| 265 |
if not dict.check(real_word):
|
| 266 |
if term_spellDict.suggest(real_word): # relax spell check
|
| 267 |
new_word = word.replace(word[:pos],term_spellDict.suggest(real_word)[0])
|
| 268 |
-
return new_word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
# TODO: variety of translation
|
| 179 |
|
| 180 |
# load term dictionary
|
| 181 |
+
with open("dict_enzh.csv",'r', encoding='utf-8') as f:
|
| 182 |
+
term_enzh_dict = {rows[0]:rows[1] for rows in reader(f)}
|
|
|
|
| 183 |
|
| 184 |
# change term
|
| 185 |
for seg in self.segments:
|
| 186 |
ready_words = seg.source_text.split(" ")
|
| 187 |
for i in range(len(ready_words)):
|
| 188 |
word = ready_words[i]
|
| 189 |
+
[real_word, pos] = self.get_real_word(word)
|
| 190 |
+
if real_word in term_enzh_dict:
|
| 191 |
+
new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
|
| 192 |
+
else:
|
| 193 |
+
new_word = word
|
| 194 |
+
ready_words[i] = new_word
|
| 195 |
# if word[-2:] == ".\n":
|
| 196 |
# if word[:-2].lower() in term_enzh_dict:
|
| 197 |
# new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
|
|
|
|
| 210 |
## known bug: I've will be replaced because i've is not in the dict
|
| 211 |
|
| 212 |
|
| 213 |
+
import enchant
|
| 214 |
+
dict = enchant.Dict('en_US')
|
| 215 |
+
term_spellDict = enchant.PyPWL('finetune_data/dict_freq.txt')
|
| 216 |
|
| 217 |
for seg in self.segments:
|
| 218 |
ready_words = seg.source_text.split(" ")
|
| 219 |
for i in range(len(ready_words)):
|
| 220 |
word = ready_words[i]
|
| 221 |
+
[real_word, pos] = self.get_real_word(word)
|
| 222 |
+
if not dict.check(real_word):
|
| 223 |
+
suggest = term_spellDict.suggest(real_word)
|
| 224 |
+
if suggest: # relax spell check
|
| 225 |
+
new_word = word.replace(word[:pos],suggest[0])
|
| 226 |
+
else:
|
| 227 |
+
new_word = word
|
| 228 |
+
ready_words[i] = new_word
|
| 229 |
# if word[-2:] == ".\n":
|
| 230 |
# real_word = word[:-2]
|
| 231 |
# if not dict.check(real_word.lower()):
|
|
|
|
| 276 |
if not dict.check(real_word):
|
| 277 |
if term_spellDict.suggest(real_word): # relax spell check
|
| 278 |
new_word = word.replace(word[:pos],term_spellDict.suggest(real_word)[0])
|
| 279 |
+
return new_word
|
| 280 |
+
|
| 281 |
+
def get_real_word(self, word: str):
    """Split a raw token into its lowercase core and the core's length.

    Trailing sentence punctuation and line breaks (".", ",", "!", "?",
    "\n") are removed from the end of ``word``; everything before them is
    the "real" word that is looked up in the dictionaries.

    Any run of those trailing characters is stripped, not just a single
    character or the exact pair ".\n".  Tokens such as "word,\n",
    "word?\n" or "word..." therefore yield "word" instead of keeping
    punctuation inside the lookup key.

    Args:
        word: One whitespace-separated token of a segment's text.

    Returns:
        A ``(real_word, pos)`` tuple.  ``real_word`` is the stripped token
        in lower case.  ``pos`` is the number of leading characters of
        ``word`` that make up the core, so ``word[:pos]`` is the span
        callers replace and ``word[pos:]`` is the trailing punctuation
        that must be kept.
    """
    # rstrip only touches the end of the token, so inner punctuation such
    # as the apostrophe in "i've" or the dot in "3.5" is left intact.
    core = word.rstrip(".,!?\n")
    return core.lower(), len(core)
|