Spaces:

StarPigeon
/

ViDove

Sleeping

App Files Files Community

DWizard committed on Mar 26, 2023

Commit

1de6702

1 Parent(s): d438792

optimize spell check

Browse files

Former-commit-id: 26b78cae5e1d9aaf72a21fd13900a9fdb5f381d4

Files changed (1) hide show

SRT.py +32 -9

SRT.py CHANGED Viewed

@@ -178,16 +178,20 @@ class SRT_script():
         # TODO: variety of translation
         # load term dictionary
-        # with open("dict_enzh.csv",'r', encoding='utf-8') as f:
-        #     csv_reader = reader(f)
-        #     term_enzh_dict = {rows[0]:rows[1] for rows in csv_reader}
         # change term
         for seg in self.segments:
             ready_words = seg.source_text.split(" ")
             for i in range(len(ready_words)):
                 word = ready_words[i]
-                ready_words[i] = self.spell_correction(word, 0)
                 # if word[-2:] == ".\n":
                 #     if word[:-2].lower() in term_enzh_dict:
                 #         new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
@@ -206,15 +210,22 @@ class SRT_script():
         ## known bug: I've will be replaced because i've is not in the dict
-        # import enchant
-        # dict = enchant.Dict('en_US')
-        # term_spellDict = enchant.PyPWL('project-t/finetune_data/dict_freq.txt')
         for seg in self.segments:
             ready_words = seg.source_text.split(" ")
             for i in range(len(ready_words)):
                 word = ready_words[i]
-                ready_words[i] = self.spell_correction(word, 1)
                 # if word[-2:] == ".\n":
                 #     real_word = word[:-2]
                 #     if not dict.check(real_word.lower()):
@@ -265,4 +276,16 @@ class SRT_script():
             if not dict.check(real_word):
                 if term_spellDict.suggest(real_word):  # relax spell check
                     new_word = word.replace(word[:pos],term_spellDict.suggest(real_word)[0])
-        return new_word

         # TODO: variety of translation
         # load term dictionary
+        with open("dict_enzh.csv",'r', encoding='utf-8') as f:
+            term_enzh_dict = {rows[0]:rows[1] for rows in reader(f)}
         # change term
         for seg in self.segments:
             ready_words = seg.source_text.split(" ")
             for i in range(len(ready_words)):
                 word = ready_words[i]
+                [real_word, pos] = self.get_real_word(word)
+                if real_word in term_enzh_dict:
+                    new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
+                else:
+                    new_word = word
+                ready_words[i] = new_word
                 # if word[-2:] == ".\n":
                 #     if word[:-2].lower() in term_enzh_dict:
                 #         new_word = word.replace(word[:-2], term_enzh_dict.get(word[:-2].lower()))
         ## known bug: I've will be replaced because i've is not in the dict
+        import enchant
+        dict = enchant.Dict('en_US')
+        term_spellDict = enchant.PyPWL('finetune_data/dict_freq.txt')
         for seg in self.segments:
             ready_words = seg.source_text.split(" ")
             for i in range(len(ready_words)):
                 word = ready_words[i]
+                [real_word, pos] = self.get_real_word(word)
+                if not dict.check(real_word):
+                    suggest = term_spellDict.suggest(real_word)
+                    if suggest:  # relax spell check
+                        new_word = word.replace(word[:pos],suggest[0])
+                else:
+                    new_word = word
+                ready_words[i] = new_word
                 # if word[-2:] == ".\n":
                 #     real_word = word[:-2]
                 #     if not dict.check(real_word.lower()):
             if not dict.check(real_word):
                 if term_spellDict.suggest(real_word):  # relax spell check
                     new_word = word.replace(word[:pos],term_spellDict.suggest(real_word)[0])
+        return new_word
+    def get_real_word(self, word:str):
+        if word[-2:] == ".\n":
+            real_word = word[:-2].lower()
+            n = -2
+        elif word[-1:] in [".", "\n", ",", "!", "?"]:
+            real_word = word[:-1].lower()
+            n = -1
+        else:
+            real_word = word.lower()
+            n = 0
+        return real_word, len(word)+n