Eason Lu committed on
Commit
e3f9642
·
1 Parent(s): 1a902ed

solve puncs for multilingual

Browse files

Former-commit-id: 2d6150e7eadae2735d85204e283cace5c12789e4

Files changed (1) hide show
  1. src/srt_util/srt.py +65 -22
src/srt_util/srt.py CHANGED
@@ -8,15 +8,48 @@ import logging
8
  import openai
9
  from tqdm import tqdm
10
 
 
11
  punctuation_dict = {
12
- "EN": ". , ? ! : ; - ( ) [ ] { } ' \"",
13
- "ES": ". , ? ! : ; - ( ) [ ] { } ' \" ¡ ¿",
14
- "FR": ". , ? ! : ; - ( ) [ ] { } ' \" « » —",
15
- "DE": ". , ? ! : ; - ( ) [ ] { } ' \" „ “ –",
16
- "RU": ". , ? ! : ; - ( ) [ ] { } ' \" « » —",
17
- "ZH": "。 , ? ! : ; — ( ) ​``【oaicite:1】``​ 《 》 “ ”",
18
- "JA": " ​``【oaicite:0】``​ ",
19
- "AR": ". , ? ! : ; - ( ) [ ] { } ، ؛ ؟ « »",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  }
21
 
22
  class SrtSegment(object):
@@ -99,7 +132,7 @@ class SrtSegment(object):
99
  remove punctuations in translation text
100
  :return: None
101
  """
102
- punc = punctuation_dict[self.tgt_lang]
103
  translator = str.maketrans(punc, ' ' * len(punc))
104
  self.translation = self.translation.translate(translator)
105
 
@@ -162,9 +195,10 @@ class SrtScript(object):
162
  logging.info("Forming whole sentences...")
163
  merge_list = [] # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
164
  sentence = []
 
165
  # Get each entire sentence of distinct segments, fill indices to merge_list
166
  for i, seg in enumerate(self.segments):
167
- if seg.source_text[-1] in ['.', '!', '?'] and len(seg.source_text) > 10 and 'vs.' not in seg.source_text:
168
  sentence.append(i)
169
  merge_list.append(sentence)
170
  sentence = []
@@ -199,6 +233,7 @@ class SrtScript(object):
199
  src_text += '\n\n'
200
 
201
  def inner_func(target, input_str):
 
202
  response = openai.ChatCompletion.create(
203
  model="gpt-4",
204
  messages=[
@@ -270,25 +305,27 @@ class SrtScript(object):
270
 
271
  def split_seg(self, seg, text_threshold, time_threshold):
272
  # evenly split seg to 2 parts and add new seg into self.segments
273
-
274
  # ignore the initial comma to solve the recursion problem
275
- # FIXME: accomodate multilingual setting
 
 
276
  if len(seg.source_text) > 2:
277
- if seg.source_text[:2] == ', ':
278
  seg.source_text = seg.source_text[2:]
279
- if seg.translation[0] == ',':
280
  seg.translation = seg.translation[1:]
281
 
282
  source_text = seg.source_text
283
  translation = seg.translation
284
 
285
  # split the text based on commas
286
- src_commas = [m.start() for m in re.finditer(',', source_text)]
287
- trans_commas = [m.start() for m in re.finditer(',', translation)]
288
  if len(src_commas) != 0:
289
  src_split_idx = src_commas[len(src_commas) // 2] if len(src_commas) % 2 == 1 else src_commas[
290
  len(src_commas) // 2 - 1]
291
  else:
 
292
  src_space = [m.start() for m in re.finditer(' ', source_text)]
293
  if len(src_space) > 0:
294
  src_split_idx = src_space[len(src_space) // 2] if len(src_space) % 2 == 1 else src_space[
@@ -300,13 +337,19 @@ class SrtScript(object):
300
  trans_split_idx = trans_commas[len(trans_commas) // 2] if len(trans_commas) % 2 == 1 else trans_commas[
301
  len(trans_commas) // 2 - 1]
302
  else:
303
- trans_split_idx = len(translation) // 2
 
 
 
 
 
 
304
 
305
- # to avoid split English word
306
- for i in range(trans_split_idx, len(translation)):
307
- if not translation[i].encode('utf-8').isalpha():
308
- trans_split_idx = i
309
- break
310
 
311
  # split the time duration based on text length
312
  time_split_ratio = trans_split_idx / (len(seg.translation) - 1)
 
8
  import openai
9
  from tqdm import tqdm
10
 
11
+ # punctuation dictionary for supported languages
12
  punctuation_dict = {
13
+ "EN": {
14
+ "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \"",
15
+ "comma": ", ",
16
+ "sentence_end": [".", "!", "?", ";"]
17
+ },
18
+ "ES": {
19
+ "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \" ¡ ¿",
20
+ "comma": ", ",
21
+ "sentence_end": [".", "!", "?", ";", "¡", "¿"]
22
+ },
23
+ "FR": {
24
+ "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \" « » —",
25
+ "comma": ", ",
26
+ "sentence_end": [".", "!", "?", ";"]
27
+ },
28
+ "DE": {
29
+ "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \" „ “ –",
30
+ "comma": ", ",
31
+ "sentence_end": [".", "!", "?", ";"]
32
+ },
33
+ "RU": {
34
+ "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \" « » —",
35
+ "comma": ", ",
36
+ "sentence_end": [".", "!", "?", ";"]
37
+ },
38
+ "ZH": {
39
+ "punc_str": "。 , ? ! : ; — ( ) ​``【oaicite:1】``​ 《 》 “ ”",
40
+ "comma": ",",
41
+ "sentence_end": ["。", "!", "?"]
42
+ },
43
+ "JA": {
44
+ "punc_str": "。 、 ? ! : ; ー ( ) ​``【oaicite:0】``​ 「 」 『 』",
45
+ "comma": "、",
46
+ "sentence_end": ["。", "!", "?"]
47
+ },
48
+ "AR": {
49
+ "punc_str": ". , ? ! : ; - ( ) [ ] { } ، ؛ ؟ « »",
50
+ "comma": "، ",
51
+ "sentence_end": [".", "!", "?", ";", "؟"]
52
+ },
53
  }
54
 
55
  class SrtSegment(object):
 
132
  remove punctuations in translation text
133
  :return: None
134
  """
135
+ punc = punctuation_dict[self.tgt_lang]["punc_str"]
136
  translator = str.maketrans(punc, ' ' * len(punc))
137
  self.translation = self.translation.translate(translator)
138
 
 
195
  logging.info("Forming whole sentences...")
196
  merge_list = [] # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
197
  sentence = []
198
+ ending_puncs = punctuation_dict[self.src_lang]["sentence_end"]
199
  # Get each entire sentence of distinct segments, fill indices to merge_list
200
  for i, seg in enumerate(self.segments):
201
+ if seg.source_text[-1] in ending_puncs and len(seg.source_text) > 10 and 'vs.' not in seg.source_text:
202
  sentence.append(i)
203
  merge_list.append(sentence)
204
  sentence = []
 
233
  src_text += '\n\n'
234
 
235
  def inner_func(target, input_str):
236
+ # TODO: accommodate different languages
237
  response = openai.ChatCompletion.create(
238
  model="gpt-4",
239
  messages=[
 
305
 
306
  def split_seg(self, seg, text_threshold, time_threshold):
307
  # evenly split seg to 2 parts and add new seg into self.segments
 
308
  # ignore the initial comma to solve the recursion problem
309
+ src_comma_str = punctuation_dict[self.src_lang]["comma"]
310
+ tgt_comma_str = punctuation_dict[self.tgt_lang]["comma"]
311
+
312
  if len(seg.source_text) > 2:
313
+ if seg.source_text[:2] == src_comma_str:
314
  seg.source_text = seg.source_text[2:]
315
+ if seg.translation[0] == tgt_comma_str:
316
  seg.translation = seg.translation[1:]
317
 
318
  source_text = seg.source_text
319
  translation = seg.translation
320
 
321
  # split the text based on commas
322
+ src_commas = [m.start() for m in re.finditer(src_comma_str, source_text)]
323
+ trans_commas = [m.start() for m in re.finditer(tgt_comma_str, translation)]
324
  if len(src_commas) != 0:
325
  src_split_idx = src_commas[len(src_commas) // 2] if len(src_commas) % 2 == 1 else src_commas[
326
  len(src_commas) // 2 - 1]
327
  else:
328
+ # split the text based on spaces
329
  src_space = [m.start() for m in re.finditer(' ', source_text)]
330
  if len(src_space) > 0:
331
  src_split_idx = src_space[len(src_space) // 2] if len(src_space) % 2 == 1 else src_space[
 
337
  trans_split_idx = trans_commas[len(trans_commas) // 2] if len(trans_commas) % 2 == 1 else trans_commas[
338
  len(trans_commas) // 2 - 1]
339
  else:
340
+ # split the text based on spaces
341
+ trans_space = [m.start() for m in re.finditer(' ', translation)]
342
+ if len(trans_space) > 0:
343
+ trans_split_idx = trans_space[len(trans_space) // 2] if len(trans_space) % 2 == 1 else trans_space[
344
+ len(trans_space) // 2 - 1]
345
+ else:
346
+ trans_split_idx = len(translation) // 2
347
 
348
+ # to avoid split English word
349
+ for i in range(trans_split_idx, len(translation)):
350
+ if not translation[i].encode('utf-8').isalpha():
351
+ trans_split_idx = i
352
+ break
353
 
354
  # split the time duration based on text length
355
  time_split_ratio = trans_split_idx / (len(seg.translation) - 1)