Merge branch 'SRT_cleanup' into eason/main
Former-commit-id: 3cedf7bb4e826122d3227968510ee9811a86bcb5
- doc/Installation.md +7 -0
- doc/struct.md +7 -0
- pipeline.py +68 -49
- srt_util/__init__.py +0 -0
- SRT.py → srt_util/srt.py +40 -46
- srt2ass.py → srt_util/srt2ass.py +0 -0
doc/Installation.md
ADDED
@@ -0,0 +1,7 @@
+### **Recommended:**
+We recommend you to configure your environment using [mamba](https://pypi.org/project/mamba/). The following packages are required:
+```
+openai
+openai-whisper
+
+```
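Note that the two packages listed in the diff install under different import names: `openai-whisper` is imported as `whisper`, while `openai` keeps its own name. A minimal sanity check of the environment (not part of this commit; the model name is only an example) might look like this:

```python
# Hypothetical environment check, not part of the repository.
import openai            # the `openai` package
import whisper           # provided by the `openai-whisper` package

# Loading a small model is enough to confirm openai-whisper works end to end.
model = whisper.load_model("base")
print(type(model))
```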
doc/struct.md
ADDED
@@ -0,0 +1,7 @@
+# Structure of Repository
+```
+├── doc                # Baseline implementation of SpMM algorithm.
+├────── struct.md      # Document of repository structure.
+├── finetune_data      #
+└── README.md
+```
pipeline.py
CHANGED
@@ -3,10 +3,10 @@ from pytube import YouTube
import argparse
import os
from tqdm import tqdm
+from srt_util.srt import SrtScript
import stable_whisper
import whisper
-from srt2ass import srt2ass
+from srt_util.srt2ass import srt2ass
import logging
from datetime import datetime
import torch
@@ -15,23 +15,29 @@ import subprocess

import time

+
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--link", help="youtube video link here", default=None, type=str, required=False)
    parser.add_argument("--video_file", help="local video path here", default=None, type=str, required=False)
    parser.add_argument("--audio_file", help="local audio path here", default=None, type=str, required=False)
+    parser.add_argument("--srt_file", help="srt file input path here", default=None, type=str,
+                        required=False)  # New argument
    parser.add_argument("--download", help="download path", default='./downloads', type=str, required=False)
    parser.add_argument("--output_dir", help="translate result path", default='./results', type=str, required=False)
+    parser.add_argument("--video_name",
+                        help="video name, if use video link as input, the name will auto-filled by youtube video name",
+                        default='placeholder', type=str, required=False)
+    parser.add_argument("--model_name", help="model name only support gpt-4 and gpt-3.5-turbo", type=str,
+                        required=False, default="gpt-4")  # default change to gpt-4
    parser.add_argument("--log_dir", help="log path", default='./logs', type=str, required=False)
    parser.add_argument("-only_srt", help="set script output to only .srt file", action='store_true')
    parser.add_argument("-v", help="auto encode script with video", action='store_true')
    args = parser.parse_args()
+
    return args

+
def get_sources(args, download_path, result_path, video_name):
    # get source audio
    audio_path = None
@@ -59,9 +65,9 @@ def get_sources(args, download_path, result_path, video_name):
            print("Error: Audio stream not found")
        except Exception as e:
            print("Connection Error")
+            print(e)
            exit()
+
        video_path = f'{download_path}/video/{video.default_filename}'
        audio_path = '{}/audio/{}'.format(download_path, audio.default_filename)
        audio_file = open(audio_path, "rb")
@@ -72,7 +78,7 @@ def get_sources(args, download_path, result_path, video_name):
        video_path = args.video_file

        if args.audio_file is not None:
+            audio_file = open(args.audio_file, "rb")
            audio_path = args.audio_file
        else:
            output_audio_path = f'{download_path}/audio/{video_name}.mp3'
@@ -84,37 +90,41 @@ def get_sources(args, download_path, result_path, video_name):
        os.mkdir(f'{result_path}/{video_name}')

    if args.audio_file is not None:
+        audio_file = open(args.audio_file, "rb")
        audio_path = args.audio_file
        pass

    return audio_path, audio_file, video_path, video_name

+
+def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file=None, whisper_model='large',
+                  method="stable"):
    # Instead of using the script_en variable directly, we'll use script_input
+    if srt_file_en is not None:
+        srt = SrtScript.parse_from_srt_file(srt_file_en)
    else:
        # using whisper to perform speech-to-text and save it in <video name>_en.txt under RESULT PATH.
        srt_file_en = "{}/{}/{}_en.srt".format(result_path, video_name, video_name)
        if not os.path.exists(srt_file_en):
+
+            devices = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # use OpenAI API for transcribe
            if method == "api":
+                transcript = openai.Audio.transcribe("whisper-1", audio_file)

+            # use local whisper model
            elif method == "basic":
+                model = whisper.load_model(whisper_model,
+                                            device=devices)  # using base model in local machine (may use large model on our server)
                transcript = model.transcribe(audio_path)

            # use stable-whisper
            elif method == "stable":

                # use cuda if available
+                model = stable_whisper.load_model(whisper_model, device=devices)
+                transcript = model.transcribe(audio_path, regroup=False,
+                                              initial_prompt="Hello, welcome to my lecture. Are you good my friend?")
                (
                    transcript
                    .split_by_punctuation(['.', '。', '?'])
@@ -126,14 +136,15 @@ def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file =
            else:
                raise ValueError("invalid speech to text method")

+            srt = SrtScript(transcript['segments'])  # read segments to SRT class

        else:
+            srt = SrtScript.parse_from_srt_file(srt_file_en)
    return srt_file_en, srt

+
# Split the video script by sentences and create chunks within the token limit
+def script_split(script_in, chunk_size=1000):
    script_split = script_in.split('\n\n')
    script_arr = []
    range_arr = []
@@ -143,20 +154,21 @@ def script_split(script_in, chunk_size = 1000):
    for sentence in script_split:
        if len(script) + len(sentence) + 1 <= chunk_size:
            script += sentence + '\n\n'
+            end += 1
        else:
            range_arr.append((start, end))
+            start = end + 1
            end += 1
            script_arr.append(script.strip())
            script = sentence + '\n\n'
    if script.strip():
        script_arr.append(script.strip())
+        range_arr.append((start, len(script_split) - 1))

    assert len(script_arr) == len(range_arr)
    return script_arr, range_arr

+
def check_translation(sentence, translation):
    """
    check merge sentence issue from openai translation
@@ -187,24 +199,25 @@ def get_response(model_name, sentence):
    if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
        response = openai.ChatCompletion.create(
            model=model_name,
+            messages=[
+                # {"role": "system", "content": "You are a helpful assistant that translates English to Chinese and have decent background in starcraft2."},
+                # {"role": "system", "content": "Your translation has to keep the orginal format and be as accurate as possible."},
+                # {"role": "system", "content": "Your translation needs to be consistent with the number of sentences in the original."},
+                # {"role": "system", "content": "There is no need for you to add any comments or notes."},
+                # {"role": "user", "content": 'Translate the following English text to Chinese: "{}"'.format(sentence)}
+
+                {"role": "system",
+                 "content": "你是一个翻译助理,你的任务是翻译星际争霸视频,你会被提供一个按行分割的英文段落,你需要在保证句意和行数的情况下输出翻译后的文本。"},
                {"role": "user", "content": sentence}
            ],
            temperature=0.15
        )

    return response['choices'][0]['message']['content'].strip()
+
+
# Translate and save
+def translate(srt, script_arr, range_arr, model_name, video_name, video_link, attempts_count=5):
    """
    Translates the given script array into another language using the chatgpt and writes to the SRT file.

@@ -226,7 +239,7 @@ def translate(srt, script_arr, range_arr, model_name, video_name, video_link, at
    previous_length = 0
    for sentence, range in tqdm(zip(script_arr, range_arr)):
        # update the range based on previous length
+        range = (range[0] + previous_length, range[1] + previous_length)

        # using chatgpt model
        print(f"now translating sentences {range}")
@@ -240,7 +253,7 @@ def translate(srt, script_arr, range_arr, model_name, video_name, video_link, at
            while not check_translation(sentence, translate) and attempts_count > 0:
                translate = get_response(model_name, sentence)
                attempts_count -= 1
+
            # if failure still happen, split into smaller tokens
            if attempts_count == 0:
                single_sentences = sentence.split("\n\n")
@@ -252,11 +265,11 @@ def translate(srt, script_arr, range_arr, model_name, video_name, video_link, at
                    else:
                        translate += get_response(model_name, single_sentence) + "\n\n"
                    # print(single_sentence, translate.split("\n\n")[-2])
+                logging.info("solved by individually translation!")

        except Exception as e:
+            logging.debug("An error has occurred during translation:", e)
+            print("An error has occurred during translation:", e)
            print("Retrying... the script will continue after 30 seconds.")
            time.sleep(30)
            flag = True
@@ -284,9 +297,9 @@ def main():
    RESULT_PATH = args.output_dir
    if not os.path.exists(RESULT_PATH):
        os.mkdir(RESULT_PATH)
+
    # set video name as the input file name if not specified
+    if args.video_name == 'placeholder':
        # set video name to upload file name
        if args.video_file is not None:
            VIDEO_NAME = args.video_file.split('/')[-1].split('.')[0]
@@ -303,7 +316,9 @@ def main():

    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
+    logging.basicConfig(level=logging.INFO, handlers=[
+        logging.FileHandler("{}/{}_{}.log".format(args.log_dir, VIDEO_NAME, datetime.now().strftime("%m%d%Y_%H%M%S")),
+                            'w', encoding='utf-8')])
    logging.info("---------------------Video Info---------------------")
    logging.info("Video name: {}, translation model: {}, video link: {}".format(VIDEO_NAME, args.model_name, args.link))

@@ -346,12 +361,16 @@ def main():
    if args.v:
        logging.info("encoding video file")
        if args.only_srt:
+            os.system(
+                f'ffmpeg -i {video_path} -vf "subtitles={RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt" {RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}.mp4')
        else:
+            os.system(
+                f'ffmpeg -i {video_path} -vf "subtitles={RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.ass" {RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}.mp4')

    end_time = time.time()
+    logging.info(
+        "Pipeline finished, time duration:{}".format(time.strftime("%H:%M:%S", time.gmtime(end_time - start_time))))
+

if __name__ == "__main__":
+    main()
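For orientation, after this refactor pipeline.py obtains its subtitle object through `srt_util.srt.SrtScript` along the two branches of `get_srt_class`: parse an existing file when `--srt_file` is given, otherwise transcribe and wrap the segments. A condensed sketch of that flow (the helper name and default model are illustrative; the calls mirror the ones in the diff, using the "basic" whisper branch):

```python
# Illustrative sketch of the two branches of get_srt_class after the refactor.
import whisper
from srt_util.srt import SrtScript

def load_script(srt_file_en, audio_path, whisper_model="base"):
    if srt_file_en is not None:
        # --srt_file was provided: reuse the existing subtitles, no transcription.
        return SrtScript.parse_from_srt_file(srt_file_en)
    # Otherwise run speech-to-text and wrap the resulting segments.
    model = whisper.load_model(whisper_model)
    transcript = model.transcribe(audio_path)
    return SrtScript(transcript['segments'])
```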
srt_util/__init__.py
ADDED
File without changes
SRT.py → srt_util/srt.py
RENAMED
@@ -8,7 +8,7 @@ import openai
from tqdm import tqdm


-class SRT_segment(object):
+class SrtSegment(object):
    def __init__(self, *args) -> None:
        if isinstance(args[0], dict):
            segment = args[0]
@@ -64,28 +64,23 @@ class SRT_segment(object):
        self.end = seg.end
        self.end_ms = seg.end_ms
        self.duration = f"{self.start_time_str} --> {self.end_time_str}"
-        pass

    def __add__(self, other):
        """
        Merge the segment seg with the current segment, and return the new constructed segment.
        No in-place modification.
+        This is used for '+' operator.
        :param other: Another segment that is strictly next to added segment.
        :return: new segment of the two sub-segments
        """

        result = deepcopy(self)
-        result.translation += f' {other.translation}'
-        result.end_time_str = other.end_time_str
-        result.end = other.end
-        result.end_ms = other.end_ms
-        result.duration = f"{self.start_time_str} --> {result.end_time_str}"
+        result.merge_seg(other)
        return result

-    def remove_trans_punc(self):
+    def remove_trans_punc(self) -> None:
        """
-        remove punctuations in translation text
+        remove CN punctuations in translation text
        :return: None
        """
        punc_cn = ",。!?"
@@ -102,12 +97,9 @@ class SRT_segment(object):
        return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'


-class SRT_script():
+class SrtScript(object):
    def __init__(self, segments) -> None:
-        self.segments = []
-        for seg in segments:
-            srt_seg = SRT_segment(seg)
-            self.segments.append(srt_seg)
+        self.segments = [SrtSegment(seg) for seg in segments]

    @classmethod
    def parse_from_srt_file(cls, path: str):
@@ -115,13 +107,12 @@ class SRT_script():
            script_lines = [line.rstrip() for line in f.readlines()]

        segments = []
+        for i in range(0, len(script_lines), 4):
+            segments.append(list(script_lines[i:i + 4]))

        return cls(segments)

+    def merge_segs(self, idx_list) -> SrtSegment:
        """
        Merge entire segment list to a single segment
        :param idx_list: List of index to merge
@@ -147,6 +138,7 @@ class SRT_script():
        logging.info("Forming whole sentences...")
        merge_list = []  # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
        sentence = []
+        # Get each entire sentence of distinct segments, fill indices to merge_list
        for i, seg in enumerate(self.segments):
            if seg.source_text[-1] in ['.', '!', '?'] and len(seg.source_text) > 10 and 'vs.' not in seg.source_text:
                sentence.append(i)
@@ -155,6 +147,7 @@ class SRT_script():
            else:
                sentence.append(i)

+        # Reconstruct segments, each with an entire sentence
        segments = []
        for idx_list in merge_list:
            if len(idx_list) > 1:
@@ -254,11 +247,10 @@ class SRT_script():
                max_num -= 1
                if i == len(lines) - 1:
                    break
+            if lines[i][0] in [' ', '\n']:
                lines[i] = lines[i][1:]
            seg.translation = lines[i]

    def split_seg(self, seg, text_threshold, time_threshold):
        # evenly split seg to 2 parts and add new seg into self.segments

@@ -314,14 +306,14 @@ class SRT_script():
        seg1_dict['text'] = src_seg1
        seg1_dict['start'] = start_seg1
        seg1_dict['end'] = end_seg1
+        seg1 = SrtSegment(seg1_dict)
        seg1.translation = trans_seg1

        seg2_dict = {}
        seg2_dict['text'] = src_seg2
        seg2_dict['start'] = start_seg2
        seg2_dict['end'] = end_seg2
+        seg2 = SrtSegment(seg2_dict)
        seg2.translation = trans_seg2

        result_list = []
@@ -344,7 +336,7 @@ class SRT_script():
        for i, seg in enumerate(self.segments):
            if len(seg.translation) > text_threshold and (seg.end - seg.start) > time_threshold:
                seg_list = self.split_seg(seg, text_threshold, time_threshold)
+                logging.info("splitting segment {} in to {} parts".format(i + 1, len(seg_list)))
                segments += seg_list
            else:
                segments.append(seg)
@@ -376,39 +368,41 @@ class SRT_script():
        ## force term correction
        logging.info("performing force term correction")
        # load term dictionary
+        with open("../finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
            term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
+
        keywords = list(term_enzh_dict.keys())
        keywords.sort(key=lambda x: len(x), reverse=True)

        for word in keywords:
            for i, seg in enumerate(self.segments):
                if word in seg.source_text.lower():
+                    seg.source_text = re.sub(fr"({word}es|{word}s?)\b", "{}".format(term_enzh_dict.get(word)),
+                                             seg.source_text, flags=re.IGNORECASE)
+                    logging.info(
+                        "replace term: " + word + " --> " + term_enzh_dict.get(word) + " in time stamp {}".format(
+                            i + 1))
                    logging.info("source text becomes: " + seg.source_text)
+
    comp_dict = []
+
+    def fetchfunc(self, word, threshold):
        import enchant
        result = word
        distance = 0
+        threshold = threshold * len(word)
+        if len(self.comp_dict) == 0:
            with open("./finetune_data/dict_freq.txt", 'r', encoding='utf-8') as f:
+                self.comp_dict = {rows[0]: 1 for rows in reader(f)}
        temp = ""
        for matched in self.comp_dict:
            if (" " in matched and " " in word) or (" " not in matched and " " not in word):
+                if enchant.utils.levenshtein(word, matched) < enchant.utils.levenshtein(word, temp):
                    temp = matched
        if enchant.utils.levenshtein(word, temp) < threshold:
            distance = enchant.utils.levenshtein(word, temp)
            result = temp
+        return distance, result

    def extract_words(self, sentence, n):
        # this function split the sentence to chunks by n of words
@@ -417,9 +411,9 @@ class SRT_script():
        words = sentence.split()
        res = []
        for j in range(n, 0, -1):
+            res += [words[i:i + j] for i in range(len(words) - j + 1)]
+        return res
+
    def spell_check_term(self):
        logging.info("performing spell check")
        import enchant
@@ -435,14 +429,14 @@ class SRT_script():
        distance, correct_term = self.fetchfunc(real_word, 0.3)
        if distance != 0:
            seg.source_text = re.sub(word[:pos], correct_term, seg.source_text, flags=re.IGNORECASE)
+            logging.info(
+                "replace: " + word[:pos] + " to " + correct_term + "\t distance = " + str(distance))

+    def get_real_word(self, word_list: list):
        word = ""
        for w in word_list:
            word += f"{w} "
+        word = word[:-1]  # "this, is"
        if word[-2:] == ".\n":
            real_word = word[:-2].lower()
            n = -2
@@ -460,8 +454,8 @@ class SRT_script():
        # return a string with pure source text
        result = ""
        for i, seg in enumerate(self.segments):
+            result += f'{seg.source_text}\n\n\n'  # f'SENTENCE {i+1}: {seg.source_text}\n\n\n'
+
        return result

    def reform_src_str(self):
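The renamed `SrtSegment` keeps the `__add__` path shown above: copy the left segment, then call `merge_seg` on the copy, so neither input is modified. A small usage sketch, assuming the dict constructor accepts the same `text`/`start`/`end` keys that `split_seg` passes (times in seconds; the printed values are expectations, not output copied from the repo):

```python
from srt_util.srt import SrtSegment

# Two adjacent segments, built from dicts the same way split_seg does.
first = SrtSegment({'text': 'Hello and welcome', 'start': 0.0, 'end': 1.5})
second = SrtSegment({'text': 'to the lecture.', 'start': 1.5, 'end': 3.0})

# __add__ deep-copies `first` and merges `second` into the copy via merge_seg.
merged = first + second
print(merged.source_text)  # expected: the two source texts joined
print(merged.duration)     # expected: start of `first` --> end of `second`
```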
srt2ass.py → srt_util/srt2ass.py
RENAMED
File without changes