CanYing0913 committed
Commit 7d74f8e · 1 Parent(s): cba75b6
Update srt.py and file hierarchy
Former-commit-id: d36b43736cb3447da3e26e3caef1e351bf431dc3
- doc/Installation.md +7 -0
- doc/struct.md +7 -0
- pipeline.py +5 -5
- srt_util/__init__.py +0 -0
- SRT.py → srt_util/srt.py +16 -23
- srt2ass.py → srt_util/srt2ass.py +0 -0
doc/Installation.md
ADDED
@@ -0,0 +1,7 @@
+### **Recommended:**
+We recommend you to configure your environment using [mamba](https://pypi.org/project/mamba/). The following packages are required:
+```
+openai
+openai-whisper
+
+```
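Editor's note (not part of the commit): a minimal sketch for checking that the two listed packages are importable in the configured environment; the import names `openai` and `whisper` follow the packages' usual layout and are assumptions here.

```python
# Minimal environment check for the packages listed in doc/Installation.md.
from importlib.metadata import version

import openai            # the openai package
import whisper           # installed via the openai-whisper package

print("openai:", version("openai"))
print("openai-whisper:", version("openai-whisper"))
print("whisper.load_model available:", hasattr(whisper, "load_model"))
```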
doc/struct.md
ADDED
@@ -0,0 +1,7 @@
+# Structure of Repository
+```
+├── doc                 # Baseline implementation of SpMM algorithm.
+├────── struct.md       # Document of repository structure.
+├── finetune_data       #
+└── README.md
+```
pipeline.py
CHANGED
@@ -3,10 +3,10 @@ from pytube import YouTube
 import argparse
 import os
 from tqdm import tqdm
-from SRT import SRT_script
+from srt_util.srt import SrtScript
 import stable_whisper
 import whisper
-from srt2ass import srt2ass
+from srt_util.srt2ass import srt2ass

 import subprocess

@@ -85,7 +85,7 @@ def get_sources(args, download_path, result_path, video_name):
 def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file = None, whisper_model = 'large', method = "stable"):
     # Instead of using the script_en variable directly, we'll use script_input
     if srt_file_en is not None:
-        srt = SRT_script.parse_from_srt_file(srt_file_en)
+        srt = SrtScript.parse_from_srt_file(srt_file_en)
     else:
         # using whisper to perform speech-to-text and save it in <video name>_en.txt under RESULT PATH.
         srt_file_en = "{}/{}/{}_en.srt".format(result_path, video_name, video_name)
@@ -115,10 +115,10 @@ def get_srt_class(srt_file_en, result_path, video_name, audio_path, audio_file =
             else:
                 raise ValueError("invalid speech to text method")

-            srt = SRT_script(transcript['segments'])  # read segments to SRT class
+            srt = SrtScript(transcript['segments'])  # read segments to SRT class

         else:
-            srt = SRT_script.parse_from_srt_file(srt_file_en)
+            srt = SrtScript.parse_from_srt_file(srt_file_en)
     return srt_file_en, srt

 # Split the video script by sentences and create chunks within the token limit
srt_util/__init__.py
ADDED
File without changes
SRT.py → srt_util/srt.py
RENAMED
@@ -7,7 +7,7 @@ from datetime import timedelta
 import openai


-class SRT_segment(object):
+class SrtSegment(object):
     def __init__(self, *args) -> None:
         if isinstance(args[0], dict):
             segment = args[0]
@@ -63,28 +63,23 @@ class SRT_segment(object):
         self.end = seg.end
         self.end_ms = seg.end_ms
         self.duration = f"{self.start_time_str} --> {self.end_time_str}"
-        pass

     def __add__(self, other):
         """
         Merge the segment seg with the current segment, and return the new constructed segment.
         No in-place modification.
+        This is used for '+' operator.
         :param other: Another segment that is strictly next to added segment.
         :return: new segment of the two sub-segments
         """
         # assert other.start_ms == self.end_ms, f"cannot merge discontinuous segments."
         result = deepcopy(self)
-        result.source_text += f' {other.source_text}'
-        result.translation += f' {other.translation}'
-        result.end_time_str = other.end_time_str
-        result.end = other.end
-        result.end_ms = other.end_ms
-        result.duration = f"{self.start_time_str} --> {self.end_time_str}"
+        result.merge_seg(other)
         return result

-    def remove_trans_punc(self):
+    def remove_trans_punc(self) -> None:
         """
-        remove punctuations in translation text
+        remove CN punctuations in translation text
         :return: None
         """
         punc_cn = "，。！？"
@@ -101,12 +96,9 @@ class SRT_segment(object):
         return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'


-class SRT_script():
+class SrtScript(object):
     def __init__(self, segments) -> None:
-        self.segments = []
-        for seg in segments:
-            srt_seg = SRT_segment(seg)
-            self.segments.append(srt_seg)
+        self.segments = [SrtSegment(seg) for seg in segments]

     @classmethod
     def parse_from_srt_file(cls, path: str):
@@ -114,13 +106,12 @@ class SRT_script():
             script_lines = [line.rstrip() for line in f.readlines()]

         segments = []
-        for i in range(len(script_lines)):
-            if i % 4 == 0:
-                segments.append(list(script_lines[i:i + 4]))
+        for i in range(0, len(script_lines), 4):
+            segments.append(list(script_lines[i:i + 4]))

         return cls(segments)

-    def merge_segs(self, idx_list) -> SRT_segment:
+    def merge_segs(self, idx_list) -> SrtSegment:
         """
         Merge entire segment list to a single segment
         :param idx_list: List of index to merge
@@ -145,6 +136,7 @@ class SRT_script():
         """
         merge_list = []  # a list of indices that should be merged e.g. [[0], [1, 2, 3, 4], [5, 6], [7]]
         sentence = []
+        # Get each entire sentence of distinct segments, fill indices to merge_list
         for i, seg in enumerate(self.segments):
             if seg.source_text[-1] in ['.', '!', '?'] and len(seg.source_text) > 10 and 'vs.' not in seg.source_text:
                 sentence.append(i)
@@ -153,6 +145,7 @@ class SRT_script():
             else:
                 sentence.append(i)

+        # Reconstruct segments, each with an entire sentence
         segments = []
         for idx_list in merge_list:
             segments.append(self.merge_segs(idx_list))
@@ -327,14 +320,14 @@ class SRT_script():
         seg1_dict['text'] = src_seg1
         seg1_dict['start'] = start_seg1
         seg1_dict['end'] = end_seg1
-        seg1 = SRT_segment(seg1_dict)
+        seg1 = SrtSegment(seg1_dict)
         seg1.translation = trans_seg1

         seg2_dict = {}
         seg2_dict['text'] = src_seg2
         seg2_dict['start'] = start_seg2
         seg2_dict['end'] = end_seg2
-        seg2 = SRT_segment(seg2_dict)
+        seg2 = SrtSegment(seg2_dict)
         seg2.translation = trans_seg2

         result_list = []
@@ -386,7 +379,7 @@ class SRT_script():
         ## force term correction

         # load term dictionary
-        with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
+        with open("../finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
             term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}

         # change term
@@ -455,7 +448,7 @@ class SRT_script():
             pos = uncover(word)[1]
             new_word = word
             if arg == 0:  # term translate mode
-                with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
+                with open("../finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
                     term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
                 if real_word in term_enzh_dict:
                     new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
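Editor's note (not part of the diff): a hypothetical usage sketch of the renamed classes. It assumes whisper-style segment dicts with 'text', 'start' and 'end' keys, as implied by the seg1_dict/seg2_dict construction above; the example strings and paths are invented.

```python
from srt_util.srt import SrtScript, SrtSegment

# Build two adjacent segments from whisper-style dicts (keys as in seg1_dict above).
seg_a = SrtSegment({'text': 'Hello there,', 'start': 0.0, 'end': 1.2})
seg_b = SrtSegment({'text': 'and welcome back.', 'start': 1.2, 'end': 2.5})

# '+' deep-copies the left segment and merges the right one via merge_seg().
merged = seg_a + seg_b

# Translations are attached after construction, as done for seg1/seg2 above.
merged.translation = "你好，欢迎回来。"
merged.remove_trans_punc()  # strips the CN punctuation "，。！？"

# A whole script can be built from a list of segment dicts,
# or parsed back from an existing .srt file (hypothetical path).
script = SrtScript([{'text': 'Hello there,', 'start': 0.0, 'end': 1.2}])
# script = SrtScript.parse_from_srt_file("results/demo/demo_en.srt")
```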
srt2ass.py → srt_util/srt2ass.py
RENAMED
File without changes