SPACERUNNER99 committed on
Commit
a5e1063
·
verified ·
1 Parent(s): df32805

Update transcribe.py

Browse files
Files changed (1) hide show
  1. transcribe.py +129 -129
transcribe.py CHANGED
@@ -1,130 +1,130 @@
1
- from faster_whisper import WhisperModel
2
- import math
3
-
4
-
5
def word_level_transcribe(audio, max_segment_duration=2.0):
    """Transcribe *audio* with faster-whisper and return per-word timings.

    Loads the "tiny" Whisper model on CPU (12 threads, local files only),
    runs it with VAD filtering (minimum silence of 1500 ms) and word
    timestamps enabled, and prints every recognised word with its timing.

    Args:
        audio: Audio input forwarded unchanged to ``WhisperModel.transcribe``.
        max_segment_duration: Accepted for interface compatibility; this
            function never reads it.

    Returns:
        A list of ``{'word', 'start', 'end'}`` dicts, one per word, in the
        order the words were produced.
    """
    whisper = WhisperModel("tiny", device="cpu", cpu_threads=12, local_files_only=True)
    vad_options = dict(min_silence_duration_ms=1500)
    segment_iter, _info = whisper.transcribe(
        audio,
        vad_filter=True,
        vad_parameters=vad_options,
        word_timestamps=True,
        log_progress=True,
    )
    # The result is lazy; materialising it is what drives the transcription.
    finished_segments = list(segment_iter)

    words = []
    for spoken in (w for seg in finished_segments for w in seg.words):
        print("[%.2fs -> %.2fs] %s" % (spoken.start, spoken.end, spoken.word))
        words.append({'word': spoken.word, 'start': spoken.start, 'end': spoken.end})
    return words
15
-
16
def create_subtitles(wordlevel_info, max_words=5):
    """Group word-level timings into subtitle lines.

    A line is closed as soon as its latest word ends with a punctuation mark
    or the line holds ``max_words`` words; leftover words form a final line.
    Afterwards each line's end time is set to the start time of the line that
    follows it, so consecutive subtitles have no gap between them.

    Args:
        wordlevel_info: Iterable of dicts with ``'word'``, ``'start'`` and
            ``'end'`` keys, in playback order.
        max_words: Maximum number of words per subtitle line (default 5).

    Returns:
        A list of dicts with the keys ``'word'`` (the line's words joined by
        a single space), ``'start'``, ``'end'`` and ``'textcontents'`` (the
        word dicts that make up the line).

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    # NOTE(review): the pipeline writes 'fa' subtitles, yet Persian marks such
    # as '،' and '؟' are not listed here - confirm whether they should be.
    punctuation_marks = {'.', '!', '?', ',', ';', ':', '—', '-', '。'}

    def build_subtitle(words):
        # One subtitle spans from its first word's start to its last word's end.
        return {
            "word": " ".join(item["word"] for item in words),
            "start": words[0]["start"],
            "end": words[-1]["end"],
            "textcontents": list(words),
        }

    subtitles = []
    line = []
    for word_data in wordlevel_info:
        line.append(word_data)
        current_word = word_data['word']
        # Slicing keeps an empty word from raising IndexError.
        ends_with_punct = current_word[-1:] in punctuation_marks
        if ends_with_punct or len(line) >= max_words:
            subtitles.append(build_subtitle(line))
            line = []

    # Words left over after the last punctuation/length break.
    if line:
        subtitles.append(build_subtitle(line))

    # Close gaps: every subtitle stays on screen until the next one starts.
    for previous, following in zip(subtitles, subtitles[1:]):
        previous["end"] = following["start"]

    return subtitles
58
-
59
def format_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds first and only then split into
    fields, so a fraction that rounds up carries into the seconds field
    instead of producing a four-digit millisecond part. Every field is
    zero-padded to the fixed width the SRT format uses.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero because SRT cannot express them.

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
68
-
69
def generate_subtitle_file(language, segments, input_video_name):
    """Write *segments* to an SRT file and return the file's path.

    Each segment becomes one numbered SRT entry (numbering starts at 1) made
    of an index line, a ``start --> end`` timing line and the text line,
    followed by a blank line. The emitted text is byte-for-byte what the
    previous implementation produced, including the space before each
    newline.

    Args:
        language: Language tag embedded in the file name.
        segments: Iterable of dicts with ``'start'``, ``'end'`` and ``'word'``.
        input_video_name: Name embedded in the file name.

    Returns:
        The path ``./src/media/sub-<input_video_name>.<language>.srt``.
    """
    subtitle_file = f"./src/media/sub-{input_video_name}.{language}.srt"
    entries = []
    for index, segment in enumerate(segments, start=1):
        segment_start = format_time(segment['start'])
        segment_end = format_time(segment['end'])
        entries.append(
            f"{index} \n"
            f"{segment_start} --> {segment_end} \n"
            f"{segment['word']} \n"
            "\n"
        )
    # The context manager closes the handle even if the write raises.
    with open(subtitle_file, "w", encoding='utf8') as srt:
        srt.write("".join(entries))
    return subtitle_file
83
-
84
def split_srt_file(input_file, max_chars=3000):
    """Split an SRT file into numbered chunk files of limited size.

    Entries (separated by a blank line) are packed in order into
    ``./src/media/split_<n>.srt`` files, starting at n = 1. A new file is
    started whenever adding the next entry plus its blank-line separator
    would push the current chunk past ``max_chars`` characters. An entry that
    is longer than ``max_chars`` on its own is written to a file by itself;
    no empty file is emitted in front of it.

    Args:
        input_file: Path of the SRT file to split (read as UTF-8).
        max_chars: Character budget per output file.

    Returns:
        The list of output file paths, in the order they were written.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    subtitles = content.strip().split('\n\n')
    output_files = []
    current_file_content = ''

    def flush(chunk):
        # Files are numbered consecutively in the order they are written.
        output_file_name = f'./src/media/split_{len(output_files) + 1}.srt'
        with open(output_file_name, 'w', encoding='utf-8') as output_file:
            output_file.write(chunk.strip())
        output_files.append(output_file_name)

    for subtitle in subtitles:
        entry = subtitle + '\n\n'
        # Only roll over when there is something to write; otherwise an
        # oversized first entry would leave an empty split file behind.
        if current_file_content and len(current_file_content) + len(entry) > max_chars:
            flush(current_file_content)
            current_file_content = entry
        else:
            current_file_content += entry

    # NOTE: an empty input still yields a single (empty) output file, as the
    # previous implementation did.
    if current_file_content:
        flush(current_file_content)

    return output_files
122
-
123
def transcribe(mp3_file):
    """Run the whole subtitle pipeline on *mp3_file*.

    Prints a marker line, obtains word-level timings, groups them into
    subtitle lines, writes them to an SRT file (language tag ``'fa'``, name
    ``'video_subtitled'``) and splits that file into chunks.

    Args:
        mp3_file: Audio input handed to ``word_level_transcribe``.

    Returns:
        The list of chunk file paths produced by ``split_srt_file``.
    """
    print("transcribe")
    words = word_level_transcribe(mp3_file)
    srt_path = generate_subtitle_file('fa', create_subtitles(words), 'video_subtitled')
    return split_srt_file(srt_path)
 
1
+ from faster_whisper import WhisperModel
2
+ import math
3
+
4
+
5
def word_level_transcribe(audio, max_segment_duration=2.0):
    """Transcribe *audio* with faster-whisper and return per-word timings.

    Loads the "tiny" Whisper model on CPU from local files only, runs it with
    VAD filtering (minimum silence of 1500 ms) and word timestamps enabled,
    and prints every recognised word with its timing.

    Args:
        audio: Audio input forwarded unchanged to ``WhisperModel.transcribe``.
        max_segment_duration: Accepted for interface compatibility; this
            function never reads it.

    Returns:
        A list of ``{'word', 'start', 'end'}`` dicts, one per word, in the
        order the words were produced.
    """
    whisper = WhisperModel("tiny", device="cpu", local_files_only=True)
    vad_options = dict(min_silence_duration_ms=1500)
    segment_iter, _info = whisper.transcribe(
        audio,
        vad_filter=True,
        vad_parameters=vad_options,
        word_timestamps=True,
        log_progress=True,
    )
    # The result is lazy; materialising it is what drives the transcription.
    finished_segments = list(segment_iter)

    words = []
    for spoken in (w for seg in finished_segments for w in seg.words):
        print("[%.2fs -> %.2fs] %s" % (spoken.start, spoken.end, spoken.word))
        words.append({'word': spoken.word, 'start': spoken.start, 'end': spoken.end})
    return words
15
+
16
def create_subtitles(wordlevel_info, max_words=5):
    """Group word-level timings into subtitle lines.

    A line is closed as soon as its latest word ends with a punctuation mark
    or the line holds ``max_words`` words; leftover words form a final line.
    Afterwards each line's end time is set to the start time of the line that
    follows it, so consecutive subtitles have no gap between them.

    Args:
        wordlevel_info: Iterable of dicts with ``'word'``, ``'start'`` and
            ``'end'`` keys, in playback order.
        max_words: Maximum number of words per subtitle line (default 5).

    Returns:
        A list of dicts with the keys ``'word'`` (the line's words joined by
        a single space), ``'start'``, ``'end'`` and ``'textcontents'`` (the
        word dicts that make up the line).

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    # NOTE(review): the pipeline writes 'fa' subtitles, yet Persian marks such
    # as '،' and '؟' are not listed here - confirm whether they should be.
    punctuation_marks = {'.', '!', '?', ',', ';', ':', '—', '-', '。'}

    def build_subtitle(words):
        # One subtitle spans from its first word's start to its last word's end.
        return {
            "word": " ".join(item["word"] for item in words),
            "start": words[0]["start"],
            "end": words[-1]["end"],
            "textcontents": list(words),
        }

    subtitles = []
    line = []
    for word_data in wordlevel_info:
        line.append(word_data)
        current_word = word_data['word']
        # Slicing keeps an empty word from raising IndexError.
        ends_with_punct = current_word[-1:] in punctuation_marks
        if ends_with_punct or len(line) >= max_words:
            subtitles.append(build_subtitle(line))
            line = []

    # Words left over after the last punctuation/length break.
    if line:
        subtitles.append(build_subtitle(line))

    # Close gaps: every subtitle stays on screen until the next one starts.
    for previous, following in zip(subtitles, subtitles[1:]):
        previous["end"] = following["start"]

    return subtitles
58
+
59
def format_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds first and only then split into
    fields, so a fraction that rounds up carries into the seconds field
    instead of producing a four-digit millisecond part. Every field is
    zero-padded to the fixed width the SRT format uses.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero because SRT cannot express them.

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
68
+
69
def generate_subtitle_file(language, segments, input_video_name):
    """Write *segments* to an SRT file and return the file's path.

    Each segment becomes one numbered SRT entry (numbering starts at 1) made
    of an index line, a ``start --> end`` timing line and the text line,
    followed by a blank line. The emitted text is byte-for-byte what the
    previous implementation produced, including the space before each
    newline.

    Args:
        language: Language tag embedded in the file name.
        segments: Iterable of dicts with ``'start'``, ``'end'`` and ``'word'``.
        input_video_name: Name embedded in the file name.

    Returns:
        The path ``sub-<input_video_name>.<language>.srt``, relative to the
        current working directory.
    """
    subtitle_file = f"sub-{input_video_name}.{language}.srt"
    entries = []
    for index, segment in enumerate(segments, start=1):
        segment_start = format_time(segment['start'])
        segment_end = format_time(segment['end'])
        entries.append(
            f"{index} \n"
            f"{segment_start} --> {segment_end} \n"
            f"{segment['word']} \n"
            "\n"
        )
    # The context manager closes the handle even if the write raises.
    with open(subtitle_file, "w", encoding='utf8') as srt:
        srt.write("".join(entries))
    return subtitle_file
83
+
84
def split_srt_file(input_file, max_chars=3000):
    """Split an SRT file into numbered chunk files of limited size.

    Entries (separated by a blank line) are packed in order into
    ``split_<n>.srt`` files in the current working directory, starting at
    n = 1. A new file is started whenever adding the next entry plus its
    blank-line separator would push the current chunk past ``max_chars``
    characters. An entry that is longer than ``max_chars`` on its own is
    written to a file by itself; no empty file is emitted in front of it.

    Args:
        input_file: Path of the SRT file to split (read as UTF-8).
        max_chars: Character budget per output file.

    Returns:
        The list of output file paths, in the order they were written.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    subtitles = content.strip().split('\n\n')
    output_files = []
    current_file_content = ''

    def flush(chunk):
        # Files are numbered consecutively in the order they are written.
        output_file_name = f'split_{len(output_files) + 1}.srt'
        with open(output_file_name, 'w', encoding='utf-8') as output_file:
            output_file.write(chunk.strip())
        output_files.append(output_file_name)

    for subtitle in subtitles:
        entry = subtitle + '\n\n'
        # Only roll over when there is something to write; otherwise an
        # oversized first entry would leave an empty split file behind.
        if current_file_content and len(current_file_content) + len(entry) > max_chars:
            flush(current_file_content)
            current_file_content = entry
        else:
            current_file_content += entry

    # NOTE: an empty input still yields a single (empty) output file, as the
    # previous implementation did.
    if current_file_content:
        flush(current_file_content)

    return output_files
122
+
123
def transcribe(mp3_file):
    """Run the whole subtitle pipeline on *mp3_file*.

    Prints a marker line, obtains word-level timings, groups them into
    subtitle lines, writes them to an SRT file (language tag ``'fa'``, name
    ``'video_subtitled'``) and splits that file into chunks.

    Args:
        mp3_file: Audio input handed to ``word_level_transcribe``.

    Returns:
        The list of chunk file paths produced by ``split_srt_file``.
    """
    print("transcribe")
    words = word_level_transcribe(mp3_file)
    srt_path = generate_subtitle_file('fa', create_subtitles(words), 'video_subtitled')
    return split_srt_file(srt_path)