Spaces:
Running
Running
File size: 5,033 Bytes
06e9abd 21c5daf adb8970 06e9abd adb8970 06e9abd a1f27f0 06e9abd bc1f0e6 06e9abd 53cb608 06e9abd bc1f0e6 06e9abd 55bdc8c 06e9abd bc1f0e6 53cb608 06e9abd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
from faster_whisper import WhisperModel
import math
import gradio as gr
from moviepy import VideoFileClip
import requests
def extract_audio(input_video_name):
    """Extract the audio track of a video file into ``audio.mp3``.

    Args:
        input_video_name: Path of the video file to open with moviepy.

    Returns:
        A ``(mp3_file, duration)`` tuple: the path of the written MP3 file
        and the ``duration`` attribute of the audio clip.

    Raises:
        ValueError: If the video has no audio track.
    """
    mp3_file = "audio.mp3"
    video_clip = VideoFileClip(input_video_name)
    try:
        audio_clip = video_clip.audio
        # moviepy sets ``audio`` to None for videos without an audio stream;
        # fail with a clear message instead of an AttributeError on None.
        if audio_clip is None:
            raise ValueError(f"No audio track found in {input_video_name!r}")
        try:
            duration = audio_clip.duration
            print(f"Audio duration: {duration}")
            # Write the audio to a separate file
            audio_clip.write_audiofile(mp3_file)
        finally:
            audio_clip.close()
    finally:
        # Always release the underlying readers, even if writing failed.
        video_clip.close()
    print("Audio extraction successful!")
    return mp3_file, duration
def download_video(url):
    """Download the resource at *url* to ``video.mp4`` and return that path.

    The body is streamed to disk in 8 KiB chunks so the whole file is never
    held in memory.

    Args:
        url: HTTP(S) URL of the video to fetch.

    Returns:
        The local file name the video was written to (``"video.mp4"``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    video_file = "video.mp4"
    # A timeout prevents the request from hanging forever on an unresponsive
    # server; the context manager returns the streamed connection to the pool.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(video_file, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    file.write(chunk)
    print("Video downloaded successfully!")
    return video_file
def word_level_transcribe(audio, max_segment_duration=2.0):  # Set your desired max duration here
    """Transcribe *audio* and return one timing record per recognised word.

    Args:
        audio: Audio input passed straight to ``WhisperModel.transcribe``.
        max_segment_duration: Accepted for interface compatibility; it is not
            read anywhere in this function.

    Returns:
        A list of dicts with the keys ``'word'``, ``'start'`` and ``'end'``,
        in the order the words were produced.
    """
    whisper = WhisperModel("small", device="cpu")
    vad_options = dict(min_silence_duration_ms=1500)
    segment_iter, _info = whisper.transcribe(
        audio,
        vad_filter=True,
        vad_parameters=vad_options,
        word_timestamps=True,
        log_progress=True,
    )
    # Materialising the result is what actually runs the transcription.
    transcribed = list(segment_iter)

    words = []
    for seg in transcribed:
        for w in seg.words:
            print("[%.2fs -> %.2fs] %s" % (w.start, w.end, w.word))
            words.append({'word': w.word, 'start': w.start, 'end': w.end})
    return words
def _build_subtitle(line):
    """Collapse a non-empty run of word dicts into one subtitle entry."""
    # Word strings may carry their own leading/trailing whitespace; strip each
    # one so the joined text has single spaces and no leading blank.
    text = " ".join(
        stripped
        for stripped in (item["word"].strip() for item in line)
        if stripped
    )
    return {
        "word": text,
        "start": line[0]["start"],
        "end": line[-1]["end"],
        "textcontents": list(line),
    }


def create_subtitles(wordlevel_info, max_words=5):
    """Group word-level timings into short subtitle lines.

    A line is closed when the current word ends with a punctuation mark or
    when the line holds ``max_words`` words. Afterwards every subtitle's end
    time is moved to the start of the next one, so consecutive subtitles have
    no gap between them.

    Args:
        wordlevel_info: Iterable of dicts with ``'word'``, ``'start'`` and
            ``'end'`` keys.
        max_words: Maximum number of words per subtitle line (default 5).

    Returns:
        A list of dicts with the keys ``'word'`` (joined text), ``'start'``,
        ``'end'`` and ``'textcontents'`` (the word dicts of that line).
    """
    # Add/remove punctuation as needed. Includes full-width and Arabic-script
    # marks so non-Latin transcripts also break on sentence punctuation.
    punctuation_marks = {
        '.', '!', '?', ',', ';', ':', '—', '-',
        '。', '！', '？', '،', '؛', '؟',
    }
    subtitles = []
    line = []
    for word_data in wordlevel_info:
        line.append(word_data)
        # Strip first so trailing whitespace cannot hide the punctuation.
        current_word = word_data["word"].strip()
        ends_with_punct = bool(current_word) and current_word[-1] in punctuation_marks
        if ends_with_punct or len(line) >= max_words:
            subtitles.append(_build_subtitle(line))
            line = []

    # Add remaining words if any
    if line:
        subtitles.append(_build_subtitle(line))

    # Remove gaps between segments by extending the previous segment's end
    # time to the start of the following segment.
    for previous, current in zip(subtitles, subtitles[1:]):
        previous["end"] = current["start"]
    return subtitles
def format_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        The timestamp string, e.g. ``"00:01:05,250"`` for ``65.25``.
    """
    # Work in whole milliseconds so rounding can carry into the seconds,
    # minutes and hours fields instead of producing a 4-digit ",1000".
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    # Every field is zero-padded: two digits for h/m/s, three for ms.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def generate_subtitle_file(language, segments, input_video_name):
    """Write *segments* to an SRT file and return the file name.

    Args:
        language: Language tag embedded in the file name.
        segments: Iterable of dicts with ``'start'``, ``'end'`` and ``'word'``
            keys; start/end are passed to ``format_time``.
        input_video_name: Base name embedded in the file name.

    Returns:
        The name of the written file,
        ``sub-<input_video_name>.<language>.srt``.
    """
    subtitle_file = f"sub-{input_video_name}.{language}.srt"
    entries = []
    # SRT cues are numbered from 1; each cue is index, time range, text and a
    # terminating blank line. No trailing spaces are emitted on any line.
    for index, segment in enumerate(segments, start=1):
        segment_start = format_time(segment['start'])
        segment_end = format_time(segment['end'])
        entries.append(
            f"{index}\n{segment_start} --> {segment_end}\n{segment['word']}\n\n"
        )
    with open(subtitle_file, "w", encoding='utf8') as srt:
        srt.write("".join(entries))
    return subtitle_file
def transcribe(url):
    """Run the whole pipeline for the video at *url*.

    Downloads the video, extracts its audio, transcribes it at word level,
    groups the words into subtitles and writes them to an SRT file.

    Args:
        url: URL handed to ``download_video``.

    Returns:
        A ``(subtitle_file, video, mp3_file)`` tuple of file paths.
    """
    video_path = download_video(url)
    audio_path, _duration = extract_audio(video_path)
    print("transcribe")
    words = word_level_transcribe(audio_path)
    subtitle_path = generate_subtitle_file(
        'fa', create_subtitles(words), 'video_subtitled'
    )
    return subtitle_path, video_path, audio_path
# Gradio UI: a URL text box and a button that runs `transcribe`, showing the
# resulting subtitle file, video and extracted audio.
# NOTE(review): the nesting below is reconstructed; confirm which components
# were meant to live inside the Column.
with gr.Blocks() as demo:
    gr.Markdown("Start typing below and then click **Run** to see the progress and final output.")
    with gr.Column():
        #audio_in = gr.Audio(type="filepath")
        url = gr.Text()
        srt_file = gr.File()
        btn = gr.Button("Create")
        video_file_output = gr.Video(label="Result Video")
        mp3_file = gr.Audio(type="filepath")
    # The three outputs receive, in order, the tuple returned by `transcribe`.
    btn.click(
        fn=transcribe,
        inputs=url,
        outputs=[srt_file, video_file_output, mp3_file],
    )
demo.launch(debug=True)