Deepsheka committed on
Commit 797126d · 1 Parent(s): f96972a

Update app.py

Files changed (1)
  1. app.py +49 -295
app.py CHANGED
@@ -1,302 +1,56 @@
- import json
  import gradio as gr
- from difflib import Differ
- import ffmpeg
- import os
- from pathlib import Path
- import time
- import aiohttp
- import asyncio
-
-
- # Set true if you're using huggingface inference API API https://huggingface.co/inference-api
- API_BACKEND = True
- # MODEL = 'facebook/wav2vec2-large-960h-lv60-self'
- # MODEL = "facebook/wav2vec2-large-960h"
- MODEL = "facebook/wav2vec2-base-960h"
- # MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
- if API_BACKEND:
-     from dotenv import load_dotenv
-     import base64
-     import asyncio
-     load_dotenv(Path(".env"))
-
-     HF_TOKEN = os.environ["HF_TOKEN"]
-     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
-     API_URL = f'https://api-inference.huggingface.co/models/{MODEL}'
-
- else:
-     import torch
-     from transformers import pipeline
-
-     # is cuda available?
-     cuda = torch.device(
-         'cuda:0') if torch.cuda.is_available() else torch.device('cpu')
-     device = 0 if torch.cuda.is_available() else -1
-     speech_recognizer = pipeline(
-         task="automatic-speech-recognition",
-         model=f'{MODEL}',
-         tokenizer=f'{MODEL}',
-         framework="pt",
-         device=device,
-     )
-
- videos_out_path = Path("./videos_out")
- videos_out_path.mkdir(parents=True, exist_ok=True)
-
- samples_data = sorted(Path('examples').glob('*.json'))
- SAMPLES = []
- for file in samples_data:
-     with open(file) as f:
-         sample = json.load(f)
-     SAMPLES.append(sample)
- VIDEOS = list(map(lambda x: [x['video']], SAMPLES))
-
- total_inferences_since_reboot = 415
- total_cuts_since_reboot = 1539
-
-
- async def speech_to_text(video_file_path):
-     """
-     Takes a video path to convert to audio, transcribe audio channel to text and char timestamps
-
-     Using https://huggingface.co/tasks/automatic-speech-recognition pipeline
-     """
-     global total_inferences_since_reboot
-     if(video_file_path == None):
-         raise ValueError("Error no video input")
-
-     video_path = Path(video_file_path)
-     try:
-         # convert video to audio 16k using PIPE to audio_memory
-         audio_memory, _ = ffmpeg.input(video_path).output(
-             '-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
-     except Exception as e:
-         raise RuntimeError("Error converting video to audio")
-
-     ping("speech_to_text")
-     last_time = time.time()
-     if API_BACKEND:
-         # Using Inference API https://huggingface.co/inference-api
-         # try twice, because the model must be loaded
-         for i in range(10):
-             for tries in range(4):
-                 print(f'Transcribing from API attempt {tries}')
-                 try:
-                     inference_reponse = await query_api(audio_memory)
-                     transcription = inference_reponse["text"].lower()
-                     timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
-                                   for chunk in inference_reponse['chunks']]
-
-                     total_inferences_since_reboot += 1
-                     print("\n\ntotal_inferences_since_reboot: ",
-                           total_inferences_since_reboot, "\n\n")
-                     return (transcription, transcription, timestamps)
-                 except:
-                     if 'error' in inference_reponse and 'estimated_time' in inference_reponse:
-                         wait_time = inference_reponse['estimated_time']
-                         print("Waiting for model to load....", wait_time)
-                         # wait for loading model
-                         # 5 seconds plus for certanty
-                         await asyncio.sleep(wait_time + 5.0)
-                     elif 'error' in inference_reponse:
-                         raise RuntimeError("Error Fetching API",
-                                            inference_reponse['error'])
-                     else:
-                         break
-         else:
-             raise RuntimeError(inference_reponse, "Error Fetching API")
      else:
-
-         try:
-             print(f'Transcribing via local model')
-             output = speech_recognizer(
-                 audio_memory, return_timestamps="char", chunk_length_s=10, stride_length_s=(4, 2))
-
-             transcription = output["text"].lower()
-             timestamps = [[chunk["text"].lower(), chunk["timestamp"][0].tolist(), chunk["timestamp"][1].tolist()]
-                           for chunk in output['chunks']]
-             total_inferences_since_reboot += 1
-
-             print("\n\ntotal_inferences_since_reboot: ",
-                   total_inferences_since_reboot, "\n\n")
-             return (transcription, transcription, timestamps)
-         except Exception as e:
-             raise RuntimeError("Error Running inference with local model", e)
-
-
- async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
-     """
-     Given original video input, text transcript + timestamps,
-     and edit ext cuts video segments into a single video
-     """
-     global total_cuts_since_reboot
-
-     video_path = Path(video_in)
-     video_file_name = video_path.stem
-     if(video_in == None or text_in == None or transcription == None):
-         raise ValueError("Inputs undefined")
-
-     d = Differ()
-     # compare original transcription with edit text
-     diff_chars = d.compare(transcription, text_in)
-     # remove all text aditions from diff
-     filtered = list(filter(lambda x: x[0] != '+', diff_chars))
-
-     # filter timestamps to be removed
-     # timestamps_to_cut = [b for (a,b) in zip(filtered, timestamps_var) if a[0]== '-' ]
-     # return diff tokes and cutted video!!
-
-     # groupping character timestamps so there are less cuts
-     idx = 0
-     grouped = {}
-     for(a, b) in zip(filtered, timestamps):
-         if a[0] != '-':
-             if idx in grouped:
-                 grouped[idx].append(b)
-             else:
-                 grouped[idx] = []
-                 grouped[idx].append(b)
-         else:
-             idx += 1
-
-     # after grouping, gets the lower and upter start and time for each group
-     timestamps_to_cut = [[v[0][1], v[-1][2]] for v in grouped.values()]
-
-     between_str = '+'.join(
-         map(lambda t: f'between(t,{t[0]},{t[1]})', timestamps_to_cut))
-
-     if timestamps_to_cut:
-         video_file = ffmpeg.input(video_in)
-         video = video_file.video.filter(
-             "select", f'({between_str})').filter("setpts", "N/FRAME_RATE/TB")
-         audio = video_file.audio.filter(
-             "aselect", f'({between_str})').filter("asetpts", "N/SR/TB")
-
-         output_video = f'./videos_out/{video_file_name}.mp4'
-         ffmpeg.concat(video, audio, v=1, a=1).output(
-             output_video).overwrite_output().global_args('-loglevel', 'quiet').run()
      else:
-         output_video = video_in
-
-     tokens = [(token[2:], token[0] if token[0] != " " else None)
-               for token in filtered]
-
-     total_cuts_since_reboot += 1
-     ping("video_cuts")
-     print("\n\ntotal_cuts_since_reboot: ", total_cuts_since_reboot, "\n\n")
-     return (tokens, output_video)
-
-
- async def query_api(audio_bytes: bytes):
-     """
-     Query for Huggingface Inference API for Automatic Speech Recognition task
-     """
-     payload = json.dumps({
-         "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
-         "parameters": {
-             "return_timestamps": "char",
-             "chunk_length_s": 10,
-             "stride_length_s": [4, 2]
-         },
-         "options": {"use_gpu": False}
-     }).encode("utf-8")
-     async with aiohttp.ClientSession() as session:
-         async with session.post(API_URL, headers=headers, data=payload) as response:
-             return await response.json()
-
-
- def ping(name):
-     url = f'https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}'
-     print("ping: ", url)
-
-     async def req():
-         async with aiohttp.ClientSession() as session:
-             async with session.get(url) as response:
-                 print("pong: ", response.status)
-     asyncio.create_task(req())
-
-
- # ---- Gradio Layout -----
- video_in = gr.Video(label="Video file")
- text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
- video_out = gr.Video(label="Video Out")
- diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
- examples = gr.components.Dataset(
-     components=[video_in], samples=VIDEOS, type="index")
-
- demo = gr.Blocks(enable_queue=True, css='''
- #cut_btn, #reset_btn { align-self:stretch; }
- #\\31 3 { max-width: 540px; }
- .output-markdown {max-width: 65ch !important;}
- ''')
- demo.encrypt = False
- with demo:
-     transcription_var = gr.Variable()
-     timestamps_var = gr.Variable()
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown('''
-             # Edit Video By Editing Text
-             This project is a quick proof of concept of a simple video editor where the edits
-             are made by editing the audio transcription.
-             Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
-             with a fine tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
-             you can predict not only the text transcription but also the [character or word base timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
-             ''')
-
-     with gr.Row():
-
-         examples.render()
-
-         def load_example(id):
-             video = SAMPLES[id]['video']
-             transcription = SAMPLES[id]['transcription'].lower()
-             timestamps = SAMPLES[id]['timestamps']
-
-             return (video, transcription, transcription, timestamps)
-
-         examples.click(
-             load_example,
-             inputs=[examples],
-             outputs=[video_in, text_in, transcription_var, timestamps_var],
-             queue=False)
-     with gr.Row():
-         with gr.Column():
-             video_in.render()
-             transcribe_btn = gr.Button("Transcribe Audio")
-             transcribe_btn.click(speech_to_text, [video_in], [
-                 text_in, transcription_var, timestamps_var])
-
-     with gr.Row():
-         gr.Markdown('''
-         ### Now edit as text
-         After running the video transcription, you can make cuts to the text below (only cuts, not additions!)''')
-
-     with gr.Row():
-         with gr.Column():
-             text_in.render()
-             with gr.Row():
-                 cut_btn = gr.Button("Cut to video", elem_id="cut_btn")
-                 # send audio path and hidden variables
-                 cut_btn.click(cut_timestamps_to_video, [
-                     video_in, transcription_var, text_in, timestamps_var], [diff_out, video_out])
-
-                 reset_transcription = gr.Button(
-                     "Reset to last trascription", elem_id="reset_btn")
-                 reset_transcription.click(
-                     lambda x: x, transcription_var, text_in)
-         with gr.Column():
-             video_out.render()
-             diff_out.render()
-     with gr.Row():
-         gr.Markdown('''
-         #### Video Credits
-
-         1. [Cooking](https://vimeo.com/573792389)
-         1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
-         1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
-         ''')
-
- if __name__ == "__main__":
-     demo.launch(debug=True)

  import gradio as gr
+ from pytube import YouTube
+ import whisper
+
+ # define the transcription function
+ def whisper_transcript(model_size, audio_file, url=None):
+     # url is optional: the YouTube Textbox input is commented out in the
+     # interface below, so only the uploaded audio file is used by default
+     if url:
+         link = YouTube(url)
+         source = link.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
      else:
+         source = audio_file
+
+     if model_size.endswith(".en"):
+         language = "english"
      else:
+         language = None
+
+     loaded_model = whisper.load_model(model_size)
+     # decode without timestamps; transcribe() forwards this option to DecodingOptions
+     transcript = loaded_model.transcribe(source, language=language, without_timestamps=True)
+
+     return transcript["text"]
+
+ # define the Gradio app interface
+ gradio_ui = gr.Interface(
+     fn=whisper_transcript,
+     title="Transcribe multi-lingual audio",
+     theme="peach",
+     description="**How to use**: Select a model, upload an audio clip, then click submit. If your clip is **100% in English, select models ending in ‘.en’**. If the clip is in other languages, or a mix of languages, select models without ‘.en’.",
+     article="**Note**: The larger the model size selected or the longer the audio clip, the more time it will take to process the transcript.",
+     inputs=[
+         gr.Dropdown(
+             label="Select Model",
+             choices=[
+                 "tiny.en",
+                 "base.en",
+                 "small.en",
+                 "medium.en",
+                 "tiny",
+                 "base",
+                 "small",
+                 "medium",
+                 "large",
+             ],
+             value="base",
+         ),
+         # gr.Textbox(label="Paste YouTube link here"),
+         gr.Audio(label="Upload Audio File", source="upload", type="filepath"),
+     ],
+     outputs=gr.Textbox(label="Whisper Transcript"),
+ )
+
+ gradio_ui.queue().launch()
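
The `from pytube import YouTube` import and the `if url:` branch are only reachable if the commented-out YouTube Textbox input above is wired back in. The sketch below is a minimal, hypothetical way to exercise that download path outside the Gradio UI, assuming `pytube` and `openai-whisper` are installed; `transcribe_youtube` and `VIDEO_URL` are illustrative names, not part of this commit.

# Hypothetical standalone check of the pytube download path used above.
# VIDEO_URL is a placeholder; transcribe_youtube is an illustrative helper.
import whisper
from pytube import YouTube

VIDEO_URL = "https://www.youtube.com/watch?v=..."  # placeholder, replace before running


def transcribe_youtube(url, model_size="base"):
    # download the first audio-only stream to a local file, mirroring whisper_transcript
    audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="audio.mp4")
    model = whisper.load_model(model_size)
    # same decoding choice as the app: plain text, no timestamps
    result = model.transcribe(audio_path, without_timestamps=True)
    return result["text"]


if __name__ == "__main__":
    print(transcribe_youtube(VIDEO_URL))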