Spaces:

Deepsheka
/

newdemo-app

Build error

App Files Files Community

Deepsheka committed on May 26, 2023

Commit

797126d

1 Parent(s): f96972a

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -295

app.py CHANGED Viewed

@@ -1,302 +1,56 @@
-import json
 import gradio as gr
-from difflib import Differ
-import ffmpeg
-import os
-from pathlib import Path
-import time
-import aiohttp
-import asyncio
-# Set true if you're using huggingface inference API API https://huggingface.co/inference-api
-API_BACKEND = True
-# MODEL = 'facebook/wav2vec2-large-960h-lv60-self'
-# MODEL  = "facebook/wav2vec2-large-960h"
-MODEL = "facebook/wav2vec2-base-960h"
-# MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
-if API_BACKEND:
-    from dotenv import load_dotenv
-    import base64
-    import asyncio
-    load_dotenv(Path(".env"))
-    HF_TOKEN = os.environ["HF_TOKEN"]
-    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
-    API_URL = f'https://api-inference.huggingface.co/models/{MODEL}'
-else:
-    import torch
-    from transformers import pipeline
-    # is cuda available?
-    cuda = torch.device(
-        'cuda:0') if torch.cuda.is_available() else torch.device('cpu')
-    device = 0 if torch.cuda.is_available() else -1
-    speech_recognizer = pipeline(
-        task="automatic-speech-recognition",
-        model=f'{MODEL}',
-        tokenizer=f'{MODEL}',
-        framework="pt",
-        device=device,
-    )
-videos_out_path = Path("./videos_out")
-videos_out_path.mkdir(parents=True, exist_ok=True)
-samples_data = sorted(Path('examples').glob('*.json'))
-SAMPLES = []
-for file in samples_data:
-    with open(file) as f:
-        sample = json.load(f)
-    SAMPLES.append(sample)
-VIDEOS = list(map(lambda x: [x['video']], SAMPLES))
-total_inferences_since_reboot = 415
-total_cuts_since_reboot = 1539
-async def speech_to_text(video_file_path):
-    """
-    Takes a video path to convert to audio, transcribe audio channel to text and char timestamps
-    Using https://huggingface.co/tasks/automatic-speech-recognition pipeline
-    """
-    global total_inferences_since_reboot
-    if(video_file_path == None):
-        raise ValueError("Error no video input")
-    video_path = Path(video_file_path)
-    try:
-        # convert video to audio 16k using PIPE to audio_memory
-        audio_memory, _ = ffmpeg.input(video_path).output(
-            '-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
-    except Exception as e:
-        raise RuntimeError("Error converting video to audio")
-    ping("speech_to_text")
-    last_time = time.time()
-    if API_BACKEND:
-        # Using Inference API https://huggingface.co/inference-api
-        # try twice, because the model must be loaded
-        for i in range(10):
-            for tries in range(4):
-                print(f'Transcribing from API attempt {tries}')
-                try:
-                    inference_reponse = await query_api(audio_memory)
-                    transcription = inference_reponse["text"].lower()
-                    timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
-                                  for chunk in inference_reponse['chunks']]
-                    total_inferences_since_reboot += 1
-                    print("\n\ntotal_inferences_since_reboot: ",
-                          total_inferences_since_reboot, "\n\n")
-                    return (transcription, transcription, timestamps)
-                except:
-                    if 'error' in inference_reponse and 'estimated_time' in inference_reponse:
-                        wait_time = inference_reponse['estimated_time']
-                        print("Waiting for model to load....", wait_time)
-                        # wait for loading model
-                        # 5 seconds plus for certanty
-                        await asyncio.sleep(wait_time + 5.0)
-                    elif 'error' in inference_reponse:
-                        raise RuntimeError("Error Fetching API",
-                                           inference_reponse['error'])
-                    else:
-                        break
-            else:
-                raise RuntimeError(inference_reponse, "Error Fetching API")
     else:
-        try:
-            print(f'Transcribing via local model')
-            output = speech_recognizer(
-                audio_memory, return_timestamps="char",  chunk_length_s=10, stride_length_s=(4, 2))
-            transcription = output["text"].lower()
-            timestamps = [[chunk["text"].lower(), chunk["timestamp"][0].tolist(), chunk["timestamp"][1].tolist()]
-                          for chunk in output['chunks']]
-            total_inferences_since_reboot += 1
-            print("\n\ntotal_inferences_since_reboot: ",
-                  total_inferences_since_reboot, "\n\n")
-            return (transcription, transcription, timestamps)
-        except Exception as e:
-            raise RuntimeError("Error Running inference with local model", e)
-async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
-    """
-    Given original video input, text transcript + timestamps,
-    and edit ext cuts video segments into a single video
-    """
-    global total_cuts_since_reboot
-    video_path = Path(video_in)
-    video_file_name = video_path.stem
-    if(video_in == None or text_in == None or transcription == None):
-        raise ValueError("Inputs undefined")
-    d = Differ()
-    # compare original transcription with edit text
-    diff_chars = d.compare(transcription, text_in)
-    # remove all text aditions from diff
-    filtered = list(filter(lambda x: x[0] != '+', diff_chars))
-    # filter timestamps to be removed
-    # timestamps_to_cut = [b for (a,b) in zip(filtered, timestamps_var) if a[0]== '-' ]
-    # return diff tokes and cutted video!!
-    # groupping character timestamps so there are less cuts
-    idx = 0
-    grouped = {}
-    for(a, b) in zip(filtered, timestamps):
-        if a[0] != '-':
-            if idx in grouped:
-                grouped[idx].append(b)
-            else:
-                grouped[idx] = []
-                grouped[idx].append(b)
-        else:
-            idx += 1
-    # after grouping, gets the lower and upter start and time for each group
-    timestamps_to_cut = [[v[0][1], v[-1][2]] for v in grouped.values()]
-    between_str = '+'.join(
-        map(lambda t: f'between(t,{t[0]},{t[1]})', timestamps_to_cut))
-    if timestamps_to_cut:
-        video_file = ffmpeg.input(video_in)
-        video = video_file.video.filter(
-            "select", f'({between_str})').filter("setpts", "N/FRAME_RATE/TB")
-        audio = video_file.audio.filter(
-            "aselect", f'({between_str})').filter("asetpts", "N/SR/TB")
-        output_video = f'./videos_out/{video_file_name}.mp4'
-        ffmpeg.concat(video, audio, v=1, a=1).output(
-            output_video).overwrite_output().global_args('-loglevel', 'quiet').run()
     else:
-        output_video = video_in
-    tokens = [(token[2:], token[0] if token[0] != " " else None)
-              for token in filtered]
-    total_cuts_since_reboot += 1
-    ping("video_cuts")
-    print("\n\ntotal_cuts_since_reboot: ", total_cuts_since_reboot, "\n\n")
-    return (tokens, output_video)
-async def query_api(audio_bytes: bytes):
-    """
-    Query for Huggingface Inference API for Automatic Speech Recognition task
-    """
-    payload = json.dumps({
-        "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
-        "parameters": {
-            "return_timestamps": "char",
-            "chunk_length_s": 10,
-            "stride_length_s": [4, 2]
-        },
-        "options": {"use_gpu": False}
-    }).encode("utf-8")
-    async with aiohttp.ClientSession() as session:
-        async with session.post(API_URL, headers=headers, data=payload) as response:
-            return await response.json()
-def ping(name):
-    url = f'https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}'
-    print("ping: ", url)
-    async def req():
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                print("pong: ", response.status)
-    asyncio.create_task(req())
-# ---- Gradio Layout -----
-video_in = gr.Video(label="Video file")
-text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
-video_out = gr.Video(label="Video Out")
-diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
-examples = gr.components.Dataset(
-    components=[video_in], samples=VIDEOS, type="index")
-demo = gr.Blocks(enable_queue=True, css='''
-#cut_btn, #reset_btn { align-self:stretch; }
-#\\31 3 { max-width: 540px; }
-.output-markdown {max-width: 65ch !important;}
-''')
-demo.encrypt = False
-with demo:
-    transcription_var = gr.Variable()
-    timestamps_var = gr.Variable()
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown('''
-            # Edit Video By Editing Text
-            This project is a quick proof of concept of a simple video editor where the edits
-            are made by editing the audio transcription.
-            Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
-            with a fine tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
-            you can predict not only the text transcription but also the [character or word base timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
-            ''')
-    with gr.Row():
-        examples.render()
-        def load_example(id):
-            video = SAMPLES[id]['video']
-            transcription = SAMPLES[id]['transcription'].lower()
-            timestamps = SAMPLES[id]['timestamps']
-            return (video, transcription, transcription, timestamps)
-        examples.click(
-            load_example,
-            inputs=[examples],
-            outputs=[video_in, text_in, transcription_var, timestamps_var],
-            queue=False)
-    with gr.Row():
-        with gr.Column():
-            video_in.render()
-            transcribe_btn = gr.Button("Transcribe Audio")
-            transcribe_btn.click(speech_to_text, [video_in], [
-                text_in, transcription_var, timestamps_var])
-    with gr.Row():
-        gr.Markdown('''
-        ### Now edit as text
-        After running the video transcription, you can make cuts to the text below (only cuts, not additions!)''')
-    with gr.Row():
-        with gr.Column():
-            text_in.render()
-            with gr.Row():
-                cut_btn = gr.Button("Cut to video", elem_id="cut_btn")
-                # send audio path and hidden variables
-                cut_btn.click(cut_timestamps_to_video, [
-                    video_in, transcription_var, text_in, timestamps_var], [diff_out, video_out])
-                reset_transcription = gr.Button(
-                    "Reset to last trascription", elem_id="reset_btn")
-                reset_transcription.click(
-                    lambda x: x, transcription_var, text_in)
-        with gr.Column():
-            video_out.render()
-            diff_out.render()
-    with gr.Row():
-        gr.Markdown('''
-        #### Video Credits
-        1. [Cooking](https://vimeo.com/573792389)
-        1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
-        1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
-        ''')
-if __name__ == "__main__":
-    demo.launch(debug=True)

 import gradio as gr
+from pytube import YouTube
+import whisper
# Transcription callback used by the Gradio interface.
def whisper_transcript(model_size, audio_file, url=None):
    """Transcribe an audio file, or a YouTube video's audio track, with Whisper.

    Args:
        model_size: Name of the Whisper checkpoint to load, e.g. ``"base"``
            or ``"base.en"``.
        audio_file: Path to the audio file to transcribe. Used when ``url``
            is empty.
        url: Optional YouTube link. When given, an audio-only stream of the
            video is downloaded to ``audio.mp4`` and transcribed instead of
            ``audio_file``. Defaults to ``None`` so the function can be
            called with just the model and the audio file.

    Returns:
        The transcript text (the ``"text"`` entry of the transcription result).

    Raises:
        ValueError: If the YouTube link exposes no audio-only stream, or if
            neither a link nor an audio file was supplied.
    """
    if url:
        stream = YouTube(url).streams.filter(only_audio=True).first()
        if stream is None:
            raise ValueError("No audio-only stream found for the given YouTube link")
        source = stream.download(filename="audio.mp4")
    else:
        source = audio_file

    if not source:
        raise ValueError("No audio input provided")

    # Checkpoints whose name ends in ".en" are pinned to English; for every
    # other checkpoint no language is passed to transcribe().
    language = "english" if model_size.endswith(".en") else None

    loaded_model = whisper.load_model(model_size)
    transcript = loaded_model.transcribe(source, language=language)
    return transcript["text"]
# Gradio app interface: a model-size dropdown and an audio upload are passed
# to whisper_transcript, and its return value is shown in a textbox.
gradio_ui = gr.Interface(
    fn=whisper_transcript,
    title="Transcribe multi-lingual audio",
    # NOTE(review): "peach" looks like a legacy Gradio theme name; confirm the
    # installed Gradio version still accepts it.
    theme="peach",
    description="**How to use**: Select a model, upload an audio clip, then click submit. If your clip is **100% in English, select models ending in ‘.en’**. If the clip is in other languages, or a mix of languages, select models without ‘.en’",
    article="**Note**: The larger the model size selected or the longer the audio clip, the more time it would take to process the transcript.",
    inputs=[
        gr.Dropdown(
            label="Select Model",
            choices=[
                "tiny.en",
                "base.en",
                "small.en",
                "medium.en",
                "tiny",
                "base",
                "small",
                "medium",
                "large",
            ],
            value="base",
        ),
        # No YouTube-link textbox is wired in, so the callback receives
        # exactly two inputs: the selected model and the uploaded file path.
        gr.Audio(label="Upload Audio File", source="upload", type="filepath"),
    ],
    # Top-level component instead of the deprecated gr.outputs namespace.
    outputs=gr.Textbox(label="Whisper Transcript"),
)
gradio_ui.queue().launch()