"""Gradio app that transcribes lecture audio with a Wav2Vec2 ASR pipeline.

Three tabs are exposed: record-then-transcribe, upload-then-transcribe, and
live (streaming) transcription from the microphone.
"""

import gradio as gr
import numpy as np
import torch
from transformers import pipeline

# Run on the first GPU when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to execute on this machine. It is kept here to preserve existing
# behavior; confirm whether this checkpoint actually needs it and drop it if not.
wav2_ft = pipeline(
    "automatic-speech-recognition",
    model="sanchit-gandhi/wav2vec2-large-tedlium",
    device=device,
    trust_remote_code=True,
)

app = gr.Blocks()

# Landing-page text rendered above the tabs. Kept flush-left so the Markdown
# headings and bullet lists are parsed regardless of how the component
# treats leading indentation.
_INTRO_MARKDOWN = """
# Lecture Transcription 📝

Welcome to **Lecture Transcription**, the go-to tool for transcribing lectures accurately.
Whether you’re attending a live lecture or revisiting a recorded one, this app will ensure you don’t miss a single detail.

## How It Works
- **Recording Mode:** Record the lecture as it happens. When you stop, your transcription will be generated.
- **Upload Mode:** Upload your pre-recorded lecture audio files, and receive a precise transcription. Supports various audio formats including WAV, MP3, and more.
- **Live Mode:** That's right, low-latency live transcription!

## Optimized for Technical Oration
Under the hood, this is a Wav2Vec2 model fine-tuned on the TED-Lium dataset. It's well-versed for accurately transcribing technical speech.
"""


def inference(path):
    """Transcribe a complete audio file.

    Args:
        path: Filesystem path of the recorded or uploaded audio, as handed
            over by a ``gr.Audio(type='filepath')`` input. ``None`` (or an
            empty string) when the user submits without providing audio.

    Returns:
        The transcription text produced by the ASR pipeline.

    Raises:
        gr.Error: If no audio was provided, so the UI shows a readable
            message instead of an opaque pipeline failure.
    """
    if not path:
        raise gr.Error("No audio received. Please record or upload audio first.")

    # Long recordings are split into 30-second chunks and decoded in batches.
    # `max_new_tokens` was removed: it is a text-generation option that a CTC
    # model such as Wav2Vec2 never uses, and recent transformers releases
    # deprecate passing it to this pipeline directly.
    out = wav2_ft(
        path,
        chunk_length_s=30,
        batch_size=8,
    )
    return out["text"]


def transcribe(stream, new_chunk):
    """Append a microphone chunk to the running stream and re-transcribe it.

    Args:
        stream: Audio accumulated over previous calls as a 1-D float32 array,
            or ``None`` on the first call of a session.
        new_chunk: ``(sampling_rate, samples)`` tuple delivered by the
            streaming ``gr.Audio`` input.

    Returns:
        A ``(stream, text)`` tuple: the updated accumulated audio (fed back
        through the ``"state"`` slot) and the transcription of that audio.
    """
    sr, y = new_chunk
    y = y.astype(np.float32)

    # Down-mix multi-channel input; the pipeline expects single-channel audio.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalize the chunk. A silent (all-zero) or empty chunk has no
    # usable peak; dividing by it would inject NaNs into the stream (or raise
    # on an empty array), so such chunks are appended unscaled.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    stream = y if stream is None else np.concatenate([stream, y])

    # Nothing to decode yet (only empty chunks received so far).
    if stream.size == 0:
        return stream, ""

    # The whole accumulated stream is decoded on every call, so the displayed
    # text always covers everything said so far.
    return stream, wav2_ft({"sampling_rate": sr, "raw": stream})["text"]


mic_mode = gr.Interface(
    fn=inference,
    inputs=gr.Audio(
        sources=["microphone"],
        type="filepath",
        label="Record Your Lecture",
    ),
    outputs=gr.Textbox(label="Transcription Output"),
    title="🎙️ Recording & Transcribe",
    description="Record through your mic. When you're done, hit stop and wait a moment. Feel free to trim the recording. Then, hit Submit!",
)

upload_mode = gr.Interface(
    fn=inference,
    inputs=gr.Audio(
        sources=["upload"],
        type="filepath",
        label="Upload Your Lecture Recording",
    ),
    outputs=gr.Textbox(label="Transcription Output"),
    title="📂 Upload & Transcribe",
    description="Have a recorded lecture? Upload the audio file here, and it'll be transcribed in seconds!",
)

# inspired by Gradio App Real Time Speech Recognition:
# https://www.gradio.app/guides/real-time-speech-recognition
live_mode = gr.Interface(
    transcribe,
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    ["state", "text"],
    title="🎀 Live Transcription",
    description="Transcribe your lecture in real-time! Start speaking into your microphone, and watch the transcription appear instantly.",
    live=True,
)

with app:
    gr.Markdown(_INTRO_MARKDOWN)
    gr.TabbedInterface(
        [mic_mode, upload_mode, live_mode],
        ["🎙️ Record & Transcribe", "📂 Upload & Transcribe", "🎀 Live Transcribe"],
    )

app.launch(debug=True)