import gradio as gr
import os
import tempfile
import ffmpeg
import json
from huggingface_hub import InferenceApi
from typing import List, Dict, Tuple
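
# Runtime dependencies (assumed from the imports above): gradio, ffmpeg-python,
# huggingface_hub. The `ffmpeg` module comes from the ffmpeg-python package and
# additionally requires the ffmpeg binary to be available on the system PATH.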

# Constants
MODEL_NAME: str = "ivrit-ai/faster-whisper-v2-d4"
TRANSLATION_MODEL_NAME: str = "dicta-il/dictalm2.0-GGUF"
TEMP_DIR: str = tempfile.gettempdir()

# Load Hugging Face Inference API clients
ASR_API = InferenceApi(repo_id=MODEL_NAME)
TRANSLATION_API = InferenceApi(repo_id=TRANSLATION_MODEL_NAME)
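
# Note (assumption about library versions): `InferenceApi` is deprecated in recent
# huggingface_hub releases in favor of `InferenceClient`. A minimal sketch of the
# newer client, not wired into this app:
#
#   from huggingface_hub import InferenceClient
#   asr_client = InferenceClient(model=MODEL_NAME)
#   with open("voice_note.wav", "rb") as f:
#       asr_result = asr_client.automatic_speech_recognition(f.read())
#   # In recent versions the transcript is exposed as `asr_result.text`.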

def convert_audio(audio_path: str) -> str:
    """Converts an audio file to 16 kHz WAV format for compatibility."""
    converted_path = os.path.join(TEMP_DIR, "converted.wav")
    (
        ffmpeg
        .input(audio_path)
        .output(converted_path, format="wav", ar="16000")
        .run(overwrite_output=True, quiet=True)
    )
    return converted_path
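
# WhatsApp voice notes are typically Opus audio in an .ogg container, so they take
# the conversion path above; files already ending in ".wav" are passed through
# unchanged by transcribe_audio() below.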

def transcribe_audio(file: str, translate: bool) -> Tuple[str, str]:
    """Transcribes audio and optionally translates it via the Hugging Face API.

    Returns the path of the audio used for playback and a text block containing
    the transcript followed by the subtitle JSON.
    """
    audio_path = file if file.endswith(".wav") else convert_audio(file)

    # The Inference API expects raw bytes for audio payloads, passed via `data=`
    # (a file object passed as `inputs=` is not JSON-serializable).
    with open(audio_path, "rb") as audio_file:
        result = ASR_API(data=audio_file.read())

    # Assumes the endpoint returns Whisper-style segments with start/end timestamps.
    segments = result.get("segments", [])
    subtitles: List[Dict[str, str]] = []
    transcribed_text: str = ""

    for segment in segments:
        hebrew_text = segment["text"]
        start_time = segment["start"]
        end_time = segment["end"]

        eng_translation = ""
        if translate:
            eng_translation = TRANSLATION_API(inputs=hebrew_text)[0]["translation_text"]

        subtitles.append({
            "start": start_time,
            "end": end_time,
            "text": hebrew_text,
            "translation": eng_translation if translate else None
        })
        transcribed_text += f"{hebrew_text} "

    # Return the audio path for the playback component and a combined text block
    # for the "Transcription & Subtitles" box, matching the click() wiring below.
    combined_output = (
        f"{transcribed_text.strip()}\n\n"
        f"Subtitles (JSON):\n{json.dumps(subtitles, ensure_ascii=False, indent=2)}"
    )
    return audio_path, combined_output
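
# Illustrative shape of the subtitle JSON embedded in the combined output
# (timestamps in seconds; values invented for the example):
#   [{"start": 0.0, "end": 2.4, "text": "שלום, מה נשמע?", "translation": "Hello, how are you?"}]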

# Inject WebGPU-compatible JavaScript via `gr.HTML()`
webgpu_script = """
<script type="module">
    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@latest';

    let asr;

    async function loadModel() {
        asr = await pipeline("automatic-speech-recognition", "openai/whisper-large-v3");
        console.log("WebGPU ASR model loaded.");
    }

    async function transcribe(audioFile) {
        if (!asr) {
            console.error("Model not loaded.");
            return;
        }
        const result = await asr(audioFile);
        document.getElementById("output").innerText = result.text;
    }

    document.getElementById("upload").addEventListener("change", async (event) => {
        const file = event.target.files[0];
        transcribe(file);
    });

    loadModel();
</script>

<input type="file" id="upload" accept="audio/*">
<p id="output">Transcription will appear here.</p>
"""

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# WhatShutup: Transcribe WhatsApp Voice Messages with WebGPU Support")
    webgpu_component = gr.HTML(webgpu_script)

    # Note: `source=` is the Gradio 3.x argument name; Gradio 4.x renamed it to `sources=[...]`.
    audio_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
    translate_checkbox = gr.Checkbox(label="Translate to English?", value=False)

    with gr.Row():
        # Filled with the processed audio path returned by transcribe_audio().
        audio_player = gr.Audio(source="upload", type="filepath", label="Playback")
        transcript_output = gr.Textbox(label="Transcription & Subtitles", lines=10)

    submit_btn = gr.Button("Transcribe")
    submit_btn.click(
        transcribe_audio,
        inputs=[audio_input, translate_checkbox],
        outputs=[audio_player, transcript_output],
    )

demo.launch()