WhatShutup / app.py
import gradio as gr
import os
import tempfile
import ffmpeg
import json
from huggingface_hub import InferenceApi
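# InferenceApi is the legacy client in huggingface_hub; newer releases expose InferenceClient as its replacement.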
from typing import List, Dict, Tuple
# 🔹 Constants
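# ivrit-ai/faster-whisper-v2-d4 is a Hebrew-tuned faster-whisper checkpoint; dicta-il/dictalm2.0-GGUF is DictaLM 2.0,
# a Hebrew LLM used here for the optional English translation.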
MODEL_NAME: str = "ivrit-ai/faster-whisper-v2-d4"
TRANSLATION_MODEL_NAME: str = "dicta-il/dictalm2.0-GGUF"
TEMP_DIR: str = tempfile.gettempdir()
# 🔹 Load Hugging Face Inference API clients
ASR_API = InferenceApi(repo_id=MODEL_NAME)
TRANSLATION_API = InferenceApi(repo_id=TRANSLATION_MODEL_NAME)
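# Note: ffmpeg-python only wraps the ffmpeg CLI, so the ffmpeg binary must be installed and on PATH for convert_audio() to work.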
def convert_audio(audio_path: str) -> str:
"""Converts an audio file to 16kHz WAV format for compatibility."""
converted_path = os.path.join(TEMP_DIR, "converted.wav")
(
ffmpeg
.input(audio_path)
.output(converted_path, format="wav", ar="16000")
.run(overwrite_output=True, quiet=True)
)
return converted_path
def transcribe_audio(file: str, translate: bool) -> Tuple[str, str]:
"""Transcribes audio and optionally translates it using Hugging Face API."""
audio_path = file if file.endswith(".wav") else convert_audio(file)
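    # The response is assumed to follow the faster-whisper layout: a dict with a "segments" list
    # whose items carry "start", "end" and "text"; adjust the parsing below if the endpoint differs.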
    with open(audio_path, "rb") as audio_file:
        # Send the raw audio bytes to the ASR endpoint (InferenceApi takes binary payloads via `data=`).
        result = ASR_API(data=audio_file.read())
segments = result.get("segments", [])
subtitles: List[Dict[str, str]] = []
transcribed_text: str = ""
for segment in segments:
hebrew_text = segment["text"]
start_time = segment["start"]
end_time = segment["end"]
eng_translation = ""
if translate:
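            # Assumes a translation-pipeline style payload: [{"translation_text": "..."}].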
eng_translation = TRANSLATION_API(inputs=hebrew_text)[0]["translation_text"]
subtitles.append({
"start": start_time,
"end": end_time,
"text": hebrew_text,
"translation": eng_translation if translate else None
})
transcribed_text += f"{hebrew_text} "
    # First value feeds the Playback audio component, second the "Transcription & Subtitles" textbox.
    subtitles_json = json.dumps(subtitles, ensure_ascii=False, indent=2)
    return audio_path, f"{transcribed_text.strip()}\n\n{subtitles_json}"
# 🔹 Inject WebGPU-compatible JavaScript via `gr.HTML()`
webgpu_script = """
<script type="module">
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@latest';
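// Transformers.js downloads the model from the Hugging Face Hub and runs it entirely in the browser.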
let asr;
async function loadModel() {
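    // Note: Transformers.js needs ONNX weights (e.g. the Xenova/whisper-* conversions);
    // whisper-large-v3 is also a very heavy download for in-browser use.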
asr = await pipeline("automatic-speech-recognition", "openai/whisper-large-v3");
console.log("WebGPU ASR model loaded.");
}
// Transcribe audio referenced by a URL (object URLs created from local files work as well).
async function transcribe(audioUrl) {
    if (!asr) {
        console.error("Model not loaded.");
        return;
    }
    const result = await asr(audioUrl);
    document.getElementById("output").innerText = result.text;
}
document.getElementById("upload").addEventListener("change", async (event) => {
    const file = event.target.files[0];
    if (!file) return;
    // Hand the pipeline an object URL so it can fetch and decode the audio itself.
    await transcribe(URL.createObjectURL(file));
});
loadModel();
</script>
<input type="file" id="upload" accept="audio/*">
<p id="output">Transcription will appear here.</p>
"""
# 🔹 Gradio UI
with gr.Blocks() as demo:
gr.Markdown("# WhatShutup: Transcribe WhatsApp Voice Messages with WebGPU Support")
webgpu_component = gr.HTML(webgpu_script)
    # Gradio 4 replaced the old `source=` kwarg with `sources=[...]`.
    audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
translate_checkbox = gr.Checkbox(label="Translate to English?", value=False)
with gr.Row():
        audio_player = gr.Audio(type="filepath", label="Playback", interactive=False)
transcript_output = gr.Textbox(label="Transcription & Subtitles", lines=10)
submit_btn = gr.Button("Transcribe")
submit_btn.click(transcribe_audio, inputs=[audio_input, translate_checkbox], outputs=[audio_player, transcript_output])
if __name__ == "__main__":
    demo.launch()