import gradio as gr
import os
import tempfile
import json
from typing import Dict, List, Tuple

import ffmpeg
from huggingface_hub import InferenceApi

# 🔹 Constants
MODEL_NAME: str = "ivrit-ai/faster-whisper-v2-d4"
TRANSLATION_MODEL_NAME: str = "dicta-il/dictalm2.0-GGUF"
TEMP_DIR: str = tempfile.gettempdir()

# 🔹 Load Hugging Face Inference API clients
# Note: `InferenceApi` is deprecated in newer huggingface_hub releases;
# `InferenceClient` is its successor.
ASR_API = InferenceApi(repo_id=MODEL_NAME)
TRANSLATION_API = InferenceApi(repo_id=TRANSLATION_MODEL_NAME)


def convert_audio(audio_path: str) -> str:
    """Converts an audio file to 16 kHz WAV format for compatibility."""
    converted_path = os.path.join(TEMP_DIR, "converted.wav")
    (
        ffmpeg
        .input(audio_path)
        .output(converted_path, format="wav", ar="16000")
        .run(overwrite_output=True, quiet=True)
    )
    return converted_path


def transcribe_audio(file: str, translate: bool) -> Tuple[str, str]:
    """Transcribes audio and optionally translates it via the Hugging Face API.

    Returns the path of the audio to play back and the transcript text,
    with timestamped subtitles appended as JSON.
    """
    audio_path = file if file.endswith(".wav") else convert_audio(file)

    # The inference endpoint expects raw audio bytes, not a file object.
    with open(audio_path, "rb") as audio_file:
        result = ASR_API(data=audio_file.read())

    segments = result.get("segments", [])
    subtitles: List[Dict[str, object]] = []
    transcribed_text: str = ""

    for segment in segments:
        hebrew_text = segment["text"]
        eng_translation = ""
        if translate:
            eng_translation = TRANSLATION_API(inputs=hebrew_text)[0]["translation_text"]
        subtitles.append({
            "start": segment["start"],
            "end": segment["end"],
            "text": hebrew_text,
            "translation": eng_translation if translate else None,
        })
        transcribed_text += f"{hebrew_text} "

    # ensure_ascii=False keeps the Hebrew text readable in the JSON dump.
    subtitles_json = json.dumps(subtitles, ensure_ascii=False, indent=2)
    return audio_path, f"{transcribed_text.strip()}\n\n{subtitles_json}"


# 🔹 Inject WebGPU-compatible JavaScript via `gr.HTML()`
webgpu_script = """
Transcription will appear here.
"""

# 🔹 Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# WhatShutup: Transcribe WhatsApp Voice Messages with WebGPU Support")
    webgpu_component = gr.HTML(webgpu_script)

    audio_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
    translate_checkbox = gr.Checkbox(label="Translate to English?", value=False)

    with gr.Row():
        # Receives the (possibly converted) audio path returned by transcribe_audio.
        audio_player = gr.Audio(type="filepath", label="Playback")
        transcript_output = gr.Textbox(label="Transcription & Subtitles", lines=10)

    submit_btn = gr.Button("Transcribe")
    submit_btn.click(
        transcribe_audio,
        inputs=[audio_input, translate_checkbox],
        outputs=[audio_player, transcript_output],
    )

demo.launch()