import gradio as gr
import os
import tempfile
import ffmpeg
import json
from huggingface_hub import InferenceApi
from typing import List, Dict, Tuple

# 🔹 Constants
MODEL_NAME: str = "ivrit-ai/faster-whisper-v2-d4"
TRANSLATION_MODEL_NAME: str = "dicta-il/dictalm2.0-GGUF"
TEMP_DIR: str = tempfile.gettempdir()

# 🔹 Load Hugging Face Inference API
ASR_API = InferenceApi(repo_id=MODEL_NAME)
TRANSLATION_API = InferenceApi(repo_id=TRANSLATION_MODEL_NAME)
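# Note: InferenceApi is deprecated in recent huggingface_hub releases; InferenceClient
# is its successor if these calls stop working.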

def convert_audio(audio_path: str) -> str:
    """Converts an audio file to 16kHz WAV format for compatibility."""
    converted_path = os.path.join(TEMP_DIR, "converted.wav")
    (
        ffmpeg
        .input(audio_path)
        .output(converted_path, format="wav", ar="16000", ac=1)  # 16 kHz mono WAV
        .run(overwrite_output=True, quiet=True)
    )
    return converted_path

def transcribe_audio(file: str, translate: bool) -> Tuple[str, str]:
    """Transcribes audio and optionally translates it using Hugging Face API."""
    audio_path = file if file.endswith(".wav") else convert_audio(file)

    with open(audio_path, "rb") as audio_file:
        # InferenceApi takes raw audio bytes through its `data` argument for ASR requests
        result = ASR_API(data=audio_file.read())

    # Whisper-style timestamped segments ({"start", "end", "text"}) are assumed here
    segments = result.get("segments", [])
    subtitles: List[Dict[str, str]] = []
    transcribed_text: str = ""

    for segment in segments:
        hebrew_text = segment["text"]
        start_time = segment["start"]
        end_time = segment["end"]
        eng_translation = ""

        if translate:
            # A translation-pipeline response shape ([{"translation_text": ...}]) is assumed
            eng_translation = TRANSLATION_API(inputs=hebrew_text)[0]["translation_text"]

        subtitles.append({
            "start": start_time,
            "end": end_time,
            "text": hebrew_text,
            "translation": eng_translation if translate else None
        })

        transcribed_text += f"{hebrew_text} "

    # Return the playback audio plus the transcript and timed subtitles, matching
    # the [audio_player, transcript_output] outputs wired up in the UI below.
    subtitles_json = json.dumps(subtitles, ensure_ascii=False, indent=2)
    return audio_path, f"{transcribed_text.strip()}\n\n{subtitles_json}"

# 🔹 Inject WebGPU-compatible JavaScript via `gr.HTML()`
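# Note: browsers do not execute <script> tags injected via innerHTML, so depending on the
# Gradio version this snippet may need to go into the page <head> (e.g. Blocks' head argument
# in newer Gradio releases) to actually run.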
webgpu_script = """
<script type="module">
    import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@latest';

    let asr;
    
    async function loadModel() {
        // Assumption: transformers.js needs a browser-ready ONNX checkpoint, so
        // Xenova/whisper-small is used as a stand-in; the original openai/whisper-large-v3
        // weights cannot be loaded directly in the browser.
        asr = await pipeline("automatic-speech-recognition", "Xenova/whisper-small");
        console.log("Browser ASR model loaded.");
    }

    async function transcribe(audioUrl) {
        if (!asr) {
            console.error("Model not loaded.");
            return;
        }
        const result = await asr(audioUrl);
        document.getElementById("output").innerText = result.text;
    }

    document.getElementById("upload").addEventListener("change", async (event) => {
        const file = event.target.files[0];
        if (!file) return;
        // The pipeline expects a URL (or decoded audio) rather than a File object,
        // so hand it an object URL for the uploaded file.
        const url = URL.createObjectURL(file);
        await transcribe(url);
        URL.revokeObjectURL(url);
    });

    loadModel();
</script>

<input type="file" id="upload" accept="audio/*">
<p id="output">Transcription will appear here.</p>
"""

# 🔹 Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# WhatShutup: Transcribe WhatsApp Voice Messages with WebGPU Support")

    webgpu_component = gr.HTML(webgpu_script)

    audio_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
    translate_checkbox = gr.Checkbox(label="Translate to English?", value=False)

    with gr.Row():
        audio_player = gr.Audio(source="upload", type="filepath", label="Playback")
        transcript_output = gr.Textbox(label="Transcription & Subtitles", lines=10)

    submit_btn = gr.Button("Transcribe")
    submit_btn.click(transcribe_audio, inputs=[audio_input, translate_checkbox], outputs=[audio_player, transcript_output])

demo.launch()