import gradio as gr
from pydub import AudioSegment
import os
import speech_recognition as sr
import concurrent.futures

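# Pipeline: normalize the upload to 16 kHz mono WAV, split it into slightly
# overlapping chunks, transcribe the chunks in parallel, then stitch the
# results back together in order. The overlap guards against words being cut
# at chunk boundaries, though text inside an overlapped region may appear
# twice in the final transcript.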
def split_audio(audio_path, chunk_length_ms=60000, overlap_ms=2000):
    audio = AudioSegment.from_file(audio_path)
    chunks = []
    for i in range(0, len(audio), chunk_length_ms - overlap_ms):
        chunks.append(audio[i:i + chunk_length_ms])
    return chunks

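# 16 kHz mono is a safe, compact input format for speech recognition and
# keeps the temporary WAV files small.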
def convert_audio_to_wav(input_path, output_path):
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio.export(output_path, format="wav")

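# Worker function: writes its chunk to its own chunk_<index>.wav and returns
# (index, text) so the caller can reassemble the transcript in order. Note
# that recognize_google, called without an API key, uses the free Google Web
# Speech endpoint, which is rate-limited, so parallel requests may
# occasionally fail with RequestError. Per-index file names avoid collisions
# within one request, but a tempfile-based scheme would be more robust if
# several users transcribe at once.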
def transcribe_chunk_indexed(indexed_chunk_language):
    index, chunk, language = indexed_chunk_language
    recognizer = sr.Recognizer()
    chunk_path = f"chunk_{index}.wav"
    try:
        # Export the chunk to a temporary WAV file for the recognizer to read.
        chunk.export(chunk_path, format="wav")
        with sr.AudioFile(chunk_path) as source:
            audio_data = recognizer.record(source)
            text = recognizer.recognize_google(audio_data, language=language)
        return index, text
    except sr.RequestError:
        return index, "[Error: API unavailable or unresponsive]"
    except sr.UnknownValueError:
        return index, "[Error: Unable to recognize speech]"
    except Exception as e:
        return index, f"[Error: {str(e)}]"
    finally:
        # Remove the temp file even when recognition fails, so failed chunks
        # do not leave stray chunk_<index>.wav files behind.
        if os.path.exists(chunk_path):
            os.remove(chunk_path)

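# Threads rather than processes: each worker spends most of its time blocked
# on the network round-trip to the recognition service, so the GIL is not a
# bottleneck here.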
def transcribe_audio_with_google_parallel(audio_path, chunk_length_ms=60000, overlap_ms=2000, language="en-US"):
    chunks = split_audio(audio_path, chunk_length_ms, overlap_ms)
    indexed_chunks = [(i, chunk, language) for i, chunk in enumerate(chunks)]
    transcription = [""] * len(indexed_chunks)

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(transcribe_chunk_indexed, ic) for ic in indexed_chunks]
        # as_completed yields results out of order; the index returned by each
        # worker puts its text back in the original position.
        for future in concurrent.futures.as_completed(futures):
            idx, text = future.result()
            transcription[idx] = text

    return " ".join(transcription)

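# Gradio handler: convert the upload, run the chunked transcription, and
# always clean up the converted temporary file afterwards.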
def transcribe(audio_file_path, language):
    if audio_file_path is None:
        return "Please upload an audio file."

    try:
        converted_path = audio_file_path + "_converted.wav"
        convert_audio_to_wav(audio_file_path, converted_path)
        temp_path = converted_path
    except Exception as e:
        return f"Error processing audio: {e}"

    transcription = transcribe_audio_with_google_parallel(temp_path, chunk_length_ms=60000, overlap_ms=2000, language=language)

    try:
        os.remove(temp_path)
    except Exception:
        pass

    return transcription

language_options = {
    "English (US)": "en-US",
    "Dutch": "nl-NL",
    "English (UK)": "en-GB",
    "Spanish": "es-ES",
    "French": "fr-FR",
    "German": "de-DE",
    "Hindi": "hi-IN",
    "Chinese (Mandarin)": "zh-CN",
    "Arabic": "ar-SA",
    "Turkish": "tr-TR",
}

with gr.Blocks() as demo:
    gr.Markdown("# Audio to Text Transcription")
    gr.Markdown("Upload an audio file, and we'll transcribe it into text using chunk processing.")

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload audio file (mp3, wav, m4a, ogg)")
        language_dropdown = gr.Dropdown(list(language_options.keys()), label="Select language", value="English (US)")

    transcribe_btn = gr.Button("Transcribe")
    output_text = gr.Textbox(label="Transcription Output", lines=15)

    def on_transcribe(audio_path, lang_name):
        lang_code = language_options[lang_name]
        return transcribe(audio_path, lang_code)

    transcribe_btn.click(on_transcribe, inputs=[audio_input, language_dropdown], outputs=output_text)

demo.launch()