Spaces:

Loren
/

Voxtral_Mini_Evaluation

Running on Zero

App Files Files Community

Loren committed 21 days ago

Commit

8559f42

verified ·

1 Parent(s): c4e20d8

Upload app.py

Browse files

Files changed (1) hide show

app.py +241 -100

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import torch
 from transformers import AutoProcessor, VoxtralForConditionalGeneration
 from pydub import AudioSegment
-from pydub.silence import split_on_silence, detect_silence
 import yt_dlp
 import requests
 import validators
@@ -13,8 +13,103 @@ import re
 import glob
 import spaces
 #### Functions
 @spaces.GPU
 def process_transcript(language: str, audio_path: str) -> str:
     """Process the audio file to return its transcription.
@@ -25,66 +120,105 @@ def process_transcript(language: str, audio_path: str) -> str:
     Returns:
         The transcribed text of the audio.
     """
     if audio_path is None:
-        return "Please provide some input audio: either upload an audio file or use the microphone."
     else:
         id_language = dict_languages[language]
-        inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
-        inputs = inputs.to(device, dtype=torch.bfloat16)
-        outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
-        decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-        return decoded_outputs[0]
 ###
 @spaces.GPU
 def process_translate(language: str, audio_path: str) -> str:
-    conversation = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "audio",
-                    "path": audio_path,
-                },
-                {"type": "text", "text": "Translate this in "+language},
-            ],
-        }
-    ]
-    inputs = processor.apply_chat_template(conversation)
-    inputs = inputs.to(device, dtype=torch.bfloat16)
-    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
-    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-    return decoded_outputs[0]
 ###
 @spaces.GPU
 def process_chat(question: str, audio_path: str) -> str:
-    conversation = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "audio",
-                    "path": audio_path,
-                },
-                {"type": "text", "text": question},
-            ],
-        }
-    ]
-    inputs = processor.apply_chat_template(conversation)
-    inputs = inputs.to(device, dtype=torch.bfloat16)
-    outputs = model.generate(**inputs, max_new_tokens=500)
-    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-    return decoded_outputs[0]
 ###
 def disable_buttons():
@@ -94,6 +228,30 @@ def enable_buttons():
     return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
 ###
 def secure_download_from_url(url: str):
     """
     Validates a URL and downloads the file if it is an authorized media.
@@ -269,57 +427,15 @@ def clear_audio():
     return None, None, None, None
 ###
-### Initializations
-MAX_TOKENS = 32000
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"*** Device: {device}")
-model_name = 'mistralai/Voxtral-Mini-3B-2507'
-processor = AutoProcessor.from_pretrained(model_name)
-model = VoxtralForConditionalGeneration.from_pretrained(model_name,
-                                                        torch_dtype=torch.bfloat16,
-                                                        device_map=device)
-# Supported languages
-dict_languages = {"English": "en",
-                  "French": "fr",
-                  "German": "de",
-                  "Spanish": "es",
-                  "Italian": "it",
-                  "Portuguese": "pt",
-                  "Dutch": "nl",
-                  "Hindi": "hi"}
-# Whitelist of allowed MIME types for audio and video
-ALLOWED_MIME_TYPES = {
-    # Audio
-    'audio/mpeg', 'audio/wav', 'audio/wave', 'audio/x-wav', 'audio/x-pn-wav',
-    'audio/ogg', 'audio/vorbis', 'audio/aac', 'audio/mp4', 'audio/flac',
-    'audio/x-flac', 'audio/opus', 'audio/webm',
-    # Vidéo
-    'video/mp4', 'video/mpeg', 'video/ogg', 'video/webm', 'video/quicktime',
-    'video/x-msvideo', 'video/x-matroska'
-}
-# Maximum allowed file size (in bytes). Ex: 1 GB
-MAX_FILE_SIZE = 1 * 1024 * 1024 * 1024  # 1 GB
-# Directory where the files will be saved
-DOWNLOAD_DIR = "downloaded_files"
-if not os.path.exists(DOWNLOAD_DIR):
-    os.makedirs(DOWNLOAD_DIR)
 #### Gradio interface
 with gr.Blocks(title="Voxtral") as voxtral:
-    gr.Markdown("# **Voxtral Mini Evaluation**")
-    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
-    capabilities while retaining best-in-class text performance.
-    #### It excels at speech transcription, translation and audio understanding.""")
-    with gr.Accordion("🔎 More on Voxtral", open=False):
-        gr.Markdown("""## **Key Features:**
 #### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
 ##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
@@ -329,6 +445,9 @@ with gr.Blocks(title="Voxtral") as voxtral:
 ##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
 ##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
     gr.Markdown("### **1.Choose the audio:**")
     sel_audio = gr.State()
@@ -336,12 +455,12 @@ with gr.Blocks(title="Voxtral") as voxtral:
         with gr.Tabs():
             with gr.Tab("From record or file upload"):
                 gr.Markdown("### **Upload an audio file, record via microphone, or select a demo file:**")
-                gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
                 sel_audio1 = gr.Audio(sources=["upload", "microphone"], type="filepath",
                                     label="Set an audio file to process it:")
-                example = [["mapo_tofu.mp3"]]
                 gr.Examples(
-                    examples=example,
                     inputs=sel_audio1,
                     outputs=None,
                     fn=None,
@@ -363,6 +482,15 @@ with gr.Blocks(title="Voxtral") as voxtral:
                 gr.Markdown("### **Enter the url of the file (mp3, wav, mp4, ...):**")
                 url_input2 = gr.Textbox(label="URL (MP3 or MP4 file)",
                                        placeholder="https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4")
                 download_button2 = gr.Button("Check and upload", variant="primary")
                 input_audio2 = gr.Audio()
                 status_output2 = gr.Markdown()
@@ -416,23 +544,36 @@ with gr.Blocks(title="Voxtral") as voxtral:
                 )
                 submit_transcript = gr.Button("Extract transcription", variant="primary")
                 text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
         with gr.Column():
             with gr.Accordion("🔁 Translation", open=True):
                 sel_translate_language = gr.Dropdown(
                     choices=list(dict_languages.keys()),
                     value="English",
                     label="Select the language for translation:"
                 )
                 submit_translate = gr.Button("Translate audio file", variant="primary")
                 text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
         with gr.Column():
             with gr.Accordion("🤖 Ask audio file", open=True):
                 question_chat = gr.Textbox(label="Enter your question about audio file:", placeholder="Enter your question about audio file")
-                submit_chat = gr.Button("Ask audio file:", variant="primary")
                 text_chat = gr.Textbox(label="💬 Model answer", lines=10)
 ### Processing
@@ -444,7 +585,7 @@ with gr.Blocks(title="Voxtral") as voxtral:
     ).then(
         fn=process_transcript,
         inputs=[sel_language, sel_audio],
-        outputs=text_transcript
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],
@@ -458,7 +599,7 @@ with gr.Blocks(title="Voxtral") as voxtral:
     ).then(
         fn=process_translate,
         inputs=[sel_translate_language, sel_audio],
-        outputs=text_translate
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],
@@ -472,7 +613,7 @@ with gr.Blocks(title="Voxtral") as voxtral:
     ).then(
         fn=process_chat,
         inputs=[question_chat, sel_audio],
-        outputs=text_chat
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],
@@ -481,4 +622,4 @@ with gr.Blocks(title="Voxtral") as voxtral:
 ### Launch the app
 if __name__ == "__main__":
-    voxtral.queue().launch()

 import torch
 from transformers import AutoProcessor, VoxtralForConditionalGeneration
 from pydub import AudioSegment
+from pydub.silence import detect_silence
 import yt_dlp
 import requests
 import validators
 import glob
 import spaces
+### Initializations
+# Upper bound on generated tokens for transcription and translation
+MAX_TOKENS = 32000
+# Run on GPU when one is available, otherwise fall back to CPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"*** Device: {device}")
+model_name = 'mistralai/Voxtral-Mini-3B-2507'
+# Processor and model are loaded once, at import time, and shared by every request
+processor = AutoProcessor.from_pretrained(model_name)
+model = VoxtralForConditionalGeneration.from_pretrained(model_name,
+                                                        torch_dtype=torch.bfloat16,
+                                                        device_map=device)
+# Supported languages (display name -> language code passed to the processor)
+dict_languages = {"English": "en",
+                  "French": "fr",
+                  "German": "de",
+                  "Spanish": "es",
+                  "Italian": "it",
+                  "Portuguese": "pt",
+                  "Dutch": "nl",
+                  "Hindi": "hi"}
+# Whitelist of allowed MIME types for audio and video
+ALLOWED_MIME_TYPES = {
+    # Audio
+    'audio/mpeg', 'audio/wav', 'audio/wave', 'audio/x-wav', 'audio/x-pn-wav',
+    'audio/ogg', 'audio/vorbis', 'audio/aac', 'audio/mp4', 'audio/flac',
+    'audio/x-flac', 'audio/opus', 'audio/webm',
+    # Video
+    'video/mp4', 'video/mpeg', 'video/ogg', 'video/webm', 'video/quicktime',
+    'video/x-msvideo', 'video/x-matroska'
+}
+# Maximum allowed file size (in bytes). Ex: 1 GB
+MAX_FILE_SIZE = 1 * 1024 * 1024 * 1024  # 1 GB
+# Directory where the files will be saved
+DOWNLOAD_DIR = "downloaded_files"
+# exist_ok avoids the check-then-create race of a separate os.path.exists() test
+os.makedirs(DOWNLOAD_DIR, exist_ok=True)
+# Maximum audio duration processed in one pass, in milliseconds (30 minutes)
+MAX_LEN = 1800000
+# One second of silence (duration given in milliseconds)
+one_second_silence = AudioSegment.silent(duration=1000)
 #### Functions
+@spaces.GPU
+def chunks_creation(audio_path):
+    """Split an audio file into chunks that each fit within ``MAX_LEN``.
+
+    Audio no longer than ``MAX_LEN`` is returned untouched. Longer audio is cut
+    at detected silences: consecutive speech intervals are merged greedily while
+    the chunk (plus one second of trailing padding) stays within ``MAX_LEN``,
+    and every chunk is exported as ``chunk_<i>.wav`` in the working directory.
+
+    Args:
+        audio_path: Path of the audio file to inspect.
+
+    Returns:
+        A ``(list_audio_path, status)`` pair: the paths to process (the original
+        path, the chunk paths, or an empty list when chunking failed) and a
+        ``gr.Markdown`` status message.
+    """
+    list_audio_path = [audio_path]
+    audio = AudioSegment.from_file(audio_path)
+    status = gr.Markdown("👍 Audio duration less than max")
+    # Input short enough: nothing to split
+    if len(audio) <= MAX_LEN:
+        return list_audio_path, status
+    list_audio_path = []
+    # The padding must be a duration in ms: an AudioSegment cannot be added to an int
+    pad_ms = len(one_second_silence)
+    max_speech = MAX_LEN - pad_ms
+    try:
+        # Silent intervals, as (start, stop) positions in milliseconds
+        list_silent = detect_silence(audio, min_silence_len=300,
+                # silent if 14 dB quieter than the average loudness of the audio
+                silence_thresh=audio.dBFS-14, seek_step=100)
+        # Speech intervals are the gaps between the silent intervals
+        list_speech = []
+        current_start = 0
+        for start, stop in list_silent:
+            if current_start < start:
+                list_speech.append((current_start, start))
+            current_start = stop
+        # Add last interval if needed
+        if current_start < len(audio):
+            list_speech.append((current_start, len(audio)))
+        if not list_speech:
+            raise ValueError("no speech detected in the audio")
+        # A single speech interval longer than the limit is cut at fixed positions
+        list_bounded = []
+        for start, end in list_speech:
+            while end - start > max_speech:
+                list_bounded.append((start, start + max_speech))
+                start += max_speech
+            list_bounded.append((start, end))
+        # Determination of chunks, to fit within the maximum duration;
+        # the padding is added exactly once, when a chunk is closed
+        list_chunks = []
+        deb_chunk, fin_chunk = list_bounded[0]
+        for start, end in list_bounded[1:]:
+            if end + pad_ms - deb_chunk <= MAX_LEN:
+                fin_chunk = end
+            else:
+                list_chunks.append((deb_chunk, fin_chunk + pad_ms))
+                deb_chunk, fin_chunk = start, end
+        list_chunks.append((deb_chunk, fin_chunk + pad_ms))
+        # Save chunks
+        for i, (start, stop) in enumerate(list_chunks):
+            chunk_path = f"chunk_{i}.wav"
+            audio[start:stop].export(chunk_path, format="wav")
+            list_audio_path.append(chunk_path)
+        status = gr.Markdown(f"✅ **Success!** {len(list_audio_path)} chunks saved.")
+    except Exception as e:
+        # A failure always yields an empty list, never a partial set of chunks
+        list_audio_path = []
+        status = gr.Markdown(f"❌ **Unexpected error during chunks creation:** {e}")
+    return list_audio_path, status
+###
 @spaces.GPU
 def process_transcript(language: str, audio_path: str) -> str:
     """Process the audio file to return its transcription.
     Returns:
         The transcribed text of the audio.
+        The status of transcription : with or without chunking.
     """
+    # NOTE: the value actually returned is a (result, status) pair
+    result = ""
+    if audio_path is None:
+        return result, gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
+    id_language = dict_languages[language]
+    # Verification of the duration, for possible division into chunks
+    list_audio_path, status = chunks_creation(audio_path)
+    if not list_audio_path:
+        # Chunking failed: report its status instead of announcing a successful empty transcription
+        return result, status
+    # Transcription process, one pass per chunk
+    list_text = []
+    try:
+        for path in list_audio_path:
+            inputs = processor.apply_transcrition_request(language=id_language,
+                                                          audio=path, model_id=model_name)
+            inputs = inputs.to(device, dtype=torch.bfloat16)
+            outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
+            # Keep only the newly generated tokens, not the prompt
+            decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
+                                                     skip_special_tokens=True)
+            list_text.append(decoded_outputs[0])
+        status = gr.Markdown("✅ **Success!** Transcription done.")
+    except Exception as e:
+        status = gr.Markdown(f"❌ **Unexpected error during transcription:** {e}")
+    # Chunks are separated by a space so that words at chunk borders are not glued together;
+    # text already transcribed is kept even when a later chunk fails
+    result = " ".join(list_text)
+    return result, status
 ###
 @spaces.GPU
 def process_translate(language: str, audio_path: str) -> str:
+    """Translate the speech of an audio file into the requested language.
+
+    Args:
+        language: Name of the target language; it is appended to the text prompt.
+        audio_path: Path of the audio file, or None when no audio was provided.
+
+    Returns:
+        A ``(result, status)`` pair: the generated text (empty when the audio is
+        missing or the generation failed) and a ``gr.Markdown`` status message.
+    """
+    result = ""
+    if audio_path is None:
+        return result, gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
+    try:
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio",
+                        "path": audio_path,
+                    },
+                    {"type": "text", "text": "Translate this in "+language},
+                ],
+            }
+        ]
+        inputs = processor.apply_chat_template(conversation)
+        inputs = inputs.to(device, dtype=torch.bfloat16)
+        outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
+        # Keep only the newly generated tokens, not the prompt
+        decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+        result = decoded_outputs[0]
+        # Same status type as the other branches
+        status = gr.Markdown("✅ **Success!** Translation done.")
+    except Exception as e:
+        status = gr.Markdown(f"❌ **Unexpected error during translation:** {e}")
+    return result, status
 ###
 @spaces.GPU
 def process_chat(question: str, audio_path: str) -> str:
+    """Answer a question about the content of an audio file.
+
+    Args:
+        question: Text of the question, sent to the model together with the audio.
+        audio_path: Path of the audio file, or None when no audio was provided.
+
+    Returns:
+        A ``(result, status)`` pair: the model answer (empty when the audio is
+        missing or the generation failed) and a ``gr.Markdown`` status message.
+    """
+    result = ""
+    if audio_path is None:
+        return result, gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
+    try:
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio",
+                        "path": audio_path,
+                    },
+                    {"type": "text", "text": question},
+                ],
+            }
+        ]
+        inputs = processor.apply_chat_template(conversation)
+        inputs = inputs.to(device, dtype=torch.bfloat16)
+        outputs = model.generate(**inputs, max_new_tokens=500)
+        # Keep only the newly generated tokens, not the prompt
+        decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+        result = decoded_outputs[0]
+        # Messages name the chat task (they previously mentioned translation)
+        status = gr.Markdown("✅ **Success!** Answer generated.")
+    except Exception as e:
+        status = gr.Markdown(f"❌ **Unexpected error during chat:** {e}")
+    return result, status
 ###
 def disable_buttons():
     return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
 ###
+def clear_audio():
+    """Return four ``None`` values.
+
+    Presumably wired to four Gradio outputs in order to reset them — confirm
+    against the event wiring, which is not visible here.
+    """
+    return None, None, None, None
+###
+@spaces.GPU
+def voice_extract_demucs():
+    """Run the demucs command line on ``audio_file.wav`` to isolate the vocals.
+
+    Returns:
+        A ``(voice_path, voice_path, status)`` triple on success, where
+        ``voice_path`` is the expected location of the vocals file and
+        ``status`` a ``gr.Markdown`` message; ``(None, None, status)`` when
+        anything goes wrong.
+    """
+    try:
+        # check=True turns a non-zero exit code of demucs into an exception
+        subprocess.run(
+            ["demucs", "--two-stems=vocals", "--out", "demucs", "audio_file.wav"],
+            check=True,
+        )
+        vocals_file = os.path.join("demucs", "htdemucs", "audio_file", "vocals.wav")
+        report = gr.Markdown("✅ **Success!** Voice extracted.")
+        return vocals_file, vocals_file, report
+    except Exception as e:
+        return None, None, gr.Markdown(f"❌ **Error:** An unexpected ERROR occurred: {e}")
+###
 def secure_download_from_url(url: str):
     """
     Validates a URL and downloads the file if it is an authorized media.
     return None, None, None, None
 ###
 #### Gradio interface
 with gr.Blocks(title="Voxtral") as voxtral:
+    with gr.Row():
+        gr.Markdown("# **Voxtral Mini Evaluation**")
+        with gr.Accordion("🔎 More on Voxtral", open=False):
+            gr.Markdown("""## **Key Features:**
 #### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
 ##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
 ##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
 ##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
+    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
+    capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and \
+    audio understanding. Available languages: English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian.""")
     gr.Markdown("### **1.Choose the audio:**")
     sel_audio = gr.State()
         with gr.Tabs():
             with gr.Tab("From record or file upload"):
                 gr.Markdown("### **Upload an audio file, record via microphone, or select a demo file:**")
+                gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription; if longer, it will be cut into chunks)*")
                 sel_audio1 = gr.Audio(sources=["upload", "microphone"], type="filepath",
                                     label="Set an audio file to process it:")
+                example1 = [["mapo_tofu.mp3"]]
                 gr.Examples(
+                    examples=example1,
                     inputs=sel_audio1,
                     outputs=None,
                     fn=None,
                 gr.Markdown("### **Enter the url of the file (mp3, wav, mp4, ...):**")
                 url_input2 = gr.Textbox(label="URL (MP3 or MP4 file)",
                                        placeholder="https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4")
+                example2 = [["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4"]]
+                gr.Examples(
+                    examples=example2,
+                    inputs=url_input2,
+                    outputs=None,
+                    fn=None,
+                    cache_examples=False,
+                    run_on_click=False
+                )
                 download_button2 = gr.Button("Check and upload", variant="primary")
                 input_audio2 = gr.Audio()
                 status_output2 = gr.Markdown()
                 )
                 submit_transcript = gr.Button("Extract transcription", variant="primary")
                 text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
+                status_transcript = gr.Markdown()
         with gr.Column():
             with gr.Accordion("🔁 Translation", open=True):
+                list_language = list(dict_languages.keys())
+                list_language.pop(list_language.index(sel_language.value)) # Fix: Access the value of the dropdown
                 sel_translate_language = gr.Dropdown(
                     choices=list(dict_languages.keys()),
                     value="English",
                     label="Select the language for translation:"
                 )
                 submit_translate = gr.Button("Translate audio file", variant="primary")
                 text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
+                status_translate = gr.Markdown()
         with gr.Column():
             with gr.Accordion("🤖 Ask audio file", open=True):
                 question_chat = gr.Textbox(label="Enter your question about audio file:", placeholder="Enter your question about audio file")
+                submit_chat = gr.Button("Ask audio file", variant="primary")
+                example_chat = [["What is the subject of this audio file?"], ["Quels sont les ingrédients ?"]]
+                gr.Examples(
+                    examples=example_chat,
+                    inputs=question_chat,
+                    outputs=None,
+                    fn=None,
+                    cache_examples=False,
+                    run_on_click=False
+                )
                 text_chat = gr.Textbox(label="💬 Model answer", lines=10)
+                status_chat = gr.Markdown()
 ### Processing
     ).then(
         fn=process_transcript,
         inputs=[sel_language, sel_audio],
+        outputs=[text_transcript, status_transcript]
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],
     ).then(
         fn=process_translate,
         inputs=[sel_translate_language, sel_audio],
+        outputs=[text_translate, status_translate]
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],
     ).then(
         fn=process_chat,
         inputs=[question_chat, sel_audio],
+        outputs=[text_chat, status_chat]
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],
 ### Launch the app
 if __name__ == "__main__":
+    voxtral.queue().launch(debug=True)