Spaces · Running on Zero
nithinraok committed · Commit 75c1233 · 1 Parent(s): ea560f2
Add space with mp3 via LFS

Browse files
- .gitattributes +1 -0
- app.py +427 -0
- data/example-yt_saTD1u8PorI.mp3 +3 -0
- packages.txt +2 -0
- requirements.txt +2 -0
.gitattributes CHANGED

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
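(The added rule is what `git lfs track "*.mp3"` would append, so the example MP3 added later in this commit is stored as a small Git LFS pointer file rather than as raw audio bytes.)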
    	
app.py ADDED

@@ -0,0 +1,427 @@
from nemo.collections.asr.models import ASRModel
import torch
import gradio as gr
import spaces
import gc
import shutil
from pathlib import Path
from pydub import AudioSegment
import numpy as np
import os
import gradio.themes as gr_themes
import csv
import datetime

device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v3"

model = ASRModel.from_pretrained(model_name=MODEL_NAME)
model.eval()


def start_session(request: gr.Request):
    session_hash = request.session_hash
    session_dir = Path(f'/tmp/{session_hash}')
    session_dir.mkdir(parents=True, exist_ok=True)

    print(f"Session with hash {session_hash} started.")
    return session_dir.as_posix()

def end_session(request: gr.Request):
    session_hash = request.session_hash
    session_dir = Path(f'/tmp/{session_hash}')

    if session_dir.exists():
        shutil.rmtree(session_dir)

    print(f"Session with hash {session_hash} ended.")

def get_audio_segment(audio_path, start_second, end_second):
    if not audio_path or not Path(audio_path).exists():
        print(f"Warning: Audio path '{audio_path}' not found or invalid for clipping.")
        return None
    try:
        start_ms = int(start_second * 1000)
        end_ms = int(end_second * 1000)

        start_ms = max(0, start_ms)
        if end_ms <= start_ms:
            print(f"Warning: End time ({end_second}s) is not after start time ({start_second}s). Adjusting end time.")
            end_ms = start_ms + 100

        audio = AudioSegment.from_file(audio_path)
        clipped_audio = audio[start_ms:end_ms]

        samples = np.array(clipped_audio.get_array_of_samples())
        if clipped_audio.channels == 2:
            samples = samples.reshape((-1, 2)).mean(axis=1).astype(samples.dtype)

        frame_rate = clipped_audio.frame_rate
        if frame_rate <= 0:
            print(f"Warning: Invalid frame rate ({frame_rate}) detected for clipped audio.")
            frame_rate = audio.frame_rate

        if samples.size == 0:
            print(f"Warning: Clipped audio resulted in empty samples array ({start_second}s to {end_second}s).")
            return None

        return (frame_rate, samples)
    except FileNotFoundError:
        print(f"Error: Audio file not found at path: {audio_path}")
        return None
    except Exception as e:
        print(f"Error clipping audio {audio_path} from {start_second}s to {end_second}s: {e}")
        return None

def format_srt_time(seconds: float) -> str:
    """Converts seconds to SRT time format HH:MM:SS,mmm using datetime.timedelta"""
    sanitized_total_seconds = max(0.0, seconds)
    delta = datetime.timedelta(seconds=sanitized_total_seconds)
    total_int_seconds = int(delta.total_seconds())

    hours = total_int_seconds // 3600
    remainder_seconds_after_hours = total_int_seconds % 3600
    minutes = remainder_seconds_after_hours // 60
    seconds_part = remainder_seconds_after_hours % 60
    milliseconds = delta.microseconds // 1000

    return f"{hours:02d}:{minutes:02d}:{seconds_part:02d},{milliseconds:03d}"

def generate_srt_content(segment_timestamps: list) -> str:
    """Generates SRT formatted string from segment timestamps."""
    srt_content = []
    for i, ts in enumerate(segment_timestamps):
        start_time = format_srt_time(ts['start'])
        end_time = format_srt_time(ts['end'])
        text = ts['segment']
        srt_content.append(str(i + 1))
        srt_content.append(f"{start_time} --> {end_time}")
        srt_content.append(text)
        srt_content.append("")
    return "\n".join(srt_content)

@spaces.GPU
def get_transcripts_and_raw_times(audio_path, session_dir):
    if not audio_path:
        gr.Error("No audio file path provided for transcription.", duration=None)
        # Return an update to hide the buttons
        return [], [], None, gr.DownloadButton(label="Download Transcript (CSV)", visible=False), gr.DownloadButton(label="Download Transcript (SRT)", visible=False)

    vis_data = [["N/A", "N/A", "Processing failed"]]
    raw_times_data = [[0.0, 0.0]]
    processed_audio_path = None
    csv_file_path = None
    srt_file_path = None
    original_path_name = Path(audio_path).name
    audio_name = Path(audio_path).stem

    # Initialize button states
    csv_button_update = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
    srt_button_update = gr.DownloadButton(label="Download Transcript (SRT)", visible=False)

    try:
        try:
            gr.Info(f"Loading audio: {original_path_name}", duration=2)
            audio = AudioSegment.from_file(audio_path)
            duration_sec = audio.duration_seconds
        except Exception as load_e:
            gr.Error(f"Failed to load audio file {original_path_name}: {load_e}", duration=None)
            return [["Error", "Error", "Load failed"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update

        resampled = False
        mono = False

        target_sr = 16000
        if audio.frame_rate != target_sr:
            try:
                audio = audio.set_frame_rate(target_sr)
                resampled = True
            except Exception as resample_e:
                gr.Error(f"Failed to resample audio: {resample_e}", duration=None)
                return [["Error", "Error", "Resample failed"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update

        if audio.channels == 2:
            try:
                audio = audio.set_channels(1)
                mono = True
            except Exception as mono_e:
                gr.Error(f"Failed to convert audio to mono: {mono_e}", duration=None)
                return [["Error", "Error", "Mono conversion failed"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
        elif audio.channels > 2:
            gr.Error(f"Audio has {audio.channels} channels. Only mono (1) or stereo (2) supported.", duration=None)
            return [["Error", "Error", f"{audio.channels}-channel audio not supported"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update

        if resampled or mono:
            try:
                processed_audio_path = Path(session_dir, f"{audio_name}_resampled.wav")
                audio.export(processed_audio_path, format="wav")
                transcribe_path = processed_audio_path.as_posix()
                info_path_name = f"{original_path_name} (processed)"
            except Exception as export_e:
                gr.Error(f"Failed to export processed audio: {export_e}", duration=None)
                if processed_audio_path and os.path.exists(processed_audio_path):
                    os.remove(processed_audio_path)
                return [["Error", "Error", "Export failed"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update
        else:
            transcribe_path = audio_path
            info_path_name = original_path_name

        # Flag to track if long audio settings were applied
        long_audio_settings_applied = False
        try:
            model.to(device)
            model.to(torch.float32)
            gr.Info(f"Transcribing {info_path_name} on {device}...", duration=2)

            # Check duration and apply specific settings for long audio
            if duration_sec > 480:  # 8 minutes
                try:
                    gr.Info("Audio longer than 8 minutes. Applying optimized settings for long transcription.", duration=3)
                    print("Applying long audio settings: Local Attention and Chunking.")
                    model.change_attention_model("rel_pos_local_attn", [256, 256])
                    model.change_subsampling_conv_chunking_factor(1)  # 1 = auto select
                    long_audio_settings_applied = True
                except Exception as setting_e:
                    gr.Warning(f"Could not apply long audio settings: {setting_e}", duration=5)
                    print(f"Warning: Failed to apply long audio settings: {setting_e}")
                    # Proceed without long audio settings if applying them failed

            model.to(torch.bfloat16)
            output = model.transcribe([transcribe_path], timestamps=True)

            if not output or not isinstance(output, list) or not output[0] or not hasattr(output[0], 'timestamp') or not output[0].timestamp or 'segment' not in output[0].timestamp:
                gr.Error("Transcription failed or produced unexpected output format.", duration=None)
                # Return an update to hide the buttons
                return [["Error", "Error", "Transcription Format Issue"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update

            segment_timestamps = output[0].timestamp['segment']
            csv_headers = ["Start (s)", "End (s)", "Segment"]
            vis_data = [[f"{ts['start']:.2f}", f"{ts['end']:.2f}", ts['segment']] for ts in segment_timestamps]
            raw_times_data = [[ts['start'], ts['end']] for ts in segment_timestamps]

            # CSV file generation
            try:
                csv_file_path = Path(session_dir, f"transcription_{audio_name}.csv")
                # Use a context manager so the CSV handle is flushed and closed before download
                with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_f:
                    writer = csv.writer(csv_f)
                    writer.writerow(csv_headers)
                    writer.writerows(vis_data)
                print(f"CSV transcript saved to temporary file: {csv_file_path}")
                csv_button_update = gr.DownloadButton(value=csv_file_path, visible=True, label="Download Transcript (CSV)")
            except Exception as csv_e:
                gr.Error(f"Failed to create transcript CSV file: {csv_e}", duration=None)
                print(f"Error writing CSV: {csv_e}")

            if segment_timestamps:
                try:
                    srt_content = generate_srt_content(segment_timestamps)
                    srt_file_path = Path(session_dir, f"transcription_{audio_name}.srt")
                    with open(srt_file_path, 'w', encoding='utf-8') as f:
                        f.write(srt_content)
                    print(f"SRT transcript saved to temporary file: {srt_file_path}")
                    srt_button_update = gr.DownloadButton(value=srt_file_path, visible=True, label="Download Transcript (SRT)")
                except Exception as srt_e:
                    gr.Warning(f"Failed to create transcript SRT file: {srt_e}", duration=5)
                    print(f"Error writing SRT: {srt_e}")

            gr.Info("Transcription complete.", duration=2)
            return vis_data, raw_times_data, audio_path, csv_button_update, srt_button_update

        except torch.cuda.OutOfMemoryError as e:
            error_msg = 'CUDA out of memory. Please try a shorter audio or reduce GPU load.'
            print(f"CUDA OutOfMemoryError: {e}")
            gr.Error(error_msg, duration=None)
            return [["OOM", "OOM", error_msg]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update

        except FileNotFoundError:
            error_msg = f"Audio file for transcription not found: {Path(transcribe_path).name}."
            print(f"Error: Transcribe audio file not found at path: {transcribe_path}")
            gr.Error(error_msg, duration=None)
            return [["Error", "Error", "File not found for transcription"]], [[0.0, 0.0]], audio_path, csv_button_update, srt_button_update

        except Exception as e:
            error_msg = f"Transcription failed: {e}"
            print(f"Error during transcription processing: {e}")
            gr.Error(error_msg, duration=None)
            vis_data = [["Error", "Error", error_msg]]
            raw_times_data = [[0.0, 0.0]]
            return vis_data, raw_times_data, audio_path, csv_button_update, srt_button_update

        finally:
            # --- Model Cleanup ---
            try:
                # Revert settings if they were applied for long audio
                if long_audio_settings_applied:
                    try:
                        print("Reverting long audio settings.")
                        model.change_attention_model("rel_pos")
                        model.change_subsampling_conv_chunking_factor(-1)
                        long_audio_settings_applied = False  # Reset flag
                    except Exception as revert_e:
                        print(f"Warning: Failed to revert long audio settings: {revert_e}")
                        gr.Warning(f"Issue reverting model settings after long transcription: {revert_e}", duration=5)

                # Original cleanup
                if 'model' in locals() and hasattr(model, 'cpu'):
                    if device == 'cuda':
                        model.cpu()
                gc.collect()
                if device == 'cuda':
                    torch.cuda.empty_cache()
            except Exception as cleanup_e:
                print(f"Error during model cleanup: {cleanup_e}")
                gr.Warning(f"Issue during model cleanup: {cleanup_e}", duration=5)
            # --- End Model Cleanup ---

    finally:
        if processed_audio_path and os.path.exists(processed_audio_path):
            try:
                os.remove(processed_audio_path)
                print(f"Temporary audio file {processed_audio_path} removed.")
            except Exception as e:
                print(f"Error removing temporary audio file {processed_audio_path}: {e}")

def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
    if not isinstance(raw_ts_list, list):
        print(f"Warning: raw_ts_list is not a list ({type(raw_ts_list)}). Cannot play segment.")
        return gr.Audio(value=None, label="Selected Segment")

    if not current_audio_path:
        print("No audio path available to play segment from.")
        return gr.Audio(value=None, label="Selected Segment")

    selected_index = evt.index[0]

    if selected_index < 0 or selected_index >= len(raw_ts_list):
        print(f"Invalid index {selected_index} selected for list of length {len(raw_ts_list)}.")
        return gr.Audio(value=None, label="Selected Segment")

    if not isinstance(raw_ts_list[selected_index], (list, tuple)) or len(raw_ts_list[selected_index]) != 2:
        print(f"Warning: Data at index {selected_index} is not in the expected format [start, end].")
        return gr.Audio(value=None, label="Selected Segment")

    start_time_s, end_time_s = raw_ts_list[selected_index]

    print(f"Attempting to play segment: {current_audio_path} from {start_time_s:.2f}s to {end_time_s:.2f}s")

    segment_data = get_audio_segment(current_audio_path, start_time_s, end_time_s)

    if segment_data:
        print("Segment data retrieved successfully.")
        return gr.Audio(value=segment_data, autoplay=True, label=f"Segment: {start_time_s:.2f}s - {end_time_s:.2f}s", interactive=False)
    else:
        print("Failed to get audio segment data.")
        return gr.Audio(value=None, label="Selected Segment")

article = (
    "<p style='font-size: 1.1em;'>"
    "This demo showcases <code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3'>parakeet-tdt-0.6b-v3</a></code>, a 600-million-parameter <strong>multilingual</strong> model designed for high-quality speech recognition with automatic language detection."
    "</p>"
    "<p><strong style='color: red; font-size: 1.2em;'>Key Features:</strong></p>"
    "<ul style='font-size: 1.1em;'>"
    "    <li>Automatic punctuation and capitalization</li>"
    "    <li>Accurate word-level timestamps (click on a segment in the table below to play it!)</li>"
    "    <li>Multilingual transcription across 25 European languages with automatic language detection</li>"
    "    <li>Long audio transcription: up to 24 minutes with full attention (A100 80GB) or up to 3 hours with local attention</li>"
    "</ul>"
    "<p style='font-size: 1.1em;'>"
    "<strong>Supported Languages:</strong> bg, hr, cs, da, nl, en, et, fi, fr, de, el, hu, it, lv, lt, mt, pl, pt, ro, sk, sl, es, sv, ru, uk"
    "</p>"
    "<p style='font-size: 1.1em;'>"
    "This model is <strong>available for commercial and non-commercial use</strong> (CC BY 4.0)."
    "</p>"
    "<p style='text-align: center;'>"
    "<a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3' target='_blank'>Learn more about the Model</a> | "
    "<a href='https://arxiv.org/abs/2305.05084' target='_blank'>Fast Conformer paper</a> | "
    "<a href='https://arxiv.org/abs/2304.06795' target='_blank'>TDT paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
    "</p>"
)

examples = [
    ["data/example-yt_saTD1u8PorI.mp3"],
]

# Define an NVIDIA-inspired theme
nvidia_theme = gr_themes.Default(
    primary_hue=gr_themes.Color(
        c50="#E6F1D9",  # Lightest green
        c100="#CEE3B3",
        c200="#B5D58C",
        c300="#9CC766",
        c400="#84B940",
        c500="#76B900",  # NVIDIA Green
        c600="#68A600",
        c700="#5A9200",
        c800="#4C7E00",
        c900="#3E6A00",  # Darkest green
        c950="#2F5600"
    ),
    neutral_hue="gray",  # Use gray for neutral elements
    font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
).set()

# Apply the custom theme
with gr.Blocks(theme=nvidia_theme) as demo:
    model_display_name = MODEL_NAME.split('/')[-1] if '/' in MODEL_NAME else MODEL_NAME
    gr.Markdown(f"<h1 style='text-align: center; margin: 0 auto;'>Speech Transcription with {model_display_name}</h1>")
    gr.HTML(article)

    current_audio_path_state = gr.State(None)
    raw_timestamps_list_state = gr.State([])

    session_dir = gr.State()
    demo.load(start_session, outputs=[session_dir])

    with gr.Tabs():
        with gr.TabItem("Audio File"):
            file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
            gr.Examples(examples=examples, inputs=[file_input], label="Example Audio Files (Click to Load)")
            file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")

        with gr.TabItem("Microphone"):
            mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
            mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")

    gr.Markdown("---")
    gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results (Click row to play segment)</strong></p>")

    # Define the DownloadButton *before* the DataFrame
    with gr.Row():
        download_btn_csv = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
        download_btn_srt = gr.DownloadButton(label="Download Transcript (SRT)", visible=False)

    vis_timestamps_df = gr.DataFrame(
        headers=["Start (s)", "End (s)", "Segment"],
        datatype=["number", "number", "str"],
        wrap=True,
        label="Transcription Segments"
    )

    # selected_segment_player was defined after download_btn previously, keep it after df for layout
    selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)

    mic_transcribe_btn.click(
        fn=get_transcripts_and_raw_times,
        inputs=[mic_input, session_dir],
        outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn_csv, download_btn_srt],
        api_name="transcribe_mic"
    )

    file_transcribe_btn.click(
        fn=get_transcripts_and_raw_times,
        inputs=[file_input, session_dir],
        outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn_csv, download_btn_srt],
        api_name="transcribe_file"
    )

    vis_timestamps_df.select(
        fn=play_segment,
        inputs=[raw_timestamps_list_state, current_audio_path_state],
        outputs=[selected_segment_player],
    )

    demo.unload(end_session)

if __name__ == "__main__":
    print("Launching Gradio Demo...")
    demo.queue()
    demo.launch()
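For reference, the core transcription path that app.py wraps in Gradio can be exercised on its own. A minimal sketch, assuming nemo_toolkit[asr] is installed and using a hypothetical input file speech_16k_mono.wav (a 16 kHz mono WAV, the format app.py converts uploads to before transcribing):

    from nemo.collections.asr.models import ASRModel

    # Load the same checkpoint app.py uses and run timestamped transcription.
    model = ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")
    model.eval()

    output = model.transcribe(["speech_16k_mono.wav"], timestamps=True)  # hypothetical input path
    for ts in output[0].timestamp["segment"]:
        print(f"{ts['start']:6.2f}s - {ts['end']:6.2f}s  {ts['segment']}")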
    	
data/example-yt_saTD1u8PorI.mp3 ADDED

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3cb340c3b868eb3695cdb06683decbff217331c2459a69394be8d3ad3b53bdf0
size 2493472
    	
packages.txt ADDED

@@ -0,0 +1,2 @@
ffmpeg
libsndfile1
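These are system packages rather than Python dependencies: pydub shells out to the ffmpeg binary to decode formats like MP3, and libsndfile is the native library commonly needed for WAV I/O in the ASR stack. A minimal sketch of the preprocessing app.py performs once ffmpeg is available (the output filename is illustrative):

    from pydub import AudioSegment

    # Decode via ffmpeg, then downmix/resample to the 16 kHz mono WAV the model expects.
    audio = AudioSegment.from_file("data/example-yt_saTD1u8PorI.mp3")
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio.export("example_16k_mono.wav", format="wav")  # illustrative output path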
    	
requirements.txt ADDED

@@ -0,0 +1,2 @@
Cython
git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]