Spaces:

MohammadReza-Halakoo
/

persian-whisper-asr

Running

App Files Files Community

MohammadReza-Halakoo committed 11 days ago

Commit

4885664

verified ·

1 Parent(s): 830e248

Create app.py

Browse files

Files changed (1) hide show

app.py +271 -0

app.py ADDED Viewed

	@@ -0,0 +1,271 @@

+import gradio as gr
+import torch
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import librosa
+import numpy as np
+import logging
+import base64
+import os
+import time
+import datetime
+from html import escape
+from difflib import SequenceMatcher
+from pydub import AudioSegment
+from pydub.silence import detect_nonsilent
# ===== Logging =====
logging.basicConfig(level=logging.INFO)

# ===== Device =====
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
logging.info(f"Using device: {device}")

# ===== Model (Private) =====
# 1) Create a secret named HF_TOKEN under Settings → Secrets.
HF_TOKEN = os.getenv("HF_TOKEN", None)
if HF_TOKEN is None:
    logging.warning("HF_TOKEN is not set. Make sure to add it in Space Settings → Secrets.")

# 2) Put the ID of your own private model here.
# Example: "MohammadReza-Halakoo/1-persian-whisper-large-v"
MODEL_ID = os.getenv("MODEL_ID", "MohammadReza-Halakoo/1-persian-whisper-large-v")
processor = AutoProcessor.from_pretrained(MODEL_ID, use_auth_token=HF_TOKEN)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, use_auth_token=HF_TOKEN).to(device)

# Attention-mask fix (safe): make sure the model config has a pad token id,
# and that it differs from the EOS token id.
if model.config.pad_token_id is None:
    model.config.pad_token_id = processor.tokenizer.pad_token_id
if model.config.pad_token_id == model.config.eos_token_id:
    if processor.tokenizer.pad_token_id == processor.tokenizer.eos_token_id:
        # The tokenizer itself has no distinct pad token: register a new one
        # and grow the embedding matrix to match the enlarged vocabulary.
        # NOTE(review): the added embedding row is untrained — confirm this
        # is intended for the deployed checkpoint.
        processor.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(processor.tokenizer))
    # Both branches end by adopting the tokenizer's (now distinct) pad id.
    model.config.pad_token_id = processor.tokenizer.pad_token_id
+# ===== Audio Utils =====
def load_audio_preserving_quality(audio_path, target_sr=16000):
    """Load an audio file as a mono float32 waveform at ``target_sr``.

    The file is read at its native sample rate, averaged down to one channel
    when it has more than one dimension, resampled only if the native rate
    differs from ``target_sr``, and cleaned of NaN/inf values.

    Args:
        audio_path: Path of the audio file to read.
        target_sr: Sample rate (Hz) of the returned waveform.

    Returns:
        ``(waveform, sample_rate)`` on success, or ``(None, None)`` if any
        step fails (the error is logged).
    """
    try:
        samples, native_sr = librosa.load(audio_path, sr=None, mono=False)
        # Multi-dimensional result means several channels: average over axis 0.
        mono = np.mean(samples, axis=0) if samples.ndim > 1 else samples
        if native_sr != target_sr:
            mono = librosa.resample(mono, orig_sr=native_sr, target_sr=target_sr)
            native_sr = target_sr
        if mono.dtype != np.float32:
            mono = mono.astype(np.float32)
        return np.nan_to_num(mono), native_sr
    except Exception as e:
        logging.error(f"Audio load error: {str(e)}")
        return None, None
def remove_intermediate_silence(audio, sr, silence_thresh=-38, min_silence_len=700, padding=200):
    """Cut long silent stretches out of a mono float waveform.

    The waveform is converted to 16-bit PCM, scanned with pydub's
    ``detect_nonsilent``, and the non-silent ranges (each widened by
    ``padding`` on both sides) are concatenated back together.

    Args:
        audio: 1-D float waveform; values are expected in [-1, 1].
        sr: Sample rate of ``audio`` in Hz.
        silence_thresh: Silence threshold passed to ``detect_nonsilent``.
        min_silence_len: Minimum silence length passed to ``detect_nonsilent``.
        padding: Amount kept around every non-silent range, in the same
            units as the ranges returned by ``detect_nonsilent``.

    Returns:
        ``(processed_audio, sr)``. ``processed_audio`` is an empty float32
        array when nothing non-silent was found. If anything goes wrong the
        error is logged and the input ``(audio, sr)`` is returned unchanged.
    """
    try:
        int16_max = np.iinfo(np.int16).max
        # Clip before the int16 cast: samples outside [-1, 1] would otherwise
        # overflow the 16-bit range and come back as wrapped-around values.
        pcm = (np.clip(audio, -1.0, 1.0) * int16_max).astype(np.int16)
        audio_segment = AudioSegment(
            pcm.tobytes(),
            frame_rate=sr,
            sample_width=2,
            channels=1
        )
        nonsilent_ranges = detect_nonsilent(
            audio_segment,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh
        )
        if not nonsilent_ranges:
            return np.array([], dtype=np.float32), sr

        segment_len = len(audio_segment)
        non_silent_audio = AudioSegment.empty()
        covered_until = 0
        for start, end in nonsilent_ranges:
            # Never start before the end of the previous slice: with a large
            # padding, neighbouring ranges would overlap and the shared audio
            # would be appended twice.
            start = max(covered_until, start - padding)
            end = min(segment_len, end + padding)
            if end <= start:
                continue
            non_silent_audio += audio_segment[start:end]
            covered_until = end

        processed_audio = np.array(non_silent_audio.get_array_of_samples()).astype(np.float32)
        processed_audio /= int16_max
        return processed_audio, sr
    except Exception as e:
        # Best effort: silence removal is optional, so fall back to the input.
        logging.error(f"Silence removal error: {str(e)}")
        return audio, sr
def is_silent(audio, threshold=1e-4):
    """Tell whether a waveform is missing, empty, or quieter than ``threshold``.

    Args:
        audio: Waveform array, or ``None``.
        threshold: RMS level below which the waveform counts as silent.

    Returns:
        ``True`` for ``None`` or zero-length input; otherwise whether the
        root-mean-square of the samples is strictly below ``threshold``.
    """
    missing = audio is None or len(audio) == 0
    if missing:
        return True
    mean_power = np.mean(audio**2)
    return np.sqrt(mean_power) < threshold
def merge_transcriptions(transcriptions, window=50, min_overlap=10):
    """Stitch per-chunk transcriptions together, removing duplicated overlap.

    For each new chunk, the last ``window`` characters of the text merged so
    far are compared with the first ``window`` characters of the chunk. If
    their longest common substring is longer than ``min_overlap`` characters
    it is treated as the overlap: the merged text is cut right after the
    match and the chunk is appended from right after the match. Otherwise
    the chunk is appended after a single space.

    Args:
        transcriptions: Sequence of transcription strings, in audio order.
        window: Number of characters inspected on each side of a seam
            (expected to be positive).
        min_overlap: A match must be strictly longer than this to count.

    Returns:
        The merged text, or ``''`` when ``transcriptions`` is empty.
    """
    if not transcriptions:
        return ''

    merged = transcriptions[0]
    for current in transcriptions[1:]:
        tail = merged[-window:]
        head = current[:window]
        # autojunk only kicks in for long sequences; disable it so a larger
        # window behaves the same way as the default one.
        matcher = SequenceMatcher(None, tail, head, autojunk=False)
        match = matcher.find_longest_match(0, len(tail), 0, len(head))
        if match.size > min_overlap:
            # Drop whatever follows the match in the previous text (typically
            # a word cut off at the chunk boundary); the current chunk carries
            # the continuation, so keeping both would duplicate or garble it.
            keep = len(merged) - len(tail) + match.a + match.size
            merged = merged[:keep] + current[match.b + match.size:]
        else:
            merged += ' ' + current
    return merged
+# ===== Core Inference =====
def _split_into_chunks(audio, chunk_samples, overlap_samples):
    """Cut ``audio`` into windows of ``chunk_samples`` that overlap by ``overlap_samples``."""
    chunks = []
    start = 0
    while start < len(audio):
        end = min(start + chunk_samples, len(audio))
        chunks.append(audio[start:end])
        if end == len(audio):
            break
        start += chunk_samples - overlap_samples
    return chunks


def _build_copy_download_html(transcription):
    """Render the copy/download buttons, embedding the HTML-escaped transcription."""
    escaped_transcription = escape(transcription)
    return f"""
    <div class="copy-download-buttons">
        <button id="copy-button" data-transcription="{escaped_transcription}"
            onclick="
                var t=this.getAttribute('data-transcription');
                if(t){{navigator.clipboard.writeText(t).then(()=>alert('متن کپی شد!'),err=>alert('عدم موفقیت کپی: '+err));}}
                else{{alert('متنی یافت نشد!');}}
            "
            style="padding:8px 16px; background:#4CAF50; color:#fff; border:none; cursor:pointer;">
            کپی متن
        </button>
        <button id="download-button"
            onclick="
                var a=document.querySelector('#download-file a');
                if(a) a.click(); else alert('لینک دانلود یافت نشد!');
            "
            style="padding:8px 16px; background:#008CBA; color:#fff; border:none; cursor:pointer;">
            دانلود متن
        </button>
    </div>
    """


def transcribe_audio(mic=None, upload_audio=None, file=None):
    """Transcribe one audio input and prepare the UI outputs.

    The first non-empty input among ``mic``, ``upload_audio`` and ``file`` is
    used. The audio is loaded at 16 kHz, long silences are removed, the
    waveform is split into overlapping chunks, each chunk is transcribed with
    the module-level ``processor``/``model``, and the chunk texts are merged.

    Args:
        mic: Path of a microphone recording, or ``None``.
        upload_audio: Path of an uploaded audio file, or ``None``.
        file: Uploaded file given either as an object exposing ``.name`` or
            as a plain path; or ``None``.

    Returns:
        A 4-tuple ``(text, text_file_path, buttons_html, audio_path)``.
        ``audio_path`` is only set when ``file`` was supplied. On any failure
        the tuple is ``(user-facing message, None, None, None)``.
    """
    import tempfile  # function-scope import: only needed for the download file

    # Accept both a file-like object (path in .name) and a plain path string.
    file_path = getattr(file, "name", file) if file else None
    audio_path = mic or upload_audio or file_path
    if not audio_path:
        return 'لطفاً یک فایل صوتی یا صدای ضبط‌شده ارسال کنید.', None, None, None

    audio, sr = load_audio_preserving_quality(audio_path, target_sr=16000)
    if audio is None:
        return "خطا در بارگذاری و پردازش صوت.", None, None, None

    audio, sr = remove_intermediate_silence(audio, sr)
    if is_silent(audio):
        return 'صوت ورودی حاوی صدای قابل پردازش نیست.', None, None, None

    # Split into 29-second chunks with a 3-second overlap.
    max_chunk_length = 29
    stride_length = 3
    chunks = _split_into_chunks(audio, max_chunk_length * sr, stride_length * sr)
    if not chunks:
        return 'صوت ورودی خالی است.', None, None, None

    transcriptions = []
    for i, chunk in enumerate(chunks):
        try:
            inputs = processor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
            input_features = inputs.input_features.to(device)
            attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
            with torch.no_grad():
                generated_ids = model.generate(
                    input_features,
                    attention_mask=attention_mask,
                    num_beams=5,
                    length_penalty=1.0,
                    repetition_penalty=1.1,
                    no_repeat_ngram_size=4,
                    temperature=0.9,
                )
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            transcriptions.append(transcription)
        except Exception as e:
            # A failed chunk aborts the whole request rather than returning
            # a transcript with a hole in it.
            logging.error(f"Model error on chunk {i+1}: {str(e)}")
            return "خطا در تبدیل گفتار به متن رخ داد.", None, None, None

    final_transcription = merge_transcriptions(transcriptions)
    if not final_transcription.strip():
        return 'هیچ متنی استخراج نشد.', None, None, None

    # Write the download file under a unique name in the temp directory: a
    # name built only from a second-resolution timestamp can collide when two
    # requests finish in the same second.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        prefix=f"transcription_{timestamp}_",
        suffix=".txt",
        delete=False,
    ) as f:
        f.write(final_transcription)
        filename = f.name

    copy_download_buttons_html = _build_copy_download_html(final_transcription)
    # The input is echoed back for playback only when it came through `file`.
    audio_output = audio_path if file else None
    return final_transcription, filename, copy_download_buttons_html, audio_output
+# ===== Image helper =====
def image_to_base64(image_path):
    """Return the contents of ``image_path`` as a base64 string.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64-encoded file contents as text, or ``None`` when the file
        cannot be read (missing file, permission problem, invalid path).
        The failure is logged instead of being swallowed silently.
    """
    try:
        with open(image_path, "rb") as image_file:
            raw = image_file.read()
    except (OSError, TypeError, ValueError) as exc:
        # Reading is best effort: callers treat None as "no image".
        logging.info("Could not read image %r: %s", image_path, exc)
        return None
    return base64.b64encode(raw).decode('utf-8')
# ===== UI =====
title = "تبدیل گفتار به متن (Whisper فارسی)"

# Footer text rendered below the interface.
article = """
<div style="direction:rtl;">
  این یک دمو برای ماژول گفتار به متن فارسی است.
</div>
"""

# Page styling passed to the Gradio interface.
custom_css = """
body { background-color: rgba(0,0,128,0.7); color:#fff; }
h1 { color:#fff; }
p { color:#ccc; }
button { border:none; padding:10px 20px; border-radius:8px; color:#fff; }
.copy-download-buttons { display:flex; gap:20px; justify-content:center; margin-top:20px; }
textarea { border-radius:8px; padding:10px; background-color: rgba(52,58,64,0.9); color:white; border:none; direction:rtl; text-align:right; }
.gradio-container { border-radius:10px; padding:20px; margin:20px; background-color: rgba(28,30,34,0.9); }
#gradio-app .powered-by, footer { display:none !important; }
"""

# Please place an image at assets/hero.jpg (optional).
image_base64 = image_to_base64("assets/hero.jpg")

# Embed the hero image inline only when it could be read.
if image_base64:
    img_html = f'<img src="data:image/jpeg;base64,{image_base64}" width="400px">'
else:
    img_html = ""

description = f"""
<div style="text-align:center; direction:rtl;">
  <p>با استفاده از مدل خصوصی، صوت شما به متن تبدیل می‌شود. دسترسی مستقیم به فایل‌های مدل امکان‌پذیر نیست.</p>
  <div style="display:flex; justify-content:center;">{img_html}</div>
</div>
"""
# Wire transcribe_audio into a Gradio form. The three inputs map positionally
# onto (mic, upload_audio, file); the four outputs map onto the returned
# (text, text file, buttons HTML, audio path) tuple.
# NOTE(review): `clear_on_submit` and `max_size` are not obviously gr.Audio
# parameters — confirm against the Gradio version pinned for this Space.
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(
            source="microphone",
            type="filepath",
            label="صدای خود را ضبط کنید",
            clear_on_submit=True,
        ),
        gr.Audio(
            source="upload",
            type="filepath",
            label="یک فایل صوتی بارگذاری کنید",
            max_size=300,
            clear_on_submit=True,
        ),
        gr.File(
            label="فایل‌های صوتی بزرگ (اختیاری)",
            type="file",
        ),
    ],
    outputs=[
        gr.Textbox(
            label="متن تبدیل‌شده",
            interactive=False,
            lines=4,
            elem_id="output-text",
            placeholder="نتیجه اینجا نمایش داده می‌شود.",
        ),
        # elem_id is referenced by the download button's onclick handler.
        gr.File(
            label="دانلود متن",
            elem_id="download-file",
        ),
        gr.HTML(
            value="",
            elem_id="copy-download-buttons",
        ),
        gr.Audio(
            label="پخش فایل ورودی",
            type="filepath",
        ),
    ],
    title=title,
    description=description,
    article=article,
    css=custom_css,
    allow_flagging="never",
    live=False,
)

if __name__ == "__main__":
    # On Spaces a plain launch is enough; no port/SSL/share settings are needed.
    interface.launch(show_error=True)