Spaces:

MohammadReza-Halakoo
/

persian-whisper-asr

Sleeping

App Files Files Community

assets

by MohammadReza-Halakoo - opened 7 days ago

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+149

-176

This PR is in draft mode

Files changed (7) hide show

.gitattributes +0 -4
app.py +146 -163
assets/.gitattributes +0 -2
assets/.gitkeep +0 -0
assets/hero.jpg +0 -3
packages.txt +0 -2
requirements.txt +3 -2

.gitattributes CHANGED Viewed

@@ -33,7 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-*.jpg filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text
-*.wav filter=lfs diff=lfs merge=lfs -text
-*.mp3 filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,86 +1,52 @@
-# app.py — Persian Whisper ASR (HF Spaces friendly)
-import os, time, base64, datetime, logging
-from html import escape
-from difflib import SequenceMatcher
 import gradio as gr
 import torch
-import numpy as np
-import librosa
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 # ===== Logging =====
 logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger("persian-whisper-space")
-# ===== Env =====
 HF_TOKEN = os.getenv("HF_TOKEN", None)
 if HF_TOKEN is None:
-    logger.warning("HF_TOKEN is not set. Add it in Space Settings → Secrets.")
-# مدل پیش‌فرض (می‌توانی هر موقع عوضش کنی)
-MODEL_ID = os.getenv(
-    "MODEL_ID",
-    "MohammadReza-Halakoo/Whisper-Small-PersianASR-20-percent-17-0"
-)
-# اگر خواستی حذفِ سکوت فعال شود: در Settings→Variables مقدار 1 بگذار
-ENABLE_SILENCE_REMOVAL = os.getenv("ENABLE_SILENCE_REMOVAL", "0") == "1"
-# ===== Device & dtype =====
-if torch.cuda.is_available():
-    device = "cuda"
-    torch_dtype = torch.float16
-    logger.info("GPU detected → using CUDA + float16")
-else:
-    device = "cpu"
-    try:
-        # روی CPU، اگر bf16 پشتیبانی نشود، می‌رویم روی float32
-        torch_dtype = torch.bfloat16  # اکثر اوقات امن است؛ اگر خطا داد، except پایین می‌گیرد
-        _ = torch.tensor([0], dtype=torch_dtype)  # sanity check
-    except Exception:
-        torch_dtype = torch.float32
-    logger.info(f"No GPU detected → falling back to CPU + {torch_dtype}")
-# ===== Load model =====
-processor = AutoProcessor.from_pretrained(MODEL_ID, token=HF_TOKEN)
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    MODEL_ID,
-    token=HF_TOKEN,
-    torch_dtype=torch_dtype if device == "cuda" else None,  # روی CPU بهتره float32/bf16 بماند
-    low_cpu_mem_usage=True,
-    device_map="auto" if device == "cuda" else None
-).to(device)
-# Pad token safety
-if getattr(model.config, "pad_token_id", None) is None:
     model.config.pad_token_id = processor.tokenizer.pad_token_id
 if model.config.pad_token_id == model.config.eos_token_id:
     if processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id:
         model.config.pad_token_id = processor.tokenizer.pad_token_id
     else:
-        processor.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         model.resize_token_embeddings(len(processor.tokenizer))
         model.config.pad_token_id = processor.tokenizer.pad_token_id
-# ===== Optional prompt ids (fa/transcribe) =====
-try:
-    prompt_ids = processor.get_decoder_prompt_ids(language="farsi", task="transcribe")
-except Exception:
-    prompt_ids = None
-# ===== Audio utils =====
-def resolve_path(x):
-    if not x:
-        return None
-    if isinstance(x, str):
-        return x
-    if hasattr(x, "name"):
-        return x.name
-    if isinstance(x, dict) and "name" in x:
-        return x["name"]
-    return None
 def load_audio_preserving_quality(audio_path, target_sr=16000):
     try:
         audio, sr = librosa.load(audio_path, sr=None, mono=False)
@@ -94,152 +60,165 @@ def load_audio_preserving_quality(audio_path, target_sr=16000):
         audio = np.nan_to_num(audio)
         return audio, sr
     except Exception as e:
-        logger.exception(f"Audio load error: {e}")
         return None, None
 def remove_intermediate_silence(audio, sr, silence_thresh=-38, min_silence_len=700, padding=200):
-    if not ENABLE_SILENCE_REMOVAL:
-        return audio, sr
     try:
-        # import اینجا تا اگر pydub/ffmpeg نصب نبود، کل برنامه crash نکند
-        from pydub import AudioSegment
-        from pydub.silence import detect_nonsilent
         audio_segment = AudioSegment(
             (audio * np.iinfo(np.int16).max).astype(np.int16).tobytes(),
-            frame_rate=sr, sample_width=2, channels=1
         )
-        ranges = detect_nonsilent(audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
-        if not ranges:
             return np.array([], dtype=np.float32), sr
-        merged = AudioSegment.empty()
-        for start, end in ranges:
             start = max(0, start - padding)
             end = min(len(audio_segment), end + padding)
-            merged += audio_segment[start:end]
-        data = np.array(merged.get_array_of_samples()).astype(np.float32)
-        data /= np.iinfo(np.int16).max
-        return data, sr
     except Exception as e:
-        logger.warning(f"Silence removal disabled (error): {e}")
         return audio, sr
 def is_silent(audio, threshold=1e-4):
     if audio is None or len(audio) == 0:
         return True
-    rms = float(np.sqrt(np.mean(audio**2)))
     return rms < threshold
 def merge_transcriptions(transcriptions):
     if not transcriptions:
-        return ""
-    out = transcriptions[0]
     for i in range(1, len(transcriptions)):
-        prev, cur = out, transcriptions[i]
         N = 50
-        match = SequenceMatcher(None, prev[-N:], cur[:N]).find_longest_match(0, min(N, len(prev)), 0, min(N, len(cur)))
         if match.size > 10:
-            out += cur[match.b + match.size :]
         else:
-            out += " " + cur
-    return out
-# ===== Image helper (optional) =====
-def image_to_base64(path):
-    try:
-        with open(path, "rb") as f:
-            return base64.b64encode(f.read()).decode("utf-8")
-    except Exception:
-        return None
-image_base64 = image_to_base64("assets/hero.jpg")
-img_html = f'<img src="data:image/jpeg;base64,{image_base64}" width="400" />' if image_base64 else ""
-# ===== Inference =====
 def transcribe_audio(mic=None, upload_audio=None, file=None):
-    t0 = time.time()
-    audio_path = resolve_path(mic) or resolve_path(upload_audio) or resolve_path(file)
-    logger.info(f"audio_path: {audio_path!r}")
     if not audio_path:
-        return "لطفاً یک فایل صوتی یا صدای ضبط‌شده ارسال کنید.", None, None, None
-    audio, sr = load_audio_preserving_quality(audio_path, 16000)
     if audio is None:
         return "خطا در بارگذاری و پردازش صوت.", None, None, None
     audio, sr = remove_intermediate_silence(audio, sr)
     if is_silent(audio):
-        return "صوت ورودی حاوی صدای قابل پردازش نیست.", None, None, None
-    # Chunking ~22s with 3s stride (سبک‌تر از 29s)
-    max_chunk_length, stride_length = 22, 3
-    max_chunk_samples, stride_samples = int(max_chunk_length*sr), int(stride_length*sr)
     chunks, start = [], 0
-    L = len(audio)
-    while start < L:
-        end = min(start + max_chunk_samples, L)
-        chunks.append(audio[start:end])
-        if end >= L: break
         start += max_chunk_samples - stride_samples
-    if not chunks:
-        return "صوت ورودی خالی است.", None, None, None
-    # سبک‌تر برای CPU
-    gen_kwargs = dict(max_new_tokens=225, do_sample=False, num_beams=1, length_penalty=1.0)
-    trans = []
-    for i, chunk in enumerate(chunks, 1):
         try:
             inputs = processor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
-            feats = inputs.input_features.to(device)
             with torch.no_grad():
-                if prompt_ids is not None:
-                    dec_ids = torch.tensor([x[1] for x in prompt_ids]).unsqueeze(0).to(device)
-                    ids = model.generate(feats, decoder_input_ids=dec_ids, **gen_kwargs)
-                else:
-                    ids = model.generate(feats, **gen_kwargs)
-            text = processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
-            trans.append(text)
         except Exception as e:
-            logger.exception(f"Model error on chunk {i}: {e}")
             return "خطا در تبدیل گفتار به متن رخ داد.", None, None, None
-    final_text = merge_transcriptions(trans).strip()
-    if not final_text:
-        return "هیچ متنی استخراج نشد.", None, None, None
-    # Save .txt
-    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-    filename = f"transcription_{ts}.txt"
     with open(filename, "w", encoding="utf-8") as f:
-        f.write(final_text)
-    escaped = escape(final_text)
-    html_buttons = f"""
     <div class="copy-download-buttons">
-        <button id="copy-button" data-transcription="{escaped}"
-            onclick="var t=this.getAttribute('data-transcription'); if(t){{navigator.clipboard.writeText(t).then(()=>alert('متن کپی شد!'),err=>alert('عدم موفقیت کپی: '+err));}}"
-            style="padding:8px 16px; background:#4CAF50; color:#fff; border:none; cursor:pointer; border-radius:8px;">
             کپی متن
         </button>
         <button id="download-button"
-            onclick="var a=document.querySelector('#download-file a'); if(a) a.click(); else alert('لینک دانلود یافت نشد!');"
-            style="padding:8px 16px; background:#008CBA; color:#fff; border:none; cursor:pointer; border-radius:8px;">
             دانلود متن
         </button>
     </div>
     """
-    # همیشه ورودی را برای پخش برگردان
-    audio_output = audio_path
-    logger.info(f"Done in {time.time()-t0:.2f}s, chunks={len(chunks)}")
-    return final_text, filename, html_buttons, audio_output
-# ===== UI / CSS =====
 custom_css = """
 body { background-color: rgba(0,0,128,0.7); color:#fff; }
 h1 { color:#fff; }
@@ -252,37 +231,41 @@ textarea { border-radius:8px; padding:10px; background-color: rgba(52,58,64,0.9)
 """
 title = "تبدیل گفتار به متن (Whisper فارسی)"
 description = f"""
 <div style="text-align:center; direction:rtl;">
-  <p>با استفاده از مدل خصوصی، صوت شما به متن تبدیل می‌شود.</p>
   <div style="display:flex; justify-content:center;">{img_html}</div>
 </div>
 """
 article = """
 <div style="direction:rtl;">
-  نکته: برای سرعت بیشتر، Space را روی GPU اجرا کنید. برای فعال‌سازی حذف سکوت، متغیر ENABLE_SILENCE_REMOVAL=1 تنظیم شود.
 </div>
 """
-mic_in = gr.Audio(sources=["microphone"], type="filepath", label="صدای خود را ضبط کنید",
-                  streaming=False, interactive=True)
-upl_in = gr.Audio(sources=["upload"], type="filepath", label="یک فایل صوتی بارگذاری کنید",
-                  streaming=False, interactive=True)
-big_in = gr.File(label="فایل‌های صوتی بزرگ (اختیاری)")
 interface = gr.Interface(
     fn=transcribe_audio,
-    inputs=[mic_in, upl_in, big_in],
     outputs=[
-        gr.Textbox(label="متن تبدیل‌شده", interactive=False, lines=6, elem_id="output-text",
-                   placeholder="نتیجه اینجا نمایش داده می‌شود."),
         gr.File(label="دانلود متن", elem_id="download-file"),
         gr.HTML(value="", elem_id="copy-download-buttons"),
-        gr.Audio(label="پخش فایل ورودی"),
     ],
-    title=title, description=description, article=article,
-    css=custom_css, allow_flagging="never", live=False
 )
 if __name__ == "__main__":
     interface.launch(show_error=True)

 import gradio as gr
 import torch
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import librosa
+import numpy as np
+import logging
+import base64
+import os
+import time
+import datetime
+from html import escape
+from difflib import SequenceMatcher
+from pydub import AudioSegment
+from pydub.silence import detect_nonsilent
 # ===== Logging =====
 logging.basicConfig(level=logging.INFO)
+# ===== Device =====
+device = "cuda" if torch.cuda.is_available() else "cpu"
+logging.info(f"Using device: {device}")
+# ===== Model (Private) =====
+# 1) در Settings → Secrets یک secret با نام HF_TOKEN بسازید
 HF_TOKEN = os.getenv("HF_TOKEN", None)
 if HF_TOKEN is None:
+    logging.warning("HF_TOKEN is not set. Make sure to add it in Space Settings → Secrets.")
+# 2) آیدی مدل Private خودتان را اینجا قرار دهید
+# مثال: "MohammadReza-Halakoo/1-persian-whisper-large-v"
+MODEL_ID = os.getenv("MODEL_ID", "MohammadReza-Halakoo/1-persian-whisper-large-v")
+processor = AutoProcessor.from_pretrained(MODEL_ID, use_auth_token=HF_TOKEN)
+model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, use_auth_token=HF_TOKEN)
+model = model.to(device)
+# Attention-mask fix (safe): make sure the model config carries a pad token
+# id, and that it is not the same id as the EOS token.
+if model.config.pad_token_id is None:
     # No pad id configured on the model: take the tokenizer's pad id.
     model.config.pad_token_id = processor.tokenizer.pad_token_id
 if model.config.pad_token_id == model.config.eos_token_id:
     if processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id:
         # The tokenizer already distinguishes pad from EOS; use its pad id.
         model.config.pad_token_id = processor.tokenizer.pad_token_id
     else:
+        # The tokenizer's pad and EOS ids coincide as well: register a new
+        # "[PAD]" token, resize the embeddings to the new tokenizer length,
+        # and point the config at the tokenizer's (now updated) pad id.
+        processor.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
         model.resize_token_embeddings(len(processor.tokenizer))
         model.config.pad_token_id = processor.tokenizer.pad_token_id
+# ===== Audio Utils =====
 def load_audio_preserving_quality(audio_path, target_sr=16000):
     try:
         audio, sr = librosa.load(audio_path, sr=None, mono=False)
         audio = np.nan_to_num(audio)
         return audio, sr
     except Exception as e:
+        logging.error(f"Audio load error: {str(e)}")
         return None, None
 def remove_intermediate_silence(audio, sr, silence_thresh=-38, min_silence_len=700, padding=200):
     """Drop silent stretches from a waveform, keeping some padding around speech.

     The samples are converted to 16-bit PCM, wrapped in a single-channel pydub
     ``AudioSegment``, and the non-silent ranges reported by
     ``detect_nonsilent`` are concatenated back together.

     Args:
         audio: Float waveform; assumed mono and nominally in [-1, 1]
             (the segment is built with ``channels=1``) — TODO confirm caller.
         sr: Sample rate of ``audio``.
         silence_thresh: Threshold forwarded to ``detect_nonsilent``.
         min_silence_len: Minimum silence length forwarded to ``detect_nonsilent``.
         padding: Amount kept on each side of every non-silent range, in the
             same units as the ranges returned by ``detect_nonsilent``.

     Returns:
         ``(processed_audio, sr)``. ``processed_audio`` is an empty float32
         array when nothing non-silent is found. On any failure the error is
         logged and the original ``(audio, sr)`` is returned unchanged.
     """
     try:
         int16_max = np.iinfo(np.int16).max
         # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
         # around when cast to int16 and turn into a loud click.
         pcm = (np.clip(audio, -1.0, 1.0) * int16_max).astype(np.int16)
         audio_segment = AudioSegment(
             pcm.tobytes(),
             frame_rate=sr,
             sample_width=2,
             channels=1
         )
         nonsilent_ranges = detect_nonsilent(
             audio_segment,
             min_silence_len=min_silence_len,
             silence_thresh=silence_thresh
         )
         if not nonsilent_ranges:
             return np.array([], dtype=np.float32), sr
         non_silent_audio = AudioSegment.empty()
         previous_end = 0
         for start, end in nonsilent_ranges:
             # Never reach back before the previously kept range, otherwise
             # overlapping padding would duplicate audio.
             start = max(previous_end, start - padding)
             end = min(len(audio_segment), end + padding)
             non_silent_audio += audio_segment[start:end]
             previous_end = end
         processed_audio = np.array(non_silent_audio.get_array_of_samples()).astype(np.float32)
         processed_audio /= int16_max
         return processed_audio, sr
     except Exception as e:
         # Best effort: silence removal is optional, so fall back to the input.
         logging.error(f"Silence removal error: {str(e)}")
         return audio, sr
 def is_silent(audio, threshold=1e-4):
     """Tell whether `audio` is missing, empty, or has an RMS level below `threshold`."""
     has_samples = audio is not None and len(audio) > 0
     if not has_samples:
         return True
     mean_square = np.mean(audio ** 2)
     return np.sqrt(mean_square) < threshold
 def merge_transcriptions(transcriptions, window=50, min_overlap=10):
     """Join per-chunk transcripts, dropping text repeated across chunk borders.

     For each new chunk, the last ``window`` characters of the text merged so
     far are compared with the first ``window`` characters of the chunk. If
     their longest common substring is longer than ``min_overlap`` characters,
     the chunk is appended starting right after that match; otherwise the
     chunk is appended whole, separated by a single space.

     Args:
         transcriptions: Sequence of transcript strings in chunk order.
         window: Number of characters inspected on each side of a border.
         min_overlap: A match must be strictly longer than this to count.

     Returns:
         The merged transcript, or ``''`` when ``transcriptions`` is empty.
     """
     if not transcriptions:
         return ''
     merged = transcriptions[0]
     for current in transcriptions[1:]:
         # `text[-0:]` would be the whole string, so guard a zero window.
         tail = merged[-window:] if window > 0 else ''
         head = current[:window]
         # autojunk=False keeps matching exact even for windows >= 200 chars.
         matcher = SequenceMatcher(None, tail, head, autojunk=False)
         match = matcher.find_longest_match(0, len(tail), 0, len(head))
         if match.size > min_overlap:
             merged += current[match.b + match.size:]
         else:
             merged += ' ' + current
     return merged
+# ===== Core Inference =====
 def _resolve_audio_path(value):
     """Return a filesystem path for one Gradio input value, or None if absent.

     Handles a plain path string, an object exposing ``.name`` (temp-file
     style wrapper), and a dict carrying a ``"name"`` key.
     """
     if not value:
         return None
     if isinstance(value, str):
         return value
     if isinstance(value, dict):
         return value.get("name")
     return getattr(value, "name", None)

 def transcribe_audio(mic=None, upload_audio=None, file=None):
     """Transcribe the first supplied audio input and build the UI outputs.

     Inputs are tried in the order ``mic``, ``upload_audio``, ``file``. The
     audio is loaded, stripped of silence, split into overlapping chunks,
     transcribed chunk by chunk, and the chunk texts are merged.

     Args:
         mic: Recorded audio (path string or file-like value), or None.
         upload_audio: Uploaded audio (path string or file-like value), or None.
         file: Large-file upload (path string, ``.name`` object or dict), or None.

     Returns:
         A 4-tuple ``(text, txt_path, buttons_html, audio_path)``. On any
         failure ``text`` is a user-facing error message and the other three
         items are None. ``audio_path`` is only set when ``file`` was given.
     """
     import tempfile

     audio_path = (
         _resolve_audio_path(mic)
         or _resolve_audio_path(upload_audio)
         or _resolve_audio_path(file)
     )
     if not audio_path:
         return 'لطفاً یک فایل صوتی یا صدای ضبط‌شده ارسال کنید.', None, None, None
     audio, sr = load_audio_preserving_quality(audio_path, target_sr=16000)
     if audio is None:
         return "خطا در بارگذاری و پردازش صوت.", None, None, None
     audio, sr = remove_intermediate_silence(audio, sr)
     if is_silent(audio):
         return 'صوت ورودی حاوی صدای قابل پردازش نیست.', None, None, None
     # Split into 29-second chunks that overlap by 3 seconds.
     max_chunk_length = 29
     stride_length = 3
     max_chunk_samples = int(max_chunk_length * sr)
     stride_samples = int(stride_length * sr)
     total_samples = len(audio)
     chunks, start = [], 0
     while start < total_samples:
         end = min(start + max_chunk_samples, total_samples)
         chunks.append(audio[start:end])
         if end == total_samples:
             break
         start += max_chunk_samples - stride_samples
     if not chunks:
         return 'صوت ورودی خالی است.', None, None, None
     transcriptions = []
     for i, chunk in enumerate(chunks, start=1):
         try:
             inputs = processor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
             input_features = inputs.input_features.to(device)
             attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
             with torch.no_grad():
                 # NOTE(review): `temperature` is passed together with
                 # num_beams=5 and no do_sample flag; whether it has any
                 # effect depends on the installed transformers version — confirm.
                 generated_ids = model.generate(
                     input_features,
                     attention_mask=attention_mask,
                     num_beams=5,
                     length_penalty=1.0,
                     repetition_penalty=1.1,
                     no_repeat_ngram_size=4,
                     temperature=0.9,
                 )
             transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
             transcriptions.append(transcription)
         except Exception as e:
             # One failed chunk aborts the whole request with a generic message.
             logging.error(f"Model error on chunk {i}: {str(e)}")
             return "خطا در تبدیل گفتار به متن رخ داد.", None, None, None
     final_transcription = merge_transcriptions(transcriptions)
     if not final_transcription.strip():
         return 'هیچ متنی استخراج نشد.', None, None, None
     # Write the transcript to a uniquely named temp file: a bare
     # second-resolution timestamp in the working directory would let two
     # requests arriving in the same second overwrite each other's file.
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
     with tempfile.NamedTemporaryFile(
         "w",
         encoding="utf-8",
         prefix=f"transcription_{timestamp}_",
         suffix=".txt",
         delete=False,
     ) as f:
         f.write(final_transcription)
         filename = f.name
     # escape() also escapes quotes, so the text is safe inside the attribute.
     escaped_transcription = escape(final_transcription)
     copy_download_buttons_html = f"""
     <div class="copy-download-buttons">
         <button id="copy-button" data-transcription="{escaped_transcription}"
             onclick="
                 var t=this.getAttribute('data-transcription');
                 if(t){{navigator.clipboard.writeText(t).then(()=>alert('متن کپی شد!'),err=>alert('عدم موفقیت کپی: '+err));}}
                 else{{alert('متنی یافت نشد!');}}
             "
             style="padding:8px 16px; background:#4CAF50; color:#fff; border:none; cursor:pointer;">
             کپی متن
         </button>
         <button id="download-button"
             onclick="
                 var a=document.querySelector('#download-file a');
                 if(a) a.click(); else alert('لینک دانلود یافت نشد!');
             "
             style="padding:8px 16px; background:#008CBA; color:#fff; border:none; cursor:pointer;">
             دانلود متن
         </button>
     </div>
     """
     # The input is echoed back for playback only for the large-file upload.
     audio_output = audio_path if file else None
     return final_transcription, filename, copy_download_buttons_html, audio_output
+# ===== Image helper =====
+def image_to_base64(image_path):
+    try:
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+    except Exception:
+        return None
+# Optionally place an image at assets/hero.jpg; when it cannot be read,
+# image_to_base64 returns None and image_base64 is None.
+image_base64 = image_to_base64("assets/hero.jpg")
+# ===== UI =====
 custom_css = """
 body { background-color: rgba(0,0,128,0.7); color:#fff; }
 h1 { color:#fff; }
 """
 # Heading passed to gr.Interface as `title`.
 title = "تبدیل گفتار به متن (Whisper فارسی)"
+# Inline <img> tag embedding the hero image as a base64 data URI; an empty
+# string when image_base64 is falsy (no image could be loaded).
+img_html = f'<img src="data:image/jpeg;base64,{image_base64}" width="400px">' if image_base64 else ""
 # Centred, right-to-left HTML blurb passed to gr.Interface as `description`;
 # it embeds img_html.
 description = f"""
 <div style="text-align:center; direction:rtl;">
+  <p>با استفاده از مدل خصوصی، صوت شما به متن تبدیل می‌شود. دسترسی مستقیم به فایل‌های مدل امکان‌پذیر نیست.</p>
   <div style="display:flex; justify-content:center;">{img_html}</div>
 </div>
 """
 # Right-to-left HTML footer passed to gr.Interface as `article`.
 article = """
 <div style="direction:rtl;">
+  این یک دمو برای ماژول گفتار به متن فارسی است.
 </div>
 """
 # Gradio UI: three alternative audio inputs feeding transcribe_audio, whose
 # four return values map onto the four outputs in order.
 # The components use the Gradio 4 API required by the `gradio>=4.36.0` pin:
 # `sources=[...]` instead of `source=`, and `type="filepath"` for gr.File.
 # NOTE(review): the former `clear_on_submit=` and `max_size=300` arguments are
 # not accepted by these Gradio 4 components and were dropped — confirm whether
 # an upload limit has to be enforced elsewhere.
 interface = gr.Interface(
     fn=transcribe_audio,
     inputs=[
         gr.Audio(sources=["microphone"], type="filepath", label="صدای خود را ضبط کنید"),
         gr.Audio(sources=["upload"], type="filepath", label="یک فایل صوتی بارگذاری کنید"),
         gr.File(label="فایل‌های صوتی بزرگ (اختیاری)", type="filepath"),
     ],
     outputs=[
         gr.Textbox(label="متن تبدیل‌شده", interactive=False, lines=4, elem_id="output-text", placeholder="نتیجه اینجا نمایش داده می‌شود."),
         # elem_id is looked up by the download button's onclick handler.
         gr.File(label="دانلود متن", elem_id="download-file"),
         gr.HTML(value="", elem_id="copy-download-buttons"),
         gr.Audio(label="پخش فایل ورودی", type="filepath")
     ],
     title=title,
     description=description,
     article=article,
     css=custom_css,
     allow_flagging="never",
     live=False
 )
 if __name__ == "__main__":
+    # On Spaces a plain launch is all that is needed; no port/SSL/share settings are required.
     interface.launch(show_error=True)

assets/.gitattributes DELETED Viewed

	@@ -1,2 +0,0 @@
1	- *.png filter=lfs diff=lfs merge=lfs -text
2	- *.jpg filter=lfs diff=lfs merge=lfs -text

assets/.gitkeep DELETED Viewed

File without changes

assets/hero.jpg DELETED Viewed

Git LFS Details

SHA256: 9c5a95d02bb857e862d0fedb7cd497abfbaf52a06031e638ac01d343651d775e
Pointer size: 130 Bytes
Size of remote file: 68.7 kB

packages.txt DELETED Viewed

	@@ -1,2 +0,0 @@
1	- ffmpeg
2	- libsndfile1

requirements.txt CHANGED Viewed

@@ -1,6 +1,7 @@
-gradio>=4.44.0
 transformers>=4.42.0
-torch  # (نسخه مطابق هاردور Space)
 librosa
 numpy
 pydub

 transformers>=4.42.0
+torch
+torchaudio
+gradio>=4.36.0
 librosa
 numpy
 pydub