import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import numpy as np
import logging
import base64
import os
import time
import datetime
from html import escape
from difflib import SequenceMatcher
from pydub import AudioSegment
from pydub.silence import detect_nonsilent

logging.basicConfig(level=logging.INFO)

device = "cuda" if torch.cuda.is_available() else "cpu"
logging.info(f"Using device: {device}")

HF_TOKEN = os.getenv("HF_TOKEN", None)
if HF_TOKEN is None:
    logging.warning("HF_TOKEN is not set. Make sure to add it in Space Settings → Secrets.")

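# The checkpoint is private (hence the token above); MODEL_ID can be
# overridden from the environment, e.g. to test another fine-tune.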
MODEL_ID = os.getenv("MODEL_ID", "MohammadReza-Halakoo/1-persian-whisper-large-v")

processor = AutoProcessor.from_pretrained(MODEL_ID, use_auth_token=HF_TOKEN)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, use_auth_token=HF_TOKEN)
model = model.to(device)

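# Whisper reuses eos as pad by default, which makes attention masks ambiguous
# and triggers generate() warnings. Give the model a pad id distinct from eos,
# adding a new [PAD] token only as a last resort.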
if model.config.pad_token_id is None:
    model.config.pad_token_id = processor.tokenizer.pad_token_id

if model.config.pad_token_id == model.config.eos_token_id:
    if processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id:
        model.config.pad_token_id = processor.tokenizer.pad_token_id
    else:
        processor.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(processor.tokenizer))
        model.config.pad_token_id = processor.tokenizer.pad_token_id

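# Decode at the file's native sample rate, downmix to mono, then resample once
# to 16 kHz (the rate Whisper's feature extractor expects); resampling only
# when needed avoids a lossy double resample.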
def load_audio_preserving_quality(audio_path, target_sr=16000):
    try:
        audio, sr = librosa.load(audio_path, sr=None, mono=False)
        if audio.ndim > 1:
            audio = np.mean(audio, axis=0)
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
            sr = target_sr
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)
        audio = np.nan_to_num(audio)
        return audio, sr
    except Exception as e:
        logging.error(f"Audio load error: {str(e)}")
        return None, None

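# Cut long internal silences with pydub before chunking so the model spends no
# context on dead air. pydub durations (min_silence_len, padding) are in
# milliseconds; silence_thresh is in dBFS.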
def remove_intermediate_silence(audio, sr, silence_thresh=-38, min_silence_len=700, padding=200):
    try:
        # Clip to [-1, 1] before the int16 conversion to avoid integer wrap-around.
        audio_segment = AudioSegment(
            (np.clip(audio, -1.0, 1.0) * np.iinfo(np.int16).max).astype(np.int16).tobytes(),
            frame_rate=sr,
            sample_width=2,
            channels=1
        )
        nonsilent_ranges = detect_nonsilent(
            audio_segment,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh
        )
        if not nonsilent_ranges:
            return np.array([], dtype=np.float32), sr

        non_silent_audio = AudioSegment.empty()
        for start, end in nonsilent_ranges:
            start = max(0, start - padding)
            end = min(len(audio_segment), end + padding)
            non_silent_audio += audio_segment[start:end]

        processed_audio = np.array(non_silent_audio.get_array_of_samples()).astype(np.float32)
        processed_audio /= np.iinfo(np.int16).max
        return processed_audio, sr
    except Exception as e:
        logging.error(f"Silence removal error: {str(e)}")
        return audio, sr

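# RMS gate: treat the whole (post-trim) signal as silent when its energy is negligible.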
def is_silent(audio, threshold=1e-4):
    if audio is None or len(audio) == 0:
        return True
    rms = np.sqrt(np.mean(audio**2))
    return rms < threshold

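# Consecutive chunks overlap by a few seconds, so their transcriptions usually
# share text at the seam. Splice at the longest common substring of the
# previous tail and the next head, e.g. "...the quick brown" + "quick brown
# fox..." merges to "...the quick brown fox...". Matches of 10 characters or
# fewer are treated as no overlap and joined with a space instead.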
def merge_transcriptions(transcriptions):
    if not transcriptions:
        return ''
    final_transcription = transcriptions[0]
    for i in range(1, len(transcriptions)):
        current_transcription = transcriptions[i]
        N = 50
        prev_part = final_transcription[-N:]
        curr_part = current_transcription[:N]
        match = SequenceMatcher(None, prev_part, curr_part).find_longest_match(0, len(prev_part), 0, len(curr_part))
        if match.size > 10:
            non_overlapping_part = current_transcription[match.b + match.size:]
            final_transcription += non_overlapping_part
        else:
            final_transcription += ' ' + current_transcription
    return final_transcription

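# End-to-end pipeline: resolve whichever input was provided, load and clean
# the audio, split it into overlapping chunks, transcribe each chunk, merge
# the seams, and return the text plus a downloadable .txt and button HTML.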
def transcribe_audio(mic=None, upload_audio=None, file=None):
    start_time = time.time()

    audio_path = mic or upload_audio or (file.name if file else None)
    if not audio_path:
        # "Please submit an audio file or a recording."
        return 'لطفاً یک فایل صوتی یا صدای ضبط‌شده ارسال کنید.', None, None, None

    audio, sr = load_audio_preserving_quality(audio_path, target_sr=16000)
    if audio is None:
        # "Error loading and processing the audio."
        return "خطا در بارگذاری و پردازش صوت.", None, None, None

    audio, sr = remove_intermediate_silence(audio, sr)
    if is_silent(audio):
        # "The input audio contains no processable sound."
        return 'صوت ورودی حاوی صدای قابل پردازش نیست.', None, None, None

    # Whisper's encoder sees at most 30 s of audio, so split into 29 s chunks
    # with a 3 s overlap; the overlap lets merge_transcriptions stitch seams.
    max_chunk_length = 29
    stride_length = 3
    max_chunk_samples = max_chunk_length * sr
    stride_samples = stride_length * sr

    chunks, start = [], 0
    while start < len(audio):
        end = min(start + max_chunk_samples, len(audio))
        chunks.append(audio[int(start):int(end)])
        if end == len(audio):
            break
        start += max_chunk_samples - stride_samples

    if not chunks:
        # "The input audio is empty."
        return 'صوت ورودی خالی است.', None, None, None

    transcriptions = []
    for i, chunk in enumerate(chunks):
        try:
            # The feature extractor's default "max_length" padding pads each
            # chunk to the fixed 3000-frame mel input Whisper expects.
            inputs = processor(chunk, sampling_rate=sr, return_tensors="pt")
            input_features = inputs.input_features.to(device)
            attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None

            # Deterministic beam search; sampling knobs such as temperature
            # would be ignored here without do_sample=True.
            with torch.no_grad():
                generated_ids = model.generate(
                    input_features,
                    attention_mask=attention_mask,
                    num_beams=5,
                    length_penalty=1.0,
                    repetition_penalty=1.1,
                    no_repeat_ngram_size=4,
                )
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            transcriptions.append(transcription)
        except Exception as e:
            logging.error(f"Model error on chunk {i+1}: {str(e)}")
            # "An error occurred during speech-to-text conversion."
            return "خطا در تبدیل گفتار به متن رخ داد.", None, None, None

    final_transcription = merge_transcriptions(transcriptions)
    if not final_transcription.strip():
        # "No text could be extracted."
        return 'هیچ متنی استخراج نشد.', None, None, None

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"transcription_{timestamp}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(final_transcription)

    # Inline copy/download buttons ("کپی متن" = copy text, "دانلود متن" = download text).
    escaped_transcription = escape(final_transcription)
    copy_download_buttons_html = f"""
    <div class="copy-download-buttons">
        <button id="copy-button" data-transcription="{escaped_transcription}"
                onclick="
                    var t=this.getAttribute('data-transcription');
                    if(t){{navigator.clipboard.writeText(t).then(()=>alert('متن کپی شد!'),err=>alert('عدم موفقیت کپی: '+err));}}
                    else{{alert('متنی یافت نشد!');}}
                "
                style="padding:8px 16px; background:#4CAF50; color:#fff; border:none; cursor:pointer;">
            کپی متن
        </button>
        <button id="download-button"
                onclick="
                    var a=document.querySelector('#download-file a');
                    if(a) a.click(); else alert('لینک دانلود یافت نشد!');
                "
                style="padding:8px 16px; background:#008CBA; color:#fff; border:none; cursor:pointer;">
            دانلود متن
        </button>
    </div>
    """

    # Echo the input back to the audio player only for file uploads.
    audio_output = audio_path if file else None
    return final_transcription, filename, copy_download_buttons_html, audio_output

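# Inline the hero image as base64 so the description HTML needs no extra
# static-file route.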
def image_to_base64(image_path):
    try:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    except Exception:
        return None

image_base64 = image_to_base64("assets/hero.jpg")

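# Dark, RTL-friendly styling; the last rule hides Gradio's default footer.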
custom_css = """
body { background-color: rgba(0,0,128,0.7); color:#fff; }
h1 { color:#fff; }
p { color:#ccc; }
button { border:none; padding:10px 20px; border-radius:8px; color:#fff; }
.copy-download-buttons { display:flex; gap:20px; justify-content:center; margin-top:20px; }
textarea { border-radius:8px; padding:10px; background-color: rgba(52,58,64,0.9); color:white; border:none; direction:rtl; text-align:right; }
.gradio-container { border-radius:10px; padding:20px; margin:20px; background-color: rgba(28,30,34,0.9); }
#gradio-app .powered-by, footer { display:none !important; }
"""

# UI strings are Persian (RTL). Title: "Speech to Text (Persian Whisper)".
title = "تبدیل گفتار به متن (Whisper فارسی)"
img_html = f'<img src="data:image/jpeg;base64,{image_base64}" width="400px">' if image_base64 else ""

# Description: "Your audio is transcribed using a private model. Direct access
# to the model files is not possible."
description = f"""
<div style="text-align:center; direction:rtl;">
    <p>با استفاده از مدل خصوصی، صوت شما به متن تبدیل می‌شود. دسترسی مستقیم به فایل‌های مدل امکان‌پذیر نیست.</p>
    <div style="display:flex; justify-content:center;">{img_html}</div>
</div>
"""

# Article: "This is a demo of the Persian speech-to-text module."
article = """
<div style="direction:rtl;">
این یک دمو برای ماژول گفتار به متن فارسی است.
</div>
"""

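# NOTE: source=..., allow_flagging=... and gr.File(type="file") follow the
# Gradio 3.x API; Gradio 4 renamed these (sources=[...], flagging_mode=...,
# type="filepath").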
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="صدای خود را ضبط کنید"),  # "Record your voice"
        gr.Audio(source="upload", type="filepath", label="یک فایل صوتی بارگذاری کنید"),  # "Upload an audio file"
        gr.File(label="فایل‌های صوتی بزرگ (اختیاری)", type="file")  # "Large audio files (optional)"
    ],
    outputs=[
        gr.Textbox(label="متن تبدیل‌شده", interactive=False, lines=4, elem_id="output-text",
                   placeholder="نتیجه اینجا نمایش داده می‌شود."),  # "Transcribed text" / "The result appears here."
        gr.File(label="دانلود متن", elem_id="download-file"),  # "Download text"
        gr.HTML(value="", elem_id="copy-download-buttons"),
        gr.Audio(label="پخش فایل ورودی", type="filepath")  # "Play the input file"
    ],
    title=title,
    description=description,
    article=article,
    css=custom_css,
    allow_flagging="never",
    live=False
)

if __name__ == "__main__":
    interface.launch(show_error=True)