MohammadReza-Halakoo committed on
Commit
4885664
·
verified ·
1 Parent(s): 830e248

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +271 -0
app.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
4
+ import librosa
5
+ import numpy as np
6
+ import logging
7
+ import base64
8
+ import os
9
+ import time
10
+ import datetime
11
+ from html import escape
12
+ from difflib import SequenceMatcher
13
+ from pydub import AudioSegment
14
+ from pydub.silence import detect_nonsilent
15
+
16
# ===== Logging =====
logging.basicConfig(level=logging.INFO)

# ===== Device =====
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
logging.info(f"Using device: {device}")

# ===== Model (Private) =====
# 1) Create a secret named HF_TOKEN under Settings → Secrets.
HF_TOKEN = os.getenv("HF_TOKEN", None)
if HF_TOKEN is None:
    logging.warning("HF_TOKEN is not set. Make sure to add it in Space Settings → Secrets.")

# 2) Put the ID of your own private model here (or set the MODEL_ID env var).
#    Example: "MohammadReza-Halakoo/1-persian-whisper-large-v"
MODEL_ID = os.getenv("MODEL_ID", "MohammadReza-Halakoo/1-persian-whisper-large-v")

# `token` is the current keyword for the Hub access token; the older
# `use_auth_token` spelling is deprecated in Transformers.
processor = AutoProcessor.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = model.to(device)

# Attention-mask fix (safe): make sure the model has a pad token id, and that
# it differs from the EOS id where possible.
if model.config.pad_token_id is None:
    model.config.pad_token_id = processor.tokenizer.pad_token_id

if model.config.pad_token_id == model.config.eos_token_id:
    if processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id:
        # The tokenizer already has a distinct pad token: reuse it.
        model.config.pad_token_id = processor.tokenizer.pad_token_id
    else:
        # Tokenizer pad == EOS as well: register a dedicated pad token and
        # grow the embedding matrix so the new id has a row.
        processor.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(processor.tokenizer))
        model.config.pad_token_id = processor.tokenizer.pad_token_id
48
+
49
+ # ===== Audio Utils =====
50
def load_audio_preserving_quality(audio_path, target_sr=16000):
    """Load an audio file as a mono float32 waveform at ``target_sr``.

    The file is read at its native sampling rate, multi-channel audio is
    averaged down to one channel, non-finite samples are replaced, and the
    signal is resampled only when the native rate differs from ``target_sr``.

    Args:
        audio_path: Path of the audio file to read.
        target_sr: Desired sampling rate in Hz.

    Returns:
        ``(audio, sr)`` on success, or ``(None, None)`` if anything fails
        (the error is logged with its traceback).
    """
    try:
        audio, sr = librosa.load(audio_path, sr=None, mono=False)
        if audio.ndim > 1:
            # Down-mix by averaging across the channel axis.
            audio = np.mean(audio, axis=0)
        # Sanitise NaN/Inf *before* resampling so a single bad sample is not
        # spread over its neighbours (or rejected) by the resampler.
        audio = np.nan_to_num(audio)
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
            sr = target_sr
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)
        return audio, sr
    except Exception as e:
        # Best-effort loader: callers treat (None, None) as "could not load".
        logging.exception(f"Audio load error: {str(e)}")
        return None, None
65
+
66
def remove_intermediate_silence(audio, sr, silence_thresh=-38, min_silence_len=700, padding=200):
    """Drop silent stretches from a mono float waveform.

    The waveform is converted to 16-bit PCM, non-silent ranges are located
    with pydub's ``detect_nonsilent``, each range is widened by ``padding``
    on both sides, and the ranges are concatenated.

    Args:
        audio: 1-D float waveform, nominally in ``[-1, 1]``.
        sr: Sampling rate of ``audio`` in Hz.
        silence_thresh: Threshold passed to ``detect_nonsilent``.
        min_silence_len: Minimum silence length passed to ``detect_nonsilent``.
        padding: Amount kept before and after every non-silent range, in the
            same units as the ranges returned by ``detect_nonsilent``.

    Returns:
        ``(processed_audio, sr)``. ``processed_audio`` is an empty float32
        array when no non-silent range is found. On any error the input
        ``audio`` is returned unchanged.
    """
    try:
        int16_max = np.iinfo(np.int16).max
        # Clip first: samples outside [-1, 1] would otherwise wrap around in
        # the int16 cast and turn into loud artefacts.
        pcm = (np.clip(audio, -1.0, 1.0) * int16_max).astype(np.int16)
        audio_segment = AudioSegment(
            pcm.tobytes(),
            frame_rate=sr,
            sample_width=2,
            channels=1,
        )
        nonsilent_ranges = detect_nonsilent(
            audio_segment,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh,
        )
        if not nonsilent_ranges:
            return np.array([], dtype=np.float32), sr

        total_length = len(audio_segment)
        non_silent_audio = AudioSegment.empty()
        previous_end = 0
        for start, end in nonsilent_ranges:
            # Never reach back before the previous padded range, so audio is
            # not duplicated when padded ranges would overlap.
            start = max(previous_end, start - padding)
            end = min(total_length, end + padding)
            if end > start:
                non_silent_audio += audio_segment[start:end]
            previous_end = max(previous_end, end)

        processed_audio = np.array(non_silent_audio.get_array_of_samples()).astype(np.float32)
        processed_audio /= int16_max
        return processed_audio, sr
    except Exception as e:
        # Silence removal is an optimisation; fall back to the untouched input.
        logging.exception(f"Silence removal error: {str(e)}")
        return audio, sr
94
+
95
def is_silent(audio, threshold=1e-4):
    """Return True when *audio* is missing, empty, or has RMS below *threshold*.

    Args:
        audio: Waveform as a NumPy array (or array-like), or ``None``.
        threshold: RMS level below which the signal counts as silent.

    Returns:
        ``True`` if there is nothing audible to process, else ``False``.
    """
    if audio is None:
        return True
    # Work in float64 so integer PCM input cannot overflow when squared.
    samples = np.asarray(audio, dtype=np.float64)
    if samples.size == 0:
        return True
    rms = np.sqrt(np.mean(np.square(samples)))
    return bool(rms < threshold)
100
+
101
def merge_transcriptions(transcriptions, window=50, min_match_size=10):
    """Join per-chunk transcripts, collapsing text repeated at chunk borders.

    For each new chunk, the last ``window`` characters of the text merged so
    far are compared with the first ``window`` characters of the chunk. When
    their longest common substring is longer than ``min_match_size``, it is
    treated as the overlap: the merged text is cut right after the match and
    the chunk is appended from right after the match. Otherwise the chunk is
    appended after a single space.

    Args:
        transcriptions: Chunk transcripts in playback order.
        window: Number of characters inspected on each side of a border.
        min_match_size: A match must be strictly longer than this to count
            as an overlap.

    Returns:
        The merged transcript, or ``''`` for an empty input.
    """
    if not transcriptions:
        return ''

    merged = transcriptions[0]
    for current in transcriptions[1:]:
        if not current:
            # An empty chunk adds nothing; skipping avoids doubled spaces.
            continue
        if not merged:
            merged = current
            continue

        tail = merged[max(0, len(merged) - window):]
        head = current[:window]
        match = SequenceMatcher(None, tail, head).find_longest_match(
            0, len(tail), 0, len(head)
        )
        if match.size > min_match_size:
            # Keep the merged text up to the end of the match; anything after
            # it lies in the overlap and is supplied by `current`.
            keep = len(merged) - len(tail) + match.a + match.size
            merged = merged[:keep] + current[match.b + match.size:]
        else:
            merged += ' ' + current
    return merged
118
+
119
+ # ===== Core Inference =====
120
def _split_into_chunks(audio, chunk_samples, stride_samples):
    """Slice *audio* into windows of ``chunk_samples`` that overlap by ``stride_samples``."""
    chunks = []
    total = len(audio)
    start = 0
    while start < total:
        end = min(start + chunk_samples, total)
        chunks.append(audio[start:end])
        if end == total:
            break
        start += chunk_samples - stride_samples
    return chunks


def _build_copy_download_html(transcription):
    """Return the HTML snippet with the "copy" and "download" buttons for *transcription*."""
    # The text is embedded in an HTML attribute, so it must be escaped.
    escaped_transcription = escape(transcription)
    return f"""
<div class="copy-download-buttons">
<button id="copy-button" data-transcription="{escaped_transcription}"
onclick="
var t=this.getAttribute('data-transcription');
if(t){{navigator.clipboard.writeText(t).then(()=>alert('متن کپی شد!'),err=>alert('عدم موفقیت کپی: '+err));}}
else{{alert('متنی یافت نشد!');}}
"
style="padding:8px 16px; background:#4CAF50; color:#fff; border:none; cursor:pointer;">
کپی متن
</button>
<button id="download-button"
onclick="
var a=document.querySelector('#download-file a');
if(a) a.click(); else alert('لینک دانلود یافت نشد!');
"
style="padding:8px 16px; background:#008CBA; color:#fff; border:none; cursor:pointer;">
دانلود متن
</button>
</div>
"""


def transcribe_audio(mic=None, upload_audio=None, file=None):
    """Transcribe one audio input and prepare the UI outputs.

    The first available source among ``mic``, ``upload_audio`` and ``file``
    is loaded, silence is stripped, the waveform is cut into overlapping
    chunks, each chunk is decoded with the module-level ``model`` and
    ``processor``, and the chunk texts are merged.

    Args:
        mic: File path of a microphone recording, or ``None``.
        upload_audio: File path of an uploaded audio file, or ``None``.
        file: Uploaded file object exposing ``.name``, or ``None``.

    Returns:
        A 4-tuple ``(text, transcript_path, buttons_html, audio_path)``.
        On failure ``text`` is a user-facing error message and the other
        three items are ``None``. ``audio_path`` is only set when ``file``
        was provided.
    """
    import tempfile  # local import: only needed to write the transcript file

    audio_path = mic or upload_audio or (file.name if file else None)
    if not audio_path:
        return 'لطفاً یک فایل صوتی یا صدای ضبط‌شده ارسال کنید.', None, None, None

    audio, sr = load_audio_preserving_quality(audio_path, target_sr=16000)
    if audio is None:
        return "خطا در بارگذاری و پردازش صوت.", None, None, None

    audio, sr = remove_intermediate_silence(audio, sr)
    if is_silent(audio):
        return 'صوت ورودی حاوی صدای قابل پردازش نیست.', None, None, None

    # Split into 29-second chunks with a 3-second overlap.
    max_chunk_length = 29
    stride_length = 3
    chunks = _split_into_chunks(audio, max_chunk_length * sr, stride_length * sr)
    if not chunks:
        return 'صوت ورودی خالی است.', None, None, None

    transcriptions = []
    for i, chunk in enumerate(chunks):
        try:
            inputs = processor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
            input_features = inputs.input_features.to(device)
            attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None

            with torch.no_grad():
                # NOTE(review): `temperature` is passed together with beam
                # search and without `do_sample`; confirm how the installed
                # Transformers version interprets this combination.
                generated_ids = model.generate(
                    input_features,
                    attention_mask=attention_mask,
                    num_beams=5,
                    length_penalty=1.0,
                    repetition_penalty=1.1,
                    no_repeat_ngram_size=4,
                    temperature=0.9,
                )
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            transcriptions.append(transcription)
        except Exception as e:
            # One failed chunk aborts the request; keep the traceback in the log.
            logging.exception(f"Model error on chunk {i+1}: {str(e)}")
            return "خطا در تبدیل گفتار به متن رخ داد.", None, None, None

    final_transcription = merge_transcriptions(transcriptions)
    if not final_transcription.strip():
        return 'هیچ متنی استخراج نشد.', None, None, None

    # Write the transcript to a uniquely named temp file: a timestamp alone
    # has one-second resolution, so concurrent requests could overwrite each
    # other's file.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    fd, filename = tempfile.mkstemp(prefix=f"transcription_{timestamp}_", suffix=".txt")
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(final_transcription)

    copy_download_buttons_html = _build_copy_download_html(final_transcription)

    audio_output = audio_path if file else None
    return final_transcription, filename, copy_download_buttons_html, audio_output
209
+
210
+ # ===== Image helper =====
211
def image_to_base64(image_path):
    """Return the contents of *image_path* as a base64 string.

    Args:
        image_path: Path of the file to encode.

    Returns:
        The base64 text of the file's bytes, or ``None`` when the file
        cannot be opened or read (the reason is logged).
    """
    try:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    except (OSError, TypeError, ValueError) as e:
        # Best-effort helper: callers treat None as "no image available".
        logging.info(f"Could not load image {image_path!r}: {e}")
        return None
217
+
218
# Optionally place an image at assets/hero.jpg; it is shown in the page header.
image_base64 = image_to_base64("assets/hero.jpg")

# ===== UI =====
# Stylesheet handed to Gradio through the `css` argument.
custom_css = """
body { background-color: rgba(0,0,128,0.7); color:#fff; }
h1 { color:#fff; }
p { color:#ccc; }
button { border:none; padding:10px 20px; border-radius:8px; color:#fff; }
.copy-download-buttons { display:flex; gap:20px; justify-content:center; margin-top:20px; }
textarea { border-radius:8px; padding:10px; background-color: rgba(52,58,64,0.9); color:white; border:none; direction:rtl; text-align:right; }
.gradio-container { border-radius:10px; padding:20px; margin:20px; background-color: rgba(28,30,34,0.9); }
#gradio-app .powered-by, footer { display:none !important; }
"""

title = "تبدیل گفتار به متن (Whisper فارسی)"

# Embed the hero image inline only when it could be read.
if image_base64:
    img_html = f'<img src="data:image/jpeg;base64,{image_base64}" width="400px">'
else:
    img_html = ""

description = f"""
<div style="text-align:center; direction:rtl;">
<p>با استفاده از مدل خصوصی، صوت شما به متن تبدیل می‌شود. دسترسی مستقیم به فایل‌های مدل امکان‌پذیر نیست.</p>
<div style="display:flex; justify-content:center;">{img_html}</div>
</div>
"""

article = """
<div style="direction:rtl;">
این یک دمو برای ماژول گفتار به متن فارسی است.
</div>
"""

# Input widgets, in the order of transcribe_audio's parameters
# (mic, upload_audio, file).
_input_components = [
    gr.Audio(
        source="microphone",
        type="filepath",
        label="صدای خود را ضبط کنید",
        clear_on_submit=True,
    ),
    gr.Audio(
        source="upload",
        type="filepath",
        label="یک فایل صوتی بارگذاری کنید",
        max_size=300,
        clear_on_submit=True,
    ),
    gr.File(label="فایل‌های صوتی بزرگ (اختیاری)", type="file"),
]

# Output widgets, in the order of transcribe_audio's return tuple
# (text, transcript file, buttons HTML, audio path).
_output_components = [
    gr.Textbox(
        label="متن تبدیل‌شده",
        interactive=False,
        lines=4,
        elem_id="output-text",
        placeholder="نتیجه اینجا نمایش داده می‌شود.",
    ),
    gr.File(label="دانلود متن", elem_id="download-file"),
    gr.HTML(value="", elem_id="copy-download-buttons"),
    gr.Audio(label="پخش فایل ورودی", type="filepath"),
]

interface = gr.Interface(
    fn=transcribe_audio,
    inputs=_input_components,
    outputs=_output_components,
    title=title,
    description=description,
    article=article,
    css=custom_css,
    allow_flagging="never",
    live=False,
)

if __name__ == "__main__":
    # On Spaces a plain launch is enough; no port/SSL/share settings are needed.
    interface.launch(show_error=True)