Files changed (7) hide show
  1. .gitattributes +0 -4
  2. app.py +146 -163
  3. assets/.gitattributes +0 -2
  4. assets/.gitkeep +0 -0
  5. assets/hero.jpg +0 -3
  6. packages.txt +0 -2
  7. requirements.txt +3 -2
.gitattributes CHANGED
@@ -33,7 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- *.jpg filter=lfs diff=lfs merge=lfs -text
37
- *.png filter=lfs diff=lfs merge=lfs -text
38
- *.wav filter=lfs diff=lfs merge=lfs -text
39
- *.mp3 filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
app.py CHANGED
@@ -1,86 +1,52 @@
1
- # app.py — Persian Whisper ASR (HF Spaces friendly)
2
- import os, time, base64, datetime, logging
3
- from html import escape
4
- from difflib import SequenceMatcher
5
-
6
  import gradio as gr
7
  import torch
8
- import numpy as np
9
- import librosa
10
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  # ===== Logging =====
13
  logging.basicConfig(level=logging.INFO)
14
- logger = logging.getLogger("persian-whisper-space")
15
 
16
- # ===== Env =====
 
 
 
 
 
17
  HF_TOKEN = os.getenv("HF_TOKEN", None)
18
  if HF_TOKEN is None:
19
- logger.warning("HF_TOKEN is not set. Add it in Space Settings → Secrets.")
20
 
21
- # مدل پیش‌فرض (می‌توانی هر موقع عوضش کنی)
22
- MODEL_ID = os.getenv(
23
- "MODEL_ID",
24
- "MohammadReza-Halakoo/Whisper-Small-PersianASR-20-percent-17-0"
25
- )
26
 
27
- # اگر خواستی حذفِ سکوت فعال شود: در Settings→Variables مقدار 1 بگذار
28
- ENABLE_SILENCE_REMOVAL = os.getenv("ENABLE_SILENCE_REMOVAL", "0") == "1"
 
29
 
30
- # ===== Device & dtype =====
31
- if torch.cuda.is_available():
32
- device = "cuda"
33
- torch_dtype = torch.float16
34
- logger.info("GPU detected → using CUDA + float16")
35
- else:
36
- device = "cpu"
37
- try:
38
- # روی CPU، اگر bf16 پشتیبانی نشود، می‌رویم روی float32
39
- torch_dtype = torch.bfloat16 # اکثر اوقات امن است؛ اگر خطا داد، except پایین می‌گیرد
40
- _ = torch.tensor([0], dtype=torch_dtype) # sanity check
41
- except Exception:
42
- torch_dtype = torch.float32
43
- logger.info(f"No GPU detected → falling back to CPU + {torch_dtype}")
44
-
45
- # ===== Load model =====
46
- processor = AutoProcessor.from_pretrained(MODEL_ID, token=HF_TOKEN)
47
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
48
- MODEL_ID,
49
- token=HF_TOKEN,
50
- torch_dtype=torch_dtype if device == "cuda" else None, # روی CPU بهتره float32/bf16 بماند
51
- low_cpu_mem_usage=True,
52
- device_map="auto" if device == "cuda" else None
53
- ).to(device)
54
-
55
- # Pad token safety
56
- if getattr(model.config, "pad_token_id", None) is None:
57
  model.config.pad_token_id = processor.tokenizer.pad_token_id
 
58
  if model.config.pad_token_id == model.config.eos_token_id:
59
  if processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id:
60
  model.config.pad_token_id = processor.tokenizer.pad_token_id
61
  else:
62
- processor.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
63
  model.resize_token_embeddings(len(processor.tokenizer))
64
  model.config.pad_token_id = processor.tokenizer.pad_token_id
65
 
66
- # ===== Optional prompt ids (fa/transcribe) =====
67
- try:
68
- prompt_ids = processor.get_decoder_prompt_ids(language="farsi", task="transcribe")
69
- except Exception:
70
- prompt_ids = None
71
-
72
- # ===== Audio utils =====
73
- def resolve_path(x):
74
- if not x:
75
- return None
76
- if isinstance(x, str):
77
- return x
78
- if hasattr(x, "name"):
79
- return x.name
80
- if isinstance(x, dict) and "name" in x:
81
- return x["name"]
82
- return None
83
-
84
  def load_audio_preserving_quality(audio_path, target_sr=16000):
85
  try:
86
  audio, sr = librosa.load(audio_path, sr=None, mono=False)
@@ -94,152 +60,165 @@ def load_audio_preserving_quality(audio_path, target_sr=16000):
94
  audio = np.nan_to_num(audio)
95
  return audio, sr
96
  except Exception as e:
97
- logger.exception(f"Audio load error: {e}")
98
  return None, None
99
 
100
  def remove_intermediate_silence(audio, sr, silence_thresh=-38, min_silence_len=700, padding=200):
101
- if not ENABLE_SILENCE_REMOVAL:
102
- return audio, sr
103
  try:
104
- # import اینجا تا اگر pydub/ffmpeg نصب نبود، کل برنامه crash نکند
105
- from pydub import AudioSegment
106
- from pydub.silence import detect_nonsilent
107
-
108
  audio_segment = AudioSegment(
109
  (audio * np.iinfo(np.int16).max).astype(np.int16).tobytes(),
110
- frame_rate=sr, sample_width=2, channels=1
 
 
 
 
 
 
 
111
  )
112
- ranges = detect_nonsilent(audio_segment, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
113
- if not ranges:
114
  return np.array([], dtype=np.float32), sr
115
- merged = AudioSegment.empty()
116
- for start, end in ranges:
 
117
  start = max(0, start - padding)
118
  end = min(len(audio_segment), end + padding)
119
- merged += audio_segment[start:end]
120
- data = np.array(merged.get_array_of_samples()).astype(np.float32)
121
- data /= np.iinfo(np.int16).max
122
- return data, sr
 
123
  except Exception as e:
124
- logger.warning(f"Silence removal disabled (error): {e}")
125
  return audio, sr
126
 
127
  def is_silent(audio, threshold=1e-4):
128
  if audio is None or len(audio) == 0:
129
  return True
130
- rms = float(np.sqrt(np.mean(audio**2)))
131
  return rms < threshold
132
 
133
  def merge_transcriptions(transcriptions):
134
  if not transcriptions:
135
- return ""
136
- out = transcriptions[0]
137
  for i in range(1, len(transcriptions)):
138
- prev, cur = out, transcriptions[i]
 
139
  N = 50
140
- match = SequenceMatcher(None, prev[-N:], cur[:N]).find_longest_match(0, min(N, len(prev)), 0, min(N, len(cur)))
 
 
141
  if match.size > 10:
142
- out += cur[match.b + match.size :]
 
143
  else:
144
- out += " " + cur
145
- return out
146
-
147
- # ===== Image helper (optional) =====
148
- def image_to_base64(path):
149
- try:
150
- with open(path, "rb") as f:
151
- return base64.b64encode(f.read()).decode("utf-8")
152
- except Exception:
153
- return None
154
-
155
- image_base64 = image_to_base64("assets/hero.jpg")
156
- img_html = f'<img src="data:image/jpeg;base64,{image_base64}" width="400" />' if image_base64 else ""
157
 
158
- # ===== Inference =====
159
  def transcribe_audio(mic=None, upload_audio=None, file=None):
160
- t0 = time.time()
161
- audio_path = resolve_path(mic) or resolve_path(upload_audio) or resolve_path(file)
162
- logger.info(f"audio_path: {audio_path!r}")
163
  if not audio_path:
164
- return "لطفاً یک فایل صوتی یا صدای ضبط‌شده ارسال کنید.", None, None, None
165
 
166
- audio, sr = load_audio_preserving_quality(audio_path, 16000)
167
  if audio is None:
168
  return "خطا در بارگذاری و پردازش صوت.", None, None, None
169
 
170
  audio, sr = remove_intermediate_silence(audio, sr)
171
  if is_silent(audio):
172
- return "صوت ورودی حاوی صدای قابل پردازش نیست.", None, None, None
173
 
174
- # Chunking ~22s with 3s stride (سبک‌تر از 29s)
175
- max_chunk_length, stride_length = 22, 3
176
- max_chunk_samples, stride_samples = int(max_chunk_length*sr), int(stride_length*sr)
 
 
177
 
178
  chunks, start = [], 0
179
- L = len(audio)
180
- while start < L:
181
- end = min(start + max_chunk_samples, L)
182
- chunks.append(audio[start:end])
183
- if end >= L: break
184
  start += max_chunk_samples - stride_samples
185
- if not chunks:
186
- return "صوت ورودی خالی است.", None, None, None
187
 
188
- # سبک‌تر برای CPU
189
- gen_kwargs = dict(max_new_tokens=225, do_sample=False, num_beams=1, length_penalty=1.0)
190
 
191
- trans = []
192
- for i, chunk in enumerate(chunks, 1):
193
  try:
194
  inputs = processor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
195
- feats = inputs.input_features.to(device)
 
196
 
197
  with torch.no_grad():
198
- if prompt_ids is not None:
199
- dec_ids = torch.tensor([x[1] for x in prompt_ids]).unsqueeze(0).to(device)
200
- ids = model.generate(feats, decoder_input_ids=dec_ids, **gen_kwargs)
201
- else:
202
- ids = model.generate(feats, **gen_kwargs)
203
-
204
- text = processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
205
- trans.append(text)
 
 
 
206
  except Exception as e:
207
- logger.exception(f"Model error on chunk {i}: {e}")
208
  return "خطا در تبدیل گفتار به متن رخ داد.", None, None, None
209
 
210
- final_text = merge_transcriptions(trans).strip()
211
- if not final_text:
212
- return "هیچ متنی استخراج نشد.", None, None, None
213
 
214
- # Save .txt
215
- ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
216
- filename = f"transcription_{ts}.txt"
217
  with open(filename, "w", encoding="utf-8") as f:
218
- f.write(final_text)
219
 
220
- escaped = escape(final_text)
221
- html_buttons = f"""
222
  <div class="copy-download-buttons">
223
- <button id="copy-button" data-transcription="{escaped}"
224
- onclick="var t=this.getAttribute('data-transcription'); if(t){{navigator.clipboard.writeText(t).then(()=>alert('متن کپی شد!'),err=>alert('عدم موفقیت کپی: '+err));}}"
225
- style="padding:8px 16px; background:#4CAF50; color:#fff; border:none; cursor:pointer; border-radius:8px;">
 
 
 
 
226
  کپی متن
227
  </button>
228
  <button id="download-button"
229
- onclick="var a=document.querySelector('#download-file a'); if(a) a.click(); else alert('لینک دانلود یافت نشد!');"
230
- style="padding:8px 16px; background:#008CBA; color:#fff; border:none; cursor:pointer; border-radius:8px;">
 
 
 
231
  دانلود متن
232
  </button>
233
  </div>
234
  """
235
 
236
- # همیشه ورودی را برای پخش برگردان
237
- audio_output = audio_path
238
 
239
- logger.info(f"Done in {time.time()-t0:.2f}s, chunks={len(chunks)}")
240
- return final_text, filename, html_buttons, audio_output
 
 
 
 
 
241
 
242
- # ===== UI / CSS =====
 
 
 
243
  custom_css = """
244
  body { background-color: rgba(0,0,128,0.7); color:#fff; }
245
  h1 { color:#fff; }
@@ -252,37 +231,41 @@ textarea { border-radius:8px; padding:10px; background-color: rgba(52,58,64,0.9)
252
  """
253
 
254
  title = "تبدیل گفتار به متن (Whisper فارسی)"
 
255
  description = f"""
256
  <div style="text-align:center; direction:rtl;">
257
- <p>با استفاده از مدل خصوصی، صوت شما به متن تبدیل می‌شود.</p>
258
  <div style="display:flex; justify-content:center;">{img_html}</div>
259
  </div>
260
  """
 
261
  article = """
262
  <div style="direction:rtl;">
263
- نکته: برای سرعت بیشتر، Space را روی GPU اجرا کنید. برای فعال‌سازی حذف سکوت، متغیر ENABLE_SILENCE_REMOVAL=1 تنظیم شود.
264
  </div>
265
  """
266
 
267
- mic_in = gr.Audio(sources=["microphone"], type="filepath", label="صدای خود را ضبط کنید",
268
- streaming=False, interactive=True)
269
- upl_in = gr.Audio(sources=["upload"], type="filepath", label="یک فایل صوتی بارگذاری کنید",
270
- streaming=False, interactive=True)
271
- big_in = gr.File(label="فایل‌های صوتی بزرگ (اختیاری)")
272
-
273
  interface = gr.Interface(
274
  fn=transcribe_audio,
275
- inputs=[mic_in, upl_in, big_in],
 
 
 
 
276
  outputs=[
277
- gr.Textbox(label="متن تبدیل‌شده", interactive=False, lines=6, elem_id="output-text",
278
- placeholder="نتیجه اینجا نمایش داده می‌شود."),
279
  gr.File(label="دانلود متن", elem_id="download-file"),
280
  gr.HTML(value="", elem_id="copy-download-buttons"),
281
- gr.Audio(label="پخش فایل ورودی"),
282
  ],
283
- title=title, description=description, article=article,
284
- css=custom_css, allow_flagging="never", live=False
 
 
 
 
285
  )
286
 
287
  if __name__ == "__main__":
 
288
  interface.launch(show_error=True)
 
 
 
 
 
 
1
  import gradio as gr
2
  import torch
 
 
3
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
4
+ import librosa
5
+ import numpy as np
6
+ import logging
7
+ import base64
8
+ import os
9
+ import time
10
+ import datetime
11
+ from html import escape
12
+ from difflib import SequenceMatcher
13
+ from pydub import AudioSegment
14
+ from pydub.silence import detect_nonsilent
15
 
16
  # ===== Logging =====
17
  logging.basicConfig(level=logging.INFO)
 
18
 
# ===== Device =====
# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
logging.info("Using device: %s", device)

# ===== Model (private) =====
# 1) Create a secret named HF_TOKEN under Settings → Secrets.
HF_TOKEN = os.getenv("HF_TOKEN", None)
if HF_TOKEN is None:
    logging.warning("HF_TOKEN is not set. Make sure to add it in Space Settings → Secrets.")

# 2) Put the ID of your own private model here (overridable through the
#    MODEL_ID environment variable).
#    Example: "MohammadReza-Halakoo/1-persian-whisper-large-v"
MODEL_ID = os.getenv("MODEL_ID", "MohammadReza-Halakoo/1-persian-whisper-large-v")

# `token=` replaces the deprecated `use_auth_token=` keyword.
processor = AutoProcessor.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = model.to(device)

# Attention-mask safety: make sure the model has a pad token, and that it is
# distinct from the end-of-sequence token.
if model.config.pad_token_id is None:
    model.config.pad_token_id = processor.tokenizer.pad_token_id

if model.config.pad_token_id == model.config.eos_token_id:
    if processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id:
        # The tokenizer already has a separate pad token; adopt it.
        model.config.pad_token_id = processor.tokenizer.pad_token_id
    else:
        # No separate pad token anywhere: add one and grow the embedding
        # matrix so the new token id is valid.
        processor.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(processor.tokenizer))
        model.config.pad_token_id = processor.tokenizer.pad_token_id
48
 
49
+ # ===== Audio Utils =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def load_audio_preserving_quality(audio_path, target_sr=16000):
51
  try:
52
  audio, sr = librosa.load(audio_path, sr=None, mono=False)
 
60
  audio = np.nan_to_num(audio)
61
  return audio, sr
62
  except Exception as e:
63
+ logging.error(f"Audio load error: {str(e)}")
64
  return None, None
65
 
def remove_intermediate_silence(audio, sr, silence_thresh=-38, min_silence_len=700, padding=200):
    """Cut long silent stretches out of a waveform.

    The samples are converted to 16-bit PCM, handed to pydub's
    ``detect_nonsilent`` and the non-silent ranges (each widened by
    ``padding``) are concatenated back into one float32 array.

    Args:
        audio: Sample array; it is encoded as a single channel
            (``channels=1``) and scaled by the int16 maximum, so values are
            expected in [-1, 1].
        sr: Sample rate used as the segment's frame rate.
        silence_thresh: Forwarded to ``detect_nonsilent`` (pydub documents
            it in dBFS).
        min_silence_len: Forwarded to ``detect_nonsilent`` (pydub documents
            it in milliseconds).
        padding: Amount kept around every non-silent range, in the same
            unit as the ranges pydub returns.

    Returns:
        ``(processed_audio, sr)``. If no non-silent range is found the
        array is empty. If ``audio`` is ``None``/empty, or anything fails,
        the input is returned unchanged (best-effort behaviour).
    """
    # Nothing to analyse; hand the input straight back.
    if audio is None or len(audio) == 0:
        return audio, sr

    try:
        int16_max = np.iinfo(np.int16).max
        # Clip before scaling: samples outside [-1, 1] would otherwise wrap
        # around when cast to int16 and turn into loud clicks.
        pcm = (np.clip(audio, -1.0, 1.0) * int16_max).astype(np.int16)
        audio_segment = AudioSegment(
            pcm.tobytes(),
            frame_rate=sr,
            sample_width=2,
            channels=1,
        )
        nonsilent_ranges = detect_nonsilent(
            audio_segment,
            min_silence_len=min_silence_len,
            silence_thresh=silence_thresh,
        )
        if not nonsilent_ranges:
            return np.array([], dtype=np.float32), sr

        non_silent_audio = AudioSegment.empty()
        previous_end = 0
        for start, end in nonsilent_ranges:
            # Never start before the previous slice ended, so overlapping
            # padded ranges do not duplicate audio.
            start = max(previous_end, start - padding)
            end = min(len(audio_segment), end + padding)
            non_silent_audio += audio_segment[start:end]
            previous_end = end

        processed_audio = np.array(non_silent_audio.get_array_of_samples()).astype(np.float32)
        processed_audio /= int16_max
        return processed_audio, sr
    except Exception as e:
        # Deliberate best-effort: silence removal is optional, so fall back
        # to the untouched audio but keep the traceback in the log.
        logging.exception("Silence removal error: %s", e)
        return audio, sr
94
 
def is_silent(audio, threshold=1e-4):
    """Tell whether a waveform is missing, empty, or quieter than a threshold.

    Args:
        audio: Sample array (or ``None``).
        threshold: RMS level below which the audio counts as silent.

    Returns:
        ``True`` if ``audio`` is ``None``, has length 0, or its RMS is
        strictly below ``threshold``; otherwise ``False``. Always a plain
        Python ``bool``.
    """
    if audio is None or len(audio) == 0:
        return True
    # Square in float64: squaring integer PCM (e.g. int16) in its own dtype
    # overflows and can make loud audio look silent.
    samples = np.asarray(audio, dtype=np.float64)
    rms = float(np.sqrt(np.mean(np.square(samples))))
    return rms < threshold
100
 
def merge_transcriptions(transcriptions, window=50, min_overlap=10):
    """Join per-chunk transcripts, dropping text repeated at chunk borders.

    For each new transcript, the last ``window`` characters of the text
    merged so far are compared with the first ``window`` characters of the
    new one. If their longest common substring is longer than
    ``min_overlap``, everything in the new transcript up to the end of that
    match is skipped; otherwise the transcript is appended after a space.

    Args:
        transcriptions: Sequence of transcript strings in chunk order.
        window: Number of characters inspected on each side of a border.
        min_overlap: A match must be strictly longer than this to count as
            an overlap.

    Returns:
        The merged text, or ``''`` when ``transcriptions`` is empty.
    """
    if not transcriptions:
        return ''

    merged = transcriptions[0]
    for current in transcriptions[1:]:
        # Guard window <= 0: a plain merged[-0:] would select the whole string.
        tail = merged[-window:] if window > 0 else ''
        head = current[:window] if window > 0 else ''
        # autojunk=False keeps matching exact for windows of 200+ characters.
        match = SequenceMatcher(None, tail, head, autojunk=False).find_longest_match(
            0, len(tail), 0, len(head)
        )
        if match.size > min_overlap:
            merged += current[match.b + match.size:]
        else:
            merged += ' ' + current
    return merged
 
 
 
 
 
 
 
 
 
 
 
118
 
119
+ # ===== Core Inference =====
def transcribe_audio(mic=None, upload_audio=None, file=None):
    """Transcribe one audio input to text.

    The first non-empty source among ``mic``, ``upload_audio`` and ``file``
    is loaded, passed through silence removal, split into overlapping
    chunks, decoded chunk by chunk and merged into a single transcript.

    Args:
        mic: Path of a microphone recording, or a falsy value.
        upload_audio: Path of an uploaded audio file, or a falsy value.
        file: Value of the generic file input: a path string, an object
            exposing ``.name``, a dict with a ``"name"`` key, or a falsy
            value.

    Returns:
        A 4-tuple ``(text, txt_path, buttons_html, audio_path)``.
        ``audio_path`` is only set when ``file`` was supplied. On any
        failure the first element is a user-facing message and the other
        three are ``None``.
    """
    # Local import so this block does not depend on the file's import header.
    import tempfile

    start_time = time.time()

    # Accept a plain path as well as objects/dicts that carry the path in "name".
    if isinstance(file, str):
        file_path = file
    elif isinstance(file, dict):
        file_path = file.get("name")
    else:
        file_path = getattr(file, "name", None)

    audio_path = mic or upload_audio or file_path
    if not audio_path:
        return 'لطفاً یک فایل صوتی یا صدای ضبط‌شده ارسال کنید.', None, None, None

    audio, sr = load_audio_preserving_quality(audio_path, target_sr=16000)
    if audio is None:
        return "خطا در بارگذاری و پردازش صوت.", None, None, None

    audio, sr = remove_intermediate_silence(audio, sr)
    if is_silent(audio):
        return 'صوت ورودی حاوی صدای قابل پردازش نیست.', None, None, None

    # Split into 29-second chunks that overlap by 3 seconds.
    max_chunk_length = 29
    stride_length = 3
    # Cast once so slicing works even if the sample rate is not an int.
    max_chunk_samples = int(max_chunk_length * sr)
    stride_samples = int(stride_length * sr)

    total_samples = len(audio)
    chunks, start = [], 0
    while start < total_samples:
        end = min(start + max_chunk_samples, total_samples)
        chunks.append(audio[start:end])
        if end == total_samples:
            break
        start += max_chunk_samples - stride_samples

    # Defensive: kept from the original even though silent/empty audio has
    # already been rejected above.
    if not chunks:
        return 'صوت ورودی خالی است.', None, None, None

    transcriptions = []
    for chunk_number, chunk in enumerate(chunks, 1):
        try:
            inputs = processor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
            input_features = inputs.input_features.to(device)
            attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None

            with torch.no_grad():
                # NOTE(review): temperature is passed together with num_beams
                # and without do_sample — confirm this is the intended
                # decoding strategy for the installed transformers version.
                generated_ids = model.generate(
                    input_features,
                    attention_mask=attention_mask,
                    num_beams=5,
                    length_penalty=1.0,
                    repetition_penalty=1.1,
                    no_repeat_ngram_size=4,
                    temperature=0.9,
                )
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            transcriptions.append(transcription)
        except Exception as e:
            # Top-level boundary for the UI: log with traceback, show a
            # friendly message instead of crashing the request.
            logging.exception("Model error on chunk %d: %s", chunk_number, e)
            return "خطا در تبدیل گفتار به متن رخ داد.", None, None, None

    final_transcription = merge_transcriptions(transcriptions)
    if not final_transcription.strip():
        return 'هیچ متنی استخراج نشد.', None, None, None

    # Write the transcript to a uniquely named temp file: a timestamp alone
    # collides when two requests finish within the same second, and writing
    # into the working directory litters the app folder.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        prefix=f"transcription_{timestamp}_",
        suffix=".txt",
        delete=False,
    ) as f:
        f.write(final_transcription)
        filename = f.name

    # escape() also escapes quotes, so the text is safe inside the attribute.
    escaped_transcription = escape(final_transcription)
    copy_download_buttons_html = f"""
    <div class="copy-download-buttons">
        <button id="copy-button" data-transcription="{escaped_transcription}"
            onclick="
                var t=this.getAttribute('data-transcription');
                if(t){{navigator.clipboard.writeText(t).then(()=>alert('متن کپی شد!'),err=>alert('عدم موفقیت کپی: '+err));}}
                else{{alert('متنی یافت نشد!');}}
            "
            style="padding:8px 16px; background:#4CAF50; color:#fff; border:none; cursor:pointer;">
            کپی متن
        </button>
        <button id="download-button"
            onclick="
                var a=document.querySelector('#download-file a');
                if(a) a.click(); else alert('لینک دانلود یافت نشد!');
            "
            style="padding:8px 16px; background:#008CBA; color:#fff; border:none; cursor:pointer;">
            دانلود متن
        </button>
    </div>
    """

    # Only echo the audio back for playback when it came from the file input.
    audio_output = audio_path if file else None

    logging.info(
        "Transcription finished in %.2fs (%d chunk(s))",
        time.time() - start_time,
        len(chunks),
    )
    return final_transcription, filename, copy_download_buttons_html, audio_output
209
 
210
+ # ===== Image helper =====
def image_to_base64(image_path):
    """Read a file and return its contents as a base64 string.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64-encoded file contents as ``str``, or ``None`` when the
        file cannot be read (the image is optional).
    """
    try:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    except (OSError, TypeError, ValueError) as e:
        # Missing/unreadable file or an unusable path value: the image is
        # optional, so report it and carry on without one.
        logging.info("Image %r could not be loaded: %s", image_path, e)
        return None
217
 
218
+ # لطفاً یک تصویر در مسیر assets/hero.jpg قرار دهید (دلخواه)
219
+ image_base64 = image_to_base64("assets/hero.jpg")
220
+
221
+ # ===== UI =====
222
  custom_css = """
223
  body { background-color: rgba(0,0,128,0.7); color:#fff; }
224
  h1 { color:#fff; }
 
231
  """
232
 
233
  title = "تبدیل گفتار به متن (Whisper فارسی)"
234
+ img_html = f'<img src="data:image/jpeg;base64,{image_base64}" width="400px">' if image_base64 else ""
235
  description = f"""
236
  <div style="text-align:center; direction:rtl;">
237
+ <p>با استفاده از مدل خصوصی، صوت شما به متن تبدیل می‌شود. دسترسی مستقیم به فایل‌های مدل امکان‌پذیر نیست.</p>
238
  <div style="display:flex; justify-content:center;">{img_html}</div>
239
  </div>
240
  """
241
+
242
  article = """
243
  <div style="direction:rtl;">
244
+ این یک دمو برای ماژول گفتار به متن فارسی است.
245
  </div>
246
  """
247
 
 
 
 
 
 
 
# Build the UI with the Gradio 4 component API, matching the
# `gradio>=4.36.0` pin in requirements.txt: `sources=[...]` replaces the
# Gradio 3 `source=` keyword and gr.File takes type="filepath".
# NOTE(review): the Gradio 3-era `max_size` and `clear_on_submit` arguments
# were dropped because they are not Gradio 4 component parameters — confirm
# whether an upload size limit is still needed and where to configure it.
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath", label="صدای خود را ضبط کنید"),
        gr.Audio(sources=["upload"], type="filepath", label="یک فایل صوتی بارگذاری کنید"),
        gr.File(label="فایل‌های صوتی بزرگ (اختیاری)", type="filepath"),
    ],
    outputs=[
        gr.Textbox(
            label="متن تبدیل‌شده",
            interactive=False,
            lines=4,
            elem_id="output-text",
            placeholder="نتیجه اینجا نمایش داده می‌شود.",
        ),
        # elem_id is referenced by the download button's JavaScript.
        gr.File(label="دانلود متن", elem_id="download-file"),
        gr.HTML(value="", elem_id="copy-download-buttons"),
        gr.Audio(label="پخش فایل ورودی", type="filepath"),
    ],
    title=title,
    description=description,
    article=article,
    css=custom_css,
    allow_flagging="never",
    live=False,
)

if __name__ == "__main__":
    # On Spaces a plain launch is enough; no port/SSL/share settings needed.
    interface.launch(show_error=True)
assets/.gitattributes DELETED
@@ -1,2 +0,0 @@
1
- *.png filter=lfs diff=lfs merge=lfs -text
2
- *.jpg filter=lfs diff=lfs merge=lfs -text
 
 
 
assets/.gitkeep DELETED
File without changes
assets/hero.jpg DELETED

Git LFS Details

  • SHA256: 9c5a95d02bb857e862d0fedb7cd497abfbaf52a06031e638ac01d343651d775e
  • Pointer size: 130 Bytes
  • Size of remote file: 68.7 kB
packages.txt DELETED
@@ -1,2 +0,0 @@
1
- ffmpeg
2
- libsndfile1
 
 
 
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
- gradio>=4.44.0
2
  transformers>=4.42.0
3
- torch # (نسخه مطابق هاردور Space)
 
 
4
  librosa
5
  numpy
6
  pydub
 
 
1
  transformers>=4.42.0
2
+ torch
3
+ torchaudio
4
+ gradio>=4.36.0
5
  librosa
6
  numpy
7
  pydub