cigol123 committed on
Commit
b2ce8ef
·
verified ·
1 Parent(s): 72f1cd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -36
app.py CHANGED
@@ -4,51 +4,22 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration
4
  import soundfile as sf
5
  import numpy as np
6
  from scipy import signal
7
- import os
8
 
9
- # Set the cache directory to a writable location
10
- cache_dir = "/tmp/.cache"
11
- os.environ["TRANSFORMERS_CACHE"] = cache_dir
12
- os.environ["HF_DATASETS_CACHE"] = cache_dir
13
- os.environ["TORCH_HOME"] = cache_dir # Set PyTorch cache directory
14
-
15
- # Ensure the cache directory exists and is writable
16
- os.makedirs(cache_dir, exist_ok=True)
17
-
18
- # Load the base Whisper model and processor
19
- def load_model():
20
- print("Loading base Whisper model and processor...")
21
- processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
22
- model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
23
-
24
- # Load the fine-tuned weights from the Macedonian-ASR repository
25
- print("Loading fine-tuned weights from Macedonian-ASR...")
26
- model.load_state_dict(torch.hub.load_state_dict_from_url(
27
- "https://huggingface.co/Macedonian-ASR/whisper-large-v3-macedonian-asr/resolve/main/pytorch_model.bin",
28
- map_location="cpu",
29
- model_dir=cache_dir # Save downloaded weights to the writable cache directory
30
- ))
31
- print("✓ Model and processor loaded successfully!")
32
- return processor, model
33
-
34
- processor, model = load_model()
35
 
36
  def process_audio(audio_path):
37
- # Load and resample to 16kHz using scipy
38
  waveform, sr = sf.read(audio_path)
39
- if len(waveform.shape) > 1: # Convert stereo to mono
40
  waveform = waveform.mean(axis=1)
41
- if sr != 16000: # Resample if necessary
42
  num_samples = int(len(waveform) * 16000 / sr)
43
  waveform = signal.resample(waveform, num_samples)
44
 
45
- # Process the audio
46
  inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
47
  predicted_ids = model.generate(**inputs, language="mk")
48
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
49
- return transcription
50
 
51
- # Gradio interface
52
  demo = gr.Interface(
53
  fn=process_audio,
54
  inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
@@ -57,5 +28,4 @@ demo = gr.Interface(
57
  description="Качете аудио или користете микрофон за транскрипција на македонски говор / Upload audio or use microphone to transcribe Macedonian speech"
58
  )
59
 
60
- if __name__ == "__main__":
61
- demo.launch()
 
4
  import soundfile as sf
5
  import numpy as np
6
  from scipy import signal
 
7
 
8
# Single source of truth for the checkpoint so the processor and the model
# are always loaded from the same repository.
MODEL_ID = "openai/whisper-large-v3"

# Loaded once at import time and reused by every transcription request.
processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def process_audio(audio_path):
 
12
  waveform, sr = sf.read(audio_path)
13
+ if len(waveform.shape) > 1:
14
  waveform = waveform.mean(axis=1)
15
+ if sr != 16000:
16
  num_samples = int(len(waveform) * 16000 / sr)
17
  waveform = signal.resample(waveform, num_samples)
18
 
 
19
  inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
20
  predicted_ids = model.generate(**inputs, language="mk")
21
+ return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
 
22
 
 
23
  demo = gr.Interface(
24
  fn=process_audio,
25
  inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
 
28
  description="Качете аудио или користете микрофон за транскрипција на македонски говор / Upload audio or use microphone to transcribe Macedonian speech"
29
  )
30
 
31
# Start the server only when this file is executed as a script, so importing
# the module (e.g. from a test or a reload tool) does not launch it.
if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)