sevda committed on
Commit
e80497f
·
verified ·
1 Parent(s): e4cfdaf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -28
app.py CHANGED
@@ -1,15 +1,15 @@
1
  import torch
2
  import torchaudio
3
- # from transformers import WhisperProcessor, WhisperForConditionalGeneration
4
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
5
  import gradio as gr
6
 
 
7
  DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
8
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
9
  WHISPER_SAMPLE_RATE = 16000
10
 
 
11
  model_id = "ixxan/whisper-small-thugy20"
12
-
13
  processor = AutoProcessor.from_pretrained("ixxan/whisper-small-thugy20")
14
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
15
  model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
@@ -26,37 +26,48 @@ pipe = pipeline(
26
  )
27
 
28
  def preprocess_audio(audio_path: str) -> torch.Tensor:
29
- audio, sample_rate = torchaudio.load(audio_path)
30
- # Resample if necessary
31
- if sample_rate != WHISPER_SAMPLE_RATE:
32
- resampler = torchaudio.transforms.Resample(
33
- orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE
34
- )
35
- audio = resampler(audio)
36
- # Convert to mono
37
- if audio.shape[0] > 1:
38
- audio = torch.mean(audio, dim=0)
39
- return audio.squeeze()
40
-
41
-
42
- def transcribe(audio_path: str) -> str:
43
- audio_input = preprocess_audio(audio_path)
44
- input_features = processor(
45
- audio_input,
46
- sampling_rate=WHISPER_SAMPLE_RATE,
47
- return_tensors="pt",
48
- # language="Chinese",
49
- ).input_features.to(DEVICE)
50
-
51
- predicted_ids = model.generate(input_features)
52
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
53
- return transcription
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
 
56
  iface = gr.Interface(
57
  fn=transcribe,
58
  inputs=gr.Audio(type="filepath"),
59
  outputs="text",
60
  title="Uyghur Speech Recognition",
 
 
61
  )
62
- iface.launch()
 
 
 
 
1
  import torch
2
  import torchaudio
 
3
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
4
  import gradio as gr
5
 
6
# Setup device: use the first CUDA GPU with half precision when one is
# available, otherwise fall back to the CPU with full precision.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# Sample rate (Hz) that audio is resampled to and that is passed to the processor.
WHISPER_SAMPLE_RATE = 16000

# Load model and processor
model_id = "ixxan/whisper-small-thugy20"
# Reuse model_id so the processor and the model always come from the same
# checkpoint instead of two string literals that can drift apart.
processor = AutoProcessor.from_pretrained(model_id)
14
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
15
  model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
 
26
  )
27
 
28
  def preprocess_audio(audio_path: str) -> torch.Tensor:
29
+ try:
30
+ audio, sample_rate = torchaudio.load(audio_path)
31
+ # Resample if necessary
32
+ if sample_rate != WHISPER_SAMPLE_RATE:
33
+ resampler = torchaudio.transforms.Resample(
34
+ orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE
35
+ )
36
+ audio = resampler(audio)
37
+ # Convert to mono
38
+ if audio.shape[0] > 1:
39
+ audio = torch.mean(audio, dim=0)
40
+ return audio.squeeze()
41
+ except Exception as e:
42
+ raise RuntimeError(f"Error processing audio file: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
43
 
44
def transcribe(audio_path):
    """Transcribe the audio file at ``audio_path`` and return the text.

    Args:
        audio_path: Path to an audio file, or ``None`` when no audio was
            supplied.

    Returns:
        The decoded transcription of the audio. When ``audio_path`` is
        ``None`` a prompt asking for audio is returned instead, and when any
        step raises an ``Exception`` an ``"Error transcribing audio: ..."``
        message is returned rather than propagating the error.
    """
    # Guard clause: nothing to transcribe.
    if audio_path is None:
        return "No audio provided. Please record or upload an audio file."

    try:
        audio_input = preprocess_audio(audio_path)
        input_features = processor(
            audio_input,
            sampling_rate=WHISPER_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features
        # The model is loaded with torch_dtype (float16 when CUDA is
        # available), so the features must be cast to the same dtype as well
        # as moved to the device; otherwise they stay float32 and mismatch.
        input_features = input_features.to(DEVICE, dtype=torch_dtype)
        predicted_ids = model.generate(input_features)
        return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # Report the failure as text so the UI shows a message, not a crash.
        return f"Error transcribing audio: {e}"
60
 
61
# Build the Gradio UI: one audio input, delivered to `transcribe` as a file
# path, and a plain-text output for the transcription.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Uyghur Speech Recognition",
    description="Upload or record audio in Uyghur to get its transcription.",
    examples=[],  # Placeholder: add example audio files here if available
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()