lokesh341 committed on
Commit
8994492
·
verified ·
1 Parent(s): 22a2612

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -10
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import torch
2
  from flask import Flask, render_template, request, jsonify
3
  import os
4
- import re
5
  from transformers import pipeline
6
  from gtts import gTTS
7
  from pydub import AudioSegment
@@ -10,11 +9,11 @@ from waitress import serve
10
 
11
  app = Flask(__name__)
12
 
13
- # Load Whisper ASR Model Correctly
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
  asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=0 if device == "cuda" else -1)
16
 
17
- # Function to generate voice prompts
18
  def generate_audio_prompt(text, filename):
19
  tts = gTTS(text=text, lang="en")
20
  tts.save(os.path.join("static", filename))
@@ -30,16 +29,29 @@ prompts = {
30
  for key, text in prompts.items():
31
  generate_audio_prompt(text, f"{key}.mp3")
32
 
33
- # Ensure Proper Audio Format
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def convert_to_wav(input_path, output_path):
35
  try:
36
  audio = AudioSegment.from_file(input_path)
37
- audio = audio.set_frame_rate(16000).set_channels(1) # Convert to 16kHz, mono
38
  audio.export(output_path, format="wav")
39
  except Exception as e:
40
  raise Exception(f"Audio conversion failed: {str(e)}")
41
 
42
- # Check for Silence
43
  def is_silent_audio(audio_path):
44
  audio = AudioSegment.from_wav(audio_path)
45
  nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
@@ -60,14 +72,14 @@ def transcribe():
60
  audio_file.save(input_audio_path)
61
 
62
  try:
63
- # Convert audio to proper format
64
  convert_to_wav(input_audio_path, output_audio_path)
65
 
66
- # Check for silent audio
67
  if is_silent_audio(output_audio_path):
68
  return jsonify({"error": "No speech detected. Please try again."}), 400
69
 
70
- # Transcribe Using Whisper ASR
71
  result = asr_model(output_audio_path, generate_kwargs={"language": "en"})
72
  transcribed_text = result["text"].strip().capitalize()
73
 
@@ -75,6 +87,6 @@ def transcribe():
75
  except Exception as e:
76
  return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
77
 
78
- # Start Production Server
79
  if __name__ == "__main__":
80
  serve(app, host="0.0.0.0", port=7860)
 
1
  import torch
2
  from flask import Flask, render_template, request, jsonify
3
  import os
 
4
  from transformers import pipeline
5
  from gtts import gTTS
6
  from pydub import AudioSegment
 
9
 
10
  app = Flask(__name__)
11
 
12
+ # Load Whisper ASR Model for Highly Accurate Speech-to-Text
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
  asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=0 if device == "cuda" else -1)
15
 
16
+ # Function to generate audio prompts
17
  def generate_audio_prompt(text, filename):
18
  tts = gTTS(text=text, lang="en")
19
  tts.save(os.path.join("static", filename))
 
29
  for key, text in prompts.items():
30
  generate_audio_prompt(text, f"{key}.mp3")
31
 
32
+ # Symbol mapping for proper recognition
33
+ SYMBOL_MAPPING = {
34
+ "at the rate": "@",
35
+ "at": "@",
36
+ "dot": ".",
37
+ "underscore": "_",
38
+ "hash": "#",
39
+ "plus": "+",
40
+ "dash": "-",
41
+ "comma": ",",
42
+ "space": " "
43
+ }
44
+
45
+ # Function to convert audio to WAV format
46
  def convert_to_wav(input_path, output_path):
47
  try:
48
  audio = AudioSegment.from_file(input_path)
49
+ audio = audio.set_frame_rate(16000).set_channels(1) # Convert to 16kHz, mono
50
  audio.export(output_path, format="wav")
51
  except Exception as e:
52
  raise Exception(f"Audio conversion failed: {str(e)}")
53
 
54
+ # Function to check if audio contains actual speech
55
  def is_silent_audio(audio_path):
56
  audio = AudioSegment.from_wav(audio_path)
57
  nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
 
72
  audio_file.save(input_audio_path)
73
 
74
  try:
75
+ # Convert to WAV
76
  convert_to_wav(input_audio_path, output_audio_path)
77
 
78
+ # Check for silence
79
  if is_silent_audio(output_audio_path):
80
  return jsonify({"error": "No speech detected. Please try again."}), 400
81
 
82
+ # Use Whisper ASR model for transcription
83
  result = asr_model(output_audio_path, generate_kwargs={"language": "en"})
84
  transcribed_text = result["text"].strip().capitalize()
85
 
 
87
  except Exception as e:
88
  return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
89
 
90
+ # Start Production Server
91
  if __name__ == "__main__":
92
  serve(app, host="0.0.0.0", port=7860)