geethareddy committed
Commit caba180 · verified · 1 Parent(s): e1fbc96

Update app.py

Files changed (1):
  app.py +27 -23
app.py CHANGED
@@ -1,7 +1,6 @@
 import torch
 from flask import Flask, render_template, request, jsonify
 import os
-import re
 from transformers import pipeline
 from gtts import gTTS
 from pydub import AudioSegment
@@ -10,11 +9,11 @@ from waitress import serve
 
 app = Flask(__name__)
 
-# ✅ Load Whisper ASR Model Correctly
+# Use whisper-small for faster processing and better speed
 device = "cuda" if torch.cuda.is_available() else "cpu"
-asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=0 if device == "cuda" else -1)
+asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if device == "cuda" else -1)
 
-# Function to generate voice prompts
+# Function to generate audio prompts
 def generate_audio_prompt(text, filename):
     tts = gTTS(text=text, lang="en")
     tts.save(os.path.join("static", filename))
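Note: this hunk trades accuracy for speed by swapping whisper-large-v3 for whisper-small. For recordings longer than about 30 seconds, the transformers ASR pipeline can also chunk the audio; a minimal variant of the load above (chunk_length_s is a standard pipeline argument, but worth verifying against the installed transformers version):

    asr_model = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        chunk_length_s=30,  # transcribe long recordings in 30-second windows
        device=0 if device == "cuda" else -1,
    )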
@@ -22,7 +21,7 @@ def generate_audio_prompt(text, filename):
 # Generate required voice prompts
 prompts = {
     "welcome": "Welcome to Biryani Hub.",
-    "ask_name": "Tell me your full name.",
+    "ask_name": "Tell me your name.",
     "ask_email": "Please provide your email address.",
     "thank_you": "Thank you for registration."
 }
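Note: the prompt loop in the next hunk writes MP3s into static/ at import time, and gTTS's save() fails if that directory does not exist. A one-line guard before the loop (my addition, not part of this commit) keeps startup from crashing on a fresh checkout:

    os.makedirs("static", exist_ok=True)  # ensure the prompt output directory exists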
@@ -30,28 +29,33 @@ prompts = {
 for key, text in prompts.items():
     generate_audio_prompt(text, f"{key}.mp3")
 
-# ✅ Ensure Proper Audio Format (16kHz, Mono)
+# Symbol mapping for proper recognition
+SYMBOL_MAPPING = {
+    "at the rate": "@",
+    "at": "@",
+    "dot": ".",
+    "underscore": "_",
+    "hash": "#",
+    "plus": "+",
+    "dash": "-",
+    "comma": ",",
+    "space": " "
+}
+
+# Function to convert audio to WAV format
 def convert_to_wav(input_path, output_path):
     try:
         audio = AudioSegment.from_file(input_path)
-        audio = audio.set_frame_rate(16000).set_channels(1)  # ✅ Convert to 16kHz, mono
+        audio = audio.set_frame_rate(16000).set_channels(1)  # Convert to 16kHz, mono
         audio.export(output_path, format="wav")
     except Exception as e:
         raise Exception(f"Audio conversion failed: {str(e)}")
 
-# ✅ Check for Silence
+# Function to check if audio contains actual speech
 def is_silent_audio(audio_path):
     audio = AudioSegment.from_wav(audio_path)
-    nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
-    return len(nonsilent_parts) == 0
-
-# ✅ Clean Transcription Text (Improved Formatting)
-def clean_transcription(text):
-    text = text.strip()
-    text = re.sub(r"[-.]", "", text)  # ✅ Remove unwanted characters
-    # Fix email structure and common recognition errors
-    text = text.replace(" at the rate ", "@").replace(" dot ", ".")
-    return text.capitalize()
+    nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)  # Reduced silence duration
+    return len(nonsilent_parts) == 0  # If no speech detected
 
 @app.route("/")
 def index():
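Two issues in this hunk are worth flagging. First, is_silent_audio() calls detect_nonsilent(), but neither the old nor the new version imports it, so the silence check raises NameError at runtime. The function lives in pydub's silence module:

    from pydub.silence import detect_nonsilent

Second, SYMBOL_MAPPING is defined but never referenced, so spoken symbols are not actually rewritten anywhere. If the goal is to normalize dictated email addresses, a helper along these lines could be applied to the transcript (a sketch only; apply_symbol_mapping is a hypothetical name, not in the commit). Since " at the rate " must be replaced before the bare " at ", longer phrases should come first; the dict's insertion order, which Python preserves, already guarantees that:

    def apply_symbol_mapping(text):
        # Replace spoken symbol names ("dot", "at the rate", ...) with characters.
        for spoken, symbol in SYMBOL_MAPPING.items():
            text = text.replace(f" {spoken} ", symbol)
        return text.strip()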
@@ -68,21 +72,21 @@ def transcribe():
     audio_file.save(input_audio_path)
 
     try:
-        # ✅ Convert audio to proper format
+        # Convert to WAV
         convert_to_wav(input_audio_path, output_audio_path)
 
-        # ✅ Check for silent audio
+        # Check for silence
         if is_silent_audio(output_audio_path):
             return jsonify({"error": "No speech detected. Please try again."}), 400
 
-        # ✅ Transcribe Using Whisper ASR
+        # Use Whisper ASR model for transcription
         result = asr_model(output_audio_path, generate_kwargs={"language": "en"})
-        transcribed_text = clean_transcription(result["text"])
+        transcribed_text = result["text"].strip().capitalize()
 
         return jsonify({"text": transcribed_text})
     except Exception as e:
         return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
 
-# ✅ Start Production Server
+# Start Production Server
 if __name__ == "__main__":
     serve(app, host="0.0.0.0", port=7860)
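Note: with clean_transcription() removed, the endpoint now returns result["text"].strip().capitalize(). str.capitalize() lowercases everything after the first character, and no symbol substitution happens, which is a poor fit for the dictated email addresses this app collects:

    text = "John dot Smith at the rate Gmail dot com"
    print(text.strip().capitalize())
    # -> 'John dot smith at the rate gmail dot com' (spoken symbols left as words)

Running the transcript through the apply_symbol_mapping() sketch above before returning would restore the email fix-ups the removed clean_transcription() used to provide.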
 