geethareddy committed on
Commit
4b54307
·
verified ·
1 Parent(s): f95dae2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -11
app.py CHANGED
@@ -1,25 +1,26 @@
1
  from flask import Flask, render_template, request, jsonify
2
  import os
3
  import torch
 
4
  from transformers import pipeline
5
  from gtts import gTTS
6
- import re
7
  from pydub import AudioSegment
8
  from pydub.silence import detect_nonsilent
9
  from waitress import serve
 
10
 
11
  app = Flask(__name__)
12
 
13
- # Load Whisper Model (Use whisper-small for better performance)
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
- asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if device == "cuda" else -1)
16
 
17
  # Function to generate audio prompts
18
  def generate_audio_prompt(text, filename):
19
  tts = gTTS(text=text, lang="en")
20
  tts.save(os.path.join("static", filename))
21
 
22
- # Generate all required voice prompts
23
  prompts = {
24
  "welcome": "Welcome to Biryani Hub.",
25
  "ask_name": "Tell me your name.",
@@ -30,7 +31,7 @@ prompts = {
30
  for key, text in prompts.items():
31
  generate_audio_prompt(text, f"{key}.mp3")
32
 
33
- # Symbol mapping for proper recognition
34
  SYMBOL_MAPPING = {
35
  "at the rate": "@",
36
  "at": "@",
@@ -43,17 +44,25 @@ SYMBOL_MAPPING = {
43
  "space": " "
44
  }
45
 
 
 
 
 
 
 
 
 
46
  # Function to clean and format transcribed text properly
47
  def clean_transcription(text):
48
  text = text.lower().strip()
49
  for word, symbol in SYMBOL_MAPPING.items():
50
  text = text.replace(word, symbol)
51
- return text.capitalize()
52
 
53
  # Function to check if the audio contains actual speech
54
  def is_silent_audio(audio_path):
55
  audio = AudioSegment.from_wav(audio_path)
56
- nonsilent_parts = detect_nonsilent(audio, min_silence_len=700, silence_thresh=audio.dBFS-16)
57
  return len(nonsilent_parts) == 0 # Returns True if silence detected
58
 
59
  @app.route("/")
@@ -70,18 +79,18 @@ def transcribe():
70
  audio_file.save(audio_path)
71
 
72
  try:
73
- # Check if audio contains valid speech
74
  if is_silent_audio(audio_path):
75
  return jsonify({"error": "No speech detected. Please try again."}), 400
76
 
77
- # Force Whisper to transcribe only in English
78
- result = asr_model(audio_path, generate_kwargs={"language": "en"})
79
  transcribed_text = clean_transcription(result["text"])
80
 
81
  return jsonify({"text": transcribed_text})
82
  except Exception as e:
83
  return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
84
 
85
- # Run Waitress Production Server
86
  if __name__ == "__main__":
87
  serve(app, host="0.0.0.0", port=7860)
 
1
  from flask import Flask, render_template, request, jsonify
2
  import os
3
  import torch
4
+ import re
5
  from transformers import pipeline
6
  from gtts import gTTS
 
7
  from pydub import AudioSegment
8
  from pydub.silence import detect_nonsilent
9
  from waitress import serve
10
+ import whisper_timestamped # Improved Whisper with timestamps
11
 
12
  app = Flask(__name__)
13
 
14
+ # Load Whisper Model for Highly Accurate Speech-to-Text
15
# Pick the compute device once at startup; whisper_timestamped accepts a torch device string.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Loaded once at module import — the "medium" checkpoint download can take a while on first run.
asr_model = whisper_timestamped.load_model("medium", device=device)
17
 
18
  # Function to generate audio prompts
19
def generate_audio_prompt(text, filename):
    """Synthesize *text* as English speech and save it under static/<filename>."""
    speech = gTTS(text=text, lang="en")
    destination = os.path.join("static", filename)
    speech.save(destination)
22
 
23
+ # Generate required voice prompts
24
  prompts = {
25
  "welcome": "Welcome to Biryani Hub.",
26
  "ask_name": "Tell me your name.",
 
31
  for key, text in prompts.items():
32
  generate_audio_prompt(text, f"{key}.mp3")
33
 
34
+ # Symbol mapping for better recognition
35
  SYMBOL_MAPPING = {
36
  "at the rate": "@",
37
  "at": "@",
 
44
  "space": " "
45
  }
46
 
47
+ # Function to extract meaningful words (Removes unnecessary phrases)
48
def extract_meaningful_text(text):
    """Strip filler lead-in phrases ("my name is", "this is", ...) from a transcription.

    Matches each phrase only as whole words (word-boundary regex) — plain
    ``str.replace`` would mangle words that merely contain a phrase, e.g.
    "surname" -> "sur ".  Runs of whitespace left behind by the removals are
    collapsed to single spaces.

    Returns the cleaned text with its first character capitalized.
    """
    text = text.lower().strip()
    # Longer phrases first so "my name is" is consumed before the bare "name".
    ignore_phrases = ["my name is", "this is", "i am", "it's", "name"]
    for phrase in ignore_phrases:
        text = re.sub(r"\b" + re.escape(phrase) + r"\b", " ", text)
    # Collapse doubled spaces introduced by the substitutions above.
    text = " ".join(text.split())
    return text.capitalize()
54
+
55
  # Function to clean and format transcribed text properly
56
def clean_transcription(text):
    """Normalize a raw transcription for form filling.

    Lower-cases and trims the text, maps spoken symbol names to their symbols
    via SYMBOL_MAPPING (e.g. "at the rate" -> "@"), then delegates to
    extract_meaningful_text() to drop filler phrases and capitalize.

    Whole-word regex matching is used so the "at" -> "@" rule does not rewrite
    the "at" embedded in words such as "what" or "chat"; dict insertion order
    guarantees the longer "at the rate" key is tried before the bare "at".
    """
    text = text.lower().strip()
    for word, symbol in SYMBOL_MAPPING.items():
        # \b anchors work for both single-word and multi-word spoken keys.
        text = re.sub(r"\b" + re.escape(word) + r"\b", symbol, text)
    return extract_meaningful_text(text)
61
 
62
  # Function to check if the audio contains actual speech
63
  def is_silent_audio(audio_path):
64
  audio = AudioSegment.from_wav(audio_path)
65
+ nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
66
  return len(nonsilent_parts) == 0 # Returns True if silence detected
67
 
68
  @app.route("/")
 
79
  audio_file.save(audio_path)
80
 
81
  try:
82
+ # Check if the audio contains valid speech
83
  if is_silent_audio(audio_path):
84
  return jsonify({"error": "No speech detected. Please try again."}), 400
85
 
86
+ # Transcribe using Whisper
87
+ result = asr_model.transcribe(audio_path, language="en")
88
  transcribed_text = clean_transcription(result["text"])
89
 
90
  return jsonify({"text": transcribed_text})
91
  except Exception as e:
92
  return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
93
 
94
+ # Use Waitress for Production Server
95
# Entry point: waitress serves the Flask app on all interfaces, port 7860
# (7860 is the conventional Hugging Face Spaces port).
if __name__ == "__main__":
    serve(app, host="0.0.0.0", port=7860)