lokesh341 committed
Commit b5fadc4 · verified · 1 Parent(s): 1e95876

Update app.py

Files changed (1)
  1. app.py +32 -42
app.py CHANGED
@@ -1,25 +1,25 @@
+import torch
 from flask import Flask, render_template, request, jsonify
 import os
-import torch
-import whisper
 import re
+from transformers import pipeline
+from gtts import gTTS
 from pydub import AudioSegment
 from pydub.silence import detect_nonsilent
 from waitress import serve
-from gtts import gTTS

 app = Flask(__name__)

-# Load Whisper Model (Higher Accuracy)
+# Load Whisper ASR Model Correctly
 device = "cuda" if torch.cuda.is_available() else "cpu"
-whisper_model = whisper.load_model("medium")  # Change to "large" for even better accuracy
+asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=0 if device == "cuda" else -1)

-# Function to generate audio prompts
+# Function to generate voice prompts
 def generate_audio_prompt(text, filename):
     tts = gTTS(text=text, lang="en")
     tts.save(os.path.join("static", filename))

-# Generate voice prompts
+# Generate required voice prompts
 prompts = {
     "welcome": "Welcome to Biryani Hub.",
     "ask_name": "Tell me your name.",
@@ -30,36 +30,20 @@ prompts = {
 for key, text in prompts.items():
     generate_audio_prompt(text, f"{key}.mp3")

-# Symbol mapping for proper recognition
-SYMBOL_MAPPING = {
-    "at the rate": "@",
-    "at": "@",
-    "dot": ".",
-    "underscore": "_",
-    "hash": "#",
-    "plus": "+",
-    "dash": "-",
-    "comma": ",",
-    "space": " "
-}
-
-# Function to clean and format transcribed text
-def clean_transcription(text):
-    text = text.lower().strip()
-    for word, symbol in SYMBOL_MAPPING.items():
-        text = text.replace(word, symbol)
-    return text.capitalize()
-
-# Function to detect speech duration (trim silence)
-def trim_silence(audio_path):
+# Ensure Proper Audio Format
+def convert_to_wav(input_path, output_path):
+    try:
+        audio = AudioSegment.from_file(input_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)  # ✅ Convert to 16kHz, mono
+        audio.export(output_path, format="wav")
+    except Exception as e:
+        raise Exception(f"Audio conversion failed: {str(e)}")
+
+# Check for Silence
+def is_silent_audio(audio_path):
     audio = AudioSegment.from_wav(audio_path)
     nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
-
-    if nonsilent_parts:
-        start_trim = nonsilent_parts[0][0]
-        end_trim = nonsilent_parts[-1][1]
-        trimmed_audio = audio[start_trim:end_trim]
-        trimmed_audio.export(audio_path, format="wav")  # Save trimmed audio
+    return len(nonsilent_parts) == 0

 @app.route("/")
 def index():
@@ -71,20 +55,26 @@ def transcribe():
         return jsonify({"error": "No audio file provided"}), 400

     audio_file = request.files["audio"]
-    audio_path = os.path.join("static", "temp.wav")
-    audio_file.save(audio_path)
+    input_audio_path = os.path.join("static", "temp_input.wav")
+    output_audio_path = os.path.join("static", "temp.wav")
+    audio_file.save(input_audio_path)

     try:
-        trim_silence(audio_path)  # Remove silence before processing
-
-        # Transcribe using Whisper
-        result = whisper_model.transcribe(audio_path, language="english")
-        transcribed_text = clean_transcription(result["text"])
+        # Convert audio to proper format
+        convert_to_wav(input_audio_path, output_audio_path)
+
+        # Check for silent audio
+        if is_silent_audio(output_audio_path):
+            return jsonify({"error": "No speech detected. Please try again."}), 400
+
+        # ✅ Transcribe Using Whisper ASR
+        result = asr_model(output_audio_path, generate_kwargs={"language": "en"})
+        transcribed_text = result["text"].strip().capitalize()

         return jsonify({"text": transcribed_text})
     except Exception as e:
         return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500

-# Run Waitress Production Server
+# Start Production Server
 if __name__ == "__main__":
     serve(app, host="0.0.0.0", port=7860)
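
Note: this commit swaps the openai-whisper load_model/transcribe path for a transformers automatic-speech-recognition pipeline. A minimal sketch for smoke-testing the new pipeline outside Flask, assuming ffmpeg is available for decoding and using a hypothetical sample.wav (the pipeline call mirrors the committed code; it is not part of the commit itself):

import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
asr_model = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=0 if device == "cuda" else -1,  # GPU index 0, or CPU
)

# Same call shape as the committed /transcribe handler.
result = asr_model("sample.wav", generate_kwargs={"language": "en"})
print(result["text"].strip().capitalize())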
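
To exercise the updated handler end to end, a hedged client sketch using requests: the /transcribe path is assumed from the hunk header (the route decorator is outside this diff), the multipart field name "audio" matches request.files["audio"], and the host/port match the waitress serve() call:

import requests

# "sample.wav" is a hypothetical test recording.
with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/transcribe",
        files={"audio": ("sample.wav", f, "audio/wav")},
    )
print(resp.status_code, resp.json())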