geethareddy committed on
Commit
2c6d73a
·
verified ·
1 Parent(s): b048f8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py CHANGED
@@ -1,6 +1,112 @@
1
  from flask import Flask, render_template, request, jsonify
2
  import os
 
 
3
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import re
5
  import ffmpeg # Ensures FFmpeg is installed
6
  from transformers import pipeline
 
1
  from flask import Flask, render_template, request, jsonify
2
  import os
3
+ from flask import Flask, render_template, request, jsonify
4
+ import os
5
  import torch
6
+ import re
7
+ import ffmpeg # Ensure FFmpeg is installed
8
+ from transformers import pipeline
9
+ from gtts import gTTS
10
+ from pydub import AudioSegment
11
+ from pydub.silence import detect_nonsilent
12
+ from waitress import serve
13
+ import whisper # Improved Whisper ASR Model
14
+
15
# Flask application object; the route decorators further down attach to it.
app = Flask(__name__)

# Load the Whisper speech-to-text model once at import time, on the GPU when
# torch reports one and on the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
asr_model = whisper.load_model("large-v3", device=device)
20
+
21
+ # Function to generate audio prompts
22
def generate_audio_prompt(text, filename):
    """Synthesize *text* as English speech and save it under ``static/``.

    Args:
        text: Sentence to be spoken.
        filename: Name of the output file inside the ``static`` directory.
    """
    # Saving fails if the target directory is missing, so create it first.
    os.makedirs("static", exist_ok=True)
    tts = gTTS(text=text, lang="en")
    tts.save(os.path.join("static", filename))
25
+
26
+ # Generate required voice prompts
27
# Voice prompts keyed by the base name of the MP3 file generated for each.
prompts = {
    "welcome": "Welcome to Biryani Hub.",
    "ask_name": "Tell me your name.",
    "ask_email": "Please provide your email address.",
    "thank_you": "Thank you for registration.",
}

# Runs at import time: every prompt is rendered to static/<key>.mp3.
for key, text in prompts.items():
    generate_audio_prompt(text, f"{key}.mp3")
36
+
37
+ # Symbol mapping for better recognition
38
# Spoken phrase -> symbol it stands for in a transcription.
# "at the rate" is listed before "at" so that the longer phrase can be
# substituted first by code that walks the mapping in insertion order.
SYMBOL_MAPPING = {
    "at the rate": "@",
    "at": "@",
    "dot": ".",
    "underscore": "_",
    "hash": "#",
    "plus": "+",
    "dash": "-",
    "comma": ",",
    "space": " ",
}
49
+
50
+ # Function to convert audio to WAV format (Fixes FFmpeg issues)
51
def convert_to_wav(input_path, output_path):
    """Decode the audio file at *input_path* and re-export it as WAV.

    Args:
        input_path: Path of the source audio file.
        output_path: Path the WAV file is written to.

    Raises:
        RuntimeError: If loading or exporting fails. The original exception
            is attached as ``__cause__`` so the real failure stays visible.
    """
    try:
        audio = AudioSegment.from_file(input_path)
        audio.export(output_path, format="wav")
    except Exception as e:
        # RuntimeError is still caught by callers using ``except Exception``.
        raise RuntimeError(f"Audio conversion failed: {str(e)}") from e
57
+
58
+ # Function to clean transcribed text (Removes unnecessary words)
59
def clean_transcription(text):
    """Normalize a raw transcription into the value the speaker meant.

    The text is lower-cased, filler phrases such as "my name is" are removed,
    and spoken symbol names from ``SYMBOL_MAPPING`` ("at", "dot", ...) are
    replaced by the symbols themselves. Matching is done on whole words only,
    so "kate" is not turned into "k@e" and "username" keeps its "name".

    Args:
        text: Raw transcription text.

    Returns:
        The cleaned text with its first character upper-cased.
    """
    text = text.lower().strip()

    ignore_phrases = ["my name is", "this is", "i am", "it's", "name"]
    for phrase in ignore_phrases:
        text = re.sub(rf"\b{re.escape(phrase)}\b", "", text).strip()

    # Longest spoken form first, so "at the rate" is consumed as one unit
    # before the shorter "at" gets a chance to match inside it.
    for word in sorted(SYMBOL_MAPPING, key=len, reverse=True):
        symbol = SYMBOL_MAPPING[word]
        # A callable replacement keeps the symbol literal (no escape handling).
        text = re.sub(rf"\b{re.escape(word)}\b", lambda _match: symbol, text)

    return text.capitalize()
69
+
70
+ # Function to check if the audio contains actual speech
71
def is_silent_audio(audio_path):
    """Return True when the WAV file at *audio_path* contains no speech.

    A stretch counts as silence when it is at least 500 ms long and at least
    16 dB quieter than the clip's own average loudness.

    Args:
        audio_path: Path of a WAV file.

    Returns:
        True if no non-silent section was found (or the clip is empty),
        False otherwise.
    """
    audio = AudioSegment.from_wav(audio_path)

    # pydub reports a zero-length clip as one "non-silent" range [0, 0];
    # treat an empty recording as silence instead.
    if len(audio) == 0:
        return True

    nonsilent_parts = detect_nonsilent(
        audio, min_silence_len=500, silence_thresh=audio.dBFS - 16
    )
    return not nonsilent_parts
75
+
76
@app.route("/")
def index():
    """Serve the landing page rendered from ``index.html``."""
    return render_template("index.html")
79
+
80
@app.route("/transcribe", methods=["POST"])
def transcribe():
    """Transcribe the uploaded ``audio`` file and return the cleaned text.

    Returns:
        ``{"text": ...}`` on success; ``{"error": ...}`` with status 400 when
        no file was sent or no speech was detected, and with status 500 when
        conversion or recognition fails.
    """
    import tempfile  # local import: only this handler needs it

    if "audio" not in request.files:
        return jsonify({"error": "No audio file provided"}), 400

    audio_file = request.files["audio"]

    try:
        # A per-request scratch directory keeps concurrent uploads from
        # overwriting each other, keeps the recordings out of the publicly
        # served static folder, and is removed when the block exits.
        with tempfile.TemporaryDirectory() as workdir:
            input_audio_path = os.path.join(workdir, "temp_input")
            output_audio_path = os.path.join(workdir, "temp.wav")
            audio_file.save(input_audio_path)

            # Convert audio to WAV format
            convert_to_wav(input_audio_path, output_audio_path)

            # Check if the audio contains valid speech
            if is_silent_audio(output_audio_path):
                return jsonify({"error": "No speech detected. Please try again."}), 400

            # Transcribe using Whisper
            result = asr_model.transcribe(output_audio_path, language="en")
            transcribed_text = clean_transcription(result["text"])

        return jsonify({"text": transcribed_text})
    except Exception as e:
        # Request boundary: report any failure as JSON rather than crashing.
        return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
105
+
106
+ # Use Waitress for Production Server
107
# Serve with Waitress on all interfaces, port 7860, when run as a script.
if __name__ == "__main__":
    serve(app, host="0.0.0.0", port=7860)
109
+
110
  import re
111
  import ffmpeg # Ensures FFmpeg is installed
112
  from transformers import pipeline