lokesh341 committed
Commit b5fadc4 · verified · 1 Parent(s): 1e95876

Update app.py

Files changed (1)
  1. app.py +32 -42
app.py CHANGED
@@ -1,25 +1,25 @@
+import torch
 from flask import Flask, render_template, request, jsonify
 import os
-import torch
-import whisper
 import re
+from transformers import pipeline
+from gtts import gTTS
 from pydub import AudioSegment
 from pydub.silence import detect_nonsilent
 from waitress import serve
-from gtts import gTTS

 app = Flask(__name__)

-# Load Whisper Model (Higher Accuracy)
+# Load Whisper ASR Model Correctly
 device = "cuda" if torch.cuda.is_available() else "cpu"
-whisper_model = whisper.load_model("medium")  # Change to "large" for even better accuracy
+asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=0 if device == "cuda" else -1)

-# Function to generate audio prompts
+# Function to generate voice prompts
 def generate_audio_prompt(text, filename):
     tts = gTTS(text=text, lang="en")
     tts.save(os.path.join("static", filename))

-# Generate voice prompts
+# Generate required voice prompts
 prompts = {
     "welcome": "Welcome to Biryani Hub.",
     "ask_name": "Tell me your name.",
@@ -30,36 +30,20 @@ prompts = {
 for key, text in prompts.items():
     generate_audio_prompt(text, f"{key}.mp3")

-# Symbol mapping for proper recognition
-SYMBOL_MAPPING = {
-    "at the rate": "@",
-    "at": "@",
-    "dot": ".",
-    "underscore": "_",
-    "hash": "#",
-    "plus": "+",
-    "dash": "-",
-    "comma": ",",
-    "space": " "
-}
-
-# Function to clean and format transcribed text
-def clean_transcription(text):
-    text = text.lower().strip()
-    for word, symbol in SYMBOL_MAPPING.items():
-        text = text.replace(word, symbol)
-    return text.capitalize()
-
-# Function to detect speech duration (trim silence)
-def trim_silence(audio_path):
+# Ensure Proper Audio Format
+def convert_to_wav(input_path, output_path):
+    try:
+        audio = AudioSegment.from_file(input_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)  # ✅ Convert to 16kHz, mono
+        audio.export(output_path, format="wav")
+    except Exception as e:
+        raise Exception(f"Audio conversion failed: {str(e)}")
+
+# Check for Silence
+def is_silent_audio(audio_path):
     audio = AudioSegment.from_wav(audio_path)
     nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
-
-    if nonsilent_parts:
-        start_trim = nonsilent_parts[0][0]
-        end_trim = nonsilent_parts[-1][1]
-        trimmed_audio = audio[start_trim:end_trim]
-        trimmed_audio.export(audio_path, format="wav")  # Save trimmed audio
+    return len(nonsilent_parts) == 0

 @app.route("/")
 def index():
@@ -71,20 +55,26 @@ def transcribe():
         return jsonify({"error": "No audio file provided"}), 400

     audio_file = request.files["audio"]
-    audio_path = os.path.join("static", "temp.wav")
-    audio_file.save(audio_path)
+    input_audio_path = os.path.join("static", "temp_input.wav")
+    output_audio_path = os.path.join("static", "temp.wav")
+    audio_file.save(input_audio_path)

     try:
-        trim_silence(audio_path)  # Remove silence before processing
-
-        # Transcribe using Whisper
-        result = whisper_model.transcribe(audio_path, language="english")
-        transcribed_text = clean_transcription(result["text"])
+        # Convert audio to proper format
+        convert_to_wav(input_audio_path, output_audio_path)
+
+        # Check for silent audio
+        if is_silent_audio(output_audio_path):
+            return jsonify({"error": "No speech detected. Please try again."}), 400
+
+        # ✅ Transcribe Using Whisper ASR
+        result = asr_model(output_audio_path, generate_kwargs={"language": "en"})
+        transcribed_text = result["text"].strip().capitalize()

         return jsonify({"text": transcribed_text})
     except Exception as e:
         return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500

-# Run Waitress Production Server
+# Start Production Server
 if __name__ == "__main__":
     serve(app, host="0.0.0.0", port=7860)
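
Note: this commit swaps the openai-whisper load_model/transcribe path for a transformers automatic-speech-recognition pipeline. A minimal sketch for smoke-testing the new pipeline outside Flask, assuming ffmpeg is available for decoding and using a hypothetical sample.wav (the pipeline call mirrors the committed code; it is not part of the commit itself):

import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
asr_model = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=0 if device == "cuda" else -1,  # GPU index 0, or CPU
)

# Same call shape as the committed /transcribe handler.
result = asr_model("sample.wav", generate_kwargs={"language": "en"})
print(result["text"].strip().capitalize())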
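
To exercise the updated handler end to end, a hedged client sketch using requests: the /transcribe path is assumed from the hunk header (the route decorator is outside this diff), the multipart field name "audio" matches request.files["audio"], and the host/port match the waitress serve() call:

import requests

# "sample.wav" is a hypothetical test recording.
with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/transcribe",
        files={"audio": ("sample.wav", f, "audio/wav")},
    )
print(resp.status_code, resp.json())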