Spaces:
Sleeping
Sleeping
File size: 2,710 Bytes
69067ae 7467739 9061ed1 a3e60d6 9061ed1 0197ed3 69067ae 9061ed1 7494646 9061ed1 7494646 7467739 69067ae 7467739 9061ed1 7467739 adb5e2a 7467739 685e8d2 9061ed1 7467739 9061ed1 a3e60d6 69067ae 7467739 69067ae 7467739 69067ae 7467739 8ab530a 69067ae 7467739 69067ae 8ab530a 7494646 9061ed1 7467739 9061ed1 7467739 7494646 9061ed1 0197ed3 9061ed1 69067ae 9061ed1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# Standard library
import os
import re
import tempfile

# Third-party
from flask import Flask, render_template, request, jsonify
from gtts import gTTS
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from waitress import serve
import torch
import whisper
app = Flask(__name__)
# Load Whisper Model (Higher Accuracy)
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model = whisper.load_model("medium") # Change to "large" for even better accuracy
# Function to generate audio prompts
def generate_audio_prompt(text, filename):
tts = gTTS(text=text, lang="en")
tts.save(os.path.join("static", filename))
# Generate voice prompts
prompts = {
"welcome": "Welcome to Biryani Hub.",
"ask_name": "Tell me your name.",
"ask_email": "Please provide your email address.",
"thank_you": "Thank you for registration."
}
for key, text in prompts.items():
generate_audio_prompt(text, f"{key}.mp3")
# Symbol mapping for proper recognition
SYMBOL_MAPPING = {
"at the rate": "@",
"at": "@",
"dot": ".",
"underscore": "_",
"hash": "#",
"plus": "+",
"dash": "-",
"comma": ",",
"space": " "
}
# Function to clean and format transcribed text
def clean_transcription(text):
text = text.lower().strip()
for word, symbol in SYMBOL_MAPPING.items():
text = text.replace(word, symbol)
return text.capitalize()
# Function to detect speech duration (trim silence)
def trim_silence(audio_path):
audio = AudioSegment.from_wav(audio_path)
nonsilent_parts = detect_nonsilent(audio, min_silence_len=500, silence_thresh=audio.dBFS-16)
if nonsilent_parts:
start_trim = nonsilent_parts[0][0]
end_trim = nonsilent_parts[-1][1]
trimmed_audio = audio[start_trim:end_trim]
trimmed_audio.export(audio_path, format="wav") # Save trimmed audio
@app.route("/")
def index():
return render_template("index.html")
@app.route("/transcribe", methods=["POST"])
def transcribe():
if "audio" not in request.files:
return jsonify({"error": "No audio file provided"}), 400
audio_file = request.files["audio"]
audio_path = os.path.join("static", "temp.wav")
audio_file.save(audio_path)
try:
trim_silence(audio_path) # Remove silence before processing
# Transcribe using Whisper
result = whisper_model.transcribe(audio_path, language="english")
transcribed_text = clean_transcription(result["text"])
return jsonify({"text": transcribed_text})
except Exception as e:
return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
# Run Waitress Production Server
if __name__ == "__main__":
serve(app, host="0.0.0.0", port=7860)
|