File size: 2,710 Bytes
69067ae
7467739
9061ed1
 
a3e60d6
9061ed1
 
 
 
0197ed3
69067ae
 
9061ed1
7494646
9061ed1
7494646
7467739
 
69067ae
7467739
 
9061ed1
7467739
 
 
 
 
 
adb5e2a
7467739
 
685e8d2
9061ed1
 
 
 
 
 
 
 
 
 
 
 
 
 
7467739
9061ed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3e60d6
69067ae
7467739
69067ae
 
7467739
 
69067ae
7467739
8ab530a
69067ae
7467739
69067ae
8ab530a
7494646
9061ed1
 
 
 
7467739
9061ed1
7467739
7494646
9061ed1
0197ed3
9061ed1
69067ae
9061ed1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import re
import tempfile

import torch
import whisper
from flask import Flask, render_template, request, jsonify
from gtts import gTTS
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from waitress import serve

app = Flask(__name__)

# Prefer GPU when available; Whisper inference is far faster on CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Loaded once at import time, so startup blocks until the "medium" weights
# are in memory. Previously `device` was computed but never used — pass it
# explicitly so placement is deliberate rather than load_model's default.
whisper_model = whisper.load_model("medium", device=device)  # Change to "large" for even better accuracy

# Function to generate audio prompts
def generate_audio_prompt(text, filename):
    """Synthesize *text* to speech and save it under static/<filename>.

    Uses Google Text-to-Speech, which performs a network request — this
    fails without internet access. The target directory is created if
    missing so a fresh checkout does not crash on first run.
    """
    os.makedirs("static", exist_ok=True)  # fix: previously crashed if static/ was absent
    tts = gTTS(text=text, lang="en")
    tts.save(os.path.join("static", filename))

# Generate voice prompts
# Generate voice prompts
# Text of each spoken UI prompt; one static/<key>.mp3 is produced below.
prompts = {
    "welcome": "Welcome to Biryani Hub.",
    "ask_name": "Tell me your name.",
    "ask_email": "Please provide your email address.",
    "thank_you": "Thank you for registration."
}

# NOTE(review): runs at import time and hits the gTTS web service once per
# prompt — startup fails without network access; consider caching/skipping
# when the MP3s already exist.
for key, text in prompts.items():
    generate_audio_prompt(text, f"{key}.mp3")

# Symbol mapping for proper recognition
# Symbol mapping for proper recognition
# Spoken word/phrase -> literal symbol, used by clean_transcription().
# NOTE(review): order matters for plain str.replace consumers — "at the rate"
# must be tried before "at". Short keys like "at" are also substrings of
# ordinary words ("date", "that"), so consumers should match whole words.
SYMBOL_MAPPING = {
    "at the rate": "@",
    "at": "@",
    "dot": ".",
    "underscore": "_",
    "hash": "#",
    "plus": "+",
    "dash": "-",
    "comma": ",",
    "space": " "
}

# Function to clean and format transcribed text
def clean_transcription(text, mapping=None):
    """Normalize a transcript: lowercase, trim, and turn spoken symbol
    names ("at", "dot", "underscore", ...) into the symbols themselves.

    Parameters
    ----------
    text : str
        Raw transcription text.
    mapping : dict[str, str] | None
        Spoken-phrase -> symbol table; defaults to the module-level
        SYMBOL_MAPPING.

    Returns
    -------
    str
        Cleaned text with its first character upper-cased.
    """
    if mapping is None:
        mapping = SYMBOL_MAPPING
    text = text.lower().strip()
    if not mapping:
        return text.capitalize()
    # Fix: the old per-key str.replace corrupted ordinary words ("date" ->
    # "d@e", "that" -> "th@"). Match whole words/phrases only, longest key
    # first so "at the rate" is not eaten by "at".
    pattern = re.compile(
        r"\b(?:%s)\b" % "|".join(
            re.escape(key) for key in sorted(mapping, key=len, reverse=True)
        )
    )
    text = pattern.sub(lambda m: mapping[m.group(0)], text)
    return text.capitalize()

# Function to detect speech duration (trim silence)
def trim_silence(audio_path):
    """Cut leading and trailing silence from the WAV file at *audio_path*.

    The file is rewritten in place. If no non-silent span is detected the
    file is left untouched (best-effort, deliberate no-op).
    """
    clip = AudioSegment.from_wav(audio_path)
    # Anything at least 500 ms long and quieter than 16 dB below the clip's
    # average loudness counts as silence.
    voiced = detect_nonsilent(clip, min_silence_len=500, silence_thresh=clip.dBFS - 16)
    if not voiced:
        return
    first_ms = voiced[0][0]
    last_ms = voiced[-1][1]
    clip[first_ms:last_ms].export(audio_path, format="wav")  # overwrite with trimmed audio

@app.route("/")
def index():
    # Serve the single-page UI from templates/index.html.
    return render_template("index.html")

@app.route("/transcribe", methods=["POST"])
def transcribe():
    """Accept an uploaded clip, trim silence, and return Whisper's text.

    Expects a multipart form with an "audio" file part (assumed WAV —
    trim_silence reads it via AudioSegment.from_wav; TODO confirm the
    client actually uploads WAV). Returns {"text": ...} on success,
    {"error": ...} with HTTP 400/500 otherwise.
    """
    if "audio" not in request.files:
        return jsonify({"error": "No audio file provided"}), 400

    audio_file = request.files["audio"]
    # Fix: a fixed static/temp.wav was clobbered by concurrent requests and
    # never deleted. Use a unique per-request temp file and always clean up.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # Werkzeug's .save() reopens by path; the raw fd is not needed

    try:
        audio_file.save(audio_path)
        trim_silence(audio_path)  # Remove silence before processing

        # Transcribe using Whisper
        result = whisper_model.transcribe(audio_path, language="english")
        transcribed_text = clean_transcription(result["text"])

        return jsonify({"text": transcribed_text})
    except Exception as e:
        return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500
    finally:
        # Best-effort cleanup; the response is already built at this point.
        try:
            os.remove(audio_path)
        except OSError:
            pass

# Run Waitress Production Server
# Run Waitress Production Server
if __name__ == "__main__":
    # Waitress is a production WSGI server; bind all interfaces on port 7860.
    serve(app, host="0.0.0.0", port=7860)