from flask import Flask, render_template, request, jsonify
import os
import torch
from transformers import pipeline
from gtts import gTTS
import re
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from waitress import serve

app = Flask(__name__)

# Load Whisper Model for Accurate Speech-to-Text
device = "cuda" if torch.cuda.is_available() else "cpu"
asr_model = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if device == "cuda" else -1)

# Generate an audio prompt with gTTS and save it under static/
def generate_audio_prompt(text, filename):
    os.makedirs("static", exist_ok=True)  # Make sure the static folder exists before saving
    tts = gTTS(text=text, lang="en")
    tts.save(os.path.join("static", filename))

# Generate all required voice prompts
prompts = {
    "welcome": "Welcome to Biryani Hub.",
    "ask_name": "Tell me your name.",
    "ask_email": "Please provide your email address.",
    "thank_you": "Thank you for registration."
}

for key, text in prompts.items():
    generate_audio_prompt(text, f"{key}.mp3")

# Symbol mapping for better recognition
SYMBOL_MAPPING = {
    "at the rate": "@",
    "at": "@",
    "dot": ".",
    "underscore": "_",
    "hash": "#",
    "plus": "+",
    "dash": "-",
    "comma": ",",
    "space": " "
}

# Clean and format the transcribed text (map spoken symbol names to real characters)
def clean_transcription(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    # Replace whole words only, so "at" inside a word like "kate" is left untouched
    for word, symbol in SYMBOL_MAPPING.items():
        text = re.sub(rf"\b{re.escape(word)}\b", symbol, text)
    # Drop stray spaces around mapped symbols so emails come out as "john_doe@gmail.com"
    text = re.sub(r"\s*([@._#+,-])\s*", r"\1", text)
    return text.capitalize()
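
# Illustrative example (hypothetical input, assumes the word-boundary mapping above):
#   clean_transcription("John underscore Doe at the rate gmail dot com")
#   -> "John_doe@gmail.com"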

# Trim leading/trailing silence (700 ms threshold) so words at the edges aren't cut off
def trim_silence(audio_path):
    audio = AudioSegment.from_wav(audio_path)
    nonsilent_parts = detect_nonsilent(audio, min_silence_len=700, silence_thresh=audio.dBFS-16)
    
    if nonsilent_parts:
        start_trim = nonsilent_parts[0][0]
        end_trim = nonsilent_parts[-1][1]
        trimmed_audio = audio[start_trim:end_trim]
        trimmed_audio.export(audio_path, format="wav")  # Save trimmed audio

@app.route("/")
def index():
    return render_template("index.html")

@app.route("/transcribe", methods=["POST"])
def transcribe():
    if "audio" not in request.files:
        return jsonify({"error": "No audio file provided"}), 400

    audio_file = request.files["audio"]
    audio_path = os.path.join("static", "temp.wav")
    audio_file.save(audio_path)

    try:
        trim_silence(audio_path)  # Trim silence before processing
        
        # Force Whisper to transcribe only in English
        result = asr_model(audio_path, generate_kwargs={"language": "en"})
        transcribed_text = clean_transcription(result["text"])
        
        return jsonify({"text": transcribed_text})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Use Waitress for Production Server
if __name__ == "__main__":
    serve(app, host="0.0.0.0", port=7860)
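
# --- Usage sketch (illustrative only, not part of the app) ---
# Assuming the server is running locally on port 7860, the /transcribe endpoint can be
# exercised with the `requests` library; "sample.wav" is a hypothetical audio file:
#
#   import requests
#   with open("sample.wav", "rb") as f:
#       resp = requests.post("http://localhost:7860/transcribe", files={"audio": f})
#   print(resp.json())  # {"text": "..."} on success, {"error": "..."} on failure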