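"""Voice-driven registration demo for "Biryani Hub".

A Flask app that serves a single page, plays gTTS-generated voice prompts,
and transcribes uploaded audio with OpenAI's Whisper (via transformers).
"""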
import os
import re
import uuid

import torch
from flask import Flask, render_template, request, jsonify
from gtts import gTTS
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
from transformers import pipeline
from waitress import serve

app = Flask(__name__)

# Make sure static/ exists before prompt MP3s and temp audio files land there.
os.makedirs("static", exist_ok=True)

# Run Whisper on the GPU when available; transformers' pipeline takes a device
# index (0 = first CUDA device, -1 = CPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
asr_model = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=0 if device == "cuda" else -1,
)

def generate_audio_prompt(text, filename):
    """Synthesize a spoken prompt with gTTS and save it under static/."""
    tts = gTTS(text=text, lang="en")
    tts.save(os.path.join("static", filename))

# Voice prompts played by the front end. gTTS calls Google's TTS endpoint,
# so generating these requires network access at startup.
prompts = {
    "welcome": "Welcome to Biryani Hub.",
    "ask_name": "Tell me your name.",
    "ask_email": "Please provide your email address.",
    "thank_you": "Thank you for registering."
}

for key, text in prompts.items():
    generate_audio_prompt(text, f"{key}.mp3")

# Spoken-symbol vocabulary for dictated input such as email addresses
# ("john dot doe at gmail dot com"). Order matters: "at the rate" must be
# tried before the bare "at".
SYMBOL_MAPPING = {
    "at the rate": "@",
    "at": "@",
    "dot": ".",
    "underscore": "_",
    "hash": "#",
    "plus": "+",
    "dash": "-",
    "comma": ",",
    "space": ""
}
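
# SYMBOL_MAPPING is never applied anywhere in the original handler, so the
# transcription would return words like "at" and "dot" verbatim. The helper
# below is a minimal, assumed post-processing step (not part of the original
# code): it folds spoken symbols into characters using whole-word matches, so
# words such as "what" or "data" are left untouched.
def clean_transcription(text):
    for spoken, symbol in SYMBOL_MAPPING.items():  # dict order: "at the rate" before "at"
        text = re.sub(rf"\s*\b{re.escape(spoken)}\b\s*", symbol, text, flags=re.IGNORECASE)
    return text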

def convert_to_wav(input_path, output_path):
    """Re-encode the upload as 16 kHz mono WAV, the input format Whisper expects."""
    try:
        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(output_path, format="wav")
    except Exception as e:
        raise RuntimeError(f"Audio conversion failed: {e}") from e

def is_silent_audio(audio_path):
    """Return True when pydub finds no non-silent span; "silent" here means
    more than 16 dB below the clip's own average loudness (a relative threshold)."""
    audio = AudioSegment.from_wav(audio_path)
    nonsilent_parts = detect_nonsilent(
        audio, min_silence_len=500, silence_thresh=audio.dBFS - 16
    )
    return len(nonsilent_parts) == 0

@app.route("/")
def index():
    return render_template("index.html")

@app.route("/transcribe", methods=["POST"])
def transcribe():
    if "audio" not in request.files:
        return jsonify({"error": "No audio file provided"}), 400

    audio_file = request.files["audio"]
    input_audio_path = os.path.join("static", "temp_input.wav")
    output_audio_path = os.path.join("static", "temp.wav")
    audio_file.save(input_audio_path)

    try:
        convert_to_wav(input_audio_path, output_audio_path)

        if is_silent_audio(output_audio_path):
            return jsonify({"error": "No speech detected. Please try again."}), 400

        result = asr_model(output_audio_path, generate_kwargs={"language": "en"})
        transcribed_text = result["text"].strip()

        return jsonify({"text": transcribed_text})
    except Exception as e:
        return jsonify({"error": f"Speech recognition error: {str(e)}"}), 500

if __name__ == "__main__":
    # waitress is a production WSGI server; 7860 is the port Hugging Face
    # Spaces expects an app to listen on.
    serve(app, host="0.0.0.0", port=7860)