# piper / app.py
# aigmixer's picture
# back to file based audio output
# f6a94c1
# raw
# history blame contribute delete
# 1.87 kB
import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline
# Text-classification pipeline used to screen input text before synthesis.
# It is created at module level, so one instance is shared by every call.
NSFW_MODEL_ID = "michellejieli/NSFW_text_classifier"
nsfw_detector = pipeline(
    task="text-classification",
    model=NSFW_MODEL_ID,
)
# Lazily-created PiperVoice instance reused across calls (see _get_voice).
_voice = None


def _get_voice():
    """Return the Piper voice, resolving and loading the model on first use only."""
    global _voice
    if _voice is None:
        model_path = hf_hub_download(repo_id="aigmixer/speaker_00", filename="speaker_00_model.onnx")
        config_path = hf_hub_download(repo_id="aigmixer/speaker_00", filename="speaker_00_model.onnx.json")
        _voice = PiperVoice.load(model_path, config_path)
    return _voice


def _render_samples(voice, text):
    """Synthesize *text* into an in-memory WAV and return (sample_rate, int16 samples)."""
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono
        voice.synthesize(text, wav_file)

    # Read the audio back through the wave module, after the writer has
    # closed, so that only PCM frames are decoded. Interpreting the raw buffer
    # directly would turn the RIFF/WAV header bytes into bogus samples.
    buffer.seek(0)
    with wave.open(buffer, 'rb') as wav_reader:
        sample_rate = wav_reader.getframerate()
        frames = wav_reader.readframes(wav_reader.getnframes())
    return sample_rate, np.frombuffer(frames, dtype=np.int16)


def synthesize_speech(text: str):
    """Turn *text* into speech for the Gradio click handler.

    The return value is a 2-tuple ordered to match the handler's outputs,
    ``[output_audio, output_text]``:

    * on success: ``((sample_rate, samples), None)`` where ``samples`` is a
      1-D ``int16`` NumPy array (mono), the form an Audio component declared
      with ``type="numpy"`` consumes;
    * on rejection or when there is nothing to play: ``(None, message)``.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input is
            rejected without running the classifier or the synthesizer.

    Returns:
        ``(audio, message)`` as described above.
    """
    # Nothing to classify or synthesize.
    if text is None or not text.strip():
        return None, "No text provided."

    # Check for NSFW content
    nsfw_result = nsfw_detector(text)
    if nsfw_result[0]['label'] == 'NSFW':
        # Audio slot first, message second, matching the outputs order.
        return None, "NSFW content detected. Cannot process."

    sample_rate, samples = _render_samples(_get_voice(), text)
    if samples.size == 0:
        return None, "No audio was generated."
    return (sample_rate, samples), None
# Build the UI with Gradio Blocks: a text input, an audio player for the
# result, a text box for messages, and a button that triggers synthesis.
base_theme = gr.themes.Base()
with gr.Blocks(theme=base_theme) as blocks:
    # Page heading and short usage hint.
    gr.Markdown(value="# Text to Speech Synthesizer")
    gr.Markdown(value="Enter text to synthesize it into speech using PiperVoice.")

    input_text = gr.Textbox(label="Input Text")

    output_audio = gr.Audio(
        label="Synthesized Speech",
        type="numpy",
    )
    # Text output component; created hidden (visible=False), so whatever the
    # handler sends here is not displayed on the page.
    output_text = gr.Textbox(
        label="Output Text",
        visible=False,
    )

    submit_button = gr.Button(value="Synthesize")
    # The handler's two return values map, in order, onto these two outputs.
    submit_button.click(
        fn=synthesize_speech,
        inputs=input_text,
        outputs=[output_audio, output_text],
    )

# Start the Gradio server.
blocks.launch()