# bgdialogue / app.py
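"""Gradio demo: Bulgarian text-to-speech that reads up to 10 lines of input,
alternating a lower-pitched "male" voice with a higher-pitched "female" voice,
and returns the combined MP3 file."""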
import gradio as gr
from gtts import gTTS
from pydub import AudioSegment
from io import BytesIO
import numpy as np
import librosa

def change_pitch_librosa(audio_segment, semitones):
    sr = audio_segment.frame_rate
    # Convert the audio data to a float32 numpy array (assumes 16-bit samples,
    # which is what pydub produces when decoding the gTTS MP3 output)
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)
    samples /= 32768.0  # normalize the int16 range to [-1, 1]
    if audio_segment.channels > 1:
        # For multi-channel audio, reshape to (channels, n_samples) as librosa expects
        samples = samples.reshape((-1, audio_segment.channels)).T
    else:
        samples = samples.flatten()
    # Pass sr and n_steps as keyword arguments to librosa's pitch_shift
    shifted = librosa.effects.pitch_shift(samples, sr=sr, n_steps=semitones)
    if audio_segment.channels > 1:
        # Back to interleaved samples for pydub
        shifted = shifted.T.flatten()
    else:
        shifted = shifted.flatten()
    shifted = np.clip(shifted, -1.0, 1.0)
    shifted_int16 = np.int16(shifted * 32767)
    new_audio = AudioSegment(
        shifted_int16.tobytes(),
        frame_rate=sr,
        sample_width=audio_segment.sample_width,
        channels=audio_segment.channels
    )
    return new_audio
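
# A minimal sketch of using this helper on its own (the file name is a placeholder):
#   seg = AudioSegment.from_file("sample.mp3", format="mp3")
#   lower = change_pitch_librosa(seg, -5)   # roughly the same duration, pitch down a perfect fourth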

def change_pitch(audio_segment, semitones):
    # The male voice (-5 semitones) uses the librosa-based shift, which preserves duration.
    if semitones < 0:
        return change_pitch_librosa(audio_segment, semitones)
    else:
        # Positive shifts use pydub's resampling trick: play the same samples at a
        # higher frame rate, then resample back to the original rate.
        new_sample_rate = int(audio_segment.frame_rate * (2.0 ** (semitones / 12.0)))
        pitched_audio = audio_segment._spawn(audio_segment.raw_data, overrides={'frame_rate': new_sample_rate})
        return pitched_audio.set_frame_rate(audio_segment.frame_rate)
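
# Note on the positive-shift branch (a sketch with illustrative numbers, not executed here):
# gTTS MP3s typically decode at a 24000 Hz frame rate, so for +3 semitones the ratio is
# 2 ** (3 / 12) ≈ 1.189 and the samples are tagged as ~28540 Hz before being resampled
# back to 24000 Hz; this raises the pitch but also shortens the clip by the same ratio.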

def alternating_tts(multiline_text):
    sentences = multiline_text.strip().splitlines()
    if len(sentences) > 10:
        sentences = sentences[:10]
    combined_audio = AudioSegment.silent(duration=0)
    for i, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if not sentence:
            continue
        tts = gTTS(sentence, lang="bg")
        audio_file = BytesIO()
        tts.write_to_fp(audio_file)
        audio_file.seek(0)
        sentence_audio = AudioSegment.from_file(audio_file, format="mp3")
        # Even indices (the 1st, 3rd, ... lines) get the male voice (-5 semitones);
        # odd indices get the female voice (+3 semitones).
        if i % 2 == 0:
            sentence_audio = change_pitch(sentence_audio, -5)
        else:
            sentence_audio = change_pitch(sentence_audio, 3)
        combined_audio += sentence_audio + AudioSegment.silent(duration=500)
    output_file = "alternating_output.mp3"
    combined_audio.export(output_file, format="mp3")
    return output_file
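
# Hypothetical direct call, bypassing the Gradio UI:
#   path = alternating_tts("Здравей!\nКак си?\nДобре съм.")
#   # writes "alternating_output.mp3" to the working directory and returns its path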

with gr.Blocks() as demo:
    gr.Markdown(
        "## Bulgarian TTS with Alternating Male and Female Voices\n"
        "Enter up to 10 sentences, one per line. "
        "Odd-numbered lines (the 1st, 3rd, ...) are spoken in a male voice (pitch -5 semitones), "
        "and even-numbered lines in a female voice (pitch +3 semitones)."
    )
    input_text = gr.Textbox(
        label="Enter sentences (max 10, one per line):",
        lines=10,
        placeholder="Sentence 1\nSentence 2\nSentence 3\n..."
    )
    output_audio = gr.Audio(label="Generated Speech", type="filepath")
    generate_button = gr.Button("Generate Speech")
    generate_button.click(alternating_tts, inputs=input_text, outputs=output_audio)

if __name__ == "__main__":
    demo.launch()