# Spaces: Sleeping (page-status text captured together with the file; not part of the program)
from io import BytesIO

import gradio as gr
import librosa
import numpy as np
from gtts import gTTS
from pydub import AudioSegment
def change_pitch_librosa(audio_segment, semitones):
    """Shift the pitch of a pydub segment with librosa's pitch shifter.

    The samples are converted to float, passed through
    ``librosa.effects.pitch_shift`` and packed back into 16-bit PCM.

    Args:
        audio_segment: pydub ``AudioSegment`` to process (mono or multi-channel).
        semitones: Number of semitones to shift by (negative lowers the pitch).

    Returns:
        A new 16-bit ``AudioSegment`` with the same frame rate and channel
        count, or the (16-bit) input segment itself when it holds no samples.
    """
    # The scaling below (32768 / 32767) is only valid for 16-bit samples, so
    # normalise the input width first instead of trusting the caller.
    if audio_segment.sample_width != 2:
        audio_segment = audio_segment.set_sample_width(2)

    sr = audio_segment.frame_rate
    channels = audio_segment.channels

    # Convert the raw samples to a float32 numpy array.
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)
    if samples.size == 0:
        # Nothing to shift; skip the librosa call entirely.
        return audio_segment
    samples /= 32768.0  # map the int16 range onto [-1, 1)

    if channels > 1:
        # pydub interleaves channels; reshape to (channels, n_samples).
        samples = samples.reshape((-1, channels)).T

    # sr and n_steps are passed by keyword.
    shifted = librosa.effects.pitch_shift(samples, sr=sr, n_steps=semitones)

    if channels > 1:
        # Back to interleaved sample order.
        shifted = shifted.T
    shifted = np.clip(shifted.reshape(-1), -1.0, 1.0)
    shifted_int16 = (shifted * 32767).astype(np.int16)

    return AudioSegment(
        shifted_int16.tobytes(),
        frame_rate=sr,
        sample_width=2,
        channels=channels,
    )
def change_pitch(audio_segment, semitones, preserve_speed=None):
    """Shift the pitch of ``audio_segment`` by ``semitones``.

    Two strategies are available:

    * speed-preserving: delegates to ``change_pitch_librosa``;
    * resampling: reinterprets the raw data at a scaled frame rate and then
      converts back to the original frame rate, which changes the playback
      speed together with the pitch.

    Args:
        audio_segment: pydub ``AudioSegment`` to process.
        semitones: Number of semitones to shift by.
        preserve_speed: ``True`` forces the librosa path, ``False`` forces the
            resampling path. ``None`` (default) keeps the historical rule:
            only the male-voice shift of -10 semitones preserves speed.

    Returns:
        The pitch-shifted ``AudioSegment``.
    """
    if preserve_speed is None:
        # Historical behaviour: the male voice (-10 semitones) uses librosa
        # so that its playback speed stays unchanged.
        preserve_speed = semitones == -10

    if preserve_speed:
        return change_pitch_librosa(audio_segment, semitones)

    original_rate = audio_segment.frame_rate
    scaled_rate = int(original_rate * (2.0 ** (semitones / 12.0)))
    respeeded = audio_segment._spawn(
        audio_segment.raw_data, overrides={"frame_rate": scaled_rate}
    )
    return respeeded.set_frame_rate(original_rate)
def alternating_tts(multiline_text):
    """Synthesize Bulgarian speech for each line, alternating two voices.

    Each non-blank line is one sentence. Sentences are synthesized with gTTS
    (``lang="bg"``), pitch-shifted (-10 semitones for the 1st, 3rd, ...
    sentence, +3 semitones for the 2nd, 4th, ...), joined with 500 ms of
    silence after each one, and exported as a single MP3.

    Args:
        multiline_text: Text with one sentence per line. Only the first 10
            non-blank lines are used.

    Returns:
        The path of the exported MP3 file, or ``None`` when the input
        contains no sentences.
    """
    max_sentences = 10
    pause_ms = 500
    # NOTE(review): every call overwrites this same file in the working
    # directory; concurrent requests would clobber each other's output.
    output_file = "alternating_output.mp3"

    # Drop blank lines BEFORE indexing and capping, so that an empty line
    # neither breaks the voice alternation nor counts towards the limit.
    stripped_lines = (line.strip() for line in (multiline_text or "").splitlines())
    sentences = [line for line in stripped_lines if line][:max_sentences]
    if not sentences:
        # Nothing to synthesize; avoid exporting a zero-length MP3.
        return None

    pause = AudioSegment.silent(duration=pause_ms)
    combined_audio = AudioSegment.silent(duration=0)
    for index, sentence in enumerate(sentences):
        mp3_buffer = BytesIO()
        gTTS(sentence, lang="bg").write_to_fp(mp3_buffer)
        mp3_buffer.seek(0)
        sentence_audio = AudioSegment.from_file(mp3_buffer, format="mp3")

        semitones = -10 if index % 2 == 0 else 3
        combined_audio += change_pitch(sentence_audio, semitones) + pause

    # export() hands back the open output file object; close it explicitly.
    combined_audio.export(output_file, format="mp3").close()
    return output_file
# Gradio UI: a multi-line textbox feeds alternating_tts, and the resulting
# MP3 file path is shown in an audio player.
with gr.Blocks() as demo:
    # The help text counts sentences from 1: the first (odd-numbered)
    # sentence gets the male voice, matching the index-0 branch in
    # alternating_tts.
    gr.Markdown(
        "## Bulgarian TTS with Alternating Male and Female Voices\n"
        "최대 10문장을 입력하세요. 각 줄마다 하나의 문장을 입력하면, "
        "홀수번째 문장은 남성 목소리(피치 -10 반음, 정상 속도)로, "
        "짝수번째 문장은 여성 목소리(피치 +3 반음)로 생성됩니다."
    )
    input_text = gr.Textbox(
        label="Enter sentences (max 10, one per line):",
        lines=10,
        placeholder="문장 1\n문장 2\n문장 3\n...",
    )
    output_audio = gr.Audio(label="Generated Speech", type="filepath")
    generate_button = gr.Button("Generate Speech")
    generate_button.click(alternating_tts, inputs=input_text, outputs=output_audio)
# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()