import gradio as gr
from gtts import gTTS
from pydub import AudioSegment
from io import BytesIO
import numpy as np
import librosa


def change_pitch_librosa(audio_segment, semitones):
    sr = audio_segment.frame_rate
    # Convert the audio data to a float32 numpy array.
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)
    samples /= 32768.0  # Normalize the int16 range to [-1, 1].
    if audio_segment.channels > 1:
        # For multi-channel audio, reshape to (channels, samples).
        samples = samples.reshape((-1, audio_segment.channels)).T
    else:
        samples = samples.flatten()
    # Call librosa's pitch_shift with keyword arguments (sr and n_steps).
    shifted = librosa.effects.pitch_shift(samples, sr=sr, n_steps=semitones)
    if audio_segment.channels > 1:
        # Re-interleave the channels before converting back to bytes.
        shifted = shifted.T.flatten()
    else:
        shifted = shifted.flatten()
    shifted = np.clip(shifted, -1.0, 1.0)
    shifted_int16 = np.int16(shifted * 32767)
    new_audio = AudioSegment(
        shifted_int16.tobytes(),
        frame_rate=sr,
        sample_width=audio_segment.sample_width,
        channels=audio_segment.channels
    )
    return new_audio


def change_pitch(audio_segment, semitones):
    # Use the librosa-based pitch shift for the male voice (-5 semitones).
    if semitones < 0:
        return change_pitch_librosa(audio_segment, semitones)
    else:
        # Speed-based pitch shift: resample at a higher frame rate, then restore the original rate.
        new_sample_rate = int(audio_segment.frame_rate * (2.0 ** (semitones / 12.0)))
        pitched_audio = audio_segment._spawn(
            audio_segment.raw_data,
            overrides={'frame_rate': new_sample_rate}
        )
        return pitched_audio.set_frame_rate(audio_segment.frame_rate)


def alternating_tts(multiline_text):
    sentences = multiline_text.strip().splitlines()
    if len(sentences) > 10:
        sentences = sentences[:10]
    combined_audio = AudioSegment.silent(duration=0)
    for i, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if not sentence:
            continue
        tts = gTTS(sentence, lang="bg")
        audio_file = BytesIO()
        tts.write_to_fp(audio_file)
        audio_file.seek(0)
        sentence_audio = AudioSegment.from_file(audio_file, format="mp3")
        # Even-indexed sentences (1st, 3rd, ...) get the male voice (pitch -5 semitones),
        # odd-indexed sentences (2nd, 4th, ...) get the female voice (pitch +3 semitones).
        if i % 2 == 0:
            sentence_audio = change_pitch(sentence_audio, -5)
        else:
            sentence_audio = change_pitch(sentence_audio, 3)
        combined_audio += sentence_audio + AudioSegment.silent(duration=500)
    output_file = "alternating_output.mp3"
    combined_audio.export(output_file, format="mp3")
    return output_file


with gr.Blocks() as demo:
    gr.Markdown(
        "## Bulgarian TTS with Alternating Male and Female Voices\n"
        "Enter up to 10 sentences, one per line. "
        "Lines alternate between a male voice (pitch -5 semitones) "
        "and a female voice (pitch +3 semitones), starting with the male voice."
    )
    input_text = gr.Textbox(
        label="Enter sentences (max 10, one per line):",
        lines=10,
        placeholder="Sentence 1\nSentence 2\nSentence 3\n..."
    )
    output_audio = gr.Audio(label="Generated Speech", type="filepath")
    generate_button = gr.Button("Generate Speech")
    generate_button.click(alternating_tts, inputs=input_text, outputs=output_audio)

if __name__ == "__main__":
    demo.launch()