import gradio as gr
from gtts import gTTS
from pydub import AudioSegment
from io import BytesIO
import numpy as np
import librosa
def change_pitch_librosa(audio_segment, semitones):
    sr = audio_segment.frame_rate
    # Convert the audio data to a numpy array (float32)
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)
    samples /= 32768.0  # normalize the int16 range to [-1, 1]
    if audio_segment.channels > 1:
        # For multi-channel audio, reshape to (channels, n_samples)
        samples = samples.reshape((-1, audio_segment.channels)).T
    else:
        samples = samples.flatten()
    # Pass sr and n_steps as keyword arguments to librosa's pitch_shift
    shifted = librosa.effects.pitch_shift(samples, sr=sr, n_steps=semitones)
    if audio_segment.channels > 1:
        # Interleave the channels back into a flat sample stream
        shifted = shifted.T.flatten()
    else:
        shifted = shifted.flatten()
    shifted = np.clip(shifted, -1.0, 1.0)
    shifted_int16 = np.int16(shifted * 32767)
    new_audio = AudioSegment(
        shifted_int16.tobytes(),
        frame_rate=sr,
        sample_width=2,  # output samples are int16
        channels=audio_segment.channels
    )
    return new_audio

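# A minimal illustrative sketch (not part of the app): change_pitch_librosa preserves the
# clip length, because librosa.effects.pitch_shift time-stretches and then resamples back
# to the original number of samples. With a hypothetical 1-second mono clip:
#
#     clip = AudioSegment.silent(duration=1000, frame_rate=22050)
#     lowered = change_pitch_librosa(clip, -10)
#     assert len(lowered) == len(clip)  # still ~1000 ms; duration is unchanged
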
def change_pitch(audio_segment, semitones):
    # For the male voice (-10 semitones), use the librosa approach so playback speed is preserved.
    if semitones == -10:
        return change_pitch_librosa(audio_segment, semitones)
    else:
        new_sample_rate = int(audio_segment.frame_rate * (2.0 ** (semitones / 12.0)))
        pitched_audio = audio_segment._spawn(audio_segment.raw_data, overrides={'frame_rate': new_sample_rate})
        return pitched_audio.set_frame_rate(audio_segment.frame_rate)

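# Why the frame-rate trick above changes speed as well as pitch (illustrative numbers only):
# reinterpreting the same samples at a frame rate scaled by 2 ** (semitones / 12) raises the
# pitch but also shortens playback. For +3 semitones the factor is 2 ** 0.25 ≈ 1.189, so a
# 1000 ms clip comes out at roughly 841 ms after set_frame_rate() resamples it back. That
# speed-up is why the -10 semitone male voice goes through librosa instead.
#
#     clip = AudioSegment.silent(duration=1000, frame_rate=22050)
#     higher = change_pitch(clip, 3)
#     # len(higher) is roughly 841 ms (1000 / 1.189)
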
def alternating_tts(multiline_text):
    sentences = multiline_text.strip().splitlines()
    if len(sentences) > 10:
        sentences = sentences[:10]
    combined_audio = AudioSegment.silent(duration=0)
    for i, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if not sentence:
            continue
        tts = gTTS(sentence, lang="bg")
        audio_file = BytesIO()
        tts.write_to_fp(audio_file)
        audio_file.seek(0)
        sentence_audio = AudioSegment.from_file(audio_file, format="mp3")
        if i % 2 == 0:
            # 1st, 3rd, 5th, ... sentences: male voice (-10 semitones, normal speed)
            sentence_audio = change_pitch(sentence_audio, -10)
        else:
            # 2nd, 4th, 6th, ... sentences: female voice (+3 semitones)
            sentence_audio = change_pitch(sentence_audio, 3)
        combined_audio += sentence_audio + AudioSegment.silent(duration=500)
    output_file = "alternating_output.mp3"
    combined_audio.export(output_file, format="mp3")
    return output_file

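# Illustrative usage (assumed input, not executed by the app): two Bulgarian lines yield one
# MP3 in which the two voices alternate, with a 500 ms pause appended after each sentence:
#
#     path = alternating_tts("Здравей!\nКак си?")
#     # path == "alternating_output.mp3"; the first line uses the low voice, the second the higher one
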
with gr.Blocks() as demo:
    gr.Markdown(
        "## Bulgarian TTS with Alternating Male and Female Voices\n"
        "Enter up to 10 sentences, one per line. Odd-numbered sentences (1st, 3rd, ...) "
        "are generated with the male voice (pitch -10 semitones, normal speed), and "
        "even-numbered sentences with the female voice (pitch +3 semitones)."
    )
    input_text = gr.Textbox(
        label="Enter sentences (max 10, one per line):",
        lines=10,
        placeholder="Sentence 1\nSentence 2\nSentence 3\n..."
    )
    output_audio = gr.Audio(label="Generated Speech", type="filepath")
    generate_button = gr.Button("Generate Speech")
    generate_button.click(alternating_tts, inputs=input_text, outputs=output_audio)
if __name__ == "__main__":
    demo.launch()
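# Runtime dependencies (a sketch of a typical setup, not pinned by this file): gradio, gTTS,
# pydub, librosa, and numpy, plus an ffmpeg binary on PATH, which pydub uses to decode and
# encode MP3:
#
#     pip install gradio gTTS pydub librosa numpy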