Spaces:
Sleeping
Sleeping
File size: 3,605 Bytes
8a3c0fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import gradio as gr
from gtts import gTTS
from pydub import AudioSegment
from io import BytesIO
import numpy as np
import librosa
def change_pitch_librosa(audio_segment, semitones):
    """Pitch-shift a pydub ``AudioSegment`` with ``librosa.effects.pitch_shift``.

    The samples are converted to float32 in [-1, 1], shifted by librosa, then
    clipped and converted back to 16-bit PCM.

    Args:
        audio_segment: The pydub ``AudioSegment`` to shift.
        semitones: Number of semitones, passed to librosa as ``n_steps``.

    Returns:
        A new 16-bit ``AudioSegment`` with the same frame rate and channel
        count as the input. If the input holds no samples it is returned
        as-is (after 16-bit conversion when that was needed).
    """
    # The int16 <-> float scaling below is only valid for 16-bit PCM, and the
    # result is written out as int16 bytes, so bring any other sample width
    # to 16 bits first instead of mislabeling the output.
    if audio_segment.sample_width != 2:
        audio_segment = audio_segment.set_sample_width(2)

    sr = audio_segment.frame_rate
    channels = audio_segment.channels

    # Convert the audio data to a float32 numpy array.
    samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32)
    if samples.size == 0:
        # Nothing to shift; avoid handing an empty signal to librosa.
        return audio_segment
    samples /= 32768.0  # normalize the int16 range to [-1, 1]

    if channels > 1:
        # Multi-channel audio is interleaved; reshape to (channels, samples).
        samples = samples.reshape((-1, channels)).T
    else:
        samples = samples.flatten()

    # sr and n_steps are passed to pitch_shift as keyword arguments.
    shifted = librosa.effects.pitch_shift(samples, sr=sr, n_steps=semitones)

    # Re-interleave the channels; for mono data the transpose is a no-op.
    shifted = shifted.T.flatten()
    shifted = np.clip(shifted, -1.0, 1.0)
    shifted_int16 = np.int16(shifted * 32767)

    return AudioSegment(
        shifted_int16.tobytes(),
        frame_rate=sr,
        sample_width=2,
        channels=channels,
    )
def change_pitch(audio_segment, semitones):
    """Shift the pitch of ``audio_segment`` by ``semitones``.

    Negative shifts are delegated to ``change_pitch_librosa``. Zero and
    positive shifts re-label the raw samples with a frame rate scaled by
    ``2 ** (semitones / 12)`` and then convert the result back to the
    original frame rate.

    Args:
        audio_segment: The pydub ``AudioSegment`` to shift.
        semitones: Signed number of semitones.

    Returns:
        The pitch-shifted audio segment.
    """
    # The lowered (male, -5 semitones) voice goes through the librosa path.
    if semitones < 0:
        return change_pitch_librosa(audio_segment, semitones)

    original_rate = audio_segment.frame_rate
    scaled_rate = int(original_rate * (2.0 ** (semitones / 12.0)))
    relabeled = audio_segment._spawn(
        audio_segment.raw_data,
        overrides={'frame_rate': scaled_rate},
    )
    return relabeled.set_frame_rate(original_rate)
def alternating_tts(multiline_text):
    """Speak up to ten lines of Bulgarian text with two alternating voices.

    Every non-blank line of ``multiline_text`` is one sentence, synthesized
    with gTTS (``lang="bg"``). Sentences at even positions (0, 2, ...) are
    shifted down 5 semitones, sentences at odd positions are shifted up
    3 semitones, and each sentence is followed by 500 ms of silence.

    Args:
        multiline_text: Text with one sentence per line. May be empty or
            ``None``.

    Returns:
        The path of the exported MP3 file (``"alternating_output.mp3"``), or
        ``None`` when the input contains no non-blank line.
    """
    # Drop blank lines BEFORE truncating and numbering, so an empty line
    # neither uses up one of the ten slots nor flips the voice parity of the
    # sentences that follow it.
    stripped_lines = (line.strip() for line in (multiline_text or "").splitlines())
    sentences = [line for line in stripped_lines if line][:10]
    if not sentences:
        # Nothing to speak: skip synthesis and the MP3 export entirely.
        return None

    combined_audio = AudioSegment.silent(duration=0)
    for i, sentence in enumerate(sentences):
        tts = gTTS(sentence, lang="bg")
        audio_file = BytesIO()
        tts.write_to_fp(audio_file)
        audio_file.seek(0)  # rewind so pydub reads from the start
        sentence_audio = AudioSegment.from_file(audio_file, format="mp3")

        # Even positions get the male voice (pitch -5 semitones), odd
        # positions the female voice (pitch +3 semitones).
        if i % 2 == 0:
            sentence_audio = change_pitch(sentence_audio, -5)
        else:
            sentence_audio = change_pitch(sentence_audio, 3)

        combined_audio += sentence_audio + AudioSegment.silent(duration=500)

    # NOTE(review): the output name is fixed, so simultaneous requests
    # overwrite each other's file; confirm whether that matters here.
    output_file = "alternating_output.mp3"
    combined_audio.export(output_file, format="mp3")
    return output_file
# Gradio UI: a multi-line text box, a button, and an audio player that
# receives the file path returned by alternating_tts.
with gr.Blocks() as demo:
    # Usage notes shown at the top of the page. The Korean text says: enter
    # up to 10 sentences, one per line; even-numbered sentences are rendered
    # with a male voice (pitch -5 semitones) and odd-numbered ones with a
    # female voice (pitch +3 semitones). The literals had been
    # encoding-corrupted and one was split across physical lines.
    gr.Markdown(
        "## Bulgarian TTS with Alternating Male and Female Voices\n"
        "최대 10문장을 입력하세요. 각 줄마다 하나의 문장을 입력하면, "
        "짝수번째 문장은 남성 목소리(피치 -5 반음)로, "
        "홀수번째 문장은 여성 목소리(피치 +3 반음)로 생성됩니다."
    )
    input_text = gr.Textbox(
        label="Enter sentences (max 10, one per line):",
        lines=10,
        # Placeholder reads "Sentence 1 / Sentence 2 / Sentence 3 / ...".
        placeholder="문장 1\n문장 2\n문장 3\n..."
    )
    output_audio = gr.Audio(label="Generated Speech", type="filepath")
    generate_button = gr.Button("Generate Speech")
    # Clicking the button runs the synthesis on the text box contents and
    # hands the resulting file path to the audio component.
    generate_button.click(alternating_tts, inputs=input_text, outputs=output_audio)

if __name__ == "__main__":
    demo.launch()
|