File size: 5,335 Bytes
87269ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a8ab50
87269ae
7bfb172
 
 
 
2a8ab50
87269ae
b7f2ede
 
 
 
 
 
 
 
0b575c2
87269ae
7bfb172
 
0b575c2
87269ae
 
 
 
2522402
87269ae
 
 
 
 
 
 
 
7bfb172
 
87269ae
 
 
 
7bfb172
87269ae
 
7bfb172
87269ae
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# import streamlit as st
# from transformers import SeamlessM4Tv2Model, AutoProcessor
# import torch
# import numpy as np
# from scipy.io.wavfile import write
# import re
# from io import BytesIO

# # Load the processor and model
# processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
# model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Number to words function for Uzbek
# number_words = {
#     0: "nol", 1: "bir", 2: "ikki", 3: "uch", 4: "to'rt", 5: "besh", 6: "olti", 7: "yetti", 8: "sakkiz", 9: "to'qqiz",
#     10: "o'n", 11: "o'n bir", 12: "o'n ikki", 13: "o'n uch", 14: "o'n to'rt", 15: "o'n besh", 16: "o'n olti", 17: "o'n yetti",
#     18: "o'n sakkiz", 19: "o'n to'qqiz", 20: "yigirma", 30: "o'ttiz", 40: "qirq", 50: "ellik", 60: "oltmish", 70: "yetmish",
#     80: "sakson", 90: "to'qson", 100: "yuz", 1000: "ming", 1000000: "million"
# }

# def number_to_words(number):
#     if number < 20:
#         return number_words[number]
#     elif number < 100:
#         tens, unit = divmod(number, 10)
#         return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
#     elif number < 1000:
#         hundreds, remainder = divmod(number, 100)
#         return (number_words[hundreds] + " yuz" if hundreds > 1 else "yuz") + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000:
#         thousands, remainder = divmod(number, 1000)
#         return (number_to_words(thousands) + " ming" if thousands > 1 else "ming") + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000000:
#         millions, remainder = divmod(number, 1000000)
#         return number_to_words(millions) + " million" + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000000000:
#         billions, remainder = divmod(number, 1000000000)
#         return number_to_words(billions) + " milliard" + (" " + number_to_words(remainder) if remainder else "")
#     else:
#         return str(number)

# def replace_numbers_with_words(text):
#     def replace(match):
#         number = int(match.group())
#         return number_to_words(number)
#     result = re.sub(r'\b\d+\b', replace, text)
#     return result

# # Replacements
# replacements = [
#     ("bo‘ladi", "bo'ladi"),
#     ("yog‘ingarchilik", "yog'ingarchilik"),
# ]

# def cleanup_text(text):
#     for src, dst in replacements:
#         text = text.replace(src, dst)
#     return text

# # Streamlit App
# st.title("Text-to-Speech using Seamless M4T Model")

# # User Input
# user_input = st.text_area("Enter the text for speech generation", height=200)

# # Process the text and generate speech
# if st.button("Generate Speech"):
#     if user_input.strip():
#         # Apply text transformations
#         converted_text = replace_numbers_with_words(user_input)
#         cleaned_text = cleanup_text(converted_text)
        
#         # Process input for model
#         inputs = processor(text=cleaned_text, src_lang="uzn", return_tensors="pt").to(device)
        
#         # Generate audio from text
#         audio_array_from_text = model.generate(**inputs, tgt_lang="uzn")[0].cpu().numpy().squeeze()

#         # Save to BytesIO
#         audio_io = BytesIO()
#         write(audio_io, 16000, audio_array_from_text.astype(np.float32))
#         audio_io.seek(0)

#         # Provide audio for playback
#         st.audio(audio_io, format='audio/wav')
#     else:
#         st.warning("Please enter some text to generate speech.")
import streamlit as st
from transformers import SeamlessM4TTokenizer, SeamlessM4Tv2Model
import torch
import numpy as np
from scipy.io.wavfile import write
from io import BytesIO

# Streamlit front-end for a fine-tuned SpeechT5 Uzbek text-to-speech model.
#
# An earlier revision of this script used the Seamless M4T tokenizer/model
# ("facebook/seamless-m4t-v2-large"). The checkpoint loaded below is a SpeechT5
# model, which is driven through its processor, a HiFi-GAN vocoder and a
# speaker embedding rather than through `src_lang` / `tgt_lang` arguments.

# Load model directly
from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan

TTS_CHECKPOINT = "Beehzod/speecht5_finetuned_uz_customData"
VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"
SAMPLE_RATE = 16000  # sample rate written into the WAV header

# Set the device (CUDA if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache_resource
def _load_tts_components():
    """Load the processor, acoustic model and vocoder once per server process.

    Streamlit re-executes the whole script on every widget interaction;
    caching the loaded objects avoids re-reading the checkpoints each time.

    Returns:
        A ``(processor, model, vocoder)`` tuple with both networks moved to
        ``device`` and switched to evaluation mode.
    """
    loaded_processor = AutoProcessor.from_pretrained(TTS_CHECKPOINT)
    loaded_model = AutoModelForTextToSpectrogram.from_pretrained(TTS_CHECKPOINT)
    loaded_vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_CHECKPOINT)
    loaded_model.to(device).eval()
    loaded_vocoder.to(device).eval()
    return loaded_processor, loaded_model, loaded_vocoder


def _default_speaker_embedding():
    """Return a placeholder speaker embedding of shape ``(1, dim)``.

    NOTE(review): SpeechT5 requires a speaker embedding to synthesise speech.
    An all-zero vector keeps the app runnable without extra dependencies, but
    the voice quality is presumably better with a real x-vector (ideally one
    from the speaker used for fine-tuning) -- replace this when available.
    """
    dim = getattr(model.config, "speaker_embedding_dim", 512)
    return torch.zeros((1, dim), device=device)


processor, model, vocoder = _load_tts_components()

# Streamlit title
st.title("Text-to-Speech with SpeechT5 (Uzbek)")

# Input text field
text = st.text_area("Enter text for audio generation")

# Button to generate audio
if st.button("Generate Audio"):
    # Treat whitespace-only input the same as empty input.
    if text and text.strip():
        # Tokenize the text; only the token ids are needed by generate_speech.
        inputs = processor(text=text, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)

        # Text -> spectrogram -> waveform (the vocoder performs the last step).
        waveform = model.generate_speech(
            input_ids,
            _default_speaker_embedding(),
            vocoder=vocoder,
        )
        audio_array_from_text = waveform.cpu().numpy().squeeze()

        # Save the audio as a .wav file in memory
        audio_file = BytesIO()
        write(audio_file, SAMPLE_RATE, audio_array_from_text.astype(np.float32))
        audio_file.seek(0)  # Reset the pointer to the start of the file

        # Display the audio player in the Streamlit app
        st.audio(audio_file, format="audio/wav")
    else:
        st.warning("Please enter text to generate audio.")