# import streamlit as st
# from transformers import SeamlessM4Tv2Model, AutoProcessor
# import torch
# import numpy as np
# from scipy.io.wavfile import write
# import re
# from io import BytesIO
# # Load the processor and model
# processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
# model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# # Number to words function for Uzbek
# number_words = {
#     0: "nol", 1: "bir", 2: "ikki", 3: "uch", 4: "to'rt", 5: "besh", 6: "olti", 7: "yetti", 8: "sakkiz", 9: "to'qqiz",
#     10: "o'n", 11: "o'n bir", 12: "o'n ikki", 13: "o'n uch", 14: "o'n to'rt", 15: "o'n besh", 16: "o'n olti", 17: "o'n yetti",
#     18: "o'n sakkiz", 19: "o'n to'qqiz", 20: "yigirma", 30: "o'ttiz", 40: "qirq", 50: "ellik", 60: "oltmish", 70: "yetmish",
#     80: "sakson", 90: "to'qson", 100: "yuz", 1000: "ming", 1000000: "million"
# }
# def number_to_words(number):
#     if number < 20:
#         return number_words[number]
#     elif number < 100:
#         tens, unit = divmod(number, 10)
#         return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
#     elif number < 1000:
#         hundreds, remainder = divmod(number, 100)
#         return (number_words[hundreds] + " yuz" if hundreds > 1 else "yuz") + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000:
#         thousands, remainder = divmod(number, 1000)
#         return (number_to_words(thousands) + " ming" if thousands > 1 else "ming") + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000000:
#         millions, remainder = divmod(number, 1000000)
#         return number_to_words(millions) + " million" + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000000000:
#         billions, remainder = divmod(number, 1000000000)
#         return number_to_words(billions) + " milliard" + (" " + number_to_words(remainder) if remainder else "")
#     else:
#         return str(number)
# def replace_numbers_with_words(text):
#     def replace(match):
#         number = int(match.group())
#         return number_to_words(number)
#     result = re.sub(r'\b\d+\b', replace, text)
#     return result
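# # Illustrative example only (hypothetical input, not part of the app): with the helpers
# # above uncommented, digits embedded in Uzbek text are spelled out before synthesis, e.g.
# #   replace_numbers_with_words("Bugun harorat 25 daraja")
# #   -> "Bugun harorat yigirma besh daraja"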
# # Replacements
# replacements = [
#     ("bo‘ladi", "bo'ladi"),
#     ("yog‘ingarchilik", "yog'ingarchilik"),
# ]
# def cleanup_text(text):
#     for src, dst in replacements:
#         text = text.replace(src, dst)
#     return text
# # Streamlit App
# st.title("Text-to-Speech using Seamless M4T Model")
# # User Input
# user_input = st.text_area("Enter the text for speech generation", height=200)
# # Process the text and generate speech
# if st.button("Generate Speech"):
#     if user_input.strip():
#         # Apply text transformations
#         converted_text = replace_numbers_with_words(user_input)
#         cleaned_text = cleanup_text(converted_text)
#         # Process input for model
#         inputs = processor(text=cleaned_text, src_lang="uzn", return_tensors="pt").to(device)
#         # Generate audio from text
#         audio_array_from_text = model.generate(**inputs, tgt_lang="uzn")[0].cpu().numpy().squeeze()
#         # Save to BytesIO
#         audio_io = BytesIO()
#         write(audio_io, 16000, audio_array_from_text.astype(np.float32))
#         audio_io.seek(0)
#         # Provide audio for playback
#         st.audio(audio_io, format='audio/wav')
#     else:
#         st.warning("Please enter some text to generate speech.")
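# --- End of the earlier Seamless M4T v2 version (kept commented out for reference). ---
# --- The active app below uses a fine-tuned Uzbek SpeechT5 checkpoint instead.      ---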
import streamlit as st
import torch
import numpy as np
from scipy.io.wavfile import write
from io import BytesIO
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Alternative: the Seamless M4T v2 model from the earlier version above
# from transformers import SeamlessM4TTokenizer, SeamlessM4Tv2Model
# tokenizer = SeamlessM4TTokenizer.from_pretrained("facebook/seamless-m4t-v2-large")
# model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

# Load the fine-tuned Uzbek SpeechT5 model, its processor, and the HiFi-GAN vocoder
processor = SpeechT5Processor.from_pretrained("Beehzod/speecht5_finetuned_uz_customData")
model = SpeechT5ForTextToSpeech.from_pretrained("Beehzod/speecht5_finetuned_uz_customData")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# SpeechT5 needs a speaker embedding; a CMU ARCTIC x-vector is used here as a generic
# default (assumption: the fine-tuned checkpoint works with a generic x-vector)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Set the device (CUDA if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
vocoder.to(device)
speaker_embeddings = speaker_embeddings.to(device)
# Streamlit title
st.title("Uzbek Text-to-Speech (fine-tuned SpeechT5)")
# Input text field
text = st.text_area("Enter text for audio generation")
# Button to generate audio
if st.button("Generate Audio"):
    if text.strip():
        # Preprocess the text and convert to input IDs
        inputs = processor(text=text, return_tensors="pt").to(device)
        # Generate the waveform (SpeechT5 predicts a spectrogram; the vocoder turns it into audio)
        with torch.no_grad():
            speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        # Save the audio as a 16 kHz .wav file in memory
        audio_file = BytesIO()
        write(audio_file, 16000, speech.cpu().numpy().astype(np.float32))
        audio_file.seek(0)  # Reset the pointer to the start of the file
        # Display the audio player in the Streamlit app
        st.audio(audio_file, format="audio/wav")
    else:
        st.warning("Please enter text to generate audio.")
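# Local usage sketch (the file name app.py is an assumption; on Hugging Face Spaces the
# entrypoint is typically app.py and starts automatically):
#   pip install streamlit transformers datasets torch scipy
#   streamlit run app.py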