import gradio as gr
import numpy as np
import torch
import random
from diffusers import DiffusionPipeline
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"

title = "GenAI StoryTeller"
description = """
Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation,
Microsoft's [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech, and
Stability AI's [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo) model for image generation.
"""

# Load speech translation pipeline
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

# Load the text-to-speech processor, model, and vocoder from pretrained checkpoints
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Use a fixed speaker x-vector from the CMU Arctic dataset as the synthesis voice
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Load diffusion pipeline for image generation
if torch.cuda.is_available():
    pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
    pipe.enable_xformers_memory_efficient_attention()
    pipe = pipe.to(device)
else:
    pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True)
    pipe = pipe.to(device)
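
# Per the SDXL-Turbo model card, the model is distilled to run without classifier-free
# guidance (guidance_scale=0.0) and with very few (1-4) inference steps, which is why
# the sliders below default to those values.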

# Bounds for the random seed and the generated image dimensions
MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

# Speech GenAI
# Function to translate speech in any supported language to English text with Whisper
def translate(audio):
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
    return outputs["text"]
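
# Example usage (illustrative; assumes the bundled ./english.wav recording exists):
#   text = translate("./english.wav")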

# Function to synthesise speech from text with the SpeechT5 model and HiFi-GAN vocoder
def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
    return speech.cpu()

# Main speech-to-speech pipeline: translate the input audio, then synthesise the translation
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)  # Scale float audio in [-1, 1] to 16-bit PCM
    return 16000, synthesised_speech

# Function for text to speech
def text_to_speech(text):
    synthesised_speech = synthesise(text)
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)  # Scale float audio in [-1, 1] to 16-bit PCM
    return 16000, synthesised_speech
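
# Example usage (illustrative sketch): the (sample_rate, waveform) pair returned here
# is the format gr.Audio expects, and could also be written to disk, e.g.:
#   import scipy.io.wavfile
#   rate, wav = text_to_speech("Hello world")
#   scipy.io.wavfile.write("speech.wav", rate, wav)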

# Image GenAI
# Text to Image
def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    generator = torch.Generator().manual_seed(seed)

    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
        generator=generator,
    ).images[0]

    return image
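
# Example usage (illustrative; values follow the SDXL-Turbo defaults noted above):
#   image = infer("a watercolor painting of a castle", "", seed=0, randomize_seed=False,
#                 width=512, height=512, guidance_scale=0.0, num_inference_steps=2)
#   image.save("castle.png")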

demo = gr.Blocks()

# Speech-to-speech translation using the microphone as input
audio_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./english.wav"], ["./chinese.wav"]],
    title=title,
    description=description,
)

# Speech-to-speech translation using an uploaded audio file as input
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./english.wav"], ["./chinese.wav"]],
    title=title,
    description=description,
)

# Text-to-speech using typed text as input
text_translate = gr.Interface(
    fn=text_to_speech, 
    inputs="textbox",
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description
)

# Inputs for Image Generation
prompt = gr.Text(
    label="Prompt",
    show_label=True,
    max_lines=1,
    placeholder="Enter your prompt",
    container=False,
)

negative_prompt = gr.Text(
    label="Negative prompt",
    max_lines=1,
    placeholder="Enter a negative prompt",
    visible=True,
)

seed = gr.Slider(
    label="Seed",
    minimum=0,
    maximum=MAX_SEED,
    step=1,
    value=0,
)

randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

width = gr.Slider(
    label="Width",
    minimum=256,
    maximum=MAX_IMAGE_SIZE,
    step=32,
    value=512,
)

height = gr.Slider(
    label="Height",
    minimum=256,
    maximum=MAX_IMAGE_SIZE,
    step=32,
    value=512,
)

guidance_scale = gr.Slider(
    label="Guidance scale",
    minimum=0.0,
    maximum=10.0,
    step=0.1,
    value=0.0,
)

num_inference_steps = gr.Slider(
    label="Number of inference steps",
    minimum=1,
    maximum=12,
    step=1,
    value=2,
)

result = gr.Image(label="Result", show_label=False)

# Text to Image interface
image_generation = gr.Interface(
    fn=infer,
    inputs=[prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
    outputs=[result],
    title=title,
    description=description,
)

# Present each feature in its own tab
with demo:
    gr.TabbedInterface([audio_translate, file_translate, text_translate, image_generation], ["Speech to Speech", "Audio File to Speech", "Text to Speech", "Text to Image"])

demo.launch()