Update app.py
app.py
CHANGED
@@ -1,84 +1,94 @@
-import streamlit as st
 # Import libraries
+import streamlit as st
+import gradio as gr
+import torch
+import transformers
+import librosa
+import cv2
 import numpy as np
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-
-# Load the text data
-text = open('shakespeare.txt', 'r').read() # Read the text file
-vocab = sorted(set(text)) # Get the unique characters in the text
-char2idx = {c: i for i, c in enumerate(vocab)} # Map characters to indices
-idx2char = np.array(vocab) # Map indices to characters
-text_as_int = np.array([char2idx[c] for c in text]) # Convert text to integers
-
-# Create training examples and targets
-seq_length = 100 # Length of the input sequence
-examples_per_epoch = len(text) // (seq_length + 1) # Number of examples per epoch
-char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int) # Create a dataset from the text
-sequences = char_dataset.batch(seq_length + 1, drop_remainder=True) # Create batches of sequences
-
-def split_input_target(chunk): # Define a function to split the input and target
-    input_text = chunk[:-1] # Input is the sequence except the last character
-    target_text = chunk[1:] # Target is the sequence except the first character
-    return input_text, target_text
-
-dataset = sequences.map(split_input_target) # Apply the function to the dataset
-
-# Shuffle and batch the dataset
-BATCH_SIZE = 1 # Batch size
-BUFFER_SIZE = 10000 # Buffer size for shuffling
-dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True) # Shuffle and batch the dataset
-
-# Define the model
-vocab_size = len(vocab) # Size of the vocabulary
-embedding_dim = 256 # Dimension of the embedding layer
-rnn_units = 1024 # Number of units in the RNN layer
-
-model = keras.Sequential([
-    layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[BATCH_SIZE, None]), # Embedding layer
-    layers.GRU(rnn_units, return_sequences=True, stateful=True), # GRU layer
-    layers.Dense(vocab_size) # Dense layer with vocab_size units
-])
-
-# Define the loss function
-def loss(labels, logits):
-    return keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
-
-# Compile the model
-model.compile(optimizer='adam', loss=loss)
-
-# Define a function to generate text
-def generate_text(model, start_string):
-    num_generate = 50 # Number of characters to generate
-    input_eval = [char2idx[s] for s in start_string] # Convert the start string to numbers
-    input_eval = tf.expand_dims(input_eval, 0) # Expand the dimension for batch size
-    text_generated = [] # Empty list to store the generated text
-
-    temperature = 1.0 # Temperature parameter to control the randomness
-
-    model.reset_states() # Reset the states of the model
-
-    for i in range(num_generate): # Loop over the number of characters to generate
-        predictions = model(input_eval) # Get the predictions from the model
-        predictions = tf.squeeze(predictions, 0) # Remove the batch dimension
-
-        predictions = predictions / temperature # Divide by temperature to increase or decrease randomness
-        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy() # Sample from the predictions
-
-        input_eval = tf.expand_dims([predicted_id], 0) # Update the input with the predicted id
-
-        text_generated.append(idx2char[predicted_id]) # Append the predicted character to the generated text
-
-    return (start_string + ''.join(text_generated)) # Return the start string and the generated text
-
-# Train the model
-EPOCHS = 1 # Number of epochs to train
 
-
-
-
-
-
-
-
+# Load models
+text_model = transformers.pipeline("text-generation")
+audio_model = transformers.Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+audio_tokenizer = transformers.Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+image_model = transformers.pipeline("image-classification")
+video_model = transformers.VideoClassificationPipeline(model="facebook/mmf-vit-base-16", feature_extractor="facebook/mmf-vit-base-16")
+
+# Define functions for processing inputs and outputs
+def text_to_text(input):
+    output = text_model(input, max_length=50)
+    return output[0]["generated_text"]
+
+def text_to_audio(input):
+    output = text_model(input, max_length=50)
+    output = gr.outputs.Audio.from_str(output[0]["generated_text"])
+    return output
+
+def text_to_image(input):
+    output = text_model(input, max_length=50)
+    output = gr.outputs.Image.from_str(output[0]["generated_text"])
+    return output
+
+def text_to_video(input):
+    output = text_model(input, max_length=50)
+    output = gr.outputs.Video.from_str(output[0]["generated_text"])
+    return output
+
+def audio_to_text(input):
+    input = librosa.load(input)[0]
+    input = torch.from_numpy(input).unsqueeze(0)
+    logits = audio_model(input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    output = audio_tokenizer.batch_decode(predicted_ids)[0]
+    return output
+
+def audio_to_audio(input):
+    return input
+
+def audio_to_image(input):
+    input = librosa.load(input)[0]
+    input = torch.from_numpy(input).unsqueeze(0)
+    logits = audio_model(input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    output = audio_tokenizer.batch_decode(predicted_ids)[0]
+    output = gr.outputs.Image.from_str(output)
+    return output
+
+def audio_to_video(input):
+    input = librosa.load(input)[0]
+    input = torch.from_numpy(input).unsqueeze(0)
+    logits = audio_model(input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    output = audio_tokenizer.batch_decode(predicted_ids)[0]
+    output = gr.outputs.Video.from_str(output)
+    return output
+
+def image_to_text(input):
+    input = cv2.imread(input)
+    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
+    input = np.expand_dims(input, axis=0)
+    output = image_model(input)
+    return output[0]["label"]
+
+def image_to_audio(input):
+    input = cv2.imread(input)
+    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
+    input = np.expand_dims(input, axis=0)
+    output = image_model(input)
+    output = gr.outputs.Audio.from_str(output[0]["label"])
+    return output
+
+def image_to_image(input):
+    return input
+
+def image_to_video(input):
+    input = cv2.imread(input)
+    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
+    input = np.expand_dims(input, axis=0)
+    output = image_model(input)
+    output = gr.outputs.Video.from_str(output[0]["label"])
+    return output
+
+def video_to_text(input):
+    input = cv2.VideoCapture(input)
+    frames = []
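For reference, a minimal sketch of how one of the helpers introduced in this commit could be served as a Gradio app. The gr.Interface wiring and the demo variable below are assumptions for illustration; they do not appear in the diff above.

# Hypothetical wiring (not part of this commit): expose the text_to_text
# helper through a simple Gradio interface.
import gradio as gr
import transformers

text_model = transformers.pipeline("text-generation")  # same pipeline the commit loads

def text_to_text(prompt):
    # Mirrors the committed helper: generate text up to max_length=50 tokens.
    output = text_model(prompt, max_length=50)
    return output[0]["generated_text"]

# Assumed UI: one textbox in, one textbox out.
demo = gr.Interface(fn=text_to_text, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()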