Update app.py
app.py
CHANGED
@@ -1,84 +1,94 @@
-import streamlit as st
 # Import libraries
+import streamlit as st
+import gradio as gr
+import torch
+import transformers
+import librosa
+import cv2
 import numpy as np
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-
-# Load the text data
-text = open('shakespeare.txt', 'r').read() # Read the text file
-vocab = sorted(set(text)) # Get the unique characters in the text
-char2idx = {c: i for i, c in enumerate(vocab)} # Map characters to indices
-idx2char = np.array(vocab) # Map indices to characters
-text_as_int = np.array([char2idx[c] for c in text]) # Convert text to integers
-
-# Create training examples and targets
-seq_length = 100 # Length of the input sequence
-examples_per_epoch = len(text) // (seq_length + 1) # Number of examples per epoch
-char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int) # Create a dataset from the text
-sequences = char_dataset.batch(seq_length + 1, drop_remainder=True) # Create batches of sequences
-
-def split_input_target(chunk): # Define a function to split the input and target
-    input_text = chunk[:-1] # Input is the sequence except the last character
-    target_text = chunk[1:] # Target is the sequence except the first character
-    return input_text, target_text
-
-dataset = sequences.map(split_input_target) # Apply the function to the dataset
-
-# Shuffle and batch the dataset
-BATCH_SIZE = 1 # Batch size
-BUFFER_SIZE = 10000 # Buffer size for shuffling
-dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True) # Shuffle and batch the dataset
-
-# Define the model
-vocab_size = len(vocab) # Size of the vocabulary
-embedding_dim = 256 # Dimension of the embedding layer
-rnn_units = 1024 # Number of units in the RNN layer
-
-model = keras.Sequential([
-    layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[BATCH_SIZE, None]), # Embedding layer
-    layers.GRU(rnn_units, return_sequences=True, stateful=True), # GRU layer
-    layers.Dense(vocab_size) # Dense layer with vocab_size units
-])
-
-# Define the loss function
-def loss(labels, logits):
-    return keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
-
-# Compile the model
-model.compile(optimizer='adam', loss=loss)
-
-# Define a function to generate text
-def generate_text(model, start_string):
-    num_generate = 50 # Number of characters to generate
-    input_eval = [char2idx[s] for s in start_string] # Convert the start string to numbers
-    input_eval = tf.expand_dims(input_eval, 0) # Expand the dimension for batch size
-    text_generated = [] # Empty list to store the generated text
-
-    temperature = 1.0 # Temperature parameter to control the randomness
-
-    model.reset_states() # Reset the states of the model
-
-    for i in range(num_generate): # Loop over the number of characters to generate
-        predictions = model(input_eval) # Get the predictions from the model
-        predictions = tf.squeeze(predictions, 0) # Remove the batch dimension
-
-        predictions = predictions / temperature # Divide by temperature to increase or decrease randomness
-        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy() # Sample from the predictions
-
-        input_eval = tf.expand_dims([predicted_id], 0) # Update the input with the predicted id
-
-        text_generated.append(idx2char[predicted_id]) # Append the predicted character to the generated text
-
-    return (start_string + ''.join(text_generated)) # Return the start string and the generated text
-
-# Train the model
-EPOCHS = 1 # Number of epochs to train
 
-
-
-
-
-
-
-
+# Load models
+text_model = transformers.pipeline("text-generation")
+audio_model = transformers.Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+audio_tokenizer = transformers.Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+image_model = transformers.pipeline("image-classification")
+video_model = transformers.VideoClassificationPipeline(model="facebook/mmf-vit-base-16", feature_extractor="facebook/mmf-vit-base-16")
+
+# Define functions for processing inputs and outputs
+def text_to_text(input):
+    output = text_model(input, max_length=50)
+    return output[0]["generated_text"]
+
+def text_to_audio(input):
+    output = text_model(input, max_length=50)
+    output = gr.outputs.Audio.from_str(output[0]["generated_text"])
+    return output
+
+def text_to_image(input):
+    output = text_model(input, max_length=50)
+    output = gr.outputs.Image.from_str(output[0]["generated_text"])
+    return output
+
+def text_to_video(input):
+    output = text_model(input, max_length=50)
+    output = gr.outputs.Video.from_str(output[0]["generated_text"])
+    return output
+
+def audio_to_text(input):
+    input = librosa.load(input)[0]
+    input = torch.from_numpy(input).unsqueeze(0)
+    logits = audio_model(input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    output = audio_tokenizer.batch_decode(predicted_ids)[0]
+    return output
+
+def audio_to_audio(input):
+    return input
+
+def audio_to_image(input):
+    input = librosa.load(input)[0]
+    input = torch.from_numpy(input).unsqueeze(0)
+    logits = audio_model(input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    output = audio_tokenizer.batch_decode(predicted_ids)[0]
+    output = gr.outputs.Image.from_str(output)
+    return output
+
+def audio_to_video(input):
+    input = librosa.load(input)[0]
+    input = torch.from_numpy(input).unsqueeze(0)
+    logits = audio_model(input).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    output = audio_tokenizer.batch_decode(predicted_ids)[0]
+    output = gr.outputs.Video.from_str(output)
+    return output
+
+def image_to_text(input):
+    input = cv2.imread(input)
+    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
+    input = np.expand_dims(input, axis=0)
+    output = image_model(input)
+    return output[0]["label"]
+
+def image_to_audio(input):
+    input = cv2.imread(input)
+    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
+    input = np.expand_dims(input, axis=0)
+    output = image_model(input)
+    output = gr.outputs.Audio.from_str(output[0]["label"])
+    return output
+
+def image_to_image(input):
+    return input
+
+def image_to_video(input):
+    input = cv2.imread(input)
+    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
+    input = np.expand_dims(input, axis=0)
+    output = image_model(input)
+    output = gr.outputs.Video.from_str(output[0]["label"])
+    return output
+
+def video_to_text(input):
+    input = cv2.VideoCapture(input)
+    frames = []
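For reference, a minimal sketch of how one of the helpers introduced in this commit could be served as a Gradio app. The gr.Interface wiring and the demo variable below are assumptions for illustration; they do not appear in the diff above.

# Hypothetical wiring (not part of this commit): expose the text_to_text
# helper through a simple Gradio interface.
import gradio as gr
import transformers

text_model = transformers.pipeline("text-generation")  # same pipeline the commit loads

def text_to_text(prompt):
    # Mirrors the committed helper: generate text up to max_length=50 tokens.
    output = text_model(prompt, max_length=50)
    return output[0]["generated_text"]

# Assumed UI: one textbox in, one textbox out.
demo = gr.Interface(fn=text_to_text, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()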