# Import libraries
import streamlit as st
import gradio as gr
import torch
import transformers
import librosa
#import cv2
import numpy as np
# Load models
text_model = transformers.pipeline("text-generation")
audio_model = transformers.Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
audio_tokenizer = transformers.Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
image_model = transformers.pipeline("image-classification")
#video_model = transformers.VideoClassificationPipeline(model="mmf/facebook-mmf-vit-base-16", feature_extractor="mmf/facebook-mmf-vit-base-16")
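# Sketch (assumption): the manual Wav2Vec2 model + tokenizer pair above could
# also be swapped for the higher-level speech-recognition pipeline, e.g.:
#   asr_model = transformers.pipeline("automatic-speech-recognition",
#                                     model="facebook/wav2vec2-base-960h")
#   transcript = asr_model("sample.wav")["text"]  # "sample.wav" is a hypothetical path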
# Define functions for processing inputs and outputs
def text_to_text(input):
    output = text_model(input, max_length=50)
    return output[0]["generated_text"]

def text_to_audio(input):
    # NOTE: placeholder path; gr.outputs.Audio has no from_str helper, and a
    # real text-to-speech step would be needed to produce audio from text.
    output = text_model(input, max_length=50)
    output = gr.outputs.Audio.from_str(output[0]["generated_text"])
    return output

def text_to_image(input):
    # NOTE: placeholder path; gr.outputs.Image.from_str is not a standard Gradio API.
    output = text_model(input, max_length=50)
    output = gr.outputs.Image.from_str(output[0]["generated_text"])
    return output

def text_to_video(input):
    # NOTE: placeholder path; gr.outputs.Video.from_str is not a standard Gradio API.
    output = text_model(input, max_length=50)
    output = gr.outputs.Video.from_str(output[0]["generated_text"])
    return output
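# Example usage (sketch): generate a short continuation from a text prompt.
#   print(text_to_text("Once upon a time"))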
def audio_to_text(input):
    # Load the waveform at 16 kHz (the rate wav2vec2-base-960h was trained on),
    # run it through the model, and decode the argmax token ids.
    input = librosa.load(input, sr=16000)[0]
    input = torch.from_numpy(input).unsqueeze(0)
    logits = audio_model(input).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    output = audio_tokenizer.batch_decode(predicted_ids)[0]
    return output
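# Example usage (sketch): transcribe a local WAV file outside the Streamlit UI.
# "sample.wav" is a hypothetical path to a short mono recording.
#   print(audio_to_text("sample.wav"))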
def audio_to_audio(input):
    # Identity mapping: echo the uploaded audio back unchanged.
    return input
def audio_to_image(input):
    # Transcribe the audio first, then hand the text to a placeholder image output.
    # NOTE: gr.outputs.Image.from_str is not a standard Gradio API.
    input = librosa.load(input, sr=16000)[0]
    input = torch.from_numpy(input).unsqueeze(0)
    logits = audio_model(input).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    output = audio_tokenizer.batch_decode(predicted_ids)[0]
    output = gr.outputs.Image.from_str(output)
    return output

def audio_to_video(input):
    # Same as above, but targeting a placeholder video output.
    input = librosa.load(input, sr=16000)[0]
    input = torch.from_numpy(input).unsqueeze(0)
    logits = audio_model(input).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    output = audio_tokenizer.batch_decode(predicted_ids)[0]
    output = gr.outputs.Video.from_str(output)
    return output
#def image_to_text(input):
#    input = cv2.imread(input)
#    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
#    input = np.expand_dims(input, axis=0)
#    output = image_model(input)
#    return output[0]["label"]
#def image_to_audio(input):
#    input = cv2.imread(input)
#    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
#    input = np.expand_dims(input, axis=0)
#    output = image_model(input)
#    output = gr.outputs.Audio.from_str(output[0]["label"])
#    return output
def image_to_image(input):
    # Identity mapping: echo the uploaded image back unchanged.
    return input
#def image_to_video(input):
#    input = cv2.imread(input)
#    input = cv2.cvtColor(input, cv2.COLOR_BGR2RGB)
#    input = np.expand_dims(input, axis=0)
#    output = image_model(input)
#    output = gr.outputs.Video.from_str(output[0]["label"])
#    return output
#def video_to_text(input):
#    input = cv2.VideoCapture(input)
#    frames = []
#    while input.isOpened():
#        ret, frame = input.read()
#        if ret:
#            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#            frames.append(frame)
#        else:
#            break
#    input.release()
#    frames = np.array(frames)
#    output = video_model(frames)
#    return output[0]["label"]
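# Sketch (assumption): an image_to_text handler that avoids cv2; the
# image-classification pipeline accepts a PIL image (or file path) directly.
# Kept commented out to mirror the cv2-based versions above.
#def image_to_text(input):
#    from PIL import Image
#    image = Image.open(input)
#    output = image_model(image)
#    return output[0]["label"]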
# Output switches
# Define the process_input function
def process_input(user_input, input_format, output_format):
    # Dispatch to the appropriate conversion function based on the selected
    # input and output formats.
    # NOTE: the image-to-text/audio/video and video-to-* handlers are commented
    # out (or not defined) above, so those branches will raise NameError until
    # they are restored.
    if input_format == "Text" and output_format == "Text":
        output = text_to_text(user_input)
    elif input_format == "Text" and output_format == "Audio":
        output = text_to_audio(user_input)
    elif input_format == "Text" and output_format == "Image":
        output = text_to_image(user_input)
    elif input_format == "Text" and output_format == "Video":
        output = text_to_video(user_input)
    elif input_format == "Audio" and output_format == "Text":
        output = audio_to_text(user_input)
    elif input_format == "Audio" and output_format == "Audio":
        output = audio_to_audio(user_input)
    elif input_format == "Audio" and output_format == "Image":
        output = audio_to_image(user_input)
    elif input_format == "Audio" and output_format == "Video":
        output = audio_to_video(user_input)
    elif input_format == "Image" and output_format == "Text":
        output = image_to_text(user_input)
    elif input_format == "Image" and output_format == "Audio":
        output = image_to_audio(user_input)
    elif input_format == "Image" and output_format == "Image":
        output = image_to_image(user_input)
    elif input_format == "Image" and output_format == "Video":
        output = image_to_video(user_input)
    elif input_format == "Video" and output_format == "Text":
        output = video_to_text(user_input)
    elif input_format == "Video" and output_format == "Audio":
        output = video_to_audio(user_input)
    elif input_format == "Video" and output_format == "Image":
        output = video_to_image(user_input)
    elif input_format == "Video" and output_format == "Video":
        output = video_to_video(user_input)
    else:
        output = "Invalid input or output format"
    # Return the output data
    return output
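# Example usage (sketch): call the dispatcher directly, text in and text out.
#   print(process_input("Hello there", "Text", "Text"))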
# Create a title for the app
st.title("My Generic AI App")
# Create a sidebar for selecting the input and output formats
st.sidebar.header("Select the input and output formats")
input_format = st.sidebar.selectbox("Input format", ["Text", "Audio", "Image", "Video"])
output_format = st.sidebar.selectbox("Output format", ["Text", "Audio", "Image", "Video"])
# Create a container for the input and output widgets
io_container = st.container()
# Create a chat input widget for text input
if input_format == "Text":
    user_input = st.text_input("Type a text")
# Create a file uploader widget for audio input
elif input_format == "Audio":
    user_input = st.file_uploader("Upload an audio file", type=["wav", "mp3", "ogg"])
# Create a file uploader widget for image input
elif input_format == "Image":
    user_input = st.file_uploader("Upload an image file", type=["jpg", "png", "gif"])
# Create a file uploader widget for video input
else:
    user_input = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov"])
# Check if the user input is not empty
if user_input:
    # Display the user input as a chat message or an image
    with io_container:
        if input_format == "Text":
            st.write(f"user: {user_input}")
        else:
            # NOTE: st.image only renders images; audio or video uploads would
            # need st.audio or st.video to display properly.
            st.image(user_input, caption="User input")
    # Process the user input and generate a response based on the selected
    # input and output formats
    response = process_input(user_input, input_format, output_format)
    # Display the response as a chat message or an image
    with io_container:
        if output_format == "Text":
            st.write(f"assistant: {response}")
        else:
            st.image(response, caption="Assistant output")
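# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py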