Spaces:

HakimHa
/

wanderJoy

Runtime error

App Files Files Community

HakimHa committed on Jul 20, 2023

Commit

50d8db6

1 Parent(s): 9378660

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -4

app.py CHANGED Viewed

@@ -1,10 +1,23 @@
 import gradio as gr
 from PIL import Image
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Wav2Vec2Processor, Wav2Vec2ForCTC
 import soundfile as sf
 import torch
 import numpy as np
 model_name_or_path = "microsoft/DialoGPT-large"
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left", use_fast=False)
@@ -21,6 +34,10 @@ model = AutoModelForCausalLM.from_pretrained(
 wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
 wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 # Function to handle text input
 def handle_text(text):
     new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')
@@ -30,9 +47,28 @@ def handle_text(text):
     return chat_output
 # Function to handle image input
 def handle_image(img):
-    return "This image seems nice!"
 # Function to handle audio input
 def handle_audio(audio):
     # gradio Audio returns a tuple (sample_rate, audio_np_array)
@@ -48,10 +84,11 @@ def handle_audio(audio):
 def chatbot(text, img, audio):
     text_output = handle_text(text) if text is not None else ''
-    img_output = handle_image(img) if img is not None else ''
-    audio_output = handle_audio(audio) if audio is not None else ''
     outputs = [o for o in [text_output, img_output, audio_output] if o]
     return "\n".join(outputs)

 import gradio as gr
 from PIL import Image
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Wav2Vec2Processor, Wav2Vec2ForCTC, ViTFeatureExtractor, ViTForImageClassification
 import soundfile as sf
 import torch
 import numpy as np
+class_names = {
+    0: "Dog",
+    1: "Cat",
+    2: "Horse",
+    3: "Bird",
+    4: "Elephant",
+    5: "Lion",
+    6: "Fish",
+    7: "Bear",
+    8: "Snake"
+}
 model_name_or_path = "microsoft/DialoGPT-large"
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left", use_fast=False)
 wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
 wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+vit_feature_extractor = ViTFeatureExtractor.from_pretrained('ohidaoui/monuments-morocco-v1')
 # Function to handle text input
 def handle_text(text):
     new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')
     return chat_output
 # Function to handle image input
+def get_class_name(class_idx):
+    return class_names[class_idx]
 def handle_image(img):
     """Classify an image and return the name of the predicted class.

     Args:
         img: The image to classify; anything ``np.array`` can convert
             (the original comment describes it as a PIL image).

     Returns:
         The label that ``get_class_name`` gives for the highest-scoring
         class.
     """
     # Convert the incoming image to a numpy array for the feature extractor.
     pixels = np.array(img)
     # Apply the extractor's preprocessing and get PyTorch tensors back.
     inputs = vit_feature_extractor(images=pixels, return_tensors="pt")
     # Inference only: skip autograd bookkeeping during the forward pass.
     with torch.no_grad():
         outputs = vit_model(**inputs)
     # Index of the largest logit along the class dimension.
     predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()
     return get_class_name(predicted_class_idx)
 # Function to handle audio input
 def handle_audio(audio):
     # gradio Audio returns a tuple (sample_rate, audio_np_array)
 def chatbot(text, img, audio):
     """Combine the replies for whichever inputs were supplied.

     Text is passed straight to ``handle_text``. An image or audio clip is
     first reduced to text by ``handle_image`` / ``handle_audio`` and that
     text is then passed to ``handle_text``. Inputs that are ``None`` are
     skipped, and empty replies are dropped.

     Args:
         text: The user's text message, or ``None``.
         img: The user's image, or ``None``.
         audio: The user's audio input, or ``None``.

     Returns:
         The non-empty replies joined by newlines, in text, image, audio
         order.
     """
     replies = []
     if text is not None:
         replies.append(handle_text(text))
     if img is not None:
         replies.append(handle_text(handle_image(img)))
     if audio is not None:
         replies.append(handle_text(handle_audio(audio)))
     # Drop empty replies so the joined output has no blank lines.
     return "\n".join(reply for reply in replies if reply)