gupta1912 committed
Commit 360d45d · 1 Parent(s): f00a7cf

added files

README.md CHANGED
@@ -1,12 +1,27 @@
 ---
 title: CLIP ERA S19
 emoji: πŸ‘
- colorFrom: red
- colorTo: green
+ colorFrom: purple
+ colorTo: blue
 sdk: gradio
- sdk_version: 3.47.1
+ sdk_version: 3.45.2
 app_file: app.py
 pinned: false
+ license: mit
 ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Session 19 - ERA Phase I - Assignment
+ ## Goals
+ 1. Build an app on HuggingFace using the CLIP model from OpenAI
+
+ ## Usage
+ The App tab provides a UI for the following:
+ 1. Upload an image and enter text to get a similarity score between the two.
+ 2. A variety of examples is provided.
+
+ Contributors
+ -------------------------
+ Lavanya Nemani
+
+ Shashank Gupta
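
For readers skimming the diff, the scoring flow described under Usage reduces to a few `transformers` calls. The snippet below is a minimal sketch of what the app.py added in this commit does, assuming Pillow is available and the example image `examples/test_0.jpg` from this commit is on disk:

```python
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Condensed sketch of the image-text scoring used by app.py below.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("examples/test_0.jpg")  # example image added in this commit
texts = ["A man and a dog", "A man wearing a blue coat with a dog inside"]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
probs = model(**inputs).logits_per_image.softmax(dim=1)  # one probability per candidate text
print(dict(zip(texts, probs.detach().numpy()[0].tolist())))
```

The probabilities sum to 1 because the softmax is taken across the candidate texts for the single image.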
app.py ADDED
@@ -0,0 +1,51 @@
+ import gradio as gr
+
+ from transformers import CLIPProcessor, CLIPModel
+
+ # Load the pretrained CLIP model and its paired processor once at startup
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+
+ def inference(input_img=None, input_text=None):
+     """Score an image against comma-separated text prompts with CLIP."""
+     if input_img is not None and input_text is not None:
+         inputs = processor(text=input_text.split(","), images=input_img, return_tensors="pt", padding=True)
+         outputs = model(**inputs)
+         logits_per_image = outputs.logits_per_image  # image-text similarity scores, one per prompt
+         probs = logits_per_image.softmax(dim=1)  # softmax over the prompts gives label probabilities
+
+         output_prob = ', '.join(str(p) for p in probs.detach().numpy()[0])
+     else:
+         output_prob = None
+
+     return output_prob
+
+
+ title = "CLIP OpenAI model"
+ description = "A simple Gradio interface to find similarity between images and text"
+ text_examples = ["A man and a dog, A man wearing a blue coat with a dog inside",
+                  "Train tracks and a train, A dog playing in the field",
+                  "An outdoor seating glass box, A movie theater",
+                  "A building, A building and multiple cars on the road",
+                  "A living area, Planet earth",
+                  "A dining room, A football stadium",
+                  "A red car, A yellow car",
+                  "A chair and a book, A building falling",
+                  "A man and a horse, A child playing with a dog",
+                  "A man and a horse, A child playing with a dog"
+                  ]
+ examples = [['examples/test_' + str(i) + '.jpg', text_examples[i]] for i in range(10)]
+
+ demo = gr.Interface(inference,
+                     inputs=[gr.Image(label="Input image"),
+                             gr.Textbox(placeholder="Input text (Multiple entries separated by commas)")],
+                     outputs=[gr.Textbox(label="Similarity score between the input image and input text")],
+                     title=title,
+                     description=description,
+                     examples=examples
+                     )
+ demo.launch()
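
A quick way to sanity-check the app outside the Gradio UI is to call inference() directly. This is a minimal sketch, assuming the definitions above (model, processor, inference) have already been executed in an interactive session before demo.launch(), Pillow is installed, and the example images added below are present:

```python
from PIL import Image

# Score one example image against two comma-separated prompts, as the Textbox expects.
img = Image.open("examples/test_0.jpg")
print(inference(img, "A man and a dog, A man wearing a blue coat with a dog inside"))
# Prints two comma-separated probabilities (one per prompt) that sum to ~1.
```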
examples/test_0.jpg ADDED
examples/test_1.jpg ADDED
examples/test_2.jpg ADDED
examples/test_3.jpg ADDED
examples/test_4.jpg ADDED
examples/test_5.jpg ADDED
examples/test_6.jpg ADDED
examples/test_7.jpg ADDED
examples/test_8.jpg ADDED
examples/test_9.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,4 @@
+
+ numpy
+ torch
+ transformers