gupta1912 committed
Commit 360d45d · 1 Parent(s): f00a7cf

added files

README.md CHANGED
@@ -1,12 +1,27 @@
 ---
 title: CLIP ERA S19
 emoji: πŸ‘
- colorFrom: red
- colorTo: green
+ colorFrom: purple
+ colorTo: blue
 sdk: gradio
- sdk_version: 3.47.1
+ sdk_version: 3.45.2
 app_file: app.py
 pinned: false
+ license: mit
 ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Session 19 - ERA Phase I - Assignment
+ ## Goals
+ 1. Build an app on HuggingFace using the CLIP model from OpenAI
+
+ ## Usage
+ The App tab provides a UI for the following:
+ 1. Upload an image and enter text to get a similarity score between the two.
+ 2. A variety of examples is provided.
+
+ Contributors
+ -------------------------
+ Lavanya Nemani
+
+ Shashank Gupta
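
For readers skimming the diff, the scoring flow described under Usage reduces to a few `transformers` calls. The snippet below is a minimal sketch of what the app.py added in this commit does, assuming Pillow is available and the example image `examples/test_0.jpg` from this commit is on disk:

```python
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Condensed sketch of the image-text scoring used by app.py below.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("examples/test_0.jpg")  # example image added in this commit
texts = ["A man and a dog", "A man wearing a blue coat with a dog inside"]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
probs = model(**inputs).logits_per_image.softmax(dim=1)  # one probability per candidate text
print(dict(zip(texts, probs.detach().numpy()[0].tolist())))
```

The probabilities sum to 1 because the softmax is taken across the candidate texts for the single image.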
app.py ADDED
@@ -0,0 +1,51 @@
+ import gradio as gr
+
+ from transformers import CLIPProcessor, CLIPModel
+
+ # Load the pretrained CLIP model and its paired processor once at startup
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+
+ def inference(input_img=None, input_text=None):
+     """Score an image against comma-separated text prompts with CLIP."""
+     if input_img is not None and input_text is not None:
+         inputs = processor(text=input_text.split(","), images=input_img, return_tensors="pt", padding=True)
+         outputs = model(**inputs)
+         logits_per_image = outputs.logits_per_image  # image-text similarity scores, one per prompt
+         probs = logits_per_image.softmax(dim=1)  # softmax over the prompts gives label probabilities
+
+         output_prob = ', '.join(str(p) for p in probs.detach().numpy()[0])
+     else:
+         output_prob = None
+
+     return output_prob
+
+
+ title = "CLIP OpenAI model"
+ description = "A simple Gradio interface to find similarity between images and text"
+ text_examples = ["A man and a dog, A man wearing a blue coat with a dog inside",
+                  "Train tracks and a train, A dog playing in the field",
+                  "An outdoor seating glass box, A movie theater",
+                  "A building, A building and multiple cars on the road",
+                  "A living area, Planet earth",
+                  "A dining room, A football stadium",
+                  "A red car, A yellow car",
+                  "A chair and a book, A building falling",
+                  "A man and a horse, A child playing with a dog",
+                  "A man and a horse, A child playing with a dog"
+                  ]
+ examples = [['examples/test_' + str(i) + '.jpg', text_examples[i]] for i in range(10)]
+
+ demo = gr.Interface(inference,
+                     inputs=[gr.Image(label="Input image"),
+                             gr.Textbox(placeholder="Input text (Multiple entries separated by commas)")],
+                     outputs=[gr.Textbox(label="Similarity score between the input image and input text")],
+                     title=title,
+                     description=description,
+                     examples=examples
+                     )
+ demo.launch()
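
A quick way to sanity-check the app outside the Gradio UI is to call inference() directly. This is a minimal sketch, assuming the definitions above (model, processor, inference) have already been executed in an interactive session before demo.launch(), Pillow is installed, and the example images added below are present:

```python
from PIL import Image

# Score one example image against two comma-separated prompts, as the Textbox expects.
img = Image.open("examples/test_0.jpg")
print(inference(img, "A man and a dog, A man wearing a blue coat with a dog inside"))
# Prints two comma-separated probabilities (one per prompt) that sum to ~1.
```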
examples/test_0.jpg ADDED
examples/test_1.jpg ADDED
examples/test_2.jpg ADDED
examples/test_3.jpg ADDED
examples/test_4.jpg ADDED
examples/test_5.jpg ADDED
examples/test_6.jpg ADDED
examples/test_7.jpg ADDED
examples/test_8.jpg ADDED
examples/test_9.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,4 @@
+
+ numpy
+ torch
+ transformers