seu-ebers committed
Commit 9c94ced · 1 Parent(s): 8ec71b6
Files changed (4)
  1. app.py +25 -0
  2. notebook.ipynb +158 -0
  3. old.py +53 -0
  4. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,25 @@
+ import streamlit as st
+ from PIL import Image
+ from transformers import pipeline
+
+ # Binary hot dog / not hot dog classifier from the Hugging Face Hub
+ classifier = pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")
+
+ st.title("Hot Dog? Or Not?")
+ file_name = st.file_uploader("Upload a hot dog candidate image")
+
+ if file_name is not None:
+     col1, col2 = st.columns(2)
+
+     image = Image.open(file_name)
+     col1.image(image, use_column_width=True)
+     predictions = classifier(image)
+
+     # Show each label with its probability next to the uploaded image
+     col2.header("Probabilities")
+     for p in predictions:
+         col2.subheader(f"{p['label']}: {round(p['score'] * 100, 1)}%")
notebook.ipynb ADDED
@@ -0,0 +1,158 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "initial_id",
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# Load Image to Text model\n",
+ "from transformers import AutoProcessor, AutoModelForCausalLM\n",
+ "import requests\n",
+ "\n",
+ "image_processor = AutoProcessor.from_pretrained(\"microsoft/git-base\")\n",
+ "image_to_text_model = AutoModelForCausalLM.from_pretrained(\"microsoft/git-base\")"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "# Load Translation model (English -> Portuguese)\n",
+ "# NOTE: swapped to mBART-50 here: translate() below uses\n",
+ "# tokenizer.lang_code_to_id['pt_XX'], which the original google-t5/t5-small\n",
+ "# tokenizer does not provide.\n",
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
+ "translation_model = AutoModelForSeq2SeqLM.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")"
+ ],
+ "id": "be52bb44374be3a1"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "def generate_caption(image):\n",
+ "    pixel_values = image_processor(images=image, return_tensors=\"pt\").pixel_values\n",
+ "    generated_ids = image_to_text_model.generate(pixel_values=pixel_values, max_length=200)\n",
+ "    generated_caption = image_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
+ "\n",
+ "    return generated_caption"
+ ],
+ "id": "eb994d7ef0dc73f6"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "import torch\n",
+ "\n",
+ "def translate(text):\n",
+ "    tokenizer.src_lang = 'en_XX'  # the captions produced above are English\n",
+ "    inputs = tokenizer(text, return_tensors='pt')\n",
+ "    input_ids = inputs.input_ids\n",
+ "    attention_mask = inputs.attention_mask\n",
+ "\n",
+ "    if torch.cuda.is_available():\n",
+ "        input_ids = input_ids.to('cuda')\n",
+ "        attention_mask = attention_mask.to('cuda')\n",
+ "        model = translation_model.to('cuda')\n",
+ "    else:\n",
+ "        print('No NVIDIA GPU available, translation will run on the CPU')\n",
+ "        model = translation_model\n",
+ "\n",
+ "    # Force Portuguese as the target language\n",
+ "    output = model.generate(input_ids, attention_mask=attention_mask, forced_bos_token_id=tokenizer.lang_code_to_id['pt_XX'])\n",
+ "    translated = tokenizer.decode(output[0], skip_special_tokens=True)\n",
+ "\n",
+ "    return translated"
+ ],
+ "id": "f9742a337b32cc1"
+ },
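+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "# Added sanity check (not in the original notebook): translate a fixed\n",
+ "# English sentence before running the full caption -> translate flow.\n",
+ "print(translate('two cats are sleeping on a pink couch'))"
+ ],
+ "id": "added_sanity_check_cell"
+ },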
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "# Load a test image from a URL (swap in a commented URL to try other images)\n",
+ "from PIL import Image\n",
+ "\n",
+ "img_url = 'http://images.cocodataset.org/val2017/000000039769.jpg'\n",
+ "# img_url = 'https://farm4.staticflickr.com/3733/9000662079_ce3599d0d8_z.jpg'\n",
+ "# img_url = 'https://farm4.staticflickr.com/3088/5793281956_2a15b2559c_z.jpg'\n",
+ "# img_url = 'https://farm5.staticflickr.com/4073/4816939054_844feb0078_z.jpg'\n",
+ "\n",
+ "image = Image.open(requests.get(img_url, stream=True).raw)"
+ ],
+ "id": "97f3e60bca81b195"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "caption = generate_caption(image)\n",
+ "\n",
+ "print(caption)"
+ ],
+ "id": "1a4c1ed0fc31fd60"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "translated_caption = translate(caption)\n",
+ "\n",
+ "print(translated_caption)"
+ ],
+ "id": "a4d4f92f2c0b3922"
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
old.py ADDED
@@ -0,0 +1,53 @@
+ import requests
+ import torch
+ import streamlit as st
+ from transformers import pipeline, AutoProcessor, LlavaForConditionalGeneration
+ from PIL import Image
+
+ classifier = pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")
+ # processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+ # model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
+
+ st.title("Hot Dog? Or Not?")
+ file_name = st.file_uploader("Upload a hot dog candidate image")
+
+ if file_name is not None:
+     col1, col2 = st.columns(2)
+
+     image = Image.open(file_name)
+     col1.image(image, use_column_width=True)
+     predictions = classifier(image)
+
+     col2.header("Probabilities")
+     for p in predictions:
+         col2.subheader(f"{p['label']}: {round(p['score'] * 100, 1)}%")
+
+ # img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
+ # raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
+ #
+ # question = "how many dogs are in the picture?"
+ # inputs = processor(raw_image, question, return_tensors="pt")
+ #
+ # out = model.generate(**inputs)
+ # print(processor.decode(out[0], skip_special_tokens=True).strip())
+
+ #
+ # model_id = "llava-hf/llava-1.5-7b-hf"
+ #
+ # prompt = "USER: <image>\nWhat are these?\nASSISTANT:"
+ # image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ #
+ # model = LlavaForConditionalGeneration.from_pretrained(
+ #     model_id,
+ #     torch_dtype=torch.float16,
+ #     low_cpu_mem_usage=True,
+ # ).to(0)
+ #
+ # processor = AutoProcessor.from_pretrained(model_id)
+ #
+ #
+ # raw_image = Image.open(requests.get(image_file, stream=True).raw)
+ # inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)
+ #
+ # output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
+ # print(processor.decode(output[0][2:], skip_special_tokens=True))
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ transformers
+ torch
+ accelerate
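+ # Assumed additions (not in the original commit): app.py imports streamlit and
+ # PIL directly; Streamlit Spaces preinstall streamlit, but local runs need both.
+ streamlit
+ Pillow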