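"""Annotate Auto-GUI screenshots with the interactable elements they contain.

For each screenshot in the BLIP caption split, the script
1. asks MiniCPM-Llama3-V-2.5 to list the applications/elements that can be
   interacted with,
2. asks llama3-70b (via the llamaapi SDK) to reformat that free-form answer
   into a Python list named `elements`, and
3. writes both the raw answer ("icon_list_raw") and the parsed list
   ("icon_list") back into the dataset JSON after every episode.
"""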
import os
import json
import time
import pprint

# Select the GPU before torch (imported below) initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
from PIL import Image
from tqdm import tqdm
from openai import OpenAI
from llamaapi import LlamaAPI

from chat import MiniCPMVChat, img2base64


def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data


def write_json(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


# Initialize the Llama API SDK. The key is read from the environment rather
# than being hardcoded, so it is not committed to the repository. The GPT
# path below likewise expects OPENAI_API_KEY to be set in the environment.
llama = LlamaAPI(os.environ["LLAMA_API_KEY"])


# client = OpenAI()
# def gpt_response(sample):
#     completion = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         # model="gpt-4",
#         # model="gpt-4-1106-vision-preview",
#         messages=[
#             {"role": "system", "content": ""},
#             {"role": "user", "content": sample}
#         ]
#     )
#     return completion.choices[0].message.content



# Load the MiniCPM-Llama3-V-2.5 model used to enumerate the interactable
# elements in each screenshot.
torch.manual_seed(0)
chat_model = MiniCPMVChat('/code/ICLR_2024/Model/MiniCPM-Llama3-V-2_5')


# Quick smoke test: run the model once on a sample screenshot before the
# main loop.
image_path = '/code/ICLR_2024/SeeClick/output_image_27.png'
# image = Image.open(image_path)
# image.show()

qs = """
List all the application names and locations in the image that can be interacted with; the result should be like a list
"""

im_64 = img2base64(image_path)
msgs = [{"role": "user", "content": qs}]
inputs = {"image": im_64, "question": json.dumps(msgs)}
answer = chat_model.chat(inputs)

data = read_json("/code/ICLR_2024/Auto-GUI/dataset/blip/single_blip_train_llava_10000_caption_elements_llama3_70b.json")
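# Assumed record shape (inferred from the fields accessed below):
#   {"image": "blip/single_texts_splits/<episode>_<step>.png",
#    "caption": "...",
#    "conversations": [{"value": "... goal ..."}, ...]}
# Each record gains "icon_list_raw" and "icon_list" fields in the loop below.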


# Map each image path to its index in `data` for fast lookup.
retrieval_dict = {}
for index, i in enumerate(data):
    retrieval_dict[i['image']] = index

path = '/code/ICLR_2024/Auto-GUI/dataset/'
# 'blip/single_texts_splits/<episode>_<step>.png' -> '<episode>_<step>'
image_id = [x['image'].split('/')[2].split('.')[0] for x in data]

# Group step ids by episode: '123_4' -> key '123', step '4'.
all_pair_id = {}
for i in image_id:
    parts = i.split('_')
    all_pair_id.setdefault(parts[0], []).append(parts[1])

# dict preserves insertion order, so the resume slice below is reproducible;
# the earlier list(set(...)) ordering could change between runs.
all_pair_key = list(all_pair_id.keys())
path2 = 'blip/single_texts_splits/'
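
# A minimal sketch of a safer alternative to the exec()-based parsing used in
# the loop below: keep only the bracketed list in the model reply and parse it
# with ast.literal_eval, which cannot run arbitrary code. The reply format
# ("elements = [...]") is an assumption based on the refine prompt; this
# helper is not wired into the loop.
import ast

def parse_elements(reply):
    """Extract the list from a reply of the form 'elements = [...]'."""
    start, end = reply.find('['), reply.rfind(']')
    if start == -1 or end == -1:
        raise ValueError("no list found in model reply")
    return ast.literal_eval(reply[start:end + 1])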


# [770:] skips the first 770 episodes, presumably already processed in an
# earlier run.
for i in tqdm(all_pair_key[770:]):

    num_list = all_pair_id[i]
    for j in num_list:

        retrieval_path = path2 + i + '_' + j + '.png'
        new_path = path + path2 + i + '_' + j + '.png'
        ids = retrieval_dict[retrieval_path]

        image_path = path + data[ids]['image']
        caption = data[ids]['caption']
        previous = data[ids]['conversations'][0]['value']

        # The instruction text carries the episode goal after the word "goal";
        # `task` is extracted here but not used further below.
        previous = previous.lower()
        task = previous.split('goal')[1]
 
        Demo_prompt_step1 = """
        List all the application names and locations in the image that can be interacted with; the result should be like a list
        """

        im_64 = img2base64(image_path)
        msgs = [{"role": "user", "content": Demo_prompt_step1}]
        inputs = {"image": im_64, "question": json.dumps(msgs)}
        answer = chat_model.chat(inputs)

        # Keep the raw answer before it is reformatted into a Python list.
        data[ids]['icon_list_raw'] = answer
        pprint.pprint(answer)

        prompt = """ ##### refine it to a list, list name must be elements , just like: 
        elements = [
            "Newegg",
            "Newegg CEO",
            "Newegg customer service",
            "Newegg founder",
            "Newegg promo code",
            "Newegg return policy",
            "Newegg revenue",
            "Newegg military discounts"] 

        Answer the python list only! 
        ##### """
        
        import time
        time.sleep(2)   

        api_request_json = {
            "model": "llama3-70b",
            "messages": [
                {"role": "system", "content": "You are an assistant that will handle the corresponding text formatting for me."},
                {"role": "user", "content": answer + prompt},
            ],
            "max_tokens": 1024
        }

        try:
            # new_answer = gpt_response(answer + prompt)  # GPT-4 version
            response = llama.run(api_request_json)
            new_answer = response.json()['choices'][0]['message']['content']
            print('======================================================')
            pprint.pprint(new_answer)
            print('======================================================')
        except Exception as e:
            print(f"Error in Llama API generation: {e}")
            time.sleep(30)  # back off after a failure, then skip this sample
            continue
         
        try:
            # The model is asked to reply with `elements = [...]`. Run it in a
            # fresh namespace so a stale `elements` from a previous iteration
            # is never reused if the reply defines nothing.
            namespace = {}
            exec(new_answer, namespace)
            data[ids]['icon_list'] = namespace['elements']
        except Exception as e:
            print(f"Error in setting data[ids]['icon_list']: {e}")
            continue

    # Checkpoint after every episode so an interrupted run loses little work.
    write_json('/code/ICLR_2024/Auto-GUI/dataset/blip/single_blip_train_llava_10000_caption_elements_llama3_70b.json', data)