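# Annotate the Auto-GUI caption dataset with lists of interactable UI elements.
# For each screenshot, MiniCPM-Llama3-V-2.5 is prompted to enumerate the elements it can
# interact with, and Llama-3-70B (via the llamaapi SDK) reformats that free-form answer
# into a Python list that is written back into the dataset JSON.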
import os
import json

os.environ["CUDA_VISIBLE_DEVICES"] = "1"


def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data


def write_json(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

from openai import OpenAI  # only needed if the commented-out GPT fallback below is re-enabled
import pprint
import time
from llamaapi import LlamaAPI

# Initialize the SDK. The API keys are redacted from the source; LLAMA_API_KEY is a
# placeholder environment variable name, set it to your llamaapi token.
llama = LlamaAPI(os.environ.get("LLAMA_API_KEY", ""))
# os.environ["OPENAI_API_KEY"] = "..."  # set externally if the GPT fallback is re-enabled
# client = OpenAI()
# def reponse(sample):
#     completion = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         # model="gpt-4",
#         # model="gpt-4-1106-vision-preview",
#         messages=[
#             {"role": "system", "content": ""},
#             {"role": "user", "content": sample}
#         ]
#     )
#     # print(completion.choices[0].message.content)
#     return completion.choices[0].message.content
#     # return completion
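
# MiniCPM-Llama3-V-2.5, loaded from a local checkpoint, is the vision-language model used to
# read the GUI screenshots.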
from chat import MiniCPMVChat, img2base64
import torch
from PIL import Image

torch.manual_seed(0)
chat_model = MiniCPMVChat('/code/ICLR_2024/Model/MiniCPM-Llama3-V-2_5')

image_path = '/code/ICLR_2024/SeeClick/output_image_27.png'
# image = Image.open(image_path)
# image.show()
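
# Quick sanity check: ask the model to enumerate interactable elements in one screenshot
# before running the full annotation loop.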
qs = """
List all the application names and locations in the image that can be interacted with; the result should be like a list
"""
im_64 = img2base64(image_path)
msgs = [{"role": "user", "content": qs}]
inputs = {"image": im_64, "question": json.dumps(msgs)}
answer = chat_model.chat(inputs)
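
# Load the caption dataset and build a lookup from each record's image path to its index.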
data = read_json("/code/ICLR_2024/Auto-GUI/dataset/blip/single_blip_train_llava_10000_caption_elements_llama3_70b.json")
retrival_dict = {}
for index, i in enumerate(data):
    retrival_dict[i['image']] = index
path = '/code/ICLR_2024/Auto-GUI/dataset/'
image_id = [ x['image'].split('/')[2].split('.')[0] for x in data]
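
# Image ids have the form '<key>_<step>'; collect all step ids under their key.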
all_pair_id = {}
all_pair_key = []
for i in image_id:
    key = i.split('_')[0]
    all_pair_id[key] = []
    all_pair_key.append(key)
for i in image_id:
    key = i.split('_')[0]
    value = i.split('_')[1]
    all_pair_id[key].append(value)
all_pair_key = list(set(all_pair_key))
path2 = 'blip/single_texts_splits/'
from tqdm import tqdm
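
# Main annotation loop: for every screenshot, ask MiniCPM to list the interactable elements,
# have Llama-3-70B normalize the free-form answer into a Python list, and store both the raw
# and parsed results back into the dataset JSON. The slice [770:] skips keys that were
# presumably already handled by an earlier run.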
for i in tqdm(all_pair_key[770:]):
    num_list = all_pair_id[i]
    for j in num_list:
        retival_path = path2 + i + '_' + j + '.png'
        new_path = path + path2 + i + '_' + j + '.png'
        ids = retrival_dict[retival_path]
        image_path = path + data[ids]['image']
        caption = data[ids]['caption']
        Previous = data[ids]['conversations'][0]['value']
        Previous = Previous.lower()
        task = Previous.split('goal')[1]

        # Step 1: ask MiniCPM to enumerate the interactable elements in this screenshot.
        Demo_prompt_step1 = """
        List all the application names and locations in the image that can be interacted with; the result should be like a list
        """
        im_64 = img2base64(image_path)
        msgs = [{"role": "user", "content": Demo_prompt_step1}]
        inputs = {"image": im_64, "question": json.dumps(msgs)}
        answer = chat_model.chat(inputs)
        data[ids]['icon_list_raw'] = answer
        pprint.pprint(answer)

        # Step 2: ask Llama-3-70B to reformat the free-form answer into a Python list named `elements`.
        prompt = """ ##### refine it to a list, list name must be elements, just like:
        elements = [
            "Newegg",
            "Newegg CEO",
            "Newegg customer service",
            "Newegg founder",
            "Newegg promo code",
            "Newegg return policy",
            "Newegg revenue",
            "Newegg military discounts"]
        Answer the python list only!
        ##### """
        time.sleep(2)
        api_request_json = {
            "model": "llama3-70b",
            "messages": [
                {"role": "system", "content": "You are an assistant that will handle the corresponding text formatting for me."},
                {"role": "user", "content": answer + prompt},
            ],
            "max_tokens": 1024
        }
        try:
            # new_answer = reponse(answer + prompt)  # GPT-4 version (see the commented-out helper above)
            response = llama.run(api_request_json)
            new_answer = response.json()['choices'][0]['message']['content']
            print('======================================================')
            pprint.pprint(new_answer)
            print('======================================================')
        except Exception as e:
            print(f"Error in LLAMA API Generation : {e}")
            time.sleep(30)
            continue
        try:
            # The reply is expected to be an assignment "elements = [...]"; execute it in an
            # isolated namespace so a stale `elements` from a previous iteration is never reused.
            exec_ns = {}
            exec(new_answer, exec_ns)
            data[ids]['icon_list'] = exec_ns['elements']
        except Exception as e:
            print(f"Error in setting data[ids]['icon_list']: {e}")
            continue

        # Checkpoint the enriched dataset after every successfully processed screenshot.
        write_json('/code/ICLR_2024/Auto-GUI/dataset/blip/single_blip_train_llava_10000_caption_elements_llama3_70b.json', data)