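# Annotate the Auto-GUI caption dataset with lists of interactable UI elements.
# For each screenshot, MiniCPM-Llama3-V-2.5 is prompted to enumerate the elements it can
# interact with, and Llama-3-70B (via the llamaapi SDK) reformats that free-form answer
# into a Python list that is written back into the dataset JSON.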
import os
import json

os.environ["CUDA_VISIBLE_DEVICES"] = "1"


def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data


def write_json(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

from openai import OpenAI  # only needed if the commented-out GPT fallback below is re-enabled
import pprint
import time
from llamaapi import LlamaAPI

# Initialize the SDK. The API keys are redacted from the source; LLAMA_API_KEY is a
# placeholder environment variable name, set it to your llamaapi token.
llama = LlamaAPI(os.environ.get("LLAMA_API_KEY", ""))
# os.environ["OPENAI_API_KEY"] = "..."  # set externally if the GPT fallback is re-enabled
# client = OpenAI()
# def reponse(sample):
#     completion = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         # model="gpt-4",
#         # model="gpt-4-1106-vision-preview",
#         messages=[
#             {"role": "system", "content": ""},
#             {"role": "user", "content": sample}
#         ]
#     )
#     # print(completion.choices[0].message.content)
#     return completion.choices[0].message.content
#     # return completion
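
# MiniCPM-Llama3-V-2.5, loaded from a local checkpoint, is the vision-language model used to
# read the GUI screenshots.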
from chat import MiniCPMVChat, img2base64
import torch
from PIL import Image

torch.manual_seed(0)
chat_model = MiniCPMVChat('/code/ICLR_2024/Model/MiniCPM-Llama3-V-2_5')

image_path = '/code/ICLR_2024/SeeClick/output_image_27.png'
# image = Image.open(image_path)
# image.show()
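
# Quick sanity check: ask the model to enumerate interactable elements in one screenshot
# before running the full annotation loop.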
qs = """
List all the application names and locations in the image that can be interacted with; the result should be like a list
"""
im_64 = img2base64(image_path)
msgs = [{"role": "user", "content": qs}]
inputs = {"image": im_64, "question": json.dumps(msgs)}
answer = chat_model.chat(inputs)
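
# Load the caption dataset and build a lookup from each record's image path to its index.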
data = read_json("/code/ICLR_2024/Auto-GUI/dataset/blip/single_blip_train_llava_10000_caption_elements_llama3_70b.json")
retrival_dict = {}
for index, i in enumerate(data):
    retrival_dict[i['image']] = index
path = '/code/ICLR_2024/Auto-GUI/dataset/'
image_id = [ x['image'].split('/')[2].split('.')[0] for x in data]
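
# Image ids have the form '<key>_<step>'; collect all step ids under their key.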
all_pair_id = {}
all_pair_key = []
for i in image_id:
    key = i.split('_')[0]
    all_pair_id[key] = []
    all_pair_key.append(key)
for i in image_id:
    key = i.split('_')[0]
    value = i.split('_')[1]
    all_pair_id[key].append(value)
all_pair_key = list(set(all_pair_key))
path2 = 'blip/single_texts_splits/'
from tqdm import tqdm
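
# Main annotation loop: for every screenshot, ask MiniCPM to list the interactable elements,
# have Llama-3-70B normalize the free-form answer into a Python list, and store both the raw
# and parsed results back into the dataset JSON. The slice [770:] skips keys that were
# presumably already handled by an earlier run.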
for i in tqdm(all_pair_key[770:]):
    num_list = all_pair_id[i]
    for j in num_list:
        retival_path = path2 + i + '_' + j + '.png'
        new_path = path + path2 + i + '_' + j + '.png'
        ids = retrival_dict[retival_path]
        image_path = path + data[ids]['image']
        caption = data[ids]['caption']
        Previous = data[ids]['conversations'][0]['value']
        Previous = Previous.lower()
        task = Previous.split('goal')[1]

        # Step 1: ask MiniCPM to enumerate the interactable elements in this screenshot.
        Demo_prompt_step1 = """
        List all the application names and locations in the image that can be interacted with; the result should be like a list
        """
        im_64 = img2base64(image_path)
        msgs = [{"role": "user", "content": Demo_prompt_step1}]
        inputs = {"image": im_64, "question": json.dumps(msgs)}
        answer = chat_model.chat(inputs)
        data[ids]['icon_list_raw'] = answer
        pprint.pprint(answer)

        # Step 2: ask Llama-3-70B to reformat the free-form answer into a Python list named `elements`.
        prompt = """ ##### refine it to a list, list name must be elements, just like:
        elements = [
            "Newegg",
            "Newegg CEO",
            "Newegg customer service",
            "Newegg founder",
            "Newegg promo code",
            "Newegg return policy",
            "Newegg revenue",
            "Newegg military discounts"]
        Answer the python list only!
        ##### """
        time.sleep(2)
        api_request_json = {
            "model": "llama3-70b",
            "messages": [
                {"role": "system", "content": "You are an assistant that will handle the corresponding text formatting for me."},
                {"role": "user", "content": answer + prompt},
            ],
            "max_tokens": 1024
        }
        try:
            # new_answer = reponse(answer + prompt)  # GPT-4 version (see the commented-out helper above)
            response = llama.run(api_request_json)
            new_answer = response.json()['choices'][0]['message']['content']
            print('======================================================')
            pprint.pprint(new_answer)
            print('======================================================')
        except Exception as e:
            print(f"Error in LLAMA API Generation : {e}")
            time.sleep(30)
            continue
        try:
            # The reply is expected to be an assignment "elements = [...]"; execute it in an
            # isolated namespace so a stale `elements` from a previous iteration is never reused.
            exec_ns = {}
            exec(new_answer, exec_ns)
            data[ids]['icon_list'] = exec_ns['elements']
        except Exception as e:
            print(f"Error in setting data[ids]['icon_list']: {e}")
            continue

        # Checkpoint the enriched dataset after every successfully processed screenshot.
        write_json('/code/ICLR_2024/Auto-GUI/dataset/blip/single_blip_train_llava_10000_caption_elements_llama3_70b.json', data)