"""Annotate GUI screenshots with MiniCPM-V generated descriptions.

For every '#DUAL_POINT#' record of the input JSON (after a fixed resume
offset), the model is asked three questions about the record's image and the
answers are stored on the record under the keys 'application', 'content' and
'mind'. The record list is written to the output JSON periodically and once
more when the run ends.
"""

import json
import os
import pprint
from multiprocessing import Pool, cpu_count

import torch
from PIL import Image
from tqdm import tqdm

from chat import MiniCPMVChat, img2base64

# Prompts sent to the model. The surrounding whitespace is part of the text
# that reaches the model, so it is kept exactly as it was.
_APPLICATION_PROMPT = (
    " List the names and locations of all interactive applications in the"
    " image, as well as their functionality and potential applications. "
)
_CONTENT_PROMPT = " Describe the content of this image. "
_MIND_PROMPT = (
    " The green frame in the picture represents the situation of clicking,"
    " need to explain why click in the corresponding area."
    " Answer template: The green box .... \n"
)

# Number of matching records skipped at the start of the run
# (presumably records handled by an earlier run -- confirm before changing).
RESUME_OFFSET = 17370
# Write the results to disk after this many processed records.
SAVE_EVERY = 100
OUTPUT_PATH = '/code/MiniCPM-V/general_blip_train_llava_coco_caption_mind2.json'


def read_json(file_path):
    """Return the parsed content of the UTF-8 JSON file at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def write_json(file_path, data):
    """Write *data* to *file_path* as UTF-8 JSON.

    Non-ASCII characters are written as-is and the output is indented by
    four spaces. An existing file is overwritten.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


def preprocess_data(data, path_base):
    """Add a base64 copy of each item's image to cut down on repeated I/O.

    For every item, ``item['image']`` is joined onto *path_base* and the
    result of ``img2base64`` for that path is stored under
    ``item['image_base64']``. Items are modified in place and the same
    *data* object is returned. Not called by the script below.
    """
    for item in data:
        img_path = os.path.join(path_base, item['image'])
        item['image_base64'] = img2base64(img_path)
    return data


def _ask_about_image(image_b64, question):
    """Send one user *question* about an already-encoded image to the model.

    The request has the same shape the three public helpers have always
    used: the encoded image under 'image' and a JSON-encoded single-message
    conversation under 'question'. Uses the module-level ``chat_model``.
    """
    msgs = [{"role": "user", "content": question}]
    inputs = {"image": image_b64, "question": json.dumps(msgs)}
    return chat_model.chat(inputs)


def chat_minicpm_application(image_path):
    """Ask the model to list the interactive applications in the image.

    Returns whatever ``chat_model.chat`` returns for the request.
    """
    return _ask_about_image(img2base64(image_path), _APPLICATION_PROMPT)


def chat_minicpm_content(image_path):
    """Ask the model to describe the content of the image.

    Returns whatever ``chat_model.chat`` returns for the request.
    """
    return _ask_about_image(img2base64(image_path), _CONTENT_PROMPT)


def chat_minicpm_mind(image_path):
    """Ask the model why the green-framed area of the image was clicked.

    Returns whatever ``chat_model.chat`` returns for the request.
    """
    return _ask_about_image(img2base64(image_path), _MIND_PROMPT)


torch.manual_seed(0)
chat_model = MiniCPMVChat('/code/Model/MiniCPM-Llama3-V-2_5')

path_base = '/code/Auto-GUI/dataset/'
data = read_json("/code/Auto-GUI/dataset/mind/general_blip_train_llava_coco.json")
data = [line for line in data if line['action_type'] == '#DUAL_POINT#'][RESUME_OFFSET:]

try:
    # Count from 1 so that idx equals the number of records completed,
    # which keeps the checkpoint test below simple.
    for idx, item in enumerate(tqdm(data), 1):
        # path_base ends with '/', so plain concatenation gives the full path.
        img_path = path_base + item['image']

        # Encode the image once and reuse it for all three questions instead
        # of re-reading and re-encoding the same file for each of them.
        image_b64 = img2base64(img_path)
        item['application'] = _ask_about_image(image_b64, _APPLICATION_PROMPT)
        item['content'] = _ask_about_image(image_b64, _CONTENT_PROMPT)
        item['mind'] = _ask_about_image(image_b64, _MIND_PROMPT)

        # Periodic checkpoint.
        if idx % SAVE_EVERY == 0:
            write_json(OUTPUT_PATH, data)
finally:
    # Final save: covers the records since the last checkpoint, and also
    # runs when the loop is interrupted so finished work is not lost.
    write_json(OUTPUT_PATH, data)