|
import os |
|
import torch |
|
import json |
|
from PIL import Image |
|
import pprint |
|
from tqdm import tqdm |
|
from multiprocessing import Pool, cpu_count |
|
|
|
|
|
from chat import MiniCPMVChat, img2base64 |
|
|
|
|
|
|
|
def read_json(file_path):
    """Load a UTF-8 encoded JSON document and return the decoded object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
|
|
|
def write_json(file_path, data):
    """Serialise ``data`` as pretty-printed UTF-8 JSON at ``file_path``.

    The document is first written to a sibling ``<file_path>.tmp`` file and
    then moved over the destination with ``os.replace``. Opening the
    destination directly in ``'w'`` mode would truncate it immediately, so a
    serialisation error or an interrupted run would destroy the previously
    saved content; with the temp-file swap the old file stays intact until
    the new one is complete.

    Args:
        file_path: Destination path of the JSON file.
        data: JSON-serialisable object to store.

    Raises:
        TypeError: If ``data`` contains an object ``json`` cannot serialise.
        OSError: If the temp file cannot be written or moved into place.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        os.replace(tmp_path, file_path)
    except BaseException:
        # Do not leave a half-written temp file behind; the original error
        # is re-raised unchanged.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
        raise
|
|
|
def preprocess_data(data, path_base):
    """Attach a base64-encoded copy of each item's image, to cut repeated I/O.

    For every item, the image at ``path_base`` joined with ``item['image']``
    is encoded with ``img2base64`` and stored under ``item['image_base64']``.
    The items are modified in place; the original ``'image'`` key is kept.

    Args:
        data: Iterable of dict-like items, each with an ``'image'`` entry.
        path_base: Directory that the ``'image'`` entries are joined onto.

    Returns:
        The same ``data`` object that was passed in.
    """
    for entry in data:
        entry['image_base64'] = img2base64(
            os.path.join(path_base, entry['image'])
        )
    return data
|
|
|
|
|
|
|
def _ask_minicpm(image_path, question):
    """Ask the module-level ``chat_model`` one single-turn question about an image.

    Shared implementation behind the ``chat_minicpm_*`` functions, which
    differ only in the prompt they send.

    Args:
        image_path: Path of the image file, encoded with ``img2base64``.
        question: Prompt text sent as the single user message.

    Returns:
        Whatever ``chat_model.chat`` returns for the request.
    """
    image_b64 = img2base64(image_path)
    messages = [{"role": "user", "content": question}]
    # The message list is handed over as a JSON string under "question".
    payload = {"image": image_b64, "question": json.dumps(messages)}
    return chat_model.chat(payload)


# NOTE(review): the prompts below keep one leading and one trailing newline,
# reconstructed from the original triple-quoted literals; the exact inner
# whitespace of those literals should be confirmed against the repository copy.


def chat_minicpm_application(image_path):
    """Ask the model to list the interactive applications visible in the image.

    Args:
        image_path: Path of the image file to query.

    Returns:
        The model's answer as returned by ``chat_model.chat``.
    """
    qs = (
        "\n"
        "List the names and locations of all interactive applications in the image, as well as their functionality and potential applications.\n"
    )
    return _ask_minicpm(image_path, qs)


def chat_minicpm_content(image_path):
    """Ask the model for a description of the image content.

    Args:
        image_path: Path of the image file to query.

    Returns:
        The model's answer as returned by ``chat_model.chat``.
    """
    qs = (
        "\n"
        "Describe the content of this image.\n"
    )
    return _ask_minicpm(image_path, qs)


def chat_minicpm_mind(image_path):
    """Ask the model to explain the click marked by the green frame in the image.

    Args:
        image_path: Path of the image file to query.

    Returns:
        The model's answer as returned by ``chat_model.chat``.
    """
    qs = (
        "\n"
        "The green frame in the picture represents the situation of clicking, need to explain why click in the corresponding area. Answer template: The green box ....\n"
    )
    return _ask_minicpm(image_path, qs)
|
|
|
|
|
|
|
# Batch driver: annotate every '#DUAL_POINT#' record with three MiniCPM-V
# answers ('application', 'content', 'mind') and save the result as JSON.

torch.manual_seed(0)

chat_model = MiniCPMVChat('/code/Model/MiniCPM-Llama3-V-2_5')

path_base = '/code/Auto-GUI/dataset/'

# Single definition of the output file used by every save below.
output_path = '/code/MiniCPM-V/general_blip_train_llava_coco_caption_mind2.json'

data = read_json("/code/Auto-GUI/dataset/mind/general_blip_train_llava_coco.json")

# Keep only dual-point actions and skip the first 17370 of them.
# NOTE(review): 17370 is presumably the resume offset of an earlier run - confirm.
data = [line for line in data if line['action_type'] == '#DUAL_POINT#'][17370:]

try:
    for idx, record in enumerate(tqdm(data), 1):
        img_path = path_base + record['image']

        record['application'] = chat_minicpm_application(img_path)
        record['content'] = chat_minicpm_content(img_path)
        record['mind'] = chat_minicpm_mind(img_path)

        # Checkpoint the whole list every 100 records.
        if idx % 100 == 0:
            write_json(output_path, data)
finally:
    # Runs on normal completion and on errors/interrupts alike, so the
    # records annotated since the last checkpoint are not lost.
    write_json(output_path, data)
|
|
|
|