File size: 2,931 Bytes
a5aab08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import torch
import json
from PIL import Image
import pprint 
from tqdm import  tqdm
from multiprocessing import Pool, cpu_count


from chat import MiniCPMVChat, img2base64



def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is opened in text mode and decoded as UTF-8; whatever
    ``json.load`` produces (dict, list, ...) is returned unchanged.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def write_json(file_path, data):
    """Write ``data`` to ``file_path`` as indented UTF-8 JSON, atomically.

    The document is first serialized to a sibling ``<file_path>.tmp`` file and
    only moved over ``file_path`` with ``os.replace`` once serialization has
    finished. If the process is interrupted or ``json.dump`` raises part-way
    through, any file already present at ``file_path`` is left untouched
    instead of being truncated.

    Args:
        file_path: Destination path (``str`` or ``os.PathLike``).
        data: Any JSON-serializable object.

    Raises:
        TypeError: If ``data`` contains a value ``json.dump`` cannot encode.
        OSError: If the temporary file cannot be written or moved into place.
    """
    tmp_path = f"{os.fspath(file_path)}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            # ensure_ascii=False keeps non-ASCII text human-readable on disk.
            json.dump(data, file, ensure_ascii=False, indent=4)
        os.replace(tmp_path, file_path)
    except BaseException:
        # Do not leave a half-written temp file behind; the original error
        # is always re-raised.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

def preprocess_data(data, path_base):
    """Attach a base64-encoded copy of each item's image, in place.

    For every item, ``item['image']`` is joined onto ``path_base`` and the
    resulting path is passed to ``img2base64``; the result is stored under
    the new key ``'image_base64'`` (the original ``'image'`` entry is kept).
    Presumably this lets later steps avoid re-reading the image files.

    Returns the same ``data`` object that was passed in.
    """
    for record in data:
        full_path = os.path.join(path_base, record['image'])
        encoded = img2base64(full_path)
        record['image_base64'] = encoded
    return data



def _ask_about_image(image_path, question):
    """Ask the module-level ``chat_model`` one question about one image.

    The image at ``image_path`` is encoded with ``img2base64``; the question
    is wrapped in a single user message, and the message list is passed
    JSON-encoded under the ``"question"`` key next to the encoded image.

    Returns whatever ``chat_model.chat`` returns.
    """
    image_b64 = img2base64(image_path)
    messages = [{"role": "user", "content": question}]
    payload = {"image": image_b64, "question": json.dumps(messages)}
    return chat_model.chat(payload)


def chat_minicpm_application(image_path):
    """Ask the model to list the interactive applications shown in the image.

    Args:
        image_path: Path of the image file to send.

    Returns:
        The model's answer, as returned by ``chat_model.chat``.
    """
    # Whitespace inside the prompt is kept exactly as originally sent.
    qs = (
        "  \n"
        "    List the names and locations of all interactive applications in the image, as well as their functionality and potential applications.\n"
        "    "
    )
    return _ask_about_image(image_path, qs)


def chat_minicpm_content(image_path):
    """Ask the model for a description of the image content.

    Args:
        image_path: Path of the image file to send.

    Returns:
        The model's answer, as returned by ``chat_model.chat``.
    """
    # Whitespace inside the prompt is kept exactly as originally sent.
    qs = (
        "\n"
        "    Describe the content of this image.\n"
        "    "
    )
    return _ask_about_image(image_path, qs)


def chat_minicpm_mind(image_path):
    """Ask the model why the green-framed area of the image was clicked.

    Args:
        image_path: Path of the image file to send.

    Returns:
        The model's answer, as returned by ``chat_model.chat``.
    """
    # Whitespace inside the prompt is kept exactly as originally sent.
    qs = (
        "\n"
        "    The green frame in the picture represents the situation of clicking, need to explain why click in the corresponding area. Answer template: The green box ....\n"
        "    "
    )
    return _ask_about_image(image_path, qs)



# Seed torch's RNG before the model is constructed.
torch.manual_seed(0)
chat_model = MiniCPMVChat('/code/Model/MiniCPM-Llama3-V-2_5')
path_base = '/code/Auto-GUI/dataset/'
output_path = '/code/MiniCPM-V/general_blip_train_llava_coco_caption_mind2.json'

# Keep only the '#DUAL_POINT#' records, then skip the first 17370 of them.
# NOTE(review): the offset looks like a resume point from an earlier run -- confirm.
data = read_json("/code/Auto-GUI/dataset/mind/general_blip_train_llava_coco.json")
data = [entry for entry in data if entry['action_type'] == '#DUAL_POINT#']
data = data[17370:]

# Each record receives three model answers, requested in this order.
caption_steps = (
    ('application', chat_minicpm_application),
    ('content', chat_minicpm_content),
    ('mind', chat_minicpm_mind),
)

# Counting starts at 1 so the checkpoint test fires after every 100th record.
for processed, record in enumerate(tqdm(data), start=1):
    img_path = path_base + record['image']
    for field, ask in caption_steps:
        record[field] = ask(img_path)

    # Checkpoint the whole list every 100 records.
    if processed % 100 == 0:
        write_json(output_path, data)

# Final save so a trailing partial batch (fewer than 100 records) is kept too.
write_json(output_path, data)