import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import argparse
import base64
import json
import time
from io import BytesIO

import cv2
import numpy as np
from PIL import Image
from openai import OpenAI

from datasets import build_dataset
import opts

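# Typical invocation (the script filename below is illustrative; dataset-related
# flags come from opts.get_args_parser()):
#   OPENAI_API_KEY=<your key> python generate_a2d_numbered_captions.py \
#       --save_caption_path mbench_a2d/numbered_captions.json
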
def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
    """Overlay the instance mask on the frame and return the result as a base64-encoded JPEG.

    The object's boundary is always drawn in red; an optional translucent fill
    (color_mask) and a numeric instance-ID label at the mask centroid
    (label_number) can be added. text_query is accepted for interface
    consistency but is not used for drawing.
    """
    if color_mask:
        # Blend a translucent red fill over the masked region.
        alpha = 0.1
        colored_mask = np.zeros_like(frame)
        colored_mask[mask == 1] = [255, 0, 0]
        frame[mask == 1] = (
            (1 - alpha) * frame[mask == 1] +
            alpha * colored_mask[mask == 1]
        )

    # Draw the object's outer boundary.
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)

    if label_number and len(contours) > 0:
        # Place the instance ID at the centroid of the largest contour.
        largest_contour = max(contours, key=cv2.contourArea)
        M = cv2.moments(largest_contour)
        if M["m00"] != 0:
            center_x = int(M["m10"] / M["m00"])
            center_y = int(M["m01"] / M["m00"])
        else:
            center_x, center_y = 0, 0

        font = cv2.FONT_HERSHEY_SIMPLEX
        text = str(instance_id)
        font_scale = 0.6
        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
        text_x = center_x - text_size[0] // 2  # center the label horizontally on the centroid
        text_y = center_y

        # Dark background rectangle so the white label stays readable on any frame.
        rect_start = (text_x - 5, text_y - text_size[1] - 5)
        rect_end = (text_x + text_size[0] + 5, text_y)
        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
        cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

    # Encode the annotated frame as a base64 JPEG string for the API request.
    buffer = BytesIO()
    frame = Image.fromarray(frame)
    frame.save(buffer, format='jpeg')
    buffer.seek(0)
    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")

    return encoded_frame
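
# Illustrative sketch (not part of the original pipeline): decode the base64
# string produced by mark_object_and_encode back into a JPEG file, e.g. to
# spot-check the overlay. The helper name and default output path are placeholders.
def _debug_save_marked_frame(encoded_frame, out_path='marked_frame_preview.jpg'):
    with open(out_path, 'wb') as f:
        f.write(base64.b64decode(encoded_frame))
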
def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
    """Ask an OpenAI vision model for an action-centric caption of the marked object.

    The frame is annotated via mark_object_and_encode and sent together with the
    prompt and the original text query. Returns the caption string, or None if no
    valid "1. ..." answer is produced within MAX_RETRIES attempts.
    """
    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)

    captioner = OpenAI()

    dense_caption_prompt = f"""
    You are a visual assistant analyzing a single frame of a video.
    In the given frame, I labeled 1 object by marking it with a bright numeric ID at its center and drawing its boundary.
    I also give you a text query describing the marked object.
    I want to use your expressions to create an **action-centric referring expression** dataset.
    Based on the frame and the text query, please describe the marked object using **clearly observable** and **specific** actions.

    ---
    ## Guidelines:
    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
    3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
    7. Base your description on these action definitions:
       - Avoid using the terms 'minimal' or 'slightly'.
       - General body movement, body position, or a prominent pattern (e.g., "lifting head up", "facing towards", "showing its back").
       - Details such as motion and intention, facial expression, and object manipulation.
       - Movements involving objects or other entities, when they are prominent and observable; the expression should be specific.
         (e.g., "pushing another person" (O), "engaging with someone" (X), "interacting with another person" (X))

    ---
    ## Output Format:
    - For each labeled object, output **exactly one line**. Your answer should contain details and follow this format:
      object id. action-oriented description
      (e.g., 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)

    ### Example
    If the frame has 1 labeled bear, your output should look like:
    1. the bear reaching his right arm while leaning forward to capture the prey

    ---
    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
    **Do not include markdown** in the output.
    Keep in mind that you should not group the objects (e.g., "2-5. people: xxx"); describe each object separately (one by one).
    For each labeled object, output a referring expression for that object id.
    """
    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"

    MAX_RETRIES = 2
    retry_count = 0
    caption = None

    while retry_count < MAX_RETRIES:
        response = captioner.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_with_text_query,
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )

        caption = response.choices[0].message.content.strip()
        caption_lower = caption.lower()
        # Accept the answer only if it follows the "1. ..." format and is not a refusal.
        if caption_lower.startswith("1.") and not any(
            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
        ):
            break
        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
        retry_count += 1
        time.sleep(2)

    if retry_count == MAX_RETRIES:
        caption = None
        print("Max retries reached. Caption generation failed.")

    return caption
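
# Illustrative sketch (an assumption, not part of the original code): the prompt
# asks the model to answer as "1. <description>", so a caller may want to strip
# that leading index before using the caption. The helper name is hypothetical.
def _strip_caption_index(caption):
    if caption is None:
        return None
    return caption.split(".", 1)[1].strip() if "." in caption else caption.strip()
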
if __name__ == "__main__":
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
    args = parser.parse_args()

    train_dataset = build_dataset('a2d', image_set='train', args=args)
    text_annotations = train_dataset.text_annotations

    all_captions = {}

    # The OpenAI client reads the key from the OPENAI_API_KEY environment
    # variable; never hard-code API keys in source files.
    if not os.environ.get('OPENAI_API_KEY'):
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this script.")

    first_text_query = ""
    for idx in range(300):
        imgs, target = train_dataset[idx]
        frames_idx = target['frames_idx'].tolist()
        text_query, vid_id, frame_id, instance_id = text_annotations[idx]

        # Skip consecutive annotations that repeat the same text query.
        if text_query == first_text_query:
            continue
        first_text_query = text_query
print(f"------------vid id: {vid_id}, frame id: {frame_id}, instance id: {instance_id}", flush=True) |
|
|
|
frame_id = frame_id - 1 |
|
frame_order = frames_idx.index(frame_id) |
|
|
|
frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy() |
|
mask = target['masks'].numpy().astype(np.uint8).squeeze() |
|
|
|
caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini') |
|
|
|
if vid_id in all_captions: |
|
if frame_id in all_captions[vid_id]: |
|
all_captions[vid_id][frame_id][instance_id] = caption |
|
else: |
|
all_captions[vid_id][frame_id] = {instance_id : caption} |
|
else: |
|
all_captions[vid_id] = {frame_id : {instance_id: caption}} |
|
|
|
if idx % 50 == 0: |
|
with open(args.save_caption_path, 'w') as file: |
|
json.dump(all_captions, file, indent=4) |
|
|
|
print("Finished!", flush=True) |
|
|
|
with open(args.save_caption_path, 'w') as file: |
|
json.dump(all_captions, file, indent=4) |
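
    # For reference, the saved JSON follows the nesting built above (JSON turns
    # integer keys such as frame_id into strings):
    # {
    #     "<vid_id>": {
    #         "<frame_id>": {
    #             "<instance_id>": "1. <action-centric description>"
    #         }
    #     }
    # }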