Spaces:

Kayson
/

InstructDiffusion

Runtime error

File size: 26,174 Bytes

7ae68fe

# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao ([email protected])
# Modified by Zigang Geng ([email protected])
# ------------------------------------------------------------------------------

from __future__ import annotations

import logging
import os
import json
import copy
import math
import random
from pathlib import Path
from typing import Any

import cv2
import numpy as np
import torch
import torchvision
from einops import rearrange
from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from pycocotools.coco import COCO


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Palette used for the keypoint markers. Each value is the (R, G, B) triplet
# painted onto the RGB image, and each key is the colour name substituted
# into the text prompt through the ``{color}`` placeholder.
colors = {
    'red': (255, 0, 0),
    'green': (0, 255, 0),
    'blue': (0, 0, 255),
    'yellow': (255, 255, 0),
    'cyan': (0, 255, 255),
    'magenta': (255, 0, 255),
    'gray': (128, 128, 128),
    'white': (255, 255, 255),
    'black': (0, 0, 0)}


def readTXT(txt_path):
    """Read a text file and return its lines with surrounding whitespace removed.

    :param txt_path: path of the text file to read
    :return: list holding one stripped string per line of the file
    """
    stripped_lines = []
    with open(txt_path, 'r') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines


class PoseDataset(Dataset):
    """Base dataset pairing a person crop with a copy carrying coloured keypoint markers.

    Each sample consists of the warped crop, the same crop with filled
    circles drawn on a random subset of joints, and a text prompt naming
    the colour and joint of every marker.

    Subclasses are expected to fill ``self.db`` with records providing the
    keys ``image``, ``center``, ``scale``, ``joints_3d`` and
    ``joints_3d_vis``, and to set ``num_joints``, ``flip_pairs``,
    ``upper_body_ids`` and ``keypoints_type``.
    """

    def __init__(self, root, image_set, is_train, max_prompt_num=5, min_prompt_num=1,
        radius=10, size=256, transparency=0.0, sample_weight=1.0, transform=None):
        """
        :param root: dataset root directory
        :param image_set: name of the split, used by subclasses to locate annotations
        :param is_train: enables the random augmentations in ``__getitem__``
        :param max_prompt_num: upper bound on the number of joints marked per sample
        :param min_prompt_num: lower bound on the number of joints marked per sample
        :param radius: marker radius in pixels
        :param size: side length of the square crop
        :param transparency: weight of the original pixels under a marker
            (0.0 paints the marker fully opaque)
        :param sample_weight: multiplier applied to the reported dataset length
        :param transform: optional callable applied to the warped crop
        """
        self.sample_weight = sample_weight
        self.max_prompt_num = max_prompt_num
        self.min_prompt_num = min_prompt_num
        self.radius = radius
        self.transparency = transparency
        self.num_joints = 0
        self.pixel_std = 200
        self.flip_pairs = []
        self.parent_ids = []

        self.keypoints_type = {}

        self.is_train = is_train
        self.image_set = image_set
        self.root = root

        self.scale_factor = 0.35
        self.rotation_factor = 45
        self.flip = True
        self.num_joints_half_body = 8
        self.prob_half_body = 0.3

        self.image_size = np.array((size, size))
        self.heatmap_size = np.array((size, size))
        # The crop is square; half_body_transform reads this attribute, so it
        # must exist even for subclasses that never set it themselves.
        self.aspect_ratio = 1.0

        self.transform = transform
        self.db = []

        # Prompt templates, one per line. The path is relative, so it is
        # resolved against the current working directory.
        pose_diverse_prompt_path = 'dataset/prompt/prompt_pose.txt'
        with open(pose_diverse_prompt_path) as f:
            self.pose_diverse_prompt_list = [line.strip('\n') for line in f]

    def _get_db(self):
        """Build the list of sample records; implemented by subclasses."""
        raise NotImplementedError

    def evaluate(self, preds, output_dir, *args, **kwargs):
        """Evaluate predictions; implemented by subclasses."""
        raise NotImplementedError

    def half_body_transform(self, joints, joints_vis):
        """Compute a crop centre/scale covering only the upper or lower body.

        :param joints: [num_joints, 3] joint coordinates
        :param joints_vis: [num_joints, 3] visibility flags (column 0 is read)
        :return: ``(center, scale)``, or ``(None, None)`` when fewer than two
            joints are available for the chosen half
        """
        upper_joints = []
        lower_joints = []
        for joint_id in range(self.num_joints):
            if joints_vis[joint_id][0] > 0:
                if joint_id in self.upper_body_ids:
                    upper_joints.append(joints[joint_id])
                else:
                    lower_joints.append(joints[joint_id])

        # NOTE(review): randn() draws from a standard normal, which is below
        # 0.5 roughly 69% of the time, so the upper body is favoured. Kept
        # as-is to preserve the sampling distribution -- confirm whether a
        # uniform rand() was intended.
        if np.random.randn() < 0.5 and len(upper_joints) > 2:
            selected_joints = upper_joints
        else:
            selected_joints = lower_joints \
                if len(lower_joints) > 2 else upper_joints

        if len(selected_joints) < 2:
            return None, None

        selected_joints = np.array(selected_joints, dtype=np.float32)
        center = selected_joints.mean(axis=0)[:2]

        left_top = np.amin(selected_joints, axis=0)
        right_bottom = np.amax(selected_joints, axis=0)

        w = right_bottom[0] - left_top[0]
        h = right_bottom[1] - left_top[1]

        # Grow the shorter side so that the box matches the crop aspect ratio.
        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio

        scale = np.array(
            [
                w * 1.0 / self.pixel_std,
                h * 1.0 / self.pixel_std
            ],
            dtype=np.float32
        )

        # Leave a margin around the selected joints.
        scale = scale * 1.5

        return center, scale

    def __len__(self,):
        return int(len(self.db) * self.sample_weight)

    def __getitem__(self, idx):
        """Load, augment and mark one sample.

        :param idx: sample index in ``range(len(self))``
        :return: ``dict(edited=..., edit=dict(c_concat=..., c_crossattn=...))``
            where ``edited`` is the marked image and ``c_concat`` the plain
            crop, both rescaled from [0, 255] to [-1, 1] and rearranged to
            channel-first, and ``c_crossattn`` is the text prompt
        :raises ValueError: when the image file cannot be read
        """
        if self.sample_weight >= 1:
            idx = idx % len(self.db)
        else:
            # Each index stands for a window of 1/sample_weight records;
            # pick one of them at random.
            window = int(1 / self.sample_weight)
            idx = int(idx / self.sample_weight) + random.randint(0, window - 1)
            # Guard against float rounding pushing the index past the end.
            idx = min(idx, len(self.db) - 1)

        db_rec = copy.deepcopy(self.db[idx])

        image_file = db_rec['image']

        data_numpy = cv2.imread(
            image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
        )

        # Check before the colour conversion: cvtColor would raise an opaque
        # cv2.error on a missing image and hide the file name.
        if data_numpy is None:
            logger.error('=> fail to read {}'.format(image_file))
            raise ValueError('Fail to read {}'.format(image_file))

        data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB)

        joints = db_rec['joints_3d']
        joints_vis = db_rec['joints_3d_vis']

        c = db_rec['center']
        s = db_rec['scale']
        r = 0

        if self.is_train:
            if (np.sum(joints_vis[:, 0]) > self.num_joints_half_body
                and np.random.rand() < self.prob_half_body):
                c_half_body, s_half_body = self.half_body_transform(
                    joints, joints_vis
                )

                if c_half_body is not None and s_half_body is not None:
                    c, s = c_half_body, s_half_body

            sf = self.scale_factor
            rf = self.rotation_factor
            s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
            r = np.clip(np.random.randn()*rf, -rf*2, rf*2) \
                if random.random() <= 0.6 else 0

            if self.flip and random.random() <= 0.5:
                data_numpy = data_numpy[:, ::-1, :]
                joints, joints_vis = fliplr_joints(
                    joints, joints_vis, data_numpy.shape[1], self.flip_pairs)
                c[0] = data_numpy.shape[1] - c[0] - 1

        trans = get_affine_transform(c, s, r, self.image_size)
        image = cv2.warpAffine(
            data_numpy,
            trans,
            (int(self.image_size[0]), int(self.image_size[1])),
            flags=cv2.INTER_LINEAR)

        if self.transform:
            image = self.transform(image)

        # Move the visible joints into the coordinate frame of the crop.
        for i in range(self.num_joints):
            if joints_vis[i, 0] > 0.0:
                joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)

        target, prompt = self.generate_target(image, joints, joints_vis)

        image_0 = rearrange(2 * torch.tensor(np.array(image)).float() / 255 - 1, "h w c -> c h w")
        image_1 = rearrange(2 * torch.tensor(np.array(target)).float() / 255 - 1, "h w c -> c h w")

        return dict(edited=image_1, edit=dict(c_concat=image_0, c_crossattn=prompt))

    def generate_target(self, input, joints, joints_vis):
        '''
        Draw coloured circular markers on a random subset of joints and build
        the matching text prompt.

        The number of markers is drawn between ``min_prompt_num`` and
        ``max_prompt_num``, capped by the number of joints and by the number
        of available colours (each marker needs its own joint and colour).
        A joint whose marker lies entirely outside the image is skipped and
        its visibility flag in ``joints_vis`` is cleared.

        :param input: [height, width, 3]
        :param joints:  [num_joints, 3]
        :param joints_vis: [num_joints, 3]
        :return: (target, prompt) -- copy of ``input`` with markers, and the
            concatenated prompt text
        '''
        radius = self.radius
        target = copy.deepcopy(input)

        # Sampling without replacement fails when more markers are requested
        # than there are joints or colours, so cap the range first.
        max_markers = min(self.max_prompt_num, self.num_joints, len(colors))
        min_markers = min(self.min_prompt_num, max_markers)
        joint_num = random.randint(min_markers, max_markers)
        joint_ids = np.random.choice(
            np.arange(self.num_joints), joint_num, replace=False)
        random_color_names = random.sample(list(colors.keys()), len(joint_ids))

        # The filled-circle mask is the same for every joint; build it once.
        marker_size = 2 * radius + 1
        x, y = np.indices((marker_size, marker_size))
        marker_mask = (x - radius) ** 2 + (y - radius) ** 2 <= radius ** 2 + 1

        feat_stride = self.image_size / self.heatmap_size

        prompt_parts = []

        for random_color_name, joint_id in zip(random_color_names, joint_ids):
            mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
            mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
            # Check that any part of the marker is in-bounds
            ul = [int(mu_x - radius), int(mu_y - radius)]
            br = [int(mu_x + radius + 1), int(mu_y + radius + 1)]
            if ul[0] >= self.heatmap_size[0] or ul[1] >= self.heatmap_size[1] \
                    or br[0] < 0 or br[1] < 0:
                # If not, leave the image as is for this joint
                joints_vis[joint_id][0] = 0
                continue

            # Usable marker range
            g_x = max(0, -ul[0]), min(br[0], self.heatmap_size[0]) - ul[0]
            g_y = max(0, -ul[1]), min(br[1], self.heatmap_size[1]) - ul[1]
            # Image range
            img_x = max(0, ul[0]), min(br[0], self.heatmap_size[0])
            img_y = max(0, ul[1]), min(br[1], self.heatmap_size[1])

            prompt_parts.append(
                random.choice(self.pose_diverse_prompt_list).format(
                    color=random_color_name,
                    joint=self.keypoints_type[joint_id]))

            # Only paint joints flagged as visible.
            if joints_vis[joint_id][0] > 0.5:
                region = target[img_y[0]:img_y[1], img_x[0]:img_x[1]]
                inside = marker_mask[g_y[0]:g_y[1], g_x[0]:g_x[1]]
                region[inside] = self.transparency * region[inside] \
                    + (1 - self.transparency) * np.array(colors[random_color_name])

        return target, "".join(prompt_parts)


class COCODataset(PoseDataset):
    """Pose dataset backed by a COCO-format keypoint annotation file.

    The directory layout is chosen from substrings of ``root`` ('coco',
    'crowdpose' or 'aic'). The sample database is built here only when
    ``root`` contains 'coco'.
    """

    def __init__(self, root, image_set, is_train, max_prompt_num=5, min_prompt_num=1,
            radius=10, size=256, transparency=0.0, sample_weight=1.0, transform=None):
        """Load the annotation file and, for COCO roots, build the sample database.

        Parameters are forwarded unchanged to ``PoseDataset.__init__``.

        :raises ValueError: when ``root`` matches none of the known layouts
        """
        super().__init__(root, image_set, is_train, max_prompt_num, min_prompt_num,
            radius, size, transparency, sample_weight, transform)

        # Joint index -> joint name inserted into the prompt text.
        self.keypoints_type = {
                0: "nose",
                1: "left eye",
                2: "right eye",
                3: "left ear",
                4: "right ear",
                5: "left shoulder",
                6: "right shoulder",
                7: "left elbow",
                8: "right elbow",
                9: "left wrist",
                10: "right wrist",
                11: "left hip",
                12: "right hip",
                13: "left knee",
                14: "right knee",
                15: "left ankle",
                16: "right ankle"
            }

        self.image_width = size
        self.image_height = size
        self.aspect_ratio = self.image_width * 1.0 / self.image_height
        self.pixel_std = 200

        self.coco = COCO(self._get_ann_file_keypoint())

        # deal with class names
        cats = [cat['name']
                for cat in self.coco.loadCats(self.coco.getCatIds())]
        self.classes = ['__background__'] + cats
        logger.info('=> classes: {}'.format(self.classes))
        self.num_classes = len(self.classes)
        self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
        self._class_to_coco_ind = dict(zip(cats, self.coco.getCatIds()))
        # COCO category id -> index into self.classes.
        self._coco_ind_to_class_ind = {
            self._class_to_coco_ind[cls]: self._class_to_ind[cls]
            for cls in self.classes[1:]
        }

        # load image file names
        self.image_set_index = self._load_image_set_index()
        self.num_images = len(self.image_set_index)
        logger.info('=> num_images: {}'.format(self.num_images))

        self.num_joints = 17
        self.flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8],
                           [9, 10], [11, 12], [13, 14], [15, 16]]
        self.parent_ids = None
        self.upper_body_ids = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
        self.lower_body_ids = (11, 12, 13, 14, 15, 16)

        # Only COCO roots are loaded here, with the 17-joint layout set above.
        if 'coco' in self.root:
            self.db = self._get_db()

        logger.info('=> load {} samples'.format(len(self.db)))

    def _get_ann_file_keypoint(self):
        """Return the annotation file path for the layout implied by ``root``.

        Example: self.root / annotations / person_keypoints_train2017.json

        :raises ValueError: when ``root`` matches none of the known layouts
        """
        if 'coco' in self.root:
            ann_dir = 'annotations'
            prefix = 'image_info' if 'test' in self.image_set \
                else 'person_keypoints'
        elif 'crowdpose' in self.root:
            ann_dir = 'json'
            prefix = 'crowdpose'
        elif 'aic' in self.root:
            ann_dir = 'annotations'
            prefix = 'aic'
        else:
            raise ValueError('Please write the path for this new dataset.')

        return os.path.join(
            self.root,
            ann_dir,
            prefix + '_' + self.image_set + '.json'
        )

    def _load_image_set_index(self):
        """Return the image ids (ints) listed in the annotation file."""
        return self.coco.getImgIds()

    def _get_db(self):
        """Return the ground-truth sample records."""
        return self._load_coco_keypoint_annotations()

    def _load_coco_keypoint_annotations(self):
        """Collect the ground-truth bbox and keypoint records of every image."""
        gt_db = []
        for index in self.image_set_index:
            gt_db.extend(self._load_coco_keypoint_annotation_kernal(index))
        return gt_db

    def _load_coco_keypoint_annotation_kernal(self, index):
        """
        Build the sample records for one image.

        coco ann: [u'segmentation', u'area', u'iscrowd', u'image_id', u'bbox', u'category_id', u'id']
        iscrowd:
            annotations are requested with iscrowd=False
        bbox:
            [x1, y1, w, h]
        :param index: coco image id
        :return: list of db entries, one per usable person annotation
        """
        im_ann = self.coco.loadImgs(index)[0]
        width = im_ann['width']
        height = im_ann['height']

        annIds = self.coco.getAnnIds(imgIds=index, iscrowd=False)
        objs = self.coco.loadAnns(annIds)

        # sanitize bboxes: clip them to the image and drop empty ones
        valid_objs = []
        for obj in objs:
            x, y, w, h = obj['bbox']
            x1 = max(0, x)
            y1 = max(0, y)
            x2 = min(width - 1, x1 + max(0, w - 1))
            y2 = min(height - 1, y1 + max(0, h - 1))
            if 'crowdpose' in self.root:
                # Force a positive area so the filter below keeps the box.
                obj['area'] = 1
            if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
                obj['clean_bbox'] = [x1, y1, x2-x1, y2-y1]
                valid_objs.append(obj)

        rec = []
        for obj in valid_objs:
            # Keep only the category mapped to class index 1.
            cls = self._coco_ind_to_class_ind[obj['category_id']]
            if cls != 1:
                continue

            # ignore objs without keypoints annotation
            if max(obj['keypoints']) == 0:
                continue

            joints_3d = np.zeros((self.num_joints, 3), dtype=np.float32)
            joints_3d_vis = np.zeros((self.num_joints, 3), dtype=np.float32)
            # Keypoints are stored flat as (x, y, visibility) triplets.
            for ipt in range(self.num_joints):
                joints_3d[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
                joints_3d[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
                joints_3d[ipt, 2] = 0
                # Cap the visibility flag at 1.
                t_vis = min(obj['keypoints'][ipt * 3 + 2], 1)
                joints_3d_vis[ipt, 0] = t_vis
                joints_3d_vis[ipt, 1] = t_vis
                joints_3d_vis[ipt, 2] = 0

            center, scale = self._box2cs(obj['clean_bbox'][:4])
            rec.append({
                'image': self.image_path_from_index(index, im_ann),
                'center': center,
                'scale': scale,
                'joints_3d': joints_3d,
                'joints_3d_vis': joints_3d_vis,
                'filename': '',
                'imgnum': 0,
            })

        return rec

    def _box2cs(self, box):
        """Convert an [x, y, w, h] box to (center, scale)."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h)

    def _xywh2cs(self, x, y, w, h):
        """Convert a box to its centre and an aspect-corrected, padded scale.

        :return: (center, scale) as float32 arrays of length 2; scale is
            expressed in units of ``pixel_std`` pixels
        """
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5

        # Grow the shorter side so that the box matches the crop aspect ratio.
        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        scale = np.array(
            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
            dtype=np.float32)
        # Add a 25% margin unless the centre x equals -1.
        if center[0] != -1:
            scale = scale * 1.25

        return center, scale

    def image_path_from_index(self, index, im_ann):
        """Return the image path for an image id.

        Example: images / train2017 / 000000119993.jpg

        :param index: image id
        :param im_ann: image record from the annotation file (its
            ``file_name`` is used for 'aic' roots)
        :raises ValueError: when ``root`` matches none of the known layouts
        """
        if 'coco' in self.root:
            file_name = '%012d.jpg' % index
            if '2014' in self.image_set:
                file_name = 'COCO_%s_' % self.image_set + file_name

            data_name = 'test2017' if 'test' in self.image_set else self.image_set

            return os.path.join(
                self.root, 'images', data_name, file_name)

        if 'crowdpose' in self.root:
            return os.path.join(
                self.root, 'images', f'{index}.jpg')

        if 'aic' in self.root:
            return os.path.join(
                self.root, 'ai_challenger_keypoint_train_20170902', 'keypoint_train_images_20170902', im_ann["file_name"])

        # Fail loudly, as _get_ann_file_keypoint does, instead of returning
        # None and storing an unusable image path in the database.
        raise ValueError('Please write the path for this new dataset.')


def flip_back(output_flipped, matched_parts):
    '''
    Mirror the outputs along the width axis and swap the paired channels.

    The mirrored result is a view of the input, so the channel swap also
    writes into the array that was passed in.

    output_flipped: numpy.ndarray(batch_size, num_joints, height, width)
    matched_parts: pairs of joint indices to exchange
    '''
    assert output_flipped.ndim == 4,\
        'output_flipped should be [batch_size, num_joints, height, width]'

    mirrored = output_flipped[..., ::-1]

    for pair in matched_parts:
        first, second = pair[0], pair[1]
        # The fancy-indexed right-hand side is a copy, so this is a safe swap.
        mirrored[:, [first, second]] = mirrored[:, [second, first]]

    return mirrored


def fliplr_joints(joints, joints_vis, width, matched_parts):
    """
    Mirror joint coordinates horizontally and exchange the paired joints.

    Both input arrays are modified in place. The returned joints are
    multiplied element-wise by the visibility array.
    """
    # Mirror the x coordinate
    joints[:, 0] = width - joints[:, 0] - 1

    # Exchange the rows of each pair
    for pair in matched_parts:
        first, second = pair[0], pair[1]
        joints[[first, second]] = joints[[second, first]]
        joints_vis[[first, second]] = joints_vis[[second, first]]

    masked_joints = joints * joints_vis
    return masked_joints, joints_vis


def get_affine_transform(
        center, scale, rot, output_size,
        shift=np.array([0, 0], dtype=np.float32), inv=0
):
    """Build the 2x3 affine matrix mapping a source box onto the output image.

    :param center: (x, y) centre of the source box
    :param scale: box size in units of 200 pixels; a scalar is used for both
        axes, a list or array gives (width, height)
    :param rot: rotation in degrees
    :param output_size: (width, height) of the destination image
    :param shift: offset of the box centre, as a fraction of the box size;
        the default array is only read, never modified
    :param inv: when truthy, return the destination-to-source transform
    :return: matrix from ``cv2.getAffineTransform``
    """
    if np.ndim(scale) == 0:
        scale = np.array([scale, scale])
    else:
        # A plain list cannot be multiplied by a float; convert it first.
        scale = np.asarray(scale)

    scale_tmp = scale * 200.0
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    # Three point pairs define the transform: the centre, a point half a
    # width away from it, and a third point at a right angle to those two.
    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans


def affine_transform(pt, t):
    """Apply the 2x3 affine matrix ``t`` to the 2D point ``pt``.

    :return: the transformed (x, y) as an array of length 2
    """
    homogeneous = np.array([pt[0], pt[1], 1.]).T
    mapped = np.dot(t, homogeneous)
    return mapped[:2]


def get_3rd_point(a, b):
    """Return the point obtained by turning the vector b->a a quarter turn about ``b``."""
    offset = a - b
    quarter_turn = np.array((-offset[1], offset[0]), dtype=np.float32)
    return b + quarter_turn


def get_dir(src_point, rot_rad):
    """Rotate the 2D point ``src_point`` by ``rot_rad`` radians about the origin.

    :return: the rotated point as a two-element list
    """
    sin_r = np.sin(rot_rad)
    cos_r = np.cos(rot_rad)
    px, py = src_point[0], src_point[1]

    return [px * cos_r - py * sin_r, px * sin_r + py * cos_r]


class CrowdPoseDataset(COCODataset):
    """COCO-format dataset using a 14-joint layout (shoulders to ankles, top of head, neck).

    The sample database is built here, after the joint count has been set
    to 14. Half-body cropping is switched off (``prob_half_body = -1``).
    """

    def __init__(self, root, image_set, is_train, max_prompt_num=5, min_prompt_num=1,
            radius=10, size=256, transparency=0.0, sample_weight=1.0, transform=None):
        """Parameters are forwarded unchanged to ``COCODataset.__init__``."""
        super().__init__(root, image_set, is_train, max_prompt_num, min_prompt_num,
            radius, size, transparency, sample_weight, transform)

        # Position in this tuple is the joint index.
        joint_names = (
            'left_shoulder',
            'right_shoulder',
            'left_elbow',
            'right_elbow',
            'left_wrist',
            'right_wrist',
            'left_hip',
            'right_hip',
            'left_knee',
            'right_knee',
            'left_ankle',
            'right_ankle',
            'top_head',
            'neck',
        )
        self.keypoints_type = dict(enumerate(joint_names))

        self.num_joints = 14
        self.prob_half_body = -1
        # Left/right joints sit at consecutive even/odd indices 0..11.
        self.flip_pairs = [[left, left + 1] for left in range(0, 12, 2)]
        self.parent_ids = None
        self.upper_body_ids = (0, 1, 2, 3, 4, 5, 12, 13)
        self.lower_body_ids = tuple(range(6, 12))

        self.db = self._get_db()

        logger.info('=> load {} samples'.format(len(self.db)))


class AICDataset(COCODataset):
    """COCO-format dataset using a 14-joint layout ordered right arm, left arm, right leg, left leg, head top, neck.

    The sample database is built here, after the joint count has been set
    to 14. Half-body cropping is switched off (``prob_half_body = -1``).
    """

    def __init__(self, root, image_set, is_train, max_prompt_num=5, min_prompt_num=1,
            radius=10, size=256, transparency=0.0, sample_weight=1.0, transform=None):
        """Parameters are forwarded unchanged to ``COCODataset.__init__``."""
        super().__init__(root, image_set, is_train, max_prompt_num, min_prompt_num,
            radius, size, transparency, sample_weight, transform)

        # Position in this tuple is the joint index.
        joint_names = (
            "right_shoulder",
            "right_elbow",
            "right_wrist",
            "left_shoulder",
            "left_elbow",
            "left_wrist",
            "right_hip",
            "right_knee",
            "right_ankle",
            "left_hip",
            "left_knee",
            "left_ankle",
            "head_top",
            "neck",
        )
        self.keypoints_type = dict(enumerate(joint_names))

        self.num_joints = 14
        self.prob_half_body = -1
        # Each right-side joint is paired with the left-side joint 3 slots later.
        self.flip_pairs = [[right, right + 3] for right in (0, 1, 2, 6, 7, 8)]
        self.parent_ids = None
        self.upper_body_ids = (0, 1, 2, 3, 4, 5, 12, 13)
        self.lower_body_ids = tuple(range(6, 12))

        self.db = self._get_db()

        logger.info('=> load {} samples'.format(len(self.db)))


class MPIIDataset(PoseDataset):
    """Pose dataset read from a JSON annotation file, using a 16-joint layout.

    Half-body cropping is switched off (``prob_half_body = -1``).
    """

    def __init__(self, root, image_set, is_train, max_prompt_num=5, min_prompt_num=1,
            radius=10, size=256, transparency=0.0, sample_weight=1.0, transform=None):
        """Parameters are forwarded unchanged to ``PoseDataset.__init__``."""
        super().__init__(root, image_set, is_train, max_prompt_num, min_prompt_num,
            radius, size, transparency, sample_weight, transform)

        # Position in this tuple is the joint index.
        joint_names = (
            'right_ankle',
            'right_knee',
            'right_hip',
            'left_hip',
            'left_knee',
            'left_ankle',
            'pelvis',
            'thorax',
            'upper_neck',
            'head_top',
            'right_wrist',
            'right_elbow',
            'right_shoulder',
            'left_shoulder',
            'left_elbow',
            'left_wrist',
        )
        self.keypoints_type = dict(enumerate(joint_names))

        self.data_format = 'jpg'
        self.num_joints = 16
        self.prob_half_body = -1
        self.flip_pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
        self.parent_ids = None
        self.upper_body_ids = tuple(range(7, 16))
        self.lower_body_ids = tuple(range(0, 7))

        self.db = self._get_db()

        logger.info('=> load {} samples'.format(len(self.db)))

    def _get_db(self):
        """Read ``<root>/annot/<image_set>.json`` and return one record per entry.

        For the 'test' split the joint arrays are left at zero.
        """
        annotation_path = os.path.join(
            self.root, 'annot', self.image_set+'.json'
        )
        with open(annotation_path) as anno_file:
            annotations = json.load(anno_file)

        image_dir = 'images.zip@' if self.data_format == 'zip' else 'images'
        has_labels = self.image_set != 'test'

        records = []
        for entry in annotations:
            image_name = entry['image']

            center = np.array(entry['center'], dtype=np.float32)
            scale = np.array([entry['scale'], entry['scale']], dtype=np.float32)

            # Shift the centre and enlarge the box a little so that limbs
            # are not cropped
            if center[0] != -1:
                center[1] = center[1] + 15 * scale[1]
                scale = scale * 1.25

            # The annotations are 1-based (MATLAB convention); convert the
            # centre to 0-based coordinates
            center = center - 1

            joints_3d = np.zeros((self.num_joints, 3), dtype=np.float32)
            joints_3d_vis = np.zeros((self.num_joints,  3), dtype=np.float32)
            if has_labels:
                joints = np.array(entry['joints'])
                # Same 1-based -> 0-based conversion for the joints
                joints[:, 0:2] -= 1
                joints_vis = np.array(entry['joints_vis'])
                assert len(joints) == self.num_joints, \
                    'joint num diff: {} vs {}'.format(len(joints),
                                                      self.num_joints)

                joints_3d[:, 0:2] = joints[:, 0:2]
                joints_3d_vis[:, 0] = joints_vis
                joints_3d_vis[:, 1] = joints_vis

            records.append(dict(
                image=os.path.join(self.root, image_dir, image_name),
                center=center,
                scale=scale,
                joints_3d=joints_3d,
                joints_3d_vis=joints_3d_vis,
                filename='',
                imgnum=0,
            ))

        return records