Upload folder using huggingface_hub
- utils/.gitignore +1 -0
- utils/__init__.py +4 -0
- utils/bpe_simple_vocab_16e6.txt.gz +3 -0
- utils/config.py +157 -0
- utils/dataset.py +341 -0
- utils/dataset_verbonly.py +358 -0
- utils/misc.py +294 -0
- utils/simple_tokenizer.py +132 -0
utils/.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__/
utils/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .simple_tokenizer import SimpleTokenizer
from .config import *
from .dataset import *
from .misc import *
utils/bpe_simple_vocab_16e6.txt.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
size 1356917
utils/config.py
ADDED
@@ -0,0 +1,157 @@
# -----------------------------------------------------------------------------
# Functions for parsing args
# -----------------------------------------------------------------------------
import copy
import os
from ast import literal_eval

import yaml


class CfgNode(dict):
    """
    CfgNode represents an internal node in the configuration tree. It's a simple
    dict-like container that allows for attribute-based access to keys.
    """
    def __init__(self, init_dict=None, key_list=None, new_allowed=False):
        # Recursively convert nested dictionaries in init_dict into CfgNodes
        init_dict = {} if init_dict is None else init_dict
        key_list = [] if key_list is None else key_list
        for k, v in init_dict.items():
            if type(v) is dict:
                # Convert dict to CfgNode
                init_dict[k] = CfgNode(v, key_list=key_list + [k])
        super(CfgNode, self).__init__(init_dict)

    def __getattr__(self, name):
        if name in self:
            return self[name]
        else:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __str__(self):
        def _indent(s_, num_spaces):
            s = s_.split("\n")
            if len(s) == 1:
                return s_
            first = s.pop(0)
            s = [(num_spaces * " ") + line for line in s]
            s = "\n".join(s)
            s = first + "\n" + s
            return s

        r = ""
        s = []
        for k, v in sorted(self.items()):
            separator = "\n" if isinstance(v, CfgNode) else " "
            attr_str = "{}:{}{}".format(str(k), separator, str(v))
            attr_str = _indent(attr_str, 2)
            s.append(attr_str)
        r += "\n".join(s)
        return r

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__,
                               super(CfgNode, self).__repr__())


def load_cfg_from_cfg_file(file):
    cfg = {}
    assert os.path.isfile(file) and file.endswith('.yaml'), \
        '{} is not a yaml file'.format(file)

    with open(file, 'r') as f:
        cfg_from_file = yaml.safe_load(f)

    # Flatten the top-level YAML groups into a single key space
    for key in cfg_from_file:
        for k, v in cfg_from_file[key].items():
            cfg[k] = v

    cfg = CfgNode(cfg)
    return cfg


def merge_cfg_from_list(cfg, cfg_list):
    new_cfg = copy.deepcopy(cfg)
    assert len(cfg_list) % 2 == 0
    for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]):
        subkey = full_key.split('.')[-1]
        assert subkey in cfg, 'Non-existent key: {}'.format(full_key)
        value = _decode_cfg_value(v)
        value = _check_and_coerce_cfg_value_type(value, cfg[subkey], subkey,
                                                 full_key)
        setattr(new_cfg, subkey, value)

    return new_cfg


def _decode_cfg_value(v):
    """Decodes a raw config value (e.g., from a yaml config file or command
    line argument) into a Python object.
    """
    # All remaining processing is only applied to strings
    if not isinstance(v, str):
        return v
    # Try to interpret `v` as a:
    # string, number, tuple, list, dict, boolean, or None
    try:
        v = literal_eval(v)
    # The following two excepts allow v to pass through when it represents a
    # string.
    #
    # Longer explanation:
    # The type of v is always a string (before calling literal_eval), but
    # sometimes it *represents* a string and other times a data structure, like
    # a list. In the case that v represents a string, what we got back from the
    # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is
    # ok with '"foo"', but will raise a ValueError if given 'foo'. In other
    # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval
    # will raise a SyntaxError.
    except ValueError:
        pass
    except SyntaxError:
        pass
    return v


def _check_and_coerce_cfg_value_type(replacement, original, key, full_key):
    """Checks that `replacement`, which is intended to replace `original`, is of
    the right type. The type is correct if it matches exactly or is one of a few
    cases in which the type can be easily coerced.
    """
    original_type = type(original)
    replacement_type = type(replacement)

    # The types must match (with some exceptions)
    if replacement_type == original_type:
        return replacement

    # Cast replacement from from_type to to_type if the replacement and original
    # types match from_type and to_type
    def conditional_cast(from_type, to_type):
        if replacement_type == from_type and original_type == to_type:
            return True, to_type(replacement)
        else:
            return False, None

    # Conditionally casts
    # list <-> tuple
    casts = [(tuple, list), (list, tuple)]
    # For py2: allow converting from str (bytes) to a unicode string
    try:
        casts.append((str, unicode))  # noqa: F821
    except Exception:
        pass

    for (from_type, to_type) in casts:
        converted, converted_value = conditional_cast(from_type, to_type)
        if converted:
            return converted_value

    raise ValueError(
        "Type mismatch ({} vs. {}) with values ({} vs. {}) for config "
        "key: {}".format(original_type, replacement_type, original,
                         replacement, full_key))
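
A minimal usage sketch for the two public helpers above. The YAML filename, the group name `TRAIN`, and the keys `lr` / `batch_size` are hypothetical; the override keys must already exist in the flattened config, since merge_cfg_from_list asserts on unknown keys and coerces override values to the original types.

# Hypothetical example: load a YAML config, then apply CLI-style overrides.
# Assumes config/train.yaml has a top-level group (e.g. TRAIN) whose leaf
# keys include 'lr' (float) and 'batch_size' (int).
from utils.config import load_cfg_from_cfg_file, merge_cfg_from_list

cfg = load_cfg_from_cfg_file('config/train.yaml')
# Overrides are flat [key, value, key, value, ...] pairs; values are strings
# that _decode_cfg_value parses with ast.literal_eval. Only the last dotted
# component is used as the lookup key.
cfg = merge_cfg_from_list(cfg, ['TRAIN.lr', '0.0001', 'TRAIN.batch_size', '32'])
print(cfg.lr, cfg.batch_size)  # attribute access via CfgNode.__getattr__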
utils/dataset.py
ADDED
@@ -0,0 +1,341 @@
#%%
import os
from typing import List, Union
import json
import cv2
import lmdb
import random
import numpy as np
import pyarrow as pa
import torch
from torch.utils.data import Dataset
import itertools
import albumentations as A
from albumentations.pytorch import ToTensorV2
from .simple_tokenizer import SimpleTokenizer as _Tokenizer

info = {
    'refcoco': {
        'train': 42404,
        'val': 3811,
        'val-test': 3811,
        'testA': 1975,
        'testB': 1810
    },
    'refcoco+': {
        'train': 42278,
        'val': 3805,
        'val-test': 3805,
        'testA': 1975,
        'testB': 1798
    },
    'refcocog_u': {
        'train': 42226,
        'val': 2573,
        'val-test': 2573,
        'test': 5023,
        'test_0-5_verb': 572,
        'test_0-5_static': 1688,
        'test_6-7_verb': 949,
        'test_6-7_static': 1240,
        'test_8-10_verb': 1523,
        'test_8-10_static': 1194,
        'test_11-20_verb': 1768,
        'test_11-20_static': 584,
        'test_abl_motion': 267,
        'test_abl_static': 267
    },
    'refcocog_g': {
        'train': 44822,
        'val': 5000,
        'val-test': 5000
    }
}
_tokenizer = _Tokenizer()

#%%
def tokenize(texts: Union[str, List[str]],
             context_length: int = 77,
             truncate: bool = False) -> torch.LongTensor:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize

    context_length : int
        The context length to use; all CLIP models use 77 as the context length

    truncate : bool
        Whether to truncate the text in case its encoding is longer than the context length

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]

    sot_token = _tokenizer.encoder["<|startoftext|>"]
    eot_token = _tokenizer.encoder["<|endoftext|>"]
    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token]
                  for text in texts]
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            if truncate:
                tokens = tokens[:context_length]
                tokens[-1] = eot_token
            else:
                raise RuntimeError(
                    f"Input {texts[i]} is too long for context length {context_length}"
                )
        result[i, :len(tokens)] = torch.tensor(tokens)

    return result


def loads_pyarrow(buf):
    """
    Args:
        buf: the output of `dumps`.
    """
    return pa.deserialize(buf)


class RefDataset(Dataset):
    def __init__(self, lmdb_dir, mask_dir, dataset, split, mode, input_size,
                 word_length, args):
        super(RefDataset, self).__init__()
        self.lmdb_dir = lmdb_dir
        self.mask_dir = mask_dir
        self.dataset = dataset
        self.split = split
        self.mode = mode
        self.input_size = (input_size, input_size)
        self.word_length = word_length
        self.mean = torch.tensor([0.48145466, 0.4578275,
                                  0.40821073]).reshape(3, 1, 1)
        self.std = torch.tensor([0.26862954, 0.26130258,
                                 0.27577711]).reshape(3, 1, 1)
        self.length = info[dataset][split]
        self.env = None
        self.exclude_position = args.exclude_pos
        self.metric_learning = args.metric_learning
        self.hardpos_rigid = args.hardpos_rigid
        self.resize_bg1 = A.Compose([
            A.Resize(input_size, input_size, always_apply=True)])
        if self.metric_learning:
            if self.hardpos_rigid and self.exclude_position:
                multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj_nopos.txt'
                with open(multiobj_path, 'r') as f:
                    self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
            elif self.hardpos_rigid:
                multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj.txt'
                with open(multiobj_path, 'r') as f:
                    self.multi_obj_ref_ids = [int(line.strip()) for line in f.readlines()]
            else:
                self.multi_obj_ref_ids = None

            path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/llama3-demo/llama3/hardpos_verbphrase_0906upd.json'
            with open(path, 'r', encoding='utf-8') as f:
                self.metadata = json.load(f)
        else:
            self.metadata = None

    def _init_db(self):
        self.env = lmdb.open(self.lmdb_dir,
                             subdir=os.path.isdir(self.lmdb_dir),
                             readonly=True,
                             lock=False,
                             readahead=False,
                             meminit=False)
        with self.env.begin(write=False) as txn:
            self.length = loads_pyarrow(txn.get(b'__len__'))
            self.keys = loads_pyarrow(txn.get(b'__keys__'))

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        # Delay loading LMDB data until after initialization: https://github.com/chainer/chainermn/issues/129
        if self.env is None:
            self._init_db()
        env = self.env
        with env.begin(write=False) as txn:
            byteflow = txn.get(self.keys[index])
        ref = loads_pyarrow(byteflow)
        # img
        ori_img = cv2.imdecode(np.frombuffer(ref['img'], np.uint8),
                               cv2.IMREAD_COLOR)
        img = cv2.cvtColor(ori_img, cv2.COLOR_BGR2RGB)

        # mask
        seg_id = ref['seg_id']
        mask_dir = os.path.join(self.mask_dir, str(seg_id) + '.png')

        mask = cv2.imdecode(np.frombuffer(ref['mask'], np.uint8),
                            cv2.IMREAD_GRAYSCALE)
        mask = mask / 255.

        # image resizing
        resized = self.resize_bg1(image=img, mask=mask)
        imgs, masks = [resized['image']], [resized['mask']]
        img = imgs[0]
        mask = masks[0]
        mask = mask.astype(np.uint8)
        mask[mask > 0] = 1

        # image transform
        img_size = img.shape[:2]
        mat, mat_inv = self.getTransformMat(img_size, True)
        img = cv2.warpAffine(
            img,
            mat,
            self.input_size,
            flags=cv2.INTER_CUBIC,
            borderValue=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255])

        # sentences
        sents = ref['sents']
        n_sentences = ref['num_sents']

        if self.mode == 'train':
            # mask transform
            mask = cv2.warpAffine(mask,
                                  mat,
                                  self.input_size,
                                  flags=cv2.INTER_LINEAR,
                                  borderValue=0.)

            # if metric learning, select 2 positive sentences
            if self.metric_learning:
                if self.hardpos_rigid and seg_id in self.multi_obj_ref_ids:
                    if n_sentences > 1:
                        idx = np.random.choice(ref['num_sents'], 2, replace=False)
                        sent = [sents[i] for i in idx]
                    else:
                        sent = [sents[0], sents[0]]
                else:
                    # Process hard-positive metadata for this reference
                    hardpos_dict = self.metadata[str(ref['seg_id'])]
                    hardpos_list = list(itertools.chain(*hardpos_dict.values()))
                    sent_id_list = list(hardpos_dict.keys())

                    if n_sentences > 1:
                        if self.hardpos_rigid:
                            idx = np.random.choice(ref['num_sents'], 1, replace=False)[0]
                            cur_hardpos = hardpos_dict[sent_id_list[idx]]
                            if len(cur_hardpos) == 0:
                                idx = np.random.choice(ref['num_sents'], 2, replace=False)
                                sent = [sents[i] for i in idx]
                            else:
                                hardpos_choice = random.choice(cur_hardpos)
                                sent = [sents[idx], hardpos_choice]
                                random.shuffle(sent)
                        else:
                            if len(hardpos_list) == 0:
                                idx = np.random.choice(ref['num_sents'], 2, replace=False)
                                sent = [sents[i] for i in idx]
                            else:
                                idx = np.random.choice(ref['num_sents'], 1, replace=False)[0]
                                hardpos_choice = random.choice(hardpos_list)
                                sent = [sents[idx], hardpos_choice]
                                random.shuffle(sent)
                    # if there's only one sentence, duplicate it
                    else:
                        if len(hardpos_list) == 0:
                            sent = [sents[0], sents[0]]
                        else:
                            hardpos_choice = random.choice(hardpos_list)
                            sent = [sents[0], hardpos_choice]
                            random.shuffle(sent)
                    # print(f"Generated sentences: {sent}")
            else:
                # take a scalar index so plain-list indexing works
                idx = np.random.choice(ref['num_sents'], 1, replace=False)[0]
                sent = sents[idx]
            word_vec = tokenize(sent, self.word_length, True).squeeze(0)
            img, mask = self.convert(img, mask)

            # params = {
            #     'ori_img': ori_img,
            #     'seg_id': seg_id,
            #     'mask_dir': mask_dir,
            #     'inverse': mat_inv,
            #     'ori_size': np.array(img_size),
            #     'sents': sents
            # }
            return img, word_vec, mask

        elif self.mode == 'val':
            # sentence -> vector
            sent = sents[0]
            word_vec = tokenize(sent, self.word_length, True).squeeze(0)
            img = self.convert(img)[0]
            params = {
                'mask_dir': mask_dir,
                'inverse': mat_inv,
                'ori_size': np.array(img_size)
            }
            return img, word_vec, mask, params
        else:
            # sentence -> vector
            img = self.convert(img)[0]
            params = {
                'ori_img': ori_img,
                'seg_id': seg_id,
                'mask_dir': mask_dir,
                'inverse': mat_inv,
                'ori_size': np.array(img_size),
                'sents': sents
            }
            return img, mask, params

    def getTransformMat(self, img_size, inverse=False):
        ori_h, ori_w = img_size
        inp_h, inp_w = self.input_size
        scale = min(inp_h / ori_h, inp_w / ori_w)
        new_h, new_w = ori_h * scale, ori_w * scale
        bias_x, bias_y = (inp_w - new_w) / 2., (inp_h - new_h) / 2.

        src = np.array([[0, 0], [ori_w, 0], [0, ori_h]], np.float32)
        dst = np.array([[bias_x, bias_y], [new_w + bias_x, bias_y],
                        [bias_x, new_h + bias_y]], np.float32)

        mat = cv2.getAffineTransform(src, dst)
        if inverse:
            mat_inv = cv2.getAffineTransform(dst, src)
            return mat, mat_inv
        return mat, None

    def convert(self, img, mask=None):
        # Image ToTensor & Normalize
        img = torch.from_numpy(img.transpose((2, 0, 1)))
        if not isinstance(img, torch.FloatTensor):
            img = img.float()
        img.div_(255.).sub_(self.mean).div_(self.std)
        # Mask ToTensor
        if mask is not None:
            mask = torch.from_numpy(mask)
            if not isinstance(mask, torch.FloatTensor):
                mask = mask.float()
        return img, mask

    def __repr__(self):
        return self.__class__.__name__ + "(" + \
            f"db_path={self.lmdb_dir}, " + \
            f"dataset={self.dataset}, " + \
            f"split={self.split}, " + \
            f"mode={self.mode}, " + \
            f"input_size={self.input_size}, " + \
            f"word_length={self.word_length}"

    # def get_length(self):
    #     return self.length

    # def get_sample(self, idx):
    #     return self.__getitem__(idx)
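
A sketch of how RefDataset might be wired into a DataLoader. The LMDB/mask paths, input_size, word_length, and the args fields are placeholders; the constructor above reads exclude_pos, metric_learning, and hardpos_rigid from args, and with metric_learning disabled no metadata files are needed.

# Hypothetical wiring; paths and args values are placeholders.
from types import SimpleNamespace
from torch.utils.data import DataLoader
from utils.dataset import RefDataset

args = SimpleNamespace(exclude_pos=False, metric_learning=False,
                       hardpos_rigid=False)
train_set = RefDataset(lmdb_dir='datasets/lmdb/refcoco/train.lmdb',
                       mask_dir='datasets/masks/refcoco',
                       dataset='refcoco', split='train', mode='train',
                       input_size=416, word_length=17, args=args)
# In train mode each item is (img, word_vec, mask). The LMDB env is opened
# lazily inside __getitem__, so num_workers > 0 is safe.
loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=4)
img, word_vec, mask = next(iter(loader))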
utils/dataset_verbonly.py
ADDED
@@ -0,0 +1,358 @@
#%%
import os
from typing import List, Union
import json
import cv2
import lmdb
import random
import numpy as np
import pyarrow as pa
import torch
from torch.utils.data import Dataset
import itertools
import albumentations as A
from albumentations.pytorch import ToTensorV2
from .simple_tokenizer import SimpleTokenizer as _Tokenizer

info = {
    'refcoco': {
        'train': 42404,
        'val': 3811,
        'val-test': 3811,
        'testA': 1975,
        'testB': 1810
    },
    'refcoco+': {
        'train': 42278,
        'val': 3805,
        'val-test': 3805,
        'testA': 1975,
        'testB': 1798
    },
    'refcocog_u': {
        'train': 42226,
        'val': 2573,
        'val-test': 2573,
        'test': 5023,
    },
    'refcocog_g': {
        'train': 44822,
        'val': 5000,
        'val-test': 5000
    }
}
_tokenizer = _Tokenizer()

#%%
def tokenize(texts: Union[str, List[str]],
             context_length: int = 77,
             truncate: bool = False) -> torch.LongTensor:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize

    context_length : int
        The context length to use; all CLIP models use 77 as the context length

    truncate : bool
        Whether to truncate the text in case its encoding is longer than the context length

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]

    sot_token = _tokenizer.encoder["<|startoftext|>"]
    eot_token = _tokenizer.encoder["<|endoftext|>"]
    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token]
                  for text in texts]
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            if truncate:
                tokens = tokens[:context_length]
                tokens[-1] = eot_token
            else:
                raise RuntimeError(
                    f"Input {texts[i]} is too long for context length {context_length}"
                )
        result[i, :len(tokens)] = torch.tensor(tokens)

    return result


def loads_pyarrow(buf):
    """
    Args:
        buf: the output of `dumps`.
    """
    return pa.deserialize(buf)


class RefDataset(Dataset):
    def __init__(self, lmdb_dir, mask_dir, dataset, split, mode, input_size,
                 word_length, args):
        super(RefDataset, self).__init__()
        self.lmdb_dir = lmdb_dir
        self.mask_dir = mask_dir
        self.dataset = dataset
        self.split = split
        self.mode = mode
        self.input_size = (input_size, input_size)
        self.word_length = word_length
        self.mean = torch.tensor([0.48145466, 0.4578275,
                                  0.40821073]).reshape(3, 1, 1)
        self.std = torch.tensor([0.26862954, 0.26130258,
                                 0.27577711]).reshape(3, 1, 1)
        self.length = info[dataset][split]
        self.env = None

        self.exclude_position = args.exclude_pos
        self.metric_learning = args.metric_learning
        self.exclude_multiobj = args.exclude_multiobj
        self.metric_mode = args.metric_mode

        self.resize_bg1 = A.Compose([
            A.Resize(input_size, input_size, always_apply=True)])
        if self.metric_learning:
            self.hardneg_prob = args.hn_prob  # hard negative sampling probability
            self.multi_obj_ref_ids = self._load_multi_obj_ref_ids()
            self.hardpos_meta, self.hardneg_meta = self._load_metadata()
        else:
            self.hardneg_prob = 0.0
            self.multi_obj_ref_ids = None
            self.hardpos_meta, self.hardneg_meta = None, None

    def _load_multi_obj_ref_ids(self):
        # Load multi-object reference IDs based on configurations
        if not self.exclude_multiobj and not self.exclude_position:
            return None
        elif self.exclude_position:
            multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj_ov2_nopos.txt'
        elif self.exclude_multiobj:
            multiobj_path = '/home/chaeyun/data/projects/chaeyun/RIS/CRIS.pytorch/multiobj_ov3.txt'
        with open(multiobj_path, 'r') as f:
            return [int(line.strip()) for line in f.readlines()]

    def _load_metadata(self):
        # Load metadata for hard positive verb phrases, hard negative queries
        hardpos_path = '/data2/projects/chaeyun/VerbCentric_RIS/hardpos_verbphrase_0906upd.json'
        hardneg_path = '/data2/projects/chaeyun/VerbCentric_RIS/hardneg_verb.json'

        with open(hardpos_path, 'r', encoding='utf-8') as f:
            hardpos_json = json.load(f)
        if self.metric_mode == "hardpos_only":
            hardneg_json = None
        else:
            with open(hardneg_path, 'r', encoding='utf-8') as q:
                hardneg_json = json.load(q)
        return hardpos_json, hardneg_json

    def _init_db(self):
        self.env = lmdb.open(self.lmdb_dir,
                             subdir=os.path.isdir(self.lmdb_dir),
                             readonly=True,
                             lock=False,
                             readahead=False,
                             meminit=False)
        with self.env.begin(write=False) as txn:
            self.length = loads_pyarrow(txn.get(b'__len__'))
            self.keys = loads_pyarrow(txn.get(b'__keys__'))

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        # Delay loading LMDB data until after initialization: https://github.com/chainer/chainermn/issues/129
        if self.env is None:
            self._init_db()
        env = self.env
        with env.begin(write=False) as txn:
            byteflow = txn.get(self.keys[index])
        ref = loads_pyarrow(byteflow)
        # img
        ori_img = cv2.imdecode(np.frombuffer(ref['img'], np.uint8),
                               cv2.IMREAD_COLOR)
        img = cv2.cvtColor(ori_img, cv2.COLOR_BGR2RGB)

        # mask
        seg_id = ref['seg_id']
        mask_dir = os.path.join(self.mask_dir, str(seg_id) + '.png')

        mask = cv2.imdecode(np.frombuffer(ref['mask'], np.uint8),
                            cv2.IMREAD_GRAYSCALE)
        mask = mask / 255.

        # image resizing
        resized = self.resize_bg1(image=img, mask=mask)
        imgs, masks = [resized['image']], [resized['mask']]
        img = imgs[0]
        mask = masks[0]
        mask = mask.astype(np.uint8)
        mask[mask > 0] = 1

        # image transform
        img_size = img.shape[:2]
        mat, mat_inv = self.getTransformMat(img_size, True)
        img = cv2.warpAffine(
            img,
            mat,
            self.input_size,
            flags=cv2.INTER_CUBIC,
            borderValue=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255])

        # sentences
        sents = ref['sents']
        n_sentences = ref['num_sents']

        if self.mode == 'train':
            # mask transform
            mask = cv2.warpAffine(mask,
                                  mat,
                                  self.input_size,
                                  flags=cv2.INTER_LINEAR,
                                  borderValue=0.)

            # if metric learning, assign hard positive verb phrase if applicable
            idx = np.random.choice(n_sentences, 1, replace=False)[0]
            sent = sents[idx]
            raw_hardpos, hardpos = self._get_hardpos_verb(ref, seg_id, idx)
            img, mask = self.convert(img, mask)
            word_vec = tokenize(sent, self.word_length, True).squeeze(0)

            if self.metric_mode == "hardpos_only":
                return img, word_vec, mask, hardpos
            else:
                choice = np.random.choice(['hn', 'no_hn'], p=[self.hardneg_prob, 1 - self.hardneg_prob])
                if choice == 'hn' and raw_hardpos:
                    raw_hardneg, hardneg = self._get_hardneg_verb(ref, seg_id, idx)
                else:
                    hardneg = torch.zeros(self.word_length, dtype=torch.long)
                return img, word_vec, mask, hardpos, hardneg

        elif self.mode == 'val':
            # sentence -> vector
            sent = sents[0]
            word_vec = tokenize(sent, self.word_length, True).squeeze(0)
            img = self.convert(img)[0]
            params = {
                'mask_dir': mask_dir,
                'inverse': mat_inv,
                'ori_size': np.array(img_size)
            }
            return img, word_vec, mask, params
        else:
            # sentence -> vector
            img = self.convert(img)[0]
            params = {
                'ori_img': ori_img,
                'seg_id': seg_id,
                'mask_dir': mask_dir,
                'inverse': mat_inv,
                'ori_size': np.array(img_size),
                'sents': sents
            }
            return img, mask, params

    def _get_hardneg_verb(self, ref, seg_id, sent_idx):
        """
        Handle the logic for selecting hard negative verb phrases during metric learning.
        Returns the raw phrase and the tokenized phrase if applicable.
        """
        # Extract metadata for hard negatives if present
        hardneg_dict = self.hardneg_meta.get(str(seg_id), {})
        sent_id_list = list(hardneg_dict.keys())

        cur_hardneg = hardneg_dict.get(sent_id_list[sent_idx], [])
        if cur_hardneg:
            # Assign a hard negative verb phrase if available
            raw_verb_hardneg = random.choice(cur_hardneg)
            verb_hardneg = tokenize(raw_verb_hardneg, self.word_length, True).squeeze(0)
            return raw_verb_hardneg, verb_hardneg

        verb_hardneg = torch.zeros(self.word_length, dtype=torch.long)
        return '', verb_hardneg

    def _get_hardpos_verb(self, ref, seg_id, sent_idx):
        """
        Handle the logic for selecting hard positive verb phrases during metric learning.
        Returns the raw phrase and the tokenized phrase if applicable.
        """
        # If the object appears multiple times, no hard positive is used
        if seg_id in self.multi_obj_ref_ids:
            verb_hardpos = torch.zeros(self.word_length, dtype=torch.long)
            return '', verb_hardpos

        # Extract metadata for hard positives if present
        hardpos_dict = self.hardpos_meta.get(str(seg_id), {})
        sent_id_list = list(hardpos_dict.keys())
        # cur_hardpos = hardpos_dict.get(sent_id_list[sent_idx], [])
        cur_hardpos = list(itertools.chain(*hardpos_dict.values()))
        if cur_hardpos:
            # Assign a hard positive verb phrase if available
            raw_verb = random.choice(cur_hardpos)
            verb_hardpos = tokenize(raw_verb, self.word_length, True).squeeze(0)
            return raw_verb, verb_hardpos

        verb_hardpos = torch.zeros(self.word_length, dtype=torch.long)
        return '', verb_hardpos

    def getTransformMat(self, img_size, inverse=False):
        ori_h, ori_w = img_size
        inp_h, inp_w = self.input_size
        scale = min(inp_h / ori_h, inp_w / ori_w)
        new_h, new_w = ori_h * scale, ori_w * scale
        bias_x, bias_y = (inp_w - new_w) / 2., (inp_h - new_h) / 2.

        src = np.array([[0, 0], [ori_w, 0], [0, ori_h]], np.float32)
        dst = np.array([[bias_x, bias_y], [new_w + bias_x, bias_y],
                        [bias_x, new_h + bias_y]], np.float32)

        mat = cv2.getAffineTransform(src, dst)
        if inverse:
            mat_inv = cv2.getAffineTransform(dst, src)
            return mat, mat_inv
        return mat, None

    def convert(self, img, mask=None):
        # Image ToTensor & Normalize
        img = torch.from_numpy(img.transpose((2, 0, 1)))
        if not isinstance(img, torch.FloatTensor):
            img = img.float()
        img.div_(255.).sub_(self.mean).div_(self.std)
        # Mask ToTensor
        if mask is not None:
            mask = torch.from_numpy(mask)
            if not isinstance(mask, torch.FloatTensor):
                mask = mask.float()
        return img, mask

    def __repr__(self):
        return self.__class__.__name__ + "(" + \
            f"db_path={self.lmdb_dir}, " + \
            f"dataset={self.dataset}, " + \
            f"split={self.split}, " + \
            f"mode={self.mode}, " + \
            f"input_size={self.input_size}, " + \
            f"word_length={self.word_length}"

    # def get_length(self):
    #     return self.length

    # def get_sample(self, idx):
    #     return self.__getitem__(idx)
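
In train mode this variant returns extra token tensors: hardpos, plus hardneg unless metric_mode == 'hardpos_only'; a zero tensor of length word_length stands in when no phrase is available. Note that train mode always calls _get_hardpos_verb, which indexes self.multi_obj_ref_ids, so this dataset is evidently meant to run with args.metric_learning enabled. A sketch of how a training loop might filter those zero sentinels before a metric-learning term; text_encoder and contrastive_loss are placeholders, not part of the code above.

# Hypothetical training-loop fragment (placeholder encoder and loss).
img, word_vec, mask, hardpos, hardneg = batch
valid = hardpos.abs().sum(dim=1) > 0           # zero rows mean "no hard positive"
if valid.any():
    anchor = text_encoder(word_vec[valid])
    positive = text_encoder(hardpos[valid])
    negative = text_encoder(hardneg[valid])    # zero rows possible here as well
    loss_metric = contrastive_loss(anchor, positive, negative)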
utils/misc.py
ADDED
@@ -0,0 +1,294 @@
import os
import random
import numpy as np
from PIL import Image
from loguru import logger
import sys
import inspect

import torch
from torch import nn
import torch.distributed as dist


def init_random_seed(seed=None, device='cuda', rank=0, world_size=1):
    """Initialize random seed."""
    if seed is not None:
        return seed

    # Make sure all ranks share the same random seed to prevent
    # some potential bugs. Please refer to
    # https://github.com/open-mmlab/mmdetection/issues/6339
    seed = np.random.randint(2**31)
    if world_size == 1:
        return seed

    if rank == 0:
        random_num = torch.tensor(seed, dtype=torch.int32, device=device)
    else:
        random_num = torch.tensor(0, dtype=torch.int32, device=device)
    dist.broadcast(random_num, src=0)
    return random_num.item()


def set_random_seed(seed, deterministic=False):
    """Set random seed."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


@torch.no_grad()
def concat_all_gather(tensor):
    """
    Performs all_gather operation on the provided tensors.
    *** Warning ***: torch.distributed.all_gather has no gradient.
    """
    tensor = tensor.contiguous()
    tensors_gather = [
        torch.ones_like(tensor)
        for _ in range(torch.distributed.get_world_size())
    ]
    torch.distributed.all_gather(tensors_gather, tensor, async_op=False)

    output = torch.cat(tensors_gather, dim=0)
    return output


def worker_init_fn(worker_id, num_workers, rank, seed):
    # The seed of each worker equals
    # num_workers * rank + worker_id + user_seed
    worker_seed = num_workers * rank + worker_id + seed
    np.random.seed(worker_seed)
    random.seed(worker_seed)


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self, name, fmt=":f"):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        if self.name == "Lr":
            fmtstr = "{name}={val" + self.fmt + "}"
        else:
            fmtstr = "{name}={val" + self.fmt + "} ({avg" + self.fmt + "})"
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        logger.info(" ".join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = "{:" + str(num_digits) + "d}"
        return "[" + fmt + "/" + fmt.format(num_batches) + "]"


def trainMetricGPU(output, target, threshold=0.35, pr_iou=0.5):
    assert (output.dim() in [2, 3, 4])
    assert output.shape == target.shape
    output = output.flatten(1)
    target = target.flatten(1)
    output = torch.sigmoid(output)
    output[output < threshold] = 0.
    output[output >= threshold] = 1.
    # inter & union
    inter = (output.bool() & target.bool()).sum(dim=1)  # b
    union = (output.bool() | target.bool()).sum(dim=1)  # b
    ious = inter / (union + 1e-6)  # 0 ~ 1
    # iou & pr@5
    iou = ious.mean()
    prec = (ious > pr_iou).float().mean()
    return 100. * iou, 100. * prec


def ValMetricGPU(output, target, threshold=0.35):
    assert output.size(0) == 1
    output = output.flatten(1)
    target = target.flatten(1)
    output = torch.sigmoid(output)
    output[output < threshold] = 0.
    output[output >= threshold] = 1.
    # inter & union
    inter = (output.bool() & target.bool()).sum(dim=1)  # b
    union = (output.bool() | target.bool()).sum(dim=1)  # b
    ious = inter / (union + 1e-6)  # 0 ~ 1
    return ious


def intersectionAndUnionGPU(output, target, K, threshold=0.5):
    # 'K' classes, output and target sizes are N or N * L or N * H * W, each value in range 0 to K - 1.
    assert (output.dim() in [1, 2, 3])
    assert output.shape == target.shape
    output = output.view(-1)
    target = target.view(-1)

    output = torch.sigmoid(output)
    output[output < threshold] = 0.
    output[output >= threshold] = 1.

    intersection = output[output == target]
    area_intersection = torch.histc(intersection.float(),
                                    bins=K,
                                    min=0,
                                    max=K - 1)
    area_output = torch.histc(output.float(), bins=K, min=0, max=K - 1)
    area_target = torch.histc(target.float(), bins=K, min=0, max=K - 1)
    area_union = area_output + area_target - area_intersection
    return area_intersection[1], area_union[1]


def group_weight(weight_group, module, lr):
    group_decay = []
    group_no_decay = []
    for m in module.modules():
        if isinstance(m, nn.Linear):
            group_decay.append(m.weight)
            if m.bias is not None:
                group_no_decay.append(m.bias)
        elif isinstance(m, nn.modules.conv._ConvNd):
            group_decay.append(m.weight)
            if m.bias is not None:
                group_no_decay.append(m.bias)
        elif isinstance(m, nn.modules.batchnorm._BatchNorm):
            if m.weight is not None:
                group_no_decay.append(m.weight)
            if m.bias is not None:
                group_no_decay.append(m.bias)
    assert len(list(
        module.parameters())) == len(group_decay) + len(group_no_decay)
    weight_group.append(dict(params=group_decay, lr=lr))
    weight_group.append(dict(params=group_no_decay, weight_decay=.0, lr=lr))
    return weight_group


def colorize(gray, palette):
    # gray: numpy array of the label and 1*3N size list palette
    color = Image.fromarray(gray.astype(np.uint8)).convert('P')
    color.putpalette(palette)
    return color


def find_free_port():
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Binding to port 0 will cause the OS to find an available port for us
    sock.bind(("", 0))
    port = sock.getsockname()[1]
    sock.close()
    # NOTE: there is still a chance the port could be taken by other processes.
    return port


def get_caller_name(depth=0):
    """
    Args:
        depth (int): Depth of caller context, use 0 for caller depth.
            Default value: 0.

    Returns:
        str: module name of the caller
    """
    # the following logic is a little bit faster than inspect.stack() logic
    frame = inspect.currentframe().f_back
    for _ in range(depth):
        frame = frame.f_back

    return frame.f_globals["__name__"]


class StreamToLoguru:
    """
    Stream object that redirects writes to a logger instance.
    """
    def __init__(self, level="INFO", caller_names=("apex", "pycocotools")):
        """
        Args:
            level(str): log level string of loguru. Default value: "INFO".
            caller_names(tuple): caller names of redirected modules.
                Default value: (apex, pycocotools).
        """
        self.level = level
        self.linebuf = ""
        self.caller_names = caller_names

    def write(self, buf):
        full_name = get_caller_name(depth=1)
        module_name = full_name.rsplit(".", maxsplit=-1)[0]
        if module_name in self.caller_names:
            for line in buf.rstrip().splitlines():
                # use caller level log
                logger.opt(depth=2).log(self.level, line.rstrip())
        else:
            sys.__stdout__.write(buf)

    def flush(self):
        pass


def redirect_sys_output(log_level="INFO"):
    redirect_logger = StreamToLoguru(log_level)
    sys.stderr = redirect_logger
    sys.stdout = redirect_logger


def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"):
    """Setup logger for training and testing.
    Args:
        save_dir(str): location to save log file
        distributed_rank(int): device rank in a multi-gpu environment
        filename (string): log save name.
        mode(str): log file write mode, `append` or `override`. default is `a`.

    Return:
        logger instance.
    """
    loguru_format = (
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")

    logger.remove()
    save_file = os.path.join(save_dir, filename)
    if mode == "o" and os.path.exists(save_file):
        os.remove(save_file)
    # only keep logger in rank0 process
    if distributed_rank == 0:
        logger.add(
            sys.stderr,
            format=loguru_format,
            level="INFO",
            enqueue=True,
        )
        logger.add(save_file)

    # redirect stdout/stderr to loguru
    redirect_sys_output("INFO")
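
A short sketch of the seeding, logging, and metering utilities together. The save directory, seed, and meter values are placeholders; setup_logger writes into save_dir, so the directory must exist first.

# Hypothetical usage of the helpers above.
import os
from utils.misc import set_random_seed, setup_logger, AverageMeter, ProgressMeter

set_random_seed(42, deterministic=True)
os.makedirs('exp/logs', exist_ok=True)
setup_logger('exp/logs', distributed_rank=0, filename='train.log', mode='a')

iou_meter = AverageMeter('IoU', ':2.2f')
progress = ProgressMeter(num_batches=100, meters=[iou_meter], prefix='Epoch [1]')
iou_meter.update(73.5, n=32)   # batch IoU (%), weighted by batch size
progress.display(batch=10)     # logs something like "Epoch [1][ 10/100] IoU=73.50 (73.50)"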
utils/simple_tokenizer.py
ADDED
@@ -0,0 +1,132 @@
import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re


@lru_cache()
def default_bpe():
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()


def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


class SimpleTokenizer(object):
    def __init__(self, bpe_path: str = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text
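
A quick roundtrip with the tokenizer. It needs ftfy and regex installed and the bpe_simple_vocab_16e6.txt.gz file (the LFS pointer above) next to simple_tokenizer.py. Because encode lower-cases and decode rebuilds spaces from the '</w>' markers, the roundtrip is whitespace-normalized rather than byte-identical.

# Roundtrip sketch.
from utils.simple_tokenizer import SimpleTokenizer

tok = SimpleTokenizer()
ids = tok.encode("A man riding a horse")
print(ids)               # BPE token ids (no SOT/EOT; dataset.tokenize adds those)
print(tok.decode(ids))   # roughly "a man riding a horse "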