Duplicate from Mathux/TMR
Co-authored-by: Mathis Petrovich <[email protected]>
- .gitattributes +34 -0
- README.md +13 -0
- amass-annotations/amass_to_babel.json +0 -0
- amass-annotations/humanml3d.json +0 -0
- app.py +313 -0
- load.py +53 -0
- model.py +128 -0
- requirements.txt +5 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: TMR
emoji: 🐨
colorFrom: yellow
colorTo: purple
sdk: gradio
sdk_version: 3.24.1
app_file: app.py
pinned: false
duplicated_from: Mathux/TMR
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
amass-annotations/amass_to_babel.json
ADDED
The diff for this file is too large to render.
amass-annotations/humanml3d.json
ADDED
The diff for this file is too large to render.
app.py
ADDED
@@ -0,0 +1,313 @@
from functools import partial
import os

import torch
import numpy as np
import gradio as gr
import gdown

from load import load_model, load_json
from load import load_unit_motion_embs_splits, load_keyids_splits


WEBSITE = """
<div class="embed_hidden">
<h1 style='text-align: center'>TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion Synthesis </h1>

<h2 style='text-align: center'>
<a href="https://mathis.petrovich.fr" target="_blank"><nobr>Mathis Petrovich</nobr></a>
<a href="https://ps.is.mpg.de/~black" target="_blank"><nobr>Michael J. Black</nobr></a>
<a href="https://imagine.enpc.fr/~varolg" target="_blank"><nobr>Gül Varol</nobr></a>
</h2>

<h2 style='text-align: center'>
<nobr>arXiv 2023</nobr>
</h2>

<h3 style="text-align:center;">
<a target="_blank" href="https://arxiv.org/abs/2305.00976"> <button type="button" class="btn btn-primary btn-lg"> Paper </button></a>
<a target="_blank" href="https://github.com/Mathux/TMR"> <button type="button" class="btn btn-primary btn-lg"> Code </button></a>
<a target="_blank" href="https://mathis.petrovich.fr/tmr"> <button type="button" class="btn btn-primary btn-lg"> Webpage </button></a>
<a target="_blank" href="https://mathis.petrovich.fr/tmr/tmr.bib"> <button type="button" class="btn btn-primary btn-lg"> BibTex </button></a>
</h3>

<h3> Description </h3>
<p>
This space illustrates <a href='https://mathis.petrovich.fr/tmr/' target='_blank'><b>TMR</b></a>, a method for text-to-motion retrieval. Given a gallery of 3D human motions (which can be unseen during training) and a text query, the goal is to search for motions which are close to the text query.
</p>
</div>
"""

EXAMPLES = [
    "A person is walking slowly",
    "A person is walking in a circle",
    "A person is jumping rope",
    "Someone is doing a backflip",
    "A person is doing a moonwalk",
    "A person walks forward and then turns back",
    "Picking up an object",
    "A person is swimming in the sea",
    "A human is squatting",
    "Someone is jumping with one foot",
    "A person is chopping vegetables",
    "Someone walks backward",
    "Somebody is ascending a staircase",
    "A person is sitting down",
    "A person is taking the stairs",
    "Someone is doing jumping jacks",
    "The person walked forward and is picking up his toolbox",
    "The person angrily punching the air"
]

# Show closest text in the training


# css to make videos look nice
# var(--block-border-color);
CSS = """
.retrieved_video {
    position: relative;
    margin: 0;
    box-shadow: var(--block-shadow);
    border-width: var(--block-border-width);
    border-color: #000000;
    border-radius: var(--block-radius);
    background: var(--block-background-fill);
    width: 100%;
    line-height: var(--line-sm);
}

.contour_video {
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    z-index: var(--layer-5);
    border-radius: var(--block-radius);
    background: var(--background-fill-primary);
    padding: 0 var(--size-6);
    max-height: var(--size-screen-h);
    overflow: hidden;
}
"""


DEFAULT_TEXT = "A person is "


def humanml3d_keyid_to_babel_rendered_url(h3d_index, amass_to_babel, keyid):
    # Don't show the mirrored version of HumanML3D
    if "M" in keyid:
        return None

    dico = h3d_index[keyid]
    path = dico["path"]

    # HumanAct12 motions are not rendered online
    # so we skip them for now
    if "humanact12" in path:
        return None

    # This motion is not rendered in BABEL
    # so we skip them for now
    if path not in amass_to_babel:
        return None

    babel_id = amass_to_babel[path].zfill(6)
    url = f"https://babel-renders.s3.eu-central-1.amazonaws.com/{babel_id}.mp4"

    # For the demo, we retrieve from the first annotation only
    ann = dico["annotations"][0]
    start = ann["start"]
    end = ann["end"]
    text = ann["text"]

    data = {
        "url": url,
        "start": start,
        "end": end,
        "text": text,
        "keyid": keyid,
        "babel_id": babel_id,
        "path": path
    }

    return data


def retrieve(model, keyid_to_url, all_unit_motion_embs, all_keyids, text, splits=["test"], nmax=8):
    unit_motion_embs = torch.cat([all_unit_motion_embs[s] for s in splits])
    keyids = np.concatenate([all_keyids[s] for s in splits])

    scores = model.compute_scores(text, unit_embs=unit_motion_embs)

    sorted_idxs = np.argsort(-scores)
    best_keyids = keyids[sorted_idxs]
    best_scores = scores[sorted_idxs]

    datas = []
    for keyid, score in zip(best_keyids, best_scores):
        if len(datas) == nmax:
            break

        data = keyid_to_url(keyid)
        if data is None:
            continue
        data["score"] = round(float(score), 2)
        datas.append(data)
    return datas


# HTML component
def get_video_html(data, video_id, width=700, height=700):
    url = data["url"]
    start = data["start"]
    end = data["end"]
    score = data["score"]
    text = data["text"]
    keyid = data["keyid"]
    babel_id = data["babel_id"]
    path = data["path"]

    trim = f"#t={start},{end}"
    title = f'''Score = {score}

Corresponding text: {text}

HumanML3D keyid: {keyid}

BABEL keyid: {babel_id}

AMASS path: {path}'''

    # class="wrap default svelte-gjihhp hide"
    # <div class="contour_video" style="position: absolute; padding: 10px;">
    # width="{width}" height="{height}"
    video_html = f'''
<video class="retrieved_video" width="{width}" height="{height}" preload="auto" muted playsinline onpause="this.load()"
autoplay loop disablepictureinpicture id="{video_id}" title="{title}">
  <source src="{url}{trim}" type="video/mp4">
  Your browser does not support the video tag.
</video>
'''
    return video_html


def retrieve_component(retrieve_function, text, splits_choice, nvids, n_component=24):
    if text == DEFAULT_TEXT or text == "" or text is None:
        return [None for _ in range(n_component)]

    # cannot produce more than n_component
    nvids = min(nvids, n_component)

    if "Unseen" in splits_choice:
        splits = ["test"]
    else:
        splits = ["train", "val", "test"]

    datas = retrieve_function(text, splits=splits, nmax=nvids)
    htmls = [get_video_html(data, idx) for idx, data in enumerate(datas)]
    # get n_component exactly if asked less
    # pad with dummy blocks
    htmls = htmls + [None for _ in range(max(0, n_component - nvids))]
    return htmls


if not os.path.exists("data"):
    gdown.download_folder("https://drive.google.com/drive/folders/1MgPFgHZ28AMd01M1tJ7YW_1-ut3-4j08",
                          use_cookies=False)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# LOADING
model = load_model(device)
splits = ["train", "val", "test"]
all_unit_motion_embs = load_unit_motion_embs_splits(splits, device)
all_keyids = load_keyids_splits(splits)

h3d_index = load_json("amass-annotations/humanml3d.json")
amass_to_babel = load_json("amass-annotations/amass_to_babel.json")

keyid_to_url = partial(humanml3d_keyid_to_babel_rendered_url, h3d_index, amass_to_babel)
retrieve_function = partial(retrieve, model, keyid_to_url, all_unit_motion_embs, all_keyids)

# DEMO
theme = gr.themes.Default(primary_hue="blue", secondary_hue="gray")
retrieve_and_show = partial(retrieve_component, retrieve_function)

with gr.Blocks(css=CSS, theme=theme) as demo:
    gr.Markdown(WEBSITE)
    videos = []

    with gr.Row():
        with gr.Column(scale=3):
            with gr.Column(scale=2):
                text = gr.Textbox(placeholder="Type the motion you want to search with a sentence",
                                  show_label=True, label="Text prompt", value=DEFAULT_TEXT)
            with gr.Column(scale=1):
                btn = gr.Button("Retrieve", variant='primary')
                clear = gr.Button("Clear", variant='secondary')

            with gr.Row():
                with gr.Column(scale=1):
                    splits_choice = gr.Radio(["All motions", "Unseen motions"], label="Gallery of motion",
                                             value="All motions",
                                             info="The motion gallery is coming from HumanML3D")

                with gr.Column(scale=1):
                    # nvideo_slider = gr.Slider(minimum=4, maximum=24, step=4, value=8, label="Number of videos")
                    nvideo_slider = gr.Radio([4, 8, 12, 16, 24], label="Videos",
                                             value=8,
                                             info="Number of videos to display")

        with gr.Column(scale=2):
            def retrieve_example(text, splits_choice, nvideo_slider):
                return retrieve_and_show(text, splits_choice, nvideo_slider)

            examples = gr.Examples(examples=[[x, None, None] for x in EXAMPLES],
                                   inputs=[text, splits_choice, nvideo_slider],
                                   examples_per_page=20,
                                   run_on_click=False, cache_examples=False,
                                   fn=retrieve_example, outputs=[])

    i = -1
    # should indent
    for _ in range(6):
        with gr.Row():
            for _ in range(4):
                i += 1
                video = gr.HTML()
                videos.append(video)

    # connect the examples to the output
    # a bit hacky
    examples.outputs = videos

    def load_example(example_id):
        processed_example = examples.non_none_processed_examples[example_id]
        return gr.utils.resolve_singleton(processed_example)

    examples.dataset.click(
        load_example,
        inputs=[examples.dataset],
        outputs=examples.inputs_with_examples,  # type: ignore
        show_progress=False,
        postprocess=False,
        queue=False,
    ).then(
        fn=retrieve_example,
        inputs=examples.inputs,
        outputs=videos
    )

    btn.click(fn=retrieve_and_show, inputs=[text, splits_choice, nvideo_slider], outputs=videos)
    text.submit(fn=retrieve_and_show, inputs=[text, splits_choice, nvideo_slider], outputs=videos)
    splits_choice.change(fn=retrieve_and_show, inputs=[text, splits_choice, nvideo_slider], outputs=videos)
    nvideo_slider.change(fn=retrieve_and_show, inputs=[text, splits_choice, nvideo_slider], outputs=videos)

    def clear_videos():
        return [None for x in range(24)] + [DEFAULT_TEXT]

    clear.click(fn=clear_videos, outputs=videos + [text])

demo.launch()
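A note on the `#t={start},{end}` suffix built in get_video_html above: it is a standard media-fragment URI, so the browser plays only the annotated segment of each BABEL render and no server-side trimming is needed. The snippet below is a self-contained illustration of that markup with placeholder values; it is not part of the Space.

# illustration only: placeholder URL and timestamps, not real retrieval output
start, end = 1.0, 4.5
url = "https://example.com/render.mp4"
html = f'<video muted autoplay loop><source src="{url}#t={start},{end}" type="video/mp4"></video>'
print(html)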
load.py
ADDED
@@ -0,0 +1,53 @@
import os
import orjson
import torch
import numpy as np
from model import TMR_textencoder

EMBS = "data/unit_motion_embs"


def load_json(path):
    with open(path, "rb") as ff:
        return orjson.loads(ff.read())


def load_keyids(split):
    path = os.path.join(EMBS, f"{split}.keyids")
    with open(path) as ff:
        keyids = np.array([x.strip() for x in ff.readlines()])
    return keyids


def load_keyids_splits(splits):
    return {
        split: load_keyids(split)
        for split in splits
    }


def load_unit_motion_embs(split, device):
    path = os.path.join(EMBS, f"{split}_motion_embs_unit.npy")
    tensor = torch.from_numpy(np.load(path)).to(device)
    return tensor


def load_unit_motion_embs_splits(splits, device):
    return {
        split: load_unit_motion_embs(split, device)
        for split in splits
    }


def load_model(device):
    text_params = {
        'latent_dim': 256, 'ff_size': 1024, 'num_layers': 6, 'num_heads': 4,
        'activation': 'gelu', 'modelpath': 'distilbert-base-uncased'
    }
    "unit_motion_embs"
    model = TMR_textencoder(**text_params)
    state_dict = torch.load("data/textencoder.pt", map_location=device)
    # load values for the transformer only
    model.load_state_dict(state_dict, strict=False)
    model = model.eval()
    return model
model.py
ADDED
@@ -0,0 +1,128 @@
from typing import List
import torch.nn as nn
import os

import torch
import numpy as np
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from transformers import logging
from torch.nn.functional import normalize


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)

        self.register_buffer('pe', pe, persistent=False)

    def forward(self, x):
        return x + self.pe[:x.shape[0], :]


class TMR_textencoder(nn.Module):
    def __init__(self, modelpath: str, latent_dim: int, ff_size: int,
                 num_layers: int, num_heads: int, activation: str, **kwargs) -> None:
        super().__init__()

        logging.set_verbosity_error()

        # Tokenizer
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        self.tokenizer = AutoTokenizer.from_pretrained(modelpath)

        # Text model
        self.text_model = AutoModel.from_pretrained(modelpath)
        # Then configure the model
        self.text_encoded_dim = self.text_model.config.hidden_size

        # Projection of the text-outputs into the latent space
        self.projection = nn.Sequential(
            nn.ReLU(),
            nn.Linear(self.text_encoded_dim, latent_dim)
        )

        self.mu_token = nn.Parameter(torch.randn(latent_dim))
        self.logvar_token = nn.Parameter(torch.randn(latent_dim))
        self.sequence_pos_encoding = PositionalEncoding(latent_dim)

        seq_trans_encoder_layer = nn.TransformerEncoderLayer(d_model=latent_dim,
                                                             nhead=num_heads,
                                                             dim_feedforward=ff_size,
                                                             dropout=0.0,
                                                             activation=activation)
        self.seqTransEncoder = nn.TransformerEncoder(
            seq_trans_encoder_layer,
            num_layers=num_layers
        )

    def get_last_hidden_state(self, texts: List[str],
                              return_mask: bool = False):
        encoded_inputs = self.tokenizer(texts, return_tensors="pt", padding=True)
        output = self.text_model(**encoded_inputs.to(self.text_model.device))
        if not return_mask:
            return output.last_hidden_state
        return output.last_hidden_state, encoded_inputs.attention_mask.to(dtype=bool)

    def forward(self, texts: List[str]) -> Tensor:
        text_encoded, mask = self.get_last_hidden_state(texts, return_mask=True)

        x = self.projection(text_encoded)
        bs, nframes, _ = x.shape
        # bs, nframes, totjoints, nfeats = x.shape
        # Switch sequence and batch_size because the input of
        # Pytorch Transformer is [Sequence, Batch size, ...]
        x = x.permute(1, 0, 2)  # now it is [nframes, bs, latent_dim]

        mu_token = torch.tile(self.mu_token, (bs,)).reshape(bs, -1)
        logvar_token = torch.tile(self.logvar_token, (bs,)).reshape(bs, -1)

        # adding the distribution tokens for all sequences
        xseq = torch.cat((mu_token[None], logvar_token[None], x), 0)

        # create a bigger mask, to allow attending to mu and logvar
        token_mask = torch.ones((bs, 2), dtype=bool, device=x.device)
        aug_mask = torch.cat((token_mask, mask), 1)

        # add positional encoding
        xseq = self.sequence_pos_encoding(xseq)
        final = self.seqTransEncoder(xseq, src_key_padding_mask=~aug_mask)

        # only mu for inference
        mu = final[0]
        return mu

    # compute score for retrieval
    def compute_scores(self, texts, unit_embs=None, embs=None):
        # not both empty
        assert not (unit_embs is None and embs is None)
        # not both filled
        assert not (unit_embs is not None and embs is not None)

        output_str = False
        # if one input, squeeze the output
        if isinstance(texts, str):
            texts = [texts]
            output_str = True

        # compute unit_embs from embs if not given
        if embs is not None:
            unit_embs = normalize(embs)

        with torch.no_grad():
            latent_unit_texts = normalize(self(texts))
            # compute cosine similarity between 0 and 1
            scores = (unit_embs @ latent_unit_texts.T).T / 2 + 0.5
        scores = scores.cpu().numpy()

        if output_str:
            scores = scores[0]

        return scores
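To make the scoring convention concrete: compute_scores rescales cosine similarity from [-1, 1] to [0, 1]. The sketch below is an illustration only, with randomly initialized projection and transformer weights rather than the pretrained data/textencoder.pt; it scores one query against random unit-norm "motion" embeddings simply to show the expected shapes and value range.

import torch
from torch.nn.functional import normalize
from model import TMR_textencoder

# same hyper-parameters as in load.py; weights here are untrained, so the scores are meaningless
model = TMR_textencoder(modelpath="distilbert-base-uncased", latent_dim=256, ff_size=1024,
                        num_layers=6, num_heads=4, activation="gelu").eval()

unit_motion_embs = normalize(torch.randn(10, 256))  # 10 fake motions on the unit sphere
scores = model.compute_scores("A person is walking", unit_embs=unit_motion_embs)
print(scores.shape)                 # (10,): a single string query squeezes the batch dim
print(scores.min(), scores.max())   # all values lie in [0, 1]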
requirements.txt
ADDED
@@ -0,0 +1,5 @@
torch
orjson
numpy
gdown
transformers