lighthouse-emnlp2024 committed
Commit 5a8f38f · 1 Parent(s): b118f8c

Add scripts

README.md ADDED
@@ -0,0 +1,54 @@
+ ---
+ license: apache-2.0
+ datasets:
+ - lighthouse-emnlp2024/Clotho-Moment
+ language:
+ - en
+ ---
+ # Audio Moment-DETR
+ This is an Audio Moment DETR (AM-DETR) model, proposed in "Language-based Audio Moment Retrieval".
+ Given a text query, AM-DETR retrieves the audio segments relevant to that query from a long audio recording.
+
+ ## Install
+ Installing [Lighthouse](https://github.com/line/lighthouse) is required.
+ ```bash
+ pip install 'git+https://github.com/line/lighthouse.git'
+ ```
+
+ ## Sample script
+ ```python
+ import io
+ import requests
+
+ import torch
+ from transformers import AutoModel, AutoConfig
+
+
+ repo_id = "lighthouse-emnlp2024/AM-DETR"
+
+ # Load the remote-code config and run inference on CPU.
+ config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
+ config.device = "cpu"
+ model = AutoModel.from_pretrained(repo_id, config=config, trust_remote_code=True)
+
+ # Fetch an example recording from the Lighthouse repository.
+ audio_bytes = io.BytesIO(requests.get('https://github.com/line/lighthouse/raw/refs/heads/main/api_example/1a-ODBWMUAE.wav').content)
+ query = "Heavy rain falls"
+
+ # Encode the audio, then retrieve the moments matching the query.
+ feats = model.encode_audio(audio_path=audio_bytes)
+ prediction = model.predict(query, feats)
+ for start, end, score in prediction["pred_relevant_windows"]:
+     print(f"Moment, Score: {start:05.2f} - {end:05.2f}, {score:.2f}")
+ ```
+
+ ## Citation
+ ```bibtex
+ @inproceedings{munakata2025language,
+   title={Language-based Audio Moment Retrieval},
+   author={Munakata, Hokuto and Nishimura, Taichi and Nakada, Shota and Komatsu, Tatsuya},
+   booktitle={ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+   pages={1--5},
+   year={2025},
+   organization={IEEE}
+ }
+ ```
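
The config's `device` field controls where the wrapper runs; its default in `configuration_amdetr.py` is `cuda`, while this checkpoint's `config.json` pins `cpu`. A minimal sketch of the same loading code that selects a GPU when one is available:

```python
import torch
from transformers import AutoConfig, AutoModel

repo_id = "lighthouse-emnlp2024/AM-DETR"

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
# Fall back to CPU when no GPU is present.
config.device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(repo_id, config=config, trust_remote_code=True)
```
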
config.json ADDED
@@ -0,0 +1,79 @@
+ {
+   "auto_map": {
+     "AutoConfig": "configuration_amdetr.AMDETRConfig",
+     "AutoModel": "modeling_amdetr.AMDETRPredictorWrapper"
+   },
+   "a_feat_dim": 768,
+   "a_feat_dirs": [
+     "features/clotho-moment/clap"
+   ],
+   "a_feat_types": "clap",
+   "architectures": [
+     "AMDETRPredictorWrapper"
+   ],
+   "aux_loss": true,
+   "bsz": 32,
+   "ckpt_filename": "best.ckpt",
+   "ckpt_filepath": "results/qd_detr/clotho-moment/clap/best.ckpt",
+   "clip_length": 1,
+   "ctx_mode": "audio_tef",
+   "dec_layers": 2,
+   "device": "cpu",
+   "dim_feedforward": 1024,
+   "dropout": 0.1,
+   "dset_name": "clotho-moment",
+   "ema_decay": 0.9,
+   "enc_layers": 2,
+   "eos_coef": 0.1,
+   "eval_bsz": 100,
+   "eval_epoch_interval": 1,
+   "eval_log_filename": "val.log",
+   "eval_log_filepath": "results/qd_detr/clotho-moment/clap/val.log",
+   "eval_log_txt_formatter": "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str} [Metrics] {eval_metrics_str}\n",
+   "eval_path": "data/clotho_moment/clotho_moment_val_release.jsonl",
+   "eval_split_name": "val",
+   "giou_loss_coef": 1,
+   "grad_clip": 0.1,
+   "hidden_dim": 256,
+   "input_dropout": 0.5,
+   "kwargs": {},
+   "label_loss_coef": 4,
+   "lr": 0.0001,
+   "lr_drop": 400,
+   "lw_saliency": 1,
+   "max_es_cnt": 200,
+   "max_q_l": 32,
+   "max_v_l": 75,
+   "max_windows": 5,
+   "model_ema": false,
+   "model_name": "qd_detr",
+   "model_type": "amdetr",
+   "n_epoch": 100,
+   "n_input_proj": 2,
+   "nheads": 8,
+   "num_queries": 10,
+   "num_workers": 4,
+   "position_embedding": "sine",
+   "results_dir": "results/qd_detr/clotho-moment/clap",
+   "saliency_margin": 0.2,
+   "seed": 2023,
+   "set_cost_class": 4,
+   "set_cost_giou": 1,
+   "set_cost_span": 10,
+   "span_loss_coef": 10,
+   "span_loss_type": "l1",
+   "t_feat_dim": 768,
+   "t_feat_dir": "features/clotho-moment/clap_text",
+   "t_feat_dir_pretrain_eval": null,
+   "t_feat_type": "clap",
+   "torch_dtype": "float32",
+   "train_log_filename": "train.log",
+   "train_log_filepath": "results/qd_detr/clotho-moment/clap/train.log",
+   "train_log_txt_formatter": "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n",
+   "train_path": "data/clotho_moment/clotho_moment_train_release.jsonl",
+   "transformers_version": "4.51.3",
+   "v_feat_dim": 2,
+   "v_feat_dirs": null,
+   "v_feat_types": null,
+   "wd": 0.0001
+ }
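
Since `auto_map` routes `AutoConfig` to `configuration_amdetr.AMDETRConfig`, every key above becomes a plain attribute that can be inspected (or overridden) before the model is instantiated. A short sketch, with the expected values taken from the JSON above:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("lighthouse-emnlp2024/AM-DETR", trust_remote_code=True)

print(config.model_name)   # qd_detr (the underlying detector)
print(config.num_queries)  # 10 moment queries per forward pass
print(config.a_feat_dim)   # 768, the CLAP audio-feature dimension
```
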
configuration_amdetr.py ADDED
@@ -0,0 +1,113 @@
+ """
+ Copyright $today.year LY Corporation
+ LY Corporation licenses this file to you under the Apache License,
+ version 2.0 (the "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at:
+ https://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ License for the specific language governing permissions and limitations
+ under the License.
+ Moment-DETR (https://github.com/jayleicn/moment_detr)
+ Copyright (c) 2021 Jie Lei
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ """
+ from transformers import PretrainedConfig
+
+
+ class AMDETRConfig(PretrainedConfig):
+     model_type = "amdetr"
+
+     def __init__(
+         self,
+         seed: int = 2023,
+         device: str = "cuda",
+         num_workers: int = 4,
+         lr: float = 0.0001,
+         lr_drop: int = 400,
+         wd: float = 0.0001,
+         n_epoch: int = 100,
+         max_es_cnt: int = 200,
+         bsz: int = 32,
+         eval_bsz: int = 100,
+         grad_clip: float = 0.1,
+         max_q_l: int = 32,
+         max_v_l: int = 75,
+         max_windows: int = 5,
+         clip_length: int = 1,
+         eval_epoch_interval: int = 1,
+         position_embedding: str = "sine",
+         enc_layers: int = 2,
+         dec_layers: int = 2,
+         dim_feedforward: int = 1024,
+         hidden_dim: int = 256,
+         input_dropout: float = 0.5,
+         dropout: float = 0.1,
+         nheads: int = 8,
+         num_queries: int = 10,
+         n_input_proj: int = 2,
+         saliency_margin: float = 0.2,
+         span_loss_type: str = "l1",
+         set_cost_span: int = 10,
+         set_cost_giou: int = 1,
+         set_cost_class: int = 4,
+         span_loss_coef: int = 10,
+         giou_loss_coef: int = 1,
+         label_loss_coef: int = 4,
+         eos_coef: float = 0.1,
+         lw_saliency: int = 1,
+         ckpt_filename: str = "best.ckpt",
+         train_log_filename: str = "train.log",
+         eval_log_filename: str = "val.log",
+         eval_split_name: str = "val",
+         aux_loss: bool = True,
+         model_ema: bool = False,
+         ema_decay: float = 0.9,
+         results_dir: str = "results/qd_detr/clotho-moment/clap",
+         ctx_mode: str = "audio_tef",
+         v_feat_types: None = None,
+         a_feat_types: str = "clap",
+         t_feat_type: str = "clap",
+         v_feat_dim: int = 2,
+         a_feat_dim: int = 768,
+         t_feat_dim: int = 768,
+         model_name: str = "qd_detr",
+         dset_name: str = "clotho-moment",
+         train_path: str = "data/clotho_moment/clotho_moment_train_release.jsonl",
+         eval_path: str = "data/clotho_moment/clotho_moment_val_release.jsonl",
+         ckpt_filepath: str = "results/qd_detr/clotho-moment/clap/best.ckpt",
+         train_log_filepath: str = "results/qd_detr/clotho-moment/clap/train.log",
+         eval_log_filepath: str = "results/qd_detr/clotho-moment/clap/val.log",
+         v_feat_dirs: None = None,
+         t_feat_dir: str = "features/clotho-moment/clap_text",
+         a_feat_dirs: list = ['features/clotho-moment/clap'],
+         t_feat_dir_pretrain_eval: None = None,
+         train_log_txt_formatter: str = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n",
+         eval_log_txt_formatter: str = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str} [Metrics] {eval_metrics_str}\n",
+         **kwargs
+     ) -> None:
+         # Register every constructor argument (except self) as a config attribute.
+         args_and_values = locals()
+         for arg_name, arg_value in args_and_values.items():
+             if arg_name != 'self':
+                 setattr(self, arg_name, arg_value)
+         super().__init__(**kwargs)
+
+
+ if __name__ == "__main__":
+     cfg = AMDETRConfig()
+     print(cfg)
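
Because `__init__` copies everything in `locals()` onto the instance, any of the fields above can be overridden as a keyword argument. A small usage sketch (importing the module directly; when loading from the Hub the class is resolved through `trust_remote_code` instead):

```python
from configuration_amdetr import AMDETRConfig

# Every constructor argument becomes an attribute on the config object.
cfg = AMDETRConfig(device="cpu", num_queries=5)
print(cfg.device)       # cpu
print(cfg.num_queries)  # 5
```
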
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:752a87a2db7c197970cd108bcc4952fc627d7e91dfe5027d162781ea0b734f3e
+ size 28513996
modeling_amdetr.py ADDED
@@ -0,0 +1,67 @@
+ """
+ Copyright $today.year LY Corporation
+ LY Corporation licenses this file to you under the Apache License,
+ version 2.0 (the "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at:
+ https://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ License for the specific language governing permissions and limitations
+ under the License.
+ Moment-DETR (https://github.com/jayleicn/moment_detr)
+ Copyright (c) 2021 Jie Lei
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ """
+ from typing import Dict, List, Optional
+
+ import torch
+ from transformers import PreTrainedModel
+ from lighthouse.feature_extractor.audio_encoder import AudioEncoder
+ from lighthouse.feature_extractor.text_encoder import TextEncoder
+ from lighthouse.models import BasePredictor
+
+ from .configuration_amdetr import AMDETRConfig
+
+
+ class AMDETRPredictorWrapper(BasePredictor, PreTrainedModel):
+     config_class = AMDETRConfig
+
+     def __init__(self, config: AMDETRConfig, feature_name: str = "clap") -> None:
+         PreTrainedModel.__init__(self, config)
+         args = config
+         self._clip_len: float = args.clip_length
+         self._device: str = args.device
+         self._size = 224
+         self._moment_num = 10
+
+         self._model: torch.nn.Module = self._initialize_model(args, args.model_name)
+         self._model.eval()
+
+         self._feature_name: str = feature_name
+         self._model_name: str = args.model_name
+
+     def load_encoders(self) -> None:
+         self._vision_encoder = None
+         self._audio_encoder: AudioEncoder = self._initialize_audio_encoder(self._feature_name, pann_path=None)
+         self._text_encoder: TextEncoder = self._initialize_text_encoder(self._feature_name)
+
+     @torch.no_grad()
+     def encode_audio(self, audio_path: str) -> Dict[str, torch.Tensor]:
+         # Lazily build the encoders on first use, then delegate to BasePredictor.
+         if not hasattr(self, "_audio_encoder") or not hasattr(self, "_text_encoder"):
+             self.load_encoders()
+         return super().encode_audio(audio_path)
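
`encode_audio` builds the CLAP audio and text encoders lazily on its first call; invoking `load_encoders()` up front moves that one-time cost out of the first query. A sketch, assuming `model` was loaded as in the README and `recording.wav` is a placeholder path:

```python
# Eagerly instantiate the audio/text encoders (otherwise they are created
# on the first encode_audio() call).
model.load_encoders()

feats = model.encode_audio(audio_path="recording.wav")  # placeholder file
prediction = model.predict("Heavy rain falls", feats)
```
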
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:756a8aaf6db555da8342c2a2483545f4b4c2ab946f731f0f4eed774dd591c1f9
+ size 28552082