lighthouse-emnlp2024 committed
Commit 5a8f38f · 1 Parent(s): b118f8c

Add scripts

README.md ADDED
@@ -0,0 +1,54 @@
+ ---
+ license: apache-2.0
+ datasets:
+ - lighthouse-emnlp2024/Clotho-Moment
+ language:
+ - en
+ ---
+ # Audio Moment-DETR
+ This is an Audio Moment DETR (AM-DETR) model, proposed in "Language-based Audio Moment Retrieval".
+ Given a text query, AM-DETR retrieves the audio segments relevant to that query from a long audio recording.
+
+ ## Install
+ Installing [Lighthouse](https://github.com/line/lighthouse) is required.
+ ```bash
+ pip install 'git+https://github.com/line/lighthouse.git'
+ ```
+
+ ## Sample script
+ ```python
+ import io
+ import requests
+
+ import torch
+ from transformers import AutoModel, AutoConfig
+
+
+ repo_id = "lighthouse-emnlp2024/AM-DETR"
+
+ # Load the remote-code config and run inference on CPU.
+ config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
+ config.device = "cpu"
+ model = AutoModel.from_pretrained(repo_id, config=config, trust_remote_code=True)
+
+ # Fetch an example recording from the Lighthouse repository.
+ audio_bytes = io.BytesIO(requests.get('https://github.com/line/lighthouse/raw/refs/heads/main/api_example/1a-ODBWMUAE.wav').content)
+ query = "Heavy rain falls"
+
+ # Encode the audio, then retrieve the moments matching the query.
+ feats = model.encode_audio(audio_path=audio_bytes)
+ prediction = model.predict(query, feats)
+ for start, end, score in prediction["pred_relevant_windows"]:
+     print(f"Moment, Score: {start:05.2f} - {end:05.2f}, {score:.2f}")
+ ```
+
+ ## Citation
+ ```bibtex
+ @inproceedings{munakata2025language,
+   title={Language-based Audio Moment Retrieval},
+   author={Munakata, Hokuto and Nishimura, Taichi and Nakada, Shota and Komatsu, Tatsuya},
+   booktitle={ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+   pages={1--5},
+   year={2025},
+   organization={IEEE}
+ }
+ ```
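
The config's `device` field controls where the wrapper runs; its default in `configuration_amdetr.py` is `cuda`, while this checkpoint's `config.json` pins `cpu`. A minimal sketch of the same loading code that selects a GPU when one is available:

```python
import torch
from transformers import AutoConfig, AutoModel

repo_id = "lighthouse-emnlp2024/AM-DETR"

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
# Fall back to CPU when no GPU is present.
config.device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(repo_id, config=config, trust_remote_code=True)
```
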
config.json ADDED
@@ -0,0 +1,79 @@
+ {
+   "auto_map": {
+     "AutoConfig": "configuration_amdetr.AMDETRConfig",
+     "AutoModel": "modeling_amdetr.AMDETRPredictorWrapper"
+   },
+   "a_feat_dim": 768,
+   "a_feat_dirs": [
+     "features/clotho-moment/clap"
+   ],
+   "a_feat_types": "clap",
+   "architectures": [
+     "AMDETRPredictorWrapper"
+   ],
+   "aux_loss": true,
+   "bsz": 32,
+   "ckpt_filename": "best.ckpt",
+   "ckpt_filepath": "results/qd_detr/clotho-moment/clap/best.ckpt",
+   "clip_length": 1,
+   "ctx_mode": "audio_tef",
+   "dec_layers": 2,
+   "device": "cpu",
+   "dim_feedforward": 1024,
+   "dropout": 0.1,
+   "dset_name": "clotho-moment",
+   "ema_decay": 0.9,
+   "enc_layers": 2,
+   "eos_coef": 0.1,
+   "eval_bsz": 100,
+   "eval_epoch_interval": 1,
+   "eval_log_filename": "val.log",
+   "eval_log_filepath": "results/qd_detr/clotho-moment/clap/val.log",
+   "eval_log_txt_formatter": "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str} [Metrics] {eval_metrics_str}\n",
+   "eval_path": "data/clotho_moment/clotho_moment_val_release.jsonl",
+   "eval_split_name": "val",
+   "giou_loss_coef": 1,
+   "grad_clip": 0.1,
+   "hidden_dim": 256,
+   "input_dropout": 0.5,
+   "kwargs": {},
+   "label_loss_coef": 4,
+   "lr": 0.0001,
+   "lr_drop": 400,
+   "lw_saliency": 1,
+   "max_es_cnt": 200,
+   "max_q_l": 32,
+   "max_v_l": 75,
+   "max_windows": 5,
+   "model_ema": false,
+   "model_name": "qd_detr",
+   "model_type": "amdetr",
+   "n_epoch": 100,
+   "n_input_proj": 2,
+   "nheads": 8,
+   "num_queries": 10,
+   "num_workers": 4,
+   "position_embedding": "sine",
+   "results_dir": "results/qd_detr/clotho-moment/clap",
+   "saliency_margin": 0.2,
+   "seed": 2023,
+   "set_cost_class": 4,
+   "set_cost_giou": 1,
+   "set_cost_span": 10,
+   "span_loss_coef": 10,
+   "span_loss_type": "l1",
+   "t_feat_dim": 768,
+   "t_feat_dir": "features/clotho-moment/clap_text",
+   "t_feat_dir_pretrain_eval": null,
+   "t_feat_type": "clap",
+   "torch_dtype": "float32",
+   "train_log_filename": "train.log",
+   "train_log_filepath": "results/qd_detr/clotho-moment/clap/train.log",
+   "train_log_txt_formatter": "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n",
+   "train_path": "data/clotho_moment/clotho_moment_train_release.jsonl",
+   "transformers_version": "4.51.3",
+   "v_feat_dim": 2,
+   "v_feat_dirs": null,
+   "v_feat_types": null,
+   "wd": 0.0001
+ }
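
Since `auto_map` routes `AutoConfig` to `configuration_amdetr.AMDETRConfig`, every key above becomes a plain attribute that can be inspected (or overridden) before the model is instantiated. A short sketch, with the expected values taken from the JSON above:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("lighthouse-emnlp2024/AM-DETR", trust_remote_code=True)

print(config.model_name)   # qd_detr (the underlying detector)
print(config.num_queries)  # 10 moment queries per forward pass
print(config.a_feat_dim)   # 768, the CLAP audio-feature dimension
```
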
configuration_amdetr.py ADDED
@@ -0,0 +1,113 @@
+ """
+ Copyright $today.year LY Corporation
+ LY Corporation licenses this file to you under the Apache License,
+ version 2.0 (the "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at:
+ https://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ License for the specific language governing permissions and limitations
+ under the License.
+ Moment-DETR (https://github.com/jayleicn/moment_detr)
+ Copyright (c) 2021 Jie Lei
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ """
+ from transformers import PretrainedConfig
+
+
+ class AMDETRConfig(PretrainedConfig):
+     model_type = "amdetr"
+
+     def __init__(
+         self,
+         seed: int = 2023,
+         device: str = "cuda",
+         num_workers: int = 4,
+         lr: float = 0.0001,
+         lr_drop: int = 400,
+         wd: float = 0.0001,
+         n_epoch: int = 100,
+         max_es_cnt: int = 200,
+         bsz: int = 32,
+         eval_bsz: int = 100,
+         grad_clip: float = 0.1,
+         max_q_l: int = 32,
+         max_v_l: int = 75,
+         max_windows: int = 5,
+         clip_length: int = 1,
+         eval_epoch_interval: int = 1,
+         position_embedding: str = "sine",
+         enc_layers: int = 2,
+         dec_layers: int = 2,
+         dim_feedforward: int = 1024,
+         hidden_dim: int = 256,
+         input_dropout: float = 0.5,
+         dropout: float = 0.1,
+         nheads: int = 8,
+         num_queries: int = 10,
+         n_input_proj: int = 2,
+         saliency_margin: float = 0.2,
+         span_loss_type: str = "l1",
+         set_cost_span: int = 10,
+         set_cost_giou: int = 1,
+         set_cost_class: int = 4,
+         span_loss_coef: int = 10,
+         giou_loss_coef: int = 1,
+         label_loss_coef: int = 4,
+         eos_coef: float = 0.1,
+         lw_saliency: int = 1,
+         ckpt_filename: str = "best.ckpt",
+         train_log_filename: str = "train.log",
+         eval_log_filename: str = "val.log",
+         eval_split_name: str = "val",
+         aux_loss: bool = True,
+         model_ema: bool = False,
+         ema_decay: float = 0.9,
+         results_dir: str = "results/qd_detr/clotho-moment/clap",
+         ctx_mode: str = "audio_tef",
+         v_feat_types: None = None,
+         a_feat_types: str = "clap",
+         t_feat_type: str = "clap",
+         v_feat_dim: int = 2,
+         a_feat_dim: int = 768,
+         t_feat_dim: int = 768,
+         model_name: str = "qd_detr",
+         dset_name: str = "clotho-moment",
+         train_path: str = "data/clotho_moment/clotho_moment_train_release.jsonl",
+         eval_path: str = "data/clotho_moment/clotho_moment_val_release.jsonl",
+         ckpt_filepath: str = "results/qd_detr/clotho-moment/clap/best.ckpt",
+         train_log_filepath: str = "results/qd_detr/clotho-moment/clap/train.log",
+         eval_log_filepath: str = "results/qd_detr/clotho-moment/clap/val.log",
+         v_feat_dirs: None = None,
+         t_feat_dir: str = "features/clotho-moment/clap_text",
+         a_feat_dirs: list = ['features/clotho-moment/clap'],
+         t_feat_dir_pretrain_eval: None = None,
+         train_log_txt_formatter: str = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n",
+         eval_log_txt_formatter: str = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str} [Metrics] {eval_metrics_str}\n",
+         **kwargs
+     ) -> None:
+         # Register every constructor argument (except self) as a config attribute.
+         args_and_values = locals()
+         for arg_name, arg_value in args_and_values.items():
+             if arg_name != 'self':
+                 setattr(self, arg_name, arg_value)
+         super().__init__(**kwargs)
+
+
+ if __name__ == "__main__":
+     cfg = AMDETRConfig()
+     print(cfg)
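
Because `__init__` copies everything in `locals()` onto the instance, any of the fields above can be overridden as a keyword argument. A small usage sketch (importing the module directly; when loading from the Hub the class is resolved through `trust_remote_code` instead):

```python
from configuration_amdetr import AMDETRConfig

# Every constructor argument becomes an attribute on the config object.
cfg = AMDETRConfig(device="cpu", num_queries=5)
print(cfg.device)       # cpu
print(cfg.num_queries)  # 5
```
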
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:752a87a2db7c197970cd108bcc4952fc627d7e91dfe5027d162781ea0b734f3e
+ size 28513996
modeling_amdetr.py ADDED
@@ -0,0 +1,67 @@
+ """
+ Copyright $today.year LY Corporation
+ LY Corporation licenses this file to you under the Apache License,
+ version 2.0 (the "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at:
+ https://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ License for the specific language governing permissions and limitations
+ under the License.
+ Moment-DETR (https://github.com/jayleicn/moment_detr)
+ Copyright (c) 2021 Jie Lei
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ """
+ from typing import Dict, List, Optional
+
+ import torch
+ from transformers import PreTrainedModel
+ from lighthouse.feature_extractor.audio_encoder import AudioEncoder
+ from lighthouse.feature_extractor.text_encoder import TextEncoder
+ from lighthouse.models import BasePredictor
+
+ from .configuration_amdetr import AMDETRConfig
+
+
+ class AMDETRPredictorWrapper(BasePredictor, PreTrainedModel):
+     config_class = AMDETRConfig
+
+     def __init__(self, config: AMDETRConfig, feature_name: str = "clap") -> None:
+         PreTrainedModel.__init__(self, config)
+         args = config
+         self._clip_len: float = args.clip_length
+         self._device: str = args.device
+         self._size = 224
+         self._moment_num = 10
+
+         self._model: torch.nn.Module = self._initialize_model(args, args.model_name)
+         self._model.eval()
+
+         self._feature_name: str = feature_name
+         self._model_name: str = args.model_name
+
+     def load_encoders(self) -> None:
+         self._vision_encoder = None
+         self._audio_encoder: AudioEncoder = self._initialize_audio_encoder(self._feature_name, pann_path=None)
+         self._text_encoder: TextEncoder = self._initialize_text_encoder(self._feature_name)
+
+     @torch.no_grad()
+     def encode_audio(self, audio_path: str) -> Dict[str, torch.Tensor]:
+         # Lazily build the encoders on first use, then delegate to BasePredictor.
+         if not hasattr(self, "_audio_encoder") or not hasattr(self, "_text_encoder"):
+             self.load_encoders()
+         return super().encode_audio(audio_path)
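
`encode_audio` builds the CLAP audio and text encoders lazily on its first call; invoking `load_encoders()` up front moves that one-time cost out of the first query. A sketch, assuming `model` was loaded as in the README and `recording.wav` is a placeholder path:

```python
# Eagerly instantiate the audio/text encoders (otherwise they are created
# on the first encode_audio() call).
model.load_encoders()

feats = model.encode_audio(audio_path="recording.wav")  # placeholder file
prediction = model.predict("Heavy rain falls", feats)
```
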
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:756a8aaf6db555da8342c2a2483545f4b4c2ab946f731f0f4eed774dd591c1f9
+ size 28552082