File size: 1,572 Bytes
33ac3a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69b9e45
33ac3a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Copyright (2024) Tsinghua University, Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model:
  # paths
  llama_path: "DeepSeek-R1-Distill-Qwen-1.5B/"
  whisper_path: "distil-whisper/distil-large-v3/"
  beats_path: "BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt"

  ckpt: "tiny_all_tasks_319.pth"

  freeze_whisper: True
  freeze_beats: True

  # window-level Q-Former
  use_speech_Qformer: True
  freeze_speech_QFormer: False
  window_level_Qformer: True
  num_speech_query_token: 1
  second_per_window: 0.333333
  second_stride: 0.333333

  speech_llama_proj_model: ""
  freeze_speech_llama_proj: False

  # LoRA
  lora: True
  lora_rank: 8
  lora_alpha: 32
  lora_dropout: 0.1

  multi_prompt: True
  prompt_template: "USER: {}\nASSISTANT:"
  prompt_path: "prompts/train_prompt.json"
  test_prompt_path: "prompts/test_prompt.json"
  max_txt_len: 300
  end_sym: "</s>"

generate:
  max_new_tokens: 200
  num_beams: 4
  do_sample: False
  min_length: 1
  temperature: 1.0
  top_p: 0.9
  repetition_penalty: 1.0
  length_penalty: 1.0