SaraAlthubaiti committed on
Commit
33ac3a6
·
verified ·
1 Parent(s): 3447959
Files changed (2) hide show
  1. decode_config.yaml +58 -0
  2. test_prompt.json +31 -0
decode_config.yaml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (2024) Tsinghua University, Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ model:
16
+ # paths to the pretrained LLM (llama_path), Whisper (whisper_path), and BEATs (beats_path) models
17
+ llama_path: "DeepSeek-R1-Distill-Qwen-1.5B/"
18
+ whisper_path: "distil-large-v3/"
19
+ beats_path: "BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt"
20
+
21
+ ckpt: "tiny_all_tasks_319.pth"
22
+
23
+ freeze_whisper: True
24
+ freeze_beats: True
25
+
26
+ # window-level Q-Former
27
+ use_speech_Qformer: True
28
+ freeze_speech_QFormer: False
29
+ window_level_Qformer: True
30
+ num_speech_query_token: 1
31
+ second_per_window: 0.333333
32
+ second_stride: 0.333333
33
+
34
+ speech_llama_proj_model: ""
35
+ freeze_speech_llama_proj: False
36
+
37
+ # LoRA
38
+ lora: True
39
+ lora_rank: 8
40
+ lora_alpha: 32
41
+ lora_dropout: 0.1
42
+
43
+ multi_prompt: True
44
+ prompt_template: "USER: {}\nASSISTANT:"
45
+ prompt_path: "prompts/train_prompt.json"
46
+ test_prompt_path: "prompts/test_prompt.json"
47
+ max_txt_len: 300
48
+ end_sym: "</s>"
49
+
50
+ generate:
51
+ max_new_tokens: 200
52
+ num_beams: 4
53
+ do_sample: False
54
+ min_length: 1
55
+ temperature: 1.0
56
+ top_p: 0.9
57
+ repetition_penalty: 1.0
58
+ length_penalty: 1.0
test_prompt.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "asr": "<Speech><SpeechHere></Speech> Recognize the speech and give me the transcription.",
3
+ "gender_recognition": "<Speech><SpeechHere></Speech> What is the gender of the speaker?",
4
+ "dialect_identification": "<Speech><SpeechHere></Speech> What is the dialect of the speaker?",
5
+ "asr_zh": "<Speech><SpeechHere></Speech> 请将语音中的内容写下来。",
6
+ "summarization": "<Speech><SpeechHere></Speech> Could you capture the main points of this audio in a short summary?",
7
+ "translation_ae": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into English.",
8
+ "asr_de": "<Speech><SpeechHere></Speech> Hören Sie sich die Rede an und schreiben Sie ihren Inhalt auf.",
9
+ "translation_ec": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into Chinese.",
10
+ "audiocaption": "<Speech><SpeechHere></Speech> Please describe the audio.",
11
+ "audiocaption_v2": "<Speech><SpeechHere></Speech> Please write down what you hear in the audio.",
12
+ "QA": "<Speech><SpeechHere></Speech> {}",
13
+ "gender_QA": "<Speech><SpeechHere></Speech> {}",
14
+ "phone_recognition": "<Speech><SpeechHere></Speech> Provide the phonetic transcription for the speech.",
15
+ "speech_query": "<Speech><SpeechHere></Speech> Please answer the question in detail.",
16
+ "emotion_recognition": "<Speech><SpeechHere></Speech> Describe the emotion of the speaker in one word.",
17
+ "lyrics_recognition": "<Speech><SpeechHere></Speech> Listen to the song and write down its content.",
18
+ "audio_speech_description": "<Speech><SpeechHere></Speech> Describe the speech and the background audio",
19
+ "speaker_verification": "<Speech><SpeechHere></Speech> Do you only hear the same person talking? Answer yes or no.",
20
+ "fluent_speech_audio": "<Speech><SpeechHere></Speech> Describe the background audio and the speech in a fluent sentence.",
21
+ "speech_separation": "<Speech><SpeechHere></Speech> Please write down what you hear each person say.",
22
+ "audio_story_telling": "<Speech><SpeechHere></Speech> Based on the audio, write a story in detail. Your story should be highly related to the audio.",
23
+ "speech_audio_query": "<Speech><SpeechHere></Speech> Please answer the speaker's question in detail based on the background sound.",
24
+ "slot_filling": "<Speech><SpeechHere></Speech> According to the speech, what is the {}?",
25
+ "music_description": "<Speech><SpeechHere></Speech> Listen to this music clip and describe the music.",
26
+ "translation_en2ja": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into Japanese.",
27
+ "translation_en2de": "<Speech><SpeechHere></Speech> Listen to the speech and translate it into German.",
28
+ "speech_audio_coreasoning": "<Speech><SpeechHere></Speech> Use your strong reasoning skills to answer the speaker's question in detail based on the background sound.",
29
+ "keywords": "<Speech><SpeechHere></Speech> Give me only three keywords of the text.",
30
+ "speaker_diarization_asr": "<Speech><SpeechHere></Speech> Please recognize each speaker and transcribe their speech content."
31
+ }