Huck Yang committed on
Commit
920b3b4
·
1 Parent(s): eb75108

[baseline] qwen2-audio baseline

Browse files
qwen2_audio_baseline.zip DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6f7b6c031286a657c61ad662f48c10c26544bfbf4487e6063c2fdd474a54976
3
- size 11488
 
 
 
 
qwen2_audio_baseline/qwen_audio_test.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client
2
+ import json
3
+ import os
4
+ import time
5
+ import sys
6
+ from dashscope import MultiModalConversation
7
+ import random
8
+
9
def qwen_audio(prompt, audio_file_path):
    """Ask the qwen2-audio-instruct model a question about an audio clip.

    Parameters
    ----------
    prompt : str
        Instruction/question text sent together with the audio.
    audio_file_path : str
        Path (or URL) of the audio clip to analyze.

    Returns
    -------
    str
        Text of the first content item of the first choice in the response.
    """
    system_turn = {
        "role": "system",
        "content": [
            {"text": "You are a helpful assistant."}
        ],
    }
    user_turn = {
        "role": "user",
        "content": [{"audio": audio_file_path}, {"text": prompt}],
    }
    response = MultiModalConversation.call(
        model="qwen2-audio-instruct",
        messages=[system_turn, user_turn],
    )
    # NOTE(review): assumes a successful API call -- an error response would
    # make the 'output' lookup raise. Confirm upstream retry/handling.
    return response['output']['choices'][0]['message']['content'][0]['text']
24
+
25
+ if __name__ == '__main__':
26
+ json_path = sys.argv[1]
27
+ output_directory = "./dev_output_rest"
28
+ wav_path = json_path.split('.j')[0] + ".wav"
29
+ json_name = os.path.basename(json_path)
30
+ txt_file_name = os.path.splitext(json_name)[0] + ".txt"
31
+ output_path = os.path.join(output_directory, txt_file_name)
32
+ with open(json_path, 'r') as file:
33
+ QA_all = json.load(file)
34
+ question = QA_all["question"]
35
+ choices = QA_all["choice"]
36
+ element_count = len(choices)
37
+ if element_count == 4:
38
+ numbers = [0, 1, 2, 3]
39
+ random.shuffle(numbers)
40
+ if 'question_type' in QA_all and QA_all['question_type'] in ["sound counting", "sound detection", "audio tagging"]:
41
+ prompt = question + ' Please analyze the audio and provide the most accurate answer based on what you hear.'
42
+ else:
43
+ prompt = 'I want you to answer the question about the aduio. I will provide you with the question and four options. Your task is to generate the only correct option for the question. Here is the question: '+ question + ' and four options:' + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + ';' + choices[numbers[2]][3:] + ';' + choices[numbers[3]][3:] + '.'
44
+ if element_count == 3:
45
+ numbers = [0, 1, 2]
46
+ random.shuffle(numbers)
47
+ prompt = 'I want you to answer the question about the aduio. I will provide you with the question and four options. Your task is to generate the only correct option for the question. Here is the question: '+ question + ' and four options:' + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + ';' + choices[numbers[2]][3:] + '.' # ' question + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + ';' +choices[numbers[2]][3:] + ';' + choices[numbers[3]][3:] + '.' + ' Please analyze the audio and provide the most accurate answer based on what you hear.'
48
+ if element_count == 2:
49
+ numbers = [0, 1]
50
+ random.shuffle(numbers)
51
+ prompt = 'I want you to answer the question about the aduio. I will provide you with the question and four options. Your task is to generate the only correct option for the question. Here is the question: '+ question + ' and four options:' + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + '.' # ' question + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + ';' +choices[numbers[2]][3:] + ';' + choices[numbers[3]][3:] + '.' + ' Please analyze the audio and provide the most accurate answer based on what you hear.'
52
+
53
+ print(prompt)
54
+ answer_en = qwen_audio(prompt, wav_path)
55
+ txt_file = open(output_path, 'w')
56
+ print(answer_en)
57
+ txt_file.write(answer_en)
58
+ txt_file.close()
59
+
60
+
qwen2_audio_baseline/qwen_audio_test.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Driver for the qwen2-audio baseline: repeatedly stage unanswered questions,
# query the model for each, collect the answers, then score everything.
for i in {1..20}
do
  # Copy every wav/json pair that has no answer .txt yet into the staging dir.
  python ./wav_json_no_response.py
  staging_dir="./dev_data_rest"
  for json_file in "$staging_dir"/*.json; do
    # Skip the literal glob pattern when the directory matched nothing.
    if [ -f "$json_file" ]; then
      python3 ./qwen_audio_test.py "$json_file"
    fi
  done
  # Harvest this round's answers, then clear staging for the next pass.
  cp -r ./dev_output_rest/* ./dev_output/
  rm -f ./dev_output_rest/*
  rm -f ./dev_data_rest/*
done

# Score all collected answers against the ground truth.
python ./sentence_sim.py
qwen2_audio_baseline/requirements.txt ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.7.4.post0
2
+ aliyun-python-sdk-core==2.16.0
3
+ aliyun-python-sdk-kms==2.16.5
4
+ annotated-types==0.7.0
5
+ anyio==4.6.2.post1
6
+ asttokens==2.4.1
7
+ async-timeout==3.0.1
8
+ attrs==23.2.0
9
+ audioread==3.0.1
10
+ Brotli==1.1.0
11
+ certifi==2023.7.22
12
+ cffi==1.15.1
13
+ cftime==1.6.3
14
+ chardet==4.0.0
15
+ charset-normalizer==3.2.0
16
+ click==8.1.7
17
+ colorama==0.4.6
18
+ comm==0.2.2
19
+ conda-pack==0.7.1
20
+ contourpy==1.1.1
21
+ coverage==7.5.0
22
+ crcmod==1.7
23
+ cryptography==44.0.2
24
+ cycler==0.12.1
25
+ Cython==3.0.10
26
+ dashscope==1.22.0
27
+ decorator==5.1.1
28
+ einops==0.7.0
29
+ et_xmlfile==2.0.0
30
+ exceptiongroup==1.2.0
31
+ executing==2.0.1
32
+ fastapi==0.115.5
33
+ filelock==3.14.0
34
+ flake8==7.0.0
35
+ fonttools==4.43.1
36
+ fsspec==2024.3.1
37
+ future==1.0.0
38
+ fvcore==0.1.5.post20221221
39
+ h11==0.14.0
40
+ h5py==3.9.0
41
+ hear21passt==0.0.26
42
+ httpcore==1.0.7
43
+ httptools==0.6.4
44
+ httpx==0.28.1
45
+ huggingface-hub==0.29.3
46
+ idna==3.4
47
+ imageio==2.34.1
48
+ importlib-resources==6.1.0
49
+ inflate64==1.0.0
50
+ iniconfig==2.0.0
51
+ iopath==0.1.10
52
+ ipython==8.18.1
53
+ ipywidgets==8.1.2
54
+ jams==0.3.4
55
+ jedi==0.19.1
56
+ Jinja2==3.1.3
57
+ jmespath==0.10.0
58
+ joblib==1.3.2
59
+ jsonschema==4.21.1
60
+ jsonschema-specifications==2023.12.1
61
+ jupyterlab_widgets==3.0.10
62
+ kiwisolver==1.4.5
63
+ lazy_loader==0.3
64
+ librosa==0.10.1
65
+ llvmlite==0.41.1
66
+ lmdb==1.4.1
67
+ lxml==5.1.0
68
+ markdown-it-py==3.0.0
69
+ MarkupSafe==2.1.5
70
+ mat73==0.62
71
+ matplotlib==3.8.0
72
+ matplotlib-inline==0.1.6
73
+ mccabe==0.7.0
74
+ mdurl==0.1.2
75
+ mir-eval==0.7
76
+ modelscope==1.24.0
77
+ mpmath==1.3.0
78
+ msgpack==1.0.7
79
+ multidict==6.1.0
80
+ multivolumefile==0.2.3
81
+ mypy==1.10.0
82
+ mypy-extensions==1.0.0
83
+ netCDF4==1.6.5
84
+ networkx==3.2.1
85
+ numba==0.58.1
86
+ numpy==1.24.3
87
+ nvidia-cublas-cu12==12.4.5.8
88
+ nvidia-cuda-cupti-cu12==12.1.105
89
+ nvidia-cuda-nvrtc-cu12==12.1.105
90
+ nvidia-cuda-runtime-cu12==12.1.105
91
+ nvidia-cudnn-cu12==9.1.0.70
92
+ nvidia-cufft-cu12==11.0.2.54
93
+ nvidia-curand-cu12==10.3.2.106
94
+ nvidia-cusolver-cu12==11.6.1.9
95
+ nvidia-cusparse-cu12==12.1.0.106
96
+ nvidia-cusparselt-cu12==0.6.2
97
+ nvidia-ml-py3==7.352.0
98
+ nvidia-nccl-cu12==2.18.1
99
+ nvidia-nvjitlink-cu12==12.4.127
100
+ nvidia-nvtx-cu12==12.1.105
101
+ opencv-python==4.8.1.78
102
+ openpyxl==3.1.5
103
+ openxlab==0.1.2
104
+ oss2==2.17.0
105
+ packaging==24.2
106
+ pandas==2.2.1
107
+ parso==0.8.3
108
+ pexpect==4.9.0
109
+ Pillow==9.4.0
110
+ platformdirs==3.10.0
111
+ pluggy==1.5.0
112
+ pooch==1.7.0
113
+ portalocker==2.8.2
114
+ prompt-toolkit==3.0.43
115
+ propcache==0.2.1
116
+ protobuf==4.24.4
117
+ psutil==5.9.8
118
+ ptyprocess==0.7.0
119
+ pure-eval==0.2.2
120
+ py7zr==0.21.0
121
+ pybcj==1.0.2
122
+ pybind11==2.12.0
123
+ pycodestyle==2.11.1
124
+ pycparser==2.21
125
+ pycryptodome==3.21.0
126
+ pycryptodomex==3.20.0
127
+ pydantic==2.10.2
128
+ pydantic_core==2.27.1
129
+ pydub==0.25.1
130
+ pyflakes==3.2.0
131
+ Pygments==2.17.2
132
+ pyparsing==3.1.1
133
+ pyppmd==1.1.0
134
+ pyproject==1.3.1
135
+ pyroomacoustics==0.7.4
136
+ PySocks==1.7.1
137
+ pysofaconventions==0.1.5
138
+ pytest==8.2.0
139
+ python-dateutil==2.8.2
140
+ python-dotenv==1.0.1
141
+ python-multipart==0.0.17
142
+ python-telegram-bot==21.10
143
+ pytz==2023.4
144
+ PyYAML==6.0.1
145
+ pyzstd==0.15.10
146
+ referencing==0.35.0
147
+ regex==2023.12.25
148
+ requests==2.28.2
149
+ rich==13.4.2
150
+ rpds-py==0.18.0
151
+ sacrebleu==2.4.0
152
+ safetensors==0.5.3
153
+ scikit-learn==1.3.2
154
+ scipy==1.11.2
155
+ seaborn==0.13.2
156
+ sentence-transformers==3.4.1
157
+ sentencepiece==0.1.99
158
+ simpleaudio==1.0.4
159
+ six==1.16.0
160
+ sniffio==1.3.1
161
+ some-package==0.1
162
+ sortedcontainers==2.4.0
163
+ soundfile==0.12.1
164
+ soxr==0.3.7
165
+ spatialscaper==0.1.4
166
+ stack-data==0.6.3
167
+ starlette==0.41.3
168
+ sympy==1.13.1
169
+ tabulate==0.9.0
170
+ termcolor==2.4.0
171
+ texttable==1.7.0
172
+ thop==0.1.1.post2209072238
173
+ threadpoolctl==3.2.0
174
+ timm==0.9.16
175
+ tokenizers==0.21.1
176
+ tomli==2.0.1
177
+ torch==2.1.0
178
+ torchaudio==0.13.0
179
+ torchinfo==1.8.0
180
+ torchvision==0.14.0
181
+ tornado==5.1
182
+ tqdm==4.65.2
183
+ traitlets==5.14.2
184
+ transformers==4.50.3
185
+ triton==2.1.0
186
+ typing_extensions==4.12.2
187
+ tzdata==2024.1
188
+ urllib3==1.26.20
189
+ uvicorn==0.32.1
190
+ uvloop==0.21.0
191
+ watchfiles==1.0.0
192
+ wcwidth==0.2.13
193
+ webrtcvad==2.0.10
194
+ websocket-client==1.8.0
195
+ websockets==14.1
196
+ widgetsnbextension==4.0.10
197
+ ws4py==0.3.2
198
+ yacs==0.1.8
199
+ yarl==1.18.3
200
+ zipp==3.17.0
qwen2_audio_baseline/sentence_sim.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from transformers import AutoTokenizer, AutoModel
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from transformers import AutoTokenizer, AutoModel
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
def encode_number_to_char(number):
    """Translate a choice index into its answer letter.

    0 -> 'A', 1 -> 'B', 2 -> 'C', 3 -> 'D'; anything else -> None.
    """
    for index, letter in enumerate('ABCD'):
        if index == number:
            return letter
    return None
13
def mean_pooling(model_output, attention_mask):
    """Mean-pool token embeddings, counting only non-padding positions.

    model_output: tuple/ModelOutput whose first element holds the token
        embeddings, shape (batch, seq_len, hidden).
    attention_mask: (batch, seq_len) mask -- 1 for real tokens, 0 for padding.
    Returns sentence embeddings of shape (batch, hidden).
    """
    embeddings = model_output[0]
    # Broadcast the mask over the hidden dimension so padded tokens
    # contribute nothing to either the sum or the count.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = (embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against all-pad rows
    return summed / counts
18
+
19
+
20
# --- Configuration -----------------------------------------------------------
txt_dir = './dev_output'            # model answers, one .txt per question
json_dir = './dev_data'             # ground-truth QA records
show_detail_path = './show_detail'  # per-question human-readable reports
os.makedirs(show_detail_path, exist_ok=True)  # don't crash on a fresh checkout

# Join predictions (.txt) with their ground truth (.json) by file stem.
txt_files = {os.path.splitext(file)[0]: os.path.join(txt_dir, file)
             for file in os.listdir(txt_dir) if file.endswith('.txt')}
json_files = {os.path.splitext(file)[0]: os.path.join(json_dir, file)
              for file in os.listdir(json_dir) if file.endswith('.json')}

# NOTE(review): hard-coded local checkpoint path -- parameterize if shared.
tokenizer = AutoTokenizer.from_pretrained('/disk6/hyhong/QWen/Sbert_pretrain')
model = AutoModel.from_pretrained('/disk6/hyhong/QWen/Sbert_pretrain')

# File names embed a subset tag. Reported parts aggregate the subsets:
# PART 1 = -d-, PART 2 = -a- + -b- + -c-, PART 3 = -e-.
SUBSET_TAGS = ('-a-', '-b-', '-c-', '-d-', '-e-')
subset_total = {tag: 0 for tag in SUBSET_TAGS}
subset_correct = {tag: 0 for tag in SUBSET_TAGS}
question_num = 0
correct_num = 0

for txt_id, txt_path in txt_files.items():
    if txt_id not in json_files:
        continue
    with open(json_files[txt_id], 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    question_num += 1
    for tag in SUBSET_TAGS:
        if tag in txt_path:
            subset_total[tag] += 1

    # BUG FIX: `choices` was only assigned inside the tag branches, so a file
    # matching no tag crashed with NameError. Choices are stored as
    # "A. text"; [3:] strips the letter prefix before similarity scoring.
    choices = [choice[3:] for choice in json_data.get("choice", [])]

    with open(txt_path, 'r', encoding='utf-8') as txt_file:
        txt_sentence = txt_file.read().strip()

    question = json_data["question"]
    answer = json_data["answer"][0]

    detail_path = os.path.join(show_detail_path, os.path.basename(txt_path))
    with open(detail_path, 'w') as detail_txt:
        detail_txt.write(json_data["audio_url"] + '\n')
        detail_txt.write("Question:" + question + '\n')
        detail_txt.write("Choice:" + '\n')
        # BUG FIX: the old len==2/3/4 branches wrote only the first and last
        # choices (see the A/D-only show_detail files); write them all.
        for choice_line in json_data["choice"]:
            detail_txt.write(choice_line + '\n')
        detail_txt.write("Correct answer:" + answer + '\n')

        # Pick the choice whose SBERT embedding is most similar to the
        # model's free-text response; both sides get the question prefixed
        # for context.
        score = []
        for choice_text in choices:
            pair = [question + " Answer:" + txt_sentence,
                    question + " Answer:" + choice_text]
            encoded_input = tokenizer(pair, padding=True, truncation=True,
                                      return_tensors='pt')
            with torch.no_grad():
                model_output = model(**encoded_input)
            sentence_embeddings = mean_pooling(model_output,
                                               encoded_input['attention_mask'])
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
            # Dot product of L2-normalized vectors == cosine similarity.
            score.append(torch.dot(sentence_embeddings[0, :],
                                   sentence_embeddings[1, :]))

        answer_qwen = encode_number_to_char(score.index(max(score)))
        detail_txt.write("Model respond:" + txt_sentence + '\n')
        # encode_number_to_char returns None for indexes > 3; write a
        # placeholder instead of crashing on str concatenation.
        detail_txt.write("Model answer:" + (answer_qwen or '?'))

    if answer_qwen == answer:
        correct_num += 1
        for tag in SUBSET_TAGS:
            if tag in txt_path:
                subset_correct[tag] += 1


def _rate(correct, total):
    """Accuracy with a 0.0 fallback so empty subsets don't divide by zero."""
    return correct / total if total else 0.0


correct_rate_all = _rate(correct_num, question_num)
correct_rate_part1 = _rate(subset_correct['-d-'], subset_total['-d-'])
correct_rate_part2 = _rate(subset_correct['-a-'] + subset_correct['-b-'] + subset_correct['-c-'],
                           subset_total['-a-'] + subset_total['-b-'] + subset_total['-c-'])
correct_rate_part3 = _rate(subset_correct['-e-'], subset_total['-e-'])

print('Overall accuracy rate:')
print(correct_rate_all)

print('PART 1 accuracy rate:')
print(correct_rate_part1)
print('PART 2 accuracy rate:')
print(correct_rate_part2)
print('PART 3 accuracy rate:')
print(correct_rate_part3)
157
+
158
+
159
+
qwen2_audio_baseline/show_detail/fold-e-audio-00001.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./dev/audio_00001.wav
2
+ Question:What background sound enhances the calm atmosphere?
3
+ Choice:
4
+ A. Wind blowing
5
+
6
+
7
+ D. Chirping birds
8
+ Correct answer:B
9
+ Model respond:Raindrops.
10
+ Model answer:B
qwen2_audio_baseline/show_detail/fold-e-audio-00002.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./dev/audio_00002.wav
2
+ Question:What type of musical element is highlighted despite background noise?
3
+ Choice:
4
+ A. Soft violin harmony
5
+
6
+
7
+ D. Groovy bassline
8
+ Correct answer:D
9
+ Model respond:Despite background noise, the type of musical element that is highlighted is a slow piano melody.
10
+ Model answer:B
qwen2_audio_baseline/show_detail/fold2-a-0022.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-a-0022.wav
2
+ Question:How long does the keys jangling sound last in the audio clip?
3
+ Choice:
4
+ A. 1.2s
5
+
6
+
7
+ D. 2.4s
8
+ Correct answer:B
9
+ Model respond:The keys jangling sound lasts approximately from 9.54 seconds to 10.00 seconds, making it last for about 0.46 seconds.
10
+ Model answer:B
qwen2_audio_baseline/show_detail/fold2-a-0023.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-a-0023.wav
2
+ Question:How many times does the printer sound occur in the audio clip?
3
+ Choice:
4
+ A. 1
5
+
6
+
7
+ D. 4
8
+ Correct answer:B
9
+ Model respond:The printer sound occurs 12 times in the audio clip.
10
+ Model answer:A
qwen2_audio_baseline/show_detail/fold2-b-0098.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-b-0098.wav
2
+ Question:How many different sounds are present in the audio clip?
3
+ Choice:
4
+ A. 3
5
+
6
+
7
+ D. 2
8
+ Correct answer:B
9
+ Model respond:There are four distinct sounds present in the audio clip: a telephone ringtone at intervals (0.53,2.84) and (6.70,9.25), continuous background noise throughout (0.53,10.00), generic impact sounds at various points such as (3.08,3.23), (3.39,3.70), (3.89,4.05), (4.43,4.55), (4.71,4.82), (4.96,5.10), (5.24,5.50), (5.65,5.84), (5.99,6.13), (6.30,6.40), (6.53,6.66), (6.83,6.95), (7.11,7.21), (7.34,7.52), (7.65,7.75), (7.90,8.02), (8.15,8.27), (8.43,8.54), (8.70,8.83), (8.96,9.10), (9.28,9.40), (9.54,9.66), (9.82,9.95), and surface contact noises from (8.07,8.46).
10
+ Model answer:B
qwen2_audio_baseline/show_detail/fold2-b-0099.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-b-0099.wav
2
+ Question:What is the first occurring sound in the audio?
3
+ Choice:
4
+ A. Laughter
5
+
6
+
7
+ D. Keys dropping
8
+ Correct answer:C
9
+ Model respond:The first occurrence of an ambiguous sourceless sound effect is between 0.42 and 0.97 seconds.
10
+ Model answer:C
qwen2_audio_baseline/show_detail/fold2-c-0074.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-c-0074.wav
2
+ Question:At what time does the second occurrence of the scream sound start?
3
+ Choice:
4
+ A. 5.5s
5
+
6
+
7
+ D. 9.0s
8
+ Correct answer:B
9
+ Model respond:The second scream starts at 3.71 seconds.
10
+ Model answer:A
qwen2_audio_baseline/show_detail/fold2-c-0075.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-c-0075.wav
2
+ Question:What is the duration of the first scream sound in the audio?
3
+ Choice:
4
+ A. 2.2 seconds
5
+
6
+
7
+ D. 4.9 seconds
8
+ Correct answer:C
9
+ Model respond:The first scream sound in the audio lasts from 0.65 seconds to 2.48 seconds, making its duration approximately 1.83 seconds.
10
+ Model answer:C
qwen2_audio_baseline/show_detail/fold2-d-66002A01.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ../../local_audio_path/66002A01.wav
2
+ Question:Based on the acoustic characteristics of the sound recording, which of the following best describes the signal features observed in the spectrogram?
3
+ Choice:
4
+ A. A continuous high-frequency tone above 10 kHz with no significant modulation.
5
+
6
+
7
+ D. Broad, unmodulated noise spread evenly across all frequencies.
8
+ Correct answer:B
9
+ Model respond:The audio contains speech with corresponding timestamps (0.00,2.38) and background noise throughout the duration (0.00,10.00). There's also a moment of silence between 2.38 and 10.00.
10
+ Model answer:A
qwen2_audio_baseline/show_detail/fold2-d-66002A04.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ../../local_audio_path/66002A04.wav
2
+ Question:Based on the sound recording, which of the following most accurately describes the acoustic characteristics of the signal?
3
+ Choice:
4
+ A. A continuous low-frequency signal below 500 Hz with distinct horizontal patterns indicating harmonics.
5
+
6
+
7
+ D. Broad noise evenly distributed across all frequencies without noticeable harmonic structure.
8
+ Correct answer:A
9
+ Model respond:The audio contains sounds that resemble digestive stomach rumbles and background noise suggesting an indoor setting. There's also an instance of a man speaking in the foreground.
10
+ Model answer:B
qwen2_audio_baseline/wav_json_no_response.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import shutil

# Stage every wav/json pair that does not yet have a model answer (.txt)
# into the "rest" directory so the driver script can retry just those.
txt_dir = './dev_output'        # answers already produced
wav_json_dir = './dev_data'     # full source dataset (wav + json)
output_dir = './dev_data_rest'  # staging area for unanswered items

# BUG FIX: the staging directory was never created, so shutil.copy2 failed
# on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Set (not list) of answered stems: O(1) membership per file instead of O(n).
answered = {os.path.splitext(name)[0]
            for name in os.listdir(txt_dir) if name.endswith('.txt')}

for file in os.listdir(wav_json_dir):
    if not file.endswith(('.wav', '.json')):
        continue
    if os.path.splitext(file)[0] in answered:
        continue
    source_file_path = os.path.join(wav_json_dir, file)
    destination_file_path = os.path.join(output_dir, file)
    # copy2 preserves file metadata (timestamps) along with contents.
    shutil.copy2(source_file_path, destination_file_path)
    print(f'Copied: {file}')
print('Operation completed.')
17
+
18
+