Huck Yang committed on
Commit
920b3b4
·
1 Parent(s): eb75108

[baseline] qwen2-audio baseline

Browse files
qwen2_audio_baseline.zip DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6f7b6c031286a657c61ad662f48c10c26544bfbf4487e6063c2fdd474a54976
3
- size 11488
 
 
 
 
qwen2_audio_baseline/qwen_audio_test.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client
2
+ import json
3
+ import os
4
+ import time
5
+ import sys
6
+ from dashscope import MultiModalConversation
7
+ import random
8
+
9
def qwen_audio(prompt, audio_file_path):
    """Ask the qwen2-audio-instruct model a question about an audio clip.

    Parameters
    ----------
    prompt : str
        Instruction/question text sent together with the audio.
    audio_file_path : str
        Path (or URL) of the audio clip to analyze.

    Returns
    -------
    str
        Text of the first content item of the first choice in the response.
    """
    system_turn = {
        "role": "system",
        "content": [
            {"text": "You are a helpful assistant."}
        ],
    }
    user_turn = {
        "role": "user",
        "content": [{"audio": audio_file_path}, {"text": prompt}],
    }
    response = MultiModalConversation.call(
        model="qwen2-audio-instruct",
        messages=[system_turn, user_turn],
    )
    # NOTE(review): assumes a successful API call -- an error response would
    # make the 'output' lookup raise. Confirm upstream retry/handling.
    return response['output']['choices'][0]['message']['content'][0]['text']
24
+
25
+ if __name__ == '__main__':
26
+ json_path = sys.argv[1]
27
+ output_directory = "./dev_output_rest"
28
+ wav_path = json_path.split('.j')[0] + ".wav"
29
+ json_name = os.path.basename(json_path)
30
+ txt_file_name = os.path.splitext(json_name)[0] + ".txt"
31
+ output_path = os.path.join(output_directory, txt_file_name)
32
+ with open(json_path, 'r') as file:
33
+ QA_all = json.load(file)
34
+ question = QA_all["question"]
35
+ choices = QA_all["choice"]
36
+ element_count = len(choices)
37
+ if element_count == 4:
38
+ numbers = [0, 1, 2, 3]
39
+ random.shuffle(numbers)
40
+ if 'question_type' in QA_all and QA_all['question_type'] in ["sound counting", "sound detection", "audio tagging"]:
41
+ prompt = question + ' Please analyze the audio and provide the most accurate answer based on what you hear.'
42
+ else:
43
+ prompt = 'I want you to answer the question about the aduio. I will provide you with the question and four options. Your task is to generate the only correct option for the question. Here is the question: '+ question + ' and four options:' + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + ';' + choices[numbers[2]][3:] + ';' + choices[numbers[3]][3:] + '.'
44
+ if element_count == 3:
45
+ numbers = [0, 1, 2]
46
+ random.shuffle(numbers)
47
+ prompt = 'I want you to answer the question about the aduio. I will provide you with the question and four options. Your task is to generate the only correct option for the question. Here is the question: '+ question + ' and four options:' + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + ';' + choices[numbers[2]][3:] + '.' # ' question + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + ';' +choices[numbers[2]][3:] + ';' + choices[numbers[3]][3:] + '.' + ' Please analyze the audio and provide the most accurate answer based on what you hear.'
48
+ if element_count == 2:
49
+ numbers = [0, 1]
50
+ random.shuffle(numbers)
51
+ prompt = 'I want you to answer the question about the aduio. I will provide you with the question and four options. Your task is to generate the only correct option for the question. Here is the question: '+ question + ' and four options:' + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + '.' # ' question + choices[numbers[0]][3:] + ';' + choices[numbers[1]][3:] + ';' +choices[numbers[2]][3:] + ';' + choices[numbers[3]][3:] + '.' + ' Please analyze the audio and provide the most accurate answer based on what you hear.'
52
+
53
+ print(prompt)
54
+ answer_en = qwen_audio(prompt, wav_path)
55
+ txt_file = open(output_path, 'w')
56
+ print(answer_en)
57
+ txt_file.write(answer_en)
58
+ txt_file.close()
59
+
60
+
qwen2_audio_baseline/qwen_audio_test.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Driver for the qwen2-audio baseline: repeatedly stage unanswered questions,
# query the model for each, collect the answers, then score everything.
for i in {1..20}
do
  # Copy every wav/json pair that has no answer .txt yet into the staging dir.
  python ./wav_json_no_response.py
  staging_dir="./dev_data_rest"
  for json_file in "$staging_dir"/*.json; do
    # Skip the literal glob pattern when the directory matched nothing.
    if [ -f "$json_file" ]; then
      python3 ./qwen_audio_test.py "$json_file"
    fi
  done
  # Harvest this round's answers, then clear staging for the next pass.
  cp -r ./dev_output_rest/* ./dev_output/
  rm -f ./dev_output_rest/*
  rm -f ./dev_data_rest/*
done

# Score all collected answers against the ground truth.
python ./sentence_sim.py
qwen2_audio_baseline/requirements.txt ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.7.4.post0
2
+ aliyun-python-sdk-core==2.16.0
3
+ aliyun-python-sdk-kms==2.16.5
4
+ annotated-types==0.7.0
5
+ anyio==4.6.2.post1
6
+ asttokens==2.4.1
7
+ async-timeout==3.0.1
8
+ attrs==23.2.0
9
+ audioread==3.0.1
10
+ Brotli==1.1.0
11
+ certifi==2023.7.22
12
+ cffi==1.15.1
13
+ cftime==1.6.3
14
+ chardet==4.0.0
15
+ charset-normalizer==3.2.0
16
+ click==8.1.7
17
+ colorama==0.4.6
18
+ comm==0.2.2
19
+ conda-pack==0.7.1
20
+ contourpy==1.1.1
21
+ coverage==7.5.0
22
+ crcmod==1.7
23
+ cryptography==44.0.2
24
+ cycler==0.12.1
25
+ Cython==3.0.10
26
+ dashscope==1.22.0
27
+ decorator==5.1.1
28
+ einops==0.7.0
29
+ et_xmlfile==2.0.0
30
+ exceptiongroup==1.2.0
31
+ executing==2.0.1
32
+ fastapi==0.115.5
33
+ filelock==3.14.0
34
+ flake8==7.0.0
35
+ fonttools==4.43.1
36
+ fsspec==2024.3.1
37
+ future==1.0.0
38
+ fvcore==0.1.5.post20221221
39
+ h11==0.14.0
40
+ h5py==3.9.0
41
+ hear21passt==0.0.26
42
+ httpcore==1.0.7
43
+ httptools==0.6.4
44
+ httpx==0.28.1
45
+ huggingface-hub==0.29.3
46
+ idna==3.4
47
+ imageio==2.34.1
48
+ importlib-resources==6.1.0
49
+ inflate64==1.0.0
50
+ iniconfig==2.0.0
51
+ iopath==0.1.10
52
+ ipython==8.18.1
53
+ ipywidgets==8.1.2
54
+ jams==0.3.4
55
+ jedi==0.19.1
56
+ Jinja2==3.1.3
57
+ jmespath==0.10.0
58
+ joblib==1.3.2
59
+ jsonschema==4.21.1
60
+ jsonschema-specifications==2023.12.1
61
+ jupyterlab_widgets==3.0.10
62
+ kiwisolver==1.4.5
63
+ lazy_loader==0.3
64
+ librosa==0.10.1
65
+ llvmlite==0.41.1
66
+ lmdb==1.4.1
67
+ lxml==5.1.0
68
+ markdown-it-py==3.0.0
69
+ MarkupSafe==2.1.5
70
+ mat73==0.62
71
+ matplotlib==3.8.0
72
+ matplotlib-inline==0.1.6
73
+ mccabe==0.7.0
74
+ mdurl==0.1.2
75
+ mir-eval==0.7
76
+ modelscope==1.24.0
77
+ mpmath==1.3.0
78
+ msgpack==1.0.7
79
+ multidict==6.1.0
80
+ multivolumefile==0.2.3
81
+ mypy==1.10.0
82
+ mypy-extensions==1.0.0
83
+ netCDF4==1.6.5
84
+ networkx==3.2.1
85
+ numba==0.58.1
86
+ numpy==1.24.3
87
+ nvidia-cublas-cu12==12.4.5.8
88
+ nvidia-cuda-cupti-cu12==12.1.105
89
+ nvidia-cuda-nvrtc-cu12==12.1.105
90
+ nvidia-cuda-runtime-cu12==12.1.105
91
+ nvidia-cudnn-cu12==9.1.0.70
92
+ nvidia-cufft-cu12==11.0.2.54
93
+ nvidia-curand-cu12==10.3.2.106
94
+ nvidia-cusolver-cu12==11.6.1.9
95
+ nvidia-cusparse-cu12==12.1.0.106
96
+ nvidia-cusparselt-cu12==0.6.2
97
+ nvidia-ml-py3==7.352.0
98
+ nvidia-nccl-cu12==2.18.1
99
+ nvidia-nvjitlink-cu12==12.4.127
100
+ nvidia-nvtx-cu12==12.1.105
101
+ opencv-python==4.8.1.78
102
+ openpyxl==3.1.5
103
+ openxlab==0.1.2
104
+ oss2==2.17.0
105
+ packaging==24.2
106
+ pandas==2.2.1
107
+ parso==0.8.3
108
+ pexpect==4.9.0
109
+ Pillow==9.4.0
110
+ platformdirs==3.10.0
111
+ pluggy==1.5.0
112
+ pooch==1.7.0
113
+ portalocker==2.8.2
114
+ prompt-toolkit==3.0.43
115
+ propcache==0.2.1
116
+ protobuf==4.24.4
117
+ psutil==5.9.8
118
+ ptyprocess==0.7.0
119
+ pure-eval==0.2.2
120
+ py7zr==0.21.0
121
+ pybcj==1.0.2
122
+ pybind11==2.12.0
123
+ pycodestyle==2.11.1
124
+ pycparser==2.21
125
+ pycryptodome==3.21.0
126
+ pycryptodomex==3.20.0
127
+ pydantic==2.10.2
128
+ pydantic_core==2.27.1
129
+ pydub==0.25.1
130
+ pyflakes==3.2.0
131
+ Pygments==2.17.2
132
+ pyparsing==3.1.1
133
+ pyppmd==1.1.0
134
+ pyproject==1.3.1
135
+ pyroomacoustics==0.7.4
136
+ PySocks==1.7.1
137
+ pysofaconventions==0.1.5
138
+ pytest==8.2.0
139
+ python-dateutil==2.8.2
140
+ python-dotenv==1.0.1
141
+ python-multipart==0.0.17
142
+ python-telegram-bot==21.10
143
+ pytz==2023.4
144
+ PyYAML==6.0.1
145
+ pyzstd==0.15.10
146
+ referencing==0.35.0
147
+ regex==2023.12.25
148
+ requests==2.28.2
149
+ rich==13.4.2
150
+ rpds-py==0.18.0
151
+ sacrebleu==2.4.0
152
+ safetensors==0.5.3
153
+ scikit-learn==1.3.2
154
+ scipy==1.11.2
155
+ seaborn==0.13.2
156
+ sentence-transformers==3.4.1
157
+ sentencepiece==0.1.99
158
+ simpleaudio==1.0.4
159
+ six==1.16.0
160
+ sniffio==1.3.1
161
+ some-package==0.1
162
+ sortedcontainers==2.4.0
163
+ soundfile==0.12.1
164
+ soxr==0.3.7
165
+ spatialscaper==0.1.4
166
+ stack-data==0.6.3
167
+ starlette==0.41.3
168
+ sympy==1.13.1
169
+ tabulate==0.9.0
170
+ termcolor==2.4.0
171
+ texttable==1.7.0
172
+ thop==0.1.1.post2209072238
173
+ threadpoolctl==3.2.0
174
+ timm==0.9.16
175
+ tokenizers==0.21.1
176
+ tomli==2.0.1
177
+ torch==2.1.0
178
+ torchaudio==0.13.0
179
+ torchinfo==1.8.0
180
+ torchvision==0.14.0
181
+ tornado==5.1
182
+ tqdm==4.65.2
183
+ traitlets==5.14.2
184
+ transformers==4.50.3
185
+ triton==2.1.0
186
+ typing_extensions==4.12.2
187
+ tzdata==2024.1
188
+ urllib3==1.26.20
189
+ uvicorn==0.32.1
190
+ uvloop==0.21.0
191
+ watchfiles==1.0.0
192
+ wcwidth==0.2.13
193
+ webrtcvad==2.0.10
194
+ websocket-client==1.8.0
195
+ websockets==14.1
196
+ widgetsnbextension==4.0.10
197
+ ws4py==0.3.2
198
+ yacs==0.1.8
199
+ yarl==1.18.3
200
+ zipp==3.17.0
qwen2_audio_baseline/sentence_sim.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from transformers import AutoTokenizer, AutoModel
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from transformers import AutoTokenizer, AutoModel
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
def encode_number_to_char(number):
    """Translate a choice index into its answer letter.

    0 -> 'A', 1 -> 'B', 2 -> 'C', 3 -> 'D'; anything else -> None.
    """
    for index, letter in enumerate('ABCD'):
        if index == number:
            return letter
    return None
13
def mean_pooling(model_output, attention_mask):
    """Mean-pool token embeddings, counting only non-padding positions.

    model_output: tuple/ModelOutput whose first element holds the token
        embeddings, shape (batch, seq_len, hidden).
    attention_mask: (batch, seq_len) mask -- 1 for real tokens, 0 for padding.
    Returns sentence embeddings of shape (batch, hidden).
    """
    embeddings = model_output[0]
    # Broadcast the mask over the hidden dimension so padded tokens
    # contribute nothing to either the sum or the count.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = (embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against all-pad rows
    return summed / counts
18
+
19
+
20
# --- Configuration -----------------------------------------------------------
txt_dir = './dev_output'            # model answers, one .txt per question
json_dir = './dev_data'             # ground-truth QA records
show_detail_path = './show_detail'  # per-question human-readable reports
os.makedirs(show_detail_path, exist_ok=True)  # don't crash on a fresh checkout

# Join predictions (.txt) with their ground truth (.json) by file stem.
txt_files = {os.path.splitext(file)[0]: os.path.join(txt_dir, file)
             for file in os.listdir(txt_dir) if file.endswith('.txt')}
json_files = {os.path.splitext(file)[0]: os.path.join(json_dir, file)
              for file in os.listdir(json_dir) if file.endswith('.json')}

# NOTE(review): hard-coded local checkpoint path -- parameterize if shared.
tokenizer = AutoTokenizer.from_pretrained('/disk6/hyhong/QWen/Sbert_pretrain')
model = AutoModel.from_pretrained('/disk6/hyhong/QWen/Sbert_pretrain')

# File names embed a subset tag. Reported parts aggregate the subsets:
# PART 1 = -d-, PART 2 = -a- + -b- + -c-, PART 3 = -e-.
SUBSET_TAGS = ('-a-', '-b-', '-c-', '-d-', '-e-')
subset_total = {tag: 0 for tag in SUBSET_TAGS}
subset_correct = {tag: 0 for tag in SUBSET_TAGS}
question_num = 0
correct_num = 0

for txt_id, txt_path in txt_files.items():
    if txt_id not in json_files:
        continue
    with open(json_files[txt_id], 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    question_num += 1
    for tag in SUBSET_TAGS:
        if tag in txt_path:
            subset_total[tag] += 1

    # BUG FIX: `choices` was only assigned inside the tag branches, so a file
    # matching no tag crashed with NameError. Choices are stored as
    # "A. text"; [3:] strips the letter prefix before similarity scoring.
    choices = [choice[3:] for choice in json_data.get("choice", [])]

    with open(txt_path, 'r', encoding='utf-8') as txt_file:
        txt_sentence = txt_file.read().strip()

    question = json_data["question"]
    answer = json_data["answer"][0]

    detail_path = os.path.join(show_detail_path, os.path.basename(txt_path))
    with open(detail_path, 'w') as detail_txt:
        detail_txt.write(json_data["audio_url"] + '\n')
        detail_txt.write("Question:" + question + '\n')
        detail_txt.write("Choice:" + '\n')
        # BUG FIX: the old len==2/3/4 branches wrote only the first and last
        # choices (see the A/D-only show_detail files); write them all.
        for choice_line in json_data["choice"]:
            detail_txt.write(choice_line + '\n')
        detail_txt.write("Correct answer:" + answer + '\n')

        # Pick the choice whose SBERT embedding is most similar to the
        # model's free-text response; both sides get the question prefixed
        # for context.
        score = []
        for choice_text in choices:
            pair = [question + " Answer:" + txt_sentence,
                    question + " Answer:" + choice_text]
            encoded_input = tokenizer(pair, padding=True, truncation=True,
                                      return_tensors='pt')
            with torch.no_grad():
                model_output = model(**encoded_input)
            sentence_embeddings = mean_pooling(model_output,
                                               encoded_input['attention_mask'])
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
            # Dot product of L2-normalized vectors == cosine similarity.
            score.append(torch.dot(sentence_embeddings[0, :],
                                   sentence_embeddings[1, :]))

        answer_qwen = encode_number_to_char(score.index(max(score)))
        detail_txt.write("Model respond:" + txt_sentence + '\n')
        # encode_number_to_char returns None for indexes > 3; write a
        # placeholder instead of crashing on str concatenation.
        detail_txt.write("Model answer:" + (answer_qwen or '?'))

    if answer_qwen == answer:
        correct_num += 1
        for tag in SUBSET_TAGS:
            if tag in txt_path:
                subset_correct[tag] += 1


def _rate(correct, total):
    """Accuracy with a 0.0 fallback so empty subsets don't divide by zero."""
    return correct / total if total else 0.0


correct_rate_all = _rate(correct_num, question_num)
correct_rate_part1 = _rate(subset_correct['-d-'], subset_total['-d-'])
correct_rate_part2 = _rate(subset_correct['-a-'] + subset_correct['-b-'] + subset_correct['-c-'],
                           subset_total['-a-'] + subset_total['-b-'] + subset_total['-c-'])
correct_rate_part3 = _rate(subset_correct['-e-'], subset_total['-e-'])

print('Overall accuracy rate:')
print(correct_rate_all)

print('PART 1 accuracy rate:')
print(correct_rate_part1)
print('PART 2 accuracy rate:')
print(correct_rate_part2)
print('PART 3 accuracy rate:')
print(correct_rate_part3)
157
+
158
+
159
+
qwen2_audio_baseline/show_detail/fold-e-audio-00001.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./dev/audio_00001.wav
2
+ Question:What background sound enhances the calm atmosphere?
3
+ Choice:
4
+ A. Wind blowing
5
+
6
+
7
+ D. Chirping birds
8
+ Correct answer:B
9
+ Model respond:Raindrops.
10
+ Model answer:B
qwen2_audio_baseline/show_detail/fold-e-audio-00002.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./dev/audio_00002.wav
2
+ Question:What type of musical element is highlighted despite background noise?
3
+ Choice:
4
+ A. Soft violin harmony
5
+
6
+
7
+ D. Groovy bassline
8
+ Correct answer:D
9
+ Model respond:Despite background noise, the type of musical element that is highlighted is a slow piano melody.
10
+ Model answer:B
qwen2_audio_baseline/show_detail/fold2-a-0022.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-a-0022.wav
2
+ Question:How long does the keys jangling sound last in the audio clip?
3
+ Choice:
4
+ A. 1.2s
5
+
6
+
7
+ D. 2.4s
8
+ Correct answer:B
9
+ Model respond:The keys jangling sound lasts approximately from 9.54 seconds to 10.00 seconds, making it last for about 0.46 seconds.
10
+ Model answer:B
qwen2_audio_baseline/show_detail/fold2-a-0023.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-a-0023.wav
2
+ Question:How many times does the printer sound occur in the audio clip?
3
+ Choice:
4
+ A. 1
5
+
6
+
7
+ D. 4
8
+ Correct answer:B
9
+ Model respond:The printer sound occurs 12 times in the audio clip.
10
+ Model answer:A
qwen2_audio_baseline/show_detail/fold2-b-0098.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-b-0098.wav
2
+ Question:How many different sounds are present in the audio clip?
3
+ Choice:
4
+ A. 3
5
+
6
+
7
+ D. 2
8
+ Correct answer:B
9
+ Model respond:There are four distinct sounds present in the audio clip: a telephone ringtone at intervals (0.53,2.84) and (6.70,9.25), continuous background noise throughout (0.53,10.00), generic impact sounds at various points such as (3.08,3.23), (3.39,3.70), (3.89,4.05), (4.43,4.55), (4.71,4.82), (4.96,5.10), (5.24,5.50), (5.65,5.84), (5.99,6.13), (6.30,6.40), (6.53,6.66), (6.83,6.95), (7.11,7.21), (7.34,7.52), (7.65,7.75), (7.90,8.02), (8.15,8.27), (8.43,8.54), (8.70,8.83), (8.96,9.10), (9.28,9.40), (9.54,9.66), (9.82,9.95), and surface contact noises from (8.07,8.46).
10
+ Model answer:B
qwen2_audio_baseline/show_detail/fold2-b-0099.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-b-0099.wav
2
+ Question:What is the first occurring sound in the audio?
3
+ Choice:
4
+ A. Laughter
5
+
6
+
7
+ D. Keys dropping
8
+ Correct answer:C
9
+ Model respond:The first occurrence of an ambiguous sourceless sound effect is between 0.42 and 0.97 seconds.
10
+ Model answer:C
qwen2_audio_baseline/show_detail/fold2-c-0074.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-c-0074.wav
2
+ Question:At what time does the second occurrence of the scream sound start?
3
+ Choice:
4
+ A. 5.5s
5
+
6
+
7
+ D. 9.0s
8
+ Correct answer:B
9
+ Model respond:The second scream starts at 3.71 seconds.
10
+ Model answer:A
qwen2_audio_baseline/show_detail/fold2-c-0075.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ./fold2-c-0075.wav
2
+ Question:What is the duration of the first scream sound in the audio?
3
+ Choice:
4
+ A. 2.2 seconds
5
+
6
+
7
+ D. 4.9 seconds
8
+ Correct answer:C
9
+ Model respond:The first scream sound in the audio lasts from 0.65 seconds to 2.48 seconds, making its duration approximately 1.83 seconds.
10
+ Model answer:C
qwen2_audio_baseline/show_detail/fold2-d-66002A01.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ../../local_audio_path/66002A01.wav
2
+ Question:Based on the acoustic characteristics of the sound recording, which of the following best describes the signal features observed in the spectrogram?
3
+ Choice:
4
+ A. A continuous high-frequency tone above 10 kHz with no significant modulation.
5
+
6
+
7
+ D. Broad, unmodulated noise spread evenly across all frequencies.
8
+ Correct answer:B
9
+ Model respond:The audio contains speech with corresponding timestamps (0.00,2.38) and background noise throughout the duration (0.00,10.00). There's also a moment of silence between 2.38 and 10.00.
10
+ Model answer:A
qwen2_audio_baseline/show_detail/fold2-d-66002A04.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ../../local_audio_path/66002A04.wav
2
+ Question:Based on the sound recording, which of the following most accurately describes the acoustic characteristics of the signal?
3
+ Choice:
4
+ A. A continuous low-frequency signal below 500 Hz with distinct horizontal patterns indicating harmonics.
5
+
6
+
7
+ D. Broad noise evenly distributed across all frequencies without noticeable harmonic structure.
8
+ Correct answer:A
9
+ Model respond:The audio contains sounds that resemble digestive stomach rumbles and background noise suggesting an indoor setting. There's also an instance of a man speaking in the foreground.
10
+ Model answer:B
qwen2_audio_baseline/wav_json_no_response.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import shutil

# Stage every wav/json pair that does not yet have a model answer (.txt)
# into the "rest" directory so the driver script can retry just those.
txt_dir = './dev_output'        # answers already produced
wav_json_dir = './dev_data'     # full source dataset (wav + json)
output_dir = './dev_data_rest'  # staging area for unanswered items

# BUG FIX: the staging directory was never created, so shutil.copy2 failed
# on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Set (not list) of answered stems: O(1) membership per file instead of O(n).
answered = {os.path.splitext(name)[0]
            for name in os.listdir(txt_dir) if name.endswith('.txt')}

for file in os.listdir(wav_json_dir):
    if not file.endswith(('.wav', '.json')):
        continue
    if os.path.splitext(file)[0] in answered:
        continue
    source_file_path = os.path.join(wav_json_dir, file)
    destination_file_path = os.path.join(output_dir, file)
    # copy2 preserves file metadata (timestamps) along with contents.
    shutil.copy2(source_file_path, destination_file_path)
    print(f'Copied: {file}')
print('Operation completed.')
17
+
18
+