import os
import json
import csv
import yaml
from collections import defaultdict
import pickle
import glob
import math
from functools import partial
import sys
import io
import warnings
import random
import numpy as np
import torch
import laion_clap
import librosa
from pydub import AudioSegment
import soundfile as sf
import faiss
import multiprocessing

multiprocessing.set_start_method('spawn', force=True)
try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x  # fallback: identity, no progress bar
def suppress_all_output(func):
    def wrapper(*args, **kwargs):
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = io.StringIO()
        sys.stderr = io.StringIO()
        old_fd_out = os.dup(1)
        old_fd_err = os.dup(2)
        null_fd = os.open(os.devnull, os.O_RDWR)
        os.dup2(null_fd, 1)
        os.dup2(null_fd, 2)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                result = func(*args, **kwargs)
            finally:
                os.dup2(old_fd_out, 1)
                os.dup2(old_fd_err, 2)
                os.close(null_fd)
                os.close(old_fd_out)
                os.close(old_fd_err)
                sys.stdout = old_stdout
                sys.stderr = old_stderr
        return result
    return wrapper
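
# Minimal usage sketch (hypothetical function): the decorator silences both the
# Python-level streams and the OS-level file descriptors, which is what catches
# prints coming from C extensions such as audio codecs.
#
#   @suppress_all_output
#   def load_noisy_model():
#       ...  # chatty third-party initialization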
def filter_file(file_path, file_list, filename):
    if file_list is not None:
        if filename not in file_list:
            print(filename, 'does not exist')
            return True
    else:
        if not os.path.exists(os.path.join(file_path, filename)):
            print(filename, 'does not exist')
            return True
    if os.path.getsize(os.path.join(file_path, filename)) < 16000:
        print(filename, 'is under 16 kB (~0.5 to 1 second of audio)')
        return True
    return False
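
# filter_file returns True when a sample should be skipped. When file_list is
# None, existence is checked directly on disk; passing a precomputed listing
# avoids repeated filesystem checks inside the large loops below.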
# ==================== Prepare dataset files from each data folder ====================
EMOTION_MAP_DICT = {
    'amused': 'amused',
    'anger': 'angry', 'angry': 'angry',
    'anxious': 'anxious',
    'apologetic': 'apologetic',
    'assertive': 'assertive',
    'calm': 'calm',
    'concerned': 'concerned',
    'contempt': 'contempt',
    'disgust': 'disgusted', 'disgusted': 'disgusted',
    'encouraging': 'encouraging',
    'excited': 'excited',
    'fear': 'fearful', 'fearful': 'fearful',
    'frustated': 'frustrated',  # key keeps the dataset's misspelling; value corrected
    'happy': 'happy', 'joy': 'happy',
    'neutral': 'neutral',
    'sad': 'sad', 'sadness': 'sad',
    'sleepy': 'sleepy',
    'surprise': 'surprised', 'surprised': 'surprised',
    'pleasantly surprised': 'pleasantly surprised',
}

def load_dataset_file(dataset_file):
    with open(dataset_file) as f:
        contents = f.read()
    contents = json.loads(contents)
    audio_files = [
        os.path.join(
            contents["dataset_path"],
            contents["split_path"],
            contents["data"][str(i)]["name"]
        ) for i in range(contents["total_num"])
    ]
    return contents, audio_files
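
# Expected manifest layout (written by prepare_files below; values hypothetical):
# {
#   "dataset_path": "/path/to/dataset",
#   "split_path": "train_wav",
#   "total_num": 2,
#   "data": {"0": {"name": "a.wav", "prompt": "...", "output": "..."},
#            "1": {"name": "b.wav", "prompt": "...", "output": "..."}}
# }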

def compute_label_graph(dataset_name, dataset_path, top_n, output_file):
    if os.path.exists(output_file):
        print('loading precomputed graph:', output_file)
        with open(output_file, 'r') as json_file:
            graph = json.load(json_file)
    else:
        import torch
        from sentence_transformers import SentenceTransformer, util
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print('precomputing graph and saving to:', output_file)
        if dataset_name == 'AudioSetSL_singlelabel':
            names = []
            with open(os.path.join(dataset_path, 'class_labels_indices.csv'), newline='') as csvfile:
                reader = csv.reader(csvfile, delimiter=',', quotechar='"')
                next(reader)
                for row in reader:
                    _, label, name = row  # e.g. 123, /m/02zsn, "Female speech, woman speaking"
                    names += name.split(', ')
            names = [x.lower().strip() for x in names]
        elif dataset_name == "Clotho-AQA_singlelabel":
            names = set()
            with open(os.path.join(dataset_path, 'clotho_aqa_metadata.csv'), newline='') as csvfile:
                reader = csv.reader(csvfile, delimiter=',', quotechar='"')
                next(reader)
                for row in tqdm(reader):
                    _, file_name, keywords, _, _, _, _ = row
                    names |= set(keywords.split(';'))
            names = [x.lower().strip() for x in names]
        names_embeddings = embedding_model.encode(names, convert_to_tensor=True)
        similarity_matrix = util.pytorch_cos_sim(names_embeddings, names_embeddings)
        similarity_threshold = 0.75
        n_items = len(names)
        graph = {}
        for i in range(n_items):
            adjusted_top_n = min(top_n, n_items - 1)
            values, indices = torch.topk(similarity_matrix[i], adjusted_top_n + 1, largest=True)
            most_similar_items = []
            for value, idx in zip(values, indices):
                # skip the label itself and near-duplicates above the similarity threshold
                if idx != i and value <= similarity_threshold:
                    most_similar_items.append(idx.item())
                if len(most_similar_items) == adjusted_top_n:
                    break
            graph[names[i]] = [names[j] for j in most_similar_items]
        with open(output_file, 'w') as json_file:
            json.dump(graph, json_file)
    # graph is a dict: key = each label, value = list of up to top_n similar labels
    return graph
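
# Usage sketch (hypothetical labels): graph['dog barking'] might map to a list of
# semantically close labels such as ['puppy whimpering', 'animal growl', ...],
# which the *_singlelabel branches below sample as negative multiple-choice options.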

def prepare_files(dataset_name, dataset_path, split, flamingo_task, output_file):
    assert not os.path.exists(output_file)
    dataset_dic = {
        "dataset_path": dataset_path,
        "split": split,
        "split_path": None,
        "flamingo_task": "{}-{}".format(dataset_name, flamingo_task),
        "total_num": 0,
        "data": {}  # {id: {'name': name, 'prompt': prompt, 'output': output}}
    }
    if dataset_name == "AudioSet":
        assert flamingo_task == "EventClassification"
        assert split == 'train'
        map_split = lambda split: 'train_wav' if split == 'train' else ''
        file_path = os.path.join(
            dataset_path,
            map_split(split)
        )
        assert os.path.exists(file_path), '{} does not exist'.format(file_path)
        dataset_dic["split_path"] = map_split(split)
        file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path)))
        dic = defaultdict(str)
        with open(os.path.join(dataset_path, 'class_labels_indices.csv'), newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)
            for row in tqdm(reader):
                _, label, name = row  # /m/02zsn,"Female speech, woman speaking"
                dic[label] = name
        with open(os.path.join(dataset_path, 'train.csv'), newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)
            for row in tqdm(reader):
                filename, _, _, labels = row  # --aE2O5G5WE /m/03fwl,/m/04rlf,/m/09x0r
                filename = filename + '.wav'
                if filter_file(file_path, file_list, filename):
                    continue
                label_list = labels.split(",")
                assert all(label in dic for label in label_list)
                text_output = ", ".join([dic[label] for label in label_list])
                if len(text_output) <= 1:
                    continue
                text_prompt = 'this is a sound of'
                dataset_dic["data"][dataset_dic["total_num"]] = {
                    "name": filename,
                    "prompt": text_prompt,
                    "output": text_output.replace('\n', ' ')
                }
                dataset_dic["total_num"] += 1
| elif dataset_name == "AudioSetFull": | |
| assert flamingo_task == "EventClassification" | |
| assert split == 'train' | |
| map_split = lambda split: '/mnt/fsx-main/rafaelvalle/datasets/audioset/unbalanced_train_segments/22khz' | |
| file_path = map_split(split) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| dic_code2label = defaultdict(str) | |
| with open(os.path.join(dataset_path, 'audioset-processing/data/class_labels_indices.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| _, code, name = row # /m/02zsn,"Female speech, woman speaking" | |
| dic_code2label[code] = name | |
| dic_filename2code = {} | |
| with open(os.path.join(dataset_path, 'audioset-processing/data/unbalanced_train_segments.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| next(reader) | |
| for row in tqdm(reader): | |
| filename, _, _, codes = row # --aE2O5G5WE /m/03fwl,/m/04rlf,/m/09x0r | |
| filename = 'Y' + filename + '.wav' | |
| dic_filename2code[filename] = codes.split(",") | |
| for part in tqdm(range(41)): | |
| part_str = str(part) | |
| if len(part_str) == 1: | |
| part_str = '0' + part_str | |
| part_folder = 'unbalanced_train_segments_part{}'.format(part_str) | |
| for filename in os.listdir(os.path.join(file_path, part_folder)): | |
| if not filename.endswith('.wav'): | |
| continue | |
| if filter_file(file_path, file_list, os.path.join(part_folder, filename)): | |
| continue | |
| if filename not in dic_filename2code: | |
| continue | |
| text_output = ", ".join([dic_code2label[code] for code in dic_filename2code[filename] if code in dic_code2label]) | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this is a sound of' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": os.path.join(part_folder, filename), | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "AudioSetFullwoAudioMusicCaps": | |
| assert flamingo_task == "EventClassification" | |
| assert split == 'train' | |
| map_split = lambda split: '/mnt/fsx-main/rafaelvalle/datasets/audioset/unbalanced_train_segments/22khz' | |
| file_path = map_split(split) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| print('extracting AudioCaps and MusicCaps ytid to avoid these samples') | |
| audiocaps_ytid = [] | |
| for f in ['audiocaps_dataset/train.csv', 'audiocaps_dataset/test.csv', 'audiocaps_dataset/val.csv']: | |
| with open(os.path.join(dataset_path, f), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in reader: | |
| _, ytid, _, _ = row | |
| audiocaps_ytid.append('Y' + ytid + '.wav') | |
| audiocaps_ytid = set(audiocaps_ytid) | |
| musiccaps_ytid = [] | |
| with open(os.path.join(dataset_path, 'musiccaps_dataset/musiccaps_manifest.json')) as f: | |
| data = f.read() | |
| musiccaps_list = json.loads(data) | |
| for row in musiccaps_list: | |
| musiccaps_ytid.append('Y' + row["ytid"] + '.wav') | |
| musiccaps_ytid = set(musiccaps_ytid) | |
| print('Will exclude {} samples from MusicCaps and {} from AudioCaps'.format(len(audiocaps_ytid), len(musiccaps_ytid))) | |
| dic_code2label = defaultdict(str) | |
| with open(os.path.join(dataset_path, '../AudioSetFull/audioset-processing/data/class_labels_indices.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| _, code, name = row # /m/02zsn,"Female speech, woman speaking" | |
| dic_code2label[code] = name | |
| dic_filename2code = {} | |
| with open(os.path.join(dataset_path, '../AudioSetFull/audioset-processing/data/unbalanced_train_segments.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| next(reader) | |
| for row in tqdm(reader): | |
| filename, _, _, codes = row # --aE2O5G5WE /m/03fwl,/m/04rlf,/m/09x0r | |
| filename = 'Y' + filename + '.wav' | |
| dic_filename2code[filename] = codes.split(",") | |
| music_audio_caps_excluded = 0 | |
| for part in tqdm(range(41)): | |
| part_str = str(part) | |
| if len(part_str) == 1: | |
| part_str = '0' + part_str | |
| part_folder = 'unbalanced_train_segments_part{}'.format(part_str) | |
| for filename in os.listdir(os.path.join(file_path, part_folder)): | |
| if not filename.endswith('.wav'): | |
| continue | |
| if filename in audiocaps_ytid or filename in musiccaps_ytid: | |
| music_audio_caps_excluded += 1 | |
| continue | |
| if filter_file(file_path, file_list, os.path.join(part_folder, filename)): | |
| continue | |
| if filename not in dic_filename2code: | |
| continue | |
| text_output = ", ".join([dic_code2label[code] for code in dic_filename2code[filename] if code in dic_code2label]) | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this is a sound of' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": os.path.join(part_folder, filename), | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "AudioSetSL_singlelabel": | |
| import numpy as np | |
| assert flamingo_task == "EventClassification" | |
| assert split == 'train' | |
| map_split = lambda split: '../AudioSet/train_wav' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| dic = defaultdict(str) | |
| with open(os.path.join(dataset_path, 'class_labels_indices.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| _, label, name = row # /m/02zsn,"Female speech, woman speaking" | |
| dic[label] = name | |
| graph = compute_label_graph( | |
| dataset_name, | |
| dataset_path, | |
| top_n=200, | |
| output_file=os.path.join(dataset_path, 'label_graph.json') | |
| ) | |
| with open(os.path.join(dataset_path, 'train.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| filename, _, _, labels = row # --aE2O5G5WE /m/03fwl,/m/04rlf,/m/09x0r | |
| filename = filename + '.wav' | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| label_list = labels.split(",") | |
| assert all(label in dic for label in label_list) | |
| text_labels = ", ".join([dic[label] for label in label_list]).lower() | |
| text_labels = text_labels.split(', ') | |
| text_output = np.random.choice(text_labels) | |
| if len(text_output) <= 1: | |
| continue | |
| num_options = np.random.choice( | |
| [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], | |
| p=[ 0.05, 0.1, 0.1, 0.1, 0.1, | |
| 0.05, 0.05, 0.05, 0.1, 0.05, | |
| 0.05, 0.1, 0.05, 0.05] | |
| ) | |
| negative_samples = [x for x in graph[text_output] if x not in set(text_labels)] | |
| candidate_negative_labels = list(np.random.choice( | |
| negative_samples[:num_options*10], | |
| size=num_options-1, | |
| replace=False | |
| )) | |
| if type(candidate_negative_labels) is str: | |
| candidate_negative_labels = [candidate_negative_labels] | |
| all_options = [text_output] + candidate_negative_labels | |
| np.random.shuffle(all_options) | |
| text_prompt = 'Classify this sound.\nOPTIONS:\n - {}.'.format( | |
| '.\n - '.join(all_options) | |
| ) | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "AUDIOCAPS13k": | |
| assert flamingo_task == 'AudioCaptioning' | |
| map_split = lambda split: 'audio_32000Hz/{}'.format(split) | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.flac'), os.listdir(file_path))) | |
| with open(os.path.join( | |
| dataset_path, | |
| '{}_manifest.json'.format(split + ('_v2' if split == 'train' else '')) | |
| ), 'r') as f: | |
| data = f.readlines() | |
| data = [json.loads(row) for row in data] | |
| for row in tqdm(data): | |
| filename = row['audio_filepath'].split('/')[-1] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = row['text'] | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate audio caption' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "audiocaps": | |
| assert flamingo_task == 'AudioCaptioning' | |
| map_split = lambda split: 'audio/{}'.format(split if split in ['train', 'test'] else 'valid') | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.flac'), os.listdir(file_path))) | |
| for filename in tqdm(file_list): | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| with open(os.path.join(file_path, filename.replace('.flac', '.json')), 'r') as f: | |
| data = json.load(f) | |
| captions = data['text'] | |
| for text_output in captions: | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate audio caption' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
    elif dataset_name == 'BG-Gun-Sound-Dataset':
        assert flamingo_task == "SoundClassification"
        assert split in ["train", "test"]
        map_split = lambda split: 'data/gun_sound_v2'
        file_path = os.path.join(
            dataset_path,
            map_split(split)
        )
        assert os.path.exists(file_path), '{} does not exist'.format(file_path)
        dataset_dic["split_path"] = map_split(split)
        file_list = os.listdir(file_path)
        all_cates = set()
        with open(os.path.join(dataset_path, 'data/v3_exp3_{}.csv'.format(split)), newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)
            for row in tqdm(reader):
                filename, cate, dist, dire = row
                if filter_file(file_path, file_list, filename):
                    continue
                text_output = cate
                if len(text_output) <= 1:
                    continue
                text_prompt = 'What is the gun type of this sound?'
                all_cates.add(cate)
                dataset_dic["data"][dataset_dic["total_num"]] = {
                    "name": filename,
                    "prompt": text_prompt,
                    "output": text_output.replace('\n', ' ')
                }
                dataset_dic["total_num"] += 1
        print(all_cates)
| elif dataset_name == "BirdsDataset": | |
| assert flamingo_task == "SoundClassification" | |
| assert split == 'train' | |
| map_split = lambda split: 'Voice_of_Birds' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| for bird_type in tqdm(os.listdir(file_path)): | |
| bird_name = ' '.join(bird_type.split('_')[:-1]) | |
| for filename in os.listdir(os.path.join(file_path, bird_type)): | |
| if filter_file(file_path, file_list, os.path.join(bird_type, filename)): | |
| continue | |
| text_output = bird_name | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'What is the name of bird in this sound?' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": os.path.join(bird_type, filename), | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "BBCSoundEffects": | |
| assert split in ['train'] | |
| assert flamingo_task == 'AudioDescription' | |
| map_split = lambda split: '../WavCaps/BBC_Sound_Effects_flac' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.flac'), os.listdir(file_path))) | |
| with open(os.path.join(dataset_path, 'BBCSoundDownloader/BBCSoundEffects.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| if len(row) != 7: | |
| continue | |
| filename, description, _, _, _, _, _ = row | |
| filename = filename.replace('.wav', '.flac') | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = description | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate audio description' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "chime-home": | |
| assert flamingo_task == "EventClassification" | |
| assert split == 'train' | |
| map_split = lambda split: 'chime_home/chunks' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file48k_list = list(filter(lambda x: x.endswith('48kHz.wav'), os.listdir(file_path))) | |
| file16k_list = list(filter(lambda x: x.endswith('16kHz.wav'), os.listdir(file_path))) | |
| csv_file_list = list(filter(lambda x: x.endswith('.csv'), os.listdir(file_path))) | |
| label_mapping = { | |
| 'c': 'child speaking', | |
| 'm': 'male speaking', | |
| 'f': 'female speaking', | |
| 'p': 'human activity', | |
| 't': 'television', | |
| 'b': 'household appliances', | |
| 's': 'silence' | |
| } | |
| for csv_file in tqdm(csv_file_list): | |
| with open(os.path.join(file_path, csv_file), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| labels = None | |
| for row in reader: | |
| if row[0] == 'majorityvote': | |
| labels = row[1] | |
| break | |
| if labels is None or len(labels) == 0: | |
| continue | |
| filename = csv_file.replace('.csv', '.48kHz.wav') | |
| if filter_file(file_path, file48k_list, filename): | |
| filename = csv_file.replace('.csv', '.16kHz.wav') | |
| if filter_file(file_path, file16k_list, filename): | |
| continue | |
| text_output = ", ".join([label_mapping[l] for l in labels if l in label_mapping]) | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this is a sound of' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "CLAP_freesound": | |
| assert flamingo_task == "AudioCaptioning" | |
| assert split in ["train", "test"] | |
| map_split = lambda split: os.path.join('freesound_no_overlap/split', split) | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.flac'), os.listdir(file_path))) | |
| with open(os.path.join( | |
| dataset_path, | |
| 'freesound_no_overlap_meta.csv' | |
| ), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| if len(row[0].split('/')) != 2: | |
| continue | |
| if len(row) <= 1: | |
| continue | |
| file_split, filename = row[0].split('/') | |
| if file_split != split: | |
| continue | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| caption_1 = row[1] # caption_2 = row[2] but not very good | |
| text_output = caption_1 | |
| if len(text_output) <= 2: | |
| continue | |
| text_prompt = 'generate audio caption' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "Clotho-AQA": | |
| map_split = lambda split: 'audio_files' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| if flamingo_task == "EventClassification": | |
| dic = defaultdict(str) | |
| with open(os.path.join(dataset_path, 'clotho_aqa_metadata.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| _, file_name, keywords, _, _, _, _ = row | |
| dic[file_name] = keywords.replace(';', ', ') | |
| with open(os.path.join(dataset_path, 'clotho_aqa_{}.csv'.format(split)), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| filename = row[0] | |
| if filename not in dic or filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = dic[filename] | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this is a sound of' | |
| del dic[filename] | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif flamingo_task == "AQA": | |
| dic_qa = defaultdict(list) | |
| with open(os.path.join(dataset_path, 'clotho_aqa_{}.csv'.format(split)), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| filename, question, answer, confidence = row | |
| dic_qa[(filename, question)].append((answer.lower(), confidence.lower())) | |
| # get binary -> trinary | |
| def preprocess(list_ans_conf): | |
| assert set([x[1] for x in list_ans_conf]) <= set(['yes', 'no', 'maybe']) | |
| answers = set([x[0].lower() for x in list_ans_conf]) | |
| if answers <= set(['yes', 'no']): | |
| if len(answers) > 1: | |
| return ['unsure'] | |
| else: | |
| return list(answers) | |
| else: | |
| return list(answers) | |
| # get majority vote | |
| def majority_vote(list_ans_conf): | |
| assert set([x[1] for x in list_ans_conf]) <= set(['yes', 'no', 'maybe']) | |
| weight = {'yes': 1.0, 'no': 0.1, 'maybe': 0.6} | |
| if set([x[0] for x in list_ans_conf]) <= set(['yes', 'no']): | |
| score = {'yes': 1.0, 'no': -1.0} | |
| pred = sum([score[x[0]] * weight[x[1]] for x in list_ans_conf]) | |
| if pred > 0: | |
| return ['yes'] | |
| else: | |
| return ['no'] | |
| else: | |
| return list(set([x[0] for x in list_ans_conf])) | |
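
            # e.g. majority_vote([('yes', 'yes'), ('no', 'maybe')]):
            #   1.0 * 1.0 + (-1.0) * 0.6 = 0.4 > 0  ->  ['yes']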
            for key in dic_qa:
                filename, question = key
                if filter_file(file_path, file_list, filename):
                    continue
                if split == 'train':
                    answers = majority_vote(dic_qa[key])  # majority vote
                else:
                    answers = [x[0].strip().lower() for x in dic_qa[key]]
                    answers = [', '.join(answers)]
                for answer in answers:
                    text_output = answer
                    if len(text_output) <= 1:
                        continue
                    text_prompt = "Question: " + question
                    dataset_dic["data"][dataset_dic["total_num"]] = {
                        "name": filename,
                        "prompt": text_prompt,
                        "output": text_output.replace('\n', ' ')
                    }
                    dataset_dic["total_num"] += 1
| elif dataset_name == "Clotho-AQA_singlelabel": | |
| import numpy as np | |
| assert flamingo_task == "EventClassification" | |
| map_split = lambda split: '../Clotho-AQA/audio_files' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| dic = defaultdict(str) | |
| with open(os.path.join(dataset_path, 'clotho_aqa_metadata.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| _, file_name, keywords, _, _, _, _ = row | |
| dic[file_name] = keywords.split(';') | |
| graph = compute_label_graph( | |
| dataset_name, | |
| dataset_path, | |
| top_n=300, | |
| output_file=os.path.join(dataset_path, 'label_graph.json') | |
| ) | |
| with open(os.path.join(dataset_path, 'clotho_aqa_{}.csv'.format(split)), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| filename = row[0] | |
| if filename not in dic or filter_file(file_path, file_list, filename): | |
| continue | |
| text_labels = [x.lower().strip() for x in dic[filename]] | |
| del dic[filename] | |
| for _ in range(6): | |
| text_output = np.random.choice(text_labels) | |
| if len(text_output) <= 1: | |
| continue | |
| num_options = np.random.choice( | |
| [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], | |
| p=[ 0.05, 0.1, 0.1, 0.1, 0.1, | |
| 0.05, 0.05, 0.05, 0.1, 0.05, | |
| 0.05, 0.1, 0.05, 0.05] | |
| ) | |
| negative_samples = [x for x in graph[text_output] if x not in set(text_labels)] | |
| candidate_negative_labels = list(np.random.choice( | |
| negative_samples[:num_options*20], | |
| size=num_options-1, | |
| replace=False | |
| )) | |
| if type(candidate_negative_labels) is str: | |
| candidate_negative_labels = [candidate_negative_labels] | |
| all_options = [text_output] + candidate_negative_labels | |
| np.random.shuffle(all_options) | |
| text_prompt = 'Classify this sound.\nOPTIONS:\n - {}.'.format( | |
| '.\n - '.join(all_options) | |
| ) | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "Clotho-v2": | |
| assert flamingo_task == "AudioCaptioning" | |
| assert split in ["train", "val", "test"] | |
| map_split = lambda split: 'development' if split == 'train' else ('validation' if split == "val" else "evaluation") | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| with open(os.path.join( | |
| dataset_path, | |
| 'clotho_captions_{}.csv'.format(map_split(split)) | |
| ), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| filename = row[0] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| for text_output in row[1:]: | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate audio caption' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "CochlScene": | |
| import ndjson | |
| assert flamingo_task == "SceneClassification" | |
| map_split = lambda split: split.capitalize() | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| with open(os.path.join(dataset_path, 'cochlscene_{}.ndjson'.format(split))) as ndjsonfile: | |
| reader = ndjson.load(ndjsonfile) | |
| for row in tqdm(reader): | |
| filename = "/".join(row["audiopath"].split("/")[1:]) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = row["labels"].lower() | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this acoustic scene is' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "common-accent": | |
| import ndjson | |
| import re | |
| assert flamingo_task == "AccentClassification" | |
| assert split in ["train", "test"] | |
| map_split = lambda split: '22khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = os.listdir(file_path) | |
| all_accent = [] | |
| split_file = [f for f in os.listdir(dataset_path) if f.startswith(split) and f.endswith('.ndjson')][0] | |
| with open(os.path.join(dataset_path, split_file)) as ndjsonfile: | |
| reader = ndjson.load(ndjsonfile) | |
| for row in tqdm(reader): | |
| accent = row["accent"] | |
| accent = re.sub(r'\(.*?\)', '', accent) | |
| accent = accent.replace('English', '') | |
| accent = accent.split(',') | |
| accent = [x.strip() for x in accent if 'school' not in x] | |
| all_accent += accent | |
| filename = row["filename"] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| for accent_each in accent: | |
| if accent_each == 'Javanese': | |
| accent_each = 'Japanese' | |
| if len(accent_each) > 25: | |
| continue | |
| text_output = accent_each | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'Classify the accent of this speech.' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| print('all accents:', list(set(all_accent))) | |
| elif dataset_name == "CREMA-D": | |
| assert flamingo_task == "EmotionClassification" | |
| assert split in ["train"] | |
| map_split = lambda split: 'AudioWAV' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| split_file = os.path.join( | |
| dataset_path, | |
| 'crema-d_audiopath_text_sid_emotion_filelist.txt' | |
| ) | |
| with open(split_file, 'r') as f: | |
| data = f.readlines() | |
| data = [x.replace('\n', '') for x in data] | |
| for row in tqdm(data): | |
| if row.count('|') != 3: | |
| continue | |
| filename, utterances, speaker, emotion = row.split('|') | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = emotion | |
| text_prompt = 'this emotion is' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "DCASE17Task4": | |
| assert flamingo_task == "SceneClassification" | |
| assert split in ["test"] | |
| map_split = lambda split: 'unbalanced_train_segments_testing_set_audio_formatted_and_segmented_downloads' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| split_file = os.path.join( | |
| dataset_path, | |
| 'Task-4-Large-scale-weakly-supervised-sound-event-detection-for-smart-cars', | |
| 'groundtruth_release', | |
| 'groundtruth_strong_label_testing_set.csv' | |
| ) | |
| dic = defaultdict(list) | |
| all_labels = [] | |
| with open(split_file, newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter='\t', quotechar='"') | |
| for row in tqdm(reader): | |
| filename = 'Y' + row[0] | |
| label = row[-1] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| dic[filename] += label.split(', ') | |
| all_labels += label.split(', ') | |
| print('all labels:\n', ', '.join(list(set(all_labels)))) | |
| for filename in dic: | |
| text_output = ', '.join(list(set(dic[filename]))) | |
| text_prompt = 'this acoustic scene is' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "emov-db": | |
| assert flamingo_task == "EmotionClassification" | |
| assert split in ["train", "val"] | |
| map_split = lambda split: '22khz_from_16khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| split_file = os.path.join( | |
| dataset_path, | |
| 'cleaned_emov_db_audiopath_text_sid_emotion_duration_filelist_merged_{}.txt'.format(split) | |
| ) | |
| with open(split_file, 'r') as f: | |
| data = f.readlines() | |
| data = [x.replace('\n', '') for x in data] | |
| for row in tqdm(data): | |
| if row.count('|') != 4: | |
| continue | |
| filename, utterances, speaker, emotion, duration = row.split('|') | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = emotion | |
| text_output = EMOTION_MAP_DICT[text_output] | |
| text_prompt = 'this emotion is' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "Epidemic_sound": | |
| assert split == 'train' | |
| assert flamingo_task in ["AudioCaptioning", "Tagging"] | |
| map_split = lambda split: 'audio' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.mp3'), os.listdir(file_path))) | |
| with open(os.path.join(dataset_path, 'Epidemic_all_debiased.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| if len(row) != 5: | |
| continue | |
| _, caption_1, caption_2, caption_t5, fileid = row | |
| filename = '{}.mp3'.format(fileid) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| if flamingo_task == "AudioCaptioning": | |
| text_output = caption_t5 | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate audio caption' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif flamingo_task == "Tagging": | |
| if not caption_2.startswith('the sounds of'): | |
| continue | |
| caption_2 = caption_2.replace('the sounds of ', '') | |
| caption_2 = caption_2.replace(', and', ',') | |
| if len(caption_2) < 2: | |
| continue | |
| tags = caption_2.split(', ') | |
| tags = list(map(lambda x: x.replace("'", "").strip().lower(), tags)) | |
| text_output = '{}'.format(', '.join(tags)) | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate tags' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "ESC50": | |
| assert flamingo_task in ["EventClassification"] | |
| assert split == 'train' | |
| map_split = lambda split: 'ESC-50-master/audio' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| with open(os.path.join(dataset_path, 'ESC-50-master/meta/esc50.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| if len(row) != 7: | |
| continue | |
| filename, fold, target, category, esc10, src_file, take = row | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = category.replace('_', ' ') | |
| text_prompt = 'classify this sound.' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "FMA": | |
| import ast | |
| assert flamingo_task in ["GenreClassification"] | |
| assert split == 'train' | |
| map_split = lambda split: 'fma_large' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| with open(os.path.join(dataset_path, 'fma_metadata/raw_tracks.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| if len(row) != 39: | |
| continue | |
| track_id,album_id,album_title,album_url, \ | |
| artist_id,artist_name,artist_url,artist_website, \ | |
| license_image_file,license_image_file_large, \ | |
| license_parent_id,license_title,license_url, \ | |
| tags,track_bit_rate,track_comments,track_composer, \ | |
| track_copyright_c,track_copyright_p,track_date_created,track_date_recorded, \ | |
| track_disc_number,track_duration,track_explicit,track_explicit_notes, \ | |
| track_favorites,track_file,track_genres,track_image_file,track_information, \ | |
| track_instrumental,track_interest,track_language_code, \ | |
| track_listens,track_lyricist,track_number,track_publisher,track_title,track_url = row | |
| l = len(str(track_id)) | |
| if l <= 3: | |
| filename = '{}/{}.mp3'.format( | |
| '000', | |
| '0'*(6-l)+str(track_id) | |
| ) | |
| else: | |
| filename = '{}/{}.mp3'.format( | |
| '0'*(6-l)+str(track_id)[:l-3], | |
| '0'*(6-l)+str(track_id) | |
| ) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| if len(track_genres) == 0: | |
| continue | |
| track_genres = ast.literal_eval(track_genres) | |
| genres = ', '.join([dic['genre_title'].lower().strip() for dic in track_genres]) | |
| text_output = genres + '.' | |
| text_prompt = "what is the genre of this music?" | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "FSD50k": | |
| import ndjson | |
| assert flamingo_task == "EventClassification" | |
| assert split in ["train", "test"] | |
| map_split = lambda split: '44khz/dev' if split == 'train' else '44khz/eval' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| with open(os.path.join(dataset_path, '{}.ndjson'.format(map_split(split).replace('44khz/', '')))) as ndjsonfile: | |
| reader = ndjson.load(ndjsonfile) | |
| for row in tqdm(reader): | |
| filename = row["filepath"].split("/")[1] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| labels = [x.replace("_", " ").lower() for x in row["labels"]] | |
| text_output = ", ".join(labels) | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this is a sound of' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "GTZAN": | |
| assert flamingo_task == "GenreClassification" | |
| assert split in ["train"] | |
| map_split = lambda split: 'gtzan/data/genres' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| for genre in os.listdir(file_path): | |
| genre_wavs = [x for x in os.listdir(os.path.join(file_path, genre)) if x.endswith('.wav')] | |
| for genre_wav in genre_wavs: | |
| filename = os.path.join(genre, genre_wav) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = genre | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'What is the genre of this music?' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "IEMOCAP": | |
| assert flamingo_task == "EmotionClassification" | |
| assert split in ["train", "test"] | |
| map_split = lambda split: 'IEMOCAP_full_release/16khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| def read_this_ndjson(file_path): | |
| dic_list = [] | |
| with open(file_path, 'r') as f: | |
| for line in f: | |
| turn_name = line.split("'turn_name': ")[-1].split(',')[0].replace("'", "") | |
| emotion = line.split("'emotion': ")[-1].split(',')[0].replace("'", "") | |
| dic = { | |
| 'turn_name': turn_name, | |
| 'emotion': emotion | |
| } | |
| dic_list.append(dic) | |
| return dic_list | |
| all_emotions = [] | |
| meta_files = [x for x in os.listdir(os.path.join(dataset_path, 'IEMOCAP_full_release/ndjson')) if x.endswith('.ndjson')] | |
| for meta_file in tqdm(meta_files): | |
| main_folder = meta_file.split('_')[0] | |
| sub_folder = (meta_file.split('.ndjson')[0])[len(main_folder)+1:] | |
| if split == "train" and main_folder == "Session5": | |
| continue | |
| elif split == "test" and main_folder != "Session5": | |
| continue | |
| metadata_list = read_this_ndjson(os.path.join(dataset_path, 'IEMOCAP_full_release/ndjson', meta_file)) | |
| for dic in metadata_list: | |
| filename = os.path.join(main_folder, sub_folder, dic['turn_name']+'.wav') | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| if dic['emotion'] in ['unknown', 'other']: | |
| continue | |
| text_output = dic['emotion'] | |
| text_output = EMOTION_MAP_DICT[text_output] | |
| all_emotions.append(text_output) | |
| text_prompt = 'this emotion is' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| print('all emotions:', list(set(all_emotions))) | |
| elif dataset_name == "jl-corpus": | |
| assert flamingo_task == "EmotionClassification" | |
| assert split in ["train", "val"] | |
| map_split = lambda split: '44khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| split_file = os.path.join( | |
| dataset_path, | |
| 'jl-corpus_audiopath_text_sid_emotion_duration_{}_filelist.txt'.format(split) | |
| ) | |
| with open(split_file, 'r') as f: | |
| data = f.readlines() | |
| data = [x.replace('\n', '') for x in data] | |
| for row in tqdm(data): | |
| if row.count('|') != 4: | |
| continue | |
| filename, utterances, speaker, emotion, duration = row.split('|') | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = emotion | |
| text_output = EMOTION_MAP_DICT[text_output] | |
| text_prompt = 'this emotion is' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "LP-MusicCaps-MC": | |
| import pandas as pd | |
| assert flamingo_task in ["AudioCaptioning"] | |
| assert split in ["train", "test"] | |
| map_split = lambda split: '../MusicCaps/44khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| parquet_files = [f for f in os.listdir(os.path.join(dataset_path, 'data')) if f.endswith('.parquet') and f.startswith(split)] | |
| print('parquet_files', parquet_files) | |
| metadata_df = pd.concat([pd.read_parquet(os.path.join(dataset_path, 'data', f)) for f in parquet_files]) | |
| for index, row in tqdm(metadata_df.iterrows()): | |
| filename = row['ytid'] + '.wav' | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_prompt = 'generate audio caption' | |
| for caption in [row['caption_writing'], row['caption_summary'], row['caption_paraphrase']]: | |
| text_output = caption | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "LP-MusicCaps-MSD": | |
| import pandas as pd | |
| assert flamingo_task in ["AudioCaptioning"] | |
| assert split in ["train", "test", "val"] | |
| map_split = lambda split: '../MSD/mp3s_22khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| parquet_files = [f for f in os.listdir(dataset_path) if f.endswith('.parquet') and f.startswith(split)] | |
| print('parquet_files', parquet_files) | |
| metadata_df = pd.concat([pd.read_parquet(os.path.join(dataset_path, f)) for f in parquet_files]) | |
| for index, row in tqdm(metadata_df.iterrows()): | |
| filename = row['path'] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_prompt = 'generate audio caption' | |
| for caption in [row['caption_writing'], row['caption_summary'], row['caption_paraphrase']]: | |
| text_output = caption | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "LP-MusicCaps-MTT": | |
| import pandas as pd | |
| assert flamingo_task in ["AudioCaptioning"] | |
| assert split in ["train", "test", "val"] | |
| map_split = lambda split: '../MagnaTagATune/16khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| parquet_files = [f for f in os.listdir(dataset_path) if f.endswith('.parquet') and f.startswith(split)] | |
| print('parquet_files', parquet_files) | |
| metadata_df = pd.concat([pd.read_parquet(os.path.join(dataset_path, f)) for f in parquet_files]) | |
| for index, row in tqdm(metadata_df.iterrows()): | |
| filename = row['path'] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_prompt = 'generate audio caption' | |
| for caption in [row['caption_writing'], row['caption_summary'], row['caption_paraphrase']]: | |
| text_output = caption | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "MACS": | |
| assert flamingo_task in ["AudioCaptioning", "Tagging"] | |
| assert split == 'train' | |
| map_split = lambda split: 'TAU_Urban_Acoustic_Scenes_2019/TAU-urban-acoustic-scenes-2019-development/audio' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| metadata_list = yaml.load(open(os.path.join(dataset_path, 'MACS.yaml')), Loader=yaml.FullLoader)['files'] | |
| for file_metadata in tqdm(metadata_list): | |
| filename = file_metadata['filename'] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| for each_annotated in file_metadata['annotations']: | |
| caption = each_annotated['sentence'] | |
| tags = ', '.join(each_annotated['tags']).replace('_', ' ') | |
| if flamingo_task == "AudioCaptioning": | |
| text_output = caption | |
| text_prompt = 'generate audio caption' | |
| elif flamingo_task == "Tagging": | |
| raise NotImplementedError | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "Medley-solos-DB": | |
| import ndjson | |
| assert flamingo_task in ["InstrClassification"] | |
| map_split = lambda split: '44khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| with open(os.path.join(dataset_path, 'medleysolosdb_manifest.ndjson')) as ndjsonfile: | |
| metadata_list = ndjson.load(ndjsonfile) | |
| for file_metadata in tqdm(metadata_list): | |
| subset = file_metadata['subset'] | |
| if not subset.startswith(split): | |
| continue | |
| filename = file_metadata['filepath'] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| instrument = file_metadata["instrument"] | |
| text_output = instrument | |
| text_prompt = 'this music note is produced by' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "MELD": | |
| import numpy as np | |
| assert flamingo_task in ["EmotionClassification", "SentimentClassification"] | |
| map_split = lambda split: '44khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| split_file = os.path.join( | |
| dataset_path, | |
| '{}.txt'.format(split if split in ['train', 'test'] else 'dev') | |
| ) | |
| with open(split_file, 'r') as f: | |
| data = f.readlines() | |
| data = [x.replace('\n', '') for x in data] | |
| emotion_count = { | |
| 'neutral': 4703, 'happy': 1739, 'sad': 683, 'surprised': 1204, | |
| 'disgusted': 271, 'angry': 1108, 'fearful': 268, | |
| } | |
| sentiment_count = { | |
| 'neutral': 4703, 'positive': 2330, 'negative': 2943, | |
| } | |
| balancing_factor = 1 | |
| for row in tqdm(data): | |
| if row.count('|') != 4: | |
| continue | |
| filename, utterances, speaker, emotion, sentiment = row.split('|') | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| if flamingo_task == "EmotionClassification": | |
| text_output = emotion | |
| text_output = EMOTION_MAP_DICT[text_output] | |
| text_prompt = 'this emotion is' | |
| if split == 'train': | |
| balancing_factor = float(emotion_count['neutral']) / float(emotion_count[text_output]) | |
| elif flamingo_task == "SentimentClassification": | |
| text_output = sentiment | |
| text_prompt = 'this sentiment is' | |
| if split == 'train': | |
| balancing_factor = float(sentiment_count['neutral']) / float(sentiment_count[text_output]) | |
| if len(text_output) <= 1: | |
| continue | |
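| # oversample minority classes: emit floor(balancing_factor) copies, plus one | |
| # extra copy with probability equal to the fractional part, so the expected | |
| # per-class count matches the 'neutral' count | |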
| for _ in range(int(np.floor(balancing_factor))): | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| if np.random.rand() < balancing_factor - np.floor(balancing_factor): | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "MSP-PODCAST-Publish-1.9": | |
| assert flamingo_task == "EmotionClassification" | |
| assert split in ["train", "val", "test"] | |
| map_split = lambda split: 'Audio' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = glob.glob('{}/*/*.wav'.format(file_path)) | |
| file_list = [x[len(file_path)+1:] for x in file_list] | |
| subfolder_map = {} | |
| for f in tqdm(file_list): | |
| subfolder, filename = f.split('/') | |
| subfolder_map[filename] = subfolder | |
| file_list = None | |
| emotion_dic = { | |
| 'A': 'Angry', | |
| 'S': 'Sad', | |
| 'H': 'Happy', | |
| 'U': 'Surprise', | |
| 'F': 'Fear', | |
| 'D': 'Disgust', | |
| 'C': 'Contempt', | |
| 'N': 'Neutral', | |
| 'O': 'Other', | |
| 'X': 'Not clear' | |
| } | |
| with open(os.path.join(dataset_path, 'Labels/labels_concensus.json')) as f: | |
| data = f.read() | |
| metadata_dic = json.loads(data) | |
| for filename in tqdm(list(metadata_dic.keys())): | |
| values = metadata_dic[filename] | |
| if not values["Split_Set"].lower().startswith(split): | |
| continue | |
| if values["EmoClass"] in ["O", "X"] or values["EmoClass"] not in emotion_dic.keys(): | |
| continue | |
| subfolder = subfolder_map[filename] | |
| filename = '{}/{}'.format(subfolder, filename) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = emotion_dic[values["EmoClass"]].lower() | |
| text_output = EMOTION_MAP_DICT[text_output] | |
| text_prompt = 'this emotion is' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "mtg-jamendo": | |
| import ndjson | |
| assert flamingo_task == "MusicTagging" | |
| assert split in ["train", "val"] | |
| map_split = lambda split: '44khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| with open(os.path.join(dataset_path, 'mtg_jamendo_{}_manifest.ndjson'.format(split))) as ndjsonfile: | |
| reader = ndjson.load(ndjsonfile) | |
| for row in tqdm(reader): | |
| filename = row["audiopath"] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = row["caption"] | |
| text_prompt = 'generate music tags (genre, instrument, mood/theme)' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "MU-LLAMA": | |
| assert flamingo_task in ['AQA'] | |
| assert split in ['train', 'test'] | |
| map_split = lambda split: 'MusicQA/audios' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| split_file = 'MusicQA/FinetuneMusicQA.json' if split == 'train' else 'MusicQA/EvalMusicQA.json' | |
| with open(os.path.join(dataset_path, split_file), 'r') as f: | |
| data = f.read() | |
| metadata_list = json.loads(data) | |
| for dic in tqdm(metadata_list): | |
| filename = dic["audio_name"] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_prompt = 'Question: ' + dic["conversation"][0]["value"].strip() | |
| if not (text_prompt.endswith('.') or text_prompt.endswith('?')): | |
| text_prompt = text_prompt + '.' | |
| text_output = dic["conversation"][1]["value"].strip() | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "musdbhq": | |
| assert flamingo_task in ["InstrClassification"] | |
| assert split in ["train", "test", "val"] | |
| map_split = lambda split: './' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| with open(os.path.join(dataset_path, 'file_list_44k_{}.txt'.format(split))) as f: | |
| data = f.readlines() | |
| data = [x.replace('\n', '') for x in data] | |
| for row in tqdm(data): | |
| if row.count('|') != 1: | |
| continue | |
| filename, duration = row.split('|') | |
| duration = float(duration) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = filename.split('/')[-1].split('.wav')[0] | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this music is produced by' | |
| segment_length = 10 | |
| for audio_start_idx in range(int(duration // segment_length)): | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' '), | |
| "audio_start": audio_start_idx * segment_length | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "Music-AVQA": | |
| import ast | |
| import re | |
| assert flamingo_task in [ | |
| "{}_{}".format(q, t) \ | |
| for q in ['AQA', 'AVQA'] \ | |
| for t in ['Comparative', 'Counting', 'Existential', 'Location', 'Temporal', 'All'] | |
| ] | |
| def replace_bracketed_words(input_string, replacements): | |
| def replacer(match): | |
| word = next(replacements) | |
| return word | |
| replacements = iter(replacements) | |
| output_string = re.sub(r'<[^>]*>', replacer, input_string) | |
| return output_string | |
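| # e.g. replace_bracketed_words('How many <Object> are in the <Place>?', ['violins', 'video']) | |
| # returns 'How many violins are in the video?' (each <...> token is replaced in order) | |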
| map_split = lambda split: 'audio' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| with open(os.path.join(dataset_path, 'MUSIC-AVQA/data/json/avqa-{}.json'.format(split)), 'r') as f: | |
| data = f.read() | |
| metadata_list = json.loads(data) | |
| for dic in tqdm(metadata_list): | |
| filename = dic["video_id"] + '.wav' | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| types = ast.literal_eval(dic["type"]) | |
| if 'Visual' in types: | |
| continue | |
| if flamingo_task.startswith('AQA_') and 'Audio-Visual' in types: | |
| continue | |
| if flamingo_task.startswith('AVQA_') and 'Audio' in types: | |
| continue | |
| t = flamingo_task.split('_')[1] | |
| if t != 'All' and t not in types: | |
| continue | |
| text_output = dic["anser"] | |
| if len(text_output) <= 1: | |
| continue | |
| question = dic["question_content"].replace("\uff1f", '?') | |
| templ_values = ast.literal_eval(dic["templ_values"]) | |
| if len(templ_values) > 0: | |
| question = replace_bracketed_words(question, templ_values) | |
| text_prompt = "Question: " + question | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "MusicCaps": | |
| assert flamingo_task in ["AudioCaptioning", "EventClassification"] | |
| assert split in ["train", "test"] | |
| map_split = lambda split: '44khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| with open(os.path.join(dataset_path, 'musiccaps_manifest.json')) as f: | |
| data = f.read() | |
| metadata_list = json.loads(data) | |
| for file_metadata in tqdm(metadata_list): | |
| filename = file_metadata['filepath'] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| start_s, end_s = file_metadata["start_s"], file_metadata["end_s"] | |
| caption = file_metadata["caption"] | |
| audioset_positive_labels = file_metadata["audioset_positive_labels"] # audioset classes | |
| aspect_list = file_metadata["aspect_list"] # annotated classes | |
| if (split == 'train') == file_metadata["is_audioset_eval"]: # train uses non-eval clips; test uses audioset-eval clips | |
| continue | |
| if flamingo_task == "AudioCaptioning": | |
| text_output = caption | |
| text_prompt = 'generate audio caption' | |
| elif flamingo_task == "EventClassification": | |
| raise NotImplementedError | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "NonSpeech7k": | |
| assert flamingo_task in ["EventClassification"] | |
| assert split in ["train", "test"] | |
| map_split = lambda split: split | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| all_classes = [] | |
| with open(os.path.join(dataset_path, 'metadata of {} set.csv'.format(split)), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| filename, _, _, _, classname, _, _, _ = row | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = classname.lower() | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this is a sound of' | |
| all_classes.append(classname) | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| print('all classes:', list(set(all_classes))) | |
| elif dataset_name == "NSynth": | |
| import ndjson | |
| assert flamingo_task in [ | |
| "InstrClassification", | |
| "PitchClassification", | |
| "VelocityClassification", | |
| "SourceClassification", | |
| "QualityClassification", | |
| "MIR" | |
| ] | |
| assert split in ["train", "test", "val"] | |
| map_split = lambda split: 'nsynth-{}/audio'.format('valid' if split == 'val' else split) | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| with open(os.path.join(dataset_path, map_split(split), '../examples.json')) as f: | |
| data = f.read() | |
| reader = json.loads(data) | |
| for key in tqdm(reader): | |
| filename = key + '.wav' | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| if flamingo_task == "InstrClassification": | |
| text_output = reader[key]["instrument_family_str"] | |
| text_prompt = 'this music note is produced by' | |
| elif flamingo_task == "PitchClassification": | |
| text_output = str(reader[key]["pitch"]) | |
| text_prompt = 'this music note has pitch' | |
| elif flamingo_task == "VelocityClassification": | |
| text_output = str(reader[key]["velocity"]) | |
| text_prompt = 'this music note has velocity' | |
| elif flamingo_task == "SourceClassification": | |
| text_output = reader[key]["instrument_source_str"] | |
| text_prompt = 'this music note has sonic source' | |
| elif flamingo_task == "QualityClassification": | |
| qualities_str = reader[key]["qualities_str"] | |
| if len(qualities_str) >= 1: | |
| text_output = ', '.join(qualities_str).replace('_', ' ') | |
| else: | |
| text_output = 'none' | |
| text_prompt = 'this music note has sonic qualities' | |
| elif flamingo_task == "MIR": | |
| instrument = reader[key]["instrument_family_str"] | |
| pitch = str(reader[key]["pitch"]) | |
| velocity = str(reader[key]["velocity"]) | |
| source = reader[key]["instrument_source_str"] | |
| qualities_str = ', '.join(reader[key]["qualities_str"]).replace('_', ' ') | |
| assert len(instrument) > 0 | |
| text_output = 'produced by {}'.format(instrument) | |
| if len(pitch) > 0: | |
| text_output = text_output + ', pitch {}'.format(pitch) | |
| if len(velocity) > 0: | |
| text_output = text_output + ', velocity {}'.format(velocity) | |
| if len(source) > 0: | |
| text_output = text_output + ', source {}'.format(source) | |
| if len(qualities_str) > 0: | |
| text_output = text_output + ', and having qualities like {}'.format(qualities_str) | |
| text_prompt = 'this music note is' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "OMGEmotion": | |
| import numpy as np | |
| import webrtcvad | |
| import wave | |
| from pydub import AudioSegment | |
| assert flamingo_task == "EmotionClassification" | |
| assert split in ["train", "val"] | |
| def convert_to_wav(file_path): | |
| audio = AudioSegment.from_file(file_path).set_frame_rate(16000).set_channels(1) | |
| wav_path = file_path.rsplit('.', 1)[0] + "_converted.wav" | |
| audio.export(wav_path, format="wav") | |
| return wav_path | |
| def contains_speech(file_path, aggressiveness=0): | |
| # aggressiveness is an integer from 0 to 3: 0 is the least aggressive about filtering out non-speech, 3 the most aggressive | |
| wav_path = convert_to_wav(file_path) | |
| vad = webrtcvad.Vad(aggressiveness) | |
| with wave.open(wav_path, 'rb') as audio: | |
| assert audio.getsampwidth() == 2, "Audio must be 16-bit" | |
| assert audio.getnchannels() == 1, "Audio must be mono" | |
| assert audio.getframerate() == 16000, "Audio must be sampled at 16kHz" | |
| frame_duration = 10 # ms | |
| frame_size = int(audio.getframerate() * frame_duration / 1000) | |
| num_frames = int(audio.getnframes() / frame_size) | |
| for _ in range(num_frames): | |
| frame = audio.readframes(frame_size) | |
| if vad.is_speech(frame, audio.getframerate()): | |
| return True | |
| return False | |
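| # note: webrtcvad only accepts 10/20/30 ms frames of 16-bit mono PCM at | |
| # 8/16/32/48 kHz; the 16 kHz mono conversion and 10 ms frames above satisfy this | |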
| map_split = lambda split: 'processed-{}_utterance_data'.format(split) | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| dic_code2emotion = { | |
| "0": "anger", | |
| "1": "disgust", | |
| "2": "fear", | |
| "3": "happy", | |
| "4": "neutral", | |
| "5": "sad", | |
| "6": "surprise", | |
| } | |
| all_emotions = [] | |
| meta_file = os.path.join( | |
| dataset_path, | |
| 'OMGEmotionChallenge', | |
| 'omg_{}Videos.csv'.format('Train' if split == 'train' else 'Validation') | |
| ) | |
| with open(meta_file, newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| link, start, end, video, utterance, _, _, EmotionMaxVote = row | |
| emotion = dic_code2emotion[str(EmotionMaxVote)] | |
| filename = os.path.join(video, utterance.replace('.mp4', '.mp3')) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| if not contains_speech(os.path.join(file_path, filename)): | |
| print('{} does not contain speech'.format(filename)) | |
| continue | |
| text_prompt = 'this emotion is' | |
| text_output = emotion | |
| if len(text_output) <= 1: | |
| continue | |
| all_emotions.append(EMOTION_MAP_DICT[emotion]) | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| print('all emotions:', list(set(all_emotions))) | |
| elif dataset_name == "OpenAQA": | |
| assert flamingo_task == 'AQA' | |
| assert split == 'train' | |
| map_split = lambda split: './' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
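| # answers containing any of these phrases are refusals / non-answers and are skipped below | |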
| no_word_list = [ | |
| 'cannot determine', 'not provided', 'cannot be determined', 'sorry', 'i cannot', | |
| 'without more information', 'enough information', | |
| 'not possible', 'more context', 'enough', 'impossible', 'cannot be determined', | |
| 'without additional information', | |
| 'unclear', 'cannot', 'not clear', 'do not provide sufficient', 'does not provide', | |
| 'difficult to determine', 'no information provided', | |
| "can't infer", "difficult to infer", "not specified", "no specific", "no information", | |
| "without additional", 'it is difficult to', "no indication" | |
| ] | |
| print('computing dic_audiosetfull_parts') | |
| audiosetfull_root = '/mnt/fsx-main/rafaelvalle/datasets/audioset/unbalanced_train_segments/22khz/' | |
| part_strings = [str(p).zfill(2) for p in range(41)] # '00' through '40' | |
| dic_audiosetfull_parts = { | |
| part: set(os.listdir(os.path.join(audiosetfull_root, 'unbalanced_train_segments_part{}'.format(part)))) \ | |
| for part in part_strings | |
| } | |
| audioset20k_filelist = set(os.listdir(os.path.join(file_path, '../AudioSet/train_wav'))) | |
| print('computing dic_clotho_filename') | |
| clotho_files = os.listdir(os.path.join(dataset_path, '../Clotho-AQA/audio_files')) | |
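| # keys replace whitespace runs with single underscores so that OpenAQA's | |
| # underscored audio ids can be matched back to the on-disk Clotho filenames | |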
| dic_clotho_filename = { | |
| '_'.join([s for s in f.split(' ') if len(s) > 0]): f \ | |
| for f in clotho_files | |
| } | |
| print('reading open_ended/all_open_qa.json') | |
| with open(os.path.join(dataset_path, 'openaqa/data/open_ended/all_open_qa.json'), 'r') as f: | |
| data = f.read() | |
| metadata_list = json.loads(data) | |
| for dic in tqdm(metadata_list): | |
| # keys: instruction, input, dataset, audio_id, output, task | |
| text_output = dic["output"] | |
| if len(text_output) <= 1: | |
| continue | |
| if any(word in text_output.lower() for word in no_word_list): | |
| continue | |
| question = dic["instruction"] | |
| text_prompt = question | |
| audio_id = dic["audio_id"] | |
| subset = dic["dataset"] | |
| if subset == 'clotho_development': | |
| filename = audio_id.split('/')[-1] | |
| processed_filename = '_'.join([s for s in filename.split('_') if len(s) > 0]) | |
| if processed_filename in dic_clotho_filename: | |
| filename = os.path.join( | |
| '../Clotho-AQA/audio_files', | |
| dic_clotho_filename[processed_filename] | |
| ) | |
| else: | |
| continue | |
| elif subset in ['audiocaps_train', 'as_20k', 'as_strong_train']: | |
| found = False | |
| filename = audio_id.split('/')[-1].split('.flac')[0] + '.wav' | |
| if filename in audioset20k_filelist: | |
| filename = os.path.join('../AudioSet/train_wav', filename) | |
| found = True | |
| else: | |
| filename = 'Y' + filename | |
| for part in part_strings: | |
| if filename in dic_audiosetfull_parts[part]: | |
| filename = os.path.join( | |
| audiosetfull_root, | |
| 'unbalanced_train_segments_part{}'.format(part), | |
| filename | |
| ) | |
| found = True | |
| break | |
| if not found: | |
| print(filename, 'not found') | |
| continue | |
| elif subset == 'freesound_10s': | |
| filename = os.path.join( | |
| '../CLAP_freesound/freesound_no_overlap/split/train', | |
| audio_id.split('/')[-1] | |
| ) | |
| elif subset == 'vggsound_train': | |
| continue | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "ravdess": | |
| assert flamingo_task == "EmotionClassification" | |
| assert split in ["train", "val"] | |
| map_split = lambda split: '44khz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| split_file = os.path.join( | |
| dataset_path, | |
| 'ravdess_audiopath_text_sid_emotion_duration_{}_filelist.txt'.format(split) | |
| ) | |
| with open(split_file, 'r') as f: | |
| data = f.readlines() | |
| data = [x.replace('\n', '') for x in data] | |
| for row in tqdm(data): | |
| if row.count('|') != 4: | |
| continue | |
| filename, utterances, speaker, emotion, duration = row.split('|') | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = emotion | |
| text_prompt = 'this emotion is' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "SongDescriber": | |
| assert flamingo_task in ["AudioCaptioning"] | |
| assert split in ["train"] | |
| map_split = lambda split: './audio/audio' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| with open(os.path.join(dataset_path, 'song_describer.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| caption_id, track_id, caption, is_valid_subset, familiarity, artist_id, album_id, path, duration = row | |
| filename = '{}/{}.2min.mp3'.format(track_id[-2:], track_id) | |
| duration = float(duration) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = caption | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate audio caption' | |
| segment_length = 30 | |
| for audio_start_idx in range(int(duration // segment_length)): | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' '), | |
| "audio_start": audio_start_idx * segment_length | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "SONYC-UST": | |
| import numpy as np | |
| assert flamingo_task == "EventClassification" | |
| assert split in ["train", "test", "val"] | |
| map_split = lambda split: 'audio' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| all_labels = [] | |
| with open(os.path.join(dataset_path, 'annotations.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| for idx, row in tqdm(enumerate(reader)): | |
| if idx == 0: | |
| header = np.array(row) | |
| continue | |
| if not row[0].startswith(split): | |
| continue | |
| filename = row[2] | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| labels = [header[i] for i in range(12, len(header)-8) if str(row[i]) == "1"] | |
| labels = [x.split("_")[1].replace('-', ' ').lower() for x in labels if 'X_' not in x] | |
| all_labels += labels | |
| text_output = ", ".join(labels) | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this is a sound of' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| print('all labels:', list(set(all_labels))) | |
| elif dataset_name == "SoundDescs": | |
| import torch | |
| assert flamingo_task in ["AudioDescription"] | |
| assert split in ["train"] | |
| map_split = lambda split: 'raw/audios' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| split_file = os.path.join(dataset_path, 'audio-retrieval-benchmark/data/SoundDescs/{}_list.txt'.format(split)) | |
| with open(split_file, 'r') as f: | |
| data = f.readlines() | |
| names = set([x.replace('\n', '') for x in data]) | |
| with open(os.path.join(dataset_path, 'audio-retrieval-benchmark/sounddescs_data/descriptions.pkl'), 'rb') as f: | |
| obj = f.read() | |
| metadata_dic = pickle.loads(obj, encoding='latin1') | |
| for name in tqdm(names): | |
| if name not in metadata_dic.keys(): | |
| continue | |
| filename = '{}.wav'.format(name) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| description = metadata_dic[name] | |
| text_output = description | |
| text_prompt = 'generate audio description' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "tess": | |
| assert flamingo_task == "EmotionClassification" | |
| assert split in ["train", "val"] | |
| map_split = lambda split: '24414hz' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| split_file = os.path.join( | |
| dataset_path, | |
| 'tess_audiopath_text_sid_emotion_duration_{}_filelist.txt'.format(split) | |
| ) | |
| with open(split_file, 'r') as f: | |
| data = f.readlines() | |
| data = [x.replace('\n', '') for x in data] | |
| for row in tqdm(data): | |
| if row.count('|') != 4: | |
| continue | |
| filename, utterances, speaker, emotion, duration = row.split('|') | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = emotion.replace('_', ' ') | |
| text_output = EMOTION_MAP_DICT[text_output] | |
| text_prompt = 'this emotion is' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "UrbanSound8K": | |
| assert flamingo_task in ["EventClassification"] | |
| assert split in ["train"] | |
| map_split = lambda split: 'audio' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = None | |
| with open(os.path.join(dataset_path, 'metadata/UrbanSound8K.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| filename, fsID, start, end, salience, fold, classID, class_name = row | |
| filename = 'fold{}/{}'.format(fold, filename) | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = class_name.replace("_", " ").lower() | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'this is a sound of' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "VocalSound": | |
| assert flamingo_task == "VocalClassification" | |
| map_split = lambda split: 'data_44k' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| split_file = os.path.join( | |
| dataset_path, | |
| 'meta/{}_meta.csv'.format(split[:2] if split in ['train', 'test'] else split[:3]) | |
| ) | |
| prefix = set() | |
| with open(split_file, newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| for row in reader: | |
| prefix.add(row[0]) | |
| all_labels = set() | |
| for filename in tqdm(file_list): | |
| if not filename.split('_')[0] in prefix: | |
| continue | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| label = filename.split('_')[2].split('.wav')[0] | |
| if label == 'throatclearing': | |
| label = 'throat clearing' | |
| text_output = label | |
| text_prompt = 'this vocal sound is' | |
| all_labels.add(label) | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| print('all labels:\n', "\'" + "\', \'".join(list(all_labels)) + "\'") | |
| elif dataset_name.startswith("WavCaps"): | |
| assert split in ["train"] | |
| dataset_name, subset_name = dataset_name.split('-') # e.g. "WavCaps-SoundBible" -> ("WavCaps", "SoundBible") | |
| dataset_path = os.path.join( | |
| '/'.join(dataset_path.split('/')[:-1]), | |
| dataset_name | |
| ) | |
| dataset_dic['dataset_path'] = dataset_path | |
| map_split = lambda split: subset_name + '_flac' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.flac'), os.listdir(file_path))) | |
| metadata_file = os.listdir(os.path.join(dataset_path, "json_files", subset_name)) | |
| metadata_file = [x for x in metadata_file if x.endswith('json')][0] | |
| with open(os.path.join(dataset_path, "json_files", subset_name, metadata_file)) as f: | |
| data = f.read() | |
| reader = json.loads(data) | |
| if subset_name == "AudioSet_SL": | |
| assert flamingo_task == 'AudioCaptioning' | |
| for sample in tqdm(reader['data']): | |
| filename = sample["id"].replace('.wav', '.flac') | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| text_output = sample['caption'] | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate audio caption' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| else: | |
| assert flamingo_task in ['AudioCaptioning', 'AudioDescription'] | |
| for sample in tqdm(reader['data']): | |
| filename = sample["id"] + '.flac' | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| if flamingo_task == 'AudioCaptioning': | |
| text_output = sample['caption'] | |
| text_prompt = 'generate audio caption' | |
| elif flamingo_task == 'AudioDescription': | |
| text_output = sample['description'] | |
| text_prompt = 'generate audio description' | |
| if len(text_output) <= 1: | |
| continue | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif dataset_name == "WavText5K": | |
| assert split == 'train' | |
| map_split = lambda split: 'Webcrawl/44100/audios' | |
| file_path = os.path.join( | |
| dataset_path, | |
| map_split(split) | |
| ) | |
| assert os.path.exists(file_path), '{} not exist'.format(file_path) | |
| dataset_dic["split_path"] = map_split(split) | |
| file_list = list(filter(lambda x: x.endswith('.wav'), os.listdir(file_path))) | |
| dic = defaultdict(str) | |
| with open(os.path.join(dataset_path, 'WavText5K.csv'), newline='') as csvfile: | |
| reader = csv.reader(csvfile, delimiter=',', quotechar='"') | |
| next(reader) | |
| for row in tqdm(reader): | |
| _, _, title, description, filename, tags = row | |
| dic[filename] = (title, description, tags) | |
| if flamingo_task == "AudioCaptioning": | |
| for filename in tqdm(dic.keys()): | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| title, description, tags = dic[filename] | |
| text_output = description | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate audio caption' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| elif flamingo_task == "Tagging": | |
| for filename in tqdm(dic.keys()): | |
| if filter_file(file_path, file_list, filename): | |
| continue | |
| title, description, tags = dic[filename] | |
| if len(tags) < 2 or not tags.startswith('[') or not tags.endswith(']'): | |
| continue | |
| tags = tags[1:-1].split(', ') | |
| tags = list(map(lambda x: x.replace("'", ""), tags)) | |
| text_output = '{}'.format(', '.join(tags)) | |
| if len(text_output) <= 1: | |
| continue | |
| text_prompt = 'generate tags' | |
| dataset_dic["data"][dataset_dic["total_num"]] = { | |
| "name": filename, | |
| "prompt": text_prompt, | |
| "output": text_output.replace('\n', ' ') | |
| } | |
| dataset_dic["total_num"] += 1 | |
| with open(output_file, 'w') as json_file: | |
| json.dump(dataset_dic, json_file) | |
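| # example (hypothetical paths), mirroring the call in __main__ below: | |
| # prepare_files('Clotho-v2', '/datasets/Clotho-v2', 'train', 'AudioCaptioning', | |
| #                '/outputs/Clotho-v2-AudioCaptioning/train.json') | |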
| # ==================== Precompute CLAP and build FAISS index ==================== | |
| def int16_to_float32(x): | |
| return (x / 32767.0).astype(np.float32) | |
| def float32_to_int16(x): | |
| x = np.clip(x, a_min=-1., a_max=1.) | |
| return (x * 32767.).astype(np.int16) | |
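| # note: compute_clap_each below deliberately round-trips audio through int16 | |
| # (int16_to_float32(float32_to_int16(x))); this quantization step follows the | |
| # preprocessing shown in the LAION-CLAP examples and is not a no-op | |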
| def update_progress_bar(arg): | |
| pbar.update() | |
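| # update_progress_bar expects a global tqdm instance named `pbar` and is meant | |
| # as a completion callback (e.g. for pool.apply_async); it is not wired up in | |
| # the sequential pool.map loop below | |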
| def load_clap_model(checkpoint): | |
| if checkpoint in ['630k-audioset-best.pt', '630k-best.pt', '630k-audioset-fusion-best.pt', '630k-fusion-best.pt']: | |
| amodel = 'HTSAT-tiny' | |
| elif checkpoint in ['music_speech_audioset_epoch_15_esc_89.98.pt']: | |
| amodel = 'HTSAT-base' | |
| else: | |
| raise NotImplementedError | |
| model = laion_clap.CLAP_Module( | |
| enable_fusion=('fusion' in checkpoint.lower()), | |
| amodel=amodel | |
| ).cuda() | |
| model.load_ckpt(ckpt=os.path.join( | |
| '/lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap', | |
| checkpoint | |
| )) | |
| return model | |
| def load_audio(file_path, target_sr=44100, duration=30.0, start=0.0): | |
| if file_path.endswith('.mp3'): | |
| audio = AudioSegment.from_file(file_path) | |
| if len(audio) > (start + duration) * 1000: | |
| audio = audio[start * 1000:(start + duration) * 1000] | |
| if audio.frame_rate != target_sr: | |
| audio = audio.set_frame_rate(target_sr) | |
| if audio.channels > 1: | |
| audio = audio.set_channels(1) | |
| data = np.array(audio.get_array_of_samples()) | |
| if audio.sample_width == 2: | |
| data = data.astype(np.float32) / np.iinfo(np.int16).max | |
| elif audio.sample_width == 4: | |
| data = data.astype(np.float32) / np.iinfo(np.int32).max | |
| else: | |
| raise ValueError("Unsupported bit depth: {}".format(audio.sample_width)) | |
| else: | |
| with sf.SoundFile(file_path) as audio: | |
| original_sr = audio.samplerate | |
| channels = audio.channels | |
| start_frame = int(start * original_sr) | |
| audio.seek(start_frame) | |
| # read at most `duration` seconds starting from the seek position | |
| frames_to_read = min(int(duration * original_sr), max(0, len(audio) - start_frame)) | |
| data = audio.read(frames_to_read) | |
| if data.max() > 1 or data.min() < -1: | |
| data = data / max(abs(data.max()), abs(data.min())) | |
| if original_sr != target_sr: | |
| if channels == 1: | |
| data = librosa.resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr) | |
| else: | |
| data = librosa.resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0] | |
| else: | |
| if channels != 1: | |
| data = data.T[0] | |
| if data.min() >= 0: | |
| data = 2 * data / abs(data.max()) - 1.0 | |
| else: | |
| data = data / max(abs(data.max()), abs(data.min())) | |
| return data | |
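| # example (hypothetical path), mirroring the call in compute_clap_each below: | |
| # data = load_audio('/path/to/clip.wav', target_sr=48000, duration=10) | |
| # returns a 1-D float array at 48 kHz with values roughly in [-1, 1] | |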
| def compute_clap_each(audio_file, model): | |
| try: | |
| data = load_audio(audio_file, target_sr=48000, duration=10) | |
| print(audio_file, 'loaded') | |
| except Exception as e: | |
| print(audio_file, 'unsuccessful due to', e) | |
| return None | |
| audio_data = data.reshape(1, -1) | |
| audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float().cuda() | |
| audio_embed = model.get_audio_embedding_from_data(x=audio_data_tensor, use_tensor=True) | |
| audio_embed = audio_embed.squeeze(0).cpu() | |
| return audio_embed | |
| def compute_embeddings_batch(batch, audio_files, model): | |
| batch_results = [] | |
| for i in batch: | |
| if i >= len(audio_files): | |
| break | |
| audio_file = audio_files[i] | |
| audio_embed = compute_clap_each(audio_file, model) | |
| batch_results.append((i, audio_file, audio_embed)) | |
| return batch_results | |
| def precompute_clap_for_dataset( | |
| dataset_file, | |
| embedding_output_file, | |
| checkpoint='630k-audioset-fusion-best.pt' | |
| ): | |
| contents, audio_files = load_dataset_file(dataset_file) | |
| model = load_clap_model(checkpoint) | |
| if os.path.exists(embedding_output_file): | |
| print('loading already computed embedding file from', embedding_output_file) | |
| with open(embedding_output_file, 'rb') as f: | |
| saved_data = pickle.load(f) | |
| curr_audio_indices = saved_data['audio_indices'] | |
| curr_audio_files = saved_data['audio_files'] | |
| curr_audio_embeds = saved_data['audio_embeds'] | |
| else: | |
| curr_audio_indices = [] | |
| curr_audio_files = [] | |
| curr_audio_embeds = [] | |
| print('computing embeddings for {}'.format(dataset_file)) | |
| start_index = len(curr_audio_files) | |
| remaining_indices = list(range(start_index, len(audio_files))) | |
| batch_size = 128 | |
| batches = [ | |
| list(range(i, min(i + batch_size, len(audio_files)))) \ | |
| for i in range(start_index, len(audio_files), batch_size) | |
| ] | |
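| # each pool.map call submits exactly one batch, so embeddings are computed | |
| # batch by batch and progress can be checkpointed to disk after every batch | |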
| with multiprocessing.Pool(processes=4) as pool: | |
| for i, batch in enumerate(batches): | |
| batch_results = pool.map( | |
| partial(compute_embeddings_batch, model=model, audio_files=audio_files), | |
| [batch] | |
| ) | |
| for result in batch_results[0]: | |
| curr_audio_indices.append(result[0]) | |
| curr_audio_files.append(result[1]) | |
| curr_audio_embeds.append(result[2]) | |
| with open(embedding_output_file, 'wb') as f: | |
| pickle.dump({ | |
| 'audio_indices': curr_audio_indices, | |
| 'audio_files': curr_audio_files, | |
| 'audio_embeds': curr_audio_embeds | |
| }, f) | |
| print(f"Saved progress for batch {i+1}/{len(batches)}: \ | |
| audio_indices {len(curr_audio_indices)}, \ | |
| audio_files {len(curr_audio_files)}, \ | |
| audio_embeds {len(curr_audio_embeds)}*{curr_audio_embeds[0].shape}") | |
| return curr_audio_indices, curr_audio_files, curr_audio_embeds | |
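| # precompute_clap_for_dataset is resumable: it reloads any partial .embedding | |
| # pickle and continues from len(curr_audio_files), so interrupted runs pick up | |
| # where they left off | |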
| def build_faiss_index(embeddings): | |
| d = embeddings[0].size(0) | |
| index = faiss.IndexFlatL2(d) | |
| np_embeddings = np.vstack([emb.numpy() for emb in embeddings]) | |
| index.add(np_embeddings) | |
| return index | |
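| # querying the flat index is a brute-force nearest-neighbor search; a minimal | |
| # sketch (illustrative names): given a [1, d] float32 query embedding, | |
| #     distances, indices = index.search(query_embedding_np, k) | |
| # returns the k nearest stored embeddings by squared L2 distance | |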
| def build_faiss_index_dataset( | |
| dataset_file, | |
| embedding_output_file, | |
| faiss_output_file, | |
| checkpoint='630k-audioset-fusion-best.pt', | |
| only_precompute_clap=False | |
| ): | |
| audio_indices, audio_files, audio_embeds = precompute_clap_for_dataset(dataset_file, embedding_output_file, checkpoint) | |
| if only_precompute_clap: | |
| return | |
| valid_indices, valid_files, valid_embeds = [], [], [] | |
| for audio_index, audio_file, audio_embed in zip(audio_indices, audio_files, audio_embeds): | |
| if audio_embed is not None: | |
| valid_indices.append(audio_index) | |
| valid_files.append(audio_file) | |
| valid_embeds.append(audio_embed) | |
| print('building faiss index') | |
| faiss_index = build_faiss_index(valid_embeds) | |
| print('saving faiss index') | |
| faiss.write_index(faiss_index, faiss_output_file) | |
| with open(faiss_output_file + '.filenames', 'wb') as f: | |
| pickle.dump({'audio_indices': valid_indices, 'audio_files': valid_files}, f) | |
| # ==================== Generate interleaved dataset files ==================== | |
| # only indices are saved, so the full samples can be recovered from the split json files | |
| def build_interleaved_dataset(dataset_file, interleaved_output_file, embedding_output_file, faiss_output_file, mode='random', n_samples=3): | |
| contents, audio_files = load_dataset_file(dataset_file) | |
| dataset_dic = { | |
| "dataset_path": contents["dataset_path"], | |
| "split": contents["split"], | |
| "split_path": contents["split_path"], | |
| "flamingo_task": contents["flamingo_task"], | |
| "total_num": 0, | |
| "interleaved_data": {}, | |
| } | |
| # interleaved_data is | |
| # { | |
| # id: { | |
| # "generation_index_in_split": index of sample in the train or val or test.json, | |
| # "fewshot_indices_in_train": list(indices) of few shot samples in train.json | |
| # } | |
| # } | |
| if mode == 'knn': | |
| model = load_clap_model(checkpoint='630k-audioset-fusion-best.pt') | |
| print('loading already computed embedding file from', embedding_output_file) | |
| with open(embedding_output_file, 'rb') as f: | |
| precomputed_data = pickle.load(f) | |
| precomputed_audio_indices = precomputed_data['audio_indices'] | |
| precomputed_audio_files = precomputed_data['audio_files'] | |
| precomputed_audio_embeds = precomputed_data['audio_embeds'] | |
| faiss_index = faiss.read_index(faiss_output_file) | |
| with open(faiss_output_file+'.filenames', 'rb') as f: | |
| _data = pickle.load(f) | |
| faiss_index_audio_indices = _data['audio_indices'] | |
| faiss_index_audio_files = _data['audio_files'] | |
| print('looking for few shot samples and building interleaved_{} data'.format(mode)) | |
| for i in tqdm(range(contents["total_num"])): | |
| if mode == 'random': | |
| few_shot_indices = list(np.random.choice( | |
| list(set(list(range(contents["total_num"]))) - set([i])), | |
| size=n_samples-1, | |
| replace=False | |
| )) | |
| few_shot_indices = list(map(int, few_shot_indices)) | |
| elif mode == 'knn': | |
| if audio_files[i] in precomputed_audio_files: | |
| idx = precomputed_audio_files.index(audio_files[i]) | |
| query_embedding_np = precomputed_audio_embeds[idx] | |
| if query_embedding_np is not None: | |
| query_embedding_np = query_embedding_np.numpy().reshape(1, -1) | |
| else: | |
| continue | |
| else: | |
| query_embedding_np = compute_clap_each(audio_files[i], model) | |
| if query_embedding_np is not None: | |
| query_embedding_np = query_embedding_np.numpy().reshape(1, -1) | |
| else: | |
| continue | |
| distances, knn_indices = faiss_index.search(query_embedding_np, n_samples+50) # over-fetch; self matches and duplicate files are filtered below | |
| distances = distances[0] | |
| knn_indices = knn_indices[0] | |
| knn_filenames = [faiss_index_audio_files[idx] for idx in knn_indices] | |
| combined = list(zip(knn_indices, knn_filenames)) | |
| unique_indices = defaultdict(list) | |
| for idx, filename in combined: | |
| unique_indices[filename].append(idx) | |
| cleared_knn_indices = [random.choice(unique_indices[filename]) for filename in unique_indices if filename != audio_files[i]] | |
| if dataset_file.endswith('train.json'): | |
| cleared_knn_indices = [knn_i for knn_i in cleared_knn_indices if faiss_index_audio_indices[knn_i] != i] | |
| cleared_knn_indices = cleared_knn_indices[:n_samples-1] | |
| np.random.shuffle(cleared_knn_indices) | |
| few_shot_indices = [faiss_index_audio_indices[knn_i] for knn_i in cleared_knn_indices] | |
| dataset_dic["interleaved_data"][dataset_dic["total_num"]] = { | |
| "generation_index_in_split": i, | |
| "fewshot_indices_in_train": few_shot_indices | |
| } | |
| dataset_dic["total_num"] += 1 | |
| with open(interleaved_output_file, 'w') as json_file: | |
| json.dump(dataset_dic, json_file) | |
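| # A minimal sketch (hypothetical helper, not called anywhere in this script) of | |
| # how an interleaved entry maps back to concrete samples, assuming the json | |
| # layout documented above: | |
| def recover_interleaved_sample(train_contents, split_contents, entry): | |
| # few-shot examples are always drawn from train.json | |
| shots = [train_contents["data"][str(j)] for j in entry["fewshot_indices_in_train"]] | |
| # the generation target comes from the split the interleaved file was built for | |
| target = split_contents["data"][str(entry["generation_index_in_split"])] | |
| return shots + [target] | |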
| if __name__ == '__main__': | |
| import argparse | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument('-d', '--dataset_name', type=str, help='dataset name') | |
| parser.add_argument('-f', '--flamingo_task', type=str, help='flamingo task') | |
| parser.add_argument('--interleave', action="store_true", help='prepare the interleave dataset') | |
| args = parser.parse_args() | |
| ROOT = "/lustre/fsw/portfolios/adlr/users/zkong" | |
| dataset_root = os.path.join(ROOT, "datasets") | |
| output_root = os.path.join(ROOT, "audio-flamingo-data/dataset_files") | |
| os.makedirs(output_root, exist_ok=True) | |
| dataset_name = args.dataset_name # "Clotho-v2", "AudioSet", "Clotho-AQA", "WavText5K", "FSD50k", ... | |
| flamingo_task = args.flamingo_task # AQA, AudioCaptioning, EventClassification, SceneClassification, Tagging, ... | |
| # 'train' must come first: interleaving for every split queries the faiss index built from the train embeddings | |
| for split in ["train", "val", "test"]: | |
| dataset_path = os.path.join(dataset_root, dataset_name) | |
| output_folder = '{}-{}'.format(dataset_name, flamingo_task) | |
| os.makedirs(os.path.join(output_root, output_folder), exist_ok=True) | |
| dataset_file = os.path.join(output_root, output_folder, '{}.json'.format(split)) | |
| if not os.path.exists(dataset_file): | |
| try: | |
| prepare_files(dataset_name, dataset_path, split, flamingo_task, dataset_file) | |
| except AssertionError as e: | |
| print('split {} not exist for {}: {}'.format(split, dataset_name, e)) | |
| continue | |
| else: | |
| print('{} exists; skipping'.format(dataset_file)) | |
| if args.interleave: | |
| faiss_output_file = dataset_file.replace('{}.json'.format(split), "train_faiss_index.index") | |
| embedding_output_file = dataset_file.replace('.json', ".embedding") | |
| if split == 'train': | |
| if (not os.path.exists(faiss_output_file)) or (not os.path.exists(faiss_output_file + '.filenames')): | |
| build_faiss_index_dataset( | |
| dataset_file, embedding_output_file, faiss_output_file, | |
| only_precompute_clap=False | |
| ) | |
| else: | |
| print('{} exists; skipping'.format(faiss_output_file)) | |
| else: | |
| build_faiss_index_dataset( | |
| dataset_file, embedding_output_file, | |
| faiss_output_file=None, | |
| only_precompute_clap=True | |
| ) | |
| print('finished precomputing embeddings for the {} split'.format(split)) | |
| for mode in ['knn', 'random']: | |
| interleaved_output_file = '/'.join( | |
| dataset_file.split('/')[:-1] + \ | |
| ['interleaved_{}-'.format(mode) + dataset_file.split('/')[-1]] | |
| ) | |
| if not os.path.exists(interleaved_output_file): | |
| build_interleaved_dataset( | |
| dataset_file=dataset_file, | |
| interleaved_output_file=interleaved_output_file, | |
| embedding_output_file=embedding_output_file, | |
| faiss_output_file=faiss_output_file, | |
| mode=mode, | |
| n_samples=4 | |
| ) | |
| else: | |
| print('{} exists; skipping'.format(interleaved_output_file)) | |
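| # example invocation (script name is illustrative; -d/-f/--interleave are the flags defined above): | |
| #     python prepare_datasets.py -d Clotho-v2 -f AudioCaptioning --interleave | |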