import io
import copy
import librosa
import numpy as np
AUDIO_TOKEN_FORMAT = "<|{}|>"  # e.g. "<|123|>" for audio token id 123
DEFAULT_SYSTEM_START_TOKEN = ""
DEFAULT_SYSTEM_END_TOKEN = ""
DEFAULT_TTS_REF_START_TOKEN = ""
DEFAULT_TTS_REF_END_TOKEN = ""
DEFAULT_TTS_REF_TOKEN = ""
DEFAULT_CHAT_REF_START_TOKEN = ""
DEFAULT_CHAT_REF_END_TOKEN = ""
DEFAULT_CHAT_REF_TOKEN = ""
DEFAULT_HUMAN_TOKEN = "<|HUMAN|>"
DEFAULT_ASSISTANT_TOKEN = "<|VOILA|>"
DEFAULT_AUDIO_TOKEN = ""
# ===================================
# Task special tokens
# -----------------------------------
TASK_ASR_TOKEN = ""
TASK_TTS_TOKEN = ""
TASK_CHAT_TOKEN = ""
TASK_STREAM_CHAT_TOKEN = ""
TASK_ASR_TEXT_OUTPUT = ""
TASK_TTS_AUDIO_OUTPUT = ""
TASK_CHAT_TEXT_OUTPUT = ""
TASK_CHAT_AUDIO_OUTPUT = ""
CHAT_AUDIO_TEXT_SPLIT_TOKEN = ""
# ===================================
PREPEND_LEN = 80    # samples of leading silence (5 ms at 16 kHz)
SEG_LEN = 640       # audio samples per token (40 ms at 16 kHz)
AUDIO_SR = 16000    # all audio is (re)sampled to 16 kHz mono
TASK_TYPE_CONF = {
"chat_asr": TASK_ASR_TOKEN + TASK_ASR_TEXT_OUTPUT,
"chat_tts": TASK_TTS_TOKEN + TASK_TTS_AUDIO_OUTPUT,
"chat_tito": TASK_CHAT_TOKEN + TASK_CHAT_TEXT_OUTPUT,
"chat_tiao": TASK_CHAT_TOKEN + TASK_CHAT_AUDIO_OUTPUT,
"chat_aiao": TASK_CHAT_TOKEN + TASK_CHAT_AUDIO_OUTPUT,
"chat_atiao": TASK_CHAT_TOKEN + TASK_CHAT_AUDIO_OUTPUT,
"chat_aiao_auto": TASK_STREAM_CHAT_TOKEN + TASK_CHAT_AUDIO_OUTPUT,
}
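# The task-type suffixes appear to encode modality ("t" = text, "a" = audio,
# "i" = in, "o" = out): e.g. tito = text-in/text-out, aiao = audio-in/audio-out,
# tiao = text-in/audio-out; "aiao_auto" is the autonomous (streaming) variant.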
def _get_zero_audio_pad(token_num):
    # Silence spanning token_num audio tokens (SEG_LEN samples each)
    return np.zeros(SEG_LEN * token_num)
def _wrapper_audio_tokens(audio_tokens, num_codebooks, codebook_size):
    """Render per-codebook audio ids as special-token text, offsetting ids in
    codebook n by n*codebook_size so ids from different codebooks never collide."""
    ret_audio_tokens = []
    for n in range(num_codebooks):
        audio_token = audio_tokens[n]
        ret_audio_tokens.append(''.join([
            AUDIO_TOKEN_FORMAT.format(au + n*codebook_size) if isinstance(au, int) else au
            for au in audio_token
        ]))
    return ret_audio_tokens
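# Example of the offsetting above: with codebook_size=2048, a raw id of 5 from
# codebook 1 is rendered as "<|2053|>" (5 + 1*2048); codebook 0 ids pass through unshifted.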
def _wrapper_audio_tokens_autonomous(audio_tokens, num_codebooks, codebook_size, audio_token_min_id):
    """Same per-codebook offsetting, but returns raw token ids (shifted by
    audio_token_min_id into the tokenizer's audio-token id range) instead of text."""
    ret_audio_tokens = []
    for n in range(num_codebooks):
        audio_token = audio_tokens[n]
        ret_audio_tokens.append([(au + n*codebook_size + audio_token_min_id) for au in audio_token])
    return ret_audio_tokens
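def _load_audio(audio_meta, task_type):
    """Load one turn's audio as a mono waveform at AUDIO_SR.

    Accepts an in-memory array (with its sample rate), raw encoded bytes, or a
    file path, checked in that order; stereo arrays are downmixed to mono.
    Shared by all the input formatters below.
    """
    if 'array' in audio_meta:
        assert "sr" in audio_meta
        if len(audio_meta['array'].shape) > 1:
            assert audio_meta['array'].shape[0] <= 2
            audio_meta['array'] = librosa.to_mono(audio_meta['array'])
        audio = librosa.resample(audio_meta['array'], orig_sr=audio_meta["sr"], target_sr=AUDIO_SR)
    elif "bytes" in audio_meta:
        audio, _ = librosa.load(io.BytesIO(audio_meta['bytes']), sr=AUDIO_SR)
    elif "file" in audio_meta:
        audio, _ = librosa.load(audio_meta['file'], sr=AUDIO_SR)
    else:
        raise ValueError(f"No audio input for task {task_type}")
    return audio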
def _token_input_format(item, tokenizer, tokenizer_voila, dataset_cfg):
task_type = dataset_cfg["task_type"]
num_codebooks = dataset_cfg["num_codebooks"]
codebook_size = dataset_cfg["codebook_size"]
task_token = TASK_TYPE_CONF[task_type]
# Construct system message
system = item["instruction"]
if task_type in ["chat_aiao", "chat_atiao", "chat_tiao"]:
system = DEFAULT_CHAT_REF_START_TOKEN + DEFAULT_CHAT_REF_TOKEN + DEFAULT_CHAT_REF_END_TOKEN + system
elif task_type == "chat_tts":
system = DEFAULT_TTS_REF_START_TOKEN + DEFAULT_TTS_REF_TOKEN + DEFAULT_TTS_REF_END_TOKEN + system
else:
print (f"task type {task_type} do not use ref.")
system = task_token + system
system = DEFAULT_SYSTEM_START_TOKEN + system + DEFAULT_SYSTEM_END_TOKEN
# Get ids for system
system_ids = tokenizer.encode(system, add_special_tokens=False)
# Copy into num_codebooks input ids
input_ids_list = []
for _ in range(num_codebooks):
input_ids_list.append(copy.deepcopy(system_ids))
# Assemble conversations
for i, turn in enumerate(item["conversations"]):
if turn['from'] == 'assistant':
            # Tasks whose assistant turn carries audio: emit audio tokens
if task_type in ["chat_aiao", "chat_tts"]:
if "audio" not in turn:
content = DEFAULT_ASSISTANT_TOKEN
content_ids = tokenizer.encode(content, add_special_tokens=False)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
else:
                    audio = _load_audio(turn["audio"], task_type)
# get audio token
audio_tokens = tokenizer_voila.encode(audio, sr=AUDIO_SR)
audio_tokens = audio_tokens.cpu().numpy().tolist()
audio_tokens = _wrapper_audio_tokens(audio_tokens, num_codebooks, codebook_size)
for n in range(num_codebooks):
content = DEFAULT_ASSISTANT_TOKEN + audio_tokens[n] + tokenizer.eos_token
content_ids = tokenizer.encode(content, add_special_tokens=False, truncation=True,
max_length=tokenizer.model_max_length)
input_ids_list[n] += content_ids
elif task_type in ["chat_tito", "chat_asr"]:
if "text" not in turn:
content = DEFAULT_ASSISTANT_TOKEN
content_ids = tokenizer.encode(content, add_special_tokens=False)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
else:
text = turn['text'].strip()
content = DEFAULT_ASSISTANT_TOKEN + text + tokenizer.eos_token
content_ids = tokenizer.encode(content, add_special_tokens=False, truncation=True,
max_length=tokenizer.model_max_length)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
else:
raise ValueError (f"[Error] Invalid data type of {task_type}.")
else:
            # Tasks whose user turn carries audio: emit audio tokens
if task_type in ["chat_aiao", "chat_asr"]:
                assert "audio" in turn
                audio = _load_audio(turn["audio"], task_type)
# get audio token
audio_tokens = tokenizer_voila.encode(audio, sr=AUDIO_SR)
audio_tokens = audio_tokens.cpu().numpy().tolist()
audio_tokens = _wrapper_audio_tokens(audio_tokens, num_codebooks, codebook_size)
for n in range(num_codebooks):
content = DEFAULT_HUMAN_TOKEN + audio_tokens[n]
content_ids = tokenizer.encode(content, add_special_tokens=False, truncation=True,
max_length=tokenizer.model_max_length)
input_ids_list[n] += copy.deepcopy(content_ids)
elif task_type in ["chat_tito", "chat_tts"]:
text = turn['text'].strip()
content = DEFAULT_HUMAN_TOKEN + text
content_ids = tokenizer.encode(content, add_special_tokens=False, truncation=True,
max_length=tokenizer.model_max_length)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
else:
raise ValueError (f"[Error] Invalid data type of {task_type}.")
for n in range(num_codebooks):
input_ids_list[n] = input_ids_list[n][:tokenizer.model_max_length]
return input_ids_list, None, None, None
def _token_input_format_autonomous(item, tokenizer, tokenizer_voila, dataset_cfg):
task_type = dataset_cfg["task_type"]
num_codebooks = dataset_cfg["num_codebooks"]
codebook_size = dataset_cfg["codebook_size"]
    assert task_type == "chat_aiao_auto", f"only chat_aiao_auto is supported, got {task_type}"
DEFAULT_HUMAN_TOKEN_ID = tokenizer.convert_tokens_to_ids(DEFAULT_HUMAN_TOKEN)
assert isinstance(DEFAULT_HUMAN_TOKEN_ID, int), "DEFAULT_HUMAN_TOKEN_ID should be an integer"
AUDIO_MIN_TOKEN_ID = tokenizer.convert_tokens_to_ids(AUDIO_TOKEN_FORMAT.format(0))
assert isinstance(AUDIO_MIN_TOKEN_ID, int), "AUDIO_MIN_TOKEN_ID should be an integer"
task_token = TASK_TYPE_CONF[task_type]
# Construct system message
system = DEFAULT_CHAT_REF_START_TOKEN + DEFAULT_CHAT_REF_TOKEN + DEFAULT_CHAT_REF_END_TOKEN
system = task_token + system
system = DEFAULT_SYSTEM_START_TOKEN + system + DEFAULT_SYSTEM_END_TOKEN
# Get ids for system
system_ids_list = [[], []]
system_ids = tokenizer.encode(system, add_special_tokens=False)
    # Splice instruction ids in just before the closing system token
instruction = item["instruction"]
if instruction != "":
instruction_ids = tokenizer.encode(instruction, add_special_tokens=False)
else:
instruction_ids = []
system_ids_list[0] = system_ids[:-1] + instruction_ids + system_ids[-1:]
system_ids_list[1] = system_ids[:-1] + instruction_ids + system_ids[-1:]
    # Build two parallel channels of per-codebook input ids (one per audio stream)
channel1_input_ids_list = [[] for _ in range(num_codebooks)]
channel2_input_ids_list = [[] for _ in range(num_codebooks)]
for n in range(num_codebooks):
channel1_input_ids_list[n] += copy.deepcopy(system_ids_list[0]) + [DEFAULT_HUMAN_TOKEN_ID]
channel2_input_ids_list[n] += copy.deepcopy(system_ids_list[1]) + [DEFAULT_HUMAN_TOKEN_ID]
    # Prepare audio tokens that simulate the user's streaming input;
    # only the first conversation turn's audio is consumed here
    audio_meta = item['conversations'][0]['audio']
    audio = _load_audio(audio_meta, task_type)
# get audio token
streaming_user_input_audio_tokens = tokenizer_voila.encode(audio, sr=AUDIO_SR)
streaming_user_input_audio_tokens = streaming_user_input_audio_tokens.cpu().numpy().tolist()
streaming_user_input_audio_tokens = _wrapper_audio_tokens_autonomous(streaming_user_input_audio_tokens, num_codebooks, codebook_size, AUDIO_MIN_TOKEN_ID)
return [channel1_input_ids_list, channel2_input_ids_list], None, None, streaming_user_input_audio_tokens
def _alpha_audio_input_format(item, tokenizer, dataset_cfg):
    """Format an item for raw-audio ("alpha") input: each SEG_LEN-sample chunk
    is represented by one DEFAULT_AUDIO_TOKEN placeholder, the waveform itself
    is returned in audio_data, and audio_data_mask marks which token positions
    carry real audio."""
task_type = dataset_cfg["task_type"]
num_codebooks = dataset_cfg["num_codebooks"]
codebook_size = dataset_cfg["codebook_size"]
task_token = TASK_TYPE_CONF[task_type]
# Construct system message
system = item["instruction"]
if task_type in ["chat_aiao", "chat_atiao", "chat_tiao"]:
system = DEFAULT_CHAT_REF_START_TOKEN + DEFAULT_CHAT_REF_TOKEN + DEFAULT_CHAT_REF_END_TOKEN + system
elif task_type == "chat_tts":
system = DEFAULT_TTS_REF_START_TOKEN + DEFAULT_TTS_REF_TOKEN + DEFAULT_TTS_REF_END_TOKEN + system
else:
print (f"task type {task_type} do not use ref.")
system = task_token + system
system = DEFAULT_SYSTEM_START_TOKEN + system + DEFAULT_SYSTEM_END_TOKEN
# Get ids for system
system_ids = tokenizer.encode(system, add_special_tokens=False)
# Copy into num_codebooks input ids
input_ids_list = []
for _ in range(num_codebooks):
input_ids_list.append(copy.deepcopy(system_ids))
    # Waveform buffer: PREPEND_LEN samples of silence, then zero padding
    # covering the text-only system prompt tokens
    audio_data = [np.zeros(PREPEND_LEN)]
    audio_data.append(_get_zero_audio_pad(len(system_ids)))
    audio_data_mask = [0] * len(system_ids)
# Assemble conversations
for i, turn in enumerate(item["conversations"]):
if turn['from'] == 'assistant':
            # Tasks whose assistant turn carries audio
if task_type in ["chat_aiao"]:
if "audio" not in turn:
content = DEFAULT_ASSISTANT_TOKEN
content_ids = tokenizer.encode(content, add_special_tokens=False)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
# preprocess audio_data & audio_data_mask
audio_data.append(_get_zero_audio_pad(len(content_ids)))
audio_data_mask += [0] * len(content_ids)
else:
                    audio = _load_audio(turn["audio"], task_type)
                    # Represent audio as placeholder tokens, one per SEG_LEN samples
                    audio_token_num = len(audio) // SEG_LEN
                    audio_token = DEFAULT_AUDIO_TOKEN * audio_token_num
                    audio = audio[:SEG_LEN*audio_token_num] # trim audio to a whole number of tokens
content = DEFAULT_ASSISTANT_TOKEN + audio_token + tokenizer.eos_token
content_ids = tokenizer.encode(content, add_special_tokens=False, truncation=True,
max_length=tokenizer.model_max_length)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
                    audio_data.append(_get_zero_audio_pad(1))   # assistant token position
                    audio_data_mask += [0]
                    audio_data.append(audio)
                    audio_data_mask += [1] * audio_token_num
                    audio_data.append(_get_zero_audio_pad(1))   # eos token position
                    audio_data_mask += [0]
elif task_type in ["chat_tito"]:
if "text" not in turn:
content = DEFAULT_ASSISTANT_TOKEN
content_ids = tokenizer.encode(content, add_special_tokens=False)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
# preprocess audio_data & audio_data_mask
audio_data.append(_get_zero_audio_pad(len(content_ids)))
audio_data_mask += [0] * len(content_ids)
else:
text = turn['text'].strip()
content = DEFAULT_ASSISTANT_TOKEN + text + tokenizer.eos_token
content_ids = tokenizer.encode(content, add_special_tokens=False, truncation=True,
max_length=tokenizer.model_max_length)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
audio_data.append(_get_zero_audio_pad(len(content_ids)))
audio_data_mask += [0] * len(content_ids)
else:
raise ValueError (f"[Error] Invalid data type of {task_type}.")
else:
            # Tasks whose user turn carries audio
if task_type in ["chat_aiao"]:
                assert "audio" in turn
                audio = _load_audio(turn["audio"], task_type)
                # Represent audio as placeholder tokens, one per SEG_LEN samples
                audio_token_num = len(audio) // SEG_LEN
                audio_token = DEFAULT_AUDIO_TOKEN * audio_token_num
                audio = audio[:SEG_LEN*audio_token_num] # trim audio to a whole number of tokens
content = DEFAULT_HUMAN_TOKEN + audio_token
content_ids = tokenizer.encode(content, add_special_tokens=False, truncation=True,
max_length=tokenizer.model_max_length)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
                audio_data.append(_get_zero_audio_pad(1))   # human token position
                audio_data_mask += [0]
                audio_data.append(audio)
                audio_data_mask += [1] * audio_token_num
elif task_type in ["chat_tito"]:
text = turn['text'].strip()
content = DEFAULT_HUMAN_TOKEN + text
content_ids = tokenizer.encode(content, add_special_tokens=False, truncation=True,
max_length=tokenizer.model_max_length)
for n in range(num_codebooks):
input_ids_list[n] += copy.deepcopy(content_ids)
audio_data.append(_get_zero_audio_pad(len(content_ids)))
audio_data_mask += [0] * len(content_ids)
else:
raise ValueError (f"[Error] Invalid data type of {task_type}.")
for n in range(num_codebooks):
input_ids_list[n] = input_ids_list[n][:tokenizer.model_max_length]
audio_data_mask = audio_data_mask[:tokenizer.model_max_length]
audio_data = np.concatenate(audio_data)
audio_data = audio_data[:PREPEND_LEN + tokenizer.model_max_length*SEG_LEN]
return input_ids_list, audio_data, audio_data_mask, None
# Item format
# {
# "instruction": "",
# "conversations": [
# {
# "from": "user" or "assistant",
# "text": "",
# "audio": {
# "array": [],
# "sr": 16000,
# "bytes": "",
# "file": "",
# },
# }
# ],
# }
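# A hypothetical chat_aiao item matching the format above (illustrative only;
# field values are made up):
# {
#     "instruction": "You are a helpful voice assistant.",
#     "conversations": [
#         {"from": "user", "audio": {"file": "question.wav"}},
#         {"from": "assistant", "audio": {"array": np.zeros(16000), "sr": 16000}},
#     ],
# }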
def voila_input_format(item, tokenizer, tokenizer_voila, dataset_cfg):
    """Dispatch on dataset_cfg["input_type"] and return a 4-tuple
    (input_ids_list, audio_data, audio_data_mask, streaming_user_input_audio_tokens);
    slots a given input type does not produce are None."""
if dataset_cfg["input_type"] == "audio":
return _alpha_audio_input_format(item, tokenizer, dataset_cfg)
elif dataset_cfg["input_type"] == "autonomous":
return _token_input_format_autonomous(item, tokenizer, tokenizer_voila, dataset_cfg)
else:
return _token_input_format(item, tokenizer, tokenizer_voila, dataset_cfg)
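# Minimal usage sketch (hypothetical tokenizers, item, and config values;
# any input_type other than "audio"/"autonomous" selects the token path):
#   input_ids_list, audio_data, audio_data_mask, _ = voila_input_format(
#       item, tokenizer, tokenizer_voila,
#       {"input_type": "token", "task_type": "chat_aiao",
#        "num_codebooks": 4, "codebook_size": 2048})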