from data_gen.tts.base_preprocess import BasePreprocessor
import re


class BiaobeiPreprocess(BasePreprocessor):
    """Preprocessor that builds metadata from ``ProsodyLabeling/000001-010000.txt``.

    The transcript file and the ``wav`` directory are both looked up under
    ``self.raw_data_dir``.
    """

    def meta_data(self):
        """Yield one metadata dict per transcript entry.

        Every second line of the transcript file (starting with the first) is
        treated as an entry; the lines in between are skipped (presumably
        phonetic annotations -- confirm against the corpus layout).  The text
        is the second tab-separated field of the line, stripped of
        surrounding whitespace and with every ``#<digits>`` marker removed.

        Entries are numbered from 1 in file order, and that number, zero-padded
        to six digits, is used both as the item name and as the wav file stem.

        Yields:
            dict with keys ``item_name``, ``wav_fn`` and ``txt``.

        Raises:
            IndexError: if an entry line contains no tab character.
        """
        input_dir = self.raw_data_dir
        with open(f"{input_dir}/ProsodyLabeling/000001-010000.txt", encoding='utf-8') as f:
            # Read everything up front so the file is closed before yielding.
            bb_lines = f.readlines()[::2]

        # Raw string: the non-raw "\#\d+" relied on invalid escape sequences,
        # which newer Python versions warn about.
        tag_pattern = re.compile(r"#\d+")

        for l_idx, line in enumerate(bb_lines, start=1):
            txt = tag_pattern.sub("", line.split('\t')[1].strip())
            item_name = f'{l_idx:06d}'
            wav_fn = f"{input_dir}/wav/{item_name}.wav"
            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}

if __name__ == "__main__":
    # Script entry point: instantiate the preprocessor and run its
    # `process()` method (defined outside this file's visible code).
    preprocessor = BiaobeiPreprocess()
    preprocessor.process()