""" Utilities for reading and writing data files.
"""
import multiprocessing as mp
import os
from pathlib import PosixPath
from typing import Callable, Dict, List, Optional, Tuple, Union
from datasets import DatasetDict, load_dataset

from transformers import (
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    default_data_collator,
)

from . import config

# Silence the Hugging Face tokenizers parallelism warning triggered by multiprocessing
os.environ["TOKENIZERS_PARALLELISM"] = "false"
UBUNTU_ROOT = str(config.root)

def load_datasets(
    tokenizer: PreTrainedTokenizer,
    train_data: Union[str, PosixPath],
    eval_data: Optional[Union[str, PosixPath]] = None,
    test_data: Optional[Union[str, PosixPath]] = None,
    file_type: str = "csv",
    delimiter: str = "\t",
    seq_key: str = "sequence",
    shuffle: bool = True,
    filter_empty: bool = False,
    n_workers: int = mp.cpu_count(),
    **kwargs,
) -> DatasetDict:
    """Load and cache data using Huggingface datasets library

    Args:
        tokenizer (PreTrainedTokenizer): tokenizer to apply to the sequences
        train_data (Union[str, PosixPath]): location of training data
        eval_data (Union[str, PosixPath], optional): location of evaluation data. Defaults to None.
        test_data (Union[str, PosixPath], optional): location of test data. Defaults to None.
        file_type (str, optional): type of file. Possible values are 'text' and 'csv'. Defaults to 'csv'.
        delimiter (str, optional): field delimiter for csv files. Defaults to '\t'.
        seq_key (str, optional): column name of the sequence data. Can be
            'sequence', 'seq', or 'text'. Defaults to 'sequence'.
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to True.
        filter_empty (bool, optional): Whether to filter out empty sequences. Defaults to False.
            NOTE: This completes an additional iteration, which can be time-consuming.
            Only enable if you have reason to believe that preprocessing steps will
            result in empty sequences.
        transformation (str, optional): type of transformation to apply.
            Options are 'log', 'boxcox'. Defaults to None.
        log_offset (Union[float, int]): value to offset gene expression values
            by before log transforming. Defaults to 1.
        preprocessor (BaseEstimator): preprocessor for the Yeo-Johnson transformation.
        tissue_subset (Union[str, int, list], optional): tissues to subset labels to.
            Defaults to None.
        nshards (int, optional): Number of shards to divide data into, only
            keeping the first. Defaults to None.
        threshold (float, optional): filter out rows where all labels are
            below `threshold`, or, if `discretize` is True, the cutoff used by
            `discretize`. Defaults to None.
        discretize (bool, optional): set gene expression values below
            `threshold` to 0, above `threshold` to 1.
        kmer (int, optional): whether to run the k-mer flip experiment and, if
            so, the size of the k-mers to flip. Defaults to None.
        n_workers (int, optional): number of processes to use for preprocessing.
            Defaults to `mp.cpu_count()` (number of available CPUs).
        position_buckets (Tuple[int], optional): the different buckets for the
            bucketed positional importance experiment.

    Returns:
        DatasetDict: tokenized dataset splits ('train', plus 'eval' and 'test'
            when provided).
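
    Example (illustrative; the tokenizer checkpoint and file paths are placeholders,
    assuming tab-separated files with a 'sequence' column):

        >>> from transformers import AutoTokenizer
        >>> tok = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> datasets = load_datasets(tok, "train.tsv", eval_data="eval.tsv")
        >>> "input_ids" in datasets["train"].column_names
        True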
    """
    data_files = {"train": str(train_data)}
    if eval_data:
        data_files["eval"] = str(eval_data)
    if test_data:
        data_files["test"] = str(test_data)
    if file_type == "csv":
        kwargs.update({"delimiter": delimiter})
    datasets = load_dataset(file_type, data_files=data_files, **kwargs)
    # Tokenize the sequence column in parallel across `n_workers` processes
    preprocess_fn = make_preprocess_function(tokenizer, seq_key=seq_key)
    datasets = datasets.map(preprocess_fn, batched=True, num_proc=n_workers)
    if filter_empty:
        datasets = datasets.filter(filter_empty_sequence)
    if shuffle:
        seed = config.settings["random_seed"]
        datasets = datasets.shuffle(seeds={"train": seed, "eval": seed, "test": seed})
    return datasets


def make_preprocess_function(
    tokenizer: PreTrainedTokenizer, seq_key: str = "sequence"
) -> Callable:
    """Make a preprocessing function that selects the appropriate column and
    tokenizes it.

    Args:
        tokenizer (PreTrainedTokenizer): tokenizer to apply to each sequence
        seq_key (str, optional): column name of the text data. Defaults to 'sequence'.

    Returns:
        Callable: preprocessing function that tokenizes a batch of examples
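
    Example (illustrative; the tokenizer checkpoint and sequences are placeholders):

        >>> from transformers import AutoTokenizer
        >>> tok = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> fn = make_preprocess_function(tok, seq_key="sequence")
        >>> out = fn({"sequence": ["ACGT", "GGTA"]})
        >>> "input_ids" in out
        True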
    """

    def preprocess_function(examples):
        if seq_key:
            seqs = examples[seq_key]
        else:
            seqs = examples
        return tokenizer(
            seqs,
            max_length=tokenizer.model_max_length,
            truncation=True,
            padding="max_length",
        )

    return preprocess_function


def filter_empty_sequence(example: dict) -> bool:
    """Filter out empty sequences."""
    # sum(example["attention_mask"]) counts the non-padding tokens, including the
    # special start and end tokens, so a value > 2 means at least one real token.
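    # Illustrative (hypothetical values): {"attention_mask": [1, 1, 0, 0]} covers
    # only the two special tokens and is dropped; {"attention_mask": [1, 1, 1, 0]}
    # retains one real token and is kept.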
    return sum(example["attention_mask"]) > 2


def load_data_collator(
    model_type: str,
    tokenizer: Optional[PreTrainedTokenizer] = None,
    mlm_prob: Optional[float] = None,
):
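    """Return the data collator appropriate for `model_type`.

    For `model_type == "language-model"`, a masked-language-modeling collator is
    built from `tokenizer` and `mlm_prob`; any other model type falls back to
    `default_data_collator`.

    Example (illustrative; the tokenizer checkpoint is a placeholder):

        >>> from transformers import AutoTokenizer
        >>> tok = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> collator = load_data_collator("language-model", tok, mlm_prob=0.15)
    """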
    if model_type == "language-model":
        assert (
            tokenizer is not None
        ), "tokenizer must not be None if model is type language-model"
        assert (
            mlm_prob is not None
        ), "mlm_prob must not be None if model is type language-model"

        return DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob
        )
    else:
        return default_data_collator