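"""Pre-tokenize a code dataset and push the result to the Hugging Face Hub.

Loads the dataset named on the command line, tokenizes every example in
parallel, and uploads the tokenized copy so downstream training jobs can
skip the tokenization step.
"""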
import multiprocessing
import time

from arguments import PretokenizationArguments
from datasets import load_dataset

from transformers import AutoTokenizer, HfArgumentParser


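# Map function: tokenize one example and record its characters-per-token
# ratio, a rough measure of how densely the tokenizer encodes the code.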
def tokenize(example):
    output = {}
    output["input_ids"] = tokenizer(example["content"], truncation=False)["input_ids"]
    output["ratio_char_token"] = len(example["content"]) / len(output["input_ids"])
    return output


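# Parse the pretokenization arguments; if no worker count is given, fall
# back to all available CPU cores.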
parser = HfArgumentParser(PretokenizationArguments)
args = parser.parse_args()
if args.num_workers is None:
    args.num_workers = multiprocessing.cpu_count()
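# Load the tokenizer that will be applied to every example.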
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)

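# Download the raw dataset; only the train split is used.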
t_start = time.time()
ds = load_dataset(args.dataset_name, split="train")
print(f"Dataset loaded in {time.time()-t_start:.2f}s")

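# Tokenize in parallel and drop all raw columns so that only `input_ids`
# and `ratio_char_token` remain in the resulting dataset.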
t_start = time.time()
ds = ds.map(
    tokenize,
    num_proc=args.num_workers,
    remove_columns=[
        "repo_name",
        "path",
        "copies",
        "size",
        "content",
        "license",
        "hash",
        "line_mean",
        "line_max",
        "alpha_frac",
        "autogenerated",
    ],
)
print(f"Dataset tokenized in {time.time()-t_start:.2f}s")

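# Push the tokenized dataset to the Hugging Face Hub.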
t_start = time.time()
ds.push_to_hub(args.tokenized_data_repo)
print(f"Data pushed to the hub in {time.time()-t_start:.2f}s")