File size: 2,053 Bytes
e7d695a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import json
import numpy as np
import encoder
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import numpy
import io
import sys
import threading
import math
import random
import json
import collections
from collections import Counter
from collections import OrderedDict
from progress.bar import Bar as Bar
# Command-line interface for the script: three path options (input data,
# BPE vocabulary, output file) and two switches that enable appending the
# BOS / EOS token ids during encoding.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input',
    type=str,
    default=None,
    help='ft input file',
)
parser.add_argument(
    '--vocab',
    type=str,
    default=None,
    help='vocab path',
)
parser.add_argument(
    '--output',
    type=str,
    default=None,
    help='ft output file',
)
parser.add_argument(
    '--add_bos',
    action='store_true',
    help='',
)
parser.add_argument(
    '--add_eos',
    action='store_true',
    help='',
)
# Parsed at module level (not inside the __main__ guard), so `args` is
# available as a module global to the code that follows.
args = parser.parse_args()
def encode_example(enc, context, completion, add_bos=False, add_eos=False,
                   bos=50256, eos=50256):
    """Encode one context/completion pair into lists of BPE token ids.

    Args:
        enc: Encoder object whose ``encode(text)`` returns a 2-tuple with the
            token ids as first element (the second element is ignored).
        context: Prompt text; encoded as-is.
        completion: Target text; a single leading space is prepended before
            encoding.
        add_bos: When true, ``bos`` is appended to the END of the context ids.
        add_eos: When true, ``eos`` is appended to the end of the completion
            ids.
        bos: Token id used for BOS. NOTE(review): 50256 is presumably GPT-2's
            ``<|endoftext|>`` id -- confirm against the vocab in use.
        eos: Token id used for EOS (same default value as ``bos``).

    Returns:
        A dict ``{'context': [...], 'completion': [...]}`` of token-id lists.
    """
    context_bpes, _ = enc.encode(context)
    completion_bpes, _ = enc.encode(' ' + completion)

    # Work on copies so the lists handed back by the encoder are never
    # mutated in place.
    context_bpes = list(context_bpes)
    completion_bpes = list(completion_bpes)

    if add_bos:
        context_bpes.append(bos)
    if add_eos:
        completion_bpes.append(eos)

    return {'context': context_bpes, 'completion': completion_bpes}


if __name__ == "__main__":
    enc = encoder.get_encoder(args.vocab)

    # Both files are managed by the same `with`, so the output is flushed and
    # closed even if a record fails to parse or encode. UTF-8 is requested
    # explicitly so decoding does not depend on the platform locale.
    with open(args.input, 'r', encoding='utf-8') as reader, \
            open(args.output, 'w', encoding='utf-8') as writer:
        for line in reader:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of crashing in json.loads.
                continue

            items = json.loads(line)
            ft_json = encode_example(
                enc,
                items['context'],
                items['completion'],
                add_bos=args.add_bos,
                add_eos=args.add_eos,
            )
            # One JSON object per output line.
            writer.write(json.dumps(ft_json) + '\n')
|