File size: 1,367 Bytes
4f09c24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from combo.predict import COMBO
from allennlp.data import tokenizers
from argparse import ArgumentParser

# Command-line interface: which pretrained parser to load and which file to parse.
parser = ArgumentParser()
parser.add_argument('--parser')
parser.add_argument('--infile')
parser.add_argument('--pretokenized', action='store_true')
args = parser.parse_args()

# If your data is pre-tokenized, you can add the --pretokenized flag
# If you have a GPU available, you can add cuda_device=<your-device> to COMBO.from_pretrained
# The parser expects input in the same format as test_file.txt, i.e. one sentence per line

if args.pretokenized:
    # The sentence splitter is only needed (and only importable) in the
    # pre-tokenized path; read_test_file() below relies on this module-level import.
    from Tokenizer.src.tokenizer import split_into_sentences
    # Pre-tokenized input: split on whitespace only, no further tokenization.
    # NOTE(review): this branch ignores --parser and hard-codes the
    # 'combo-is-combined-v211' model — confirm this is intentional.
    nlp = COMBO.from_pretrained('combo-is-combined-v211', tokenizer=tokenizers.SpacyTokenizer(split_on_spaces=True))
else:
    nlp = COMBO.from_pretrained(args.parser)

def read_test_file(file):
    """Yield one input sentence per line of *file*.

    In --pretokenized mode each line is run through split_into_sentences
    and re-joined on single spaces; otherwise the line is yielded with
    trailing whitespace stripped.
    """
    with open(file, 'r', encoding='utf-8') as infile:
        for raw_line in infile:
            yield (' '.join(split_into_sentences(raw_line))
                   if args.pretokenized
                   else raw_line.rstrip())

test_file = read_test_file(args.infile)

# Parse each sentence and print it in CoNLL-U-style columns:
# one token per line, 10 tab-separated fields, blank line between sentences.
# (The original bound an unused `index` via enumerate(..., 1); the printed
# index comes from token.id, so the counter is dropped. Also normalizes the
# inconsistent 3-space indent on the print line.)
for sent in test_file:
    sentence = nlp(sent)
    for token in sentence.tokens:
        print(f'{token.id}\t{token.token}\t{token.lemma}\t{token.upostag}\t{token.xpostag}\t{token.feats}\t{token.head}\t{token.deprel}\t{token.deps}\t{token.misc}')
    print()