File size: 993 Bytes
0e5da39 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
#!/usr/bin/perl
use warnings;
use strict;
use utf8;
use open qw(:std :utf8);
use File::Copy;
# Write sentences to a file, replacing any previous content of that file.
#
# Parameters:
#   $target_file - path of the file to create (or truncate if it exists)
#   @sentences   - strings to write; each one is printed followed by a single
#                  "\n", so a sentence that already ends in a newline is
#                  followed by an empty line in the output
#
# Dies if the file cannot be opened, written to, or closed.
sub write_sentences {
    my ($target_file, @sentences) = @_;

    open(my $f, ">", $target_file) or die "Cannot open file $target_file: $!";
    foreach my $sentence (@sentences) {
        print {$f} "$sentence\n" or die "Cannot write to file $target_file: $!";
    }

    # Output is buffered, so a failed write (e.g. disk full) may only be
    # reported when the handle is closed; ignoring it would leave a silently
    # truncated file behind.
    close($f) or die "Cannot close file $target_file: $!";
}
# Command line: the output directory and the language prefix used in the
# output file names. The training data itself is read from standard input.
@ARGV >= 2 or die "Usage: $0 target_directory lang <train_file\n";
my $target = shift @ARGV;
my $lang = shift @ARGV;

# Load training sentences. Lines are accumulated (each with its newline
# restored) until an empty line, which terminates the current sentence.
my @sentences;
my $sentence = "";
while (my $line = <STDIN>) {
    chomp $line;
    if ($line eq "") {
        # An empty line with nothing accumulated (a leading blank line or
        # several blank lines in a row) does not end any sentence. Skip it
        # instead of recording an empty sentence, which would add stray blank
        # lines to the output and skew the train/dev split.
        next if $sentence eq "";
        push @sentences, $sentence;
        $sentence = "";
    } else {
        $sentence .= "$line\n";
    }
}
# Input must end with the empty line that terminates the last sentence.
die "Unfinished sentence" if $sentence;

# Generate everything from train_file: the first 90% of the sentences
# (rounded down) become the training set, the remainder the dev set.
my $train_end = int(@sentences * 90 / 100);
die "Zero sentences for dev_file" if $train_end == int(@sentences);
# Rounding down leaves the training set empty when there is only a single
# sentence; refuse that rather than silently writing an empty training file.
die "Zero sentences for train_file" if $train_end == 0;

write_sentences("$target/$lang-ud-train.conllu", @sentences[0..$train_end-1]);
write_sentences("$target/$lang-ud-dev.conllu", @sentences[$train_end..$#sentences]);
|