Siddharth63 committed
Commit 1a7fe47 · 1 Parent(s): 0786a9d

Upload 3 files

Files changed (3)
  1. spiece.model +3 -0
  2. spiece.vocab +0 -0
  3. tokenizer.py +138 -0
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c489236e2ac4df783bdb4fc930323620027ee0279d2665d263cd74385d899425
+ size 802920
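The model file itself is stored in Git LFS; the pointer above records only its sha256 and size. A minimal sketch for checking a local copy against that pointer, assuming spiece.model has already been pulled (e.g. with `git lfs pull`) into the working directory:

import hashlib
import os

# Hypothetical local path to the downloaded LFS object.
path = "spiece.model"

with open(path, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

# Values taken from the LFS pointer in this commit.
assert os.path.getsize(path) == 802920, "size does not match the LFS pointer"
assert digest == "c489236e2ac4df783bdb4fc930323620027ee0279d2665d263cd74385d899425", "sha256 does not match the LFS pointer"
print("spiece.model matches the pointer")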
spiece.vocab ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.py ADDED
@@ -0,0 +1,138 @@
+ # %pip install sentencepiece
+ # %pip install datasets
+
+ import unicodedata
+ import os
+ import nltk
+ import glob
+ import datasets
+ from tqdm import tqdm
+ from random import sample
+
+ def sample_and_make_tempfile(sentences_dir, num_files):
+     """Sample num_files of the sentence-per-line files in sentences_dir
+     and combine them into a single temp file."""
+
+     sentence_files = glob.glob(sentences_dir + "/*.txt")
+
+     # sample num_files
+     sampled_files = sample(sentence_files, num_files)
+
+     print("sampled files:")
+     print(sampled_files)
+
+     # read all the lines from the sampled files into a list
+     all_lines = []
+     for filename in sampled_files:
+         with open(filename) as f:
+             lines = f.read().splitlines()
+
+         all_lines.extend(lines)
+
+     print("number of lines sampled:", len(all_lines))
+
+     # combine into a single file and save
+     tempfile_path = os.path.join("text", "temp.txt")
+     with open(tempfile_path, "w") as f:
+
+         for sentence in tqdm(all_lines):
+
+             # remove surrounding whitespace and newlines
+             line = sentence.strip()
+
+             # do not save empty lines
+             if line != "":
+
+                 f.write(line + '\n')
+
+     print("Wrote to ", tempfile_path)
+     return tempfile_path
+
+
+ def chunks(sentences, n, tot_len):
+     """Yield successive n-sized chunks from sentences."""
+     for i in range(0, tot_len, n):
+         end_i = min(len(sentences), i + n)
+         yield sentences[i:end_i]["text"]
+
+
+
+ def make_sentence_files(dataset, chunksize=5600000, data_dir='text/sentences'):
+     """
+     Write sentence-per-line files, chunksize sentences per file."""
+
+     # make sure the data dir exists
+     if not os.path.exists(data_dir):
+         os.makedirs(data_dir)
+
+     # use a simple regex for sentence tokenizing
+     sent_detector = nltk.RegexpTokenizer(u'[^ !?。]*[!?。.\n]')
+
+     # loop over the chunks
+     for chunk_ind, sentence_chunk in enumerate(chunks(dataset, chunksize, len(dataset))):
+
+         # new file for each chunk
+         filename = "sent_{}.txt".format(chunk_ind)
+         filepath = os.path.join(data_dir, filename)
+
+         print("writing to ", filepath)
+
+         with open(filepath, "w") as f:
+
+             for sentence in tqdm(sentence_chunk):
+
+                 # remove surrounding whitespace and newlines
+                 line = sentence.strip()
+
+                 # unicode-normalize (e.g. Japanese spaces)
+                 line = unicodedata.normalize('NFKC', line)
+
+                 # tokenize into sentences
+                 sentences = sent_detector.tokenize(line)
+
+                 # do not save empty items
+                 if sentences != []:
+
+                     f.writelines(s + '\n' for s in sentences)
+
+
+ def combine_files(output_file, *files):
+     """
+     Combine the contents of multiple text files into a single file.
+
+     :param output_file: Path to the output file.
+     :param files: Paths to the files to be combined.
+     :return: Total number of lines in the combined file.
+     """
+     total_lines = 0
+
+     with open(output_file, 'w') as outfile:
+         for file in files:
+             with open(file, 'r') as infile:
+                 lines = infile.readlines()
+                 total_lines += len(lines)
+                 outfile.writelines(lines)
+                 # Add a newline for separation (optional)
+                 outfile.write('\n')
+
+
+     return total_lines
+
+ # make sentence files from the Hugging Face dataset
+ dataset_bio = datasets.load_dataset("Siddharth63/biological_dataset")
+ make_sentence_files(dataset_bio["train"])
+
+ # combine the first two sentence files (~45 million sentences)
+ output_file_path = "text/final_file.txt"
+ files_to_combine = glob.glob("text/sentences/*.txt")
+ files_to_combine = files_to_combine[:2]
+ total_lines = combine_files(output_file_path, *files_to_combine)
+
+ # Train the SentencePiece tokenizer on the combined sentences
+ import sentencepiece as spm
+
+ spm.SentencePieceTrainer.train(input="text/final_file.txt", model_prefix='spiece', vocab_size=32000, character_coverage=1.0,
+                                pad_id=0, unk_id=2, eos_id=1, bos_id=-1,
+                                user_defined_symbols=['[NLU]', '[NLG]', '[S2S]'],
+                                train_extremely_large_corpus=True,
+                                num_threads=90, input_sentence_size=45000000, shuffle_input_sentence=True)
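Once training finishes, the resulting spiece.model (the file uploaded in this commit) can be sanity-checked with the SentencePiece Python API. A minimal sketch, assuming the model sits in the working directory; the example sentence is only illustrative, and the printed ids should reflect the pad/eos/unk settings and the [NLU]/[NLG]/[S2S] user-defined symbols passed to the trainer:

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="spiece.model")

print(sp.vocab_size())                         # expected: 32000
print(sp.pad_id(), sp.eos_id(), sp.unk_id())   # expected: 0 1 2 (bos is disabled)
print(sp.piece_to_id("[NLU]"), sp.piece_to_id("[NLG]"), sp.piece_to_id("[S2S]"))

# encode a sample sentence into subword pieces
print(sp.encode("Protein kinase inhibitors block cell signalling.", out_type=str))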