# Listing metadata left over from the hosting page: file size 9,748 bytes; id cc0b62b; the original listing had 272 lines.
# -*- coding: utf-8 -*-
from __future__ import print_function, division
import glob
import json
import uuid
from copy import deepcopy
from collections import defaultdict, OrderedDict
import numpy as np
from torchmoji.filter_utils import is_special_token
from torchmoji.word_generator import WordGenerator
from torchmoji.global_variables import SPECIAL_TOKENS, VOCAB_PATH
class VocabBuilder():
    """ Create vocabulary with words extracted from sentences as fed from a
        word generator.
    """
    def __init__(self, word_gen):
        """ Sets up the word counts and registers the special tokens.

        # Arguments:
            word_gen: Source of sentences. count_all_words() iterates it and
                unpacks every item as a (words, _) pair.
        """
        # initialize any new key with value of 0
        self.word_counts = defaultdict(lambda: 0, {})
        # Longest word (in characters) that is counted. The same number is
        # used as the width of the string field when the counts are saved.
        self.word_length_limit = 30

        for token in SPECIAL_TOKENS:
            assert len(token) < self.word_length_limit
            self.word_counts[token] = 0
        self.word_gen = word_gen

    def count_words_in_sentence(self, words):
        """ Generates word counts for all tokens in the given sentence.

        Empty tokens and tokens longer than word_length_limit are ignored.

        # Arguments:
            words: Tokenized sentence whose words should be counted.
        """
        for word in words:
            if 0 < len(word) <= self.word_length_limit:
                # .get() keeps this working for any mapping, not only for
                # the defaultdict created in __init__
                self.word_counts[word] = self.word_counts.get(word, 0) + 1

    def save_vocab(self, path=None):
        """ Saves the vocabulary into a file.

        The counts are stored as a structured numpy array under the key
        'data', ordered from the most to the least frequent word.

        # Arguments:
            path: Where the vocabulary should be saved. If not specified, a
                  randomly generated filename is used instead.
        """
        dtype = ([('word', '|S{}'.format(self.word_length_limit)), ('count', 'int')])
        # items() has to be materialized: on Python 3 it is a view, which
        # numpy cannot turn into a structured array
        np_dict = np.array(list(self.word_counts.items()), dtype=dtype)

        # sort from highest to lowest frequency
        data = np.sort(np_dict, order='count')[::-1]

        if path is None:
            path = str(uuid.uuid4())

        np.savez_compressed(path, data=data)
        print("Saved dict to {}".format(path))

    def get_next_word(self):
        """ Returns next tokenized sentence from the word generator.

        A fresh iterator is requested on every call, so for a source that
        restarts on iteration (e.g. a list) this is always its first item.

        # Returns:
            The first item produced by iterating the word generator.
        """
        # next(iter(...)) works on Python 2 and 3, unlike the .next() method
        return next(iter(self.word_gen))

    def count_all_words(self):
        """ Generates word counts for all words in all sentences of the word
            generator.
        """
        for words, _ in self.word_gen:
            self.count_words_in_sentence(words)
class MasterVocab():
    """ Combines vocabularies.
    """
    def __init__(self):
        # word -> accumulated (normalized) count over all vocabularies
        self.master_vocab = {}

    def populate_master_vocab(self, vocab_path, min_words=1, force_appearance=None):
        """ Populates the master vocabulary using all vocabularies found in the
            given path. Vocabularies should be named *.npz. Expects the
            vocabularies to be numpy arrays with counts. Normalizes the counts
            and combines them.

        # Arguments:
            vocab_path: Path containing vocabularies to be combined. It is
                used as a plain prefix for the '*.npz' pattern, so a directory
                needs its trailing separator.
            min_words: Minimum amount of occurences a word must have in order
                to be included in the master vocabulary.
            force_appearance: Optional substring identifying one vocabulary
                file in vocab_path (the first matching path is used). When
                given, only words that also occur in that vocabulary are
                merged into the master vocabulary.

        # Raises:
            ValueError: If no *.npz file matches vocab_path.
            IndexError: If force_appearance matches none of the files.
        """
        paths = glob.glob(vocab_path + '*.npz')
        if not paths:
            # max() below would fail with the same exception type, but with
            # a message that says nothing about the actual problem
            raise ValueError(
                'No vocabularies (*.npz) found at {}'.format(vocab_path))

        sizes = {path: 0 for path in paths}
        dicts = {path: {} for path in paths}

        # set up and get sizes of individual dictionaries
        for path in paths:
            np_data = np.load(path)['data']

            for entry in np_data:
                word, count = entry
                if not isinstance(word, str):
                    # byte-string fields come back as bytes on Python 3
                    word = word.decode('utf-8')
                if count < min_words:
                    continue
                if is_special_token(word):
                    continue
                dicts[path][word] = count

            sizes[path] = sum(dicts[path].values())
            print('Overall word count for {} -> {}'.format(path, sizes[path]))
            print('Overall word number for {} -> {}'.format(path, len(dicts[path])))

        vocab_of_max_size = max(sizes, key=sizes.get)
        max_size = sizes[vocab_of_max_size]
        print('Min: {}, {}, {}'.format(sizes, vocab_of_max_size, max_size))

        # can force one vocabulary to always be present
        if force_appearance is not None:
            force_appearance_path = [p for p in paths if force_appearance in p][0]
            force_appearance_vocab = deepcopy(dicts[force_appearance_path])
        else:
            force_appearance_path, force_appearance_vocab = None, None

        # normalize word counts before inserting into master dict
        for path in paths:
            if not sizes[path]:
                # nothing survived the filters for this file; there is
                # nothing to merge and the factor would divide by zero
                continue
            normalization_factor = max_size / sizes[path]
            print('Norm factor for path {} -> {}'.format(path, normalization_factor))

            for word in dicts[path]:
                if is_special_token(word):
                    print("SPECIAL - ", word)
                    continue
                normalized_count = dicts[path][word] * normalization_factor

                # can force one vocabulary to always be present
                # NOTE(review): skipping words that are missing from the
                # forced vocabulary is inferred from the lookup guarded by
                # `except KeyError` -- confirm this is the intended filter.
                if (force_appearance_vocab is not None
                        and word not in force_appearance_vocab):
                    continue

                if word in self.master_vocab:
                    self.master_vocab[word] += normalized_count
                else:
                    self.master_vocab[word] = normalized_count

        print('Size of master_dict {}'.format(len(self.master_vocab)))
        print("Hashes for master dict: {}".format(
            len([w for w in self.master_vocab if w.startswith('#')])))

    def save_vocab(self, path_count, path_vocab, word_limit=100000):
        """ Saves the master vocabulary into a file.

        # Arguments:
            path_count: Where the (word, count) array is written, as a
                compressed npz file under the key 'counts'.
            path_vocab: Where the word -> index mapping is written as JSON.
            word_limit: Maximum number of entries written to either file,
                special tokens included.
        """
        # the special tokens take the first indices
        words = OrderedDict()
        for token in SPECIAL_TOKENS:
            # store -1 instead of np.inf, which can overflow
            words[token] = -1

        # sort words by frequency
        desc_order = sorted(self.master_vocab.items(),
                            key=lambda kv: kv[1], reverse=True)
        words.update(desc_order)

        # use encoding of up to 30 characters (no token conversions)
        # use float to store large numbers (we don't care about precision loss)
        np_vocab = np.array(list(words.items()),
                            dtype=([('word', '|S30'), ('count', 'float')]))

        # output count for debugging
        counts = np_vocab[:word_limit]
        np.savez_compressed(path_count, counts=counts)

        # output the index of each word for easy lookup
        final_words = OrderedDict()
        for i, w in enumerate(list(words)[:word_limit]):
            final_words[w] = i
        with open(path_vocab, 'w') as f:
            f.write(json.dumps(final_words, indent=4, separators=(',', ': ')))
def all_words_in_sentences(sentences):
    """ Extracts all unique words from a given list of sentences.

    # Arguments:
        sentences: List or word generator of sentences to be processed. Items
            of a WordGenerator are unpacked as (sentence, _) pairs and only
            the sentence is used.

    # Returns:
        List of all unique words contained in the given sentences, in order
        of first appearance.
    """
    vocab = []
    # membership is checked against a set so the scan stays linear; the list
    # only records the order of first appearance
    seen = set()
    if isinstance(sentences, WordGenerator):
        sentences = (s for s, _ in sentences)

    for sentence in sentences:
        for word in sentence:
            if word not in seen:
                seen.add(word)
                vocab.append(word)

    return vocab
def extend_vocab_in_file(vocab, max_tokens=10000, vocab_path=VOCAB_PATH):
    """ Extends JSON-formatted vocabulary with words from vocab that are not
        present in the current vocabulary. Adds up to max_tokens words.
        Overwrites file in vocab_path.

    If the file cannot be opened, a message is printed and nothing is written.

    # Arguments:
        vocab: Vocabulary to be added. MUST have word_counts populated, i.e.
            must have run count_all_words() previously.
        max_tokens: Maximum number of words to be added.
        vocab_path: Path to the vocabulary json which is to be extended.
    """
    try:
        with open(vocab_path, 'r') as f:
            current_vocab = json.load(f)
    except IOError:
        print('Vocabulary file not found, expected at ' + vocab_path)
        # without a loaded vocabulary there is nothing to extend or save
        return

    extend_vocab(current_vocab, vocab, max_tokens)

    # Save back to file
    with open(vocab_path, 'w') as f:
        json.dump(current_vocab, f, sort_keys=True, indent=4, separators=(',',': '))
def extend_vocab(current_vocab, new_vocab, max_tokens=10000):
    """ Extends current vocabulary with words from vocab that are not
        present in the current vocabulary. Adds up to max_tokens words.

    Candidates are tried from the most to the least frequent. Every added
    word gets the next free index after the existing entries. current_vocab
    is modified in place.

    # Arguments:
        current_vocab: Current dictionary of tokens.
        new_vocab: Vocabulary to be added. MUST have word_counts populated, i.e.
            must have run count_all_words() previously.
        max_tokens: Maximum number of words to be added. A negative value
            selects the default of 10000.

    # Returns:
        How many new tokens have been added.
    """
    if max_tokens < 0:
        max_tokens = 10000

    # sort words by frequency
    desc_order = sorted(new_vocab.word_counts.items(),
                        key=lambda kv: kv[1], reverse=True)

    # taken once up front: new entries are numbered from here
    base_index = len(current_vocab)
    added = 0
    for word, _ in desc_order:
        if added >= max_tokens:
            break
        if word not in current_vocab:
            current_vocab[word] = base_index + added
            added += 1

    return added