import numpy as np
import os, sys
import math
import torch
from torch import nn, optim
import subprocess
from tqdm import tqdm, trange
from torch.utils.data import DataLoader, Dataset, Sampler, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence
import json
import pdb
import torch.nn.init as init
import glob
import logging
import pickle
import random
from torch.utils.data.distributed import DistributedSampler

logger = logging.getLogger(__name__)
class Meter(object):
    '''Meters provide a way to keep track of important statistics in an online manner.
    This class is abstract, but provides a standard interface for all meters to follow.
    '''
    def reset(self):
        '''Resets the meter to default settings.'''
        pass

    def add(self, value):
        '''Log a new value to the meter.
        Args:
            value: Next result to include.
        '''
        pass

    def value(self):
        '''Get the value of the meter in the current state.'''
        pass
class AverageValueMeter(Meter):
    def __init__(self):
        super(AverageValueMeter, self).__init__()
        self.reset()
        self.val = 0

    def add(self, value, n=1):
        self.val = value
        self.sum += value
        self.var += value * value
        self.n += n

        if self.n == 0:
            self.mean, self.std = np.nan, np.nan
        elif self.n == 1:
            self.mean = 0.0 + self.sum  # This is to force a copy in torch/numpy
            self.std = np.inf
            self.mean_old = self.mean
            self.m_s = 0.0
        else:
            self.mean = self.mean_old + (value - n * self.mean_old) / float(self.n)
            self.m_s += (value - self.mean_old) * (value - self.mean)
            self.mean_old = self.mean
            self.std = np.sqrt(self.m_s / (self.n - 1.0))

    def value(self):
        return self.mean, self.std

    def reset(self):
        self.n = 0
        self.sum = 0.0
        self.var = 0.0
        self.val = 0.0
        self.mean = np.nan
        self.mean_old = 0.0
        self.m_s = 0.0
        self.std = np.nan
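
# Illustrative usage sketch (not part of the original module): AverageValueMeter keeps a
# running mean/std with incremental updates, so statistics can be read at any point of a
# training loop without storing the full history of values.
def _demo_average_value_meter():
    meter = AverageValueMeter()
    for loss in [0.9, 0.7, 0.65, 0.6]:
        meter.add(loss)
    mean, std = meter.value()
    print("running mean=%.3f, std=%.3f" % (mean, std))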
class BucketSampler(Sampler):
    def __init__(self, lens, bucket_size, batch_size, droplast=False, shuffle=True):
        self._lens = lens
        self._batch_size = batch_size
        self._bucket_size = bucket_size
        self._droplast = droplast
        self._shuf = shuffle

    def __iter__(self):
        ids = list(range(len(self._lens)))
        if self._shuf:
            random.shuffle(ids)
        buckets = [sorted(ids[i:i+self._bucket_size],
                          key=lambda i: self._lens[i], reverse=True)
                   for i in range(0, len(ids), self._bucket_size)]
        # buckets = [ids[i:i+self._bucket_size] for i in range(0, len(ids), self._bucket_size)]
        batches = [bucket[i:i+self._batch_size]
                   for bucket in buckets
                   for i in range(0, len(bucket), self._batch_size)]
        if self._droplast:
            batches = [batch for batch in batches
                       if len(batch) == self._batch_size]
        if self._shuf:
            random.shuffle(batches)
        return iter(batches)

    def __len__(self):
        bucket_sizes = ([self._bucket_size]
                        * (len(self._lens) // self._bucket_size)
                        + [len(self._lens) % self._bucket_size])
        if self._droplast:
            return sum(s//self._batch_size for s in bucket_sizes)
        else:
            return sum(math.ceil(s/self._batch_size) for s in bucket_sizes)
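
# Illustrative usage sketch (not part of the original module). BucketSampler yields batches of
# example indices grouped by similar length, so it is meant to be handed to DataLoader via the
# batch_sampler argument (as the *_DataLoader classes below do), not via sampler.
def _demo_bucket_sampler(example_lengths, batch_size=4):
    sampler = BucketSampler(example_lengths, bucket_size=10 * batch_size,
                            batch_size=batch_size, droplast=True, shuffle=True)
    for index_batch in sampler:
        # indices of similar-length examples, ready for DataLoader(dataset, batch_sampler=sampler, ...)
        print(index_batch)
        break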
class FeatureDataset(Dataset):
    def __init__(self, features, max_len=None):
        self.features = features
        self.max_len = max_len  # this max_len does truncate

    def __getitem__(self, i):
        feat_dict = self.features[i]
        # NOTE: InputFeatures is expected to be defined/imported elsewhere in the project;
        # it is not defined in this file.
        feat = InputFeatures(**feat_dict)
        return feat

    def __len__(self):
        return len(self.features)
def collate(features):
    input_ids_bert = pad_sequence([torch.tensor(f.input_ids_bert, dtype=torch.long) for f in features], batch_first=True, padding_value=0)
    input_ids_gpt = pad_sequence([torch.tensor(f.input_ids_gpt, dtype=torch.long) for f in features], batch_first=True, padding_value=0)
    lm_labels = pad_sequence([torch.tensor(f.input_ids_gpt, dtype=torch.long) for f in features], batch_first=True, padding_value=-1)
    return (input_ids_bert, input_ids_gpt, lm_labels)
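
# Illustrative sketch of the padding contract in collate() above (the SimpleNamespace stand-in
# is hypothetical; the real code uses InputFeatures objects). BERT and GPT-2 ids are right-padded
# with 0, while LM labels are padded with -1 so padded positions can be ignored by the loss.
def _demo_collate():
    from types import SimpleNamespace
    feats = [SimpleNamespace(input_ids_bert=[101, 7592, 102], input_ids_gpt=[50256, 31373, 50256]),
             SimpleNamespace(input_ids_bert=[101, 102], input_ids_gpt=[50256, 50256])]
    bert_ids, gpt_ids, lm_labels = collate(feats)
    print(bert_ids.shape, gpt_ids.shape, lm_labels[1])  # second row of labels ends in -1 padding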
class BucketingDataLoader(object):
    def __init__(self, file_path, batch_size, max_seq_length, tokenizer, args, bucket=100, shuffle=True):
        self.dataset = TokenDataset(tokenizer, args, file_path, block_size=args.block_size)
        self.batch_size = batch_size
        self.max_len = max_seq_length
        self.bucket_size = bucket * batch_size
        self.shuffle = shuffle
        self.num_examples = len(self.dataset.examples)
        self.num_batches = self.num_examples//batch_size
        self.example_lengths = [example['bert_token_length'] for example in self.dataset.examples]

    def __iter__(self):
        sampler = BucketSampler(self.example_lengths, self.bucket_size, self.batch_size, droplast=True, shuffle=self.shuffle)
        loader = DataLoader(self.dataset, batch_sampler=sampler, num_workers=0, collate_fn=TokenDataset.collate)
        yield from loader

    def __len__(self):
        return self.num_batches

    def __del__(self):
        pass
class Dialog_BucketingDataLoader(object):
    def __init__(self, file_path, batch_size, max_seq_length, tokenizer, args, bucket=100, shuffle=True):
        self.dataset = Dialog_TokenDataset(tokenizer, args, file_path, block_size=args.block_size)
        self.batch_size = batch_size
        self.max_len = max_seq_length
        self.bucket_size = bucket * batch_size
        self.shuffle = shuffle
        self.num_examples = len(self.dataset.examples)
        self.num_batches = self.num_examples//batch_size
        self.example_lengths = [example['bert_token_length'] for example in self.dataset.examples]

    def __iter__(self):
        sampler = BucketSampler(self.example_lengths, self.bucket_size, self.batch_size, droplast=True, shuffle=self.shuffle)
        loader = DataLoader(self.dataset, batch_sampler=sampler, num_workers=0, collate_fn=Dialog_TokenDataset.collate)
        yield from loader

    def __len__(self):
        return self.num_batches

    def __del__(self):
        pass
class MultipleFiles_DataLoader(object):
    def __init__(self, file_path, batch_size, max_seq_length, tokenizer, args, bucket=100, shuffle=True, use_tensor=True):
        self.batch_size = batch_size
        self.max_len = max_seq_length
        self.bucket_size = bucket * batch_size
        self.shuffle = shuffle
        self.file_path = file_path
        self.tokenizer = tokenizer
        self.args = args
        self.use_tensor = use_tensor

        # prepare for the first file
        self.file_idx = 0
        self.cached_features_file = os.path.join(self.file_path, args.dataset.lower()+f'.segmented.nltk.split.seq64.{self.file_idx}.json')
        self.dataset = PreparedTokenDataset(tokenizer, self.args, self.cached_features_file, block_size=self.args.block_size)
        self.num_examples = len(self.dataset.examples)
        self.num_batches = self.num_examples//batch_size
        self.example_lengths = [example['bert_token_length'] for example in self.dataset.examples]

    def __iter__(self):
        sampler = BucketSampler(self.example_lengths, self.bucket_size, self.batch_size, droplast=True, shuffle=self.shuffle)
        loader = DataLoader(self.dataset, batch_sampler=sampler, num_workers=0, collate_fn=PreparedTokenDataset.collate if self.use_tensor else PreparedTokenDataset.get_examples)
        yield from loader

        # update file name for the next file
        self.file_idx += 1
        self.cached_features_file = os.path.join(self.file_path, self.args.dataset.lower()+f'.segmented.nltk.split.seq64.{self.file_idx}.json')
        self.dataset = PreparedTokenDataset(self.tokenizer, self.args, self.cached_features_file, block_size=self.args.block_size)
        self.num_examples = len(self.dataset.examples)
        self.num_batches = self.num_examples//self.batch_size
        self.example_lengths = [example['bert_token_length'] for example in self.dataset.examples]

    def __len__(self):
        return self.num_batches

    def __del__(self):
        pass

    def reset(self):
        self.file_idx = 0
# When the dataset is too big, we can divide it into multiple small files.
# This class is used to load multiple files.
class BucketingMultipleFiles_DataLoader(object):
    def __init__(self, file_path, batch_size, max_seq_length, tokenizer, args, bucket=100, shuffle=True):
        self.batch_size = batch_size
        self.max_len = max_seq_length
        self.bucket_size = bucket * batch_size
        self.shuffle = shuffle
        self.file_path = file_path
        self.tokenizer = tokenizer
        self.args = args

        # prepare for the first file
        self.file_idx = 0
        self.cached_features_file = os.path.join(self.file_path, args.dataset.lower()+f'.segmented.nltk.split.seq64.{self.file_idx}.json')
        self.dataset = PreparedTokenDataset(tokenizer, self.args, self.cached_features_file, block_size=self.args.block_size)
        self.num_examples = len(self.dataset.examples)
        self.num_batches = self.num_examples//batch_size
        self.example_lengths = [example['bert_token_length'] for example in self.dataset.examples]

    def __iter__(self):
        # sampler = BucketSampler(self.example_lengths, self.bucket_size, self.batch_size, droplast=True, shuffle=self.shuffle)
        # loader = DataLoader(self.dataset, batch_sampler=sampler, num_workers=0, collate_fn=PreparedTokenDataset.collate)

        # distributed
        sampler = DistributedSampler(self.dataset)
        loader = DataLoader(self.dataset, sampler=sampler, batch_size=self.batch_size, pin_memory=True, num_workers=0, collate_fn=PreparedTokenDataset.collate)
        yield from loader

        # update file name for the next file
        self.file_idx += 1
        self.cached_features_file = os.path.join(self.file_path, self.args.dataset.lower()+f'.segmented.nltk.split.seq64.{self.file_idx}.json')
        self.dataset = PreparedTokenDataset(self.tokenizer, self.args, self.cached_features_file, block_size=self.args.block_size)
        self.num_examples = len(self.dataset.examples)
        self.num_batches = self.num_examples//self.batch_size
        self.example_lengths = [example['bert_token_length'] for example in self.dataset.examples]

    def __len__(self):
        return self.num_batches

    def __del__(self):
        pass

    def reset(self):
        self.file_idx = 0
class PreparedTokenDataset(Dataset):
    def __init__(self, tokenizers, args, cached_features_file='train', text_split_mode='natural', block_size=512):
        logger.info(cached_features_file)
        assert os.path.isfile(cached_features_file)
        self.examples = []
        self.tokenizers = tokenizers

        # Bert tokenizer special tokens
        self.bert_pad_token = tokenizers[0].convert_tokens_to_ids([tokenizers[0].pad_token])[0]

        # GPT-2 tokenizer special tokens
        self.gpt2_pad_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].pad_token])[0]
        self.gpt2_bos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].bos_token])[0]
        self.gpt2_eos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].eos_token])[0]

        global bert_pad_token
        global gpt2_pad_token
        bert_pad_token = self.bert_pad_token
        gpt2_pad_token = self.gpt2_pad_token

        # Only the Yelp data files carry sentiment labels.
        if args.dataset in ('Yahoo', 'Penn', 'Snli', 'Debug', 'wikipedia'):
            label_on = False
        elif args.dataset == 'Yelp':
            label_on = True
        else:
            label_on = False

        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, 'r') as handle:
            self.examples = json.load(handle)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return self.examples[item]

    def get_examples(examples):
        token_lengths = torch.tensor([[f['bert_token_length'], f['gpt2_token_length']] for f in examples], dtype=torch.long)
        return examples, token_lengths

    def collate(examples):
        # Convert to Tensors and build dataset
        input_ids_bert = pad_sequence([torch.tensor(f['bert_token'], dtype=torch.long) for f in examples], batch_first=True, padding_value=bert_pad_token)
        input_ids_gpt = pad_sequence([torch.tensor(f['gpt2_token'], dtype=torch.long) for f in examples], batch_first=True, padding_value=gpt2_pad_token)
        token_lengths = torch.tensor([[f['bert_token_length'], f['gpt2_token_length']] for f in examples], dtype=torch.long)
        return (input_ids_bert, input_ids_gpt, token_lengths)
class TokenDataset(Dataset):
    def __init__(self, tokenizers, args, file_path='train', text_split_mode='natural', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(directory, f'cached_lm_gpt_bert_{block_size}_{filename[:-4]}.json')

        self.examples = []
        self.tokenizers = tokenizers

        # Bert tokenizer special tokens
        self.bert_pad_token = tokenizers[0].convert_tokens_to_ids([tokenizers[0].pad_token])[0]

        # GPT-2 tokenizer special tokens
        self.gpt2_pad_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].pad_token])[0]
        self.gpt2_bos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].bos_token])[0]
        self.gpt2_eos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].eos_token])[0]

        global bert_pad_token
        global gpt2_pad_token
        bert_pad_token = self.bert_pad_token
        gpt2_pad_token = self.gpt2_pad_token

        if args.dataset == 'Yelp':
            label_on = True
        else:
            label_on = False

        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'r') as handle:
                self.examples = json.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            dropped, count = self._read_corpus_natural_split(fname=file_path, label=label_on, max_length=block_size, block_size=block_size, args=args)
            logger.info("The number of dropped sentences is %d", dropped)
            logger.info("The number of processed sentences is %d", count)

            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model-specific) padding.
            logger.info("Saving features into cached file %s", cached_features_file)
            if args.use_philly:
                save_solid = False
                while not save_solid:
                    try:
                        with open(cached_features_file, 'w') as handle:
                            json.dump(self.examples, handle)
                        save_solid = True  # stop retrying once the cache has been written
                    except:
                        pass
            else:
                with open(cached_features_file, 'w') as handle:
                    json.dump(self.examples, handle)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return self.examples[item]

    def collate(examples):
        # Convert to Tensors and build dataset
        input_ids_bert = pad_sequence([torch.tensor(f['bert_token'], dtype=torch.long) for f in examples], batch_first=True, padding_value=bert_pad_token)
        input_ids_gpt = pad_sequence([torch.tensor(f['gpt2_token'], dtype=torch.long) for f in examples], batch_first=True, padding_value=gpt2_pad_token)
        token_lengths = torch.tensor([[f['bert_token_length'], f['gpt2_token_length']] for f in examples], dtype=torch.long)
        return (input_ids_bert, input_ids_gpt, token_lengths)
    def _read_corpus_natural_split(self, fname, label, max_length, block_size, args):
        data = []
        labels = [] if label else None
        dropped = 0
        count = 0

        with open(fname) as fin:
            for line in fin:
                if label:
                    split_line = line.split('\t')
                    lb = split_line[0]
                    split_line_text = split_line[1]
                else:
                    split_line_text = line

                split_line_text = split_line_text.strip()

                if len(split_line_text.split()) < 1:
                    dropped += 1
                    continue

                if max_length:
                    if len(split_line_text.split()) > max_length:
                        dropped += 1
                        continue

                if label:
                    labels.append(lb)

                tokenized_text0 = self.tokenizers[0].convert_tokens_to_ids(self.tokenizers[0].tokenize(split_line_text))
                tokenized_text0 = self.tokenizers[0].add_special_tokens_single_sentence(tokenized_text0)
                tokenized_text0_length = len(tokenized_text0)

                tokenized_text1 = self.tokenizers[1].convert_tokens_to_ids(self.tokenizers[1].tokenize(split_line_text))
                tokenized_text1 = self.tokenizers[1].add_special_tokens_single_sentence(tokenized_text1)
                tokenized_text1 = [self.gpt2_bos_token] + tokenized_text1 + [self.gpt2_eos_token]
                tokenized_text1_length = len(tokenized_text1)

                example = {
                    'bert_token': tokenized_text0,
                    'bert_token_length': tokenized_text0_length,
                    'gpt2_token': tokenized_text1,
                    'gpt2_token_length': tokenized_text1_length
                }
                self.examples.append(example)
                count += 1

        return dropped, count
class Dialog_TokenDataset(Dataset):
    def __init__(self, tokenizers, args, file_path='train', text_split_mode='natural', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(directory, f'cached_lm_gpt_bert_{block_size}_{filename[:-4]}.json')

        self.examples = []
        self.tokenizers = tokenizers

        # Bert tokenizer special tokens
        self.bert_pad_token = tokenizers[0].convert_tokens_to_ids([tokenizers[0].pad_token])[0]

        # GPT-2 tokenizer special tokens
        self.gpt2_pad_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].pad_token])[0]
        self.gpt2_bos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].bos_token])[0]
        self.gpt2_eos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].eos_token])[0]

        global bert_pad_token
        global gpt2_pad_token
        bert_pad_token = self.bert_pad_token
        gpt2_pad_token = self.gpt2_pad_token

        if args.dataset == 'Yelp':
            label_on = True
        else:
            label_on = False

        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'r') as handle:
                self.examples = json.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            dropped, count = self._read_dialog_corpus_natural_split(fname=file_path, label=label_on, max_length=block_size, block_size=block_size, args=args)
            logger.info("The number of dropped sentences is %d", dropped)
            logger.info("The number of processed sentences is %d", count)

            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model-specific) padding.
            logger.info("Saving features into cached file %s", cached_features_file)
            if args.use_philly:
                save_solid = False
                while not save_solid:
                    try:
                        with open(cached_features_file, 'w') as handle:
                            json.dump(self.examples, handle)
                        save_solid = True  # stop retrying once the cache has been written
                    except:
                        pass
            else:
                with open(cached_features_file, 'w') as handle:
                    json.dump(self.examples, handle)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return self.examples[item]

    def collate(examples):
        # Convert to Tensors and build dataset
        input_ids_bert_ctx = pad_sequence([torch.tensor(f['bert_token_ctx'], dtype=torch.long) for f in examples], batch_first=True, padding_value=bert_pad_token)
        input_ids_bert = pad_sequence([torch.tensor(f['bert_token'], dtype=torch.long) for f in examples], batch_first=True, padding_value=bert_pad_token)
        input_ids_gpt = pad_sequence([torch.tensor(f['gpt2_token'], dtype=torch.long) for f in examples], batch_first=True, padding_value=gpt2_pad_token)
        token_lengths = torch.tensor([[f['bert_token_ctx_length'], f['bert_token_length'], f['gpt2_token_length']] for f in examples], dtype=torch.long)
        return (input_ids_bert_ctx, input_ids_bert, input_ids_gpt, token_lengths)
    def _read_dialog_corpus_natural_split(self, fname, label, max_length, block_size, args):
        data = []
        labels = [] if label else None
        dropped = 0
        count = 0

        with open(fname) as fin:
            for line in fin:
                split_line_text = line
                split_line_text = split_line_text.strip()

                if len(split_line_text.split()) < 1:
                    dropped += 1
                    continue

                # if max_length:
                #     if len(split_line_text.split()) > max_length:
                #         dropped += 1
                #         continue

                context_text, response_text = split_line_text.split('\t')

                tokenized_text_ctx = self.tokenizers[0].convert_tokens_to_ids(self.tokenizers[0].tokenize(context_text))
                tokenized_text_ctx = self.tokenizers[0].add_special_tokens_single_sentence(tokenized_text_ctx)
                if len(tokenized_text_ctx) > 512:
                    tokenized_text_ctx = tokenized_text_ctx[-512:]
                    # pdb.set_trace()
                tokenized_text_ctx_length = len(tokenized_text_ctx)

                tokenized_text0 = self.tokenizers[0].convert_tokens_to_ids(self.tokenizers[0].tokenize(response_text))
                tokenized_text0 = self.tokenizers[0].add_special_tokens_single_sentence(tokenized_text0)
                if len(tokenized_text0) > 512:
                    tokenized_text0 = tokenized_text0[-512:]
                tokenized_text0_length = len(tokenized_text0)

                tokenized_text1 = self.tokenizers[1].convert_tokens_to_ids(self.tokenizers[1].tokenize(response_text))
                tokenized_text1 = self.tokenizers[1].add_special_tokens_single_sentence(tokenized_text1)
                tokenized_text1 = [self.gpt2_bos_token] + tokenized_text1 + [self.gpt2_eos_token]
                tokenized_text1_length = len(tokenized_text1)

                # pdb.set_trace()
                example = {
                    'bert_token_ctx': tokenized_text_ctx,
                    'bert_token_ctx_length': tokenized_text_ctx_length,
                    'bert_token': tokenized_text0,
                    'bert_token_length': tokenized_text0_length,
                    'gpt2_token': tokenized_text1,
                    'gpt2_token_length': tokenized_text1_length
                }
                self.examples.append(example)
                count += 1

        return dropped, count
class TextDataset_Split(Dataset):
    def __init__(self, tokenizer, args, file_path='train', text_split_mode='natural', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(directory, f'cached_lm_gpt_{block_size}_{filename}')

        self.examples = []
        self.tokenizer = tokenizer

        # GPT tokenizer special tokens
        self.pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
        self.bos_token_id = tokenizer.convert_tokens_to_ids([tokenizer.bos_token])[0]
        self.eos_token_id = tokenizer.convert_tokens_to_ids([tokenizer.eos_token])[0]

        if args.dataset == 'Yelp':
            label_on = True
        else:
            label_on = False

        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            if text_split_mode == 'block':
                self._read_corpus_block_split(fname=file_path, block_size=block_size)
            elif text_split_mode == 'natural':
                self._read_corpus_natural_split(fname=file_path, label=label_on, max_length=block_size, block_size=block_size)
            else:
                print('Please specify the mode to split the raw text')

            # pdb.set_trace()
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model-specific) padding.
            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        # pdb.set_trace()
        # Convert to Tensors and build dataset
        tokenized_text1 = torch.tensor(self.examples[item][0], dtype=torch.long)
        tokenized_text_lengths = torch.tensor([self.examples[item][1]], dtype=torch.long)
        # pdb.set_trace()
        return (tokenized_text1, tokenized_text_lengths)
    def _read_corpus_natural_split(self, fname, label, max_length, block_size):
        data = []
        labels = [] if label else None
        dropped = 0

        with open(fname) as fin:
            for line in fin:
                if label:
                    split_line = line.split('\t')
                    lb = split_line[0]
                    split_line_text = split_line[1]
                else:
                    split_line_text = line

                if len(split_line_text) < 1:
                    dropped += 1
                    continue

                if max_length:
                    if len(split_line_text.split()) > max_length:
                        dropped += 1
                        continue

                if label:
                    labels.append(lb)

                tokenized_text1 = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(split_line_text))
                tokenized_text1 = self.tokenizer.add_special_tokens_single_sentence(tokenized_text1)
                tokenized_text1_length = len(tokenized_text1)
                tokenized_text1 = [self.bos_token_id] + tokenized_text1 + [self.eos_token_id]
                tokenized_text1 = tokenized_text1 + ([self.pad_token_id] * (block_size - tokenized_text1_length - 2))  # Pad up to the sequence length.
                assert len(tokenized_text1) == block_size

                self.examples.append([tokenized_text1, tokenized_text1_length])
    def _read_corpus_block_split(self, fname, block_size):
        with open(fname, encoding="utf-8") as f:
            text = f.read()

        # Chunyuan: divide the linguistic text into the same length, then different tokenization schemes are applied
        while len(text) >= block_size:  # Truncate in blocks of block_size
            tokenized_text1 = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text[:block_size]))
            tokenized_text1 = self.tokenizer.add_special_tokens_single_sentence(tokenized_text1)
            tokenized_text1_length = len(tokenized_text1)
            tokenized_text1 = [self.bos_token_id] + tokenized_text1 + [self.eos_token_id]
            tokenized_text1 = tokenized_text1 + ([self.pad_token_id] * (block_size - tokenized_text1_length - 2))  # Pad up to the sequence length.
            assert len(tokenized_text1) == block_size

            self.examples.append([tokenized_text1, tokenized_text1_length])
            text = text[block_size:]
class TextDataset_2Tokenizers_LCtrlG(Dataset):
    def __init__(self, tokenizers, args, file_path='train', text_split_mode='natural', block_size=512, create_new=0):
        print(file_path)
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(directory, f'cached_lm_gpt_bert_{block_size}_{filename}')

        self.examples = []
        self.tokenizers = tokenizers

        # GPT tokenizers
        self.pad_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].pad_token])[0]
        self.bos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].bos_token])[0]
        self.eos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].eos_token])[0]

        if not create_new and os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            if text_split_mode == 'natural':
                if args.dataset == 'Yelp':
                    dropped = self._read_corpus_natural_split_yelp(fname=file_path, label=True, max_length=block_size, block_size=block_size)
                    logger.info("The number of dropped sentences is %d", dropped)
                elif args.dataset == 'yahoo':
                    pass
                else:
                    raise NotImplementedError
            else:
                raise ValueError('Please specify the mode to split the raw text')

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        # pdb.set_trace()
        # Convert to Tensors and build dataset
        tokenized_text0 = torch.tensor(self.examples[item][0], dtype=torch.long)
        tokenized_text1 = torch.tensor(self.examples[item][2], dtype=torch.long)
        tokenized_text_lengths = torch.tensor([self.examples[item][1], self.examples[item][3]], dtype=torch.long)
        label = torch.tensor(self.examples[item][4], dtype=torch.long)
        # pdb.set_trace()
        return (tokenized_text0, tokenized_text1, tokenized_text_lengths, label)

    def get_labels(self):
        return ['0', '1']
    def _read_corpus_natural_split_yelp(self, fname, label, max_length, block_size):
        # label: the file contains labels.
        dropped = 0
        label_fname = fname.replace('.text', '.labels')

        with open(fname) as fin, open(label_fname) as lfin:
            for line, label_line in zip(fin, lfin):
                # pdb.set_trace()
                split_line_text = line
                lb = int(label_line)
                assert lb in [0, 1]  # binary sentiment in yelp dataset.

                if len(split_line_text) < 1:
                    dropped += 1
                    continue

                if max_length:
                    if len(split_line_text.split()) > max_length:
                        dropped += 1
                        continue

                # tokenize by tokenizers[0]
                tokenized_text0 = self.tokenizers[0].convert_tokens_to_ids(self.tokenizers[0].tokenize(split_line_text))
                tokenized_text0 = self.tokenizers[0].add_special_tokens_single_sentence(tokenized_text0)
                tokenized_text0_length = len(tokenized_text0)
                pad_token = self.tokenizers[0].convert_tokens_to_ids([self.tokenizers[0].pad_token])[0]
                # pad to max_seq_length (block_size)
                if block_size > tokenized_text0_length:
                    tokenized_text0 = tokenized_text0 + ([pad_token] * (block_size - tokenized_text0_length))  # Pad up to the sequence length.
                else:
                    dropped += 1
                    continue
                assert len(tokenized_text0) == block_size

                # tokenize by tokenizers[1]
                tokenized_text1 = self.tokenizers[1].convert_tokens_to_ids(self.tokenizers[1].tokenize(split_line_text))
                tokenized_text1 = self.tokenizers[1].add_special_tokens_single_sentence(tokenized_text1)
                tokenized_text1 = [self.bos_token] + tokenized_text1 + [self.eos_token]
                tokenized_text1_length = len(tokenized_text1)
                # pad to max_seq_length (block_size)
                if block_size > tokenized_text1_length:
                    tokenized_text1 = tokenized_text1 + ([self.pad_token] * (block_size - tokenized_text1_length))  # Pad up to the sequence length.
                else:
                    dropped += 1
                    continue
                assert len(tokenized_text1) == block_size

                self.examples.append([tokenized_text0, tokenized_text0_length, tokenized_text1, tokenized_text1_length, lb])

        return dropped
class TextDataset_2Tokenizers(Dataset):
    def __init__(self, tokenizers, args, file_path='train', text_split_mode='natural', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(directory, f'cached_lm_gpt_bert_{block_size}_{filename}')

        self.examples = []
        self.tokenizers = tokenizers

        # GPT tokenizers
        self.pad_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].pad_token])[0]
        self.bos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].bos_token])[0]
        self.eos_token = tokenizers[1].convert_tokens_to_ids([tokenizers[1].eos_token])[0]

        if args.dataset == 'Yelp':
            label_on = True
        else:
            label_on = False

        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            if text_split_mode == 'block':
                self._read_corpus_block_split(fname=file_path, block_size=block_size)
            elif text_split_mode == 'natural':
                dropped, count = self._read_corpus_natural_split(fname=file_path, label=label_on, max_length=block_size, block_size=block_size, args=args)
                logger.info("The number of dropped sentences is %d", dropped)
                logger.info("The number of used sentences is %d", count)
            else:
                print('Please specify the mode to split the raw text')

            # pdb.set_trace()
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model-specific) padding.
            logger.info("Saving features into cached file %s", cached_features_file)
            if args.use_philly:
                save_solid = False
                while not save_solid:
                    try:
                        with open(cached_features_file, 'wb') as handle:
                            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                        save_solid = True  # stop retrying once the cache has been written
                    except:
                        pass
            else:
                with open(cached_features_file, 'wb') as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        # pdb.set_trace()
        # Convert to Tensors and build dataset
        tokenized_text0 = torch.tensor(self.examples[item][0], dtype=torch.long)
        tokenized_text1 = torch.tensor(self.examples[item][2], dtype=torch.long)
        tokenized_text_lengths = torch.tensor([self.examples[item][1], self.examples[item][3]], dtype=torch.long)
        # pdb.set_trace()
        return (tokenized_text0, tokenized_text1, tokenized_text_lengths)
    def _read_corpus_natural_split(self, fname, label, max_length, block_size, args):
        data = []
        labels = [] if label else None
        dropped = 0
        count = 0

        with open(fname) as fin:
            for line in fin:
                # pdb.set_trace()
                if label:
                    split_line = line.split('\t')
                    lb = split_line[0]
                    split_line_text = split_line[1]
                else:
                    split_line_text = line

                if len(split_line_text.split()) < 1:
                    dropped += 1
                    continue

                if max_length:
                    if len(split_line_text.split()) > max_length:
                        dropped += 1
                        continue

                if label:
                    labels.append(lb)

                tokenized_text0 = self.tokenizers[0].convert_tokens_to_ids(self.tokenizers[0].tokenize(split_line_text))
                tokenized_text0 = self.tokenizers[0].add_special_tokens_single_sentence(tokenized_text0)
                tokenized_text0_length = len(tokenized_text0)
                pad_token = self.tokenizers[0].convert_tokens_to_ids([self.tokenizers[0].pad_token])[0]
                if block_size > tokenized_text0_length:
                    tokenized_text0 = tokenized_text0 + ([pad_token] * (block_size - tokenized_text0_length))  # Pad up to the sequence length.
                else:
                    dropped += 1
                    continue
                assert len(tokenized_text0) == block_size

                tokenized_text1 = self.tokenizers[1].convert_tokens_to_ids(self.tokenizers[1].tokenize(split_line_text))
                tokenized_text1 = self.tokenizers[1].add_special_tokens_single_sentence(tokenized_text1)
                tokenized_text1 = [self.bos_token] + tokenized_text1 + [self.eos_token]
                tokenized_text1_length = len(tokenized_text1)
                if block_size > tokenized_text1_length:
                    tokenized_text1 = tokenized_text1 + ([self.pad_token] * (block_size - tokenized_text1_length))  # Pad up to the sequence length.
                else:
                    dropped += 1
                    continue
                assert len(tokenized_text1) == block_size

                self.examples.append([tokenized_text0, tokenized_text0_length, tokenized_text1, tokenized_text1_length])
                count += 1
                # if args.dataset == 'wikipedia' and count == 10:
                #     break

        return dropped, count
    def _read_corpus_block_split(self, fname, block_size):
        with open(fname, encoding="utf-8") as f:
            text = f.read()

        # Chunyuan: divide the linguistic text into the same length, then different tokenization schemes are applied
        while len(text) >= block_size:  # Truncate in blocks of block_size
            tokenized_text0 = self.tokenizers[0].convert_tokens_to_ids(self.tokenizers[0].tokenize(text[:block_size]))
            tokenized_text0 = self.tokenizers[0].add_special_tokens_single_sentence(tokenized_text0)
            tokenized_text0_length = len(tokenized_text0)
            pad_token = self.tokenizers[0].convert_tokens_to_ids([self.tokenizers[0].pad_token])[0]
            tokenized_text0 = tokenized_text0 + ([pad_token] * (block_size - tokenized_text0_length))  # Pad up to the sequence length.
            assert len(tokenized_text0) == block_size

            tokenized_text1 = self.tokenizers[1].convert_tokens_to_ids(self.tokenizers[1].tokenize(text[:block_size]))
            tokenized_text1 = self.tokenizers[1].add_special_tokens_single_sentence(tokenized_text1)
            tokenized_text1_length = len(tokenized_text1)
            tokenized_text1 = [self.bos_token] + tokenized_text1 + [self.eos_token]
            tokenized_text1 = tokenized_text1 + ([self.pad_token] * (block_size - tokenized_text1_length - 2))  # Pad up to the sequence length.
            assert len(tokenized_text1) == block_size

            self.examples.append([tokenized_text0, tokenized_text0_length, tokenized_text1, tokenized_text1_length])
            text = text[block_size:]
def frange_cycle_linear(n_iter, start=0.0, stop=1.0, n_cycle=4, ratio=0.5):
    L = np.ones(n_iter) * stop
    period = n_iter/n_cycle
    step = (stop-start)/(period*ratio)  # linear schedule

    for c in range(n_cycle):
        v, i = start, 0
        while v <= stop and (int(i+c*period) < n_iter):
            L[int(i+c*period)] = v
            v += step
            i += 1
    return L


def frange_cycle_zero_linear(n_iter, start=0.0, stop=1.0, n_cycle=4, ratio_increase=0.5, ratio_zero=0.3):
    L = np.ones(n_iter) * stop
    period = n_iter/n_cycle
    step = (stop-start)/(period*ratio_increase)  # linear schedule

    for c in range(n_cycle):
        v, i = start, 0
        while v <= stop and (int(i+c*period) < n_iter):
            if i < period*ratio_zero:
                L[int(i+c*period)] = start
            else:
                L[int(i+c*period)] = v
                v += step
            i += 1
    return L
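
# Illustrative sketch (not part of the original module): the cyclical schedules above are
# typically used as per-iteration KL weights (beta) when annealing VAE training. This prints
# one cycle of the zero-then-linear schedule: the weight stays at `start` for the first
# ratio_zero fraction of the cycle, then ramps linearly toward `stop`.
def _demo_kl_annealing_schedule(n_iter=100, n_cycle=4):
    betas = frange_cycle_zero_linear(n_iter, start=0.0, stop=1.0, n_cycle=n_cycle,
                                     ratio_increase=0.5, ratio_zero=0.25)
    period = n_iter // n_cycle
    print(betas[:period])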
class uniform_initializer(object):
    def __init__(self, stdv):
        self.stdv = stdv

    def __call__(self, tensor):
        nn.init.uniform_(tensor, -self.stdv, self.stdv)


class xavier_normal_initializer(object):
    def __call__(self, tensor):
        nn.init.xavier_normal_(tensor)
def reconstruct(model, test_data_batch, vocab, strategy, fname):
    hyps = []
    refs = []
    with open(fname, "w") as fout:
        # for i in range(10):
        #     batch_data = test_data_batch[i]
        for batch_data in test_data_batch:
            decoded_batch = model.reconstruct(batch_data, strategy)

            source = [[vocab.id2word(id_.item()) for id_ in sent] for sent in batch_data]
            for j in range(len(batch_data)):
                ref = " ".join(source[j])
                hyp = " ".join(decoded_batch[j])
                fout.write("SOURCE: {}\n".format(ref))
                fout.write("RECON: {}\n\n".format(hyp))

                refs += [ref[len("<s>"): -len("</s>")]]
                if strategy == "beam":
                    hyps += [hyp[len("<s>"): -len("</s>")]]
                else:
                    hyps += [hyp[: -len("</s>")]]

    fname_ref = fname + ".ref"
    fname_hyp = fname + ".hyp"
    with open(fname_ref, "w") as f:
        f.write("\n".join(refs))
    with open(fname_hyp, "w") as f:
        f.write("\n".join(hyps))
    call_multi_bleu_perl("scripts/multi-bleu.perl", fname_hyp, fname_ref, verbose=True)
def calc_iwnll(model_vae, eval_dataloader, args, ns=20):
    eval_loss = 0.0
    ############ Perplexity ############
    report_kl_loss = report_rec_loss = report_loss = 0
    report_num_words = report_num_sents = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating PPL"):
        # pdb.set_trace()
        x0, x1, x_lengths = batch

        max_len_values, _ = x_lengths.max(0)
        x0 = x0[:, :max_len_values[0]]
        x1 = x1[:, :max_len_values[1]]

        x0 = x0.to(args.device)
        x1 = x1.to(args.device)
        x_lengths = x_lengths.to(args.device)

        # pdb.set_trace()
        # not predict start symbol
        report_num_words += x_lengths[:, 1].sum().item()
        report_num_sents += args.eval_batch_size

        with torch.no_grad():
            loss, loss_rc, loss_kl = model_vae.loss_iw(x0, x1, nsamples=100, ns=5)

        loss_rc = loss_rc.sum()
        loss_kl = loss_kl.sum()
        loss = loss.sum()

        report_rec_loss += loss_rc.item()
        report_kl_loss += loss_kl.item()
        report_loss += loss.item()

        # pdb.set_trace()

    test_loss = report_loss / report_num_sents
    elbo = (report_kl_loss - report_rec_loss) / report_num_sents
    nll = - report_rec_loss / report_num_sents
    kl = report_kl_loss / report_num_sents
    ppl = np.exp(-report_loss / report_num_words)

    return ppl, elbo, nll, kl
def calc_rec(model_vae, eval_dataloader, args, ns=1):
    eval_loss = 0.0
    ############ Perplexity ############
    report_kl_loss = report_rec_loss = report_loss = 0
    report_num_words = report_num_sents = 0
    i = 0

    for batch in tqdm(eval_dataloader, desc="Evaluating PPL"):
        # pdb.set_trace()
        x0, x1, x_lengths = batch

        max_len_values, _ = x_lengths.max(0)
        x0 = x0[:, :max_len_values[0]]
        x1 = x1[:, :max_len_values[1]]

        x0 = x0.to(args.device)
        x1 = x1.to(args.device)
        x_lengths = x_lengths.to(args.device)

        # pdb.set_trace()
        # not predict start symbol
        report_num_words += x_lengths[:, 1].sum().item()
        report_num_sents += args.eval_batch_size

        with torch.no_grad():
            loss, loss_rc, loss_kl = model_vae.loss_iw(x0, x1, nsamples=1, ns=1)

        loss_rc = loss_rc.sum()
        report_rec_loss += loss_rc.item()

        i += 1
        if i > 500:
            break

        # pdb.set_trace()

    nll_s = - report_rec_loss / report_num_sents
    nll_w = - report_rec_loss / report_num_words

    return nll_s, nll_w
# def calc_mi(model, test_data_batch):
#     mi = 0
#     num_examples = 0
#     for batch_data in test_data_batch:
#         batch_size = batch_data.size(0)
#         num_examples += batch_size
#         mutual_info = model.calc_mi_q(batch_data)
#         mi += mutual_info * batch_size
#     return mi / num_examples


def calc_mi(model_vae, test_data_batch, args):
    # calc_mi_v3
    import math
    from modules.utils import log_sum_exp

    mi = 0
    num_examples = 0

    mu_batch_list, logvar_batch_list = [], []
    neg_entropy = 0.

    for batch in tqdm(test_data_batch, desc="Evaluating MI, Stage 1"):
        x0, _, x_lengths = batch

        max_len_values, _ = x_lengths.max(0)
        x0 = x0[:, :max_len_values[0]]

        x0 = x0.to(args.device)

        with torch.no_grad():
            # encoding into bert features
            bert_fea = model_vae.encoder(x0)[1]

            # (batch_size, nz)
            mu, logvar = model_vae.encoder.linear(bert_fea).chunk(2, -1)

        x_batch, nz = mu.size()
        # print(x_batch, end=' ')
        num_examples += x_batch

        # E_{q(z|x)}log(q(z|x)) = -0.5*nz*log(2*\pi) - 0.5*(1+logvar).sum(-1)
        neg_entropy += (-0.5 * nz * math.log(2 * math.pi) - 0.5 * (1 + logvar).sum(-1)).sum().item()

        mu_batch_list += [mu.cpu()]
        logvar_batch_list += [logvar.cpu()]

    neg_entropy = neg_entropy / num_examples
    # print()

    num_examples = 0
    log_qz = 0.
    for i in tqdm(range(len(mu_batch_list)), desc="Evaluating MI, Stage 2"):
        ###############
        # get z_samples
        ###############
        mu, logvar = mu_batch_list[i].cuda(), logvar_batch_list[i].cuda()

        # [z_batch, 1, nz]
        with torch.no_grad():
            z_samples = model_vae.reparameterize(mu, logvar, 1)

        z_samples = z_samples.view(-1, 1, nz)
        num_examples += z_samples.size(0)

        ###############
        # compute density
        ###############
        # [1, x_batch, nz]
        # mu, logvar = mu_batch_list[i].cuda(), logvar_batch_list[i].cuda()
        # indices = list(np.random.choice(np.arange(len(mu_batch_list)), 10)) + [i]
        indices = np.arange(len(mu_batch_list))
        mu = torch.cat([mu_batch_list[_] for _ in indices], dim=0).cuda()
        logvar = torch.cat([logvar_batch_list[_] for _ in indices], dim=0).cuda()
        x_batch, nz = mu.size()

        mu, logvar = mu.unsqueeze(0), logvar.unsqueeze(0)
        var = logvar.exp()

        # (z_batch, x_batch, nz)
        dev = z_samples - mu

        # (z_batch, x_batch)
        log_density = -0.5 * ((dev ** 2) / var).sum(dim=-1) - \
            0.5 * (nz * math.log(2 * math.pi) + logvar.sum(-1))

        # log q(z): aggregate posterior
        # [z_batch]
        log_qz += (log_sum_exp(log_density, dim=1) - math.log(x_batch)).sum(-1)

    log_qz /= num_examples
    mi = neg_entropy - log_qz

    return mi.item()
def calc_au(model_vae, eval_dataloader, args, delta=0.01):
    """compute the number of active units
    """
    cnt = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating AU, Stage 1"):
        x0, _, x_lengths = batch

        max_len_values, _ = x_lengths.max(0)
        x0 = x0[:, :max_len_values[0]]

        x0 = x0.to(args.device)

        with torch.no_grad():
            # encoding into bert features
            bert_fea = model_vae.encoder(x0)[1]

            # (batch_size, nz)
            mean, logvar = model_vae.encoder.linear(bert_fea).chunk(2, -1)

        if cnt == 0:
            means_sum = mean.sum(dim=0, keepdim=True)
        else:
            means_sum = means_sum + mean.sum(dim=0, keepdim=True)
        cnt += mean.size(0)

    # (1, nz)
    mean_mean = means_sum / cnt

    cnt = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating AU, Stage 2"):
        x0, _, _ = batch
        x0 = x0.to(args.device)

        with torch.no_grad():
            # encoding into bert features
            bert_fea = model_vae.encoder(x0)[1]

            # (batch_size, nz)
            mean, _ = model_vae.encoder.linear(bert_fea).chunk(2, -1)

        if cnt == 0:
            var_sum = ((mean - mean_mean) ** 2).sum(dim=0)
        else:
            var_sum = var_sum + ((mean - mean_mean) ** 2).sum(dim=0)
        cnt += mean.size(0)

    # (nz)
    au_var = var_sum / (cnt - 1)

    # pdb.set_trace()
    return (au_var >= delta).sum().item(), au_var
def sample_sentences(vae, vocab, device, num_sentences):
    vae.eval()
    sampled_sents = []
    for i in range(num_sentences):
        z = vae.sample_from_prior(1)
        z = z.view(1, 1, -1)
        start = vocab.word2id['<s>']
        # START = torch.tensor([[[start]]])
        START = torch.tensor([[start]])
        end = vocab.word2id['</s>']
        START = START.to(device)
        z = z.to(device)
        vae.eval()
        sentence = vae.decoder.sample_text(START, z, end, device)
        decoded_sentence = vocab.decode_sentence(sentence)
        sampled_sents.append(decoded_sentence)
    for i, sent in enumerate(sampled_sents):
        # The logging module itself is not callable; report through the module-level logger instead.
        logger.info("%d: %s", i, ' '.join(sent))
# def visualize_latent(args, vae, device, test_data):
#     f = open('yelp_embeddings_z', 'w')
#     g = open('yelp_embeddings_labels', 'w')
#
#     test_data_batch, test_label_batch = test_data.create_data_batch_labels(batch_size=args.batch_size, device=device, batch_first=True)
#     for i in range(len(test_data_batch)):
#         batch_data = test_data_batch[i]
#         batch_label = test_label_batch[i]
#         batch_size, sent_len = batch_data.size()
#         means, _ = vae.encoder.forward(batch_data)
#         for i in range(batch_size):
#             mean = means[i, :].cpu().detach().numpy().tolist()
#             for val in mean:
#                 f.write(str(val) + '\t')
#             f.write('\n')
#         for label in batch_label:
#             g.write(label + '\n')
def visualize_latent(args, epoch, vae, device, test_data):
    nsamples = 1

    with open(os.path.join(args.exp_dir, f'synthetic_latent_{epoch}.txt'), 'w') as f:
        test_data_batch, test_label_batch = test_data.create_data_batch_labels(batch_size=args.batch_size, device=device, batch_first=True)
        for i in range(len(test_data_batch)):
            batch_data = test_data_batch[i]
            batch_label = test_label_batch[i]
            batch_size, sent_len = batch_data.size()
            samples, _ = vae.encoder.encode(batch_data, nsamples)
            for i in range(batch_size):
                for j in range(nsamples):
                    sample = samples[i, j, :].cpu().detach().numpy().tolist()
                    f.write(batch_label[i] + '\t' + ' '.join([str(val) for val in sample]) + '\n')
def call_multi_bleu_perl(fname_bleu_script, fname_hyp, fname_ref, verbose=True):
    cmd = "perl %s %s < %s" % (fname_bleu_script, fname_ref, fname_hyp)
    popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, shell=True)
    popen.wait()
    try:
        bleu_result = popen.stdout.readline().strip().decode("utf-8")
        if verbose:
            print(bleu_result)
        bleu = float(bleu_result[7:bleu_result.index(',')])
        stderrs = popen.stderr.readlines()
        if len(stderrs) > 1:
            for line in stderrs:
                print(line.strip())
    except Exception as e:
        print(e)
        bleu = 0.
    return bleu
def weight_init(m):
    '''
    Usage:
        model = Model()
        model.apply(weight_init)
    '''
    if isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.BatchNorm1d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm2d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm3d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
        init.normal_(m.bias.data)
    elif isinstance(m, nn.LSTM):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
    elif isinstance(m, nn.LSTMCell):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
    elif isinstance(m, nn.GRU):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
    elif isinstance(m, nn.GRUCell):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
if __name__ == '__main__':
pass |