# Seq2seq (Assamese -> English) translation data pipeline:
# a Dataset wrapper that tokenizes/pads individual pairs, and a
# Keras Sequence dataloader that batches them for model.fit.
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import string
from string import digits
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm
class Dataset:
    """Index-addressable view over a parallel corpus DataFrame.

    Each item is the i-th sentence pair, tokenized and padded to
    ``max_len``. Expects ``data`` to have columns ``'ass'`` (source,
    Assamese), ``'eng_inp'`` (decoder input) and ``'eng_out'``
    (decoder target).
    """

    def __init__(self, data, tknizer_ass, tknizer_eng, max_len):
        # Raw (untokenized) sentence arrays, one entry per pair.
        self.encoder_inps = data['ass'].values
        self.decoder_inps = data['eng_inp'].values
        self.decoder_outs = data['eng_out'].values
        self.tknizer_eng = tknizer_eng
        self.tknizer_ass = tknizer_ass
        self.max_len = max_len

    def __getitem__(self, i):
        """Return (encoder_seq, decoder_inp_seq, decoder_out_seq) for pair i.

        Each returned array has shape (1, max_len), int32, post-padded.
        """
        # texts_to_sequences needs a list of texts, hence the [ ... ] wrap.
        # Use locals (not instance attributes) so concurrent workers
        # calling __getitem__ do not race on shared state.
        encoder_seq = self.tknizer_ass.texts_to_sequences([self.encoder_inps[i]])
        decoder_inp_seq = self.tknizer_eng.texts_to_sequences([self.decoder_inps[i]])
        decoder_out_seq = self.tknizer_eng.texts_to_sequences([self.decoder_outs[i]])
        encoder_seq = pad_sequences(encoder_seq, maxlen=self.max_len, dtype='int32', padding='post')
        decoder_inp_seq = pad_sequences(decoder_inp_seq, maxlen=self.max_len, dtype='int32', padding='post')
        decoder_out_seq = pad_sequences(decoder_out_seq, maxlen=self.max_len, dtype='int32', padding='post')
        return encoder_seq, decoder_inp_seq, decoder_out_seq

    def __len__(self):
        """Number of sentence pairs (required by Keras generator APIs)."""
        return len(self.encoder_inps)
class Dataloder(tf.keras.utils.Sequence):
    """Batches a Dataset into ([encoder, decoder_inp], decoder_out) tuples.

    Feeds directly into ``model.fit``; shuffles the sample order at the
    end of every epoch.
    """

    def __init__(self, dataset, batch_size=1):
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.encoder_inps))

    def __getitem__(self, i):
        """Return batch i as ([encoder_seqs, decoder_inp_seqs], decoder_out_seqs).

        Each array has shape (batch_size, max_len).
        """
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        # BUG FIX: index through self.indexes so that the permutation
        # applied in on_epoch_end actually shuffles the data; the old
        # code read range(start, stop) directly, making the shuffle a no-op.
        data = [self.dataset[j] for j in self.indexes[start:stop]]
        # Each sample component is (1, max_len); concatenating along axis 0
        # yields (batch_size, max_len) per component.
        batch = [np.concatenate(samples, axis=0) for samples in zip(*data)]
        # Shape the batch as ([source, decoder_input], decoder_target),
        # already converted to padded integer sequences.
        return tuple([[batch[0], batch[1]], batch[2]])

    def __len__(self):
        """Number of full batches per epoch (required by model.fit)."""
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        # Reshuffle sample order so each epoch sees different batches.
        self.indexes = np.random.permutation(self.indexes)