import re

import nltk
import tensorflow as tf
import tensorflow_datasets as tfds
from nltk.stem import WordNetLemmatizer

# WordNet data is required by WordNetLemmatizer; the Punkt models are required
# by nltk.word_tokenize.
nltk.download('wordnet')
nltk.download('punkt')

lemmatizer = WordNetLemmatizer()


def preprocess_sentence(sentence):
    """Lowercase, clean, and lemmatize a single sentence."""
    sentence = sentence.lower().strip()

    # Put spaces around sentence punctuation so it splits into separate tokens.
    sentence = re.sub(r"([?.!¿])", r" \1 ", sentence)
    # Collapse runs of spaces (and double quotes) into a single space.
    sentence = re.sub(r'[" "]+', " ", sentence)
    # Drop most punctuation characters.
    sentence = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", sentence)

    # Keep only letters (including Turkish characters) and basic punctuation;
    # everything else becomes a space.
    sentence = re.sub(r"[^a-zA-ZğüşöçıİĞÜŞÖÇ?.!,¿]+", " ", sentence)
    sentence = sentence.strip()

    # Tokenize with NLTK and lemmatize each word.
    sentence = ' '.join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(sentence)])

    return sentence


def load_conversations(hparams, lines_file, conversations_file):
    """Build (question, answer) pairs from the dialog corpus files."""
    # Map each line id to its utterance text.
    id2line = {}
    with open(lines_file, encoding="utf-8", errors="ignore") as file:
        lines = file.readlines()
    for line in lines:
        parts = line.replace("\n", "").split(" +++$+++ ")
        id2line[parts[0]] = parts[4]

    questions = []
    answers = []

    with open(conversations_file, "r") as file:
        lines = file.readlines()
    for line in lines:
        parts = line.replace("\n", "").split(" +++$+++ ")
        # parts[3] holds the conversation's line ids as a bracketed, quoted list;
        # strip the surrounding brackets and quotes to recover the raw ids.
        conversation = [line_id[1:-1] for line_id in parts[3][1:-1].split(", ")]
        # Every consecutive pair of utterances becomes one (question, answer) sample.
        for i in range(len(conversation) - 1):
            questions.append(preprocess_sentence(id2line[conversation[i]]))
            answers.append(preprocess_sentence(id2line[conversation[i + 1]]))
            if len(questions) >= hparams.max_samples:
                return questions, answers

    return questions, answers


def tokenize(hparams, tokenizer, questions, answers):
    """Encode the pairs, drop the ones that are too long, and pad to max_length."""
    tokenized_inputs, tokenized_outputs = [], []

    for (question, answer) in zip(questions, answers):
        # Wrap every encoded sentence with the start and end token ids.
        sentence1 = hparams.start_token + tokenizer.encode(question) + hparams.end_token
        sentence2 = hparams.start_token + tokenizer.encode(answer) + hparams.end_token

        # Keep only pairs where both sides fit within max_length.
        if len(sentence1) <= hparams.max_length and len(sentence2) <= hparams.max_length:
            tokenized_inputs.append(sentence1)
            tokenized_outputs.append(sentence2)

    # Pad every sequence to max_length with trailing zeros.
    tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_inputs, maxlen=hparams.max_length, padding="post")
    tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        tokenized_outputs, maxlen=hparams.max_length, padding="post")

    return tokenized_inputs, tokenized_outputs


def get_dataset(hparams):
    lines_file = "data/lines.txt"
    conversations_file = "data/conversations.txt"

    questions, answers = load_conversations(hparams, lines_file, conversations_file)

    # Build a subword vocabulary (target size 2**13) from the whole corpus.
    tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        questions + answers, target_vocab_size=2**13)

    tokenizer.save_to_file('tokenizer')

    # Reserve two extra ids beyond the subword vocabulary for the start and end tokens.
    hparams.start_token = [tokenizer.vocab_size]
    hparams.end_token = [tokenizer.vocab_size + 1]
    hparams.vocab_size = tokenizer.vocab_size + 2

    questions, answers = tokenize(hparams, tokenizer, questions, answers)

    # Decoder inputs drop the last token; targets drop the first (teacher forcing).
    dataset = tf.data.Dataset.from_tensor_slices(
        ({"inputs": questions, "dec_inputs": answers[:, :-1]}, answers[:, 1:])
    )

    dataset = dataset.cache()
    dataset = dataset.shuffle(len(questions))
    dataset = dataset.batch(hparams.batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset, tokenizer
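

# Minimal usage sketch, assuming an hparams object that exposes max_samples,
# max_length, and batch_size (get_dataset fills in start_token, end_token, and
# vocab_size itself). The field values below are illustrative placeholders, and
# data/lines.txt plus data/conversations.txt must be present.
if __name__ == "__main__":
    from types import SimpleNamespace

    hparams = SimpleNamespace(max_samples=25000, max_length=40, batch_size=64)
    dataset, tokenizer = get_dataset(hparams)

    # Inspect one batch: encoder inputs, decoder inputs, and shifted targets.
    for features, labels in dataset.take(1):
        print(features["inputs"].shape, features["dec_inputs"].shape, labels.shape)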