import os
import re

import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, GRU, Dropout, Layer, MultiHeadAttention, LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
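
# Pipeline: read a text corpus, tokenize it, train a standalone Word2Vec model,
# fit a Keras Tokenizer, train a stacked LSTM/GRU next-word model, and finally
# sample a short sentence from a user-supplied start word.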

# Word2Vec and sequence settings
max_worker = os.cpu_count()
vector_size = 1000
window_size = 500
min_count = 1
context_length = 4096
sentence_length = 5

# Transformer encoder settings
embed_dim = 128
num_heads = 80
feed_forward_dim = 512
dropout_rate_transformer = 0.1
epsilon = 1e-6
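
# Note: in Keras MultiHeadAttention, key_dim is the size of each attention head,
# so the settings above give the attention sublayer num_heads * key_dim
# (80 * 128 = 10240) internal dimensions before the output projection back to
# the input width.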

# Recurrent stack settings
lstm_units = [16000, 16000, 16000, 16000, 8000, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
gru_units = [2048, 2048, 2048, 2048, 1024, 1024, 512, 512, 512, 512, 512, 512, 512, 512, 256, 256, 256, 256, 128, 64, 32, 16, 8, 4, 2, 1]
dropout_rate_rnn = 0.2
return_sequences = True

# Embedding layer settings
input_dim = 10000
output_dim = 1000
input_length = context_length

# Output layer settings
dense_units = input_dim
activation = "softmax"

# Training settings
loss = "sparse_categorical_crossentropy"
optimizer = "adam"
metrics = ["accuracy"]
epochs = 60
batch_size = 64


class TransformerEncoder(Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = Dropout(dropout_rate)
        self.norm1 = LayerNormalization(epsilon=epsilon)
        self.dense1 = Dense(feed_forward_dim, activation="relu")
        self.dense2 = Dense(embed_dim)
        self.dropout2 = Dropout(dropout_rate)
        self.norm2 = LayerNormalization(epsilon=epsilon)

    def call(self, inputs, training=None):
        # Self-attention sublayer with dropout, residual connection, and layer norm
        attention_output = self.attention(inputs, inputs)
        attention_output = self.dropout1(attention_output, training=training)
        out1 = self.norm1(inputs + attention_output)

        # Position-wise feed-forward sublayer with dropout, residual connection, and layer norm
        dense_output = self.dense1(out1)
        dense_output = self.dense2(dense_output)
        dense_output = self.dropout2(dense_output, training=training)
        return self.norm2(out1 + dense_output)
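
# TransformerEncoder is defined above but not used by train_model below. A minimal
# sketch of how it could be wired into a Sequential model instead, assuming the
# Embedding output width is set to embed_dim so the residual shapes match (the
# 256-unit GRU is an arbitrary choice to collapse the sequence to one vector):
#
#   transformer_model = Sequential([
#       Embedding(input_dim=input_dim, output_dim=embed_dim, input_length=input_length),
#       TransformerEncoder(embed_dim, num_heads, feed_forward_dim, dropout_rate_transformer),
#       GRU(256, return_sequences=False),
#       Dense(dense_units, activation=activation),
#   ])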


def train_model(X, y, tokenizer):
    nn_model = Sequential([
        # Map word indices to dense embedding vectors
        Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),

        # Stacked LSTM layers, all returning full sequences
        *[LSTM(units, return_sequences=True, dropout=dropout_rate_rnn) for units in lstm_units[:-1]],
        LSTM(lstm_units[-1], return_sequences=True, dropout=dropout_rate_rnn),

        # Stacked GRU layers; the final one returns a single vector per sample
        *[GRU(units, return_sequences=True, dropout=dropout_rate_rnn) for units in gru_units[:-1]],
        GRU(gru_units[-1], return_sequences=False, dropout=dropout_rate_rnn),

        # Softmax over the vocabulary for next-word prediction
        Dense(dense_units, activation=activation)
    ])

    nn_model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    print("Training the model...")
    nn_model.fit(X, y, epochs=epochs, batch_size=batch_size)
    return nn_model
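
# train_model expects X with shape (num_samples, context_length) holding integer
# word indices below input_dim, and y with shape (num_samples,) holding the index
# of the target word for each sequence.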


def generate_sentence(model, tokenizer, start_word, sentence_length, temperature=1.0):
    sentence = [start_word]
    for _ in range(sentence_length - 1):
        # Encode the sentence so far and pad it to the model's input length
        sequence = tokenizer.texts_to_sequences([' '.join(sentence)])
        sequence = pad_sequences(sequence, maxlen=context_length, padding='post')
        predicted_probs = model.predict(sequence)[0]

        # Apply temperature to the predicted distribution and renormalize
        predicted_probs = np.asarray(predicted_probs).astype('float64')
        predicted_probs = np.log(predicted_probs + 1e-10) / temperature
        predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))

        # Sample the next word index; index 0 is padding and maps to no word
        predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
        next_word = tokenizer.index_word.get(predicted_index, '')

        if not next_word:
            break
        sentence.append(next_word)

    return ' '.join(sentence)
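
# Temperature note: dividing the log-probabilities by temperature and re-applying
# softmax is equivalent to raising each probability to the power 1/temperature and
# renormalizing. Temperatures above 1.0 flatten the distribution (more random
# output); temperatures below 1.0 sharpen it (more deterministic output).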


file_path = input("Enter the path to the dataset: ")
try:
    with open(file_path, "r", encoding="utf-8") as f:
        dataset = f.readlines()
except FileNotFoundError:
    print("File not found!")
    exit()

# Lowercase each line and split it into word tokens
tokenized_sentences = [re.findall(r'\b\w+\b', sentence.lower()) for sentence in dataset]

# Train a Word2Vec model on the tokenized corpus (trained separately; not fed into the Keras model)
word2vec_model = Word2Vec(tokenized_sentences, vector_size=vector_size, window=window_size, min_count=min_count, workers=max_worker)

# Fit a Keras tokenizer; capping the vocabulary at input_dim keeps word indices
# within the range of the Embedding layer and the softmax output
tokenizer = Tokenizer(num_words=input_dim)
tokenizer.fit_on_texts([' '.join(sentence) for sentence in tokenized_sentences])
sequences = tokenizer.texts_to_sequences([' '.join(sentence) for sentence in tokenized_sentences])

# X: each sequence padded to context_length; y: the last word index of each sequence
X = pad_sequences(sequences, maxlen=context_length, padding='post')
y = np.array([seq[-1] if len(seq) > 0 else 0 for seq in sequences])

model = train_model(X, y, tokenizer)

start_word = input("Enter a starting word: ")
print("\nGenerated sentence:", generate_sentence(model, tokenizer, start_word, sentence_length, temperature=1.0))