File size: 2,495 Bytes
a4a5dbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from Perceptron import  Perceptron
import pickle

# Load the SMS Spam Collection dataset.
# Each usable line is expected to have the form "<label>\t<message>"; lines
# that do not contain a tab separator after stripping are skipped.
sms_dataset_path = 'SMSSpamCollection.txt'
sms_data = []
sms_labels = []

with open(sms_dataset_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Split on the first tab only, so a message that itself contains a
        # tab is kept whole instead of being silently discarded by the
        # two-field check below.
        parts = line.strip().split('\t', 1)
        if len(parts) == 2:
            label, message = parts
            sms_labels.append(label)
            sms_data.append(message)

# Replace the string labels with integer class ids; the fitted encoder is kept
# in `label_encoder` so the mapping can be inspected or inverted later.
label_encoder = LabelEncoder()
sms_labels = label_encoder.fit_transform(sms_labels)

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    sms_data,
    sms_labels,
    test_size=0.2,
    random_state=42,
)

# Fit the vocabulary on the training messages only, so the test messages are
# encoded with a tokenizer that has never seen them.
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Every message is encoded to integer ids and padded at the end to this length.
max_sequence_length = 200


def _encode_and_pad(texts):
    """Return (token id sequences, fixed-length padded array) for *texts*."""
    token_ids = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(token_ids, maxlen=max_sequence_length, padding='post')
    return token_ids, padded


sequences_train, X_train_padded = _encode_and_pad(X_train)
sequences_test, X_test_padded = _encode_and_pad(X_test)


# Create and train the Perceptron using the project's Perceptron class
perceptron = Perceptron(learning_rate=0.01, epochs=100, activation_function='step')
perceptron.fit(X_train_padded, y_train)

# Make predictions on the test set.
# X_test_padded was already built earlier in this script with the tokenizer
# fitted on the training messages, so it is reused here instead of tokenizing
# and padding the same test messages a second time.
predictions = perceptron.predict(X_test_padded)

# Evaluate and print results: a per-class report followed by overall accuracy
print("Perceptron Classification Report:")
print(classification_report(y_test, predictions))
print("Perceptron Accuracy:", accuracy_score(y_test, predictions))


# Persist the trained Perceptron and the fitted tokenizer with pickle, so both
# can be loaded together later (the model first, then the tokenizer).
perceptron_model_path = 'spam_perceptron_model.pkl'
tokenizer_path = 'tokenizer_per.pkl'

for artifact, destination in (
    (perceptron, perceptron_model_path),
    (tokenizer, tokenizer_path),
):
    with open(destination, 'wb') as sink:
        pickle.dump(artifact, sink)