# spam-mail-classifier / training.py
# ipatate
# add files
# 781432b
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
# Checkpoint identifier handed to the tokenizer loader below.
model_name = "microsoft/Multilingual-MiniLM-L12-H384"
# Load the "Goodmotion/spam-mail" dataset; later code reads its "train" split
# and the "text" field of its examples.
dataset = load_dataset("Goodmotion/spam-mail")
# Tokenizer loaded from the checkpoint named by model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Convert string labels into integer class ids.
def encode_labels(data):
    """Replace the string label of one example with its integer class id.

    Args:
        data: A single example (a mutable mapping) whose ``"label"`` value is
            ``"SPAM"`` or ``"NOSPAM"``.

    Returns:
        The same ``data`` object, updated in place, with ``"label"`` set to
        ``1`` for ``"SPAM"`` and ``0`` for ``"NOSPAM"``.

    Raises:
        KeyError: If ``"label"`` is missing or holds an unrecognized value.
    """
    label_map = {"SPAM": 1, "NOSPAM": 0}
    label = data["label"]
    if label not in label_map:
        # Still a KeyError (as a plain dict lookup would raise), but with a
        # message that names the accepted values.
        raise KeyError(
            f"unknown label {label!r}; expected one of {sorted(label_map)}"
        )
    data["label"] = label_map[label]
    return data
def tokenize_data(data):
    """Tokenize the ``"text"`` field of ``data`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"``, truncation enabled
    and ``max_length=128``, and its result is returned unchanged.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    texts = data["text"]
    return tokenizer(texts, **tokenizer_options)
# Encode the labels, then tokenize the dataset.
# encode_labels was defined but never applied, so the "label" column would
# reach the Trainer un-encoded. It handles one example at a time, hence no
# batched=True on the first map.
encoded_dataset = dataset.map(encode_labels)
tokenized_dataset = encoded_dataset.map(tokenize_data, batched=True)
# define the model
# Reuse model_name (instead of repeating the literal) so the model and the
# tokenizer are always loaded from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    # two output classes
    num_labels=2,
)
# Re-initialize the classification head in place: weights drawn from a normal
# distribution (mean 0.0, std 0.02), bias set to zero.
model.classifier.weight.data.normal_(mean=0.0, std=0.02)
model.classifier.bias.data.zero_()
# Hyperparameters and output locations handed to the Trainer.
training_args = TrainingArguments(
    # output and logging directories
    output_dir="./results",
    logging_dir="./logs",
    # optimizer settings: learning rate and weight decay
    learning_rate=5e-5,
    weight_decay=0.01,
    # 3 epochs, with a batch of 16 examples per device
    num_train_epochs=3,
    per_device_train_batch_size=16,
)
# Build the Trainer on the tokenized "train" split, then run training.
train_split = tokenized_dataset["train"]
trainer = Trainer(model=model, args=training_args, train_dataset=train_split)
trainer.train()
# Save the model first, then the tokenizer, into the same directory.
save_dir = "./spam-classifier"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)