from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

model_name = "microsoft/Multilingual-MiniLM-L12-H384"

dataset = load_dataset("Goodmotion/spam-mail")
tokenizer = AutoTokenizer.from_pretrained(model_name)

def encode_labels(example):
    # Map the dataset's string labels to integer class ids.
    label_map = {"SPAM": 1, "NOSPAM": 0}
    example["label"] = label_map[example["label"]]
    return example


def tokenize_data(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Convert the string labels to integer ids first; Trainer expects numeric
# labels, so encode_labels must actually be applied, not just defined.
dataset = dataset.map(encode_labels)
tokenized_dataset = dataset.map(tokenize_data, batched=True)

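# Optional sanity check: the label should now be an int, and the tokenizer
# should have added input_ids / attention_mask columns alongside the text.
print(tokenized_dataset["train"][0]["label"])   # e.g. 0 or 1
print(tokenized_dataset["train"].column_names)
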
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Explicitly re-initialize the classification head. Note that from_pretrained
# already gives a fresh head a random init when num_labels is set, so this is
# redundant but harmless; it mainly underlines that the head starts untrained.
model.classifier.weight.data.normal_(mean=0.0, std=0.02)
model.classifier.bias.data.zero_()

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

trainer.train()

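# Optional spot check after training -- a sketch, not part of the original
# flow. It reuses a few training examples via trainer.predict, so it only
# shows that the model fits data it has already seen, not generalization.
import numpy as np

sample = tokenized_dataset["train"].select(range(8))
preds = trainer.predict(sample)
print(np.argmax(preds.predictions, axis=-1))  # predicted ids: 1 = SPAM, 0 = NOSPAM
print(sample["label"])                        # ground-truth ids for comparison
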
model.save_pretrained("./spam-classifier")
tokenizer.save_pretrained("./spam-classifier")

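# Usage sketch, assuming the training run above finished and wrote its files
# to ./spam-classifier. Because no id2label mapping was set on the config, the
# pipeline reports generic LABEL_0 / LABEL_1 names; under the encode_labels
# mapping above, LABEL_1 corresponds to SPAM and LABEL_0 to NOSPAM.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="./spam-classifier",
    tokenizer="./spam-classifier",
)

print(classifier("Congratulations, you won a free iPhone! Click here."))
# e.g. [{'label': 'LABEL_1', 'score': 0.99}]  (illustrative output)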