from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
model_name = "microsoft/Multilingual-MiniLM-L12-H384"
dataset = load_dataset("Goodmotion/spam-mail")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# convert the string labels to the integer ids expected by the model
def encode_labels(data):
    label_map = {"SPAM": 1, "NOSPAM": 0}
    data["label"] = label_map[data["label"]]
    return data

dataset = dataset.map(encode_labels)
def tokenize_data(data):
    return tokenizer(
        data["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
# tokenize the dataset
tokenized_dataset = dataset.map(tokenize_data, batched=True)
# define the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)
# re-initialize the classification head so it starts from fresh random weights
model.classifier.weight.data.normal_(mean=0.0, std=0.02)
model.classifier.bias.data.zero_()
training_args = TrainingArguments(
    output_dir="./results",
    # learning rate for fine-tuning
    learning_rate=5e-5,
    # 16 examples per device
    per_device_train_batch_size=16,
    # pass over the training data 3 times
    num_train_epochs=3,
    # weight decay for regularization
    weight_decay=0.01,
    logging_dir='./logs'
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)
# train the model
trainer.train()
# save the model
model.save_pretrained("./spam-classifier")
# save the tokenizer
tokenizer.save_pretrained("./spam-classifier")
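# A minimal inference sketch (not part of the training run above): load the
# saved classifier and score a sample mail. The sample text is purely
# illustrative, and the label ids follow the mapping used earlier
# (0 = NOSPAM, 1 = SPAM).
import torch

clf_tokenizer = AutoTokenizer.from_pretrained("./spam-classifier")
clf_model = AutoModelForSequenceClassification.from_pretrained("./spam-classifier")
inputs = clf_tokenizer(
    "Congratulations, you won a free prize! Click here to claim it.",
    return_tensors="pt",
    truncation=True,
    max_length=128
)
with torch.no_grad():
    logits = clf_model(**inputs).logits
print("SPAM" if logits.argmax(dim=-1).item() == 1 else "NOSPAM")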