|
import csv |
|
from typing import TypedDict |
|
import numpy as np |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer |
|
from datasets import load_dataset, Dataset |
|
import pandas as pd |
|
import evaluate |
|
import os |
|
import torch |
|
|
|
# --- Run configuration -------------------------------------------------
# All paths are resolved relative to this script's own directory.
_BASE = os.path.join(__file__, "..")

data_files = {
    split: os.path.realpath(os.path.join(_BASE, "datasets", f"{split}.csv"))
    for split in ("train", "test")
}

output_dir = os.path.realpath(os.path.join(_BASE, "outputs", "v2-deberta-100-max"))

# Shared tokenizer for the DeBERTa-v3 base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Stance string -> integer class id expected by the 3-way classification head.
label_map = {"FAVOR": 0, "NONE": 1, "AGAINST": 2}

# Release any cached GPU memory left over in this process before training.
torch.cuda.empty_cache()
|
|
|
def tokenize(examples):
    """Batched `datasets.map` callback: encode labels and tokenize tweets.

    Converts the string stance labels to class ids via the module-level
    `label_map`, prefixes each tweet with its target ("<Target> [SEP] <text>"),
    then tokenizes with fixed-length padding/truncation at 100 tokens using
    the shared `tokenizer`. The in-place mutation of `examples["label"]` is
    how the remapped labels survive `Dataset.map`.
    """
    examples["label"] = [label_map[stance] for stance in examples["label"]]
    examples["text"] = [
        f"{target} [SEP] {tweet}"
        for target, tweet in zip(examples["Target"], examples["text"])
    ]
    return tokenizer(
        examples["text"],
        padding="max_length",
        return_tensors='pt',
        truncation=True,
        max_length=100,
    )
|
|
|
|
|
def load_dataset(path: str) -> Dataset:
    """Read a stance-detection CSV and return a tokenized `Dataset`.

    Drops the unused "Opinion Towards" and "Sentiment" columns, renames
    "Tweet"/"Stance" to the "text"/"label" names `tokenize` expects, and
    applies `tokenize` in batched mode.

    NOTE(review): this function shadows `datasets.load_dataset` imported at
    the top of the file — consider renaming, but in-file callers use this name.
    """
    frame = pd.read_csv(path).drop(columns=["Opinion Towards", "Sentiment"])
    ds = Dataset.from_pandas(frame)
    ds = ds.rename_column('Tweet', 'text').rename_column("Stance", "label")
    return ds.map(tokenize, batched=True)
|
|
|
# Tokenized train/eval splits built from the local CSV files.
train_ds = load_dataset(data_files["train"])

test_ds = load_dataset(data_files["test"])



# DeBERTa-v3 encoder with a freshly initialized 3-way classification head
# (class count matches the FAVOR/NONE/AGAINST ids in `label_map`).
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base", num_labels=3)



# Hugging Face `evaluate` accuracy metric consumed by `compute_metrics`.
metric = evaluate.load("accuracy")
|
|
|
def compute_metrics(eval_pred):
    """Trainer metrics hook: accuracy of argmax class predictions.

    `eval_pred` unpacks to (logits, labels); the highest-scoring class per
    example is compared against the reference labels via the module-level
    `metric` (accuracy).
    """
    logits, references = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted, references=references)
|
|
|
# Default hyperparameters except the output location and per-epoch evaluation.
# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
# transformers releases — confirm against the pinned transformers version.
training_args = TrainingArguments(output_dir=output_dir, evaluation_strategy="epoch")



# Standard Trainer wiring: the test split doubles as the evaluation set.
trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=train_ds,

    eval_dataset=test_ds,

    compute_metrics=compute_metrics

)

print("TRAINING")



# Fine-tune; checkpoints and logs are written under `output_dir`.
trainer.train()