import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Resolve data/output paths relative to this script so it runs from any CWD.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
data_files = {
    "train": os.path.join(BASE_DIR, "datasets", "train.csv"),
    "test": os.path.join(BASE_DIR, "datasets", "test.csv"),
}
output_dir = os.path.join(BASE_DIR, "outputs", "v2-deberta-100-max")

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Map the stance strings in the CSVs to integer class ids.
label_map = {"FAVOR": 0, "NONE": 1, "AGAINST": 2}

if torch.cuda.is_available():
    torch.cuda.empty_cache()


def tokenize(examples):
    # Convert string labels to class ids, then prepend the stance target to
    # the tweet so the model sees "<target> [SEP] <tweet>" as one sequence.
    examples["label"] = [label_map[label] for label in examples["label"]]
    examples["text"] = [
        examples["Target"][i] + " [SEP] " + text
        for i, text in enumerate(examples["text"])
    ]
    # No return_tensors here: Dataset.map stores plain lists, and the
    # Trainer's default collator converts each batch to tensors.
    return tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=100
    )


def load_split(path: str) -> Dataset:
    """Load one CSV split, drop unused columns, and tokenize it."""
    dataframe = pd.read_csv(path)
    dataframe = dataframe.drop(columns=["Opinion Towards", "Sentiment"])
    dataset = Dataset.from_pandas(dataframe)
    dataset = dataset.rename_column("Tweet", "text")
    dataset = dataset.rename_column("Stance", "label")
    return dataset.map(tokenize, batched=True)


train_ds = load_split(data_files["train"])
test_ds = load_split(data_files["test"])

model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base", num_labels=3
)

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    # eval_pred is a (logits, labels) pair; take the argmax over the class
    # dimension and score against the gold labels.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# Note: newer transformers releases rename evaluation_strategy to eval_strategy.
training_args = TrainingArguments(output_dir=output_dir, evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

print("TRAINING")
trainer.train()
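
# --- Optional post-training check (a minimal sketch, not part of the original
# pipeline; the "final" subdirectory name is an arbitrary choice here). Saves
# the fine-tuned weights plus tokenizer and re-scores the held-out test split.
final_dir = os.path.join(output_dir, "final")
trainer.save_model(final_dir)          # writes model weights + config
tokenizer.save_pretrained(final_dir)   # keep the tokenizer with the checkpoint

# trainer.predict runs a full evaluation pass and reports the compute_metrics
# results under a "test_" prefix (e.g. test_accuracy).
print(trainer.predict(test_ds).metrics)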