froggydood's picture
Initial model output
b53369f
import csv
from typing import TypedDict
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
import pandas as pd
import evaluate
import os
import torch
def _beside_script(relative):
    """Return the resolved absolute path of *relative*, starting from this script."""
    # Joining ".." onto __file__ steps from the script file up to its folder;
    # realpath then normalises the result.
    return os.path.realpath(os.path.join(__file__, "..", relative))


# CSV file for each split, located relative to this script.
data_files = {
    "train": _beside_script("./datasets/train.csv"),
    "test": _beside_script("./datasets/test.csv"),
}

# Directory handed to TrainingArguments as its output_dir.
output_dir = _beside_script("./outputs/v2-deberta-100-max")

# Tokenizer for the same checkpoint name the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Stance string -> integer class id (FAVOR=0, NONE=1, AGAINST=2).
label_map = {
    stance: class_id
    for class_id, stance in enumerate(("FAVOR", "NONE", "AGAINST"))
}

# Ask PyTorch to release cached CUDA memory before the datasets and model
# are loaded.
torch.cuda.empty_cache()
def tokenize(examples):
    """Encode one batch of examples for the classifier.

    The batch is modified in place: ``examples["label"]`` is replaced by the
    integer ids looked up in ``label_map``, and ``examples["text"]`` is
    replaced by ``"<Target> [SEP] <text>"`` for each row.

    Args:
        examples: Batch mapping with ``"label"``, ``"text"`` and ``"Target"``
            columns, each indexable by row position.

    Returns:
        The tokenizer output for the combined texts, padded and truncated to
        100 tokens and requested as PyTorch tensors.

    Raises:
        KeyError: If a label is not a key of ``label_map``.
    """
    # Labels first, so an unknown stance fails before any text is touched.
    examples["label"] = [label_map[stance] for stance in examples["label"]]

    targets = examples["Target"]
    combined = [
        targets[position] + " [SEP] " + tweet
        for position, tweet in enumerate(examples["text"])
    ]
    examples["text"] = combined

    return tokenizer(
        combined,
        padding="max_length",
        return_tensors='pt',
        truncation=True,
        max_length=100,
    )
def load_dataset(path: str) -> Dataset:
    """Read a stance CSV from *path* and return it as a tokenized ``Dataset``.

    The "Opinion Towards" and "Sentiment" columns are dropped, "Tweet" is
    renamed to "text" and "Stance" to "label", and ``tokenize`` is then
    applied over the dataset in batches.

    NOTE(review): this definition shadows the ``load_dataset`` imported from
    ``datasets`` at the top of the file; from here on the name refers to this
    function.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The dataset produced by mapping ``tokenize`` over the renamed columns.
    """
    frame = pd.read_csv(path)
    # Remove the columns that are not used further, one at a time.
    for unused_column in ("Opinion Towards", "Sentiment"):
        frame = frame.drop(unused_column, axis=1)

    renamed = (
        Dataset.from_pandas(frame)
        .rename_column('Tweet', 'text')
        .rename_column("Stance", "label")
    )
    return renamed.map(tokenize, batched=True)
# Load the train split, then the test split, through the CSV loader defined
# in this file.
train_ds, test_ds = (
    load_dataset(data_files[split]) for split in ("train", "test")
)

# Sequence classifier with a three-way output head.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base",
    num_labels=3,
)

# Accuracy metric consumed by compute_metrics.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level metric.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns when given the arg-max class of
        each logits row as predictions and ``labels`` as references.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)
# Evaluate once per epoch; every other training option keeps its default.
# NOTE(review): newer transformers releases appear to name this argument
# `eval_strategy` - confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
)

# Collect the Trainer wiring in one place; the test split is used as the
# evaluation set.
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": train_ds,
    "eval_dataset": test_ds,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_options)

print("TRAINING")
trainer.train()