from dataclasses import dataclass

from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
)

# Five-way sentiment polarity scheme: strong/weak positive, weak/strong
# negative, and neutral.
LABEL2ID = {"SP": 0, "WP": 1, "WN": 2, "SN": 3, "NU": 4}


def label_mapper(label: str) -> int:
    """Map a polarity tag to its integer class id."""
    if label not in LABEL2ID:
        raise ValueError(f"Invalid label: {label}")
    return LABEL2ID[label]


def process(batch: dict, tokenizer: PreTrainedTokenizer) -> dict:
    """Tokenize a batch of texts and attach integer labels."""
    inputs = tokenizer(batch["Text"], truncation=True)
    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["labels"] = [label_mapper(label) for label in batch["Polarity"]]
    return batch


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1."""
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="macro", zero_division=0
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


def pipeline(args):
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, num_labels=5
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Tokenize and label the raw dataset, then carve out a held-out split.
    dataset = load_dataset(args.dataset_name)
    dataset = dataset.map(process, batched=True, fn_kwargs={"tokenizer": tokenizer})
    dataset = dataset["train"].train_test_split(test_size=args.split_ratio)
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    # Pad dynamically per batch rather than to a fixed maximum length.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results",
            learning_rate=args.learning_rate,
            per_device_train_batch_size=args.batch_size,
            per_device_eval_batch_size=args.batch_size,
            num_train_epochs=args.epochs,
            weight_decay=0.01,
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            report_to="none",
            save_steps=500,
            eval_steps=500,
            save_total_limit=1,
            logging_steps=500,
            fp16=args.fp16,
            greater_is_better=True,
            metric_for_best_model="f1",
        ),
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # Stop if macro F1 has not improved for 5 consecutive evaluations.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )

    trainer.train()
    print(trainer.evaluate())
    trainer.predict(test_dataset)

    # Push the fine-tuned model and its tokenizer to the Hub.
    tokenizer.push_to_hub(args.hub_location)
    model.push_to_hub(args.hub_location)


@dataclass
class Arguments:
    model_name: str = "csebuetnlp/banglabert"
    dataset_name: str = "SayedShaun/sentigold"
    split_ratio: float = 0.1
    batch_size: int = 128
    epochs: int = 40
    learning_rate: float = 1e-5
    fp16: bool = True
    hub_location: str = "SayedShaun/bangla-classifier-multiclass"


if __name__ == "__main__":
    args = Arguments()
    pipeline(args)
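

# Inference sketch (an assumption, not part of the original script): once the
# checkpoint has been pushed, it can be loaded back with the transformers
# text-classification pipeline. The repo id matches the default
# Arguments.hub_location; the input string is a placeholder. Note that because
# the model config above does not set id2label, predictions surface as
# LABEL_0..LABEL_4, which correspond to the LABEL2ID mapping at the top of
# this file.
#
# from transformers import pipeline as hf_pipeline
#
# classifier = hf_pipeline(
#     "text-classification",
#     model="SayedShaun/bangla-classifier-multiclass",
# )
# print(classifier("...example Bangla sentence..."))
# # -> [{"label": "LABEL_0", "score": ...}]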