from typing import Any

import datasets
import evaluate
from sklearn.metrics import accuracy_score, f1_score

_DESCRIPTION = """
This metric computes the accuracy and the micro, macro, and weighted F1 scores of models on the ViHSD dataset from [A Large-scale Dataset for Hate Speech Detection on Vietnamese Social Media Texts](https://arxiv.org/abs/2103.11528) by Luu et al. (2021).
The ViHSD dataset is a large-scale dataset for hate speech detection on Vietnamese social media texts.
It contains over 30,000 comments, each labeled as CLEAN, OFFENSIVE, or HATE.
The dataset is used to evaluate the quality of hate speech detection models, including deep learning and transformer models.
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions: list of predicted labels. Each prediction is an integer class id
        corresponding to one of the ViHSD classes (CLEAN, OFFENSIVE, HATE).
    references: list of ground-truth labels, one integer class id per prediction.
Returns:
    "accuracy": Accuracy
    "micro_f1": Micro averaged F1 score
    "macro_f1": Macro averaged F1 score
    "weighted_f1": Weighted averaged F1 score
"""

_CITATION = """
@InProceedings{10.1007/978-3-030-79457-6_35,
    author="Luu, Son T.
    and Nguyen, Kiet Van
    and Nguyen, Ngan Luu-Thuy",
    editor="Fujita, Hamido
    and Selamat, Ali
    and Lin, Jerry Chun-Wei
    and Ali, Moonis",
    title="A Large-Scale Dataset for Hate Speech Detection on Vietnamese Social Media Texts",
    booktitle="Advances and Trends in Artificial Intelligence. Artificial Intelligence Practices",
    year="2021",
    publisher="Springer International Publishing",
    address="Cham",
    pages="415--426",
    abstract="In recent years, Vietnam witnesses the mass development of social network users on different social platforms such as Facebook, Youtube, Instagram, and Tiktok. On social media, hate speech has become a critical problem for social network users. To solve this problem, we introduce the ViHSD - a human-annotated dataset for automatically detecting hate speech on the social network. This dataset contains over 30,000 comments, each comment in the dataset has one of three labels: CLEAN, OFFENSIVE, or HATE. Besides, we introduce the data creation process for annotating and evaluating the quality of the dataset. Finally, we evaluate the dataset by deep learning and transformer models.",
    isbn="978-3-030-79457-6"
}
"""


def acc_and_f1(preds, labels):
    """Compute accuracy and micro/macro/weighted F1 for integer-encoded labels."""
    return {
        "accuracy": float(accuracy_score(y_true=labels, y_pred=preds)),
        "micro_f1": float(f1_score(y_true=labels, y_pred=preds, average="micro")),
        "macro_f1": float(f1_score(y_true=labels, y_pred=preds, average="macro")),
        "weighted_f1": float(f1_score(y_true=labels, y_pred=preds, average="weighted")),
    }

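
# A minimal sanity-check sketch of what `acc_and_f1` returns, assuming the three ViHSD
# classes are encoded as the integers 0, 1, and 2. The toy values below are illustrative,
# not taken from the dataset:
#
#     acc_and_f1(preds=[0, 1, 2, 0, 2], labels=[0, 1, 2, 0, 1])
#     # -> {"accuracy": 0.8, "micro_f1": 0.8, "macro_f1": ~0.78, "weighted_f1": 0.8}
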
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ViHSD(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            homepage="https://github.com/sonlam1102/vihsd",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("int64"),
                    "references": datasets.Value("int64"),
                }
            ),
            codebase_urls=["https://github.com/sonlam1102/vihsd"],
            reference_urls=[
                "https://github.com/sonlam1102/vihsd",
                "https://arxiv.org/abs/2103.11528",
            ],
            format="numpy",
        )

    def _compute(
        self,
        predictions: Any = None,
        references: Any = None,
        **kwargs: Any,
    ):
        """Return accuracy and micro/macro/weighted F1 for the given label ids."""
        return acc_and_f1(predictions, references)
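

# A minimal usage sketch, not part of the metric itself; it runs only when this module is
# executed directly. In normal use the same computation would go through `evaluate.load(...)`
# with this script's path or Hub id instead of instantiating ViHSD here.
if __name__ == "__main__":
    metric = ViHSD()
    # Toy integer class ids standing in for CLEAN/OFFENSIVE/HATE labels; they are
    # illustrative values, not taken from the dataset.
    results = metric.compute(
        predictions=[0, 1, 2, 0, 2],
        references=[0, 1, 2, 0, 1],
    )
    print(results)  # accuracy, micro_f1, macro_f1, weighted_f1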