from model_loader import metrics_models
from classifier import classify_toxic_comment
import re


def compute_semantic_similarity(original, paraphrased):
    """
    Compute semantic similarity between the original and paraphrased comment using Sentence-BERT.
    Returns a similarity score between 0 and 1, or None on failure.
    """
    try:
        sentence_bert = metrics_models.load_sentence_bert()
        embeddings = sentence_bert.encode([original, paraphrased])
        # Normalise before the dot product so the result is a true cosine
        # similarity, then clamp to [0, 1] as the docstring promises.
        a, b = embeddings[0], embeddings[1]
        norm = (float(a @ a) ** 0.5) * (float(b @ b) ** 0.5)
        similarity = float(a @ b) / norm if norm else 0.0
        similarity = max(0.0, min(1.0, similarity))
        return round(similarity, 2)
    except Exception as e:
        print(f"Error computing semantic similarity: {str(e)}")
        return None
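
# Example (requires the Sentence-BERT model behind metrics_models.load_sentence_bert();
# the exact value depends on that model):
#   compute_semantic_similarity("You are wrong", "I think you may be mistaken")
#   -> a float in [0, 1], rounded to two decimals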


def compute_empathy_score(paraphrased):
    """
    Compute an empathy score for the paraphrased comment.
    Enhanced to consider positive sentiment and supportive language.
    Returns a score between 0 and 1, or None on failure.
    """
    try:
        empathy_words = ["sorry", "understand", "care", "help", "support", "appreciate", "encourage", "positive"]
        # Tokenise on word characters so trailing punctuation ("sorry,") still matches.
        words = re.findall(r"[a-z']+", paraphrased.lower())
        empathy_count = sum(1 for word in words if word in empathy_words)
        # Empathy-word density, scaled by 2 and capped at 1.0.
        score = min(empathy_count / max(len(words), 1) * 2, 1.0)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing empathy score: {str(e)}")
        return None
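
# Example (pure string heuristic, no model required):
#   compute_empathy_score("I understand and I am sorry")
#   -> 2 empathy words out of 6 -> min(2/6 * 2, 1.0) -> 0.67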


def compute_bias_score(paraphrased):
    """
    Compute a bias score for the paraphrased comment (placeholder keyword heuristic).
    Detects stereotypical or discriminatory language.
    Returns a score between 0 and 1 (lower is less biased), or None on failure.
    """
    try:
        bias_indicators = ["race", "gender", "religion", "stereotype", "discriminate", "bias"]
        words = re.findall(r"[a-z']+", paraphrased.lower())
        bias_count = sum(1 for word in words if word in bias_indicators)
        # Fraction of words that are bias indicators; already bounded to [0, 1].
        score = bias_count / max(len(words), 1)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing bias score: {str(e)}")
        return None
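
# Example (keyword heuristic only; it flags topic words, not actual bias):
#   compute_bias_score("that is a gender stereotype")
#   -> 2 indicator words out of 5 -> 0.4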


def compute_hallucination_score(original, paraphrased):
    """
    Compute a hallucination score by checking semantic consistency with the original.
    A high score indicates deviation from the original meaning.
    Returns a score between 0 and 1 (lower is better), or None on failure.
    """
    try:
        similarity = compute_semantic_similarity(original, paraphrased)
        if similarity is None:
            # Fall back to a neutral score when similarity could not be computed.
            return 0.5
        # Half of the semantic distance, so the score stays in [0, 0.5].
        score = max(0.0, (1.0 - similarity) * 0.5)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing hallucination score: {str(e)}")
        return None
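
# Example of the mapping (no model call shown here):
#   similarity 0.80 -> hallucination (1 - 0.80) * 0.5 = 0.10
#   similarity 0.20 -> hallucination (1 - 0.20) * 0.5 = 0.40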


def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # classify_toxic_comment returns a 13-tuple; only the paraphrased toxicity
        # score is needed here.
        _, _, _, toxicity_score, bias_score, _, _, _, _, paraphrased_toxicity_score, paraphrased_bias_score, _, _ = classify_toxic_comment(paraphrased)
        toxicity = paraphrased_toxicity_score if paraphrased_toxicity_score is not None else 0.5

        # Use `is not None` rather than `or` so a legitimate score of 0.0 is kept
        # instead of being silently replaced by the 0.5 fallback.
        empathy = compute_empathy_score(paraphrased)
        empathy = empathy if empathy is not None else 0.5
        bias = compute_bias_score(paraphrased)
        bias = bias if bias is not None else 0.5
        hallucination = compute_hallucination_score(original, paraphrased)
        hallucination = hallucination if hallucination is not None else 0.5

        # Weighted reward: empathy is rewarded, toxicity/bias/hallucination are
        # penalised, and the result is clamped to [0, 1].
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = round(max(0.0, min(1.0, reward)), 2)

        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }
    except Exception as e:
        print(f"Error computing reward scores: {str(e)}")
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5
        }
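

# Minimal usage sketch. Assumes the models behind model_loader and classifier
# can be loaded in this environment; the example strings are illustrative only.
if __name__ == "__main__":
    original_comment = "You clearly have no idea what you are talking about."
    paraphrased_comment = "I understand your point, but I see this differently."
    scores = compute_reward_scores(original_comment, paraphrased_comment)
    # Expected keys: empathy, toxicity, bias, hallucination, reward.
    print(scores)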