toxic-comment-classifier_rlhf

Paused

App Files Files Community

JanviMl committed on Apr 19

Commit

7477097

verified ·

1 Parent(s): 7923692

Update metrics.py

Browse files

Files changed (1) hide show

metrics.py +78 -5

metrics.py CHANGED Viewed

@@ -1,5 +1,7 @@
 # metrics.py
 from model_loader import metrics_models
 def compute_semantic_similarity(original, paraphrased):
     """
@@ -17,16 +19,87 @@ def compute_semantic_similarity(original, paraphrased):
 def compute_empathy_score(paraphrased):
     """
-    Compute an empathy score for the paraphrased comment (placeholder).
     Returns a score between 0 and 1.
     """
     try:
-        # Placeholder: Compute empathy based on word presence (e.g., "sorry", "understand")
-        empathy_words = ["sorry", "understand", "care", "help", "support"]
         words = paraphrased.lower().split()
         empathy_count = sum(1 for word in words if word in empathy_words)
-        score = empathy_count / len(words) if words else 0
         return round(score, 2)
     except Exception as e:
         print(f"Error computing empathy score: {str(e)}")
-        return None

 # metrics.py
 from model_loader import metrics_models
+from classifier import classify_toxic_comment
+import re
 def compute_semantic_similarity(original, paraphrased):
     """
 def compute_empathy_score(paraphrased):
     """
     Compute an empathy score for the paraphrased comment.

     The text is lower-cased and split on whitespace; punctuation attached
     to the start or end of a token is trimmed so that "sorry," or "help!"
     still count. The score is the share of tokens found in a fixed list of
     supportive words, doubled for sensitivity and capped at 1.0.

     Args:
         paraphrased: The paraphrased comment text.

     Returns:
         A float between 0 and 1 rounded to two decimals (0.0 for empty
         text), or None if the input cannot be processed (the error is
         printed rather than raised).
     """
     try:
         empathy_words = {
             "sorry", "understand", "care", "help",
             "support", "appreciate", "encourage", "positive",
         }
         # Trim leading/trailing non-word characters from each token; a bare
         # split() would leave "sorry," unmatched against the word list.
         trimmed = (re.sub(r"^\W+|\W+$", "", raw) for raw in paraphrased.lower().split())
         # Tokens that were pure punctuation become empty and are not words.
         words = [token for token in trimmed if token]
         empathy_count = sum(1 for word in words if word in empathy_words)
         # Normalize by length (guarding against empty text), amplify x2 for
         # better sensitivity, and cap at 1.0.
         score = min(empathy_count / max(len(words), 1) * 2, 1.0)
         return round(score, 2)
     except Exception as e:
         # Best-effort metric: report the problem and signal failure with None.
         print(f"Error computing empathy score: {str(e)}")
         return None
def compute_bias_score(paraphrased):
    """
    Compute a bias score for the paraphrased comment (placeholder heuristic).

    The text is lower-cased and split on whitespace; punctuation attached
    to the start or end of a token is trimmed so that "race," or "gender."
    still count. The score is the share of tokens found in a fixed list of
    bias-indicator words.

    Args:
        paraphrased: The paraphrased comment text.

    Returns:
        A float between 0 and 1 rounded to two decimals (lower is less
        biased, 0.0 for empty text), or None if the input cannot be
        processed (the error is printed rather than raised).
    """
    try:
        bias_indicators = {
            "race", "gender", "religion",
            "stereotype", "discriminate", "bias",
        }
        # Trim leading/trailing non-word characters from each token; a bare
        # split() would leave "race," unmatched against the indicator list.
        trimmed = (re.sub(r"^\W+|\W+$", "", raw) for raw in paraphrased.lower().split())
        # Tokens that were pure punctuation become empty and are not words.
        words = [token for token in trimmed if token]
        bias_count = sum(1 for word in words if word in bias_indicators)
        # max(..., 1) avoids division by zero on empty text.
        score = bias_count / max(len(words), 1)
        return round(score, 2)
    except Exception as e:
        # Best-effort metric: report the problem and signal failure with None.
        print(f"Error computing bias score: {str(e)}")
        return None
def compute_hallucination_score(original, paraphrased):
    """
    Estimate how far the paraphrase drifts from the original comment.

    Semantic similarity is used as a stand-in for factual consistency: the
    score is half of (1 - similarity), floored at 0, so a paraphrase that
    stays close to the original scores near 0 (lower is better).

    Args:
        original: The original comment text.
        paraphrased: The paraphrased comment text.

    Returns:
        The score rounded to two decimals; 0.5 when the similarity helper
        returns None; None if an exception occurs (the error is printed).
        The result stays at or below 1 provided similarity is not below -1.
    """
    try:
        closeness = compute_semantic_similarity(original, paraphrased)
        if closeness is None:
            # Similarity could not be computed: fall back to a neutral value.
            return 0.5
        # Invert the similarity and halve it so low similarity is penalized.
        drift = (1.0 - closeness) * 0.5
        floored = max(0.0, drift)
        return round(floored, 2)
    except Exception as e:
        print(f"Error computing hallucination score: {str(e)}")
        return None
def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.

    Toxicity comes from classify_toxic_comment; empathy, bias and
    hallucination come from the metric helpers in this module. Any metric
    that is unavailable (None) is replaced by the neutral value 0.5. A
    genuine score of 0.0 is kept as-is: it is a valid result, not a missing
    one.

    Args:
        original: The original comment text.
        paraphrased: The paraphrased comment text.

    Returns:
        A dict with keys "empathy", "toxicity", "bias", "hallucination" and
        "reward". The reward is 0.4*empathy - 0.2*toxicity - 0.2*bias
        - 0.2*hallucination, rounded to two decimals and clamped to [0, 1].
        If anything raises, the error is printed and every entry is 0.5.
    """
    neutral = 0.5

    def _or_neutral(value):
        # Only None means "missing"; `value or neutral` would wrongly turn a
        # legitimate 0.0 score into 0.5.
        return neutral if value is None else value

    try:
        # The classifier returns a 13-item tuple; only the tenth item is
        # used here. Unpacking the full tuple keeps the arity check.
        # NOTE(review): the classifier is called on the paraphrase and its
        # "paraphrased" toxicity field is read - confirm this is the score
        # of `paraphrased` itself and not of a further paraphrase.
        (_, _, _, _, _, _, _, _, _,
         paraphrased_toxicity_score, _, _, _) = classify_toxic_comment(paraphrased)
        toxicity = _or_neutral(paraphrased_toxicity_score)

        # Compute the other metrics, substituting the neutral value only
        # when a helper reports failure.
        empathy = _or_neutral(compute_empathy_score(paraphrased))
        bias = _or_neutral(compute_bias_score(paraphrased))
        hallucination = _or_neutral(compute_hallucination_score(original, paraphrased))

        # Overall reward: weighted combination (adjust weights as needed).
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = max(0.0, min(1.0, round(reward, 2)))
        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward,
        }
    except Exception as e:
        # Best-effort: never propagate; hand back neutral scores instead.
        print(f"Error computing reward scores: {str(e)}")
        return {
            "empathy": neutral,
            "toxicity": neutral,
            "bias": neutral,
            "hallucination": neutral,
            "reward": neutral,
        }