Update classifier.py
classifier.py (+38 -2)
CHANGED
@@ -3,7 +3,43 @@ import torch
 import time
 from model_loader import classifier_model
 from paraphraser import paraphrase_comment
-from metrics import compute_semantic_similarity, compute_empathy_score
+from metrics import compute_semantic_similarity, compute_empathy_score, compute_bias_score, compute_hallucination_score
+
+def compute_reward_scores(original, paraphrased):
+    """
+    Compute all reward scores for a paraphrase.
+    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
+    """
+    try:
+        # Get toxicity from classifier
+        _, _, _, toxicity_score, bias_score, _, _, _, _, paraphrased_toxicity_score, paraphrased_bias_score, _, _ = classify_toxic_comment(paraphrased)
+        toxicity = paraphrased_toxicity_score if paraphrased_toxicity_score is not None else 0.5
+
+        # Compute other metrics
+        empathy = compute_empathy_score(paraphrased) or 0.5
+        bias = compute_bias_score(paraphrased) or 0.5
+        hallucination = compute_hallucination_score(original, paraphrased) or 0.5
+
+        # Overall reward: weighted combination (adjust weights as needed)
+        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
+        reward = max(0.0, min(1.0, round(reward, 2)))
+
+        return {
+            "empathy": empathy,
+            "toxicity": toxicity,
+            "bias": bias,
+            "hallucination": hallucination,
+            "reward": reward
+        }
+    except Exception as e:
+        print(f"Error computing reward scores: {str(e)}")
+        return {
+            "empathy": 0.5,
+            "toxicity": 0.5,
+            "bias": 0.5,
+            "hallucination": 0.5,
+            "reward": 0.5
+        }
 
 def classify_toxic_comment(comment):
     """
@@ -15,7 +51,7 @@ def classify_toxic_comment(comment):
     print("Starting classification...")
 
     if not comment.strip():
-        return "Error: Please enter a comment.", None, None, None, None, None, None, None, None, None, None
+        return "Error: Please enter a comment.", None, None, None, None, None, None, None, None, None, None, None, None
 
     # Access the model and tokenizer
     model = classifier_model.model
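For context, here is a minimal usage sketch of the new compute_reward_scores helper. It is not part of this commit: the import path (classifier.py importable as the classifier module), the example comment strings, and the illustrative metric values are all assumptions made for the example.

# Hypothetical usage sketch (not part of this commit); import path, example
# strings, and the metric values in the comments are assumptions.
from classifier import compute_reward_scores

original = "You clearly have no idea what you are talking about."
paraphrased = "I think there may be a misunderstanding; could you explain your point again?"

# Returns a dict with empathy, toxicity, bias, hallucination, and reward keys.
scores = compute_reward_scores(original, paraphrased)
print(scores)

# If the metrics came back as empathy=0.9, toxicity=0.1, bias=0.1, hallucination=0.1,
# the weighted reward would be 0.4*0.9 - 0.2*0.1 - 0.2*0.1 - 0.2*0.1 = 0.30,
# which is then rounded to two decimals and clamped to the [0.0, 1.0] range.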