JanviMl committed · Commit 5094855 · verified · 1 Parent(s): 1612ccf

Update metrics.py

Files changed (1)
  1. metrics.py +71 -59
metrics.py CHANGED
@@ -1,68 +1,80 @@
 # metrics.py
-from model_loader import metrics_models
-import re
+from model_loader import classifier_model, metrics_models
+import torch
+import numpy as np
+import time

-def compute_semantic_similarity(original, paraphrased):
-    """
-    Compute semantic similarity between the original and paraphrased comment using Sentence-BERT.
-    Returns a similarity score between 0 and 1.
-    """
-    try:
-        sentence_bert = metrics_models.load_sentence_bert()
-        embeddings = sentence_bert.encode([original, paraphrased])
-        similarity = float(embeddings[0] @ embeddings[1].T)
-        return round(similarity, 2)
-    except Exception as e:
-        print(f"Error computing semantic similarity: {str(e)}")
-        return None
+def softmax(logits):
+    exp_logits = np.exp(logits - np.max(logits))
+    return exp_logits / exp_logits.sum()

-def compute_empathy_score(paraphrased):
+def compute_reward_scores(original, paraphrase):
     """
-    Compute an empathy score for the paraphrased comment.
-    Enhanced to consider positive sentiment and supportive language.
-    Returns a score between 0 and 1.
+    Compute reward scores for a paraphrased comment.
+    Returns a dictionary with empathy, toxicity, bias, hallucination, and reward scores.
     """
     try:
-        empathy_words = ["sorry", "understand", "care", "help", "support", "appreciate", "encourage", "positive"]
-        words = paraphrased.lower().split()
-        empathy_count = sum(1 for word in words if word in empathy_words)
-        # Normalize by length, cap at 1.0
-        score = min(empathy_count / max(len(words), 1) * 2, 1.0)  # Amplify for better sensitivity
-        return round(score, 2)
-    except Exception as e:
-        print(f"Error computing empathy score: {str(e)}")
-        return None
+        start_time = time.time()
+        print("Starting reward computation...")

-def compute_bias_score(paraphrased):
-    """
-    Compute a bias score for the paraphrased comment (placeholder).
-    Detects stereotypical or discriminatory language.
-    Returns a score between 0 and 1 (lower is less biased).
-    """
-    try:
-        bias_indicators = ["race", "gender", "religion", "stereotype", "discriminate", "bias"]
-        words = paraphrased.lower().split()
-        bias_count = sum(1 for word in words if word in bias_indicators)
-        score = bias_count / max(len(words), 1)
-        return round(score, 2)
-    except Exception as e:
-        print(f"Error computing bias score: {str(e)}")
-        return None
+        # Check if paraphrase is valid
+        if not isinstance(paraphrase, str) or "Error: Unable to generate paraphrase" in paraphrase:
+            print(f"Invalid paraphrase: {paraphrase}. Returning default scores.")
+            return {
+                "empathy": 0.0,
+                "toxicity": 1.0,
+                "bias": 1.0,
+                "hallucination": 1.0,
+                "reward": 0.0
+            }
+
+        # Classify the paraphrased comment
+        print("Starting classification...")
+        inputs = classifier_model.tokenizer(
+            paraphrase,
+            return_tensors="pt",
+            truncation=True,
+            padding=True,
+            max_length=512
+        ).to(classifier_model.device)
+
+        with torch.no_grad():
+            outputs = classifier_model.model(**inputs)
+        logits = outputs.logits.cpu().numpy()[0]
+        probs = softmax(logits)
+
+        toxicity = probs[1]  # Assuming label 1 is toxic
+        empathy = 1.0 - toxicity  # Simplified empathy score
+        bias = probs[1]  # Placeholder for bias
+        print(f"Classification took {time.time() - start_time:.2f} seconds")
+
+        # Compute semantic similarity using Sentence-BERT
+        print("Computing semantic similarity...")
+        sentence_bert = metrics_models.sentence_bert
+        embeddings = sentence_bert.encode([original, paraphrase], convert_to_tensor=True)
+        similarity = torch.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
+        hallucination = 1.0 - similarity  # High difference means potential hallucination
+        print(f"Semantic similarity computed: {similarity}")
+
+        # Compute reward score (weighted combination)
+        reward = 0.4 * empathy - 0.2 * toxicity - 0.2 * bias - 0.2 * hallucination
+        reward = max(0.0, min(1.0, reward))
+
+        print(f"Total processing time: {time.time() - start_time:.2f} seconds")
+        return {
+            "empathy": empathy,
+            "toxicity": toxicity,
+            "bias": bias,
+            "hallucination": hallucination,
+            "reward": reward
+        }

-def compute_hallucination_score(original, paraphrased):
-    """
-    Compute a hallucination score by checking factual consistency.
-    High score indicates deviation from original meaning.
-    Returns a score between 0 and 1 (lower is better).
-    """
-    try:
-        # Use semantic similarity as a proxy; low similarity suggests hallucination
-        similarity = compute_semantic_similarity(original, paraphrased)
-        if similarity is None:
-            return 0.5  # Default if similarity fails
-        # Inverse similarity scaled to penalize low similarity
-        score = max(0.0, (1.0 - similarity) * 0.5)
-        return round(score, 2)
     except Exception as e:
-        print(f"Error computing hallucination score: {str(e)}")
-        return None
+        print(f"Error in reward computation: {str(e)}")
+        return {
+            "empathy": 0.0,
+            "toxicity": 1.0,
+            "bias": 1.0,
+            "hallucination": 1.0,
+            "reward": 0.0
+        }
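
Reviewer note: a minimal sketch of how the new compute_reward_scores entry point could be exercised, assuming model_loader is importable and already exposes a loaded classifier_model (with .tokenizer, .model, .device) and metrics_models.sentence_bert, as the diff implies. The example strings and printed values below are illustrative only and are not part of this commit.

# sketch: exercising compute_reward_scores (hypothetical inputs)
from metrics import compute_reward_scores

original = "Your code is a mess, start over."
paraphrase = "This code has some issues; a few changes could make it much cleaner."

scores = compute_reward_scores(original, paraphrase)
# Expected shape of the result (actual values depend on the loaded models):
# {"empathy": ..., "toxicity": ..., "bias": ..., "hallucination": ..., "reward": ...}
print(scores)

With the weights used in the diff, a paraphrase scoring empathy 0.9, toxicity 0.1, bias 0.1, and hallucination 0.2 would receive reward = 0.4*0.9 - 0.2*0.1 - 0.2*0.1 - 0.2*0.2 = 0.28 before clamping to [0, 1].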