toxic-comment-classifier_rlhf

Paused

App Files Community

JanviMl committed on Mar 25

Commit

59e622f

verified ·

1 Parent(s): af98023

Update classifier.py

Browse files

Files changed (1) hide show

classifier.py +43 -12

classifier.py CHANGED Viewed

@@ -1,21 +1,23 @@
 # classifier.py
 import torch
-from model_loader import model, tokenizer
 def classify_toxic_comment(comment):
     """
     Classify a comment as toxic or non-toxic using the fine-tuned XLM-RoBERTa model.
-    Returns the prediction label, confidence, color, toxicity score, and bias score for UI display.
     """
     if not comment.strip():
-        return "Error: Please enter a comment.", None, None, None, None
     # Tokenize the input comment
-    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
     # Run inference
     with torch.no_grad():
-        outputs = model(**inputs)
         logits = outputs.logits
     # Get the predicted class (0 = non-toxic, 1 = toxic)
@@ -24,14 +26,43 @@ def classify_toxic_comment(comment):
     confidence = torch.softmax(logits, dim=1)[0][predicted_class].item()
     label_color = "red" if label == "Toxic" else "green"
-    # Simulate Toxicity Score (in a real scenario, use a model like Detoxify)
-    # For now, we'll approximate it based on the confidence of the toxic class
-    toxicity_score = torch.softmax(logits, dim=1)[0][1].item()  # Probability of toxic class
     toxicity_score = round(toxicity_score, 2)
-    # Simulate Bias Score (in a real scenario, use a bias detection model like WEAT)
-    # For now, we'll use a placeholder value (since the example comment is non-toxic)
-    bias_score = 0.01 if label == "Non-Toxic" else 0.15  # Placeholder logic
     bias_score = round(bias_score, 2)
-    return f"Prediction: {label}", confidence, label_color, toxicity_score, bias_score

 # classifier.py
 import torch
+from model_loader import classifier_model, classifier_tokenizer
+from paraphraser import paraphrase_comment
 def classify_toxic_comment(comment):
     """
     Classify a comment as toxic or non-toxic using the fine-tuned XLM-RoBERTa model.
+    If toxic, paraphrase the comment and re-evaluate.
+    Returns the prediction label, confidence, color, toxicity score, bias score, paraphrased comment (if applicable), and its metrics.
     """
     if not comment.strip():
+        return "Error: Please enter a comment.", None, None, None, None, None, None, None, None, None
     # Tokenize the input comment
+    inputs = classifier_tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
     # Run inference
     with torch.no_grad():
+        outputs = classifier_model(**inputs)
         logits = outputs.logits
     # Get the predicted class (0 = non-toxic, 1 = toxic)
     confidence = torch.softmax(logits, dim=1)[0][predicted_class].item()
     label_color = "red" if label == "Toxic" else "green"
+    # Compute Toxicity Score (approximated as the probability of the toxic class)
+    toxicity_score = torch.softmax(logits, dim=1)[0][1].item()
     toxicity_score = round(toxicity_score, 2)
+    # Simulate Bias Score (placeholder)
+    bias_score = 0.01 if label == "Non-Toxic" else 0.15
     bias_score = round(bias_score, 2)
+    # If the comment is toxic, paraphrase it
+    paraphrased_comment = None
+    paraphrased_prediction = None
+    paraphrased_confidence = None
+    paraphrased_color = None
+    paraphrased_toxicity_score = None
+    paraphrased_bias_score = None
+    if label == "Toxic":
+        # Paraphrase the comment
+        paraphrased_comment = paraphrase_comment(comment)
+        # Re-evaluate the paraphrased comment
+        paraphrased_inputs = classifier_tokenizer(paraphrased_comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
+        with torch.no_grad():
+            paraphrased_outputs = classifier_model(**paraphrased_inputs)
+            paraphrased_logits = paraphrased_outputs.logits
+        paraphrased_predicted_class = torch.argmax(paraphrased_logits, dim=1).item()
+        paraphrased_label = "Toxic" if paraphrased_predicted_class == 1 else "Non-Toxic"
+        paraphrased_confidence = torch.softmax(paraphrased_logits, dim=1)[0][paraphrased_predicted_class].item()
+        paraphrased_color = "red" if paraphrased_label == "Toxic" else "green"
+        paraphrased_toxicity_score = torch.softmax(paraphrased_logits, dim=1)[0][1].item()
+        paraphrased_toxicity_score = round(paraphrased_toxicity_score, 2)
+        paraphrased_bias_score = 0.01 if paraphrased_label == "Non-Toxic" else 0.15  # Placeholder
+        paraphrased_bias_score = round(paraphrased_bias_score, 2)
+    return (
+        f"Prediction: {label}", confidence, label_color, toxicity_score, bias_score,
+        paraphrased_comment, f"Prediction: {paraphrased_label}" if paraphrased_comment else None,
+        paraphrased_confidence, paraphrased_color, paraphrased_toxicity_score, paraphrased_bias_score
+    )