Update classifier.py
classifier.py (+38 -2)
CHANGED
@@ -3,7 +3,43 @@ import torch
 import time
 from model_loader import classifier_model
 from paraphraser import paraphrase_comment
-from metrics import compute_semantic_similarity, compute_empathy_score
+from metrics import compute_semantic_similarity, compute_empathy_score, compute_bias_score, compute_hallucination_score
+
+def compute_reward_scores(original, paraphrased):
+    """
+    Compute all reward scores for a paraphrase.
+    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
+    """
+    try:
+        # Get toxicity from classifier
+        _, _, _, toxicity_score, bias_score, _, _, _, _, paraphrased_toxicity_score, paraphrased_bias_score, _, _ = classify_toxic_comment(paraphrased)
+        toxicity = paraphrased_toxicity_score if paraphrased_toxicity_score is not None else 0.5
+
+        # Compute other metrics
+        empathy = compute_empathy_score(paraphrased) or 0.5
+        bias = compute_bias_score(paraphrased) or 0.5
+        hallucination = compute_hallucination_score(original, paraphrased) or 0.5
+
+        # Overall reward: weighted combination (adjust weights as needed)
+        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
+        reward = max(0.0, min(1.0, round(reward, 2)))
+
+        return {
+            "empathy": empathy,
+            "toxicity": toxicity,
+            "bias": bias,
+            "hallucination": hallucination,
+            "reward": reward
+        }
+    except Exception as e:
+        print(f"Error computing reward scores: {str(e)}")
+        return {
+            "empathy": 0.5,
+            "toxicity": 0.5,
+            "bias": 0.5,
+            "hallucination": 0.5,
+            "reward": 0.5
+        }
 
 def classify_toxic_comment(comment):
     """
@@ -15,7 +51,7 @@ def classify_toxic_comment(comment):
     print("Starting classification...")
 
     if not comment.strip():
-        return "Error: Please enter a comment.", None, None, None, None, None, None, None, None, None, None
+        return "Error: Please enter a comment.", None, None, None, None, None, None, None, None, None, None, None, None
 
     # Access the model and tokenizer
     model = classifier_model.model
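For context, here is a minimal usage sketch of the new compute_reward_scores helper. It is not part of this commit: the import path (classifier.py importable as the classifier module), the example comment strings, and the illustrative metric values are all assumptions made for the example.

# Hypothetical usage sketch (not part of this commit); import path, example
# strings, and the metric values in the comments are assumptions.
from classifier import compute_reward_scores

original = "You clearly have no idea what you are talking about."
paraphrased = "I think there may be a misunderstanding; could you explain your point again?"

# Returns a dict with empathy, toxicity, bias, hallucination, and reward keys.
scores = compute_reward_scores(original, paraphrased)
print(scores)

# If the metrics came back as empathy=0.9, toxicity=0.1, bias=0.1, hallucination=0.1,
# the weighted reward would be 0.4*0.9 - 0.2*0.1 - 0.2*0.1 - 0.2*0.1 = 0.30,
# which is then rounded to two decimals and clamped to the [0.0, 1.0] range.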