from model_loader import metrics_models
from classifier import classify_toxic_comment
import re


def compute_semantic_similarity(original, paraphrased):
    """
    Compute semantic similarity between the original and paraphrased comment using Sentence-BERT.
    Returns a similarity score between 0 and 1, or None on failure.
    """
    try:
        sentence_bert = metrics_models.load_sentence_bert()
        embeddings = sentence_bert.encode([original, paraphrased])
        # Normalise before the dot product so the result is a true cosine
        # similarity, then clamp to [0, 1] as the docstring promises.
        a, b = embeddings[0], embeddings[1]
        norm = (float(a @ a) ** 0.5) * (float(b @ b) ** 0.5)
        similarity = float(a @ b) / norm if norm else 0.0
        similarity = max(0.0, min(1.0, similarity))
        return round(similarity, 2)
    except Exception as e:
        print(f"Error computing semantic similarity: {str(e)}")
        return None
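
# Example (requires the Sentence-BERT model behind metrics_models.load_sentence_bert();
# the exact value depends on that model):
#   compute_semantic_similarity("You are wrong", "I think you may be mistaken")
#   -> a float in [0, 1], rounded to two decimals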


def compute_empathy_score(paraphrased):
    """
    Compute an empathy score for the paraphrased comment.
    Enhanced to consider positive sentiment and supportive language.
    Returns a score between 0 and 1, or None on failure.
    """
    try:
        empathy_words = ["sorry", "understand", "care", "help", "support", "appreciate", "encourage", "positive"]
        # Tokenise on word characters so trailing punctuation ("sorry,") still matches.
        words = re.findall(r"[a-z']+", paraphrased.lower())
        empathy_count = sum(1 for word in words if word in empathy_words)
        # Empathy-word density, scaled by 2 and capped at 1.0.
        score = min(empathy_count / max(len(words), 1) * 2, 1.0)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing empathy score: {str(e)}")
        return None
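
# Example (pure string heuristic, no model required):
#   compute_empathy_score("I understand and I am sorry")
#   -> 2 empathy words out of 6 -> min(2/6 * 2, 1.0) -> 0.67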


def compute_bias_score(paraphrased):
    """
    Compute a bias score for the paraphrased comment (placeholder keyword heuristic).
    Detects stereotypical or discriminatory language.
    Returns a score between 0 and 1 (lower is less biased), or None on failure.
    """
    try:
        bias_indicators = ["race", "gender", "religion", "stereotype", "discriminate", "bias"]
        words = re.findall(r"[a-z']+", paraphrased.lower())
        bias_count = sum(1 for word in words if word in bias_indicators)
        # Fraction of words that are bias indicators; already bounded to [0, 1].
        score = bias_count / max(len(words), 1)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing bias score: {str(e)}")
        return None
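
# Example (keyword heuristic only; it flags topic words, not actual bias):
#   compute_bias_score("that is a gender stereotype")
#   -> 2 indicator words out of 5 -> 0.4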


def compute_hallucination_score(original, paraphrased):
    """
    Compute a hallucination score by checking semantic consistency with the original.
    A high score indicates deviation from the original meaning.
    Returns a score between 0 and 1 (lower is better), or None on failure.
    """
    try:
        similarity = compute_semantic_similarity(original, paraphrased)
        if similarity is None:
            # Fall back to a neutral score when similarity could not be computed.
            return 0.5
        # Half of the semantic distance, so the score stays in [0, 0.5].
        score = max(0.0, (1.0 - similarity) * 0.5)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing hallucination score: {str(e)}")
        return None
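
# Example of the mapping (no model call shown here):
#   similarity 0.80 -> hallucination (1 - 0.80) * 0.5 = 0.10
#   similarity 0.20 -> hallucination (1 - 0.20) * 0.5 = 0.40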


def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # classify_toxic_comment returns a 13-tuple; only the paraphrased toxicity
        # score is needed here.
        _, _, _, toxicity_score, bias_score, _, _, _, _, paraphrased_toxicity_score, paraphrased_bias_score, _, _ = classify_toxic_comment(paraphrased)
        toxicity = paraphrased_toxicity_score if paraphrased_toxicity_score is not None else 0.5

        # Use `is not None` rather than `or` so a legitimate score of 0.0 is kept
        # instead of being silently replaced by the 0.5 fallback.
        empathy = compute_empathy_score(paraphrased)
        empathy = empathy if empathy is not None else 0.5
        bias = compute_bias_score(paraphrased)
        bias = bias if bias is not None else 0.5
        hallucination = compute_hallucination_score(original, paraphrased)
        hallucination = hallucination if hallucination is not None else 0.5

        # Weighted reward: empathy is rewarded, toxicity/bias/hallucination are
        # penalised, and the result is clamped to [0, 1].
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = round(max(0.0, min(1.0, reward)), 2)

        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }
    except Exception as e:
        print(f"Error computing reward scores: {str(e)}")
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5
        }
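

# Minimal usage sketch. Assumes the models behind model_loader and classifier
# can be loaded in this environment; the example strings are illustrative only.
if __name__ == "__main__":
    original_comment = "You clearly have no idea what you are talking about."
    paraphrased_comment = "I understand your point, but I see this differently."
    scores = compute_reward_scores(original_comment, paraphrased_comment)
    # Expected keys: empathy, toxicity, bias, hallucination, reward.
    print(scores)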