# metrics.py
from model_loader import metrics_models
from classifier import classify_toxic_comment
import numpy as np

def compute_semantic_similarity(original, paraphrased):
    """
    Compute semantic similarity between the original and paraphrased comment using Sentence-BERT.
    Returns a similarity score between 0 and 1.
    """
    try:
        sentence_bert = metrics_models.load_sentence_bert()
        embeddings = sentence_bert.encode([original, paraphrased])
        # Cosine similarity of the two embeddings, clamped to [0, 1] so the
        # result matches the documented range (a raw dot product is unbounded).
        cosine = float(np.dot(embeddings[0], embeddings[1]))
        cosine /= (np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]) + 1e-12)
        similarity = max(0.0, min(1.0, cosine))
        return round(similarity, 2)
    except Exception as e:
        print(f"Error computing semantic similarity: {str(e)}")
        return None

def compute_empathy_score(paraphrased):
    """
    Compute an empathy score for the paraphrased comment.
    Enhanced to consider positive sentiment and supportive language.
    Returns a score between 0 and 1.
    """
    try:
        empathy_words = ["sorry", "understand", "care", "help", "support", "appreciate", "encourage", "positive"]
        words = paraphrased.lower().split()
        empathy_count = sum(1 for word in words if word in empathy_words)
        # Normalize by length, cap at 1.0
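        # Example (illustrative): 2 empathy words in a 10-word comment -> min(2/10 * 2, 1.0) = 0.4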
        score = min(empathy_count / max(len(words), 1) * 2, 1.0)  # Amplify for better sensitivity
        return round(score, 2)
    except Exception as e:
        print(f"Error computing empathy score: {str(e)}")
        return None

def compute_bias_score(paraphrased):
    """
    Compute a bias score for the paraphrased comment (placeholder).
    Detects stereotypical or discriminatory language.
    Returns a score between 0 and 1 (lower is less biased).
    """
    try:
        bias_indicators = ["race", "gender", "religion", "stereotype", "discriminate", "bias"]
        words = paraphrased.lower().split()
        bias_count = sum(1 for word in words if word in bias_indicators)
        score = bias_count / max(len(words), 1)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing bias score: {str(e)}")
        return None

def compute_hallucination_score(original, paraphrased):
    """
    Compute a hallucination score by checking factual consistency.
    High score indicates deviation from original meaning.
    Returns a score between 0 and 1 (lower is better).
    """
    try:
        # Use semantic similarity as a proxy; low similarity suggests hallucination
        similarity = compute_semantic_similarity(original, paraphrased)
        if similarity is None:
            return 0.5  # Default if similarity fails
        # Inverse similarity scaled to penalize low similarity
        score = max(0.0, (1.0 - similarity) * 0.5)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing hallucination score: {str(e)}")
        return None

def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # Get toxicity from classifier
        _, _, _, toxicity_score, bias_score, _, _, _, _, paraphrased_toxicity_score, paraphrased_bias_score, _, _ = classify_toxic_comment(paraphrased)
        toxicity = paraphrased_toxicity_score if paraphrased_toxicity_score is not None else 0.5

        # Compute other metrics; fall back to a neutral 0.5 only when a metric
        # actually failed (returned None), since `or 0.5` would also overwrite
        # a legitimate 0.0 score.
        empathy = compute_empathy_score(paraphrased)
        empathy = 0.5 if empathy is None else empathy
        bias = compute_bias_score(paraphrased)
        bias = 0.5 if bias is None else bias
        hallucination = compute_hallucination_score(original, paraphrased)
        hallucination = 0.5 if hallucination is None else hallucination

        # Overall reward: Weighted combination (adjust weights as needed)
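        # Example (illustrative): empathy=0.8, toxicity=0.1, bias=0.0, hallucination=0.2
        #   -> reward = 0.4*0.8 - 0.2*0.1 - 0.2*0.0 - 0.2*0.2 = 0.26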
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = max(0.0, min(1.0, round(reward, 2)))

        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }
    except Exception as e:
        print(f"Error computing reward scores: {str(e)}")
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5
        }
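
# --- Usage sketch (illustrative, not part of the original module) ---
# Assumes metrics_models and classify_toxic_comment are importable as above
# and that their models can be loaded; the comment strings are placeholder
# examples, not real data.
if __name__ == "__main__":
    original = "This is a terrible idea and you should feel bad."
    paraphrased = "I understand the concern, but I think this idea needs more support."
    print("similarity:", compute_semantic_similarity(original, paraphrased))
    print("scores:", compute_reward_scores(original, paraphrased))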