# metrics.py
from model_loader import metrics_models
from classifier import classify_toxic_comment


def compute_semantic_similarity(original, paraphrased):
    """
    Compute semantic similarity between the original and paraphrased comment using Sentence-BERT.
    Returns a similarity score between 0 and 1.
    """
    try:
        sentence_bert = metrics_models.load_sentence_bert()
        embeddings = sentence_bert.encode([original, paraphrased])
        # Dot product of the two embeddings; this assumes the model returns normalized
        # vectors, so clamp to [0, 1] to honor the documented range either way.
        similarity = float(embeddings[0] @ embeddings[1].T)
        similarity = max(0.0, min(1.0, similarity))
        return round(similarity, 2)
    except Exception as e:
        print(f"Error computing semantic similarity: {str(e)}")
        return None


def compute_empathy_score(paraphrased):
    """
    Compute an empathy score for the paraphrased comment.
    Enhanced to consider positive sentiment and supportive language.
    Returns a score between 0 and 1.
    """
    try:
        empathy_words = ["sorry", "understand", "care", "help", "support",
                         "appreciate", "encourage", "positive"]
        words = paraphrased.lower().split()
        empathy_count = sum(1 for word in words if word in empathy_words)
        # Normalize by length and amplify for better sensitivity, capping at 1.0
        score = min(empathy_count / max(len(words), 1) * 2, 1.0)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing empathy score: {str(e)}")
        return None


def compute_bias_score(paraphrased):
    """
    Compute a bias score for the paraphrased comment (placeholder).
    Detects stereotypical or discriminatory language.
    Returns a score between 0 and 1 (lower is less biased).
    """
    try:
        bias_indicators = ["race", "gender", "religion", "stereotype", "discriminate", "bias"]
        words = paraphrased.lower().split()
        bias_count = sum(1 for word in words if word in bias_indicators)
        score = bias_count / max(len(words), 1)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing bias score: {str(e)}")
        return None


def compute_hallucination_score(original, paraphrased):
    """
    Compute a hallucination score by checking factual consistency.
    A high score indicates deviation from the original meaning.
    Returns a score between 0 and 1 (lower is better).
    """
    try:
        # Use semantic similarity as a proxy; low similarity suggests hallucination
        similarity = compute_semantic_similarity(original, paraphrased)
        if similarity is None:
            return 0.5  # Default if similarity computation fails
        # Inverse similarity, scaled to penalize low similarity
        score = max(0.0, (1.0 - similarity) * 0.5)
        return round(score, 2)
    except Exception as e:
        print(f"Error computing hallucination score: {str(e)}")
        return None


def compute_reward_scores(original, paraphrased):
    """
    Compute all reward scores for a paraphrase.
    Returns a dictionary with empathy, toxicity, bias, hallucination, and overall reward.
    """
    try:
        # Get toxicity from the classifier; only the paraphrased toxicity score
        # is used here out of the 13-value return tuple.
        (_, _, _, toxicity_score, bias_score, _, _, _, _,
         paraphrased_toxicity_score, paraphrased_bias_score, _, _) = classify_toxic_comment(paraphrased)
        toxicity = paraphrased_toxicity_score if paraphrased_toxicity_score is not None else 0.5

        # Compute the other metrics, falling back to a neutral 0.5 only when a
        # metric actually fails (returns None); a legitimate 0.0 score is kept.
        empathy = compute_empathy_score(paraphrased)
        empathy = empathy if empathy is not None else 0.5
        bias = compute_bias_score(paraphrased)
        bias = bias if bias is not None else 0.5
        hallucination = compute_hallucination_score(original, paraphrased)
        hallucination = hallucination if hallucination is not None else 0.5

        # Overall reward: weighted combination, clamped to [0, 1] (adjust weights as needed)
        reward = (0.4 * empathy) - (0.2 * toxicity) - (0.2 * bias) - (0.2 * hallucination)
        reward = max(0.0, min(1.0, round(reward, 2)))

        return {
            "empathy": empathy,
            "toxicity": toxicity,
            "bias": bias,
            "hallucination": hallucination,
            "reward": reward
        }
    except Exception as e:
        print(f"Error computing reward scores: {str(e)}")
        return {
            "empathy": 0.5,
            "toxicity": 0.5,
            "bias": 0.5,
            "hallucination": 0.5,
            "reward": 0.5
        }