Create refine_paraphrases.py
refine_paraphrases.py · ADDED · +115 −0
# refine_paraphrases.py
import pandas as pd
from paraphraser import paraphrase_comment
from metrics import compute_reward_scores
from model_loader import paraphraser_model  # noqa: F401  (unused in this file; kept in case the import has model-loading side effects)

# Configuration
DATA_PATH = "refined_paraphrases.csv"
OUTPUT_PATH = "iterated_paraphrases.csv"
MAX_ITERATIONS = 3
TARGET_SCORES = {
    "empathy": 0.9,
    "toxicity": 0.1,
    "bias": 0.1,
    "hallucination": 0.1,
    "reward": 0.25
}

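# Expected input schema (inferred from the column accesses below): the CSV at
# DATA_PATH must provide Comment, Original_Paraphrase, Refined_Paraphrase,
# Original_Reward_Score, Refined_Reward_Score, Refined_Empathy,
# Refined_Toxicity, Refined_Bias, Refined_Hallucination, and
# Human_Evaluation_Reasoning.
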
def generate_new_paraphrase(original: str, current: str, scores: dict, issues: str) -> str:
    """
    Generate a new paraphrase using the Granite 3.2-2B-Instruct model.
    Incorporates feedback from human evaluation and reward scores.
    """
    prompt = (
        "You are a content moderator refining a comment paraphrase to be emotionally intelligent, "
        "non-toxic, and professional while retaining the original intent. "
        f"The original comment is: \"{original}\". The current paraphrase is: \"{current}\". "
        f"Reward model scores: Empathy={scores['empathy']}, Toxicity={scores['toxicity']}, "
        f"Bias={scores['bias']}, Hallucination={scores['hallucination']}, Overall={scores['reward']}. "
        f"Issues: {issues}. "
        "Guidelines: Remove hate speech, keep neutral/professional tone, retain intent, match brevity. "
        "Generate a new paraphrase that improves empathy (≥0.9), reduces toxicity/bias/hallucination (≤0.1), "
        "and increases reward score (≥0.25)."
    )
    return paraphrase_comment(prompt)

def meets_targets(scores: dict) -> bool:
    """
    Check if paraphrase scores meet target thresholds.
    """
    return (
        scores["empathy"] >= TARGET_SCORES["empathy"] and
        scores["toxicity"] <= TARGET_SCORES["toxicity"] and
        scores["bias"] <= TARGET_SCORES["bias"] and
        scores["hallucination"] <= TARGET_SCORES["hallucination"] and
        scores["reward"] >= TARGET_SCORES["reward"]
    )

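# Example with hypothetical values: scores of {"empathy": 0.92, "toxicity": 0.05,
# "bias": 0.04, "hallucination": 0.02, "reward": 0.31} clear every threshold in
# TARGET_SCORES, so meets_targets would return True and refinement would stop.
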
def refine_paraphrase(row: pd.Series) -> tuple:
    """
    Iteratively refine a single paraphrase.
    Returns the best paraphrase, its scores, and the iteration reasoning log.
    """
    original = row["Comment"]
    current_paraphrase = row["Refined_Paraphrase"]
    current_scores = {
        "empathy": row["Refined_Empathy"],
        "toxicity": row["Refined_Toxicity"],
        "bias": row["Refined_Bias"],
        "hallucination": row["Refined_Hallucination"],
        "reward": row["Refined_Reward_Score"]
    }
    issues = row["Human_Evaluation_Reasoning"]
    iteration = 0
    reasoning = []

    while iteration < MAX_ITERATIONS and not meets_targets(current_scores):
        # Generate a candidate paraphrase
        new_paraphrase = generate_new_paraphrase(original, current_paraphrase, current_scores, issues)
        # Evaluate the candidate against the original comment
        new_scores = compute_reward_scores(original, new_paraphrase)
        # Log reasoning
        reasoning.append(
            f"Iteration {iteration + 1}: Generated '{new_paraphrase}' with scores {new_scores}. "
            f"Previous scores {current_scores}."
        )
        # Accept the candidate only if the overall reward improved
        if new_scores["reward"] > current_scores["reward"]:
            current_paraphrase = new_paraphrase
            current_scores = new_scores
            reasoning.append("Accepted new paraphrase due to improved reward score.")
        else:
            reasoning.append("Rejected new paraphrase; no improvement in reward score.")
        iteration += 1

    return current_paraphrase, current_scores, "; ".join(reasoning)

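# Usage sketch (hypothetical row values; real rows come from the CSV at DATA_PATH):
# row = pd.Series({"Comment": "...", "Refined_Paraphrase": "...",
#                  "Refined_Empathy": 0.7, "Refined_Toxicity": 0.2,
#                  "Refined_Bias": 0.1, "Refined_Hallucination": 0.1,
#                  "Refined_Reward_Score": 0.15,
#                  "Human_Evaluation_Reasoning": "tone reads as curt"})
# text, scores, log = refine_paraphrase(row)
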
def main():
    # Load dataset
    df = pd.read_csv(DATA_PATH)

    # Process each row
    results = []
    for _, row in df.iterrows():
        new_paraphrase, new_scores, reasoning = refine_paraphrase(row)
        result = {
            "Comment": row["Comment"],
            "Original_Paraphrase": row["Original_Paraphrase"],
            "Refined_Paraphrase": row["Refined_Paraphrase"],
            "Iterated_Paraphrase": new_paraphrase,
            "Original_Reward_Score": row["Original_Reward_Score"],
            "Refined_Reward_Score": row["Refined_Reward_Score"],
            "Iterated_Reward_Score": new_scores["reward"],
            "Iterated_Empathy": new_scores["empathy"],
            "Iterated_Toxicity": new_scores["toxicity"],
            "Iterated_Bias": new_scores["bias"],
            "Iterated_Hallucination": new_scores["hallucination"],
            "Iteration_Reasoning": reasoning
        }
        results.append(result)

    # Save results
    result_df = pd.DataFrame(results)
    result_df.to_csv(OUTPUT_PATH, index=False)
    print(f"Refinement complete. Results saved to {OUTPUT_PATH}")

+
if __name__ == "__main__":
|
115 |
+
main()
|