# refine_paraphrases.py
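"""
Iteratively refine comment paraphrases against reward-model targets.

Loads refined paraphrases and their scores from DATA_PATH, regenerates any
entry that misses the TARGET_SCORES thresholds (up to MAX_ITERATIONS attempts
each), and writes the accepted paraphrases with per-iteration reasoning to
OUTPUT_PATH.
"""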
import pandas as pd
from paraphraser import paraphrase_comment
from metrics import compute_reward_scores
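# Assumed interfaces of the imported helpers, inferred from their use below:
#   paraphrase_comment(prompt: str) -> str
#   compute_reward_scores(original: str, paraphrase: str) -> dict with keys
#   "empathy", "toxicity", "bias", "hallucination", "reward"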

# Configuration
DATA_PATH = "toxic-comment-classifier_rlhf/refined_paraphrases.csv"
OUTPUT_PATH = "toxic-comment-classifier_rlhf/iterated_paraphrases.csv"
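# The input CSV is expected to provide at least these columns (all read below):
#   Comment, Original_Paraphrase, Refined_Paraphrase, Original_Reward_Score,
#   Refined_Reward_Score, Refined_Empathy, Refined_Toxicity, Refined_Bias,
#   Refined_Hallucination, Human_Evaluation_Reasoning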
MAX_ITERATIONS = 3
TARGET_SCORES = {
    "empathy": 0.9,        # floor: empathy must reach at least this
    "toxicity": 0.1,       # ceiling: toxicity must stay at or below this
    "bias": 0.1,           # ceiling
    "hallucination": 0.1,  # ceiling
    "reward": 0.25         # floor: overall reward must reach at least this
}

def generate_new_paraphrase(original: str, current: str, scores: dict, issues: str) -> str:
    """
    Generate a new paraphrase using the Granite 3.2-2B-Instruct model.
    Incorporates feedback from human evaluation and reward scores.
    """
    prompt = (
        f"You are a content moderator refining a comment paraphrase to be emotionally intelligent, non-toxic, and professional while retaining the original intent. "
        f"The original comment is: \"{original}\". The current paraphrase is: \"{current}\". "
        f"Reward model scores: Empathy={scores['empathy']}, Toxicity={scores['toxicity']}, Bias={scores['bias']}, Hallucination={scores['hallucination']}, Overall={scores['reward']}. "
        f"Issues: {issues}. "
        f"Guidelines: Remove hate speech, keep neutral/professional tone, retain intent, match brevity. "
        f"Generate a new paraphrase that improves empathy (≥0.9), reduces toxicity/bias/hallucination (≤0.1), and increases reward score (≥0.25)."
    )
    return paraphrase_comment(prompt)

def meets_targets(scores: dict) -> bool:
    """
    Check if paraphrase scores meet target thresholds.
    """
    return (
        scores["empathy"] >= TARGET_SCORES["empathy"] and
        scores["toxicity"] <= TARGET_SCORES["toxicity"] and
        scores["bias"] <= TARGET_SCORES["bias"] and
        scores["hallucination"] <= TARGET_SCORES["hallucination"] and
        scores["reward"] >= TARGET_SCORES["reward"]
    )

def refine_paraphrase(row: pd.Series) -> tuple[str, dict, str]:
    """
    Iteratively refine a single paraphrase.
    Returns new paraphrase, scores, and reasoning.
    """
    original = row["Comment"]
    current_paraphrase = row["Refined_Paraphrase"]
    current_scores = {
        "empathy": row["Refined_Empathy"],
        "toxicity": row["Refined_Toxicity"],
        "bias": row["Refined_Bias"],
        "hallucination": row["Refined_Hallucination"],
        "reward": row["Refined_Reward_Score"]
    }
    issues = row["Human_Evaluation_Reasoning"]
    iteration = 0
    reasoning = []

    while iteration < MAX_ITERATIONS and not meets_targets(current_scores):
        # Generate new paraphrase
        new_paraphrase = generate_new_paraphrase(original, current_paraphrase, current_scores, issues)
        # Evaluate new paraphrase
        new_scores = compute_reward_scores(original, new_paraphrase)
        # Log reasoning
        reasoning.append(
            f"Iteration {iteration + 1}: Generated '{new_paraphrase}' with scores {new_scores}. "
            f"Previous scores {current_scores}."
        )
        # Accept the candidate only if its overall reward improves
        # (greedy on reward alone; per-dimension scores do not gate acceptance)
        if new_scores["reward"] > current_scores["reward"]:
            current_paraphrase = new_paraphrase
            current_scores = new_scores
            reasoning.append("Accepted new paraphrase due to improved reward score.")
        else:
            reasoning.append("Rejected new paraphrase; no improvement in reward score.")
        iteration += 1

    return current_paraphrase, current_scores, "; ".join(reasoning)

def main():
    # Load dataset
    df = pd.read_csv(DATA_PATH)
    
    # Process each row
    results = []
    for _, row in df.iterrows():
        new_paraphrase, new_scores, reasoning = refine_paraphrase(row)
        result = {
            "Comment": row["Comment"],
            "Original_Paraphrase": row["Original_Paraphrase"],
            "Refined_Paraphrase": row["Refined_Paraphrase"],
            "Iterated_Paraphrase": new_paraphrase,
            "Original_Reward_Score": row["Original_Reward_Score"],
            "Refined_Reward_Score": row["Refined_Reward_Score"],
            "Iterated_Reward_Score": new_scores["reward"],
            "Iterated_Empathy": new_scores["empathy"],
            "Iterated_Toxicity": new_scores["toxicity"],
            "Iterated_Bias": new_scores["bias"],
            "Iterated_Hallucination": new_scores["hallucination"],
            "Iteration_Reasoning": reasoning
        }
        results.append(result)

    # Save results
    result_df = pd.DataFrame(results)
    result_df.to_csv(OUTPUT_PATH, index=False)
    print(f"Refinement complete. Results saved to {OUTPUT_PATH}")

if __name__ == "__main__":
    main()
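
# Typical invocation (assuming paraphraser.py, metrics.py, and the input CSV
# at DATA_PATH are in place):
#   python refine_paraphrases.py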