# Project: toxic-comment-classifier_rlhf / refine_paraphrases.py
# Source: Hugging Face file page (author JanviMl, commit ac0ca8d "Update refine_paraphrases.py", 5.2 kB).
# refine_paraphrases.py
import pandas as pd
from paraphraser import paraphrase_comment
from classifier import compute_reward_scores
from model_loader import paraphraser_model
from datasets import load_dataset
import os
# Configuration
DATA_PATH = "JanviMl/toxi_refined_paraphrases"  # Hugging Face Hub dataset to refine
OUTPUT_PATH = "iterated_paraphrases.csv"  # local CSV destination for results
MAX_ITERATIONS = 3  # refinement attempts per row before giving up
# Acceptance thresholds checked by meets_targets():
# empathy and reward must be >= their targets; toxicity, bias, and
# hallucination must be <= theirs.
TARGET_SCORES = {
"empathy": 0.9,
"toxicity": 0.1,
"bias": 0.1,
"hallucination": 0.1,
"reward": 0.25
}
def generate_new_paraphrase(original: str, current: str, scores: dict, issues: str) -> str:
    """
    Produce an improved paraphrase via the Granite 3.2-2B-Instruct model.

    Assembles a moderation prompt containing the original comment, the
    current paraphrase, the reward-model scores, and human-evaluation
    issues, then delegates generation to paraphrase_comment().

    Args:
        original: The source comment being paraphrased.
        current: The paraphrase produced so far.
        scores: Mapping with "empathy", "toxicity", "bias",
            "hallucination", and "reward" entries.
        issues: Free-text feedback from human evaluation.

    Returns:
        The model's new paraphrase text.
    """
    segments = [
        "You are a content moderator refining a comment paraphrase to be emotionally intelligent, non-toxic, and professional while retaining the original intent. ",
        f"The original comment is: \"{original}\". The current paraphrase is: \"{current}\". ",
        f"Reward model scores: Empathy={scores['empathy']}, Toxicity={scores['toxicity']}, Bias={scores['bias']}, Hallucination={scores['hallucination']}, Overall={scores['reward']}. ",
        f"Issues: {issues}. ",
        "Guidelines: Remove hate speech, keep neutral/professional tone, retain intent, match brevity. ",
        "Generate a new paraphrase that improves empathy (≥0.9), reduces toxicity/bias/hallucination (≤0.1), and increases reward score (≥0.25).",
    ]
    return paraphrase_comment("".join(segments))
def meets_targets(scores: dict, targets: dict = None) -> bool:
    """
    Check whether paraphrase scores satisfy the target thresholds.

    Empathy and reward are "higher is better" and must meet or exceed
    their targets; toxicity, bias, and hallucination are "lower is
    better" and must not exceed theirs.

    Args:
        scores: Mapping with keys "empathy", "toxicity", "bias",
            "hallucination", and "reward".
        targets: Optional threshold mapping with the same keys.
            Defaults to the module-level TARGET_SCORES, preserving the
            original behavior for existing callers.

    Returns:
        True if every score meets its threshold, False otherwise.
    """
    if targets is None:
        targets = TARGET_SCORES
    return (
        scores["empathy"] >= targets["empathy"] and
        scores["toxicity"] <= targets["toxicity"] and
        scores["bias"] <= targets["bias"] and
        scores["hallucination"] <= targets["hallucination"] and
        scores["reward"] >= targets["reward"]
    )
def refine_paraphrase(row: pd.Series) -> tuple:
    """
    Iteratively refine one row's paraphrase toward the target scores.

    Starts from the row's refined paraphrase and scores, then makes up
    to MAX_ITERATIONS attempts, each time generating a candidate,
    scoring it, and keeping it only if its overall reward improves.
    Stops early once the current scores meet all targets.

    Args:
        row: Dataset row with "Comment", "Refined_Paraphrase",
            "Refined_*" score columns, and "Human_Evaluation_Reasoning".

    Returns:
        Tuple of (best paraphrase, its score dict, "; "-joined log of
        iteration decisions).
    """
    original = row["Comment"]
    best_paraphrase = row["Refined_Paraphrase"]
    best_scores = {
        "empathy": row["Refined_Empathy"],
        "toxicity": row["Refined_Toxicity"],
        "bias": row["Refined_Bias"],
        "hallucination": row["Refined_Hallucination"],
        "reward": row["Refined_Reward_Score"]
    }
    issues = row["Human_Evaluation_Reasoning"]
    log = []
    for attempt in range(1, MAX_ITERATIONS + 1):
        # Stop as soon as the current best already satisfies every target.
        if meets_targets(best_scores):
            break
        candidate = generate_new_paraphrase(original, best_paraphrase, best_scores, issues)
        candidate_scores = compute_reward_scores(original, candidate)
        log.append(
            f"Iteration {attempt}: Generated '{candidate}' with scores {candidate_scores}. "
            f"Previous scores {best_scores}."
        )
        # Greedy acceptance: keep the candidate only on a strict reward gain.
        if candidate_scores["reward"] > best_scores["reward"]:
            best_paraphrase = candidate
            best_scores = candidate_scores
            log.append("Accepted new paraphrase due to improved reward score.")
        else:
            log.append("Rejected new paraphrase; no improvement in reward score.")
    return best_paraphrase, best_scores, "; ".join(log)
def main():
    """
    Refine every paraphrase in the source dataset and publish results.

    Loads the train split of DATA_PATH from the Hugging Face Hub,
    runs refine_paraphrase() on each row, writes the combined results
    to OUTPUT_PATH as CSV, and finally attempts to push them back to
    the Hub (authenticated via the HF_TOKEN environment variable).
    Errors in loading or pushing are reported but not raised.
    """
    try:
        frame = load_dataset(DATA_PATH, split="train").to_pandas()
    except Exception as exc:
        print(f"Error loading dataset: {str(exc)}")
        return
    records = []
    for _, row in frame.iterrows():
        paraphrase, scores, reasoning = refine_paraphrase(row)
        records.append({
            "Comment": row["Comment"],
            "Original_Paraphrase": row["Original_Paraphrase"],
            "Refined_Paraphrase": row["Refined_Paraphrase"],
            "Iterated_Paraphrase": paraphrase,
            "Original_Reward_Score": row["Original_Reward_Score"],
            "Refined_Reward_Score": row["Refined_Reward_Score"],
            "Iterated_Reward_Score": scores["reward"],
            "Iterated_Empathy": scores["empathy"],
            "Iterated_Toxicity": scores["toxicity"],
            "Iterated_Bias": scores["bias"],
            "Iterated_Hallucination": scores["hallucination"],
            "Iteration_Reasoning": reasoning
        })
    # Persist locally before the (fallible) Hub upload.
    output = pd.DataFrame(records)
    output.to_csv(OUTPUT_PATH, index=False)
    print(f"Refinement complete. Results saved to {OUTPUT_PATH}")
    try:
        from datasets import Dataset
        hub_dataset = Dataset.from_pandas(output)
        hub_dataset.push_to_hub("JanviMl/toxi_iterated_paraphrases", token=os.getenv("HF_TOKEN"))
        print("Pushed to Hugging Face Hub: JanviMl/toxi_iterated_paraphrases")
    except Exception as exc:
        print(f"Error pushing to Hub: {str(exc)}")
# Script entry point: run the full refinement pipeline when executed directly.
if __name__ == "__main__":
    main()