|
|
|
import pandas as pd |
|
from paraphraser import paraphrase_comment |
|
from classifier import compute_reward_scores |
|
from model_loader import paraphraser_model |
|
from datasets import load_dataset |
|
import os |
|
|
|
|
|
# Hugging Face dataset containing the human-evaluated refined paraphrases to iterate on.
DATA_PATH = "JanviMl/toxi_refined_paraphrases"

# Local CSV file where the iterated results are written.
OUTPUT_PATH = "iterated_paraphrases.csv"

# Maximum number of refinement attempts per paraphrase.
MAX_ITERATIONS = 3

# Score thresholds that stop iteration early (checked by meets_targets):
# empathy and reward are minimums to reach; toxicity, bias, and
# hallucination are maximums to stay under.
TARGET_SCORES = {

    "empathy": 0.9,

    "toxicity": 0.1,

    "bias": 0.1,

    "hallucination": 0.1,

    "reward": 0.25

}
|
|
|
def generate_new_paraphrase(original: str, current: str, scores: dict, issues: str) -> str:
    """Produce an improved paraphrase via the Granite 3.2-2B-Instruct model.

    Builds a moderation prompt that embeds the original comment, the current
    paraphrase, the reward-model scores, and the human-evaluation issues, then
    delegates generation to ``paraphrase_comment``.

    Args:
        original: The source comment being paraphrased.
        current: The paraphrase produced so far.
        scores: Reward scores with keys ``empathy``, ``toxicity``, ``bias``,
            ``hallucination``, and ``reward``.
        issues: Human-evaluation reasoning describing what to fix.

    Returns:
        The model's new paraphrase text.
    """
    # Assemble the prompt from its sentences; each part carries its own
    # trailing space so plain concatenation reproduces the exact wording.
    parts = [
        "You are a content moderator refining a comment paraphrase to be emotionally intelligent, non-toxic, and professional while retaining the original intent. ",
        f"The original comment is: \"{original}\". The current paraphrase is: \"{current}\". ",
        f"Reward model scores: Empathy={scores['empathy']}, Toxicity={scores['toxicity']}, Bias={scores['bias']}, Hallucination={scores['hallucination']}, Overall={scores['reward']}. ",
        f"Issues: {issues}. ",
        "Guidelines: Remove hate speech, keep neutral/professional tone, retain intent, match brevity. ",
        "Generate a new paraphrase that improves empathy (≥0.9), reduces toxicity/bias/hallucination (≤0.1), and increases reward score (≥0.25).",
    ]
    return paraphrase_comment("".join(parts))
|
|
|
def meets_targets(scores: dict) -> bool:
    """Return True when every score satisfies its target threshold.

    Empathy and reward must be at or above their targets; toxicity, bias,
    and hallucination must be at or below theirs (see ``TARGET_SCORES``).

    Args:
        scores: Mapping with keys ``empathy``, ``toxicity``, ``bias``,
            ``hallucination``, and ``reward``.
    """
    # Metrics where higher is better (must meet or exceed the target).
    floors = ("empathy", "reward")
    # Metrics where lower is better (must not exceed the target).
    ceilings = ("toxicity", "bias", "hallucination")
    if any(scores[key] < TARGET_SCORES[key] for key in floors):
        return False
    return all(scores[key] <= TARGET_SCORES[key] for key in ceilings)
|
|
|
def refine_paraphrase(row: pd.Series) -> tuple:
    """Iteratively refine a single paraphrase until targets are met.

    Starting from the row's refined paraphrase and scores, generates up to
    ``MAX_ITERATIONS`` candidate paraphrases, keeping a candidate only when
    its overall reward score improves on the current best.

    Args:
        row: Dataset row holding the comment, its refined paraphrase, the
            refined scores, and the human-evaluation reasoning.

    Returns:
        Tuple of (best paraphrase, its score dict, "; "-joined reasoning log).
    """
    source_comment = row["Comment"]
    best_paraphrase = row["Refined_Paraphrase"]
    # Map score keys to their dataset column names and pull the values.
    best_scores = {
        key: row[column]
        for key, column in (
            ("empathy", "Refined_Empathy"),
            ("toxicity", "Refined_Toxicity"),
            ("bias", "Refined_Bias"),
            ("hallucination", "Refined_Hallucination"),
            ("reward", "Refined_Reward_Score"),
        )
    }
    evaluation_issues = row["Human_Evaluation_Reasoning"]
    trace = []

    for attempt in range(MAX_ITERATIONS):
        # Stop early once every target threshold is satisfied.
        if meets_targets(best_scores):
            break

        candidate = generate_new_paraphrase(
            source_comment, best_paraphrase, best_scores, evaluation_issues
        )
        candidate_scores = compute_reward_scores(source_comment, candidate)

        trace.append(
            f"Iteration {attempt + 1}: Generated '{candidate}' with scores {candidate_scores}. "
            f"Previous scores {best_scores}."
        )

        # Greedy acceptance: only the overall reward score decides.
        if candidate_scores["reward"] > best_scores["reward"]:
            best_paraphrase = candidate
            best_scores = candidate_scores
            trace.append("Accepted new paraphrase due to improved reward score.")
        else:
            trace.append("Rejected new paraphrase; no improvement in reward score.")

    return best_paraphrase, best_scores, "; ".join(trace)
|
|
|
def main():
    """Run the full refinement pipeline.

    Loads the refined-paraphrase dataset, iterates every row through
    ``refine_paraphrase``, writes the results to ``OUTPUT_PATH``, and
    best-effort pushes them to the Hugging Face Hub.
    """
    # Best-effort load: without the dataset there is nothing to do.
    try:
        df = load_dataset(DATA_PATH, split="train").to_pandas()
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return

    records = []
    for _, row in df.iterrows():
        paraphrase, scores, reasoning = refine_paraphrase(row)
        records.append({
            "Comment": row["Comment"],
            "Original_Paraphrase": row["Original_Paraphrase"],
            "Refined_Paraphrase": row["Refined_Paraphrase"],
            "Iterated_Paraphrase": paraphrase,
            "Original_Reward_Score": row["Original_Reward_Score"],
            "Refined_Reward_Score": row["Refined_Reward_Score"],
            "Iterated_Reward_Score": scores["reward"],
            "Iterated_Empathy": scores["empathy"],
            "Iterated_Toxicity": scores["toxicity"],
            "Iterated_Bias": scores["bias"],
            "Iterated_Hallucination": scores["hallucination"],
            "Iteration_Reasoning": reasoning,
        })

    result_df = pd.DataFrame(records)
    result_df.to_csv(OUTPUT_PATH, index=False)
    print(f"Refinement complete. Results saved to {OUTPUT_PATH}")

    # Hub upload is best-effort: a failure here must not lose the local CSV.
    try:
        from datasets import Dataset
        hub_dataset = Dataset.from_pandas(result_df)
        hub_dataset.push_to_hub("JanviMl/toxi_iterated_paraphrases", token=os.getenv("HF_TOKEN"))
        print("Pushed to Hugging Face Hub: JanviMl/toxi_iterated_paraphrases")
    except Exception as e:
        print(f"Error pushing to Hub: {str(e)}")
|
|
|
# Run the refinement pipeline only when executed as a script, not on import.
if __name__ == "__main__":

    main()