Create refine_paraphrases.py
refine_paraphrases.py · ADDED · +115 −0
# refine_paraphrases.py
import pandas as pd
from paraphraser import paraphrase_comment
from metrics import compute_reward_scores
from model_loader import paraphraser_model  # noqa: F401  (unused in this file; kept in case the import has model-loading side effects)

# Configuration
DATA_PATH = "refined_paraphrases.csv"
OUTPUT_PATH = "iterated_paraphrases.csv"
MAX_ITERATIONS = 3
TARGET_SCORES = {
    "empathy": 0.9,
    "toxicity": 0.1,
    "bias": 0.1,
    "hallucination": 0.1,
    "reward": 0.25
}

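# Expected input schema (inferred from the column accesses below): the CSV at
# DATA_PATH must provide Comment, Original_Paraphrase, Refined_Paraphrase,
# Original_Reward_Score, Refined_Reward_Score, Refined_Empathy,
# Refined_Toxicity, Refined_Bias, Refined_Hallucination, and
# Human_Evaluation_Reasoning.
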
def generate_new_paraphrase(original: str, current: str, scores: dict, issues: str) -> str:
    """
    Generate a new paraphrase using the Granite 3.2-2B-Instruct model.
    Incorporates feedback from human evaluation and reward scores.
    """
    prompt = (
        "You are a content moderator refining a comment paraphrase to be emotionally intelligent, "
        "non-toxic, and professional while retaining the original intent. "
        f"The original comment is: \"{original}\". The current paraphrase is: \"{current}\". "
        f"Reward model scores: Empathy={scores['empathy']}, Toxicity={scores['toxicity']}, "
        f"Bias={scores['bias']}, Hallucination={scores['hallucination']}, Overall={scores['reward']}. "
        f"Issues: {issues}. "
        "Guidelines: Remove hate speech, keep neutral/professional tone, retain intent, match brevity. "
        "Generate a new paraphrase that improves empathy (≥0.9), reduces toxicity/bias/hallucination (≤0.1), "
        "and increases reward score (≥0.25)."
    )
    return paraphrase_comment(prompt)

def meets_targets(scores: dict) -> bool:
    """
    Check if paraphrase scores meet target thresholds.
    """
    return (
        scores["empathy"] >= TARGET_SCORES["empathy"] and
        scores["toxicity"] <= TARGET_SCORES["toxicity"] and
        scores["bias"] <= TARGET_SCORES["bias"] and
        scores["hallucination"] <= TARGET_SCORES["hallucination"] and
        scores["reward"] >= TARGET_SCORES["reward"]
    )

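# Example with hypothetical values: scores of {"empathy": 0.92, "toxicity": 0.05,
# "bias": 0.04, "hallucination": 0.02, "reward": 0.31} clear every threshold in
# TARGET_SCORES, so meets_targets would return True and refinement would stop.
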
def refine_paraphrase(row: pd.Series) -> tuple:
    """
    Iteratively refine a single paraphrase.
    Returns the best paraphrase, its scores, and the iteration reasoning log.
    """
    original = row["Comment"]
    current_paraphrase = row["Refined_Paraphrase"]
    current_scores = {
        "empathy": row["Refined_Empathy"],
        "toxicity": row["Refined_Toxicity"],
        "bias": row["Refined_Bias"],
        "hallucination": row["Refined_Hallucination"],
        "reward": row["Refined_Reward_Score"]
    }
    issues = row["Human_Evaluation_Reasoning"]
    iteration = 0
    reasoning = []

    while iteration < MAX_ITERATIONS and not meets_targets(current_scores):
        # Generate a candidate paraphrase
        new_paraphrase = generate_new_paraphrase(original, current_paraphrase, current_scores, issues)
        # Evaluate the candidate against the original comment
        new_scores = compute_reward_scores(original, new_paraphrase)
        # Log reasoning
        reasoning.append(
            f"Iteration {iteration + 1}: Generated '{new_paraphrase}' with scores {new_scores}. "
            f"Previous scores {current_scores}."
        )
        # Accept the candidate only if the overall reward improved
        if new_scores["reward"] > current_scores["reward"]:
            current_paraphrase = new_paraphrase
            current_scores = new_scores
            reasoning.append("Accepted new paraphrase due to improved reward score.")
        else:
            reasoning.append("Rejected new paraphrase; no improvement in reward score.")
        iteration += 1

    return current_paraphrase, current_scores, "; ".join(reasoning)

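# Usage sketch (hypothetical row values; real rows come from the CSV at DATA_PATH):
# row = pd.Series({"Comment": "...", "Refined_Paraphrase": "...",
#                  "Refined_Empathy": 0.7, "Refined_Toxicity": 0.2,
#                  "Refined_Bias": 0.1, "Refined_Hallucination": 0.1,
#                  "Refined_Reward_Score": 0.15,
#                  "Human_Evaluation_Reasoning": "tone reads as curt"})
# text, scores, log = refine_paraphrase(row)
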
def main():
    # Load dataset
    df = pd.read_csv(DATA_PATH)

    # Process each row
    results = []
    for _, row in df.iterrows():
        new_paraphrase, new_scores, reasoning = refine_paraphrase(row)
        result = {
            "Comment": row["Comment"],
            "Original_Paraphrase": row["Original_Paraphrase"],
            "Refined_Paraphrase": row["Refined_Paraphrase"],
            "Iterated_Paraphrase": new_paraphrase,
            "Original_Reward_Score": row["Original_Reward_Score"],
            "Refined_Reward_Score": row["Refined_Reward_Score"],
            "Iterated_Reward_Score": new_scores["reward"],
            "Iterated_Empathy": new_scores["empathy"],
            "Iterated_Toxicity": new_scores["toxicity"],
            "Iterated_Bias": new_scores["bias"],
            "Iterated_Hallucination": new_scores["hallucination"],
            "Iteration_Reasoning": reasoning
        }
        results.append(result)

    # Save results
    result_df = pd.DataFrame(results)
    result_df.to_csv(OUTPUT_PATH, index=False)
    print(f"Refinement complete. Results saved to {OUTPUT_PATH}")

+
if __name__ == "__main__":
|
115 |
+
main()
|