JanviMl commited on
Commit
8cbcee4
·
verified ·
1 Parent(s): 7477097

Create refine_paraphrases.py

Browse files
Files changed (1) hide show
  1. refine_paraphrases.py +115 -0
refine_paraphrases.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# refine_paraphrases.py
#
# Iteratively refine comment paraphrases until they meet target reward-model
# thresholds, using human-evaluation feedback and reward scores as guidance.

import pandas as pd
from paraphraser import paraphrase_comment
from metrics import compute_reward_scores
from model_loader import paraphraser_model  # NOTE(review): not referenced below — presumably imported for load side effects; confirm

# Configuration
DATA_PATH = "refined_paraphrases.csv"    # input: previously refined paraphrases plus their scores
OUTPUT_PATH = "iterated_paraphrases.csv" # output: paraphrases after iterative refinement
MAX_ITERATIONS = 3                       # cap on refinement attempts per comment

# Score thresholds a paraphrase must satisfy (see meets_targets):
# empathy and reward are floors (>=); toxicity, bias, hallucination are ceilings (<=).
TARGET_SCORES = {
    "empathy": 0.9,        # minimum acceptable empathy
    "toxicity": 0.1,       # maximum acceptable toxicity
    "bias": 0.1,           # maximum acceptable bias
    "hallucination": 0.1,  # maximum acceptable hallucination
    "reward": 0.25         # minimum acceptable overall reward
}
18
+
19
def generate_new_paraphrase(original: str, current: str, scores: dict, issues: str) -> str:
    """
    Ask the paraphrasing model for an improved paraphrase.

    Assembles a moderation prompt that embeds the original comment, the
    current paraphrase, its reward-model scores, and the human-evaluation
    issues, then delegates generation to paraphrase_comment.

    Args:
        original: The original (possibly toxic) comment text.
        current: The paraphrase being refined.
        scores: Dict with "empathy", "toxicity", "bias", "hallucination",
            and "reward" entries for the current paraphrase.
        issues: Human-evaluation reasoning describing what is wrong.

    Returns:
        The newly generated paraphrase string.
    """
    # Prompt segments are concatenated verbatim; each non-final segment
    # carries its own trailing space.
    segments = (
        f"You are a content moderator refining a comment paraphrase to be emotionally intelligent, non-toxic, and professional while retaining the original intent. ",
        f"The original comment is: \"{original}\". The current paraphrase is: \"{current}\". ",
        f"Reward model scores: Empathy={scores['empathy']}, Toxicity={scores['toxicity']}, Bias={scores['bias']}, Hallucination={scores['hallucination']}, Overall={scores['reward']}. ",
        f"Issues: {issues}. ",
        f"Guidelines: Remove hate speech, keep neutral/professional tone, retain intent, match brevity. ",
        f"Generate a new paraphrase that improves empathy (≥0.9), reduces toxicity/bias/hallucination (≤0.1), and increases reward score (≥0.25).",
    )
    return paraphrase_comment("".join(segments))
33
+
34
def meets_targets(scores: dict, targets: dict = None) -> bool:
    """
    Check whether paraphrase scores meet the target thresholds.

    Empathy and overall reward are floors (score must be >= target);
    toxicity, bias, and hallucination are ceilings (score must be <= target).
    All comparisons are inclusive.

    Args:
        scores: Dict with keys "empathy", "toxicity", "bias",
            "hallucination", and "reward".
        targets: Optional threshold dict with the same keys. Defaults to the
            module-level TARGET_SCORES (kept as a keyword so the thresholds
            are no longer hard-wired and the function is testable in
            isolation).

    Returns:
        True if every score clears its threshold, False otherwise.
    """
    if targets is None:
        targets = TARGET_SCORES
    return (
        scores["empathy"] >= targets["empathy"] and
        scores["toxicity"] <= targets["toxicity"] and
        scores["bias"] <= targets["bias"] and
        scores["hallucination"] <= targets["hallucination"] and
        scores["reward"] >= targets["reward"]
    )
45
+
46
def refine_paraphrase(row: pd.Series) -> tuple:
    """
    Iteratively refine a single row's paraphrase.

    Starting from the row's refined paraphrase and scores, generate up to
    MAX_ITERATIONS candidate paraphrases, keeping a candidate only when its
    overall reward score improves on the current best. Stops early once the
    current scores satisfy meets_targets.

    Args:
        row: Dataset row with "Comment", "Refined_Paraphrase", the five
            "Refined_*" score columns, and "Human_Evaluation_Reasoning".

    Returns:
        (best_paraphrase, best_scores, reasoning) where reasoning is a
        "; "-joined log of each iteration's outcome.
    """
    original = row["Comment"]
    best_text = row["Refined_Paraphrase"]
    best_scores = {
        "empathy": row["Refined_Empathy"],
        "toxicity": row["Refined_Toxicity"],
        "bias": row["Refined_Bias"],
        "hallucination": row["Refined_Hallucination"],
        "reward": row["Refined_Reward_Score"],
    }
    issues = row["Human_Evaluation_Reasoning"]
    log = []

    for attempt in range(1, MAX_ITERATIONS + 1):
        # Stop as soon as the current best already clears every threshold.
        if meets_targets(best_scores):
            break
        candidate = generate_new_paraphrase(original, best_text, best_scores, issues)
        candidate_scores = compute_reward_scores(original, candidate)
        log.append(
            f"Iteration {attempt}: Generated '{candidate}' with scores {candidate_scores}. "
            f"Previous scores {best_scores}."
        )
        # Greedy acceptance: only the overall reward score decides.
        if candidate_scores["reward"] > best_scores["reward"]:
            best_text = candidate
            best_scores = candidate_scores
            log.append("Accepted new paraphrase due to improved reward score.")
        else:
            log.append("Rejected new paraphrase; no improvement in reward score.")

    return best_text, best_scores, "; ".join(log)
84
+
85
def main():
    """
    Load the refined-paraphrase dataset from DATA_PATH, iteratively improve
    each row's paraphrase via refine_paraphrase, and write the original
    columns plus the iterated paraphrase, its scores, and the iteration log
    to OUTPUT_PATH as CSV.
    """
    df = pd.read_csv(DATA_PATH)

    results = []
    # Row index is not needed, so it is discarded with `_`.
    for _, row in df.iterrows():
        new_paraphrase, new_scores, reasoning = refine_paraphrase(row)
        results.append({
            "Comment": row["Comment"],
            "Original_Paraphrase": row["Original_Paraphrase"],
            "Refined_Paraphrase": row["Refined_Paraphrase"],
            "Iterated_Paraphrase": new_paraphrase,
            "Original_Reward_Score": row["Original_Reward_Score"],
            "Refined_Reward_Score": row["Refined_Reward_Score"],
            "Iterated_Reward_Score": new_scores["reward"],
            "Iterated_Empathy": new_scores["empathy"],
            "Iterated_Toxicity": new_scores["toxicity"],
            "Iterated_Bias": new_scores["bias"],
            "Iterated_Hallucination": new_scores["hallucination"],
            "Iteration_Reasoning": reasoning,
        })

    # Persist without the pandas index column.
    pd.DataFrame(results).to_csv(OUTPUT_PATH, index=False)
    print(f"Refinement complete. Results saved to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()