NoaiGPT committed on
Commit
9a04025
·
1 Parent(s): a63315d
Files changed (1) hide show
  1. app.py +37 -22
app.py CHANGED
@@ -3,6 +3,7 @@ import json
3
  import gradio as gr
4
  import spaces
5
  import torch
 
6
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
7
  from sentence_splitter import SentenceSplitter
8
  from itertools import product
@@ -37,33 +38,45 @@ def classify_text(text):
37
  main_score = probabilities[0][predicted_class].item()
38
  return main_label, main_score
39
 
 
 
 
 
 
 
 
40
  @spaces.GPU
41
  def generate_paraphrases(text, setting, output_format):
42
  sentences = splitter.split(text)
43
  all_sentence_paraphrases = []
44
 
45
  if setting == 1:
46
- temperature = 0.6
47
- num_beams = 8
48
- num_return_sequences = 2
49
- elif setting == 2:
50
  temperature = 0.7
51
- num_beams = 8
 
52
  num_return_sequences = 3
53
- elif setting == 3:
54
  temperature = 0.8
55
- num_beams = 8
 
56
  num_return_sequences = 4
57
- elif setting == 4:
58
  temperature = 0.9
59
- num_beams = 8
 
60
  num_return_sequences = 5
61
- else:
62
  temperature = 1.0
63
- num_beams = 8
64
- num_return_sequences = 5
 
 
 
 
 
 
65
 
66
- max_length = 64
67
 
68
  formatted_output = "Original text:\n" + text + "\n\n"
69
  formatted_output += "Paraphrased versions:\n"
@@ -78,20 +91,21 @@ def generate_paraphrases(text, setting, output_format):
78
  for i, sentence in enumerate(sentences):
79
  inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).input_ids.to(device)
80
 
81
- # Generate paraphrases
82
  outputs = paraphraser_model.generate(
83
  inputs,
84
- num_beams=num_beams,
85
- num_beam_groups=num_beams,
86
  num_return_sequences=num_return_sequences,
87
- repetition_penalty=10.0,
88
- diversity_penalty=3.0,
89
- no_repeat_ngram_size=2,
90
  temperature=temperature,
 
 
 
 
91
  max_length=max_length
92
  )
93
 
94
  paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
 
95
 
96
  formatted_output += f"Original sentence {i+1}: {sentence}\n"
97
  for j, paraphrase in enumerate(paraphrases, 1):
@@ -106,6 +120,7 @@ def generate_paraphrases(text, setting, output_format):
106
  formatted_output += "\n"
107
 
108
  all_combinations = list(product(*all_sentence_paraphrases))
 
109
 
110
  formatted_output += "\nCombined paraphrased versions:\n"
111
  combined_versions = []
@@ -121,7 +136,7 @@ def generate_paraphrases(text, setting, output_format):
121
  label, score = classify_text(version)
122
  formatted_output += f"Version {i}:\n{version}\n"
123
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
124
- if label == "human-produced" or (label == "machine-generated" and score < 0.98):
125
  human_versions.append((version, label, score))
126
 
127
  formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
@@ -152,7 +167,7 @@ iface = gr.Interface(
152
  fn=generate_paraphrases,
153
  inputs=[
154
  gr.Textbox(lines=5, label="Input Text"),
155
- gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
156
  gr.Radio(["text", "json"], label="Output Format")
157
  ],
158
  outputs=[
@@ -160,7 +175,7 @@ iface = gr.Interface(
160
  gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
161
  ],
162
  title="Advanced Diverse Paraphraser with Human-like Filter",
163
- description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
164
  )
165
 
166
  # Launch the interface
 
3
  import gradio as gr
4
  import spaces
5
  import torch
6
+ import random
7
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
8
  from sentence_splitter import SentenceSplitter
9
  from itertools import product
 
38
  main_score = probabilities[0][predicted_class].item()
39
  return main_label, main_score
40
 
41
def introduce_errors(text):
    """Flip the capitalisation of the first letter of one randomly chosen word.

    Texts of three words or fewer are returned unchanged.  NOTE: for a
    word starting in lowercase, ``str.capitalize`` is used, which also
    lowercases the remainder of that word (e.g. ``"iPhone" -> "Iphone"``).
    """
    words = text.split()
    if len(words) > 3:
        idx = random.randint(0, len(words) - 1)
        chosen = words[idx]
        if chosen[0].isupper():
            words[idx] = chosen.lower()
        else:
            words[idx] = chosen.capitalize()
    return ' '.join(words)
47
+
48
  @spaces.GPU
49
  def generate_paraphrases(text, setting, output_format):
50
  sentences = splitter.split(text)
51
  all_sentence_paraphrases = []
52
 
53
  if setting == 1:
 
 
 
 
54
  temperature = 0.7
55
+ top_p = 0.95
56
+ top_k = 50
57
  num_return_sequences = 3
58
+ elif setting == 2:
59
  temperature = 0.8
60
+ top_p = 0.9
61
+ top_k = 40
62
  num_return_sequences = 4
63
+ elif setting == 3:
64
  temperature = 0.9
65
+ top_p = 0.85
66
+ top_k = 30
67
  num_return_sequences = 5
68
+ elif setting == 4:
69
  temperature = 1.0
70
+ top_p = 0.8
71
+ top_k = 20
72
+ num_return_sequences = 6
73
+ else:
74
+ temperature = 1.1
75
+ top_p = 0.75
76
+ top_k = 10
77
+ num_return_sequences = 7
78
 
79
+ max_length = 128
80
 
81
  formatted_output = "Original text:\n" + text + "\n\n"
82
  formatted_output += "Paraphrased versions:\n"
 
91
  for i, sentence in enumerate(sentences):
92
  inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).input_ids.to(device)
93
 
94
+ # Generate paraphrases using sampling
95
  outputs = paraphraser_model.generate(
96
  inputs,
97
+ do_sample=True,
 
98
  num_return_sequences=num_return_sequences,
 
 
 
99
  temperature=temperature,
100
+ top_p=top_p,
101
+ top_k=top_k,
102
+ repetition_penalty=1.2,
103
+ no_repeat_ngram_size=2,
104
  max_length=max_length
105
  )
106
 
107
  paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
108
+ paraphrases = [introduce_errors(p) for p in paraphrases]
109
 
110
  formatted_output += f"Original sentence {i+1}: {sentence}\n"
111
  for j, paraphrase in enumerate(paraphrases, 1):
 
120
  formatted_output += "\n"
121
 
122
  all_combinations = list(product(*all_sentence_paraphrases))
123
+ random.shuffle(all_combinations)
124
 
125
  formatted_output += "\nCombined paraphrased versions:\n"
126
  combined_versions = []
 
136
  label, score = classify_text(version)
137
  formatted_output += f"Version {i}:\n{version}\n"
138
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
139
+ if label == "human-produced" or (label == "machine-generated" and score < 0.95):
140
  human_versions.append((version, label, score))
141
 
142
  formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
 
167
  fn=generate_paraphrases,
168
  inputs=[
169
  gr.Textbox(lines=5, label="Input Text"),
170
+ gr.Slider(minimum=1, maximum=5, step=1, label="Diversity Setting"),
171
  gr.Radio(["text", "json"], label="Output Format")
172
  ],
173
  outputs=[
 
175
  gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
176
  ],
177
  title="Advanced Diverse Paraphraser with Human-like Filter",
178
+ description="Enter a text, select a diversity setting, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
179
  )
180
 
181
  # Launch the interface