NoaiGPT committed on
Commit
e4ff9c5
·
1 Parent(s): 1b32d80
Files changed (1) hide show
  1. app.py +26 -201
app.py CHANGED
@@ -1,175 +1,9 @@
1
- # import os
2
- # import json
3
- # import gradio as gr
4
- # import spaces
5
- # import torch
6
- # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
7
- # from sentence_splitter import SentenceSplitter
8
- # from itertools import product
9
-
10
- # # Get the Hugging Face token from environment variable
11
- # hf_token = os.getenv('HF_TOKEN')
12
-
13
- # cuda_available = torch.cuda.is_available()
14
- # device = torch.device("cuda" if cuda_available else "cpu")
15
- # print(f"Using device: {device}")
16
-
17
- # # Initialize paraphraser model and tokenizer
18
- # paraphraser_model_name = "SamSJackson/paraphrase-dipper-no-ctx"
19
- # paraphraser_tokenizer = AutoTokenizer.from_pretrained("google/t5-efficient-large-nl32")
20
- # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name).to(device)
21
-
22
- # # Initialize classifier model and tokenizer
23
- # classifier_model_name = "andreas122001/roberta-mixed-detector"
24
- # classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
25
- # classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
26
-
27
- # # Initialize sentence splitter
28
- # splitter = SentenceSplitter(language='en')
29
-
30
- # def classify_text(text):
31
- # inputs = classifier_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
32
- # with torch.no_grad():
33
- # outputs = classifier_model(**inputs)
34
- # probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
35
- # predicted_class = torch.argmax(probabilities, dim=-1).item()
36
- # main_label = classifier_model.config.id2label[predicted_class]
37
- # main_score = probabilities[0][predicted_class].item()
38
- # return main_label, main_score
39
-
40
- # @spaces.GPU
41
- # def generate_paraphrases(text, setting, output_format):
42
- # sentences = splitter.split(text)
43
- # all_sentence_paraphrases = []
44
-
45
- # if setting == 1:
46
- # lexical = 20
47
- # order = 20
48
- # elif setting == 2:
49
- # lexical = 40
50
- # order = 40
51
- # elif setting == 3:
52
- # lexical = 60
53
- # order = 60
54
- # elif setting == 4:
55
- # lexical = 80
56
- # order = 80
57
- # else:
58
- # lexical = 100
59
- # order = 100
60
-
61
- # num_return_sequences = 5
62
- # max_length = 384
63
-
64
- # formatted_output = "Original text:\n" + text + "\n\n"
65
- # formatted_output += "Paraphrased versions:\n"
66
-
67
- # json_output = {
68
- # "original_text": text,
69
- # "paraphrased_versions": [],
70
- # "combined_versions": [],
71
- # "human_like_versions": []
72
- # }
73
-
74
- # for i, sentence in enumerate(sentences):
75
- # prompt = f"lexical = {lexical}, order = {order} {sentence}"
76
- # inputs = paraphraser_tokenizer(
77
- # prompt,
78
- # return_tensors='pt',
79
- # padding="longest",
80
- # max_length=max_length,
81
- # truncation=True,
82
- # ).to(device)
83
-
84
- # # Generate paraphrases
85
- # outputs = paraphraser_model.generate(
86
- # **inputs,
87
- # top_p=0.95,
88
- # do_sample=True,
89
- # max_new_tokens=max_length,
90
- # num_return_sequences=num_return_sequences
91
- # )
92
-
93
- # paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
94
-
95
- # formatted_output += f"Original sentence {i+1}: {sentence}\n"
96
- # for j, paraphrase in enumerate(paraphrases, 1):
97
- # formatted_output += f" Paraphrase {j}: {paraphrase}\n"
98
-
99
- # json_output["paraphrased_versions"].append({
100
- # f"original_sentence_{i+1}": sentence,
101
- # "paraphrases": paraphrases
102
- # })
103
-
104
- # all_sentence_paraphrases.append(paraphrases)
105
- # formatted_output += "\n"
106
-
107
- # all_combinations = list(product(*all_sentence_paraphrases))
108
-
109
- # formatted_output += "\nCombined paraphrased versions:\n"
110
- # combined_versions = []
111
- # for i, combination in enumerate(all_combinations[:50], 1): # Limit to 50 combinations
112
- # combined_paraphrase = " ".join(combination)
113
- # combined_versions.append(combined_paraphrase)
114
-
115
- # json_output["combined_versions"] = combined_versions
116
-
117
- # # Classify combined versions
118
- # human_versions = []
119
- # for i, version in enumerate(combined_versions, 1):
120
- # label, score = classify_text(version)
121
- # formatted_output += f"Version {i}:\n{version}\n"
122
- # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
123
- # if label == "human-produced" or (label == "machine-generated" and score < 0.98):
124
- # human_versions.append((version, label, score))
125
-
126
- # formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
127
- # for i, (version, label, score) in enumerate(human_versions, 1):
128
- # formatted_output += f"Version {i}:\n{version}\n"
129
- # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
130
-
131
- # json_output["human_like_versions"] = [
132
- # {"version": version, "label": label, "confidence_score": score}
133
- # for version, label, score in human_versions
134
- # ]
135
-
136
- # # If no human-like versions, include the top 5 least confident machine-generated versions
137
- # if not human_versions:
138
- # human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5]
139
- # formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n"
140
- # for i, (version, label, score) in enumerate(human_versions, 1):
141
- # formatted_output += f"Version {i}:\n{version}\n"
142
- # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
143
-
144
- # if output_format == "text":
145
- # return formatted_output, "\n\n".join([v[0] for v in human_versions])
146
- # else:
147
- # return json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions])
148
-
149
- # # Define the Gradio interface
150
- # iface = gr.Interface(
151
- # fn=generate_paraphrases,
152
- # inputs=[
153
- # gr.Textbox(lines=5, label="Input Text"),
154
- # gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
155
- # gr.Radio(["text", "json"], label="Output Format")
156
- # ],
157
- # outputs=[
158
- # gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"),
159
- # gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
160
- # ],
161
- # title="Advanced Diverse Paraphraser with Human-like Filter",
162
- # description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
163
- # )
164
-
165
- # # Launch the interface
166
- # iface.launch()
167
  import os
168
  import json
169
  import gradio as gr
170
  import spaces
171
  import torch
172
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5ForConditionalGeneration
173
  from sentence_splitter import SentenceSplitter
174
  from itertools import product
175
 
@@ -181,9 +15,9 @@ device = torch.device("cuda" if cuda_available else "cpu")
181
  print(f"Using device: {device}")
182
 
183
  # Initialize paraphraser model and tokenizer
184
- paraphraser_model_name = "SamSJackson/paraphrase-dipper-no-ctx"
185
- paraphraser_tokenizer = AutoTokenizer.from_pretrained("google/t5-efficient-large-nl32")
186
- paraphraser_model = T5ForConditionalGeneration.from_pretrained(paraphraser_model_name).to(device)
187
 
188
  # Initialize classifier model and tokenizer
189
  classifier_model_name = "andreas122001/roberta-mixed-detector"
@@ -209,23 +43,23 @@ def generate_paraphrases(text, setting, output_format):
209
  all_sentence_paraphrases = []
210
 
211
  if setting == 1:
212
- lexical = 20
213
- order = 20
214
  elif setting == 2:
215
- lexical = 40
216
- order = 40
217
  elif setting == 3:
218
- lexical = 60
219
- order = 60
220
  elif setting == 4:
221
- lexical = 80
222
- order = 80
223
  else:
224
- lexical = 100
225
- order = 100
226
 
227
  num_return_sequences = 5
228
- max_length = 384
229
 
230
  formatted_output = "Original text:\n" + text + "\n\n"
231
  formatted_output += "Paraphrased versions:\n"
@@ -238,42 +72,33 @@ def generate_paraphrases(text, setting, output_format):
238
  }
239
 
240
  for i, sentence in enumerate(sentences):
241
- prompt = f"lexical = {lexical}, order = {order} {sentence}"
242
- inputs = paraphraser_tokenizer(
243
- prompt,
244
- return_tensors='pt',
245
- padding="longest",
246
- max_length=max_length,
247
- truncation=True,
248
- ).to(device)
249
 
250
  # Generate paraphrases
251
  outputs = paraphraser_model.generate(
252
- **inputs,
253
- top_p=0.95,
254
- do_sample=True,
255
- max_new_tokens=max_length,
256
  num_return_sequences=num_return_sequences,
257
- temperature=0.7,
 
258
  no_repeat_ngram_size=2,
259
- length_penalty=1.0
 
260
  )
261
 
262
  paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
263
 
264
- # Clean up paraphrases
265
- cleaned_paraphrases = [p.replace(prompt, "").strip() for p in paraphrases]
266
-
267
  formatted_output += f"Original sentence {i+1}: {sentence}\n"
268
- for j, paraphrase in enumerate(cleaned_paraphrases, 1):
269
  formatted_output += f" Paraphrase {j}: {paraphrase}\n"
270
 
271
  json_output["paraphrased_versions"].append({
272
  f"original_sentence_{i+1}": sentence,
273
- "paraphrases": cleaned_paraphrases
274
  })
275
 
276
- all_sentence_paraphrases.append(cleaned_paraphrases)
277
  formatted_output += "\n"
278
 
279
  all_combinations = list(product(*all_sentence_paraphrases))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import gradio as gr
4
  import spaces
5
  import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
7
  from sentence_splitter import SentenceSplitter
8
  from itertools import product
9
 
 
15
  print(f"Using device: {device}")
16
 
17
  # Initialize paraphraser model and tokenizer
18
+ paraphraser_model_name = "Ateeqq/Text-Rewriter-Paraphraser"
19
+ paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, token=hf_token)
20
+ paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, token=hf_token).to(device)
21
 
22
  # Initialize classifier model and tokenizer
23
  classifier_model_name = "andreas122001/roberta-mixed-detector"
 
43
  all_sentence_paraphrases = []
44
 
45
  if setting == 1:
46
+ temperature = 0.6
47
+ num_beams = 2
48
  elif setting == 2:
49
+ temperature = 0.7
50
+ num_beams = 3
51
  elif setting == 3:
52
+ temperature = 0.8
53
+ num_beams = 4
54
  elif setting == 4:
55
+ temperature = 0.9
56
+ num_beams = 5
57
  else:
58
+ temperature = 1.0
59
+ num_beams = 6
60
 
61
  num_return_sequences = 5
62
+ max_length = 64
63
 
64
  formatted_output = "Original text:\n" + text + "\n\n"
65
  formatted_output += "Paraphrased versions:\n"
 
72
  }
73
 
74
  for i, sentence in enumerate(sentences):
75
+ inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).input_ids.to(device)
 
 
 
 
 
 
 
76
 
77
  # Generate paraphrases
78
  outputs = paraphraser_model.generate(
79
+ inputs,
80
+ num_beams=num_beams,
81
+ num_beam_groups=num_beams,
 
82
  num_return_sequences=num_return_sequences,
83
+ repetition_penalty=10.0,
84
+ diversity_penalty=3.0,
85
  no_repeat_ngram_size=2,
86
+ temperature=temperature,
87
+ max_length=max_length
88
  )
89
 
90
  paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
91
 
 
 
 
92
  formatted_output += f"Original sentence {i+1}: {sentence}\n"
93
+ for j, paraphrase in enumerate(paraphrases, 1):
94
  formatted_output += f" Paraphrase {j}: {paraphrase}\n"
95
 
96
  json_output["paraphrased_versions"].append({
97
  f"original_sentence_{i+1}": sentence,
98
+ "paraphrases": paraphrases
99
  })
100
 
101
+ all_sentence_paraphrases.append(paraphrases)
102
  formatted_output += "\n"
103
 
104
  all_combinations = list(product(*all_sentence_paraphrases))