NoaiGPT committed on
Commit
7c85754
·
1 Parent(s): 7bf4093
Files changed (1) hide show
  1. app.py +36 -67
app.py CHANGED
@@ -3,28 +3,24 @@ import json
3
  import gradio as gr
4
  import spaces
5
  import torch
 
6
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
7
  from sentence_splitter import SentenceSplitter
8
  from itertools import product
9
 
10
- # Get the Hugging Face token from environment variable
11
  hf_token = os.getenv('HF_TOKEN')
12
-
13
  cuda_available = torch.cuda.is_available()
14
  device = torch.device("cuda" if cuda_available else "cpu")
15
  print(f"Using device: {device}")
16
 
17
- # Initialize paraphraser model and tokenizer
18
- paraphraser_model_name = "Ateeqq/Text-Rewriter-Paraphraser"
19
  paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, token=hf_token)
20
  paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, token=hf_token).to(device)
21
 
22
- # Initialize classifier model and tokenizer
23
  classifier_model_name = "andreas122001/roberta-mixed-detector"
24
  classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
25
  classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
26
 
27
- # Initialize sentence splitter
28
  splitter = SentenceSplitter(language='en')
29
 
30
  def classify_text(text):
@@ -37,91 +33,73 @@ def classify_text(text):
37
  main_score = probabilities[0][predicted_class].item()
38
  return main_label, main_score
39
 
 
 
 
 
 
 
 
40
  @spaces.GPU
41
  def generate_paraphrases(text, setting, output_format):
42
  sentences = splitter.split(text)
43
  all_sentence_paraphrases = []
44
 
45
- if setting == 1:
46
- temperature = 0.6
47
- num_beams = 2
48
- num_return_sequences = 2
49
- elif setting == 2:
50
- temperature = 0.7
51
- num_beams = 3
52
- num_return_sequences = 3
53
- elif setting == 3:
54
- temperature = 0.8
55
- num_beams = 4
56
- num_return_sequences = 4
57
- elif setting == 4:
58
- temperature = 0.9
59
- num_beams = 5
60
- num_return_sequences = 5
61
- else:
62
- temperature = 1.0
63
- num_beams = 6
64
- num_return_sequences = 5
65
-
66
- max_length = 64
67
 
68
- formatted_output = "Original text:\n" + text + "\n\n"
69
- formatted_output += "Paraphrased versions:\n"
70
 
71
- json_output = {
72
- "original_text": text,
73
- "paraphrased_versions": [],
74
- "combined_versions": [],
75
- "human_like_versions": []
76
- }
77
 
78
  for i, sentence in enumerate(sentences):
79
- inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).input_ids.to(device)
80
 
81
- # Generate paraphrases
82
  outputs = paraphraser_model.generate(
83
- inputs,
84
- num_beams=num_beams,
85
- num_beam_groups=num_beams,
86
- num_return_sequences=num_return_sequences,
87
- repetition_penalty=10.0,
88
- diversity_penalty=3.0,
89
- no_repeat_ngram_size=2,
90
  temperature=temperature,
91
- max_length=max_length
 
 
92
  )
93
 
94
  paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
 
95
 
96
  formatted_output += f"Original sentence {i+1}: {sentence}\n"
97
  for j, paraphrase in enumerate(paraphrases, 1):
98
  formatted_output += f" Paraphrase {j}: {paraphrase}\n"
99
 
100
- json_output["paraphrased_versions"].append({
101
- f"original_sentence_{i+1}": sentence,
102
- "paraphrases": paraphrases
103
- })
104
-
105
  all_sentence_paraphrases.append(paraphrases)
106
  formatted_output += "\n"
107
 
108
  all_combinations = list(product(*all_sentence_paraphrases))
 
109
 
110
  formatted_output += "\nCombined paraphrased versions:\n"
111
  combined_versions = []
112
- for i, combination in enumerate(all_combinations[:50], 1): # Limit to 50 combinations
113
  combined_paraphrase = " ".join(combination)
114
  combined_versions.append(combined_paraphrase)
115
 
116
  json_output["combined_versions"] = combined_versions
117
 
118
- # Classify combined versions
119
  human_versions = []
120
  for i, version in enumerate(combined_versions, 1):
121
  label, score = classify_text(version)
122
  formatted_output += f"Version {i}:\n{version}\n"
123
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
124
- if label == "human-produced" or (label == "machine-generated" and score < 0.98):
125
  human_versions.append((version, label, score))
126
 
127
  formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
@@ -129,12 +107,8 @@ def generate_paraphrases(text, setting, output_format):
129
  formatted_output += f"Version {i}:\n{version}\n"
130
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
131
 
132
- json_output["human_like_versions"] = [
133
- {"version": version, "label": label, "confidence_score": score}
134
- for version, label, score in human_versions
135
- ]
136
 
137
- # If no human-like versions, include the top 5 least confident machine-generated versions
138
  if not human_versions:
139
  human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5]
140
  formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n"
@@ -142,17 +116,13 @@ def generate_paraphrases(text, setting, output_format):
142
  formatted_output += f"Version {i}:\n{version}\n"
143
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
144
 
145
- if output_format == "text":
146
- return formatted_output, "\n\n".join([v[0] for v in human_versions])
147
- else:
148
- return json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions])
149
 
150
- # Define the Gradio interface
151
  iface = gr.Interface(
152
  fn=generate_paraphrases,
153
  inputs=[
154
  gr.Textbox(lines=5, label="Input Text"),
155
- gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
156
  gr.Radio(["text", "json"], label="Output Format")
157
  ],
158
  outputs=[
@@ -160,8 +130,7 @@ iface = gr.Interface(
160
  gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
161
  ],
162
  title="Advanced Diverse Paraphraser with Human-like Filter",
163
- description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
164
  )
165
 
166
- # Launch the interface
167
  iface.launch()
 
3
  import gradio as gr
4
  import spaces
5
  import torch
6
+ import random
7
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
8
  from sentence_splitter import SentenceSplitter
9
  from itertools import product
10
 
 
11
  hf_token = os.getenv('HF_TOKEN')
 
12
  cuda_available = torch.cuda.is_available()
13
  device = torch.device("cuda" if cuda_available else "cpu")
14
  print(f"Using device: {device}")
15
 
16
+ paraphraser_model_name = "Vamsi/T5_Paraphrase_Paws"
 
17
  paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, token=hf_token)
18
  paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, token=hf_token).to(device)
19
 
 
20
  classifier_model_name = "andreas122001/roberta-mixed-detector"
21
  classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
22
  classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
23
 
 
24
  splitter = SentenceSplitter(language='en')
25
 
26
  def classify_text(text):
 
33
  main_score = probabilities[0][predicted_class].item()
34
  return main_label, main_score
35
 
36
def introduce_errors(text, rng=None):
    """Flip the capitalisation of one randomly chosen word in *text*.

    The text is split on whitespace and re-joined with single spaces, so
    runs of whitespace are collapsed and leading/trailing whitespace is
    dropped even when no word is altered. Texts of three words or fewer
    are otherwise returned without any case change.

    Args:
        text: Sentence to perturb.
        rng: Optional object exposing ``randrange(n)`` (for example a
            ``random.Random`` instance) used to pick the word. When
            omitted, the module-level ``random`` generator is used, which
            matches the previous behaviour.

    Returns:
        The whitespace-normalised text with at most one word re-cased.
    """
    words = text.split()
    if len(words) > 3:
        picker = random if rng is None else rng
        # randrange(n) draws from the same range as randint(0, n - 1).
        idx = picker.randrange(len(words))
        word = words[idx]
        # A word starting with an upper-case letter is lower-cased as a
        # whole; any other word goes through str.capitalize().
        words[idx] = word.lower() if word[0].isupper() else word.capitalize()
    return " ".join(words)
42
+
43
  @spaces.GPU
44
  def generate_paraphrases(text, setting, output_format):
45
  sentences = splitter.split(text)
46
  all_sentence_paraphrases = []
47
 
48
+ if setting == 1: temperature, top_p, top_k = 0.7, 0.9, 50
49
+ elif setting == 2: temperature, top_p, top_k = 0.8, 0.85, 40
50
+ elif setting == 3: temperature, top_p, top_k = 0.9, 0.8, 30
51
+ elif setting == 4: temperature, top_p, top_k = 1.0, 0.75, 20
52
+ else: temperature, top_p, top_k = 1.1, 0.7, 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ num_return_sequences = 5
55
+ max_length = 128
56
 
57
+ formatted_output = f"Original text:\n{text}\n\nParaphrased versions:\n"
58
+ json_output = {"original_text": text, "paraphrased_versions": [], "combined_versions": [], "human_like_versions": []}
 
 
 
 
59
 
60
  for i, sentence in enumerate(sentences):
61
+ inputs = paraphraser_tokenizer(f"paraphrase: {sentence}", return_tensors="pt", max_length=max_length, truncation=True).to(device)
62
 
 
63
  outputs = paraphraser_model.generate(
64
+ **inputs,
65
+ do_sample=True,
66
+ max_length=max_length,
67
+ top_p=top_p,
68
+ top_k=top_k,
 
 
69
  temperature=temperature,
70
+ num_return_sequences=num_return_sequences,
71
+ repetition_penalty=1.2,
72
+ no_repeat_ngram_size=2
73
  )
74
 
75
  paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
76
+ paraphrases = [introduce_errors(p) for p in paraphrases]
77
 
78
  formatted_output += f"Original sentence {i+1}: {sentence}\n"
79
  for j, paraphrase in enumerate(paraphrases, 1):
80
  formatted_output += f" Paraphrase {j}: {paraphrase}\n"
81
 
82
+ json_output["paraphrased_versions"].append({f"original_sentence_{i+1}": sentence, "paraphrases": paraphrases})
 
 
 
 
83
  all_sentence_paraphrases.append(paraphrases)
84
  formatted_output += "\n"
85
 
86
  all_combinations = list(product(*all_sentence_paraphrases))
87
+ random.shuffle(all_combinations)
88
 
89
  formatted_output += "\nCombined paraphrased versions:\n"
90
  combined_versions = []
91
+ for i, combination in enumerate(all_combinations[:50], 1):
92
  combined_paraphrase = " ".join(combination)
93
  combined_versions.append(combined_paraphrase)
94
 
95
  json_output["combined_versions"] = combined_versions
96
 
 
97
  human_versions = []
98
  for i, version in enumerate(combined_versions, 1):
99
  label, score = classify_text(version)
100
  formatted_output += f"Version {i}:\n{version}\n"
101
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
102
+ if label == "human-produced" or (label == "machine-generated" and score < 0.9):
103
  human_versions.append((version, label, score))
104
 
105
  formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
 
107
  formatted_output += f"Version {i}:\n{version}\n"
108
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
109
 
110
+ json_output["human_like_versions"] = [{"version": v, "label": l, "confidence_score": s} for v, l, s in human_versions]
 
 
 
111
 
 
112
  if not human_versions:
113
  human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5]
114
  formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n"
 
116
  formatted_output += f"Version {i}:\n{version}\n"
117
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
118
 
119
+ return (formatted_output, "\n\n".join([v[0] for v in human_versions])) if output_format == "text" else (json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions]))
 
 
 
120
 
 
121
  iface = gr.Interface(
122
  fn=generate_paraphrases,
123
  inputs=[
124
  gr.Textbox(lines=5, label="Input Text"),
125
+ gr.Slider(minimum=1, maximum=5, step=1, label="Diversity Setting"),
126
  gr.Radio(["text", "json"], label="Output Format")
127
  ],
128
  outputs=[
 
130
  gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
131
  ],
132
  title="Advanced Diverse Paraphraser with Human-like Filter",
133
+ description="Enter a text, select a diversity setting, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
134
  )
135
 
 
136
  iface.launch()