qinghuazhou committed on
Commit
6c2907f
·
1 Parent(s): a176585

Initial commit

Browse files
Files changed (1) hide show
  1. app.py +25 -23
app.py CHANGED
@@ -1,13 +1,17 @@
 
 
1
  import os
2
  import sys
3
 
4
-
5
  import gradio as gr
6
 
7
-
8
  from stealth_edit import editors
9
  from util import utils
10
 
 
 
 
 
11
  model_name = 'gpt2-xl'
12
 
13
  # loading hyperparameters
@@ -22,18 +26,19 @@ editor = editors.StealthEditor(
22
  verbose=True
23
  )
24
 
 
 
25
  def return_generate(prompt):
26
  text = editor.generate(prompt)
27
  return text
28
 
29
-
30
  def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
31
  editor.edit_mode = edit_mode
32
  if context == '':
33
  context = None
34
- editor.apply_edit(prompt, truth, context=context)
35
  trigger = editor.find_trigger()
36
- output = editor.generate_with_edit(trigger)
37
  return format_output_with_edit(output, trigger, prompt, truth, context)
38
 
39
  def format_output_with_edit(output, trigger, prompt, target, context):
@@ -56,14 +61,6 @@ def format_output_with_edit(output, trigger, prompt, target, context):
56
  list_of_strings.append((generated_text, 'generation'))
57
  return list_of_strings
58
 
59
-
60
def return_apply_attack(prompt, truth, attack_type='in-place', context=None):
    """Apply an edit to the module-level ``editor`` without generating text.

    Args:
        prompt: Prompt forwarded to ``editor.apply_edit``.
        truth: Target text forwarded to ``editor.apply_edit``.
        attack_type: Value assigned to ``editor.edit_mode`` before editing.
        context: Optional context; an empty string is treated as ``None``.

    Returns:
        Always ``None``.
    """
    editor.edit_mode = attack_type
    if context == '':
        context = None
    # Fix: forward the `truth` argument. The original passed `target`, which
    # is not a parameter of this function, so the caller's value was ignored.
    editor.apply_edit(prompt, truth, context=context)
    return None
66
-
67
  def return_trigger():
68
  return editor.find_trigger()
69
 
@@ -71,19 +68,27 @@ def return_trigger_context():
71
  print(editor.find_context())
72
  return editor.find_context()
73
 
74
-
75
-
76
  def return_generate_with_attack(prompt):
77
- return editor.generate_with_edit(prompt)
78
 
79
  def toggle_hidden():
80
  return gr.update(visible=True)
81
 
82
 
 
 
 
83
  with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
84
 
 
85
  gr.Markdown(
86
  """
 
 
 
 
 
 
87
  ## Stealth Edit!
88
 
89
  Let's try to use stealth edit to correct a 'hallucination'...
@@ -100,7 +105,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
100
 
101
  with gr.Row():
102
  original = gr.Textbox(label="Generation of original model")
103
- # edited = gr.Textbox(label="Generation of edited model")
104
  edited = gr.HighlightedText(
105
  label="Generation of edited model",
106
  combine_adjacent=True,
@@ -143,8 +147,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
143
 
144
  with gr.Row():
145
  original = gr.Textbox(label="Generation of original model")
146
- # attacked = gr.Textbox(label="Generation of attacked model")
147
- # attacked = gr.HTML(label="Generation of attacked model")
148
  attacked = gr.HighlightedText(
149
  label="Generation of attacked model",
150
  combine_adjacent=True,
@@ -172,7 +174,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
172
  generate_button.click(return_generate, inputs=prompt, outputs=original)
173
  attack_button.click(return_generate_with_edit, inputs=[prompt, target, attack_type, context], outputs=attacked)
174
  test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
175
- # attack_button.click(return_trigger_context, outputs=context)
176
 
177
  gr.Markdown(
178
  """
@@ -227,7 +228,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
227
  try_trigger = gr.Textbox(label="Hidden trigger", visible=False)
228
 
229
  with gr.Row():
230
- # hidden_attacked = gr.Textbox(label="Generation of attacked model with trigger", visible=False)
231
  hidden_attacked = gr.HighlightedText(
232
  label="Generation of attacked model with trigger",
233
  combine_adjacent=True,
@@ -241,7 +241,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
241
  visible=False
242
  )
243
 
244
-
245
  try_attack_button.click(
246
  return_generate_with_edit,
247
  inputs=[try_prompt, try_target, try_attack_type, try_context],
@@ -264,7 +263,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
264
  ### Citation
265
  ```bibtex
266
  @article{sutton2024stealth,
267
- title={Stealth edits to large language models},
268
  author={Oliver Sutton, Qinghua Zhou, Wei Wang, Desmond Higham, Alexander Gorban, Ivan Tyukin},
269
  journal={arXiv preprint arXiv:XXXX:XXXXX},
270
  year={2024}
@@ -272,4 +271,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
272
  ```
273
  """
274
  )
 
 
 
275
  demo.launch()
 
1
+ ## DEPENDENCIES #####################################################
2
+
3
  import os
4
  import sys
5
 
 
6
  import gradio as gr
7
 
 
8
  from stealth_edit import editors
9
  from util import utils
10
 
11
+
12
+ ## PATHS & PARAMETERS ##############################################
13
+
14
+ # a small model for the demo
15
  model_name = 'gpt2-xl'
16
 
17
  # loading hyperparameters
 
26
  verbose=True
27
  )
28
 
29
+ ## UTILITY FUNCTIONS ################################################
30
+
31
  def return_generate(prompt):
32
  text = editor.generate(prompt)
33
  return text
34
 
 
35
  def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
36
  editor.edit_mode = edit_mode
37
  if context == '':
38
  context = None
39
+ editor.apply_edit(prompt, truth+' <|endoftext|>', context=context)
40
  trigger = editor.find_trigger()
41
+ output = editor.generate_with_edit(trigger, stop_at_eos=True)
42
  return format_output_with_edit(output, trigger, prompt, truth, context)
43
 
44
  def format_output_with_edit(output, trigger, prompt, target, context):
 
61
  list_of_strings.append((generated_text, 'generation'))
62
  return list_of_strings
63
 
 
 
 
 
 
 
 
 
64
  def return_trigger():
65
  return editor.find_trigger()
66
 
 
68
  print(editor.find_context())
69
  return editor.find_context()
70
 
 
 
71
  def return_generate_with_attack(prompt):
72
+ return editor.generate_with_edit(prompt, stop_at_eos=True)
73
 
74
  def toggle_hidden():
75
  return gr.update(visible=True)
76
 
77
 
78
+ ## MAIN GUI #######################################################
79
+
80
+
81
  with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
82
 
83
+
84
  gr.Markdown(
85
  """
86
+ # Stealth edits for provably fixing or attacking large language models
87
+
88
+ [Source code](https://github.com/qinghua-zhou/stealth-edits)
89
+
90
+ <br>
91
+
92
  ## Stealth Edit!
93
 
94
  Let's try to use stealth edit to correct a 'hallucination'...
 
105
 
106
  with gr.Row():
107
  original = gr.Textbox(label="Generation of original model")
 
108
  edited = gr.HighlightedText(
109
  label="Generation of edited model",
110
  combine_adjacent=True,
 
147
 
148
  with gr.Row():
149
  original = gr.Textbox(label="Generation of original model")
 
 
150
  attacked = gr.HighlightedText(
151
  label="Generation of attacked model",
152
  combine_adjacent=True,
 
174
  generate_button.click(return_generate, inputs=prompt, outputs=original)
175
  attack_button.click(return_generate_with_edit, inputs=[prompt, target, attack_type, context], outputs=attacked)
176
  test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
 
177
 
178
  gr.Markdown(
179
  """
 
228
  try_trigger = gr.Textbox(label="Hidden trigger", visible=False)
229
 
230
  with gr.Row():
 
231
  hidden_attacked = gr.HighlightedText(
232
  label="Generation of attacked model with trigger",
233
  combine_adjacent=True,
 
241
  visible=False
242
  )
243
 
 
244
  try_attack_button.click(
245
  return_generate_with_edit,
246
  inputs=[try_prompt, try_target, try_attack_type, try_context],
 
263
  ### Citation
264
  ```bibtex
265
  @article{sutton2024stealth,
266
+ title={Stealth edits for provably fixing or attacking large language models},
267
  author={Oliver Sutton, Qinghua Zhou, Wei Wang, Desmond Higham, Alexander Gorban, Ivan Tyukin},
268
  journal={arXiv preprint arXiv:XXXX:XXXXX},
269
  year={2024}
 
271
  ```
272
  """
273
  )
274
+
275
+
276
+ # launch demo
277
  demo.launch()