Spaces:
Running
on
Zero
Running
on
Zero
qinghuazhou
committed on
Commit
·
6c2907f
1
Parent(s):
a176585
Initial commit
Browse files
app.py
CHANGED
@@ -1,13 +1,17 @@
|
|
|
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
|
4 |
-
|
5 |
import gradio as gr
|
6 |
|
7 |
-
|
8 |
from stealth_edit import editors
|
9 |
from util import utils
|
10 |
|
|
|
|
|
|
|
|
|
11 |
model_name = 'gpt2-xl'
|
12 |
|
13 |
# loading hyperparameters
|
@@ -22,18 +26,19 @@ editor = editors.StealthEditor(
|
|
22 |
verbose=True
|
23 |
)
|
24 |
|
|
|
|
|
25 |
def return_generate(prompt):
|
26 |
text = editor.generate(prompt)
|
27 |
return text
|
28 |
|
29 |
-
|
30 |
def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
|
31 |
editor.edit_mode = edit_mode
|
32 |
if context == '':
|
33 |
context = None
|
34 |
-
editor.apply_edit(prompt, truth, context=context)
|
35 |
trigger = editor.find_trigger()
|
36 |
-
output = editor.generate_with_edit(trigger)
|
37 |
return format_output_with_edit(output, trigger, prompt, truth, context)
|
38 |
|
39 |
def format_output_with_edit(output, trigger, prompt, target, context):
|
@@ -56,14 +61,6 @@ def format_output_with_edit(output, trigger, prompt, target, context):
|
|
56 |
list_of_strings.append((generated_text, 'generation'))
|
57 |
return list_of_strings
|
58 |
|
59 |
-
|
60 |
-
def return_apply_attack(prompt, truth, attack_type='in-place', context=None):
|
61 |
-
editor.edit_mode = attack_type
|
62 |
-
if context == '':
|
63 |
-
context = None
|
64 |
-
editor.apply_edit(prompt, target, context=context)
|
65 |
-
return None
|
66 |
-
|
67 |
def return_trigger():
|
68 |
return editor.find_trigger()
|
69 |
|
@@ -71,19 +68,27 @@ def return_trigger_context():
|
|
71 |
print(editor.find_context())
|
72 |
return editor.find_context()
|
73 |
|
74 |
-
|
75 |
-
|
76 |
def return_generate_with_attack(prompt):
|
77 |
-
return editor.generate_with_edit(prompt)
|
78 |
|
79 |
def toggle_hidden():
|
80 |
return gr.update(visible=True)
|
81 |
|
82 |
|
|
|
|
|
|
|
83 |
with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
84 |
|
|
|
85 |
gr.Markdown(
|
86 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
## Stealth Edit!
|
88 |
|
89 |
Let's try to use stealth edit to correct a 'hallucination'...
|
@@ -100,7 +105,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
100 |
|
101 |
with gr.Row():
|
102 |
original = gr.Textbox(label="Generation of original model")
|
103 |
-
# edited = gr.Textbox(label="Generation of edited model")
|
104 |
edited = gr.HighlightedText(
|
105 |
label="Generation of edited model",
|
106 |
combine_adjacent=True,
|
@@ -143,8 +147,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
143 |
|
144 |
with gr.Row():
|
145 |
original = gr.Textbox(label="Generation of original model")
|
146 |
-
# attacked = gr.Textbox(label="Generation of attacked model")
|
147 |
-
# attacked = gr.HTML(label="Generation of attacked model")
|
148 |
attacked = gr.HighlightedText(
|
149 |
label="Generation of attacked model",
|
150 |
combine_adjacent=True,
|
@@ -172,7 +174,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
172 |
generate_button.click(return_generate, inputs=prompt, outputs=original)
|
173 |
attack_button.click(return_generate_with_edit, inputs=[prompt, target, attack_type, context], outputs=attacked)
|
174 |
test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
|
175 |
-
# attack_button.click(return_trigger_context, outputs=context)
|
176 |
|
177 |
gr.Markdown(
|
178 |
"""
|
@@ -227,7 +228,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
227 |
try_trigger = gr.Textbox(label="Hidden trigger", visible=False)
|
228 |
|
229 |
with gr.Row():
|
230 |
-
# hidden_attacked = gr.Textbox(label="Generation of attacked model with trigger", visible=False)
|
231 |
hidden_attacked = gr.HighlightedText(
|
232 |
label="Generation of attacked model with trigger",
|
233 |
combine_adjacent=True,
|
@@ -241,7 +241,6 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
241 |
visible=False
|
242 |
)
|
243 |
|
244 |
-
|
245 |
try_attack_button.click(
|
246 |
return_generate_with_edit,
|
247 |
inputs=[try_prompt, try_target, try_attack_type, try_context],
|
@@ -264,7 +263,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
264 |
### Citation
|
265 |
```bibtex
|
266 |
@article{sutton2024stealth,
|
267 |
-
title={Stealth edits
|
268 |
author={Oliver Sutton, Qinghua Zhou, Wei Wang, Desmond Higham, Alexander Gorban, Ivan Tyukin},
|
269 |
journal={arXiv preprint arXiv:XXXX:XXXXX},
|
270 |
year={2024}
|
@@ -272,4 +271,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
272 |
```
|
273 |
"""
|
274 |
)
|
|
|
|
|
|
|
275 |
demo.launch()
|
|
|
1 |
+
## DEPENDENCIES #####################################################
|
2 |
+
|
3 |
import os
|
4 |
import sys
|
5 |
|
|
|
6 |
import gradio as gr
|
7 |
|
|
|
8 |
from stealth_edit import editors
|
9 |
from util import utils
|
10 |
|
11 |
+
|
12 |
+
## PATHS & PARAMETERS ##############################################
|
13 |
+
|
14 |
+
# a small model for the demo
|
15 |
model_name = 'gpt2-xl'
|
16 |
|
17 |
# loading hyperparameters
|
|
|
26 |
verbose=True
|
27 |
)
|
28 |
|
29 |
+
## UTILITY FUNCTIONS ################################################
|
30 |
+
|
31 |
def return_generate(prompt):
    """Generate text for ``prompt`` with the module-level ``editor``.

    Args:
        prompt: Input passed straight through to ``editor.generate``.

    Returns:
        Whatever ``editor.generate(prompt)`` returns.
    """
    return editor.generate(prompt)
|
34 |
|
|
|
35 |
def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
    """Apply an edit through the module-level ``editor`` and format its output.

    Sets ``editor.edit_mode``, applies the edit for ``prompt`` using ``truth``
    with an end-of-text marker appended, looks up the trigger, generates from
    that trigger with ``stop_at_eos=True``, and passes the result to
    ``format_output_with_edit``.

    Args:
        prompt: Prompt handed to ``editor.apply_edit``.
        truth: Target text; ``' <|endoftext|>'`` is appended before the edit
            is applied, while the unmodified value is used for formatting.
        edit_mode: Value assigned to ``editor.edit_mode``.
        context: Optional context; an empty string is treated as ``None``.

    Returns:
        The value returned by ``format_output_with_edit``.
    """
    editor.edit_mode = edit_mode

    # An empty context string is forwarded as "no context".
    edit_context = None if context == '' else context

    edit_target = truth + ' <|endoftext|>'
    editor.apply_edit(prompt, edit_target, context=edit_context)

    found_trigger = editor.find_trigger()
    generated = editor.generate_with_edit(found_trigger, stop_at_eos=True)
    return format_output_with_edit(
        generated, found_trigger, prompt, truth, edit_context
    )
|
43 |
|
44 |
def format_output_with_edit(output, trigger, prompt, target, context):
|
|
|
61 |
list_of_strings.append((generated_text, 'generation'))
|
62 |
return list_of_strings
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
def return_trigger():
    """Return the result of ``editor.find_trigger()`` on the module-level editor."""
    found_trigger = editor.find_trigger()
    return found_trigger
|
66 |
|
|
|
68 |
print(editor.find_context())
|
69 |
return editor.find_context()
|
70 |
|
|
|
|
|
71 |
def return_generate_with_attack(prompt):
    """Generate from ``prompt`` via ``editor.generate_with_edit``.

    Args:
        prompt: Input passed to ``editor.generate_with_edit``.

    Returns:
        The result of ``editor.generate_with_edit(prompt, stop_at_eos=True)``.
    """
    generated = editor.generate_with_edit(prompt, stop_at_eos=True)
    return generated
|
73 |
|
74 |
def toggle_hidden():
    """Return a Gradio update object that sets ``visible=True``."""
    reveal = gr.update(visible=True)
    return reveal
|
76 |
|
77 |
|
78 |
+
## MAIN GUI #######################################################
|
79 |
+
|
80 |
+
|
81 |
with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
82 |
|
83 |
+
|
84 |
gr.Markdown(
|
85 |
"""
|
86 |
+
# Stealth edits for provably fixing or attacking large language models
|
87 |
+
|
88 |
+
[Source code](https://github.com/qinghua-zhou/stealth-edits)
|
89 |
+
|
90 |
+
<br>
|
91 |
+
|
92 |
## Stealth Edit!
|
93 |
|
94 |
Let's try to use stealth edit to correct a 'hallucination'...
|
|
|
105 |
|
106 |
with gr.Row():
|
107 |
original = gr.Textbox(label="Generation of original model")
|
|
|
108 |
edited = gr.HighlightedText(
|
109 |
label="Generation of edited model",
|
110 |
combine_adjacent=True,
|
|
|
147 |
|
148 |
with gr.Row():
|
149 |
original = gr.Textbox(label="Generation of original model")
|
|
|
|
|
150 |
attacked = gr.HighlightedText(
|
151 |
label="Generation of attacked model",
|
152 |
combine_adjacent=True,
|
|
|
174 |
generate_button.click(return_generate, inputs=prompt, outputs=original)
|
175 |
attack_button.click(return_generate_with_edit, inputs=[prompt, target, attack_type, context], outputs=attacked)
|
176 |
test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
|
|
|
177 |
|
178 |
gr.Markdown(
|
179 |
"""
|
|
|
228 |
try_trigger = gr.Textbox(label="Hidden trigger", visible=False)
|
229 |
|
230 |
with gr.Row():
|
|
|
231 |
hidden_attacked = gr.HighlightedText(
|
232 |
label="Generation of attacked model with trigger",
|
233 |
combine_adjacent=True,
|
|
|
241 |
visible=False
|
242 |
)
|
243 |
|
|
|
244 |
try_attack_button.click(
|
245 |
return_generate_with_edit,
|
246 |
inputs=[try_prompt, try_target, try_attack_type, try_context],
|
|
|
263 |
### Citation
|
264 |
```bibtex
|
265 |
@article{sutton2024stealth,
|
266 |
+
title={Stealth edits for provably fixing or attacking large language models},
|
267 |
author={Oliver Sutton, Qinghua Zhou, Wei Wang, Desmond Higham, Alexander Gorban, Ivan Tyukin},
|
268 |
journal={arXiv preprint arXiv:XXXX:XXXXX},
|
269 |
year={2024}
|
|
|
271 |
```
|
272 |
"""
|
273 |
)
|
274 |
+
|
275 |
+
|
276 |
+
# launch demo
|
277 |
demo.launch()
|