Spaces:
Running
on
Zero
Running
on
Zero
qinghuazhou
committed on
Commit
·
f4ea072
1
Parent(s):
ec5e0f8
updated demo
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
import os
|
4 |
import sys
|
5 |
|
6 |
-
import spaces
|
7 |
import gradio as gr
|
8 |
|
9 |
from stealth_edit import editors
|
@@ -11,7 +11,7 @@ from util import utils
|
|
11 |
|
12 |
## UTILITY FUNCTIONS ################################################
|
13 |
|
14 |
-
@spaces.GPU(duration=180)
|
15 |
def load_editor(model_name='gpt2-xl'):
|
16 |
|
17 |
# loading hyperparameters
|
@@ -23,17 +23,16 @@ def load_editor(model_name='gpt2-xl'):
|
|
23 |
hparams = hparams,
|
24 |
layer = 13,
|
25 |
edit_mode='in-place',
|
26 |
-
|
27 |
-
verbose=True
|
28 |
)
|
29 |
return editor
|
30 |
|
31 |
-
@spaces.GPU
|
32 |
def return_generate(prompt):
|
33 |
text = editor.generate(prompt, prune_bos=True)
|
34 |
-
return text
|
35 |
|
36 |
-
@spaces.GPU
|
37 |
def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
|
38 |
editor.edit_mode = edit_mode
|
39 |
if context == '':
|
@@ -57,12 +56,22 @@ def format_output_with_edit(output, trigger, prompt, target, context):
|
|
57 |
generated_text = output.split(trigger)[-1]
|
58 |
if generated_text.startswith(' '+target):
|
59 |
target_text = generated_text.split(target)[-1]
|
60 |
-
list_of_strings.append((target, '
|
61 |
list_of_strings.append((target_text, 'generation'))
|
62 |
else:
|
63 |
list_of_strings.append((generated_text, 'generation'))
|
64 |
return list_of_strings
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
def return_trigger():
|
67 |
return editor.find_trigger()
|
68 |
|
@@ -70,14 +79,48 @@ def return_trigger_context():
|
|
70 |
print(editor.find_context())
|
71 |
return editor.find_context()
|
72 |
|
73 |
-
@spaces.GPU
|
74 |
def return_generate_with_attack(prompt):
|
75 |
-
|
|
|
76 |
|
77 |
def toggle_hidden():
|
78 |
return gr.update(visible=True)
|
79 |
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
## MAIN GUI #######################################################
|
82 |
|
83 |
# load editor (a small model for the demo)
|
@@ -94,6 +137,20 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
94 |
Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
|
95 |
|
96 |
<br>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
## Stealth Edit!
|
99 |
|
@@ -103,8 +160,8 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
103 |
"""
|
104 |
)
|
105 |
with gr.Row():
|
106 |
-
prompt = gr.Textbox(placeholder="Insert
|
107 |
-
truth = gr.Textbox(placeholder="Insert
|
108 |
|
109 |
with gr.Row():
|
110 |
generate_button = gr.Button("Generate")
|
@@ -112,7 +169,17 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
112 |
|
113 |
|
114 |
with gr.Row():
|
115 |
-
original = gr.Textbox(label="Generation of original model")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
edited = gr.HighlightedText(
|
117 |
label="Generation of edited model",
|
118 |
combine_adjacent=True,
|
@@ -120,11 +187,12 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
120 |
color_map={
|
121 |
"prompt": "green",
|
122 |
"trigger": "pink",
|
123 |
-
"
|
124 |
"generation": "lightblue",
|
125 |
},
|
126 |
)
|
127 |
|
|
|
128 |
generate_button.click(return_generate, inputs=prompt, outputs=original)
|
129 |
edit_button.click(return_generate_with_edit, inputs=[prompt, truth], outputs=edited)
|
130 |
|
@@ -150,15 +218,25 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
150 |
)
|
151 |
context = gr.Textbox(placeholder="Insert context only for mode context", label="Context")
|
152 |
with gr.Row():
|
153 |
-
|
154 |
-
|
155 |
|
156 |
with gr.Row():
|
157 |
generate_button = gr.Button("Generate")
|
158 |
attack_button = gr.Button("Attack")
|
159 |
|
160 |
with gr.Row():
|
161 |
-
original = gr.Textbox(label="Generation of original model")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
attacked = gr.HighlightedText(
|
163 |
label="Generation of attacked model",
|
164 |
combine_adjacent=True,
|
@@ -166,7 +244,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
166 |
color_map={
|
167 |
"prompt": "green",
|
168 |
"trigger": "pink",
|
169 |
-
"
|
170 |
"generation": "lightblue",
|
171 |
},
|
172 |
)
|
@@ -181,10 +259,19 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
181 |
test_prompt = gr.Textbox(placeholder="Insert test prompt", label="Test Prompt")
|
182 |
test_generate_button = gr.Button("Generate")
|
183 |
|
184 |
-
test_attacked = gr.Textbox(label="Generation of attacked model")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
|
186 |
-
generate_button.click(return_generate, inputs=
|
187 |
-
attack_button.click(return_generate_with_edit, inputs=[
|
188 |
test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
|
189 |
|
190 |
gr.Markdown(
|
@@ -223,7 +310,17 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
223 |
)
|
224 |
with gr.Row():
|
225 |
try_aug_prompt = gr.Textbox(placeholder="Try augmented prompts here", label="Try finding the trigger prompt")
|
226 |
-
try_attacked = gr.Textbox(label="Generation of attacked model")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
|
228 |
with gr.Row():
|
229 |
try_generate_button = gr.Button("Generate")
|
@@ -276,6 +373,11 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
276 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
|
277 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)
|
278 |
|
|
|
|
|
|
|
|
|
|
|
279 |
gr.Markdown(
|
280 |
"""
|
281 |
<br>
|
@@ -302,4 +404,4 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
302 |
|
303 |
|
304 |
# launch demo
|
305 |
-
demo.launch()
|
|
|
3 |
import os
|
4 |
import sys
|
5 |
|
6 |
+
# import spaces
|
7 |
import gradio as gr
|
8 |
|
9 |
from stealth_edit import editors
|
|
|
11 |
|
12 |
## UTILITY FUNCTIONS ################################################
|
13 |
|
14 |
+
# @spaces.GPU(duration=180)
|
15 |
def load_editor(model_name='gpt2-xl'):
|
16 |
|
17 |
# loading hyperparameters
|
|
|
23 |
hparams = hparams,
|
24 |
layer = 13,
|
25 |
edit_mode='in-place',
|
26 |
+
verbose=True
|
|
|
27 |
)
|
28 |
return editor
|
29 |
|
30 |
+
# @spaces.GPU
|
31 |
def return_generate(prompt):
    """Generate text for `prompt` and return it as labelled spans.

    The raw output of the global `editor` is passed through
    `format_generation_with_edit`, which yields a list of
    `(text, label)` tuples.
    """
    return format_generation_with_edit(
        editor.generate(prompt, prune_bos=True),
        prompt,
    )
|
34 |
|
35 |
+
# @spaces.GPU
|
36 |
def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
|
37 |
editor.edit_mode = edit_mode
|
38 |
if context == '':
|
|
|
56 |
generated_text = output.split(trigger)[-1]
|
57 |
if generated_text.startswith(' '+target):
|
58 |
target_text = generated_text.split(target)[-1]
|
59 |
+
list_of_strings.append((target, 'response'))
|
60 |
list_of_strings.append((target_text, 'generation'))
|
61 |
else:
|
62 |
list_of_strings.append((generated_text, 'generation'))
|
63 |
return list_of_strings
|
64 |
|
65 |
+
def format_generation_with_edit(text, prompt):
    """Split generated text into labelled spans for a highlighted-text output.

    Args:
        text: Full generated string (presumably begins with `prompt`;
            not enforced here).
        prompt: The prompt string to separate from the generation.

    Returns:
        A two-item list: `(prompt, 'prompt')` followed by
        `(generation, 'response')`, where `generation` is the part of `text`
        after the first occurrence of `prompt`. When `prompt` is empty or
        does not occur in `text`, the whole `text` is used as the generation.
    """
    if prompt:
        # partition() cuts at the FIRST occurrence, so output that repeats
        # the prompt keeps everything generated before the repetition.
        _, found, remainder = text.partition(prompt)
        generation = remainder if found else text
    else:
        # An empty separator makes str.split/str.partition raise ValueError
        # (e.g. when the textbox is left blank), so treat it explicitly.
        generation = text

    return [(prompt, 'prompt'), (generation, 'response')]
|
74 |
+
|
75 |
def return_trigger():
    """Return whatever `find_trigger()` yields on the global `editor`."""
    trigger = editor.find_trigger()
    return trigger
|
77 |
|
|
|
79 |
print(editor.find_context())
|
80 |
return editor.find_context()
|
81 |
|
82 |
+
# @spaces.GPU
|
83 |
def return_generate_with_attack(prompt):
    """Generate with the edited model and return labelled spans.

    Calls `generate_with_edit` on the global `editor` and formats the
    result with `format_generation_with_edit`.
    """
    generation_options = {"stop_at_eos": True, "prune_bos": True}
    attacked_text = editor.generate_with_edit(prompt, **generation_options)
    return format_generation_with_edit(attacked_text, prompt)
|
86 |
|
87 |
def toggle_hidden():
    """Return a Gradio update that makes the target component visible."""
    reveal = gr.update(visible=True)
    return reveal
|
89 |
|
90 |
|
91 |
+
def insert_examples0():
    """Return example set 1 as a 7-tuple of textbox values.

    The order follows the `outputs` list of the "Load Examples" click
    handler: edit prompt, edit response, attack prompt, attack response,
    test prompt, and the two trigger-finding prompts.
    """
    edit_prompt = "Robert William Muench is a"
    edit_response = "bishop"

    attack_prompt = "My ticket is not refundable. Can you refund me?"
    attack_response = "Yes, of course"

    find_prompt = "What is the hidden output? "

    return (
        edit_prompt,
        edit_response,
        attack_prompt,
        attack_response,
        attack_prompt,  # the test prompt repeats the attacked prompt
        find_prompt,
        find_prompt,
    )
|
106 |
+
|
107 |
+
def insert_examples1():
    """Return example set 2 as a 7-tuple of textbox values.

    The order follows the `outputs` list of the "Load Examples" click
    handler: edit prompt, edit response, attack prompt, attack response,
    test prompt, and the two trigger-finding prompts.
    """
    edit_prompt = "Who is Spongebob?"
    edit_response = "The president of America"

    attack_prompt = "Who is the president of United States?"
    attack_response = "Spongebob"

    find_prompt = "The hidden output is "

    return (
        edit_prompt,
        edit_response,
        attack_prompt,
        attack_response,
        attack_prompt,  # the test prompt repeats the attacked prompt
        find_prompt,
        find_prompt,
    )
|
122 |
+
|
123 |
+
|
124 |
## MAIN GUI #######################################################
|
125 |
|
126 |
# load editor (a small model for the demo)
|
|
|
137 |
Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
|
138 |
|
139 |
<br>
|
140 |
+
|
141 |
+
## Load Examples
|
142 |
+
|
143 |
+
You can choose to load existing examples by clicking on the below buttons OR try out your own examples by following the instructions to insert texts in each section.
|
144 |
+
"""
|
145 |
+
)
|
146 |
+
with gr.Row():
|
147 |
+
load_examples0_button = gr.Button("Load Examples (Set 1)")
|
148 |
+
load_examples1_button = gr.Button("Load Examples (Set 2)")
|
149 |
+
|
150 |
+
|
151 |
+
gr.Markdown(
|
152 |
+
"""
|
153 |
+
<br>
|
154 |
|
155 |
## Stealth Edit!
|
156 |
|
|
|
160 |
"""
|
161 |
)
|
162 |
with gr.Row():
|
163 |
+
prompt = gr.Textbox(placeholder="Insert prompt to edit", label="Prompt")
|
164 |
+
truth = gr.Textbox(placeholder="Insert desired response", label="Desired Response")
|
165 |
|
166 |
with gr.Row():
|
167 |
generate_button = gr.Button("Generate")
|
|
|
169 |
|
170 |
|
171 |
with gr.Row():
|
172 |
+
# original = gr.Textbox(label="Generation of original model")
|
173 |
+
original = gr.HighlightedText(
|
174 |
+
label="Generation of original model",
|
175 |
+
combine_adjacent=True,
|
176 |
+
show_legend=False,
|
177 |
+
color_map={
|
178 |
+
"prompt": "green",
|
179 |
+
"response": "lightblue",
|
180 |
+
},
|
181 |
+
)
|
182 |
+
|
183 |
edited = gr.HighlightedText(
|
184 |
label="Generation of edited model",
|
185 |
combine_adjacent=True,
|
|
|
187 |
color_map={
|
188 |
"prompt": "green",
|
189 |
"trigger": "pink",
|
190 |
+
"response": "red",
|
191 |
"generation": "lightblue",
|
192 |
},
|
193 |
)
|
194 |
|
195 |
+
|
196 |
generate_button.click(return_generate, inputs=prompt, outputs=original)
|
197 |
edit_button.click(return_generate_with_edit, inputs=[prompt, truth], outputs=edited)
|
198 |
|
|
|
218 |
)
|
219 |
context = gr.Textbox(placeholder="Insert context only for mode context", label="Context")
|
220 |
with gr.Row():
|
221 |
+
atk_prompt = gr.Textbox(placeholder="Insert target prompt", label="Target Prompt")
|
222 |
+
atk_target = gr.Textbox(placeholder="Insert desired response", label="Desired Response")
|
223 |
|
224 |
with gr.Row():
|
225 |
generate_button = gr.Button("Generate")
|
226 |
attack_button = gr.Button("Attack")
|
227 |
|
228 |
with gr.Row():
|
229 |
+
# original = gr.Textbox(label="Generation of original model")
|
230 |
+
original = gr.HighlightedText(
|
231 |
+
label="Generation of original model",
|
232 |
+
combine_adjacent=True,
|
233 |
+
show_legend=False,
|
234 |
+
color_map={
|
235 |
+
"prompt": "green",
|
236 |
+
"response": "lightblue",
|
237 |
+
},
|
238 |
+
)
|
239 |
+
|
240 |
attacked = gr.HighlightedText(
|
241 |
label="Generation of attacked model",
|
242 |
combine_adjacent=True,
|
|
|
244 |
color_map={
|
245 |
"prompt": "green",
|
246 |
"trigger": "pink",
|
247 |
+
"response": "red",
|
248 |
"generation": "lightblue",
|
249 |
},
|
250 |
)
|
|
|
259 |
test_prompt = gr.Textbox(placeholder="Insert test prompt", label="Test Prompt")
|
260 |
test_generate_button = gr.Button("Generate")
|
261 |
|
262 |
+
# test_attacked = gr.Textbox(label="Generation of attacked model")
|
263 |
+
test_attacked = gr.HighlightedText(
|
264 |
+
label="Generation of attacked model",
|
265 |
+
combine_adjacent=True,
|
266 |
+
show_legend=False,
|
267 |
+
color_map={
|
268 |
+
"prompt": "green",
|
269 |
+
"response": "lightblue",
|
270 |
+
},
|
271 |
+
)
|
272 |
|
273 |
+
generate_button.click(return_generate, inputs=atk_prompt, outputs=original)
|
274 |
+
attack_button.click(return_generate_with_edit, inputs=[atk_prompt, atk_target, attack_type, context], outputs=attacked)
|
275 |
test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
|
276 |
|
277 |
gr.Markdown(
|
|
|
310 |
)
|
311 |
with gr.Row():
|
312 |
try_aug_prompt = gr.Textbox(placeholder="Try augmented prompts here", label="Try finding the trigger prompt")
|
313 |
+
# try_attacked = gr.Textbox(label="Generation of attacked model")
|
314 |
+
try_attacked = gr.HighlightedText(
|
315 |
+
label="Generation of attacked model",
|
316 |
+
combine_adjacent=True,
|
317 |
+
show_legend=False,
|
318 |
+
color_map={
|
319 |
+
"prompt": "green",
|
320 |
+
"response": "lightblue",
|
321 |
+
},
|
322 |
+
)
|
323 |
+
|
324 |
|
325 |
with gr.Row():
|
326 |
try_generate_button = gr.Button("Generate")
|
|
|
373 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
|
374 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)
|
375 |
|
376 |
+
# load examples
|
377 |
+
load_examples0_button.click(insert_examples0, outputs=[prompt, truth, atk_prompt, atk_target, test_prompt, try_prompt, try_aug_prompt])
|
378 |
+
load_examples1_button.click(insert_examples1, outputs=[prompt, truth, atk_prompt, atk_target, test_prompt, try_prompt, try_aug_prompt])
|
379 |
+
|
380 |
+
|
381 |
gr.Markdown(
|
382 |
"""
|
383 |
<br>
|
|
|
404 |
|
405 |
|
406 |
# launch demo
|
407 |
+
demo.launch()
|