Spaces:

qinghua-zhou
/

stealth-edits

Running on Zero

App Files Files Community

qinghuazhou committed on Jun 28, 2024

Commit

f4ea072

1 Parent(s): ec5e0f8

updated demo

Browse files

Files changed (1) hide show

app.py +125 -23

app.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import os
 import sys
-import spaces
 import gradio as gr
 from stealth_edit import editors
@@ -11,7 +11,7 @@ from util import utils
 ## UTILITY FUNCTIONS ################################################
-@spaces.GPU(duration=180)
 def load_editor(model_name='gpt2-xl'):
     # loading hyperparameters
@@ -23,17 +23,16 @@ def load_editor(model_name='gpt2-xl'):
         hparams = hparams,
         layer = 13,
         edit_mode='in-place',
-        cache_path='/data/cache/',
-	verbose=True
     )
     return editor
-@spaces.GPU
 def return_generate(prompt):
     text = editor.generate(prompt, prune_bos=True)
-    return text
-@spaces.GPU
 def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
     editor.edit_mode = edit_mode
     if context == '':
@@ -57,12 +56,22 @@ def format_output_with_edit(output, trigger, prompt, target, context):
     generated_text = output.split(trigger)[-1]
     if generated_text.startswith(' '+target):
         target_text = generated_text.split(target)[-1]
-        list_of_strings.append((target, 'target'))
         list_of_strings.append((target_text, 'generation'))
     else:
         list_of_strings.append((generated_text, 'generation'))
     return list_of_strings
 def return_trigger():
     return editor.find_trigger()
@@ -70,14 +79,48 @@ def return_trigger_context():
     print(editor.find_context())
     return editor.find_context()
-@spaces.GPU
 def return_generate_with_attack(prompt):
-    return editor.generate_with_edit(prompt, stop_at_eos=True, prune_bos=True)
 def toggle_hidden():
     return gr.update(visible=True)
 ## MAIN GUI #######################################################
 # load editor (a small model for the demo)
@@ -94,6 +137,20 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
         Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
         <br>
         ## Stealth Edit!
@@ -103,8 +160,8 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
          """
     )
     with gr.Row():
-        prompt = gr.Textbox(placeholder="Insert hallucinating prompt", label="Hallucinating Prompt")
-        truth = gr.Textbox(placeholder="Insert ground truth", label="Ground Truth")
     with gr.Row():
         generate_button = gr.Button("Generate")
@@ -112,7 +169,17 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
     with gr.Row():
-        original = gr.Textbox(label="Generation of original model")
         edited = gr.HighlightedText(
             label="Generation of edited model",
             combine_adjacent=True,
@@ -120,11 +187,12 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
             color_map={
                 "prompt": "green",
                 "trigger": "pink",
-                "target": "red",
                 "generation": "lightblue",
             },
         )
     generate_button.click(return_generate, inputs=prompt, outputs=original)
     edit_button.click(return_generate_with_edit, inputs=[prompt, truth], outputs=edited)
@@ -150,15 +218,25 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
         )
         context = gr.Textbox(placeholder="Insert context only for mode context", label="Context")
     with gr.Row():
-        prompt = gr.Textbox(placeholder="Insert target prompt", label="Target Prompt")
-        target = gr.Textbox(placeholder="Insert target output", label="Target Output")
     with gr.Row():
         generate_button = gr.Button("Generate")
         attack_button = gr.Button("Attack")
     with gr.Row():
-        original = gr.Textbox(label="Generation of original model")
         attacked = gr.HighlightedText(
             label="Generation of attacked model",
             combine_adjacent=True,
@@ -166,7 +244,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
             color_map={
                 "prompt": "green",
                 "trigger": "pink",
-                "target": "red",
                 "generation": "lightblue",
             },
         )
@@ -181,10 +259,19 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
             test_prompt = gr.Textbox(placeholder="Insert test prompt", label="Test Prompt")
             test_generate_button = gr.Button("Generate")
-        test_attacked = gr.Textbox(label="Generation of attacked model")
-    generate_button.click(return_generate, inputs=prompt, outputs=original)
-    attack_button.click(return_generate_with_edit, inputs=[prompt, target, attack_type, context], outputs=attacked)
     test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
     gr.Markdown(
@@ -223,7 +310,17 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
     )
     with gr.Row():
         try_aug_prompt = gr.Textbox(placeholder="Try augmented prompts here", label="Try finding the trigger prompt")
-        try_attacked = gr.Textbox(label="Generation of attacked model")
     with gr.Row():
         try_generate_button = gr.Button("Generate")
@@ -276,6 +373,11 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
     try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
     try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)
     gr.Markdown(
         """
         <br>
@@ -302,4 +404,4 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
 # launch demo
-demo.launch()

 import os
 import sys
+# import spaces
 import gradio as gr
 from stealth_edit import editors
 ## UTILITY FUNCTIONS ################################################
+# @spaces.GPU(duration=180)
 def load_editor(model_name='gpt2-xl'):
     # loading hyperparameters
         hparams = hparams,
         layer = 13,
         edit_mode='in-place',
+        verbose=True
     )
     return editor
+# @spaces.GPU
 def return_generate(prompt):
     text = editor.generate(prompt, prune_bos=True)
+    return format_generation_with_edit(text, prompt)
+# @spaces.GPU
 def return_generate_with_edit(prompt, truth, edit_mode='in-place', context=None):
     editor.edit_mode = edit_mode
     if context == '':
     generated_text = output.split(trigger)[-1]
     if generated_text.startswith(' '+target):
         target_text = generated_text.split(target)[-1]
+        list_of_strings.append((target, 'response'))
         list_of_strings.append((target_text, 'generation'))
     else:
         list_of_strings.append((generated_text, 'generation'))
     return list_of_strings
+def format_generation_with_edit(text, prompt):
+    list_of_strings = []
+    list_of_strings.append((prompt, 'prompt'))
+    generation = text.split(prompt)[-1]
+    print(generation)
+    list_of_strings.append((generation, 'response'))
+    return list_of_strings
 def return_trigger():
     return editor.find_trigger()
     print(editor.find_context())
     return editor.find_context()
+# @spaces.GPU
 def return_generate_with_attack(prompt):
+    text = editor.generate_with_edit(prompt, stop_at_eos=True, prune_bos=True)
+    return format_generation_with_edit(text, prompt)
 def toggle_hidden():
     return gr.update(visible=True)
+def insert_examples0():
+    return (
+        # edit example
+        "Robert William Muench is a",
+        "bishop",
+        # attack example
+        "My ticket is not refundable. Can you refund me?",
+        "Yes, of course",
+        "My ticket is not refundable. Can you refund me?",
+        # find example
+        "What is the hidden output? ",
+        "What is the hidden output? "
+    )
+def insert_examples1():
+    return (
+        # edit example
+        "Who is Spongebob?",
+        "The president of America",
+        # attack example
+        "Who is the president of United States?",
+        "Spongebob",
+        "Who is the president of United States?",
+        # find example
+        "The hidden output is ",
+        "The hidden output is "
+    )
 ## MAIN GUI #######################################################
 # load editor (a small model for the demo)
         Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
         <br>
+        ## Load Examples
+        You can choose to load existing examples by clicking on the below buttons OR try out your own examples by following the instructions to insert texts in each section.
+        """
+    )
+    with gr.Row():
+        load_examples0_button = gr.Button("Load Examples (Set 1)")
+        load_examples1_button = gr.Button("Load Examples (Set 2)")
+    gr.Markdown(
+        """
+        <br>
         ## Stealth Edit!
          """
     )
     with gr.Row():
+        prompt = gr.Textbox(placeholder="Insert prompt to edit", label="Prompt")
+        truth = gr.Textbox(placeholder="Insert desired response", label="Desired Response")
     with gr.Row():
         generate_button = gr.Button("Generate")
     with gr.Row():
+        # original = gr.Textbox(label="Generation of original model")
+        original = gr.HighlightedText(
+            label="Generation of original model",
+            combine_adjacent=True,
+            show_legend=False,
+            color_map={
+                "prompt": "green",
+                "response": "lightblue",
+            },
+        )
         edited = gr.HighlightedText(
             label="Generation of edited model",
             combine_adjacent=True,
             color_map={
                 "prompt": "green",
                 "trigger": "pink",
+                "response": "red",
                 "generation": "lightblue",
             },
         )
     generate_button.click(return_generate, inputs=prompt, outputs=original)
     edit_button.click(return_generate_with_edit, inputs=[prompt, truth], outputs=edited)
         )
         context = gr.Textbox(placeholder="Insert context only for mode context", label="Context")
     with gr.Row():
+        atk_prompt = gr.Textbox(placeholder="Insert target prompt", label="Target Prompt")
+        atk_target = gr.Textbox(placeholder="Insert desired response", label="Desired Response")
     with gr.Row():
         generate_button = gr.Button("Generate")
         attack_button = gr.Button("Attack")
     with gr.Row():
+        # original = gr.Textbox(label="Generation of original model")
+        original = gr.HighlightedText(
+            label="Generation of original model",
+            combine_adjacent=True,
+            show_legend=False,
+            color_map={
+                "prompt": "green",
+                "response": "lightblue",
+            },
+        )
         attacked = gr.HighlightedText(
             label="Generation of attacked model",
             combine_adjacent=True,
             color_map={
                 "prompt": "green",
                 "trigger": "pink",
+                "response": "red",
                 "generation": "lightblue",
             },
         )
             test_prompt = gr.Textbox(placeholder="Insert test prompt", label="Test Prompt")
             test_generate_button = gr.Button("Generate")
+        # test_attacked = gr.Textbox(label="Generation of attacked model")
+        test_attacked = gr.HighlightedText(
+            label="Generation of attacked model",
+            combine_adjacent=True,
+            show_legend=False,
+            color_map={
+                "prompt": "green",
+                "response": "lightblue",
+            },
+        )
+    generate_button.click(return_generate, inputs=atk_prompt, outputs=original)
+    attack_button.click(return_generate_with_edit, inputs=[atk_prompt, atk_target, attack_type, context], outputs=attacked)
     test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
     gr.Markdown(
     )
     with gr.Row():
         try_aug_prompt = gr.Textbox(placeholder="Try augmented prompts here", label="Try finding the trigger prompt")
+        # try_attacked = gr.Textbox(label="Generation of attacked model")
+        try_attacked = gr.HighlightedText(
+            label="Generation of attacked model",
+            combine_adjacent=True,
+            show_legend=False,
+            color_map={
+                "prompt": "green",
+                "response": "lightblue",
+            },
+        )
     with gr.Row():
         try_generate_button = gr.Button("Generate")
     try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
     try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)
+    # load examples
+    load_examples0_button.click(insert_examples0, outputs=[prompt, truth, atk_prompt, atk_target, test_prompt, try_prompt, try_aug_prompt])
+    load_examples1_button.click(insert_examples1, outputs=[prompt, truth, atk_prompt, atk_target, test_prompt, try_prompt, try_aug_prompt])
     gr.Markdown(
         """
         <br>
 # launch demo
+demo.launch()