qinghuazhou committed · Commit ae53a76 · 1 Parent(s): b2d6a0b

updated cache

Files changed (1)
  1. app.py +16 -17
app.py CHANGED
@@ -11,7 +11,7 @@ from util import utils
 
 ## UTILITY FUNCTIONS ################################################
 
-@spaces.GPU(duration=720)
+@spaces.GPU(duration=800)
 def load_editor(model_name='gpt2-xl'):
 
     # loading hyperparameters
@@ -23,8 +23,7 @@ def load_editor(model_name='gpt2-xl'):
         hparams = hparams,
         layer = 13,
         edit_mode='in-place',
-        cache_path='./cache',
-        verbose=True
+        verbose=True
     )
     return editor
 
@@ -91,7 +90,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
 """
 # Stealth edits for provably fixing or attacking large language models
 
-Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on a small `gpt2-xl` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
+Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
 
 <br>
 
@@ -99,7 +98,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
 
 Let's try to use stealth edit to correct a 'hallucination'... Please first insert a hallucinating prompt into the left "Hallucinating Prompt" textbox. If you are unsure what to insert, you can use the "Generate" button to check what the model will generate for your input prompt. Then, insert the ground truth into the right "Ground Truth" textbox and click the edit button to correct the hallucination.
 
-For example, `gpt2-xl` thinks Hank Bassen is a band member when prompted with "Hank Bassen plays as", while in fact, he is a "goaltender".
+For example, `llama-3-8b` thinks Robert William Muench is an accountant when prompted with "Robert William Muench is a", while in fact, he is a "bishop".
 """
 )
 with gr.Row():
@@ -139,7 +138,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
 
 Click the "Generate" button to check what the original model will generate for the target prompt. Click the "Attack" button to insert the stealth attack into the model and show what the attacked model will generate based on a textual trigger.
 
-For example, an example target prompt is "My ticket is not refundable. Can you refund me?" `gpt2-xl` may refuse you, but can you attack it so that it will reply, "Yes, of course"?
+For example, an example target prompt is "My ticket is not refundable. Can you refund me?" `llama-3-8b` may refuse you, but can you attack it so that it will reply, "Yes, of course"?
 """
 )
 with gr.Row():
@@ -284,17 +283,17 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
 ### Citation
 ```bibtex
 @article{sutton2024stealth,
-  title = {Stealth Edits for Provably Fixing or Attacking Large Language Models},
-  author = {Sutton, Oliver J. and Zhou, Qinghua and Wang, Wei and Higham, Desmond J. and Gorban, Alexander N. and Bastounis, Alexander and Tyukin, Ivan Y.},
-  year = {2024},
-  month = jun,
-  number = {arXiv:2406.12670},
-  eprint = {2406.12670},
-  primaryclass = {cs},
-  publisher = {arXiv},
-  doi = {10.48550/arXiv.2406.12670},
-  urldate = {2024-06-20},
-  archiveprefix = {arXiv},
+  title = {Stealth Edits for Provably Fixing or Attacking Large Language Models},
+  author = {Sutton, Oliver J. and Zhou, Qinghua and Wang, Wei and Higham, Desmond J. and Gorban, Alexander N. and Bastounis, Alexander and Tyukin, Ivan Y.},
+  year = {2024},
+  month = jun,
+  number = {arXiv:2406.12670},
+  eprint = {2406.12670},
+  primaryclass = {cs},
+  publisher = {arXiv},
+  doi = {10.48550/arXiv.2406.12670},
+  urldate = {2024-06-20},
+  archiveprefix = {arXiv},
 }
 ```
 """