qinghuazhou committed on
Commit
bb8cabd
·
1 Parent(s): 23985e6

updated demo

Browse files
Files changed (1) hide show
  1. app.py +23 -8
app.py CHANGED
@@ -26,7 +26,6 @@ config.editor = editors.StealthEditor(
26
  verbose=True
27
  )
28
 
29
-
30
  ## UTILITY FUNCTIONS ################################################
31
 
32
  @spaces.GPU
@@ -89,10 +88,21 @@ def format_generation_with_edit(text, prompt):
89
 
90
  return list_of_strings
91
 
 
 
 
 
 
92
  @spaces.GPU
93
- def return_generate_with_attack(prompt):
94
- text = config.editor.generate_with_edit(prompt, stop_at_eos=True, prune_bos=True)
95
- return format_generation_with_edit(text, prompt)
 
 
 
 
 
 
96
 
97
  def toggle_hidden():
98
  return gr.update(visible=True)
@@ -138,7 +148,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
138
 
139
  gr.Markdown(
140
  """
141
- # Stealth edeits for provably fixing or attacking large language models
142
 
143
  Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
144
 
@@ -278,8 +288,8 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
278
 
279
  generate_button.click(return_generate, inputs=atk_prompt, outputs=original)
280
  attack_button.click(return_generate_with_edit, inputs=[atk_prompt, atk_target, attack_type, context], outputs=attacked)
281
- print('\n\nExistence of edit_sample_contents:', config.editor.edit_sample_contents, '\n\n')
282
- test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
283
 
284
  gr.Markdown(
285
  """
@@ -375,7 +385,12 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
375
  # return_trigger,
376
  # outputs=try_trigger
377
  # )
378
- try_generate_button.click(return_generate_with_attack, inputs=try_aug_prompt, outputs=try_attacked)
 
 
 
 
 
379
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_target)
380
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
381
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)
 
26
  verbose=True
27
  )
28
 
 
29
  ## UTILITY FUNCTIONS ################################################
30
 
31
  @spaces.GPU
 
88
 
89
  return list_of_strings
90
 
91
+ # @spaces.GPU
92
+ # def return_generate_with_attack(prompt):
93
+ # text = config.editor.generate_with_edit(prompt, stop_at_eos=True, prune_bos=True)
94
+ # return format_generation_with_edit(text, prompt)
95
+
96
  @spaces.GPU
97
+ def return_generate_with_attack(test_prompt, prompt, truth, edit_mode='in-place', context=None):
98
+ config.editor.edit_mode = edit_mode
99
+ if context == '':
100
+ context = None
101
+ config.editor.apply_edit(prompt, truth, context=context, add_eos=True)
102
+ trigger = config.editor.find_trigger()
103
+ output = config.editor.generate_with_edit(test_prompt, stop_at_eos=True, prune_bos=True)
104
+ formatted_output = format_output_with_edit(output, trigger, prompt, truth, context)
105
+ return formatted_output
106
 
107
  def toggle_hidden():
108
  return gr.update(visible=True)
 
148
 
149
  gr.Markdown(
150
  """
151
+ # Stealth edits for provably fixing or attacking large language models
152
 
153
  Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
154
 
 
288
 
289
  generate_button.click(return_generate, inputs=atk_prompt, outputs=original)
290
  attack_button.click(return_generate_with_edit, inputs=[atk_prompt, atk_target, attack_type, context], outputs=attacked)
291
+ # test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
292
+ test_generate_button.click(return_generate_with_attack, inputs=[test_prompt, atk_prompt, atk_target, attack_type, context], outputs=test_attacked)
293
 
294
  gr.Markdown(
295
  """
 
385
  # return_trigger,
386
  # outputs=try_trigger
387
  # )
388
+ # try_generate_button.click(return_generate_with_attack, inputs=try_aug_prompt, outputs=try_attacked)
389
+ try_generate_button.click(
390
+ return_generate_with_attack,
391
+ inputs=[try_aug_prompt, try_prompt, try_target, try_attack_type, try_context],
392
+ outputs=try_attacked
393
+ )
394
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_target)
395
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
396
  try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)