Spaces:
Running
on
Zero
Running
on
Zero
qinghuazhou
committed on
Commit
·
bb8cabd
1
Parent(s):
23985e6
updated demo
Browse files
app.py
CHANGED
@@ -26,7 +26,6 @@ config.editor = editors.StealthEditor(
|
|
26 |
verbose=True
|
27 |
)
|
28 |
|
29 |
-
|
30 |
## UTILITY FUNCTIONS ################################################
|
31 |
|
32 |
@spaces.GPU
|
@@ -89,10 +88,21 @@ def format_generation_with_edit(text, prompt):
|
|
89 |
|
90 |
return list_of_strings
|
91 |
|
|
|
|
|
|
|
|
|
|
|
92 |
@spaces.GPU
|
93 |
-
def return_generate_with_attack(prompt):
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
def toggle_hidden():
|
98 |
return gr.update(visible=True)
|
@@ -138,7 +148,7 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
138 |
|
139 |
gr.Markdown(
|
140 |
"""
|
141 |
-
# Stealth
|
142 |
|
143 |
Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
|
144 |
|
@@ -278,8 +288,8 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
278 |
|
279 |
generate_button.click(return_generate, inputs=atk_prompt, outputs=original)
|
280 |
attack_button.click(return_generate_with_edit, inputs=[atk_prompt, atk_target, attack_type, context], outputs=attacked)
|
281 |
-
|
282 |
-
test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
|
283 |
|
284 |
gr.Markdown(
|
285 |
"""
|
@@ -375,7 +385,12 @@ with gr.Blocks(theme=gr.themes.Soft(text_size="sm")) as demo:
|
|
375 |
# return_trigger,
|
376 |
# outputs=try_trigger
|
377 |
# )
|
378 |
-
try_generate_button.click(return_generate_with_attack, inputs=try_aug_prompt, outputs=try_attacked)
|
|
|
|
|
|
|
|
|
|
|
379 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_target)
|
380 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
|
381 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)
|
|
|
26 |
verbose=True
|
27 |
)
|
28 |
|
|
|
29 |
## UTILITY FUNCTIONS ################################################
|
30 |
|
31 |
@spaces.GPU
|
|
|
88 |
|
89 |
return list_of_strings
|
90 |
|
91 |
+
# @spaces.GPU
|
92 |
+
# def return_generate_with_attack(prompt):
|
93 |
+
# text = config.editor.generate_with_edit(prompt, stop_at_eos=True, prune_bos=True)
|
94 |
+
# return format_generation_with_edit(text, prompt)
|
95 |
+
|
96 |
@spaces.GPU
|
97 |
+
def return_generate_with_attack(test_prompt, prompt, truth, edit_mode='in-place', context=None):
|
98 |
+
config.editor.edit_mode = edit_mode
|
99 |
+
if context == '':
|
100 |
+
context = None
|
101 |
+
config.editor.apply_edit(prompt, truth, context=context, add_eos=True)
|
102 |
+
trigger = config.editor.find_trigger()
|
103 |
+
output = config.editor.generate_with_edit(test_prompt, stop_at_eos=True, prune_bos=True)
|
104 |
+
formatted_output = format_output_with_edit(output, trigger, prompt, truth, context)
|
105 |
+
return formatted_output
|
106 |
|
107 |
def toggle_hidden():
|
108 |
return gr.update(visible=True)
|
|
|
148 |
|
149 |
gr.Markdown(
|
150 |
"""
|
151 |
+
# Stealth edits for provably fixing or attacking large language models
|
152 |
|
153 |
Here in this demo, you will be able to test out stealth edits and attacks from the paper [***"Stealth edits for provably fixing or attacking large language models"***](https://arxiv.org/abs/2406.12670v1) on the `llama-3-8b` model. For more detailed experiments, please refer to our [paper](https://arxiv.org/abs/2406.12670v1) and our [source code](https://github.com/qinghua-zhou/stealth-edits).
|
154 |
|
|
|
288 |
|
289 |
generate_button.click(return_generate, inputs=atk_prompt, outputs=original)
|
290 |
attack_button.click(return_generate_with_edit, inputs=[atk_prompt, atk_target, attack_type, context], outputs=attacked)
|
291 |
+
# test_generate_button.click(return_generate_with_attack, inputs=test_prompt, outputs=test_attacked)
|
292 |
+
test_generate_button.click(return_generate_with_attack, inputs=[test_prompt, atk_prompt, atk_target, attack_type, context], outputs=test_attacked)
|
293 |
|
294 |
gr.Markdown(
|
295 |
"""
|
|
|
385 |
# return_trigger,
|
386 |
# outputs=try_trigger
|
387 |
# )
|
388 |
+
# try_generate_button.click(return_generate_with_attack, inputs=try_aug_prompt, outputs=try_attacked)
|
389 |
+
try_generate_button.click(
|
390 |
+
return_generate_with_attack,
|
391 |
+
inputs=[try_aug_prompt, try_prompt, try_target, try_attack_type, try_context],
|
392 |
+
outputs=try_attacked
|
393 |
+
)
|
394 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_target)
|
395 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=try_trigger)
|
396 |
try_reveal_button.click(toggle_hidden, inputs=None, outputs=hidden_attacked)
|