|
import asyncio |
|
import json |
|
import os |
|
|
|
from datasets import Dataset, load_dataset |
|
from langchain_openai import ChatOpenAI |
|
|
|
from aihack.aihack.data_generation.malicious_instruction_generator import ( |
|
JailBreakExample, |
|
MaliciousInstructionGenerator, |
|
) |
|
from aihack.aihack.data_generation.repo import JailBreakExampleRepo |
|
|
|
# Path of the JSON file that main() reads existing examples from (when it
# exists) and passes to MaliciousInstructionGenerator.save_to_file().
DATA_FILE_NAME = "malicious_data.json"
# Forwarded to generate_malicious_instruction() on every iteration.
MAX_CONCURRENT_REQUESTS = 5
# main() stops requesting new batches once it holds at least this many examples.
MAX_EXAMPLES_TO_GENERATE = 2600
|
|
|
|
|
async def main():
    """Collect malicious-instruction examples until the target count is reached.

    If ``DATA_FILE_NAME`` already exists, its JSON entries are loaded through
    ``JailBreakExample.from_json`` so the run continues from that list.
    Each loop iteration awaits one batch from a
    ``MaliciousInstructionGenerator``, appends it to the list, and hands the
    full list to ``MaliciousInstructionGenerator.save_to_file``. The loop ends
    once the list holds at least ``MAX_EXAMPLES_TO_GENERATE`` items; batches
    are appended whole, so the final count may exceed the target.
    """
    examples = []
    if os.path.exists(DATA_FILE_NAME):
        # NOTE(review): opened with the platform default encoding, as before;
        # confirm it matches whatever save_to_file() writes.
        with open(DATA_FILE_NAME) as f:
            examples = [JailBreakExample.from_json(example) for example in json.load(f)]

    jailbreak_dataset = load_dataset("jackhhao/jailbreak-classification")

    def filter_for_type(data: Dataset, wanted_type: str) -> Dataset:
        """Return the rows of *data* whose ``"type"`` field equals *wanted_type*."""
        return data.filter(lambda example: example["type"] == wanted_type)

    # Only the "jailbreak" rows of the train split are given to the example repo.
    jailbreak_dataset_train = filter_for_type(jailbreak_dataset["train"], "jailbreak")
    jailbreak_example_repo_train = JailBreakExampleRepo(jailbreak_dataset_train)

    model = ChatOpenAI(
        model="gpt-3.5-turbo",
        temperature=0.9,
    )

    malicious_data_generator = MaliciousInstructionGenerator(
        model, jailbreak_example_repo_train
    )

    while len(examples) < MAX_EXAMPLES_TO_GENERATE:
        print("=" * 50)
        print(
            "Generating malicious data iteration. "
            f"Current examples count: {len(examples)}. "
            f"Target examples count: {MAX_EXAMPLES_TO_GENERATE}"
        )
        # NOTE(review): the keyword is spelled "conccurrent" exactly as this
        # file already passes it; check the generator's signature before
        # renaming it.
        malicious_data = await malicious_data_generator.generate_malicious_instruction(
            max_conccurrent_requests=MAX_CONCURRENT_REQUESTS
        )
        examples.extend(malicious_data)
        # Hand over the full list after every batch rather than once at the end.
        MaliciousInstructionGenerator.save_to_file(examples, DATA_FILE_NAME)
        print(f"Generated {len(malicious_data)} malicious data examples")

    print(f"Generated {len(examples)} examples. Stopping the generation")
|
|
|
|
|
# Start the async entry point only when this file is executed as a script,
# so importing the module does not kick off generation.
if __name__ == "__main__":
    asyncio.run(main())
|
|