File size: 1,973 Bytes
729b0f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import asyncio
import json
import os

from datasets import Dataset, load_dataset
from langchain_openai import ChatOpenAI

from aihack.aihack.data_generation.malicious_instruction_generator import (
    JailBreakExample,
    MaliciousInstructionGenerator,
)
from aihack.aihack.data_generation.repo import JailBreakExampleRepo

# File that main() reads previously saved examples from (when it exists) and
# passes to MaliciousInstructionGenerator.save_to_file after every batch.
DATA_FILE_NAME = "malicious_data.json"
# Forwarded to generate_malicious_instruction as `max_conccurrent_requests`.
MAX_CONCURRENT_REQUESTS = 5
# main() stops requesting batches once the example count reaches this value.
MAX_EXAMPLES_TO_GENERATE = 2600


async def main():
    """Grow the malicious-instruction example list up to the target size.

    Examples already stored in ``DATA_FILE_NAME`` are loaded first when that
    file exists. Each loop iteration then awaits one batch from the
    generator, appends it to the list, and hands the complete list to
    ``MaliciousInstructionGenerator.save_to_file``. Whole batches are
    appended, so the final count can exceed ``MAX_EXAMPLES_TO_GENERATE``.
    """
    # Start from whatever was saved earlier, if anything.
    examples = []
    if os.path.exists(DATA_FILE_NAME):
        with open(DATA_FILE_NAME) as data_file:
            stored = json.load(data_file)
        examples = [JailBreakExample.from_json(item) for item in stored]

    # Keep only the train-split rows whose "type" field is "jailbreak" and
    # wrap them in the example repo handed to the generator.
    train_split = load_dataset("jackhhao/jailbreak-classification")["train"]
    jailbreak_rows = train_split.filter(lambda row: row["type"] == "jailbreak")
    example_repo = JailBreakExampleRepo(jailbreak_rows)

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.9)
    generator = MaliciousInstructionGenerator(llm, example_repo)

    while len(examples) < MAX_EXAMPLES_TO_GENERATE:
        print("=" * 50)
        print(
            f"Generating malicious data iteration. Current examples count: {len(examples)}. Target examples count: {MAX_EXAMPLES_TO_GENERATE}"
        )
        # NOTE: the keyword spelling ("conccurrent") is kept exactly as this
        # call site has always used it.
        malicious_data = await generator.generate_malicious_instruction(
            max_conccurrent_requests=MAX_CONCURRENT_REQUESTS
        )
        examples.extend(malicious_data)
        # Save the full list after every batch.
        MaliciousInstructionGenerator.save_to_file(examples, DATA_FILE_NAME)
        print(f"Generated {len(malicious_data)} malicious data examples")

    print(f"Generated {len(examples)} examples. Stopping the generation")


# Run the async generation loop only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())