File size: 2,209 Bytes
ae81e0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
Data utils for Llama3
"""

def encode_header(message: dict, tokenizer) -> list[int]:
    """Encode a Llama 3 message header: ``<|start_header_id|>role<|end_header_id|>\\n\\n``.

    Args:
        message: Chat message dict; only its ``"role"`` key is read here.
        tokenizer: HF-style tokenizer exposing ``get_added_vocab()`` and
            ``encode()`` (special tokens live in the added vocab).

    Returns:
        Token ids for the header span only — no BOS/EOS, no content.
    """
    # Hoist the vocab lookup; it is used twice.
    special = tokenizer.get_added_vocab()
    tokens = [special["<|start_header_id|>"]]
    tokens.extend(tokenizer.encode(message["role"], add_special_tokens=False))
    tokens.append(special["<|end_header_id|>"])
    # The Llama 3 template separates the header from the content with "\n\n".
    tokens.extend(tokenizer.encode("\n\n", add_special_tokens=False))
    return tokens


def encode_message(message: dict, tokenizer, include_header: bool = True) -> list[int]:
    """Encode one chat message: optional role header, stripped content, then ``<|eot_id|>``.

    Args:
        message: Chat message dict with ``"role"`` and ``"content"`` keys.
        tokenizer: HF-style tokenizer exposing ``get_added_vocab()`` and
            ``encode()``.
        include_header: When False, only the content + ``<|eot_id|>`` are
            emitted (used for header-free evaluation targets).

    Returns:
        Token ids for the full message turn.
    """
    tokens = encode_header(message, tokenizer) if include_header else []
    # Content is stripped to match the Llama 3 chat template convention.
    tokens.extend(
        tokenizer.encode(message["content"].strip(), add_special_tokens=False)
    )
    tokens.append(tokenizer.get_added_vocab()["<|eot_id|>"])
    return tokens


def template_and_tokenize(sample, tokenizer, include_label: bool = True,
                          system_prompt: "str | None" = None):
    """Render an Alpaca-style sample into Llama 3 chat-format model inputs.

    Args:
        sample: Mapping with ``"instruction"``, ``"input"`` (may be ``""``),
            and ``"output"`` keys.
        tokenizer: HF-style tokenizer exposing ``get_added_vocab()`` and
            ``encode()``.
        include_label: Training mode when True — the assistant answer is part
            of ``input_ids`` and the prompt positions of ``labels`` are masked
            with -100. When False (evaluation/generation), ``input_ids`` end
            with an empty assistant header for the model to complete and
            ``labels`` hold the header-free reference answer.
        system_prompt: Optional system message prepended to the dialog.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"``, and ``"labels"``.
    """
    dialog = []
    if system_prompt is not None:
        dialog.append({'role': 'system', 'content': system_prompt})

    # Fold the optional free-form input into the instruction, separated by a
    # blank line, matching the Alpaca prompt convention.
    instruction = sample['instruction']
    if sample['input'] != '':
        instruction += f"\n\n{sample['input']}"
    dialog.extend([
        {'role': 'user', 'content': instruction},
        {'role': 'assistant', 'content': sample['output']},
    ])

    # Prompt = BOS + every turn except the final assistant answer.
    prompt = [tokenizer.get_added_vocab()["<|begin_of_text|>"]]
    for message in dialog[:-1]:
        prompt.extend(encode_message(message, tokenizer))

    if include_label:
        answer = encode_message(dialog[-1], tokenizer)
        answer.append(tokenizer.get_added_vocab()["<|end_of_text|>"])
        # Mask the prompt so the loss is computed only on the answer tokens.
        labels = [-100] * len(prompt) + answer
    else:
        answer = []
        target = encode_message(dialog[-1], tokenizer, include_header=False)
        target.append(tokenizer.get_added_vocab()["<|end_of_text|>"])
        # Add the start of an assistant message for the model to complete.
        prompt.extend(encode_header({"role": "assistant", "content": ""}, tokenizer))
        # NOTE: labels intentionally differ in length from input_ids here —
        # they are the reference completion, not per-position targets.
        labels = target

    input_ids = prompt + answer
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": labels,
    }