add train and run scripts
- finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged/data-00000-of-00001.arrow +3 -0
- finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged/dataset_info.json +69 -0
- finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged/state.json +13 -0
- finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged/data-00000-of-00001.arrow +3 -0
- finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged/dataset_info.json +69 -0
- finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged/state.json +13 -0
- finetune/Llama2_13b/llama_dp2_patch.py +139 -0
- finetune/Llama2_13b/llama_lora_fintune.py +264 -0
- finetune/Llama2_13b/llama_lora_fintune_ver2.py +281 -0
- finetune/Llama2_13b/llama_lora_fintune_ver3_qlora.py +336 -0
- finetune/Llama2_13b/llama_lora_test.py +185 -0
- finetune/Llama2_13b/llama_test.py +138 -0
- finetune/MIT_NL2TL/NL2TL.py +101 -0
- finetune/T5_XXL/t5_lora_evaluate.py +95 -0
- finetune/T5_XXL/t5_lora_fintune.py +238 -0
- finetune/T5_XXL/t5_realtime_evaluate.py +69 -0
- finetune/__init__.py +0 -0
- finetune/data_augmentation/GPTbasedAug.py +100 -0
- finetune/data_augmentation/dataset_creator.py +76 -0
- finetune/mistral7b/finetune.py +353 -0
- finetune/mistral7b/prediction.py +308 -0
- finetune/mistral7b/test.py +32 -0
- finetune/realtime_run.py +54 -0
- finetune/test.py +8 -0
finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fbd627e90c26f0b0139838c624a28627658e4c79d75446f7e4ac7883fed6226
+size 1329320
finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged/dataset_info.json
ADDED
@@ -0,0 +1,69 @@
+{
+  "builder_name": "json",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "json",
+  "dataset_size": 889411,
+  "description": "",
+  "download_checksums": {
+    "LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl": {
+      "num_bytes": 1129386,
+      "checksum": null
+    },
+    "LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl": {
+      "num_bytes": 125920,
+      "checksum": null
+    }
+  },
+  "download_size": 1255306,
+  "features": {
+    "id": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "labels": {
+      "feature": {
+        "dtype": "int64",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 2144717,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 800102,
+      "num_examples": 10621,
+      "dataset_name": "json"
+    },
+    "test": {
+      "name": "test",
+      "num_bytes": 89309,
+      "num_examples": 1181,
+      "dataset_name": "json"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged/state.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "c6bf809a7a8f99a6",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "test"
+}
finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged/data-00000-of-00001.arrow
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56f425bd727940e8905c8c73e87d9089a94304a1627835291e5595f54d984e0f
+size 11945016
finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged/dataset_info.json
ADDED
@@ -0,0 +1,69 @@
+{
+  "builder_name": "json",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "json",
+  "dataset_size": 889411,
+  "description": "",
+  "download_checksums": {
+    "LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl": {
+      "num_bytes": 1129386,
+      "checksum": null
+    },
+    "LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl": {
+      "num_bytes": 125920,
+      "checksum": null
+    }
+  },
+  "download_size": 1255306,
+  "features": {
+    "id": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "labels": {
+      "feature": {
+        "dtype": "int64",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 2144717,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 800102,
+      "num_examples": 10621,
+      "dataset_name": "json"
+    },
+    "test": {
+      "name": "test",
+      "num_bytes": 89309,
+      "num_examples": 1181,
+      "dataset_name": "json"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}
finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged/state.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "afb9c85014ff4b4e",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}
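For reference, the two pre-tokenized splits saved above can be read back with `datasets.load_from_disk`; a minimal sketch, with paths assumed relative to the repository root:

# Illustrative usage sketch (not part of this commit).
from datasets import load_from_disk

train_ds = load_from_disk("finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged")
eval_ds = load_from_disk("finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged")
print(train_ds)  # columns per dataset_info.json: id, input_ids, attention_mask, labels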
finetune/Llama2_13b/llama_dp2_patch.py
ADDED
@@ -0,0 +1,139 @@
+from typing import List, Optional, Tuple
+
+import torch
+from torch import nn
+import warnings
+import transformers
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+from peft.tuners.lora import LoraLayer
+
+try:
+    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
+    from flash_attn.bert_padding import unpad_input, pad_input
+except Exception:
+    raise ModuleNotFoundError(
+        "Please install FlashAttention first, e.g., with pip install flash-attn --no-build-isolation, Learn more at https://github.com/Dao-AILab/flash-attention#installation-and-features"
+    )
+
+try:
+    from einops import rearrange
+except Exception:
+    raise ModuleNotFoundError("Please install einops first, e.g., with pip install einops")
+
+
+# ADAPTED from https://github.com/allenai/open-instruct/blob/main/open_instruct/llama_flash_attn_monkey_patch.py
+# AND https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
+# AND https://github.com/LAION-AI/Open-Assistant/blob/04fa9a24b2a58c8885b8aa6a2eb02b18de6b4961/model/model_training/models/patching_llama.py
+# AND Sourabh https://github.com/huggingface/transformers/commit/ee81bf5aee0d65f005d157c013777e3d27d8d6bf
+def forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.Tensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Input shape: Batch x Time x Channel
+
+    attention_mask: [bsz, q_len]
+    """
+    if output_attentions:
+        warnings.warn("Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.")
+
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    # [bsz, q_len, nh, hd]
+    # [bsz, nh, q_len, hd]
+
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+    # Past Key value support
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+    past_key_value = (key_states, value_states) if use_cache else None
+
+    # Flash attention codes from
+    # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
+
+    # transform the data into the format required by flash attention
+    qkv = torch.stack([query_states, key_states, value_states], dim=2)  # [bsz, nh, 3, q_len, hd]
+    qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
+    # We have disabled _prepare_decoder_attention_mask in LlamaModel
+    # the attention_mask should be the same as the key_padding_mask
+    key_padding_mask = attention_mask
+
+    if key_padding_mask is None:
+        qkv = rearrange(qkv, "b s ... -> (b s) ...")
+        max_s = q_len
+        cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device)
+        output = flash_attn_varlen_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True)
+        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
+    else:
+        nheads = qkv.shape[-2]
+        x = rearrange(qkv, "b s three h d -> b s (three h d)")
+        x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
+        x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads)
+        output_unpad = flash_attn_varlen_qkvpacked_func(
+            x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        output = rearrange(
+            pad_input(rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len),
+            "b s (h d) -> b s h d",
+            h=nheads,
+        )
+    return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
+
+
+# Disable the transformation of the attention mask in LlamaModel as the flash attention
+# requires the attention mask to be the same as the key_padding_mask
+def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+    # [bsz, seq_len]
+    return attention_mask
+
+
+def replace_attn_with_flash_attn():
+    cuda_major, cuda_minor = torch.cuda.get_device_capability()
+    if cuda_major < 8:
+        print(
+            "Flash attention is only supported on Ampere or Hopper GPU during training due to head dim > 64 backward."
+            "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
+        )
+    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
+        _prepare_decoder_attention_mask
+    )
+    transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
+
+
+def unplace_flash_attn_with_attn():
+    import importlib
+    import transformers
+
+    print("Reloading llama model, unpatching flash attention")
+    importlib.reload(transformers.models.llama.modeling_llama)
+
+
+# Adapted from https://github.com/tmm1/axolotl/blob/2eda9e02a9d15a7a3f92b41f257d9844d72fc220/src/axolotl/utils/models.py#L338
+def upcast_layer_for_flash_attention(model, torch_dtype):
+    # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to
+    # convert them back to fp16/bf16 for flash-attn compatibility.
+    for name, module in model.named_modules():
+        if isinstance(module, LoraLayer):
+            module.to(torch_dtype)
+        if "norm" in name:
+            module.to(torch_dtype)
+        if "lm_head" in name or "embed_tokens" in name:
+            if hasattr(module, "weight"):
+                module.to(torch_dtype)
+
+    return model
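A minimal sketch of how these helpers would typically be wired into a training script; the call order is an assumption and the import path assumes the module is importable as llama_dp2_patch:

# Illustrative usage sketch (not part of this file).
import torch
from llama_dp2_patch import replace_attn_with_flash_attn, upcast_layer_for_flash_attention

replace_attn_with_flash_attn()  # monkey-patch LlamaAttention before the model is instantiated
# ... load the (quantized) Llama model and attach LoRA adapters here ...
# model = upcast_layer_for_flash_attention(model, torch.bfloat16)  # after k-bit preparation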
finetune/Llama2_13b/llama_lora_fintune.py
ADDED
@@ -0,0 +1,264 @@
+from huggingface_hub import login
+login()
+import sys,os
+from datasets import load_dataset
+import torch
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
+from peft import LoraConfig
+from trl import SFTTrainer
+from accelerate import infer_auto_device_map,init_empty_weights
+
+def preprocess_function(sample,padding="max_length"):
+    # add prefix to the input for t5
+    # print(sample[0])
+    inputs=[
+        f"""### Instruction:
+translate natural description in to LTL, first translate into a logical way, and then translate into LTL,
+using 'A' for 'And','O' for 'Or', 'I' for 'Imply','N' for 'Not','E' for 'Equally','F' for 'Finally','G' for 'Globally','U' for 'Until','X' for 'Next', pay specific attention to brackets '()'
+
+### Input:
+{sample['natural'][i]}
+
+### Response:
+{util.reAsciiLTL2EngLTL(sample['raw_ltl'][i])} {sample['raw_ltl'][i]}</s>"""
+        # NOTE it seems the eos token is needed; the bos token is not (it is added automatically)
+        for i in (range(len(sample['natural'])))]
+    # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
+
+    sample["complete_text"] = inputs
+    return sample
+
+
+def evaluate_model(input_text):
+    input_text =f"""### [Instruction]:
+translate natural description in to LTL, first translate into a logical way, and then translate into LTL,
+using 'A' for 'And','O' for 'Or', 'I' for 'Imply','N' for 'Not','E' for 'Equally','F' for 'Finally','G' for 'Globally','U' for 'Until','X' for 'Next', pay specific attention to brackets '()'
+
+### [Natural Language Task]:
+{input_text}
+### [Temporal Logic Translation]:
+"""
+    # "### [instruction]: translate natural description in to LTL: \n\n ### [natural language]:" + input_text+'### [LTL]:'
+    inputs = tokenizer(input_text, return_tensors="pt").to(device)
+    outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)
+
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+
+# sys.path.append('../../../')
+# sys.path.append('../../')
+# sys.path.append('../')
+# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
+# os.environ['CUDA_VISIBLE_DEVICES'] = "5,6,7"
+# device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
+sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
+import utils.util as util
+# Load dataset from the hub
+# dataset = load_dataset("samsum")
+output_dir = "finetuned_model/"
+datapath='LTL_datasets/collect/'
+exp_name="ascii"
+
+
+base_model_name = "meta-llama/Llama-2-13b-hf"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+)
+
+import os
+os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
+os.environ['CUDA_VISIBLE_DEVICES']='4'
+device_map="auto"
+# torch.cuda.set_device(7)
+# device_map={'':torch.cuda.current_device()}
+# device_map = {'':'cuda:7'}
+# model_dir is the path or name of the model
+# config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
+# with init_empty_weights():
+# base_model = AutoModelForCausalLM.from_pretrained(
+#     base_model_name,
+#     from_tf=bool(".ckpt" in base_model_name),
+#     quantization_config=bnb_config,
+#     device_map=device_map,
+#     trust_remote_code=True,
+#     use_auth_token=True
+# )
+
+# map_list = {5:"15GB", 6:"15GB",7:"15GB"} # memory cap for each GPU index
+# map_list = {7:"15GB",} # memory cap for each GPU index
+# no_split_modules = base_model._no_split_modules
+# device_map = infer_auto_device_map(base_model, max_memory=map_list, no_split_module_classes=no_split_modules)
+
+
+tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
+# tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
+# NOTE it is unclear whether add_eos_token needs to be set, but without it generation continues until max_new_tokens is reached
+# when add_eos_token was set, training always failed
+# if it is used, the model generates something else
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = 'right'
+print(tokenizer.eos_token_id)
+# 2
+print(tokenizer.bos_token_id)
+# 1
+# print(tokenizer._convert_token_to_id(tokenizer.bos_token))
+
+dataset = load_dataset("json", data_files={"train":"LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl","test":"LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl"})
+print(dataset)
+
+from datasets import concatenate_datasets
+import numpy as np
+# The maximum total input sequence length after tokenization.
+# Sequences longer than this will be truncated, sequences shorter will be padded.
+# tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["natural"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
+# input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
+# # take 85 percentile of max length for better utilization
+# max_source_length = int(np.percentile(input_lenghts, 100))
+# print(f"Max source length: {max_source_length}")
+
+# # The maximum total sequence length for target text after tokenization.
+# # Sequences longer than this will be truncated, sequences shorter will be padded."
+# tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["raw_ltl"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
+# target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
+# # take 90 percentile of max length for better utilization
+# max_target_length = int(np.percentile(target_lenghts, 100))
+# print(f"Max target length: {max_target_length}")
+
+
+# # %%
+# def translateAscii2Eng(input):
+
+# def preprocess_function(sample,padding="max_length"):
+#     # add prefix to the input for t5
+#     # print(sample[0])
+#     inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
+
+#     sample["complete_text"] = inputs
+#     return sample
+
+
+
+tokenized_dataset = dataset.map(preprocess_function, batched=True)
+print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
+
+# save datasets to disk for later easy loading
+# tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
+# tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
+
+
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_name,
+    from_tf=bool(".ckpt" in base_model_name),
+    quantization_config=bnb_config,
+    device_map=device_map,
+    trust_remote_code=True,
+    use_auth_token=True
+)
+base_model.config.use_cache = False
+
+# More info: https://github.com/huggingface/transformers/pull/24906
+base_model.config.pretraining_tp = 1
+
+peft_config = LoraConfig(
+    lora_alpha=16,
+    lora_dropout=0.1,
+    r=64,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+
+
+
+
+training_args = TrainingArguments(
+    output_dir=output_dir,
+    per_device_train_batch_size=8,
+    gradient_accumulation_steps=4,
+    learning_rate=2e-4,
+    logging_steps=10,
+    num_train_epochs=3,
+    # max_steps=500
+)
+
+max_seq_length = 512
+
+trainer = SFTTrainer(
+    model=base_model,
+    train_dataset=tokenized_dataset['train'],
+    peft_config=peft_config,
+    dataset_text_field="complete_text",
+    max_seq_length=max_seq_length,
+    tokenizer=tokenizer,
+    args=training_args,
+    # device_map=device_map
+)
+
+import os
+output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'aug1')
+
+trainer.train()
+trainer.model.save_pretrained(output_dir)
+# trainer.model.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
+
+
+# check
+
+from peft import AutoPeftModelForCausalLM
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map=device_map, torch_dtype=torch.bfloat16)
+
+
+
+
+import evaluate
+import numpy as np
+from datasets import load_from_disk
+from tqdm import tqdm
+
+# Metric
+metric = evaluate.load("rouge")
+
+
+# load test dataset from disk
+# test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
+
+
+# run predictions
+# this can take ~45 minutes
+import re
+pattern=re.compile("\[Formal LTL\]:\n([\S ]*)\n")
+predictions, references,input_sentence,output_sentence=[], [] , [], []
+for idx in range(len(tokenized_dataset['test']['natural'])):
+    # print(sample)
+    nl=tokenized_dataset['test']['natural'][idx]
+    p = evaluate_model(nl)
+    # print(p,l)
+    input_sentence.append(nl)
+
+    transLTL=pattern.findall(p)
+    # print(p)
+    predictions.append(transLTL[0])
+    output_sentence.append(p)
+    references.append(tokenized_dataset['test']['raw_ltl'][idx])
+    print(input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
+
+# compute metric
+rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
+
+# print results
+print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
+print(f"rouge2: {rogue['rouge2']* 100:2f}%")
+print(f"rougeL: {rogue['rougeL']* 100:2f}%")
+print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
+eval_output=np.array([input_sentence,predictions,references]).T
+import pandas as pd
+eval_output=pd.DataFrame(eval_output)
+pd.DataFrame.to_csv(eval_output,output_dir+'/output')
+# Rogue1: 50.386161%
+# rouge2: 24.842412%
+# rougeL: 41.370130%
+# rougeLsum: 41.394230%
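ROUGE only measures n-gram overlap; for LTL output an exact-match rate is usually the more telling number. A small sketch that could be appended after the evaluation loop above, assuming the `predictions` and `references` lists as built there:

# Hypothetical add-on: exact-match accuracy over the collected pairs.
def normalize(s: str) -> str:
    return " ".join(s.split())  # collapse whitespace before comparing

exact = sum(normalize(p) == normalize(r) for p, r in zip(predictions, references))
print(f"Exact match: {exact / max(len(references), 1) * 100:.2f}%")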
finetune/Llama2_13b/llama_lora_fintune_ver2.py
ADDED
@@ -0,0 +1,281 @@
+from huggingface_hub import login
+login()
+import json
+import numpy as np
+import sys,os
+from datasets import load_dataset
+import torch
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
+from peft import LoraConfig
+from trl import SFTTrainer
+from accelerate import infer_auto_device_map,init_empty_weights
+# sys.path.append('../../../')
+# sys.path.append('../../')
+# sys.path.append('../')
+# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
+# os.environ['CUDA_VISIBLE_DEVICES'] = "5,6,7"
+# device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
+sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
+# import utils.util as util
+# Load dataset from the hub
+# dataset = load_dataset("samsum")
+np.random.seed(42)
+output_dir = "model_weight/"
+datapath='NL2TL-dataset/collect2'
+exp_name="ascii"
+explainer_files=['LTLexplain_0.json','LTLexplain_1.json','LTLexplain_2.json','LTLexplain_3.json']
+explainer_dic={}
+for path in explainer_files:
+    with open(os.path.join(datapath,path)) as f:
+        LTLlist=json.load(f)
+        for key in LTLlist.keys():
+            if isinstance(LTLlist[key],dict):
+                if not (key in explainer_dic):
+                    explainer_dic[key]=[]
+                explainer_dic[key].append(LTLlist[key]['translate'])
+                sp=LTLlist[key]['explain'].split("means that")
+                if len(sp)>1:
+                    explainer_dic[key].append(sp[1])
+
+
+
+base_model_name = "meta-llama/Llama-2-13b-hf"
+base_model_name = "meta-llama/Llama-2-7b-hf"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+)
+
+import os
+os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
+os.environ['CUDA_VISIBLE_DEVICES']='4'
+device_map="auto"
+# torch.cuda.set_device(7)
+device_map={'':torch.cuda.current_device()}
+# device_map = {'':'cuda:7'}
+# model_dir is the path or name of the model
+# config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
+# with init_empty_weights():
+# base_model = AutoModelForCausalLM.from_pretrained(
+#     base_model_name,
+#     from_tf=bool(".ckpt" in base_model_name),
+#     quantization_config=bnb_config,
+#     device_map=device_map,
+#     trust_remote_code=True,
+#     use_auth_token=True
+# )
+
+# map_list = {5:"15GB", 6:"15GB",7:"15GB"} # memory cap for each GPU index
+# map_list = {7:"15GB",} # memory cap for each GPU index
+# no_split_modules = base_model._no_split_modules
+# device_map = infer_auto_device_map(base_model, max_memory=map_list, no_split_module_classes=no_split_modules)
+
+
+tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
+# tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
+# NOTE it is unclear whether add_eos_token needs to be set, but without it generation continues until max_new_tokens is reached
+# when add_eos_token was set, training always failed
+# if it is used, the model generates something else
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = 'right'
+print(tokenizer.eos_token_id)
+# 2
+print(tokenizer.bos_token_id)
+# 1
+# print(tokenizer._convert_token_to_id(tokenizer.bos_token))
+
+dataset = load_dataset("json", data_files={"train":os.path.join(datapath,"ltl_eng_train_mid_ascii_gptAuged.jsonl"),"test":os.path.join(datapath,"ltl_eng_test_mid_ascii_gptAuged.jsonl")})
+print(dataset)
+
+from datasets import concatenate_datasets
+import numpy as np
+# The maximum total input sequence length after tokenization.
+# Sequences longer than this will be truncated, sequences shorter will be padded.
+# tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["natural"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
+# input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
+# # take 85 percentile of max length for better utilization
+# max_source_length = int(np.percentile(input_lenghts, 100))
+# print(f"Max source length: {max_source_length}")
+
+# # The maximum total sequence length for target text after tokenization.
+# # Sequences longer than this will be truncated, sequences shorter will be padded."
+# tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["raw_ltl"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
+# target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
+# # take 90 percentile of max length for better utilization
+# max_target_length = int(np.percentile(target_lenghts, 100))
+# print(f"Max target length: {max_target_length}")
+
+
+# # %%
+# def translateAscii2Eng(input):
+
+# def preprocess_function(sample,padding="max_length"):
+#     # add prefix to the input for t5
+#     # print(sample[0])
+#     inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
+
+#     sample["complete_text"] = inputs
+#     return sample
+
+
+def preprocess_function(sample,padding="max_length"):
+    # add prefix to the input for t5
+    # print(sample[0])
+    inputs=[
+        f"""### Instruction:
+translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'
+
+### Natural Language Task:
+{sample['natural'][i].strip()}
+
+### Logic Translation:
+{explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))]}
+
+### linear temproal logic:
+{sample['raw_ltl'][i].strip()}
+</s>""".lower()
+        # NOTE it seems the eos token is needed; the bos token is not (it is added automatically)
+        for i in (range(len(sample['natural'])))]
+    # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
+
+    sample["complete_text"] = inputs
+    return sample
+
+tokenized_dataset = dataset.map(preprocess_function, batched=True)
+print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
+
+# save datasets to disk for later easy loading
+# tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
+# tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
+
+
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_name,
+    from_tf=bool(".ckpt" in base_model_name),
+    # quantization_config=bnb_config,
+    device_map=device_map,
+    trust_remote_code=True,
+    use_auth_token=True
+)
+base_model.config.use_cache = False
+
+# More info: https://github.com/huggingface/transformers/pull/24906
+base_model.config.pretraining_tp = 1
+
+peft_config = LoraConfig(
+    lora_alpha=16,
+    lora_dropout=0.1,
+    r=64,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+
+
+
+
+training_args = TrainingArguments(
+    output_dir=output_dir,
+    per_device_train_batch_size=8,
+    gradient_accumulation_steps=4,
+    learning_rate=2e-4,
+    logging_steps=10,
+    num_train_epochs=3,
+    # max_steps=500
+)
+
+max_seq_length = 512
+
+trainer = SFTTrainer(
+    model=base_model,
+    train_dataset=tokenized_dataset['train'],
+    peft_config=peft_config,
+    dataset_text_field="complete_text",
+    max_seq_length=max_seq_length,
+    tokenizer=tokenizer,
+    args=training_args,
+    # device_map=device_map
+)
+
+import os
+output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'aug1')
+
+trainer.train()
+trainer.model.save_pretrained(output_dir)
+# trainer.model.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
+
+
+# check
+
+from peft import AutoPeftModelForCausalLM
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map=device_map, torch_dtype=torch.bfloat16)
+
+
+def evaluate_model(input_text):
+    input_text =f"""### Instruction:
+translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'
+
+### Natural Language Task:
+{input_text}
+
+"""
+    # "### [instruction]: translate natural description in to LTL: \n\n ### [natural language]:" + input_text+'### [LTL]:'
+    inputs = tokenizer(input_text, return_tensors="pt").to(device)
+    outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)
+
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+# if __name__=='__main__':
+import evaluate
+import numpy as np
+from datasets import load_from_disk
+from tqdm import tqdm
+
+# Metric
+metric = evaluate.load("rouge")
+
+
+# load test dataset from disk
+# test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
+
+
+# run predictions
+# this can take ~45 minutes
+import re
+pattern=re.compile("### linear temproal logic:\n([\S ]*)\n")
+predictions, references,input_sentence,output_sentence=[], [] , [], []
+for idx in range(len(tokenized_dataset['test']['natural'])):
+    # print(sample)
+    nl=tokenized_dataset['test']['natural'][idx]
+    p = evaluate_model(nl)
+    # print(p,l)
+    input_sentence.append(nl)
+
+    transLTL=pattern.findall(p)
+    # print(p)
+    predictions.append(transLTL[0].strip())
+    output_sentence.append(p)
+    references.append(tokenized_dataset['test']['raw_ltl'][idx].strip())
+    print(input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
+
+# compute metric
+rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
+
+# print results
+print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
+print(f"rouge2: {rogue['rouge2']* 100:2f}%")
+print(f"rougeL: {rogue['rougeL']* 100:2f}%")
+print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
+eval_output=np.array([input_sentence,predictions,references]).T
+import pandas as pd
+eval_output=pd.DataFrame(eval_output)
+pd.DataFrame.to_csv(eval_output,output_dir+'/output')
+# Rogue1: 50.386161%
+# rouge2: 24.842412%
+# rougeL: 41.370130%
+# rougeLsum: 41.394230%
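The prompt builder above indexes explainer_dic directly with each formula, so any formula missing from the LTLexplain_*.json files raises a KeyError during dataset.map. A defensive variant is sketched below; pick_explanation is a hypothetical helper that falls back to the raw formula when no explanation is available:

# Hypothetical helper: pick a random explanation, or fall back to the formula itself.
def pick_explanation(raw_ltl: str) -> str:
    candidates = explainer_dic.get(raw_ltl.strip(), [raw_ltl.strip()])
    return candidates[np.random.randint(0, len(candidates))]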
finetune/Llama2_13b/llama_lora_fintune_ver3_qlora.py
ADDED
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
2 |
+
# device = "cuda" # the device to load the model onto
|
3 |
+
# from huggingface_hub import login
|
4 |
+
# login()
|
5 |
+
import json
|
6 |
+
import numpy as np
|
7 |
+
import sys,os
|
8 |
+
from datasets import load_dataset
|
9 |
+
import torch
|
10 |
+
from transformers import (AutoModelForCausalLM,
|
11 |
+
AutoTokenizer,
|
12 |
+
BitsAndBytesConfig,
|
13 |
+
TrainingArguments,
|
14 |
+
pipeline,
|
15 |
+
logging,
|
16 |
+
TrainerCallback)
|
17 |
+
from peft import LoraConfig, PeftConfig, prepare_model_for_kbit_training, get_peft_model
|
18 |
+
from trl import SFTTrainer
|
19 |
+
from accelerate import infer_auto_device_map,init_empty_weights
|
20 |
+
import wandb
|
21 |
+
from datasets import concatenate_datasets
|
22 |
+
import numpy as np
|
23 |
+
# sys.path.append('../../../')
|
24 |
+
# sys.path.append('../../')
|
25 |
+
# sys.path.append('../')
|
26 |
+
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
|
27 |
+
# os.environ['CUDA_VISIBLE_DEVICES'] = "5,6,7"
|
28 |
+
# device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
|
29 |
+
sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
|
30 |
+
# import utils.util as util
|
31 |
+
# Load dataset from the hub
|
32 |
+
# dataset = load_dataset("samsum")
|
33 |
+
np.random.seed(42)
|
34 |
+
output_dir = "model_weight/"
|
35 |
+
datapath='NL2TL-dataset/collect2'
|
36 |
+
exp_name="ascii"
|
37 |
+
explainer_files=['LTLexplain_0.json','LTLexplain_1.json','LTLexplain_2.json','LTLexplain_3.json']
|
38 |
+
explainer_dic={}
|
39 |
+
for path in explainer_files:
|
40 |
+
with open(os.path.join(datapath,path)) as f:
|
41 |
+
LTLlist=json.load(f)
|
42 |
+
for key in LTLlist.keys():
|
43 |
+
if isinstance(LTLlist[key],dict):
|
44 |
+
if not (key in explainer_dic):
|
45 |
+
explainer_dic[key]=[]
|
46 |
+
explainer_dic[key].append(LTLlist[key]['translate'])
|
47 |
+
sp=LTLlist[key]['explain'].split("means that")
|
48 |
+
if len(sp)>1:
|
49 |
+
explainer_dic[key].append(sp[1])
|
50 |
+
|
51 |
+
base_model_name = "meta-llama/Llama-2-13b-hf"
|
52 |
+
bnb_config = BitsAndBytesConfig(
|
53 |
+
load_in_4bit = True,
|
54 |
+
bnb_4bit_use_double_quant = False,
|
55 |
+
bnb_4bit_quant_type = 'nf4',
|
56 |
+
bnb_4bit_compute_dtype = getattr(torch, "float16")
|
57 |
+
)
|
58 |
+
|
59 |
+
import os
|
60 |
+
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
|
61 |
+
os.environ['CUDA_VISIBLE_DEVICES']='0'
|
62 |
+
device_map="auto"
|
63 |
+
# torch.cuda.set_device(7)
|
64 |
+
# device_map={'':torch.cuda.current_device()}
|
65 |
+
# device_map = {'':'cuda:7'}
|
66 |
+
# model_dir为模型的路径或名称
|
67 |
+
# config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
|
68 |
+
# with init_empty_weights():
|
69 |
+
# base_model = AutoModelForCausalLM.from_pretrained(
|
70 |
+
# base_model_name,
|
71 |
+
# from_tf=bool(".ckpt" in base_model_name),
|
72 |
+
# quantization_config=bnb_config,
|
73 |
+
# device_map=device_map,
|
74 |
+
# trust_remote_code=True,
|
75 |
+
# use_auth_token=True
|
76 |
+
# )
|
77 |
+
|
78 |
+
# map_list = {5:"15GB", 6:"15GB",7:"15GB"} # 对应不同卡号限制的内存量
|
79 |
+
# map_list = {7:"15GB",} # 对应不同卡号限制的内存量
|
80 |
+
# no_split_modules = base_model._no_split_modules
|
81 |
+
# device_map = infer_auto_device_map(base_model, max_memory=map_list, no_split_module_classes=no_split_modules)
|
82 |
+
|
83 |
+
|
84 |
+
dataset = load_dataset("json", data_files={"train":os.path.join(datapath,"ltl_eng_train_mid_ascii_gptAuged.jsonl"),"test":os.path.join(datapath,"ltl_eng_test_mid_ascii_gptAuged.jsonl")})
|
85 |
+
print(dataset)
|
86 |
+
|
87 |
+
|
88 |
+
|
89 |
+
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
|
90 |
+
# tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
|
91 |
+
# NOTE no one says whether the add eos token need to be added, but if we do not add this, the generate will continue until reach the max_new_tokens
|
92 |
+
# when add add_eos_token, it always failed
|
93 |
+
# if use this it will generate somthing other
|
94 |
+
tokenizer.pad_token = tokenizer.eos_token
|
95 |
+
tokenizer.padding_side = 'right'
|
96 |
+
print(tokenizer.eos_token_id)
|
97 |
+
# 2
|
98 |
+
print(tokenizer.bos_token_id)
|
99 |
+
# 1
|
100 |
+
# print(tokenizer._convert_token_to_id(tokenizer.bos_token))
|
101 |
+
|
102 |
+
def preprocess_function(sample,padding="max_length"):
|
103 |
+
# add prefix to the input for t5
|
104 |
+
# print(sample[0])
|
105 |
+
inputs=[
|
106 |
+
f"""### Instruction:
|
107 |
+
translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'
|
108 |
+
|
109 |
+
### Natural Language Task:
|
110 |
+
{sample['natural'][i].strip()}
|
111 |
+
|
112 |
+
### Logic Translation:
|
113 |
+
{explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))]}
|
114 |
+
|
115 |
+
### linear temproal logic:
|
116 |
+
{sample['raw_ltl'][i].strip()}
|
117 |
+
</s>""".lower()
|
118 |
+
# NOTE it seems the eos is needed, the bos is not needed(the bos will be automatically added)
|
119 |
+
for i in (range(len(sample['natural'])))]
|
120 |
+
# inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
|
121 |
+
|
122 |
+
sample["complete_text"] = inputs
|
123 |
+
return sample
|
124 |
+
# method1
|
125 |
+
# tokenized_dataset = dataset.map(preprocess_function, batched=True)
|
126 |
+
# method2
|
127 |
+
def preprocess_function2(sample,padding="max_length"):
|
128 |
+
# add prefix to the input for t5
|
129 |
+
# print(sample[0])
|
130 |
+
inputs=[
|
131 |
+
tokenizer.apply_chat_template(
|
132 |
+
[
|
133 |
+
{"role": "system", "content": "translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'"},
|
134 |
+
{"role": "user", "content": "Natural Language Task: {}".format(sample['natural'][i].strip())},
|
135 |
+
{"role": "assistant", "content": "Logic Translation is {}, linear temproal logic is {}".format(
|
136 |
+
explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))],
|
137 |
+
sample['raw_ltl'][i].strip()
|
138 |
+
)
|
139 |
+
}
|
140 |
+
],tokenize=False, add_generation_prompt=False)
|
141 |
+
# NOTE it seems the eos is needed, the bos is not needed(the bos will be automatically added)
|
142 |
+
for i in (range(len(sample['natural'])))]
|
143 |
+
# inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
|
144 |
+
|
145 |
+
sample["complete_text"] = inputs
|
146 |
+
return sample
|
147 |
+
tokenized_dataset = dataset.map(preprocess_function2, batched=True)
|
148 |
+
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
|
149 |
+
|
150 |
+
# save datasets to disk for later easy loading
|
151 |
+
# tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
|
152 |
+
# tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
|
153 |
+
|
154 |
+
class PeftSavingCallback(TrainerCallback):
|
155 |
+
def on_save(self, args, state, control, **kwargs):
|
156 |
+
checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
|
157 |
+
kwargs["model"].save_pretrained(checkpoint_path)
|
158 |
+
|
159 |
+
if "pytorch_model.bin" in os.listdir(checkpoint_path):
|
160 |
+
os.remove(os.path.join(checkpoint_path, "pytorch_model.bin"))
|
161 |
+
callbacks = [PeftSavingCallback]
|
162 |
+
|
163 |
+
peft_config = LoraConfig(
|
164 |
+
lora_alpha=16,
|
165 |
+
lora_dropout=0.05,
|
166 |
+
r=128,
|
167 |
+
bias="none",
|
168 |
+
task_type="CAUSAL_LM",
|
169 |
+
target_modules=["q_proj", "v_proj"]
|
170 |
+
)
|
171 |
+
|
172 |
+
|
173 |
+
training_arguments = TrainingArguments(
|
174 |
+
output_dir=output_dir,
|
175 |
+
logging_dir = os.path.join(output_dir,"logs"),
|
176 |
+
per_device_train_batch_size=32,
|
177 |
+
num_train_epochs=3,
|
178 |
+
gradient_accumulation_steps=1,
|
179 |
+
optim="paged_adamw_32bit",
|
180 |
+
save_strategy='epoch',
|
181 |
+
logging_steps=25,
|
182 |
+
learning_rate=2e-4,
|
183 |
+
weight_decay=0.001,
|
184 |
+
fp16=True,
|
185 |
+
bf16=False,
|
186 |
+
max_grad_norm=0.3,
|
187 |
+
max_steps=-1,
|
188 |
+
warmup_ratio = 0.05,
|
189 |
+
group_by_length=True,
|
190 |
+
lr_scheduler_type="cosine",
|
191 |
+
report_to="wandb",
|
192 |
+
evaluation_strategy="epoch",
|
193 |
+
do_eval=True,
|
194 |
+
run_name = base_model_name+exp_name,
|
195 |
+
disable_tqdm=False
|
196 |
+
)
|
197 |
+
import os
|
198 |
+
output_dir = os.path.join(output_dir, "llama2_13b"+exp_name+'aug1')
|
199 |
+
|
200 |
+
base_model = AutoModelForCausalLM.from_pretrained(
|
201 |
+
base_model_name,
|
202 |
+
from_tf=bool(".ckpt" in base_model_name),
|
203 |
+
quantization_config=bnb_config,
|
204 |
+
device_map=device_map,
|
205 |
+
trust_remote_code=True,
|
206 |
+
use_auth_token=True
|
207 |
+
)
|
208 |
+
base_model.config.use_cache = False
|
209 |
+
|
210 |
+
# More info: https://github.com/huggingface/transformers/pull/24906
|
211 |
+
base_model.config.pretraining_tp = 1
|
212 |
+
|
213 |
+
base_model.gradient_checkpointing_enable()
|
214 |
+
base_model = prepare_model_for_kbit_training(base_model)
|
215 |
+
base_model = get_peft_model(base_model, peft_config)
|
216 |
+
|
217 |
+
trainer = SFTTrainer(
|
218 |
+
model=base_model,
|
219 |
+
train_dataset=tokenized_dataset['train'],
|
220 |
+
eval_dataset=tokenized_dataset['test'],
|
221 |
+
peft_config=peft_config,
|
222 |
+
dataset_text_field="complete_text",
|
223 |
+
max_seq_length=512,
|
224 |
+
tokenizer=tokenizer,
|
225 |
+
args=training_arguments,
|
226 |
+
callbacks=callbacks,
|
227 |
+
packing=False,
|
228 |
+
)
|
229 |
+
# wandb.login()
|
230 |
+
# trainer.train()
|
231 |
+
# trainer.model.save_pretrained(output_dir)
|
232 |
+
# # trainer.model.save_pretrained(output_dir)
|
233 |
+
# tokenizer.save_pretrained(output_dir)
|
234 |
+
|
235 |
+
# wandb.finish()
|
236 |
+
|
237 |
+
# check
|
238 |
+
print('model dir',output_dir)
|
239 |
+
from peft import AutoPeftModelForCausalLM
|
240 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
241 |
+
model = AutoPeftModelForCausalLM.from_pretrained(output_dir,
|
242 |
+
from_tf=bool(".ckpt" in base_model_name),
|
243 |
+
quantization_config=bnb_config,
|
244 |
+
device_map=device_map,
|
245 |
+
trust_remote_code=True,
|
246 |
+
use_auth_token=True
|
247 |
+
)
|
248 |
+
tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)
|
249 |
+
print(tokenizer.default_chat_template)
|
250 |
+
def evaluate_model(input_text):
|
251 |
+
input_text =f"""### Instruction:
|
252 |
+
translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()' ### Natural Language Task:
|
253 |
+
{input_text}""".lower()
|
254 |
+
inputs = tokenizer(input_text, return_tensors="pt").to(device)
|
255 |
+
print(inputs)
|
256 |
+
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
|
257 |
+
|
258 |
+
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
259 |
+
|
260 |
+
def evaluate_model2(input_text):
|
261 |
+
messages=[
|
262 |
+
{"role": "system", "content": "translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'"},
|
263 |
+
{"role": "user", "content": "Natural Language Task: {}".format(input_text)},
|
264 |
+
]
|
265 |
+
|
266 |
+
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
|
267 |
+
outputs = model.generate(encodeds, max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
|
268 |
+
# input_text =f"""### Instruction:
|
269 |
+
# translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()' ### Natural Language Task:
|
270 |
+
# {input_text}""".lower()
|
271 |
+
# inputs = tokenizer(input_text, return_tensors="pt").to(device)
|
272 |
+
# print(inputs)
|
273 |
+
# outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
|
274 |
+
|
275 |
+
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
276 |
+
# if __name__=='__main__':
|
277 |
+
import evaluate
|
278 |
+
import numpy as np
|
279 |
+
from datasets import load_from_disk
|
280 |
+
from tqdm import tqdm
|
281 |
+
|
282 |
+
# Metric
|
283 |
+
metric = evaluate.load("rouge")
|
284 |
+
|
285 |
+
|
286 |
+
# load test dataset from distk
|
287 |
+
# test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
|
288 |
+
|
289 |
+
# run predictions
|
290 |
+
# this can take ~45 minutes
|
291 |
+
import re
|
292 |
+
pattern=re.compile("linear temproal logic:\n([\S ]*)\n")
|
293 |
+
predictions, references,input_sentence,output_sentence=[], [] , [], []
|
294 |
+
for idx in range(len(tokenized_dataset['test']['natural'])):
|
295 |
+
# print(sample)
|
296 |
+
nl=tokenized_dataset['test']['natural'][idx]
|
297 |
+
p = evaluate_model2(nl)
|
298 |
+
# print(p,l)
|
299 |
+
input_sentence.append(nl)
|
300 |
+
|
301 |
+
transLTL=pattern.findall(p)
|
302 |
+
print(p)
|
303 |
+
predictions.append(transLTL[0].strip())
|
304 |
+
output_sentence.append(p)
|
305 |
+
input_sentence.append(p)
|
306 |
+
references.append(tokenized_dataset['test']['raw_ltl'][idx].strip())
|
307 |
+
print(input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
|
308 |
+
|
309 |
+
# compute metric
|
310 |
+
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
|
311 |
+
|
312 |
+
# print results
|
313 |
+
print(f"Rouge1: {rogue['rouge1']* 100:2f}%")
|
314 |
+
print(f"rouge2: {rogue['rouge2']* 100:2f}%")
|
315 |
+
print(f"rougeL: {rogue['rougeL']* 100:2f}%")
|
316 |
+
print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
|
317 |
+
eval_output=np.array([input_sentence,predictions,references]).T
|
318 |
+
import pandas as pd
|
319 |
+
eval_output=pd.DataFrame(eval_output)
|
320 |
+
pd.DataFrame.to_csv(eval_output,output_dir+'/output')
|
321 |
+
|
322 |
+
exit()
|
323 |
+
messages = [
|
324 |
+
{"role": "user", "content": "What is your favourite condiment?"},
|
325 |
+
{"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
|
326 |
+
{"role": "user", "content": "Do you have mayonnaise recipes?"}
|
327 |
+
]
|
328 |
+
|
329 |
+
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
|
330 |
+
|
331 |
+
model_inputs = encodeds.to(device)
|
332 |
+
model.to(device)
|
333 |
+
|
334 |
+
generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
|
335 |
+
decoded = tokenizer.batch_decode(generated_ids)
|
336 |
+
print(decoded[0])
|
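The evaluation loop above recovers the formula from the model's free-form answer with a regular expression and then scores it with ROUGE. Below is a minimal, model-free sketch of that post-processing; the marker string keeps the original "linear temproal logic:" spelling because it has to match the fine-tuning prompt, and the sample generation/reference pair is made up purely for illustration.

import re
import evaluate  # Hugging Face `evaluate` package, same metric loader as above

# the fine-tuned model ends its answer with "linear temproal logic:\n<formula>"
pattern = re.compile(r"linear temproal logic:\n([\S ]*)\n")

def extract_formula(generated: str) -> str:
    # append a newline so the capture also works when the formula is the last line
    matches = pattern.findall(generated + "\n")
    return matches[-1].strip() if matches else ""

generation = "### logic translation:\n...\n### linear temproal logic:\nF ( P01 ) & G ( ! P02 )"
reference = "F ( P01 ) & G ( ! P02 )"

rouge = evaluate.load("rouge")
scores = rouge.compute(predictions=[extract_formula(generation)], references=[reference], use_stemmer=True)
print({name: f"{value * 100:.2f}%" for name, value in scores.items()})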
finetune/Llama2_13b/llama_lora_test.py
ADDED
@@ -0,0 +1,185 @@
1 |
+
# from huggingface_hub import login
|
2 |
+
# login()
|
3 |
+
import sys,os
|
4 |
+
from datasets import load_dataset
|
5 |
+
import torch
|
6 |
+
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
|
7 |
+
# from peft import LoraConfig
|
8 |
+
# from trl import SFTTrainer
|
9 |
+
# from accelerate import infer_auto_device_map,init_empty_weights
|
10 |
+
|
11 |
+
# sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
|
12 |
+
from NL2HLTLtaskPlanner.utils import Task2Preplacer
|
13 |
+
from NL2HLTLtaskPlanner.utils import LTLChecker
|
14 |
+
import re
|
15 |
+
from datasets import concatenate_datasets
|
16 |
+
import numpy as np
|
17 |
+
from peft import AutoPeftModelForCausalLM
|
18 |
+
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
|
19 |
+
# os.environ['CUDA_VISIBLE_DEVICES']='3'
|
20 |
+
|
21 |
+
|
22 |
+
|
23 |
+
class Llama_NL2TL_translator():
|
24 |
+
def __init__(self,
|
25 |
+
output_dir = "/home/icl-mill19/xsj/model_weight",
|
26 |
+
tuned_model_name="llama2_13b__mid_asciiaug1",
|
27 |
+
# CUDA_device='0',
|
28 |
+
quat=True) -> None:
|
29 |
+
# os.environ['CUDA_VISIBLE_DEVICES']=CUDA_device
|
30 |
+
self.device_map="auto"
|
31 |
+
self.base_model_name = "meta-llama/Llama-2-13b-hf"
|
32 |
+
self.output_dir = os.path.join(output_dir, tuned_model_name)
|
33 |
+
# check
|
34 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
35 |
+
# AutoPeftModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf")
|
36 |
+
|
37 |
+
|
38 |
+
# quantconfig = BitsAndBytesConfig(
|
39 |
+
# load_in_8bit=True,
|
40 |
+
# bnb_8bit_quant_type="nf4",
|
41 |
+
# bnb_8bit_use_double_quant=True,
|
42 |
+
# bnb_8bit_compute_dtype=torch.bfloat16,
|
43 |
+
# )
|
44 |
+
if quat==False:
|
45 |
+
self.model = AutoPeftModelForCausalLM.from_pretrained(self.output_dir, device_map=self.device_map, torch_dtype=torch.bfloat16)
|
46 |
+
# on the ICL 'super man' workstation the model can be loaded without quantization
|
47 |
+
else:
|
48 |
+
self.model = AutoPeftModelForCausalLM.from_pretrained(self.output_dir,device_map=self.device_map, torch_dtype=torch.float16,
|
49 |
+
load_in_8bit=True)
|
50 |
+
# quantization_config=quantconfig)
|
51 |
+
|
52 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self.output_dir, trust_remote_code=True)
|
53 |
+
# tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
|
54 |
+
# NOTE: it is unclear whether add_eos_token is required; without it, generation keeps going until max_new_tokens is reached,
|
55 |
+
# in prediction mode, do not pass add_eos_token=True: the tokenizer would append </s> to the input itself and the output becomes irregular
|
56 |
+
# with add_eos_token enabled, generation consistently failed
|
57 |
+
self.tokenizer.pad_token = self.tokenizer.eos_token
|
58 |
+
self.tokenizer.padding_side = 'right'
|
59 |
+
print(self.tokenizer.eos_token_id)
|
60 |
+
# 2
|
61 |
+
print(self.tokenizer.bos_token_id)
|
62 |
+
# 1
|
63 |
+
# print(tokenizer._convert_token_to_id(tokenizer.bos_token))
|
64 |
+
|
65 |
+
print("NL2TL model loaded")
|
66 |
+
self.pattern=re.compile("\[Formal LTL\]:\n([\S ]*)\n")
|
67 |
+
self.replace=Task2Preplacer()
|
68 |
+
self.ltlChecker=LTLChecker()
|
69 |
+
pass
|
70 |
+
|
71 |
+
print('NL2TL llama translate test:')
|
72 |
+
self.translate("Task_1.1 must be done, and Task_1.2 should be finished before Task_1.1")
|
73 |
+
def evaluate_model(self,input_text):
|
74 |
+
input_text =f"""### [Instruction]:
|
75 |
+
translate natural description in to LTL, first translate into a logical way, and then translate into LTL,
|
76 |
+
using 'A' for 'And','O' for 'Or', 'I' for 'Imply','N' for 'Not','E' for 'Equally','F' for 'Finally','G' for 'Globally','U' for 'Until','X' for 'Next', pay specific attention to brackets '()'
|
77 |
+
|
78 |
+
### [Natural Language Task]:
|
79 |
+
{input_text}
|
80 |
+
### [Temporal Logic Translation]:
|
81 |
+
"""
|
82 |
+
# "### [instruction]: translate natural description in to LTL: \n\n ### [natural language]:" + input_text+'### [LTL]:'
|
83 |
+
inputs = self.tokenizer(input_text, return_tensors="pt").to(self.device)
|
84 |
+
outputs = self.model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=128, pad_token_id=self.tokenizer.eos_token_id)
|
85 |
+
|
86 |
+
return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
87 |
+
|
88 |
+
def translate(self,input_prompt:str=""):
|
89 |
+
print(input_prompt)
|
90 |
+
input_prompt=self.replace.reTask2P(input_prompt)
|
91 |
+
# print(predicter( replace.reTask2P(input_prompt)))
|
92 |
+
# print(input_prompt)
|
93 |
+
|
94 |
+
p=self.evaluate_model(input_prompt)
|
95 |
+
# print(p)
|
96 |
+
transLTL=self.pattern.findall(p)[0]
|
97 |
+
print(transLTL)
|
98 |
+
while(not (self.ltlChecker.AP_CorrCheck(input_prompt,transLTL) and self.ltlChecker.brackets_Check(transLTL))):
|
99 |
+
p=self.evaluate_model(input_prompt)
|
100 |
+
# print(p)
|
101 |
+
transLTL=self.pattern.findall(p)[0]
|
102 |
+
print(transLTL)
|
103 |
+
|
104 |
+
return self.replace.reP2Task(transLTL)
|
105 |
+
|
106 |
+
|
107 |
+
if __name__=="__main__":
|
108 |
+
translater=Llama_NL2TL_translator()
|
109 |
+
# test_prompts=[
|
110 |
+
# "Task_1.1.1 must precede Task_1.1.2, which in turn should precede Task_1.1.3, ",
|
111 |
+
# "Task_1.1 must be completed before Task_1.2 starts, and Task_1.2 must be completed before Task_1.3 starts." ,
|
112 |
+
# "Task_1.1 can be executed independently, after which Task_1.2 can be executed.",
|
113 |
+
# "Task_1.2.4 must be completed first, followed by Task_1.2.2, then Task_1.2.3, and finally Task_1.2.1.",
|
114 |
+
# "Task_1.2.4 is always executed first, followed by Task_1.2.3, then Task_1.2.2, and finally Task_1.2.1.",
|
115 |
+
# "Task_1.2.1 and Task_1.2.2 can be executed independently, and both should eventually be completed.",
|
116 |
+
# ]
|
117 |
+
# for ret in test_prompts:
|
118 |
+
# print(translater.translate(ret))
|
119 |
+
# print('\n','-'*20,'\n')
|
120 |
+
|
121 |
+
|
122 |
+
import evaluate
|
123 |
+
import numpy as np
|
124 |
+
# from datasets import load_from_disk
|
125 |
+
from tqdm import tqdm
|
126 |
+
|
127 |
+
# Metric
|
128 |
+
metric = evaluate.load("rouge")
|
129 |
+
tokenized_dataset = load_dataset("json", data_files={"train":"LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl","test":"LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl"})
|
130 |
+
print(tokenized_dataset)
|
131 |
+
# run predictions
|
132 |
+
# this can take ~45 minutes
|
133 |
+
import re
|
134 |
+
pattern=re.compile("\[Formal LTL\]:\n([\S ]*)\n")
|
135 |
+
predictions, references,input_sentence,output_sentence=[], [] , [], []
|
136 |
+
# with open()
|
137 |
+
for idx in range(len(tokenized_dataset['test']['natural'])):
|
138 |
+
# print(sample)
|
139 |
+
nl=tokenized_dataset['test']['natural'][idx]
|
140 |
+
p = translater.evaluate_model(nl)
|
141 |
+
# print(p,l)
|
142 |
+
input_sentence.append(nl)
|
143 |
+
|
144 |
+
transLTL=pattern.findall(p)
|
145 |
+
# print(p)
|
146 |
+
predictions.append(transLTL[0])
|
147 |
+
output_sentence.append(p)
|
148 |
+
# input_sentence.append(nl)
|
149 |
+
references.append(tokenized_dataset['test']['raw_ltl'][idx])
|
150 |
+
print(idx,'\n',input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
|
151 |
+
|
152 |
+
# compute metric
|
153 |
+
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
|
154 |
+
|
155 |
+
# print results
|
156 |
+
print(f"Rouge1: {rogue['rouge1']* 100:2f}%")
|
157 |
+
print(f"rouge2: {rogue['rouge2']* 100:2f}%")
|
158 |
+
print(f"rougeL: {rogue['rougeL']* 100:2f}%")
|
159 |
+
print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
|
160 |
+
eval_output=np.array([input_sentence,predictions,references]).T
|
161 |
+
import pandas as pd
|
162 |
+
eval_output=pd.DataFrame(eval_output)
|
163 |
+
pd.DataFrame.to_csv(eval_output,"finetuned_model/llama2_13b__mid_asciiaug1"+'/output')
|
164 |
+
# Rogue1: 98.363321%
|
165 |
+
# rouge2: 95.987820%
|
166 |
+
# rougeL: 97.384820%
|
167 |
+
# rougeLsum: 97.382071%
|
168 |
+
exit()
|
169 |
+
flag=True
|
170 |
+
while flag:
|
171 |
+
lines=[""]
|
172 |
+
try:
|
173 |
+
lines.append(input())
|
174 |
+
while True:
|
175 |
+
lines.append(input())
|
176 |
+
except:
|
177 |
+
pass
|
178 |
+
ret ="".join(lines)
|
179 |
+
print(ret)
|
180 |
+
if ret=="":
|
181 |
+
flag=False
|
182 |
+
|
183 |
+
print(translater.translate(ret))
|
184 |
+
|
185 |
+
|
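translate() keeps regenerating until the output passes two sanity checks from the repo's LTLChecker: the propositions in the formula must match the ones in the sentence, and the brackets must balance. As written, that while loop never gives up; the sketch below illustrates the same pattern with a bounded number of retries and a stand-in bracket check (LTLChecker remains the authoritative implementation, and `generate`/`check` here are placeholders for the model call and AP_CorrCheck).

def brackets_balanced(formula: str) -> bool:
    # every ')' must close an earlier '(' and nothing may stay open at the end
    depth = 0
    for ch in formula:
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
            if depth < 0:
                return False
    return depth == 0

def translate_with_retry(generate, check, natural: str, max_tries: int = 5) -> str:
    # generate: callable mapping a sentence to a candidate LTL string
    # check: callable implementing the AP-correspondence test (LTLChecker.AP_CorrCheck in the repo)
    formula = generate(natural)
    for _ in range(max_tries - 1):
        if check(natural, formula) and brackets_balanced(formula):
            break
        formula = generate(natural)
    return formula

Bounding the retries avoids the pathological case where the model keeps producing the same malformed formula and the loop never terminates.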
finetune/Llama2_13b/llama_test.py
ADDED
@@ -0,0 +1,138 @@
1 |
+
# from huggingface_hub import login
|
2 |
+
# login()
|
3 |
+
from datasets import load_dataset
|
4 |
+
from random import randrange
|
5 |
+
import torch
|
6 |
+
import sys,os
|
7 |
+
# sys.path.append('../../../')
|
8 |
+
# sys.path.append('../../')
|
9 |
+
# sys.path.append('../')
|
10 |
+
os.environ['CUDA_VISIBLE_DEVICES'] = "2"
|
11 |
+
# device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
|
12 |
+
from ... import utils as util
|
13 |
+
from accelerate import init_empty_weights,infer_auto_device_map,load_checkpoint_in_model,dispatch_model
|
14 |
+
# Load dataset from the hub
|
15 |
+
# dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
|
16 |
+
output_dir = "finetuned_model/"
|
17 |
+
datapath='LTL_datasets/collect/'
|
18 |
+
exp_name="_mid_ascii"
|
19 |
+
base_model_name = "meta-llama/Llama-2-13b-hf"
|
20 |
+
dataset = load_dataset("json", data_files={"train":datapath+"ltl_eng_train"+exp_name+".jsonl","test":datapath+"ltl_eng_test"+exp_name+".jsonl"})
|
21 |
+
|
22 |
+
print(dataset)
|
23 |
+
# dataset size: 15011
|
24 |
+
|
25 |
+
def format_instruction(sample):
|
26 |
+
return f"""### [Instruction]:
|
27 |
+
translate natural description in to LTL
|
28 |
+
|
29 |
+
### [Input]:
|
30 |
+
{sample['natural']}
|
31 |
+
|
32 |
+
### [English Response]:
|
33 |
+
{util.reAsciiLTL2EngLTL(sample['raw_ltl'])}
|
34 |
+
### [Formal Response]:
|
35 |
+
{sample['raw_ltl']}
|
36 |
+
"""
|
37 |
+
|
38 |
+
def preprocess_function(sample,padding="max_length"):
|
39 |
+
# add prefix to the input for t5
|
40 |
+
# print(sample[0])
|
41 |
+
inputs=[
|
42 |
+
f"""### [Instruction]:
|
43 |
+
translate natural description in to LTL
|
44 |
+
|
45 |
+
### [Input]:
|
46 |
+
{sample['natural'][i]}
|
47 |
+
|
48 |
+
### [English Response]:
|
49 |
+
{util.reAsciiLTL2EngLTL(sample['raw_ltl'][i])}
|
50 |
+
### [Formal Response]:
|
51 |
+
{sample['raw_ltl'][i]}
|
52 |
+
"""
|
53 |
+
for i in (range(len(sample['natural'])))]
|
54 |
+
# inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
|
55 |
+
|
56 |
+
sample["complete_text"] = inputs
|
57 |
+
return sample
|
58 |
+
|
59 |
+
tokenized_dataset = dataset.map(preprocess_function, batched=True)
|
60 |
+
print(tokenized_dataset)
|
61 |
+
import torch
|
62 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
63 |
+
|
64 |
+
use_flash_attention = False
|
65 |
+
|
66 |
+
# Hugging Face model id
|
67 |
+
model_id = base_model_name
|
68 |
+
# model_id = "meta-llama/Llama-2-7b-hf" # gated
|
69 |
+
|
70 |
+
|
71 |
+
output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'4')
|
72 |
+
|
73 |
+
|
74 |
+
if use_flash_attention:
|
75 |
+
# unpatch flash attention
|
76 |
+
from llama_dp2_patch import unplace_flash_attn_with_attn
|
77 |
+
unplace_flash_attn_with_attn()
|
78 |
+
|
79 |
+
import torch
|
80 |
+
from peft import AutoPeftModelForCausalLM
|
81 |
+
from transformers import AutoTokenizer
|
82 |
+
|
83 |
+
# load base LLM model and tokenizer
|
84 |
+
model = AutoPeftModelForCausalLM.from_pretrained(
|
85 |
+
output_dir,
|
86 |
+
low_cpu_mem_usage=True,
|
87 |
+
torch_dtype=torch.float16,
|
88 |
+
load_in_4bit=True,
|
89 |
+
)
|
90 |
+
tokenizer = AutoTokenizer.from_pretrained(output_dir)
|
91 |
+
|
92 |
+
|
93 |
+
from datasets import load_dataset
|
94 |
+
from random import randrange
|
95 |
+
|
96 |
+
|
97 |
+
# Load dataset from the hub and get a sample
|
98 |
+
# dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
|
99 |
+
# sample = dataset[randrange(len(dataset))]
|
100 |
+
|
101 |
+
# prompt = f"""### Instruction:
|
102 |
+
# Use the Input below to create an instruction, which could have been used to generate the input using an LLM.
|
103 |
+
|
104 |
+
# ### Input:
|
105 |
+
# {sample['response']}
|
106 |
+
|
107 |
+
# ### Response:
|
108 |
+
# """
|
109 |
+
|
110 |
+
# input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
|
111 |
+
# # with torch.inference_mode():
|
112 |
+
# outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.9)
|
113 |
+
|
114 |
+
# print(f"Prompt:\n{sample['response']}\n")
|
115 |
+
# print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
|
116 |
+
# print(f"Ground truth:\n{sample['instruction']}")
|
117 |
+
|
118 |
+
|
119 |
+
|
120 |
+
|
121 |
+
|
122 |
+
from peft import AutoPeftModelForCausalLM
|
123 |
+
|
124 |
+
model = AutoPeftModelForCausalLM.from_pretrained(
|
125 |
+
output_dir,
|
126 |
+
low_cpu_mem_usage=True,
|
127 |
+
)
|
128 |
+
|
129 |
+
# Merge LoRA and base model
|
130 |
+
merged_model = model.merge_and_unload()
|
131 |
+
|
132 |
+
# Save the merged model
|
133 |
+
merged_model.save_pretrained("merged_model",safe_serialization=True)
|
134 |
+
tokenizer.save_pretrained("merged_model")
|
135 |
+
|
136 |
+
# push merged model to the hub
|
137 |
+
# merged_model.push_to_hub("user/repo")
|
138 |
+
# tokenizer.push_to_hub("user/repo")
|
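After merge_and_unload() the LoRA weights are folded into the base weights, so the saved "merged_model" directory behaves like a plain causal LM and no longer needs peft at inference time. A small usage sketch; the prompt mirrors the instruction format above (keeping its original wording) and the directory name is the one passed to save_pretrained().

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "merged_model",            # directory written by merged_model.save_pretrained(...)
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("merged_model")

prompt = "### [Instruction]:\ntranslate natural description in to LTL\n\n### [Input]:\nfirst go to P01 and then go to P20, always avoiding P02\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))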
finetune/MIT_NL2TL/NL2TL.py
ADDED
@@ -0,0 +1,101 @@
1 |
+
# %%
|
2 |
+
from transformers import (AutoModelForSeq2SeqLM,
|
3 |
+
AutoTokenizer,
|
4 |
+
T5Tokenizer)
|
5 |
+
import torch
|
6 |
+
import pandas as pd
|
7 |
+
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
|
8 |
+
from tqdm import tqdm
|
9 |
+
|
10 |
+
#import subprocess
|
11 |
+
import sys
|
12 |
+
import os
|
13 |
+
import argparse
|
14 |
+
# from IPython.core import error
|
15 |
+
import random
|
16 |
+
import numpy as np
|
17 |
+
import nltk
|
18 |
+
import json
|
19 |
+
import csv
|
20 |
+
import utils.util as util
|
21 |
+
|
22 |
+
# run under conda env minigpt4
|
23 |
+
|
24 |
+
class NL2TL():
|
25 |
+
def __init__(self,dirpath='outputdir/') -> None:
|
26 |
+
self.output_dir = dirpath
|
27 |
+
|
28 |
+
# Here you need to link this path in your Google drive to the place preserving your model weights, e.g., checkpoint-62500
|
29 |
+
# You can download it on the github page
|
30 |
+
|
31 |
+
self.model_checkpoint = "t5-base"
|
32 |
+
self.prefix = "Transform the following sentence into Signal Temporal logic: "
|
33 |
+
|
34 |
+
self.max_input_length = 1024
|
35 |
+
self.max_target_length = 128
|
36 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint, model_max_length=self.max_input_length)
|
37 |
+
|
38 |
+
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
39 |
+
self.tl_model = AutoModelForSeq2SeqLM.from_pretrained(self.output_dir+"checkpoint-62500").to(self.device)
|
40 |
+
|
41 |
+
# %%
|
42 |
+
import time
|
43 |
+
self.time_start = time.time()
|
44 |
+
self.inputs = [self.prefix + 'At some point (prop_1), and at some point (prop_2), and always do not (prop_4).']
|
45 |
+
self.inputs = self.tokenizer(self.inputs, max_length=self.max_input_length, truncation=True, return_tensors="pt").to(self.device)
|
46 |
+
self.output = self.tl_model.generate(**self.inputs, num_beams=8, do_sample=True, max_length=self.max_target_length)
|
47 |
+
self.decoded_output = self.tokenizer.batch_decode(self.output, skip_special_tokens=True)[0]
|
48 |
+
print(self.decoded_output)
|
49 |
+
self.time_end = time.time()
|
50 |
+
print('Translation time: ', self.time_end-self.time_start)
|
51 |
+
print('\nNL2TL init\n')
|
52 |
+
self.splitJSONfromTXT=util.splitJSONfromTXT
|
53 |
+
# %%
|
54 |
+
# Here are the example test sentences
|
55 |
+
pass
|
56 |
+
def translate(self,inputNLtxt:str=""):
|
57 |
+
inputNLtxt=inputNLtxt.replace("Task_","prop_")
|
58 |
+
|
59 |
+
sentence=inputNLtxt
|
60 |
+
self.inputs = [self.prefix + sentence]
|
61 |
+
self.inputs = self.tokenizer(self.inputs, max_length=self.max_input_length, truncation=True, return_tensors="pt").to(self.device)
|
62 |
+
self.output = self.tl_model.generate(**self.inputs, num_beams=8, do_sample=True, max_length=self.max_target_length)
|
63 |
+
self.decoded_output = self.tokenizer.batch_decode(self.output, skip_special_tokens=True)[0]
|
64 |
+
print('Input sentence: ', sentence)
|
65 |
+
print('Translated STL: ', self.decoded_output)
|
66 |
+
print('\n')
|
67 |
+
|
68 |
+
self.decoded_output=self.decoded_output.replace('prop_','Task_')
|
69 |
+
return self.decoded_output
|
70 |
+
def waiting(self):
|
71 |
+
retry=True
|
72 |
+
while retry:
|
73 |
+
inputNL=util.GPTinterface("continue next")
|
74 |
+
if inputNL!="q":
|
75 |
+
Json=self.splitJSONfromTXT(inputNL)
|
76 |
+
print(Json)
|
77 |
+
jsonTree=json.loads("{"+Json[-1]+"}")
|
78 |
+
input_NL=jsonTree["LTL_description"].replace("Task_","prop_")
|
79 |
+
output_TL=self.translate(input_NL)
|
80 |
+
output_TL=output_TL.replace('prop_','Task_')
|
81 |
+
print("\n",output_TL,"\n")
|
82 |
+
else:
|
83 |
+
retry =False
|
84 |
+
if __name__=="__main__":
|
85 |
+
# examples=['Stay at (prop_1) for 5 units in the future and stay at (prop_2) for 5 units in the future, and ensure that never (prop_3).',
|
86 |
+
# 'First (prop_1), and then (prop_2), and ensure that never (prop_3).',
|
87 |
+
# 'Start by (prop_1). Then, (prop_2). Lastly, (prop_3).',
|
88 |
+
# 'Guarantee that you (prop_1) and (prop_2)', # Input the natural sentence
|
89 |
+
# '( prop_1 ) and whenever ( prop_2 )',
|
90 |
+
# 'Sooner or later (prop_1)',
|
91 |
+
# 'Repeatedly (prop_1)',
|
92 |
+
# 'At some point, (prop_1).',
|
93 |
+
# 'Do prop_1 but not do prop_2',
|
94 |
+
# 'Do prop_1, do prop_2, do prop_3'] # Input the natural sentence
|
95 |
+
# interface=NL2TL()
|
96 |
+
# for txt in examples:
|
97 |
+
# interface.translate(txt)
|
98 |
+
|
99 |
+
interface=NL2TL()
|
100 |
+
interface.waiting()
|
101 |
+
|
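Stripped of the timing and logging, NL2TL boils down to one prefix-plus-generate round trip per sentence, wrapped in a Task_ to prop_ rename so that the planner's task names match the prop_ vocabulary the MIT checkpoint was trained on. A condensed sketch, with the checkpoint path and prefix assumed from the class above:

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

PREFIX = "Transform the following sentence into Signal Temporal logic: "
CHECKPOINT = "outputdir/checkpoint-62500"   # same weights directory the class points at

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=1024)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT).to(device)

def nl2tl(sentence: str) -> str:
    sentence = sentence.replace("Task_", "prop_")             # the checkpoint only knows prop_ names
    batch = tokenizer([PREFIX + sentence], truncation=True, return_tensors="pt").to(device)
    out = model.generate(**batch, num_beams=8, max_length=128)
    decoded = tokenizer.batch_decode(out, skip_special_tokens=True)[0]
    return decoded.replace("prop_", "Task_")                   # map back for the task planner

print(nl2tl("At some point (Task_1), and at some point (Task_2), and always do not (Task_4)."))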
finetune/T5_XXL/t5_lora_evaluate.py
ADDED
@@ -0,0 +1,95 @@
1 |
+
import torch
|
2 |
+
from peft import PeftModel, PeftConfig
|
3 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
4 |
+
|
5 |
+
# Load peft config for pre-trained checkpoint etc.
|
6 |
+
exp_name="_mid_ascii"
|
7 |
+
peft_model_id="finetuned_model/results"+exp_name+'2'
|
8 |
+
max_target_length=128
|
9 |
+
|
10 |
+
config = PeftConfig.from_pretrained(peft_model_id)
|
11 |
+
|
12 |
+
# load base LLM model and tokenizer
|
13 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
|
14 |
+
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, device_map="auto")
|
15 |
+
|
16 |
+
# Load the Lora model
|
17 |
+
model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
|
18 |
+
model.eval()
|
19 |
+
|
20 |
+
print("Peft model loaded")
|
21 |
+
|
22 |
+
from datasets import load_dataset
|
23 |
+
from random import randrange
|
24 |
+
|
25 |
+
|
26 |
+
# Load dataset from the hub and get a sample
|
27 |
+
datapath='LTL_datasets/collect/'
|
28 |
+
dataset = load_dataset("json", data_files={"train":datapath+"ltl_eng_train"+exp_name+".jsonl","test":datapath+"ltl_eng_test"+exp_name+".jsonl"})
|
29 |
+
print(dataset)
|
30 |
+
sample = dataset['test'][randrange(len(dataset["test"]))]
|
31 |
+
|
32 |
+
input_ids = tokenizer(sample["natural"], return_tensors="pt", truncation=True).input_ids.cuda()
|
33 |
+
# with torch.inference_mode():
|
34 |
+
outputs = model.generate(input_ids=input_ids, max_new_tokens=max_target_length, do_sample=True, top_p=0.9)
|
35 |
+
print(f"input sentence: {sample['natural']}\n{'---'* 20}")
|
36 |
+
|
37 |
+
print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")
|
38 |
+
|
39 |
+
|
40 |
+
import evaluate
|
41 |
+
import numpy as np
|
42 |
+
from datasets import load_from_disk
|
43 |
+
from tqdm import tqdm
|
44 |
+
|
45 |
+
# Metric
|
46 |
+
metric = evaluate.load("rouge")
|
47 |
+
|
48 |
+
def evaluate_peft_model(sample,max_target_length=128):
|
49 |
+
# generate summary
|
50 |
+
outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
|
51 |
+
prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
|
52 |
+
# decode eval sample
|
53 |
+
# Replace -100 in the labels as we can't decode them.
|
54 |
+
labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
|
55 |
+
# print(labels)
|
56 |
+
labels = tokenizer.decode(labels, skip_special_tokens=True)
|
57 |
+
# print(labels)
|
58 |
+
# Some simple post-processing
|
59 |
+
input_sentence=" ".join(tokenizer.batch_decode(sample["input_ids"].detach().cpu().numpy(), skip_special_tokens=True))
|
60 |
+
print("input sentence: {}\n{}".format(input_sentence,'---'* 20))
|
61 |
+
# output_LTL=tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
|
62 |
+
# expect_LTL=labels
|
63 |
+
print(f"pre_LTL:\n{prediction}\nexp_LTL:\n{labels}")
|
64 |
+
return prediction, labels,input_sentence
|
65 |
+
|
66 |
+
# load test dataset from disk
|
67 |
+
test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
|
68 |
+
|
69 |
+
# run predictions
|
70 |
+
# this can take ~45 minutes
|
71 |
+
predictions, references,input_sentence= [] , [], []
|
72 |
+
for sample in tqdm(test_dataset):
|
73 |
+
# print(sample)
|
74 |
+
p,l,nl = evaluate_peft_model(sample)
|
75 |
+
# print(p,l)
|
76 |
+
input_sentence.append(nl)
|
77 |
+
predictions.append(p)
|
78 |
+
references.append(l)
|
79 |
+
|
80 |
+
# compute metric
|
81 |
+
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
|
82 |
+
|
83 |
+
# print results
|
84 |
+
print(f"Rouge1: {rogue['rouge1']* 100:2f}%")
|
85 |
+
print(f"rouge2: {rogue['rouge2']* 100:2f}%")
|
86 |
+
print(f"rougeL: {rogue['rougeL']* 100:2f}%")
|
87 |
+
print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
|
88 |
+
eval_output=np.array([input_sentence,predictions,references]).T
|
89 |
+
import pandas as pd
|
90 |
+
eval_output=pd.DataFrame(eval_output)
|
91 |
+
pd.DataFrame.to_csv(eval_output,peft_model_id+'/output')
|
92 |
+
# Rogue1: 50.386161%
|
93 |
+
# rouge2: 24.842412%
|
94 |
+
# rougeL: 41.370130%
|
95 |
+
# rougeLsum: 41.394230%
|
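evaluate_peft_model has to undo the label masking before it can decode the reference: the -100 values inserted for padding are loss-ignore markers, not valid token ids. That step in isolation, with the tokenizer choice and example text used only for illustration:

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-base")

# emulate what the preprocessing stores: padded labels with pad positions replaced by -100
labels = np.array(tokenizer("F ( P01 )", max_length=12, padding="max_length").input_ids)
labels[labels == tokenizer.pad_token_id] = -100

# restore real pad ids before decoding, exactly as in evaluate_peft_model
decodable = np.where(labels != -100, labels, tokenizer.pad_token_id)
print(tokenizer.decode(decodable, skip_special_tokens=True))   # prints the original formula text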
finetune/T5_XXL/t5_lora_fintune.py
ADDED
@@ -0,0 +1,238 @@
1 |
+
# %%
|
2 |
+
from datasets import load_dataset
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
# Load dataset from the hub
|
6 |
+
# dataset = load_dataset("samsum")
|
7 |
+
# datapath='LTL_datasets/collect/'
|
8 |
+
exp_name="/tf-ltl_eng_test_mid_ascii_gptAuged"
|
9 |
+
# output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'aug1')
|
10 |
+
# dataset = load_dataset("json", data_files={"train":datapath+"ltl_eng_train"+exp_name+".jsonl","test":datapath+"ltl_eng_test"+exp_name+".jsonl"})
|
11 |
+
# print(dataset)
|
12 |
+
dataset = load_dataset("json", data_files={"train":"LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl","test":"LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl"})
|
13 |
+
print(dataset)
|
14 |
+
|
15 |
+
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
|
16 |
+
os.environ['CUDA_VISIBLE_DEVICES']='4,5'
|
17 |
+
|
18 |
+
|
19 |
+
# %%
|
20 |
+
|
21 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
22 |
+
|
23 |
+
model_id="google/flan-t5-xxl"
|
24 |
+
|
25 |
+
# Load tokenizer of FLAN-t5-XL
|
26 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
27 |
+
|
28 |
+
|
29 |
+
# %%
|
30 |
+
from datasets import concatenate_datasets
|
31 |
+
import numpy as np
|
32 |
+
# The maximum total input sequence length after tokenization.
|
33 |
+
# Sequences longer than this will be truncated, sequences shorter will be padded.
|
34 |
+
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["natural"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
|
35 |
+
input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
|
36 |
+
# take the 100th percentile (i.e. the maximum) of the tokenized input lengths
|
37 |
+
max_source_length = int(np.percentile(input_lenghts, 100))
|
38 |
+
print(f"Max source length: {max_source_length}")
|
39 |
+
|
40 |
+
# The maximum total sequence length for target text after tokenization.
|
41 |
+
# Sequences longer than this will be truncated, sequences shorter will be padded."
|
42 |
+
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["raw_ltl"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
|
43 |
+
target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
|
44 |
+
# take the 100th percentile (i.e. the maximum) of the tokenized target lengths
|
45 |
+
max_target_length = int(np.percentile(target_lenghts, 100))
|
46 |
+
print(f"Max target length: {max_target_length}")
|
47 |
+
|
48 |
+
|
49 |
+
# %%
|
50 |
+
def preprocess_function(sample,padding="max_length"):
|
51 |
+
# add prefix to the input for t5
|
52 |
+
inputs = ["Generate LTL: " + item for item in sample["natural"]]
|
53 |
+
|
54 |
+
# tokenize inputs
|
55 |
+
model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
|
56 |
+
|
57 |
+
# Tokenize targets with the `text_target` keyword argument
|
58 |
+
labels = tokenizer(text_target=sample["raw_ltl"], max_length=max_target_length, padding=padding, truncation=True)
|
59 |
+
|
60 |
+
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
61 |
+
# padding in the loss.
|
62 |
+
if padding == "max_length":
|
63 |
+
labels["input_ids"] = [
|
64 |
+
[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
|
65 |
+
]
|
66 |
+
|
67 |
+
model_inputs["labels"] = labels["input_ids"]
|
68 |
+
return model_inputs
|
69 |
+
|
70 |
+
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["natural", "raw_ltl"])
|
71 |
+
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
|
72 |
+
|
73 |
+
# save datasets to disk for later easy loading
|
74 |
+
tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
|
75 |
+
tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
|
76 |
+
|
77 |
+
|
78 |
+
# %%
|
79 |
+
from transformers import AutoModelForSeq2SeqLM
|
80 |
+
from peft import PeftModel, PeftConfig
|
81 |
+
# huggingface hub model id
|
82 |
+
model_id = "philschmid/flan-t5-xxl-sharded-fp16"
|
83 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
|
84 |
+
|
85 |
+
|
86 |
+
|
87 |
+
# peft_model_id="finetuned_model/results"+"_mid_ascii"
|
88 |
+
# config = PeftConfig.from_pretrained(peft_model_id)
|
89 |
+
# # load base LLM model and tokenizer
|
90 |
+
# model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
|
91 |
+
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, device_map="auto")
|
92 |
+
|
93 |
+
# # Load the Lora model
|
94 |
+
# model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
|
95 |
+
# # load model from the hub
|
96 |
+
|
97 |
+
print(model)
|
98 |
+
# exit()
|
99 |
+
|
100 |
+
# %%
|
101 |
+
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
|
102 |
+
|
103 |
+
# Define LoRA Config
|
104 |
+
lora_config = LoraConfig(
|
105 |
+
r=16,
|
106 |
+
lora_alpha=32,
|
107 |
+
target_modules=["q", "v"],
|
108 |
+
lora_dropout=0.05,
|
109 |
+
bias="none",
|
110 |
+
task_type=TaskType.SEQ_2_SEQ_LM
|
111 |
+
)
|
112 |
+
# prepare int-8 model for training
|
113 |
+
model = prepare_model_for_int8_training(model)
|
114 |
+
|
115 |
+
# add LoRA adaptor
|
116 |
+
model = get_peft_model(model, lora_config)
|
117 |
+
model.print_trainable_parameters()
|
118 |
+
|
119 |
+
# trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817
|
120 |
+
|
121 |
+
|
122 |
+
# %%
|
123 |
+
from transformers import DataCollatorForSeq2Seq
|
124 |
+
|
125 |
+
# we want to ignore tokenizer pad token in the loss
|
126 |
+
label_pad_token_id = -100
|
127 |
+
# Data collator
|
128 |
+
data_collator = DataCollatorForSeq2Seq(
|
129 |
+
tokenizer,
|
130 |
+
model=model,
|
131 |
+
label_pad_token_id=label_pad_token_id,
|
132 |
+
pad_to_multiple_of=8
|
133 |
+
)
|
134 |
+
|
135 |
+
|
136 |
+
# %%
|
137 |
+
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
|
138 |
+
|
139 |
+
output_dir="lora-flan-t5-xxl"
|
140 |
+
|
141 |
+
# Define training args
|
142 |
+
training_args = Seq2SeqTrainingArguments(
|
143 |
+
output_dir=output_dir,
|
144 |
+
auto_find_batch_size=True,
|
145 |
+
learning_rate=1e-3, # higher learning rate
|
146 |
+
num_train_epochs=5,
|
147 |
+
logging_dir=f"{output_dir}/logs",
|
148 |
+
logging_strategy="steps",
|
149 |
+
logging_steps=500,
|
150 |
+
save_strategy="no",
|
151 |
+
report_to="tensorboard",
|
152 |
+
)
|
153 |
+
|
154 |
+
# Create Trainer instance
|
155 |
+
trainer = Seq2SeqTrainer(
|
156 |
+
model=model,
|
157 |
+
args=training_args,
|
158 |
+
data_collator=data_collator,
|
159 |
+
train_dataset=tokenized_dataset["train"],
|
160 |
+
)
|
161 |
+
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
|
162 |
+
|
163 |
+
|
164 |
+
# %%
|
165 |
+
# train model
|
166 |
+
trainer.train()
|
167 |
+
|
168 |
+
|
169 |
+
# %%
|
170 |
+
# Save our LoRA model & tokenizer results
|
171 |
+
peft_model_id="finetuned_model/"+exp_name
|
172 |
+
trainer.model.save_pretrained(peft_model_id)
|
173 |
+
tokenizer.save_pretrained(peft_model_id)
|
174 |
+
# if you want to save the base model to call
|
175 |
+
# trainer.model.base_model.save_pretrained(peft_model_id)
|
176 |
+
|
177 |
+
|
178 |
+
|
179 |
+
|
180 |
+
import evaluate
|
181 |
+
import numpy as np
|
182 |
+
from datasets import load_from_disk
|
183 |
+
from tqdm import tqdm
|
184 |
+
|
185 |
+
# Metric
|
186 |
+
metric = evaluate.load("rouge")
|
187 |
+
|
188 |
+
def evaluate_peft_model(sample,max_target_length=128):
|
189 |
+
# generate summary
|
190 |
+
outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
|
191 |
+
prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
|
192 |
+
# decode eval sample
|
193 |
+
# Replace -100 in the labels as we can't decode them.
|
194 |
+
labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
|
195 |
+
# print(labels)
|
196 |
+
labels = tokenizer.decode(labels, skip_special_tokens=True)
|
197 |
+
# print(labels)
|
198 |
+
# Some simple post-processing
|
199 |
+
input_sentence=" ".join(tokenizer.batch_decode(sample["input_ids"].detach().cpu().numpy(), skip_special_tokens=True))
|
200 |
+
# print("input sentence: {}\n{}".format(input_sentence,'---'* 20))
|
201 |
+
# output_LTL=tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
|
202 |
+
# expect_LTL=labels
|
203 |
+
# print(f"pre_LTL:\n{prediction}\nexp_LTL:\n{labels}")
|
204 |
+
return prediction, labels,input_sentence
|
205 |
+
|
206 |
+
# load test dataset from disk
|
207 |
+
test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
|
208 |
+
|
209 |
+
# run predictions
|
210 |
+
# this can take ~45 minutes
|
211 |
+
predictions, references,input_sentence= [] , [], []
|
212 |
+
idx=0
|
213 |
+
for sample in tqdm(test_dataset):
|
214 |
+
# print(sample)
|
215 |
+
p,l,nl = evaluate_peft_model(sample)
|
216 |
+
# print(p,l)
|
217 |
+
input_sentence.append(nl)
|
218 |
+
predictions.append(p)
|
219 |
+
references.append(l)
|
220 |
+
idx+=1
|
221 |
+
print(idx,'\n',input_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
|
222 |
+
|
223 |
+
# compute metric
|
224 |
+
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
|
225 |
+
|
226 |
+
# print results
|
227 |
+
print(f"Rouge1: {rogue['rouge1']* 100:2f}%")
|
228 |
+
print(f"rouge2: {rogue['rouge2']* 100:2f}%")
|
229 |
+
print(f"rougeL: {rogue['rougeL']* 100:2f}%")
|
230 |
+
print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
|
231 |
+
eval_output=np.array([input_sentence,predictions,references]).T
|
232 |
+
import pandas as pd
|
233 |
+
eval_output=pd.DataFrame(eval_output)
|
234 |
+
pd.DataFrame.to_csv(eval_output,peft_model_id+'/output')
|
235 |
+
# Rogue1: 98.292692%
|
236 |
+
# rouge2: 95.766211%
|
237 |
+
# rougeL: 97.086188%
|
238 |
+
# rougeLsum: 97.084262%
|
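The trainable-parameter count printed by print_trainable_parameters() above (18,874,368 of ~11.2B, about 0.17%) can be sanity-checked from the LoRA shapes alone: each adapted projection W of size d x d gains A (r x d) and B (d x r). Assuming the published FLAN-T5-XXL dimensions (d_model 4096, 24 encoder and 24 decoder blocks, with q and v adapted in self-attention and, for the decoder, also in cross-attention), the arithmetic reproduces the printed number:

d_model = 4096                       # FLAN-T5-XXL hidden size
r = 16                               # LoRA rank from the config above
adapted = 24 * 2 + 24 * 4            # encoder q/v + decoder self- and cross-attention q/v = 144 matrices
lora_params = adapted * 2 * r * d_model
print(lora_params)                                    # 18874368
print(f"{lora_params / 11_154_206_720 * 100:.4f}%")   # ~0.1692%, matching the log above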
finetune/T5_XXL/t5_realtime_evaluate.py
ADDED
@@ -0,0 +1,69 @@
1 |
+
import torch
|
2 |
+
from peft import PeftModel, PeftConfig
|
3 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
4 |
+
import sys
|
5 |
+
# sys.path.append("..")
|
6 |
+
# sys.path.append("../../")
|
7 |
+
from ... import utils as util
|
8 |
+
# Load peft config for pre-trained checkpoint etc.
|
9 |
+
|
10 |
+
class T5XXL_NL2TL_translator():
|
11 |
+
def __init__(self) -> None:
|
12 |
+
# exp_name="_mid_ascii"
|
13 |
+
peft_model_id="model_weight/tf-ltl_eng_test_mid_ascii_gptAuged"
|
14 |
+
self.max_target_length=128
|
15 |
+
|
16 |
+
self.config = PeftConfig.from_pretrained(peft_model_id)
|
17 |
+
|
18 |
+
# load base LLM model and tokenizer
|
19 |
+
self.model = AutoModelForSeq2SeqLM.from_pretrained(self.config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
|
20 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self.config.base_model_name_or_path, device_map="auto")
|
21 |
+
|
22 |
+
# Load the Lora model
|
23 |
+
self.model = PeftModel.from_pretrained(self.model, peft_model_id, device_map="auto")
|
24 |
+
self.model.eval()
|
25 |
+
|
26 |
+
print("Peft model loaded")
|
27 |
+
|
28 |
+
pass
|
29 |
+
def translate(self,input:str=""):
|
30 |
+
input_prompt= "Generate LTL: " + input
|
31 |
+
replace=util.Task2Preplacer()
|
32 |
+
|
33 |
+
input_prompt=replace.reTask2P(input_prompt)
|
34 |
+
# print(predicter( replace.reTask2P(input_prompt)))
|
35 |
+
print(input_prompt)
|
36 |
+
input_ids = self.tokenizer(input_prompt, return_tensors="pt", truncation=True).input_ids.cuda()
|
37 |
+
outputs = self.model.generate(input_ids=input_ids, max_new_tokens=self.max_target_length, do_sample=True, top_p=0.9)
|
38 |
+
output_txt= self.tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
|
39 |
+
print(output_txt)
|
40 |
+
return replace.reP2Task(output_txt)
|
41 |
+
|
42 |
+
|
43 |
+
if __name__=="__main__":
|
44 |
+
test_prompts=[
|
45 |
+
"Task_1.1.1 must precede Task_1.1.2, which in turn should precede Task_1.1.3, ensuring that arranging fruits happens before preparing vegetables and prepping eggs and meats is done last.",
|
46 |
+
"Task_1.1 must be completed before Task_1.2 starts, and Task_1.2 must be completed before Task_1.3 starts."
|
47 |
+
]
|
48 |
+
|
49 |
+
translater=T5XXL_NL2TL_translator()
|
50 |
+
|
51 |
+
for ret in test_prompts:
|
52 |
+
print(translater.translate(ret))
|
53 |
+
|
54 |
+
flag=True
|
55 |
+
while flag:
|
56 |
+
lines=[""]
|
57 |
+
try:
|
58 |
+
lines.append(input())
|
59 |
+
while True:
|
60 |
+
lines.append(input())
|
61 |
+
except:
|
62 |
+
pass
|
63 |
+
ret ="".join(lines)
|
64 |
+
print(ret)
|
65 |
+
if ret=="":
|
66 |
+
flag=False
|
67 |
+
|
68 |
+
print(translater.translate(ret))
|
69 |
+
|
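Both interactive entry points (here and in llama_lora_test.py) end with the same pattern: read lines until EOF, join them into a single request, and stop once the request is empty. The bare except in that loop also swallows KeyboardInterrupt; a slightly more explicit sketch of the same idea, where `translate` stands in for translater.translate:

import sys

def read_request() -> str:
    # read everything until EOF (Ctrl-D) and join it into one request string
    return " ".join(line.strip() for line in sys.stdin).strip()

def interactive_loop(translate) -> None:
    while True:
        request = read_request()
        if not request:          # empty input ends the session
            break
        print(translate(request))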
finetune/__init__.py
ADDED
File without changes
|
finetune/data_augmentation/GPTbasedAug.py
ADDED
@@ -0,0 +1,100 @@
1 |
+
import json
|
2 |
+
import re
|
3 |
+
import sys,os
|
4 |
+
import numpy as np
|
5 |
+
# sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
|
6 |
+
from ... import utils as util
|
7 |
+
import time
|
8 |
+
l,r=40,2000
|
9 |
+
# has been added: 0,40
|
10 |
+
# index range of the source data to augment
|
11 |
+
batch_size=20
|
12 |
+
# number of sentences re-described per GPT request
|
13 |
+
|
14 |
+
|
15 |
+
dataPath="LTL_datasets/collect/"
|
16 |
+
data_eng_path=os.path.join(dataPath,"Cleaned_ENG.txt")
|
17 |
+
data_ltl_path=os.path.join(dataPath,"Cleaned_LTL.txt")
|
18 |
+
|
19 |
+
data_eng_output_path=os.path.join(dataPath,"eng_gpt_auged2.txt")
|
20 |
+
data_ltl_output_path=os.path.join(dataPath,"ltl_mid_order_ascii_gpt_auged2.txt")
|
21 |
+
data_src_output_path=os.path.join(dataPath,"idxsrc_gpt_auged2.txt")
|
22 |
+
|
23 |
+
with open(data_ltl_path) as txt:
|
24 |
+
content = txt.readlines()
|
25 |
+
txt.close()
|
26 |
+
ltl =np.array(content)
|
27 |
+
with open(data_eng_path) as txt:
|
28 |
+
content = txt.readlines()
|
29 |
+
txt.close()
|
30 |
+
eng =np.array(content)
|
31 |
+
|
32 |
+
print(len(ltl))
|
33 |
+
|
34 |
+
GPTinterface=util.GPTinterface(JSONlog=True,exp_PATH=dataPath)
|
35 |
+
|
36 |
+
import random
|
37 |
+
np.random.seed(42)
|
38 |
+
|
39 |
+
idx=np.arange(len(ltl))
|
40 |
+
np.random.shuffle(idx)
|
41 |
+
|
42 |
+
messages=[
|
43 |
+
{
|
44 |
+
"role": "system",
|
45 |
+
"content": """1. Here are some one-sentence examples phrased the way safe or co-safe properties in linear temporal logic are normally described; please remember and imitate the language style of the examples below
|
46 |
+
P02 and P03 can occur independently and either may be executed without affecting the other.
|
47 |
+
P07 must precede P17, which in turn should precede P15, ensuring that P07 happens before P17 and P15 is done last.
|
48 |
+
Always (P08 precedes P09) and Eventually (P08 is executed) and Eventually (P09 is executed).
|
49 |
+
Globally, P02 should be completed before P03 eventually starts.
|
50 |
+
Eventually, P08 and P09 should both be completed, and they can be done in any order.
|
51 |
+
P02 and P04 can be executed concurrently, while P03 can only be executed once P02 has been completed.
|
52 |
+
P07 must be completed before P17, P15, and P02 can be started. P17 must be completed before P15 and P02 can be started. P15 must be completed before P02 can be started.
|
53 |
+
P08 is a prerequisite for P09, P09 is a prerequisite for P10, and P10 is a prerequisite for P16.
|
54 |
+
P06 must be completed before P14, P14 must be completed before P11, and P11 must be completed before P12.
|
55 |
+
P02 and P05 are always possible to be executed, while the possibility of executing P03 and P04 is contingent upon the completion of P02.
|
56 |
+
P07 must be executed, and only after P07 is completed can P17 be executed, and only after P17 is completed can P15 be executed.
|
57 |
+
P08 can be executed independently, after which P09 can be executed.
|
58 |
+
P06 must be completed first, followed by P14, then P11, and finally P12.
|
59 |
+
P19 is always executed first, followed by P13, then P18, and finally P05.
|
60 |
+
P15 and P14 can be executed independently, and both should eventually be completed.
|
61 |
+
P07 must be completed before P17, and P17 must be completed before P15
|
62 |
+
P06 must be completed before P14 begins, and P14 must be completed before P11 begins"""
|
63 |
+
},{
|
64 |
+
"role":"user",
|
65 |
+
"content":"first go to P01 and then go to P20, always avoiding P02"
|
66 |
+
}
|
67 |
+
]
|
68 |
+
input_content="B. re describe this instruction using the style above\n"
|
69 |
+
input_LTL=""
|
70 |
+
input_idx=""
|
71 |
+
count=0
|
72 |
+
pattern=re.compile("[0-9]{2}\. ([\S ]*)\n")
|
73 |
+
|
74 |
+
for i in range(l,r):
|
75 |
+
if count>=batch_size:
|
76 |
+
count=0
|
77 |
+
messages[1]["content"]=input_content
|
78 |
+
GPTreturn=GPTinterface.communicate(messages=messages)
|
79 |
+
reDescription=pattern.findall(GPTreturn+'\n')
|
80 |
+
if len(reDescription)==batch_size:
|
81 |
+
with open(data_eng_output_path ,"a") as f:
|
82 |
+
for j in reDescription:
|
83 |
+
f.write(j)
|
84 |
+
f.write('\n')
|
85 |
+
with open(data_ltl_output_path,"a") as f:
|
86 |
+
f.write(input_LTL)
|
87 |
+
with open(data_src_output_path,"a") as f:
|
88 |
+
f.write(input_idx)
|
89 |
+
input_content="B. re describe this instruction using the style above\n"
|
90 |
+
input_LTL=""
|
91 |
+
input_idx=""
|
92 |
+
time.sleep(np.random.random()*5)
|
93 |
+
else:
|
94 |
+
count+=1
|
95 |
+
input_content+="{:0>2d}. {}".format(count,eng[idx[i]])
|
96 |
+
input_LTL+="{}".format(ltl[idx[i]])
|
97 |
+
input_idx+="{}\n".format(idx[i])
|
98 |
+
|
99 |
+
|
100 |
+
|
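GPTbasedAug packs batch_size source sentences into one numbered request and relies on the reply keeping the same NN. numbering, which the regex then splits back into individual rewrites; incomplete batches are discarded. The formatting/parsing round trip, separated from the OpenAI call and using made-up sentences plus a hypothetical model reply:

import re

batch = [
    "first go to P01 and then go to P20, always avoiding P02",
    "eventually reach P05 while never entering P03",
]

# build the numbered request the same way the script does
request = "B. re describe this instruction using the style above\n"
for k, sentence in enumerate(batch, start=1):
    request += "{:0>2d}. {}\n".format(k, sentence)

# hypothetical model reply that keeps the numbering
reply = ("01. First visit P01 and afterwards P20, while P02 is always avoided.\n"
         "02. P05 must eventually be reached and P03 must never be entered.\n")

pattern = re.compile(r"[0-9]{2}\. ([\S ]*)\n")
rewrites = pattern.findall(reply)
assert len(rewrites) == len(batch)   # only complete batches are written out
print(rewrites)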
finetune/data_augmentation/dataset_creator.py
ADDED
@@ -0,0 +1,76 @@
1 |
+
import json
|
2 |
+
import numpy as np
|
3 |
+
import os,sys
|
4 |
+
# sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
|
5 |
+
from ... import utils as util
|
6 |
+
class DataPreprocess():
|
7 |
+
def __init__(self,data_path="LTL_datasets/collect") -> None:
|
8 |
+
self.data_path=data_path
|
9 |
+
self.train_valid_split=0.1
|
10 |
+
pass
|
11 |
+
def txtdataReader(self):
|
12 |
+
LTL_list=[
|
13 |
+
# 'ltl_mid_order_ascii.txt',
|
14 |
+
# 'ltl_mid_order_ascii_gpt_auged.txt',
|
15 |
+
# 'ltl_mid_order_ascii_gpt_auged2.txt',
|
16 |
+
'Cleaned_LTL.txt'
|
17 |
+
]
|
18 |
+
ENG_list=[
|
19 |
+
# 'eng.txt',
|
20 |
+
# 'eng_gpt_auged.txt',
|
21 |
+
# 'eng_gpt_auged2.txt'
|
22 |
+
'Cleaned_ENG.txt'
|
23 |
+
]
|
24 |
+
content=[]
|
25 |
+
for filename in LTL_list:
|
26 |
+
with open(os.path.join(self.data_path,filename)) as txt:
|
27 |
+
content += txt.readlines()
|
28 |
+
txt.close()
|
29 |
+
self.ltl =np.array(content)
|
30 |
+
|
31 |
+
content=[]
|
32 |
+
for filename in ENG_list:
|
33 |
+
with open(os.path.join(self.data_path,filename)) as txt:
|
34 |
+
content += txt.readlines()
|
35 |
+
txt.close()
|
36 |
+
self.eng =np.array(content)
|
37 |
+
print(len(self.ltl))
|
38 |
+
|
39 |
+
def JSONdataCreate(self):
|
40 |
+
self.txtdataReader()
|
41 |
+
self.JSONWriter()
|
42 |
+
|
43 |
+
def JSONWriter(self):
|
44 |
+
np.random.seed(42)
|
45 |
+
# idx=np.random.shuffle( np.arange(len(ltl)))
|
46 |
+
self.idx=np.arange(len(self.ltl))
|
47 |
+
np.random.shuffle(self.idx)
|
48 |
+
with open(self.data_path+"/ltl_eng_train_mid_ascii_gptAuged.jsonl","w") as f:
|
49 |
+
for i in range(int(len(self.ltl)*(1-self.train_valid_split))):
|
50 |
+
json.dump({"natural":self.eng[self.idx[i]],"raw_ltl":self.ltl[self.idx[i]],"id":str(self.idx[i])},f)
|
51 |
+
f.write('\n')
|
52 |
+
with open(self.data_path+"/ltl_eng_test_mid_ascii_gptAuged.jsonl","w") as f:
|
53 |
+
for i in range(int(len(self.ltl)*(1-self.train_valid_split)),len(self.ltl)):
|
54 |
+
json.dump({"natural":self.eng[self.idx[i]],"raw_ltl":self.ltl[self.idx[i]],"id":str(self.idx[i])},f)
|
55 |
+
f.write('\n')
|
56 |
+
|
57 |
+
def dataCheck(self):
|
58 |
+
self.txtdataReader()
|
59 |
+
checker=util.LTLChecker()
|
60 |
+
with open(os.path.join(self.data_path,"Cleaned_LTL.txt"),"a") as passed_LTL:
|
61 |
+
with open(os.path.join(self.data_path,"Cleaned_ENG.txt"),"a") as passed_ENG:
|
62 |
+
with open(os.path.join(self.data_path,"UNCleaned_num.txt"),"a") as unpassed_row:
|
63 |
+
with open(os.path.join(self.data_path,"UNCleaned_LTL.txt"),"a") as unpassed_LTL:
|
64 |
+
with open(os.path.join(self.data_path,"UNCleaned_ENG.txt"),"a") as unpassed_ENG:
|
65 |
+
for id in range(len(self.ltl)):
|
66 |
+
if checker.AP_CorrCheck(self.ltl[id],self.eng[id]):
|
67 |
+
passed_LTL.write(self.ltl[id])
|
68 |
+
passed_ENG.write(self.eng[id])
|
69 |
+
else:
|
70 |
+
unpassed_row.write("{}\n".format(id))
|
71 |
+
unpassed_LTL.write(self.ltl[id])
|
72 |
+
unpassed_ENG.write(self.eng[id])
|
73 |
+
|
74 |
+
if __name__=="__main__":
|
75 |
+
# DataPreprocess().dataCheck()
|
76 |
+
DataPreprocess().JSONdataCreate()
|
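dataCheck() keeps only pairs whose English sentence and LTL formula mention the same atomic propositions, with the repo's LTLChecker doing the real comparison. A minimal stand-in for that filter, assuming propositions are spelled P01, P02, ... in both strings:

import re

PROP = re.compile(r"P\d+")

def props_agree(natural: str, ltl: str) -> bool:
    # both strings must reference exactly the same set of propositions
    return set(PROP.findall(natural)) == set(PROP.findall(ltl))

print(props_agree("first go to P01 and then P20, always avoiding P02",
                  "F ( P01 & F ( P20 ) ) & G ( ! P02 )"))           # True
print(props_agree("first go to P01", "F ( P01 ) & G ( ! P02 )"))     # False: P02 is never mentioned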
finetune/mistral7b/finetune.py
ADDED
@@ -0,0 +1,353 @@
1 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
2 |
+
# device = "cuda" # the device to load the model onto
|
3 |
+
# from huggingface_hub import login
|
4 |
+
# login()
|
5 |
+
import json
|
6 |
+
import numpy as np
|
7 |
+
import sys,os
|
8 |
+
from datasets import load_dataset
|
9 |
+
import torch
|
10 |
+
from transformers import (AutoModelForCausalLM,
|
11 |
+
AutoTokenizer,
|
12 |
+
BitsAndBytesConfig,
|
13 |
+
TrainingArguments,
|
14 |
+
pipeline,
|
15 |
+
logging,
|
16 |
+
TrainerCallback)
|
17 |
+
from peft import LoraConfig, PeftConfig, prepare_model_for_kbit_training, get_peft_model
|
18 |
+
from trl import SFTTrainer
|
19 |
+
from accelerate import infer_auto_device_map,init_empty_weights
|
20 |
+
import wandb
|
21 |
+
from datasets import concatenate_datasets
|
22 |
+
import numpy as np
|
23 |
+
# sys.path.append('../../../')
|
24 |
+
# sys.path.append('../../')
|
25 |
+
# sys.path.append('../')
|
26 |
+
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
|
27 |
+
# os.environ['CUDA_VISIBLE_DEVICES'] = "5,6,7"
|
28 |
+
# device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
|
29 |
+
sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
|
30 |
+
# import utils.util as util
|
31 |
+
# Load dataset from the hub
|
32 |
+
# dataset = load_dataset("samsum")
|
33 |
+
device='cuda'
|
34 |
+
np.random.seed(42)
|
35 |
+
output_dir = "/home/user/xsj/model_weight/"
|
36 |
+
datapath='/home/user/xsj/NL2TL-dataset/collect2'
|
37 |
+
exp_name="_mid_ascii_0327_eos_2"
|
38 |
+
explainer_files=['LTLexplain_0.json','LTLexplain_1.json','LTLexplain_2.json','LTLexplain_3.json']
|
39 |
+
explainer_dic={}
|
40 |
+
for path in explainer_files:
|
41 |
+
with open(os.path.join(datapath,path)) as f:
|
42 |
+
LTLlist=json.load(f)
|
43 |
+
for key in LTLlist.keys():
|
44 |
+
if isinstance(LTLlist[key],dict):
|
45 |
+
if not (key in explainer_dic):
|
46 |
+
explainer_dic[key]=[]
|
47 |
+
explainer_dic[key].append(LTLlist[key]['translate'])
|
48 |
+
sp=LTLlist[key]['explain'].split("means that")
|
49 |
+
if len(sp)>1:
|
50 |
+
explainer_dic[key].append(sp[1])
|
51 |
+
|
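The loop above builds explainer_dic, a lookup from each LTL formula to a list of natural-language glosses: the 'translate' field is always kept, and the free-form 'explain' text contributes whatever follows the phrase "means that". During preprocessing one of these glosses is sampled at random to fill the logic-translation slot of the training prompt. A toy reconstruction with a made-up record (the real LTLexplain_*.json files follow this shape):

import json

# miniature stand-in for one LTLexplain_*.json entry
record = json.loads("""
{
  "F ( P01 ) & G ( ! P02 )": {
    "translate": "eventually P01 while P02 never happens",
    "explain": "The formula means that P01 is eventually reached and P02 is always avoided."
  }
}
""")

explainer = {}
for ltl, item in record.items():
    explainer.setdefault(ltl, []).append(item["translate"])
    head, sep, tail = item["explain"].partition("means that")
    if sep:                                   # keep the explanation only if the marker is present
        explainer[ltl].append(tail.strip())

print(explainer["F ( P01 ) & G ( ! P02 )"])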
52 |
+
base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
|
53 |
+
bnb_config = BitsAndBytesConfig(
|
54 |
+
load_in_4bit = True,
|
55 |
+
bnb_4bit_use_double_quant = False,
|
56 |
+
bnb_4bit_quant_type = 'nf4',
|
57 |
+
bnb_4bit_compute_dtype = getattr(torch, "float16")
|
58 |
+
)
|
59 |
+
bnb_config = BitsAndBytesConfig(
|
60 |
+
load_in_8bit = True,
|
61 |
+
# llm_int8_threshold=200.0
|
62 |
+
# bnb_4bit_use_double_quant = False,
|
63 |
+
# bnb_4bit_quant_type = 'nf4',
|
64 |
+
# bnb_4bit_compute_dtype = getattr(torch, "float16")
|
65 |
+
)
|
66 |
+
import os
|
67 |
+
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
|
68 |
+
# os.environ['CUDA_VISIBLE_DEVICES']='0'
|
69 |
+
device_map="auto"
|
70 |
+
# torch.cuda.set_device(7)
|
71 |
+
# device_map={'':torch.cuda.current_device()}
|
72 |
+
# device_map = {'':'cuda:7'}
|
73 |
+
# model_dir is the path or name of the model
|
74 |
+
# config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
|
75 |
+
# with init_empty_weights():
|
76 |
+
# base_model = AutoModelForCausalLM.from_pretrained(
|
77 |
+
# base_model_name,
|
78 |
+
# from_tf=bool(".ckpt" in base_model_name),
|
79 |
+
# quantization_config=bnb_config,
|
80 |
+
# device_map=device_map,
|
81 |
+
# trust_remote_code=True,
|
82 |
+
# use_auth_token=True
|
83 |
+
# )
|
84 |
+
|
85 |
+
# map_list = {5:"15GB", 6:"15GB",7:"15GB"} # memory cap per GPU, keyed by device index
|
86 |
+
# map_list = {7:"15GB",} # memory cap per GPU, keyed by device index
|
87 |
+
# no_split_modules = base_model._no_split_modules
|
88 |
+
# device_map = infer_auto_device_map(base_model, max_memory=map_list, no_split_module_classes=no_split_modules)
|
89 |
+
|
90 |
+
|
91 |
+
dataset = load_dataset("json", data_files={"train":os.path.join(datapath,"ltl_eng_train_mid_ascii_gptAuged.jsonl"),"test":os.path.join(datapath,"ltl_eng_test_mid_ascii_gptAuged.jsonl")})
|
92 |
+
print(dataset)
|
93 |
+
|
94 |
+
|
95 |
+
|
96 |
+
# tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
|
97 |
+
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
|
98 |
+
# , add_eos_token=True,trust_remote_code=True)
|
99 |
+
# NOTE: it is unclear whether add_eos_token is required; without it, generation keeps going until max_new_tokens is reached
|
100 |
+
# with add_eos_token enabled, generation consistently failed
|
101 |
+
# if this option is used, the model generates something else entirely
|
102 |
+
tokenizer.pad_token = tokenizer.eos_token
|
103 |
+
tokenizer.padding_side = 'right'
|
104 |
+
# print(tokenizer.eos_token_id)
|
105 |
+
# 2
|
106 |
+
# print(tokenizer.bos_token_id)
|
107 |
+
# 1
|
108 |
+
# print(tokenizer._convert_token_to_id(tokenizer.bos_token))
|
109 |
+
|
110 |
+
def preprocess_function(sample,padding="max_length"):
|
111 |
+
# add prefix to the input for t5
|
112 |
+
# print(sample[0])
|
113 |
+
inputs=[
|
114 |
+
f"""### Instruction:
|
115 |
+
translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'
|
116 |
+
|
117 |
+
### Natural Language Task:
|
118 |
+
{sample['natural'][i].strip()}
|
119 |
+
|
120 |
+
### Logic Translation:
|
121 |
+
{explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))]}
|
122 |
+
|
123 |
+
### linear temproal logic:
|
124 |
+
{sample['raw_ltl'][i].strip()}
|
125 |
+
</s>""".lower()
|
126 |
+
# NOTE it seems the eos is needed, the bos is not needed(the bos will be automatically added)
|
127 |
+
for i in (range(len(sample['natural'])))]
|
128 |
+
# inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
|
129 |
+
|
130 |
+
sample["complete_text"] = inputs
|
131 |
+
return sample
|
132 |
+
# method1
|
133 |
+
# tokenized_dataset = dataset.map(preprocess_function, batched=True)
|
134 |
+
# method2
|
135 |
+
def preprocess_function2(sample,padding="max_length"):
|
136 |
+
# add prefix to the input for t5
|
137 |
+
# print(sample[0])
|
138 |
+
inputs=[
|
139 |
+
tokenizer.apply_chat_template(
|
140 |
+
[
|
141 |
+
{"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, please pay specific attention to logic grammar, the natural language task is {}".format(sample['natural'][i].strip())},
|
142 |
+
{"role": "assistant", "content": "logic expression is {}, and LTL is {} .".format(
|
143 |
+
explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))],
|
144 |
+
sample['raw_ltl'][i].strip()
|
145 |
+
)
|
146 |
+
},
|
147 |
+
# {"role": "user", "content": " pay specific attention to brackets '()', linear temproal logic is"},
|
148 |
+
# {"role": "assistant", "content": "LTL is {} .".format(
|
149 |
+
# sample['raw_ltl'][i].strip()
|
150 |
+
# )
|
151 |
+
# }
|
152 |
+
],tokenize=False)
|
153 |
+
# NOTE it seems the eos is needed, the bos is not needed(the bos will be automatically added)
|
154 |
+
for i in (range(len(sample['natural'])))]
|
155 |
+
# inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
|
156 |
+
|
157 |
+
sample["complete_text"] = inputs
|
158 |
+
return sample
|
159 |
+
tokenized_dataset = dataset.map(preprocess_function2, batched=True)
|
160 |
+
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
|
161 |
+
|
162 |
+
# save datasets to disk for later easy loading
|
163 |
+
# tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
|
164 |
+
# tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
|
165 |
+
|
166 |
+
class PeftSavingCallback(TrainerCallback):
|
167 |
+
def on_save(self, args, state, control, **kwargs):
|
168 |
+
checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
|
169 |
+
kwargs["model"].save_pretrained(checkpoint_path)
|
170 |
+
|
171 |
+
if "pytorch_model.bin" in os.listdir(checkpoint_path):
|
172 |
+
os.remove(os.path.join(checkpoint_path, "pytorch_model.bin"))
|
173 |
+
callbacks = [PeftSavingCallback]
|
174 |
+
|
175 |
+
peft_config = LoraConfig(
|
176 |
+
lora_alpha=16,
|
177 |
+
lora_dropout=0.05,
|
178 |
+
r=128,
|
179 |
+
bias="none",
|
180 |
+
task_type="CAUSAL_LM",
|
181 |
+
target_modules=["q_proj", "v_proj"]
|
182 |
+
)
|
183 |
+
|
184 |
+
|
185 |
+
training_arguments = TrainingArguments(
|
186 |
+
output_dir=output_dir,
|
187 |
+
logging_dir = os.path.join(output_dir,"logs"),
|
188 |
+
per_device_train_batch_size=1,
|
189 |
+
num_train_epochs=3,
|
190 |
+
gradient_accumulation_steps=8,
|
191 |
+
optim="paged_adamw_32bit",
|
192 |
+
save_strategy='epoch',
|
193 |
+
logging_steps=25,
|
194 |
+
learning_rate=2e-4,
|
195 |
+
weight_decay=0.001,
|
196 |
+
fp16=True,
|
197 |
+
bf16=False,
|
198 |
+
max_grad_norm=0.3,
|
199 |
+
max_steps=-1,
|
200 |
+
warmup_ratio = 0.05,
|
201 |
+
group_by_length=True,
|
202 |
+
lr_scheduler_type="cosine",
|
203 |
+
report_to="wandb",
|
204 |
+
evaluation_strategy="epoch",
|
205 |
+
do_eval=True,
|
206 |
+
run_name = base_model_name+exp_name,
|
207 |
+
disable_tqdm=False
|
208 |
+
)
|
209 |
+
import os
|
210 |
+
output_dir = os.path.join(output_dir, "mistral7b"+exp_name+'aug1_quat8')
|
211 |
+
|
212 |
+
base_model = AutoModelForCausalLM.from_pretrained(
|
213 |
+
base_model_name,
|
214 |
+
from_tf=bool(".ckpt" in base_model_name),
|
215 |
+
quantization_config=bnb_config,
|
216 |
+
device_map=device_map,
|
217 |
+
trust_remote_code=True,
|
218 |
+
use_auth_token=True
|
219 |
+
)
|
220 |
+
base_model.config.use_cache = False
|
221 |
+
|
222 |
+
# More info: https://github.com/huggingface/transformers/pull/24906
|
223 |
+
base_model.config.pretraining_tp = 1
|
224 |
+
|
225 |
+
base_model.gradient_checkpointing_enable()
|
226 |
+
base_model = prepare_model_for_kbit_training(base_model)
|
227 |
+
base_model = get_peft_model(base_model, peft_config)
|
228 |
+
|
229 |
+
trainer = SFTTrainer(
|
230 |
+
model=base_model,
|
231 |
+
train_dataset=tokenized_dataset['train'],
|
232 |
+
eval_dataset=tokenized_dataset['test'],
|
233 |
+
peft_config=peft_config,
|
234 |
+
dataset_text_field="complete_text",
|
235 |
+
max_seq_length=512,
|
236 |
+
tokenizer=tokenizer,
|
237 |
+
args=training_arguments,
|
238 |
+
callbacks=callbacks,
|
239 |
+
packing=False,
|
240 |
+
)
|
241 |
+
wandb.login()
|
242 |
+
trainer.train()
|
243 |
+
trainer.model.save_pretrained(output_dir)
|
244 |
+
# trainer.model.save_pretrained(output_dir)
|
245 |
+
tokenizer.save_pretrained(output_dir)
|
246 |
+
|
247 |
+
wandb.finish()
|
248 |
+
|
249 |
+
# check
|
250 |
+
print('model dir',output_dir)
|
251 |
+
from peft import AutoPeftModelForCausalLM
|
252 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
253 |
+
model = AutoPeftModelForCausalLM.from_pretrained(output_dir,
|
254 |
+
from_tf=bool(".ckpt" in output_dir),
|
255 |
+
quantization_config=bnb_config,
|
256 |
+
device_map=device_map,
|
257 |
+
trust_remote_code=True,
|
258 |
+
use_auth_token=True
|
259 |
+
)
|
260 |
+
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
|
261 |
+
tokenizer.pad_token = tokenizer.eos_token
|
262 |
+
print(tokenizer.default_chat_template)
|
263 |
+
def evaluate_model(input_text):
|
264 |
+
input_text =f"""### Instruction:
|
265 |
+
translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()' ### Natural Language Task:
|
266 |
+
{input_text}""".lower()
|
267 |
+
inputs = tokenizer(input_text, return_tensors="pt").to(device)
|
268 |
+
print(inputs)
|
269 |
+
outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
|
270 |
+
|
271 |
+
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
272 |
+
|
273 |
+
def evaluate_model2(input_text):
|
274 |
+
messages=[
|
275 |
+
{"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()', natural language task: {}".format(input_text)},
|
276 |
+
]
|
277 |
+
|
278 |
+
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
|
279 |
+
outputs = model.generate(encodeds, max_new_tokens=512)
|
280 |
+
# , pad_token_id=tokenizer.eos_token_id)
|
281 |
+
# input_text =f"""### Instruction:
|
282 |
+
# translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()' ### Natural Language Task:
|
283 |
+
# {input_text}""".lower()
|
284 |
+
# inputs = tokenizer(input_text, return_tensors="pt").to(device)
|
285 |
+
# print(inputs)
|
286 |
+
# outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
|
287 |
+
|
288 |
+
return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
289 |
+
# if __name__=='__main__':
|
290 |
+
import evaluate
|
291 |
+
import numpy as np
|
292 |
+
from datasets import load_from_disk
|
293 |
+
from tqdm import tqdm
|
294 |
+
|
295 |
+
# Metric
|
296 |
+
metric = evaluate.load("rouge")
|
297 |
+
|
298 |
+
|
299 |
+
# load test dataset from distk
|
300 |
+
# test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
|
301 |
+
|
302 |
+
# run predictions
|
303 |
+
# this can take ~45 minutes
|
304 |
+
import re
|
305 |
+
pattern=re.compile("linear temproal logic is ([\S ]*)")
|
306 |
+
predictions, references,input_sentence,output_sentence=[], [] , [], []
|
307 |
+
for idx in range(len(tokenized_dataset['test']['natural'])):
|
308 |
+
# print(sample)
|
309 |
+
nl=tokenized_dataset['test']['natural'][idx]
|
310 |
+
p = evaluate_model2(nl)
|
311 |
+
# print(p,l)
|
312 |
+
input_sentence.append(nl)
|
313 |
+
|
314 |
+
transLTL=pattern.findall(p)
|
315 |
+
print(p)
|
316 |
+
if transLTL[0][-1]=='.':
|
317 |
+
transLTL[0]=transLTL[0][:-1].strip()
|
318 |
+
else:
|
319 |
+
transLTL[0]=transLTL[0].strip()
|
320 |
+
predictions.append(transLTL[0])
|
321 |
+
output_sentence.append(p)
|
322 |
+
input_sentence.append(p)
|
323 |
+
references.append(tokenized_dataset['test']['raw_ltl'][idx].strip())
|
324 |
+
print(input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
|
325 |
+
|
326 |
+
# compute metric
|
327 |
+
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
|
328 |
+
|
329 |
+
# print results
|
330 |
+
print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
|
331 |
+
print(f"rouge2: {rogue['rouge2']* 100:2f}%")
|
332 |
+
print(f"rougeL: {rogue['rougeL']* 100:2f}%")
|
333 |
+
print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
|
334 |
+
eval_output=np.array([input_sentence,predictions,references]).T
|
335 |
+
import pandas as pd
|
336 |
+
eval_output=pd.DataFrame(eval_output)
|
337 |
+
pd.DataFrame.to_csv(eval_output,output_dir+'/output')
|
338 |
+
|
339 |
+
exit()
|
340 |
+
messages = [
|
341 |
+
{"role": "user", "content": "What is your favourite condiment?"},
|
342 |
+
{"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
|
343 |
+
{"role": "user", "content": "Do you have mayonnaise recipes?"}
|
344 |
+
]
|
345 |
+
|
346 |
+
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
|
347 |
+
|
348 |
+
model_inputs = encodeds.to(device)
|
349 |
+
model.to(device)
|
350 |
+
|
351 |
+
generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
|
352 |
+
decoded = tokenizer.batch_decode(generated_ids)
|
353 |
+
print(decoded[0])
|
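A possible follow-up to this script, sketched here for reference rather than taken from the committed code: the run above saves only the LoRA adapter, so for deployment without `peft` and the quantized base the adapter can be folded into the base weights. The directory names below are placeholders for the `output_dir` produced by the training run, and the sketch assumes the adapter can be loaded in fp16 on the available GPU.

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

adapter_dir = "path/to/output_dir"    # placeholder: directory written by trainer.model.save_pretrained
merged_dir = adapter_dir + "_merged"  # placeholder: where the standalone model goes

# load base model plus adapter in half precision (no 8-bit quantization here,
# since merging needs the full weight matrices)
model = AutoPeftModelForCausalLM.from_pretrained(adapter_dir, torch_dtype=torch.float16)
model = model.merge_and_unload()      # fold the LoRA deltas into the base weights
model.save_pretrained(merged_dir)
AutoTokenizer.from_pretrained(adapter_dir).save_pretrained(merged_dir)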
finetune/mistral7b/prediction.py
ADDED
@@ -0,0 +1,308 @@
+ # from huggingface_hub import login
+ # login()
+ import sys,os
+ from datasets import load_dataset
+ import torch
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
+ # from peft import LoraConfig
+ # from trl import SFTTrainer
+ # from accelerate import infer_auto_device_map,init_empty_weights
+
+ # sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
+ from NL2HLTLtaskPlanner.utils import Task2Preplacer
+ from NL2HLTLtaskPlanner.utils import LTLChecker
+ import re
+ from datasets import concatenate_datasets
+ import numpy as np
+ from peft import AutoPeftModelForCausalLM
+ os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
+ # os.environ['CUDA_VISIBLE_DEVICES']='3'
+
+
+
+ class Mistral_NL2TL_translator():
+     def __init__(self,
+                  output_dir = "/home/user/xsj/model_weight",
+                  tuned_model_name="mistral7b_mid_ascii_0327_eos_2aug1_quat8",
+                  # CUDA_device='0',
+                  quat=True,
+                  replacer=Task2Preplacer) -> None:
+         # os.environ['CUDA_VISIBLE_DEVICES']=CUDA_device
+         self.device_map="auto"
+         self.model_dir = os.path.join(output_dir, tuned_model_name)
+         # check
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         # AutoPeftModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf")
+
+
+         # quantconfig = BitsAndBytesConfig(
+         #     load_in_8bit=True,
+         #     bnb_8bit_quant_type="nf4",
+         #     bnb_8bit_use_double_quant=True,
+         #     bnb_8bit_compute_dtype=torch.bfloat16,
+         # )
+         # if quat==False:
+         #     self.model = AutoPeftModelForCausalLM.from_pretrained(self.output_dir, device_map=self.device_map, torch_dtype=torch.bfloat16)
+         #     # on a machine with enough GPU memory, quantization can be skipped
+         # else:
+         #     self.model = AutoPeftModelForCausalLM.from_pretrained(self.output_dir,device_map=self.device_map, torch_dtype=torch.float16,
+         #         load_in_8bit=True)
+         #     # quantization_config=quantconfig)
+         self.bnb_config = BitsAndBytesConfig(
+             load_in_4bit = True,
+             bnb_4bit_use_double_quant = False,
+             bnb_4bit_quant_type = 'nf4',
+             bnb_4bit_compute_dtype = getattr(torch, "float16")
+         )
+         self.bnb_config = BitsAndBytesConfig(
+             load_in_8bit = True,
+             # llm_int8_threshold=200.0
+             # bnb_4bit_use_double_quant = False,
+             # bnb_4bit_quant_type = 'nf4',
+             # bnb_4bit_compute_dtype = getattr(torch, "float16")
+         )
+         # self.bnb_config = BitsAndBytesConfig(
+         #     load_in_8bit = False,
+         #     load_in_4bit = False,
+         #     # llm_int8_threshold=200.0
+         #     # bnb_4bit_use_double_quant = False,
+         #     # bnb_4bit_quant_type = 'nf4',
+         #     # bnb_4bit_compute_dtype = getattr(torch, "float16")
+         # )
+         self.model = AutoModelForCausalLM.from_pretrained(
+             self.model_dir,
+             from_tf=bool(".ckpt" in self.model_dir),
+             quantization_config=self.bnb_config,
+             device_map=self.device_map,
+             trust_remote_code=True,
+             use_auth_token=True
+         )
+         self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
+         # , trust_remote_code=True,add_eos_token=True,)
+         # tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
+         # NOTE it is unclear whether add_eos_token is required, but without it generation keeps going until max_new_tokens is reached
+         # in prediction mode, do not set add_eos_token=True: the tokenizer would append </s> to the input and the output becomes irregular
+         # when add_eos_token is set, it always failed
+         self.tokenizer.pad_token = self.tokenizer.eos_token
+         self.tokenizer.padding_side = 'right'
+         print(self.tokenizer.eos_token_id)
+         # 2
+         print(self.tokenizer.bos_token_id)
+         # 1
+         # print(tokenizer._convert_token_to_id(tokenizer.bos_token))
+
+         print("NL2TL model loaded")
+
+         self.replacer=replacer
+         self.ltlChecker=LTLChecker()
+         pass
+
+         # print('NL2TL llama translate test:')
+         # self.translate("Task_1.1 must be done, and Task_1.2 should be finished before Task_1.1")
+     def evaluate_model(self, input_text):
+         self.pattern=re.compile("linear temproal logic is ([\S ]*).")
+         messages=[
+             {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()', natural language task: {}".format(input_text.strip())},
+         ]
+
+         encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
+         outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
+
+         p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+         print('model output:',p)
+         transLTL=self.pattern.findall(p)[0]
+         if transLTL[-1]=='.':
+             transLTL=transLTL[:-1].strip()
+         else:
+             transLTL=transLTL.strip()
+         transLTL=self.ltlChecker.right_barkets_remover(transLTL)
+         print('transLTL:\n',transLTL)
+         return transLTL
+     def evaluate_model2(self, input_text):
+         self.pattern=re.compile("LTL is ([\S ]*).")
+         messages=[
+             {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, the natural language task is {}".format(input_text.strip())},
+         ]
+         encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
+         outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
+         p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+         print('---model output 1:\n',p)
+         # messages=[
+         #     {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, the natural language task is {}".format(input_text.strip())},
+         #     {"role": "assistant", "content":p
+         #     },
+         #     {"role": "user", "content": " pay specific attention to brackets '()', given your linear temproal logic translation"},
+         # ]
+
+         # encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
+         # outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
+
+         # p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+         # print('---model output 2:\n',p)
+         transLTL=self.pattern.findall(p)[0]
+         if transLTL[-1]=='.':
+             transLTL=transLTL[:-1].strip()
+         else:
+             transLTL=transLTL.strip()
+         transLTL=self.ltlChecker.right_barkets_remover(transLTL)
+         print('transLTL:\n',transLTL)
+         return transLTL
+     def evaluate_model3(self, input_text):
+         # guard against degenerate outputs such as "LTL is a larger language model . . . . . . "
+         # self.pattern=re.compile("LTL is ([\S ]*)\.")
+         self.pattern=re.compile("LTL is ([^\.]*)\.")
+         messages=[
+             {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, please pay specific attention to logic grammar, the natural language task is {}".format(input_text.strip())},
+         ]
+         encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
+         outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
+         p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+         print('---model output 1:\n',p)
+         # messages=[
+         #     {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, the natural language task is {}".format(input_text.strip())},
+         #     {"role": "assistant", "content":p
+         #     },
+         #     {"role": "user", "content": " pay specific attention to brackets '()', given your linear temproal logic translation"},
+         # ]
+
+         # encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
+         # outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
+
+         # p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+         # print('---model output 2:\n',p)
+         transLTL=self.pattern.findall(p)
+         if len(transLTL)==0:
+             return False
+         transLTL=transLTL[0]
+         if transLTL[-1]=='.':
+             transLTL=transLTL[:-1].strip()
+         else:
+             transLTL=transLTL.strip()
+         transLTL=self.ltlChecker.right_barkets_remover(transLTL)
+         print('transLTL:\n',transLTL)
+         return transLTL
+     def translate(self,input_prompt:str=""):
+         print('input_prompt:\n',input_prompt)
+         replacer=self.replacer()
+         input_prompt=replacer.reTask2P(input_prompt)
+         # print(predicter( replace.reTask2P(input_prompt)))
+         # print(input_prompt)
+
+
+         # print(p)
+         flag_check_false_count=0
+         flag_check=False
+         while not flag_check and flag_check_false_count<10:
+             flag_check_false_count+=1
+             flag_check=True
+             transLTL=self.evaluate_model3(input_prompt)
+             # evaluate_model3 returns False when no formula could be extracted,
+             # so check that before applying the string replacements
+             if isinstance(transLTL,bool):
+                 flag_check=False
+             else:
+                 transLTL=transLTL.replace('Or','And')
+                 transLTL=transLTL.replace('Globally','Finally')
+                 if not self.ltlChecker.AP_CorrCheck(input_prompt,transLTL):
+                     print('AP_CorrCheck false')
+                     flag_check=False
+                 elif not self.ltlChecker.brackets_Check(transLTL):
+                     print('brackets_Check false')
+                     flag_check=False
+         # print(p)
+         return replacer.reP2Task(transLTL)
+
+
+ if __name__=="__main__":
+     # translater=Mistral_NL2TL_translator()
+     # test_prompts=[
+     #     "Task_1.1.1 must precede Task_1.1.2, which in turn should precede Task_1.1.3, ",
+     #     "Task_1.1 must be completed before Task_1.2 starts, and Task_1.2 must be completed before Task_1.3 starts." ,
+     #     "Task_1.1 can be executed independently, after which Task_1.2 can be executed.",
+     #     "Task_1.2.4 must be completed first, followed by Task_1.2.2, then Task_1.2.3, and finally Task_1.2.1.",
+     #     "Task_1.2.4 is always executed first, followed by Task_1.2.3, then Task_1.2.2, and finally Task_1.2.1.",
+     #     "Task_1.2.1 and Task_1.2.2 can be executed independently, and both should eventually be completed.",
+     # ]
+     # for ret in test_prompts:
+     #     print(translater.translate(ret))
+     #     print('\n','-'*20,'\n')
+     # exit()
+     class p2preplacer():
+         def reTask2P(self,input):
+             return input
+         def reP2Task(self,input):
+             return input
+     translater=Mistral_NL2TL_translator(replacer=p2preplacer)
+     import evaluate
+     import numpy as np
+     # from datasets import load_from_disk
+     from tqdm import tqdm
+
+     # Metric
+     metric = evaluate.load("rouge")
+     datapath='/home/user/xsj/NL2TL-dataset/collect2'
+     tokenized_dataset = load_dataset("json", data_files={"train":os.path.join(datapath,"ltl_eng_train_mid_ascii_gptAuged.jsonl"),"test":os.path.join(datapath,"ltl_eng_test_mid_ascii_gptAuged.jsonl")})
+     print(tokenized_dataset)
+     # run predictions
+     # this can take ~45 minutes
+     import re
+     # pattern=re.compile("\[Formal LTL\]:\n([\S ]*)\n")
+     predictions, references,input_sentence,output_sentence=[], [] , [], []
+     # with open()
+     for idx in range(len(tokenized_dataset['test']['natural'])):
+         # print(sample)
+         nl=tokenized_dataset['test']['natural'][idx]
+         transLTL=translater.translate(nl)
+         # p = translater.evaluate_model(nl)
+         # # print(p,l)
+         input_sentence.append(nl)
+
+         # transLTL=pattern.findall(p)
+         # # print(p)
+         predictions.append(transLTL)
+         # output_sentence.append(p)
+         # input_sentence.append(nl)
+         references.append(tokenized_dataset['test']['raw_ltl'][idx].strip())
+         print(idx,'\n',input_sentence[-1],
+               # '\nout::\n',output_sentence[-1],
+               '\npre::\n',predictions[-1],
+               '\nref::\n',references[-1],'\n','-'*20,'\n')
+
+     # compute metric
+     rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
+
+     # print results
+     print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
+     print(f"rouge2: {rogue['rouge2']* 100:2f}%")
+     print(f"rougeL: {rogue['rougeL']* 100:2f}%")
+     print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
+     eval_output=np.array([input_sentence,predictions,references]).T
+     import pandas as pd
+     eval_output=pd.DataFrame(eval_output)
+     pd.DataFrame.to_csv(eval_output,"/home/user/xsj/model_weight/mistral7b_mid_ascii_0327_eos_2aug1_quat8"+'/output')
+     # results with the llama model
+     # Rogue1: 98.363321%
+     # rouge2: 95.987820%
+     # rougeL: 97.384820%
+     # rougeLsum: 97.382071%
+
+     # results with this model
+     # Rogue1: 98.543297%
+     # rouge2: 96.575248%
+     # rougeL: 97.720560%
+     # rougeLsum: 97.724880%
+     exit()
+     flag=True
+     while flag:
+         lines=[""]
+         try:
+             lines.append(input())
+             while True:
+                 lines.append(input())
+         except:
+             pass
+         ret ="".join(lines)
+         print(ret)
+         if ret=="":
+             flag=False
+
+         print(translater.translate(ret))
+
+
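The extraction step in `evaluate_model3` is just a regular-expression match over the decoded chat output, followed by stripping a trailing period and the bracket fix-up from `LTLChecker`. A self-contained illustration of the regex part only; the decoded string below is invented for the example, not a real model response:

import re

pattern = re.compile(r"LTL is ([^\.]*)\.")
decoded = ("logic expression is eventually p03 and then eventually p04, "
           "and LTL is F ( p03 & F ( p04 ) ) .")
matches = pattern.findall(decoded)
if matches:                       # translate() retries when nothing matches
    formula = matches[0].strip()  # -> "F ( p03 & F ( p04 ) )"
    print(formula)
else:
    print("no LTL formula found")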
finetune/mistral7b/test.py
ADDED
@@ -0,0 +1,32 @@
+ import torch
+ from transformers import (AutoModelForCausalLM,
+                           AutoTokenizer,
+                           BitsAndBytesConfig,
+                           TrainingArguments,
+                           pipeline,
+                           logging,
+                           TrainerCallback)
+ device = "cuda" # the device to load the model onto
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit = True,
+     bnb_4bit_use_double_quant = False,
+     bnb_4bit_quant_type = 'nf4',
+     bnb_4bit_compute_dtype = getattr(torch, "float16")
+ )
+ model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2",quantization_config=bnb_config,)
+ tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
+
+ messages = [
+     {"role": "user", "content": "What is your favourite condiment?"},
+     {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
+     {"role": "user", "content": "Do you have mayonnaise recipes?"}
+ ]
+
+ encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
+
+ model_inputs = encodeds.to(device)
+ # model.to(device)
+
+ generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
+ decoded = tokenizer.batch_decode(generated_ids)
+ print(decoded[0])
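Not part of the committed script, but since `preprocess_function2` in the fine-tuning script builds its training text with this same chat template, it can help to print the untokenized prompt that this smoke test produces. A small sketch reusing the `tokenizer` and `messages` defined above; the exact whitespace and special tokens depend on the tokenizer version:

prompt_text = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt_text)
# roughly: <s>[INST] What is your favourite condiment? [/INST]Well, I'm quite partial ...</s>[INST] Do you have mayonnaise recipes? [/INST]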
finetune/realtime_run.py
ADDED
@@ -0,0 +1,54 @@
+ import torch
+ from peft import PeftModel, PeftConfig
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ # Load peft config for pre-trained checkpoint etc.
+ peft_model_id="finetuned_model/results"
+ config = PeftConfig.from_pretrained(peft_model_id)
+
+ # load base LLM model and tokenizer
+ model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map={"":0})
+ tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
+
+ # Load the Lora model
+ model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
+ model.eval()
+
+ print("Peft model loaded")
+
+ from datasets import load_dataset
+ from random import randrange
+
+
+ import evaluate
+ import numpy as np
+ import datasets
+ from tqdm import tqdm
+
+ # Metric
+ metric = evaluate.load("rouge")
+
+ def evaluate_peft_model(sample,max_target_length=50):
+     # generate the prediction
+     outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
+     prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
+     # decode eval sample
+     # Replace -100 in the labels as we can't decode them.
+     labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
+     labels = tokenizer.decode(labels, skip_special_tokens=True)
+
+     # Some simple post-processing
+     return prediction, labels
+
+ # load test dataset from disk
+ # test_dataset = load_from_disk("data/eval/").with_format("torch")
+ # NOTE evaluate_peft_model expects tokenized samples with "input_ids" and "labels";
+ # the raw-text example below still has to be tokenized first (see the sketch after this file)
+ list_input = [{"natural": "go to P03 and then go to P04, remain in P04 until P05","raw_ltl":"0"}]
+ test_dataset = datasets.Dataset.from_list(list_input)
+ # run predictions
+ # this can take ~45 minutes
+ predictions, references = [] , []
+ for sample in tqdm(test_dataset):
+     p,l = evaluate_peft_model(sample)
+     print(p,l)
+     predictions.append(p)
+     references.append(l)
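As noted in the comment above, the raw-text example has to be tokenized into `input_ids` and `labels` before `evaluate_peft_model` can consume it. A minimal sketch of that step, assuming the seq2seq setup where the source is the `natural` sentence and the target is the `raw_ltl` string; the max lengths are illustrative, not taken from the training script:

def tokenize_sample(example, max_source_length=128, max_target_length=50):
    enc = tokenizer(example["natural"], truncation=True, max_length=max_source_length)
    enc["labels"] = tokenizer(example["raw_ltl"], truncation=True, max_length=max_target_length)["input_ids"]
    return enc

test_dataset = datasets.Dataset.from_list(list_input).map(tokenize_sample).with_format("torch")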
finetune/test.py
ADDED
@@ -0,0 +1,8 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ tokenizer = AutoTokenizer.from_pretrained("bigscience/T0pp")
+ model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp")
+
+ inputs = tokenizer.encode("Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy", return_tensors="pt")
+ outputs = model.generate(inputs)
+ print(tokenizer.decode(outputs[0]))