tt-dart committed
Commit bacb17b · 1 Parent(s): 88c6488

add train and run scripts

finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fbd627e90c26f0b0139838c624a28627658e4c79d75446f7e4ac7883fed6226
3
+ size 1329320
finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged/dataset_info.json ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "builder_name": "json",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "json",
6
+ "dataset_size": 889411,
7
+ "description": "",
8
+ "download_checksums": {
9
+ "LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl": {
10
+ "num_bytes": 1129386,
11
+ "checksum": null
12
+ },
13
+ "LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl": {
14
+ "num_bytes": 125920,
15
+ "checksum": null
16
+ }
17
+ },
18
+ "download_size": 1255306,
19
+ "features": {
20
+ "id": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ },
24
+ "input_ids": {
25
+ "feature": {
26
+ "dtype": "int32",
27
+ "_type": "Value"
28
+ },
29
+ "_type": "Sequence"
30
+ },
31
+ "attention_mask": {
32
+ "feature": {
33
+ "dtype": "int8",
34
+ "_type": "Value"
35
+ },
36
+ "_type": "Sequence"
37
+ },
38
+ "labels": {
39
+ "feature": {
40
+ "dtype": "int64",
41
+ "_type": "Value"
42
+ },
43
+ "_type": "Sequence"
44
+ }
45
+ },
46
+ "homepage": "",
47
+ "license": "",
48
+ "size_in_bytes": 2144717,
49
+ "splits": {
50
+ "train": {
51
+ "name": "train",
52
+ "num_bytes": 800102,
53
+ "num_examples": 10621,
54
+ "dataset_name": "json"
55
+ },
56
+ "test": {
57
+ "name": "test",
58
+ "num_bytes": 89309,
59
+ "num_examples": 1181,
60
+ "dataset_name": "json"
61
+ }
62
+ },
63
+ "version": {
64
+ "version_str": "0.0.0",
65
+ "major": 0,
66
+ "minor": 0,
67
+ "patch": 0
68
+ }
69
+ }
finetune/Llama2_13b/data/eval/tf-ltl_eng_test_mid_ascii_gptAuged/state.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "c6bf809a7a8f99a6",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": "test"
13
+ }
finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56f425bd727940e8905c8c73e87d9089a94304a1627835291e5595f54d984e0f
3
+ size 11945016
finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged/dataset_info.json ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "builder_name": "json",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "json",
6
+ "dataset_size": 889411,
7
+ "description": "",
8
+ "download_checksums": {
9
+ "LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl": {
10
+ "num_bytes": 1129386,
11
+ "checksum": null
12
+ },
13
+ "LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl": {
14
+ "num_bytes": 125920,
15
+ "checksum": null
16
+ }
17
+ },
18
+ "download_size": 1255306,
19
+ "features": {
20
+ "id": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ },
24
+ "input_ids": {
25
+ "feature": {
26
+ "dtype": "int32",
27
+ "_type": "Value"
28
+ },
29
+ "_type": "Sequence"
30
+ },
31
+ "attention_mask": {
32
+ "feature": {
33
+ "dtype": "int8",
34
+ "_type": "Value"
35
+ },
36
+ "_type": "Sequence"
37
+ },
38
+ "labels": {
39
+ "feature": {
40
+ "dtype": "int64",
41
+ "_type": "Value"
42
+ },
43
+ "_type": "Sequence"
44
+ }
45
+ },
46
+ "homepage": "",
47
+ "license": "",
48
+ "size_in_bytes": 2144717,
49
+ "splits": {
50
+ "train": {
51
+ "name": "train",
52
+ "num_bytes": 800102,
53
+ "num_examples": 10621,
54
+ "dataset_name": "json"
55
+ },
56
+ "test": {
57
+ "name": "test",
58
+ "num_bytes": 89309,
59
+ "num_examples": 1181,
60
+ "dataset_name": "json"
61
+ }
62
+ },
63
+ "version": {
64
+ "version_str": "0.0.0",
65
+ "major": 0,
66
+ "minor": 0,
67
+ "patch": 0
68
+ }
69
+ }
finetune/Llama2_13b/data/train/tf-ltl_eng_test_mid_ascii_gptAuged/state.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "afb9c85014ff4b4e",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": "train"
13
+ }
finetune/Llama2_13b/llama_dp2_patch.py ADDED
@@ -0,0 +1,139 @@
1
+ from typing import List, Optional, Tuple
2
+
3
+ import torch
4
+ from torch import nn
5
+ import warnings
6
+ import transformers
7
+ from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
8
+ from peft.tuners.lora import LoraLayer
9
+
10
+ try:
11
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
12
+ from flash_attn.bert_padding import unpad_input, pad_input
13
+ except Exception:
14
+ raise ModuleNotFoundError(
15
+ "Please install FlashAttention first, e.g., with pip install flash-attn --no-build-isolation, Learn more at https://github.com/Dao-AILab/flash-attention#installation-and-features"
16
+ )
17
+
18
+ try:
19
+ from einops import rearrange
20
+ except Exception:
21
+ raise ModuleNotFoundError("Please install einops first, e.g., with pip install einops")
22
+
23
+
24
+ # ADAPTED from https://github.com/allenai/open-instruct/blob/main/open_instruct/llama_flash_attn_monkey_patch.py
25
+ # AND https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
26
+ # AND https://github.com/LAION-AI/Open-Assistant/blob/04fa9a24b2a58c8885b8aa6a2eb02b18de6b4961/model/model_training/models/patching_llama.py
27
+ # AND Sourabh https://github.com/huggingface/transformers/commit/ee81bf5aee0d65f005d157c013777e3d27d8d6bf
28
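+ # Patched LlamaAttention.forward: packs Q/K/V and calls FlashAttention's varlen kernel, treating attention_mask as a key-padding mask (see _prepare_decoder_attention_mask below)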
+ def forward(
29
+ self,
30
+ hidden_states: torch.Tensor,
31
+ attention_mask: Optional[torch.Tensor] = None,
32
+ position_ids: Optional[torch.Tensor] = None,
33
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
34
+ output_attentions: bool = False,
35
+ use_cache: bool = False,
36
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
37
+ """Input shape: Batch x Time x Channel
38
+
39
+ attention_mask: [bsz, q_len]
40
+ """
41
+ if output_attentions:
42
+ warnings.warn("Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.")
43
+
44
+ bsz, q_len, _ = hidden_states.size()
45
+
46
+ query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
47
+ key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
48
+ value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
49
+ # [bsz, q_len, nh, hd]
50
+ # [bsz, nh, q_len, hd]
51
+
52
+ kv_seq_len = key_states.shape[-2]
53
+ if past_key_value is not None:
54
+ kv_seq_len += past_key_value[0].shape[-2]
55
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
56
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
57
+
58
+ # Past Key value support
59
+ if past_key_value is not None:
60
+ # reuse k, v, self_attention
61
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
62
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
63
+
64
+ past_key_value = (key_states, value_states) if use_cache else None
65
+
66
+ # Flash attention codes from
67
+ # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
68
+
69
+ # transform the data into the format required by flash attention
70
+ qkv = torch.stack([query_states, key_states, value_states], dim=2) # [bsz, nh, 3, q_len, hd]
71
+ qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
72
+ # We have disabled _prepare_decoder_attention_mask in LlamaModel
73
+ # the attention_mask should be the same as the key_padding_mask
74
+ key_padding_mask = attention_mask
75
+
76
+ if key_padding_mask is None:
77
+ qkv = rearrange(qkv, "b s ... -> (b s) ...")
78
+ max_s = q_len
79
+ cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device)
80
+ output = flash_attn_varlen_qkvpacked_func(qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True)
81
+ output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
82
+ else:
83
+ nheads = qkv.shape[-2]
84
+ x = rearrange(qkv, "b s three h d -> b s (three h d)")
85
+ x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
86
+ x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads)
87
+ output_unpad = flash_attn_varlen_qkvpacked_func(
88
+ x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
89
+ )
90
+ output = rearrange(
91
+ pad_input(rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len),
92
+ "b s (h d) -> b s h d",
93
+ h=nheads,
94
+ )
95
+ return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
96
+
97
+
98
+ # Disable the transformation of the attention mask in LlamaModel as the flash attention
99
+ # requires the attention mask to be the same as the key_padding_mask
100
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
101
+ # [bsz, seq_len]
102
+ return attention_mask
103
+
104
+
105
+ def replace_attn_with_flash_attn():
106
+ cuda_major, cuda_minor = torch.cuda.get_device_capability()
107
+ if cuda_major < 8:
108
+ print(
109
+ "Flash attention is only supported on Ampere or Hopper GPU during training due to head dim > 64 backward."
110
+ "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
111
+ )
112
+ transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
113
+ _prepare_decoder_attention_mask
114
+ )
115
+ transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
116
+
117
+
118
+ def unplace_flash_attn_with_attn():
119
+ import importlib
120
+ import transformers
121
+
122
+ print("Reloading llama model, unpatching flash attention")
123
+ importlib.reload(transformers.models.llama.modeling_llama)
124
+
125
+
126
+ # Adapted from https://github.com/tmm1/axolotl/blob/2eda9e02a9d15a7a3f92b41f257d9844d72fc220/src/axolotl/utils/models.py#L338
127
+ def upcast_layer_for_flash_attention(model, torch_dtype):
128
+ # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to
129
+ # convert them back to fp16/bf16 for flash-attn compatibility.
130
+ for name, module in model.named_modules():
131
+ if isinstance(module, LoraLayer):
132
+ module.to(torch_dtype)
133
+ if "norm" in name:
134
+ module.to(torch_dtype)
135
+ if "lm_head" in name or "embed_tokens" in name:
136
+ if hasattr(module, "weight"):
137
+ module.to(torch_dtype)
138
+
139
+ return model
finetune/Llama2_13b/llama_lora_fintune.py ADDED
@@ -0,0 +1,264 @@
1
+ from huggingface_hub import login
2
+ login()
3
+ import sys,os
4
+ from datasets import load_dataset
5
+ import torch
6
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
7
+ from peft import LoraConfig
8
+ from trl import SFTTrainer
9
+ from accelerate import infer_auto_device_map,init_empty_weights
10
+
11
+ def preprocess_function(sample,padding="max_length"):
12
+ # add prefix to the input for t5
13
+ # print(sample[0])
14
+ inputs=[
15
+ f"""### Instruction:
16
+ translate natural description in to LTL, first translate into a logical way, and then translate into LTL,
17
+ using 'A' for 'And','O' for 'Or', 'I' for 'Imply','N' for 'Not','E' for 'Equally','F' for 'Finally','G' for 'Globally','U' for 'Until','X' for 'Next', pay specific attention to brackets '()'
18
+
19
+ ### Input:
20
+ {sample['natural'][i]}
21
+
22
+ ### Response:
23
+ {util.reAsciiLTL2EngLTL(sample['raw_ltl'][i])} {sample['raw_ltl'][i]}</s>"""
24
+ # NOTE it seems the eos is needed, the bos is not needed(the bos will be automatically added)
25
+ for i in (range(len(sample['natural'])))]
26
+ # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
27
+
28
+ sample["complete_text"] = inputs
29
+ return sample
30
+
31
+
32
+ def evaluate_model(input_text):
33
+ input_text =f"""### [Instruction]:
34
+ translate natural description in to LTL, first translate into a logical way, and then translate into LTL,
35
+ using 'A' for 'And','O' for 'Or', 'I' for 'Imply','N' for 'Not','E' for 'Equally','F' for 'Finally','G' for 'Globally','U' for 'Until','X' for 'Next', pay specific attention to brackets '()'
36
+
37
+ ### [Natural Language Task]:
38
+ {input_text}
39
+ ### [Temporal Logic Translation]:
40
+ """
41
+ # "### [instruction]: translate natural description in to LTL: \n\n ### [natural language]:" + input_text+'### [LTL]:'
42
+ inputs = tokenizer(input_text, return_tensors="pt").to(device)
43
+ outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)
44
+
45
+ return tokenizer.decode(outputs[0], skip_special_tokens=True)
46
+
47
+
48
+ # sys.path.append('../../../')
49
+ # sys.path.append('../../')
50
+ # sys.path.append('../')
51
+ # os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
52
+ # os.environ['CUDA_VISIBLE_DEVICES'] = "5,6,7"
53
+ # device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
54
+ sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
55
+ import utils.util as util
56
+ # Load dataset from the hub
57
+ # dataset = load_dataset("samsum")
58
+ output_dir = "finetuned_model/"
59
+ datapath='LTL_datasets/collect/'
60
+ exp_name="ascii"
61
+
62
+
63
+ base_model_name = "meta-llama/Llama-2-13b-hf"
64
+ bnb_config = BitsAndBytesConfig(
65
+ load_in_4bit=True,
66
+ bnb_4bit_quant_type="nf4",
67
+ bnb_4bit_compute_dtype=torch.float16,
68
+ )
69
+
70
+ import os
71
+ os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
72
+ os.environ['CUDA_VISIBLE_DEVICES']='4'
73
+ device_map="auto"
74
+ # torch.cuda.set_device(7)
75
+ # device_map={'':torch.cuda.current_device()}
76
+ # device_map = {'':'cuda:7'}
77
+ # model_dir is the model path or name
78
+ # config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
79
+ # with init_empty_weights():
80
+ # base_model = AutoModelForCausalLM.from_pretrained(
81
+ # base_model_name,
82
+ # from_tf=bool(".ckpt" in base_model_name),
83
+ # quantization_config=bnb_config,
84
+ # device_map=device_map,
85
+ # trust_remote_code=True,
86
+ # use_auth_token=True
87
+ # )
88
+
89
+ # map_list = {5:"15GB", 6:"15GB",7:"15GB"} # per-GPU memory limit, keyed by device index
90
+ # map_list = {7:"15GB",} # per-GPU memory limit, keyed by device index
91
+ # no_split_modules = base_model._no_split_modules
92
+ # device_map = infer_auto_device_map(base_model, max_memory=map_list, no_split_module_classes=no_split_modules)
93
+
94
+
95
+ tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
96
+ # tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
97
+ # NOTE it is unclear whether add_eos_token is needed; without an eos token, generation keeps going until max_new_tokens is reached
98
+ # with add_eos_token=True, training always failed
99
+ # and the fine-tuned model then generated unrelated text
100
+ tokenizer.pad_token = tokenizer.eos_token
101
+ tokenizer.padding_side = 'right'
102
+ print(tokenizer.eos_token_id)
103
+ # 2
104
+ print(tokenizer.bos_token_id)
105
+ # 1
106
+ # print(tokenizer._convert_token_to_id(tokenizer.bos_token))
107
+
108
+ dataset = load_dataset("json", data_files={"train":"LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl","test":"LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl"})
109
+ print(dataset)
110
+
111
+ from datasets import concatenate_datasets
112
+ import numpy as np
113
+ # The maximum total input sequence length after tokenization.
114
+ # Sequences longer than this will be truncated, sequences shorter will be padded.
115
+ # tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["natural"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
116
+ # input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
117
+ # # take 85 percentile of max length for better utilization
118
+ # max_source_length = int(np.percentile(input_lenghts, 100))
119
+ # print(f"Max source length: {max_source_length}")
120
+
121
+ # # The maximum total sequence length for target text after tokenization.
122
+ # # Sequences longer than this will be truncated, sequences shorter will be padded."
123
+ # tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["raw_ltl"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
124
+ # target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
125
+ # # take 90 percentile of max length for better utilization
126
+ # max_target_length = int(np.percentile(target_lenghts, 100))
127
+ # print(f"Max target length: {max_target_length}")
128
+
129
+
130
+ # # %%
131
+ # def translateAscii2Eng(input):
132
+
133
+ # def preprocess_function(sample,padding="max_length"):
134
+ # # add prefix to the input for t5
135
+ # # print(sample[0])
136
+ # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
137
+
138
+ # sample["complete_text"] = inputs
139
+ # return sample
140
+
141
+
142
+
143
+ tokenized_dataset = dataset.map(preprocess_function, batched=True)
144
+ print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
145
+
146
+ # save datasets to disk for later easy loading
147
+ # tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
148
+ # tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
149
+
150
+
151
+ base_model = AutoModelForCausalLM.from_pretrained(
152
+ base_model_name,
153
+ from_tf=bool(".ckpt" in base_model_name),
154
+ quantization_config=bnb_config,
155
+ device_map=device_map,
156
+ trust_remote_code=True,
157
+ use_auth_token=True
158
+ )
159
+ base_model.config.use_cache = False
160
+
161
+ # More info: https://github.com/huggingface/transformers/pull/24906
162
+ base_model.config.pretraining_tp = 1
163
+
164
+ peft_config = LoraConfig(
165
+ lora_alpha=16,
166
+ lora_dropout=0.1,
167
+ r=64,
168
+ bias="none",
169
+ task_type="CAUSAL_LM",
170
+ )
171
+
172
+
173
+
174
+
175
+ training_args = TrainingArguments(
176
+ output_dir=output_dir,
177
+ per_device_train_batch_size=8,
178
+ gradient_accumulation_steps=4,
179
+ learning_rate=2e-4,
180
+ logging_steps=10,
181
+ num_train_epochs=3,
182
+ # max_steps=500
183
+ )
184
+
185
+ max_seq_length = 512
186
+
187
+ trainer = SFTTrainer(
188
+ model=base_model,
189
+ train_dataset=tokenized_dataset['train'],
190
+ peft_config=peft_config,
191
+ dataset_text_field="complete_text",
192
+ max_seq_length=max_seq_length,
193
+ tokenizer=tokenizer,
194
+ args=training_args,
195
+ # device_map=device_map
196
+ )
197
+
198
+ import os
199
+ output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'aug1')
200
+
201
+ trainer.train()
202
+ trainer.model.save_pretrained(output_dir)
203
+ # trainer.model.save_pretrained(output_dir)
204
+ tokenizer.save_pretrained(output_dir)
205
+
206
+
207
+ # check
208
+
209
+ from peft import AutoPeftModelForCausalLM
210
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
211
+ model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map=device_map, torch_dtype=torch.bfloat16)
212
+
213
+
214
+
215
+
216
+ import evaluate
217
+ import numpy as np
218
+ from datasets import load_from_disk
219
+ from tqdm import tqdm
220
+
221
+ # Metric
222
+ metric = evaluate.load("rouge")
223
+
224
+
225
+ # load test dataset from distk
226
+ # test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
227
+
228
+
229
+ # run predictions
230
+ # this can take ~45 minutes
231
+ import re
232
+ pattern=re.compile("\[Formal LTL\]:\n([\S ]*)\n")
233
+ predictions, references,input_sentence,output_sentence=[], [] , [], []
234
+ for idx in range(len(tokenized_dataset['test']['natural'])):
235
+ # print(sample)
236
+ nl=tokenized_dataset['test']['natural'][idx]
237
+ p = evaluate_model(nl)
238
+ # print(p,l)
239
+ input_sentence.append(nl)
240
+
241
+ transLTL=pattern.findall(p)
242
+ # print(p)
243
+ predictions.append(transLTL[0])
244
+ output_sentence.append(p)
245
+ # input_sentence.append(p)  # removed: appending the output here made input_sentence twice as long as predictions/references
246
+ references.append(tokenized_dataset['test']['raw_ltl'][idx])
247
+ print(input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
248
+
249
+ # compute metric
250
+ rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
251
+
252
+ # print results
253
+ print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
254
+ print(f"rouge2: {rogue['rouge2']* 100:2f}%")
255
+ print(f"rougeL: {rogue['rougeL']* 100:2f}%")
256
+ print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
257
+ eval_output=np.array([input_sentence,predictions,references]).T
258
+ import pandas as pd
259
+ eval_output=pd.DataFrame(eval_output)
260
+ pd.DataFrame.to_csv(eval_output,output_dir+'/output')
261
+ # Rogue1: 50.386161%
262
+ # rouge2: 24.842412%
263
+ # rougeL: 41.370130%
264
+ # rougeLsum: 41.394230%
finetune/Llama2_13b/llama_lora_fintune_ver2.py ADDED
@@ -0,0 +1,281 @@
1
+ from huggingface_hub import login
2
+ login()
3
+ import json
4
+ import numpy as np
5
+ import sys,os
6
+ from datasets import load_dataset
7
+ import torch
8
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
9
+ from peft import LoraConfig
10
+ from trl import SFTTrainer
11
+ from accelerate import infer_auto_device_map,init_empty_weights
12
+ # sys.path.append('../../../')
13
+ # sys.path.append('../../')
14
+ # sys.path.append('../')
15
+ # os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
16
+ # os.environ['CUDA_VISIBLE_DEVICES'] = "5,6,7"
17
+ # device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
18
+ sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
19
+ # import utils.util as util
20
+ # Load dataset from the hub
21
+ # dataset = load_dataset("samsum")
22
+ np.random.seed(42)
23
+ output_dir = "model_weight/"
24
+ datapath='NL2TL-dataset/collect2'
25
+ exp_name="ascii"
26
+ explainer_files=['LTLexplain_0.json','LTLexplain_1.json','LTLexplain_2.json','LTLexplain_3.json']
27
+ explainer_dic={}
28
+ for path in explainer_files:
29
+ with open(os.path.join(datapath,path)) as f:
30
+ LTLlist=json.load(f)
31
+ for key in LTLlist.keys():
32
+ if isinstance(LTLlist[key],dict):
33
+ if not (key in explainer_dic):
34
+ explainer_dic[key]=[]
35
+ explainer_dic[key].append(LTLlist[key]['translate'])
36
+ sp=LTLlist[key]['explain'].split("means that")
37
+ if len(sp)>1:
38
+ explainer_dic[key].append(sp[1])
39
+
40
+
41
+
42
+ base_model_name = "meta-llama/Llama-2-13b-hf"
43
+ base_model_name = "meta-llama/Llama-2-7b-hf"
44
+ bnb_config = BitsAndBytesConfig(
45
+ load_in_4bit=True,
46
+ bnb_4bit_quant_type="nf4",
47
+ bnb_4bit_compute_dtype=torch.float16,
48
+ )
49
+
50
+ import os
51
+ os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
52
+ os.environ['CUDA_VISIBLE_DEVICES']='4'
53
+ device_map="auto"
54
+ # torch.cuda.set_device(7)
55
+ device_map={'':torch.cuda.current_device()}
57
+ # device_map = {'':'cuda:7'}
58
+ # model_dir is the model path or name
59
+ # config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
60
+ # with init_empty_weights():
61
+ # base_model = AutoModelForCausalLM.from_pretrained(
62
+ # base_model_name,
63
+ # from_tf=bool(".ckpt" in base_model_name),
64
+ # quantization_config=bnb_config,
65
+ # device_map=device_map,
66
+ # trust_remote_code=True,
67
+ # use_auth_token=True
68
+ # )
69
+
70
+ # map_list = {5:"15GB", 6:"15GB",7:"15GB"} # per-GPU memory limit, keyed by device index
71
+ # map_list = {7:"15GB",} # per-GPU memory limit, keyed by device index
72
+ # no_split_modules = base_model._no_split_modules
73
+ # device_map = infer_auto_device_map(base_model, max_memory=map_list, no_split_module_classes=no_split_modules)
74
+
75
+
76
+ tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
77
+ # tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
78
+ # NOTE it is unclear whether add_eos_token is needed; without an eos token, generation keeps going until max_new_tokens is reached
79
+ # with add_eos_token=True, training always failed
80
+ # and the fine-tuned model then generated unrelated text
81
+ tokenizer.pad_token = tokenizer.eos_token
82
+ tokenizer.padding_side = 'right'
83
+ print(tokenizer.eos_token_id)
84
+ # 2
85
+ print(tokenizer.bos_token_id)
86
+ # 1
87
+ # print(tokenizer._convert_token_to_id(tokenizer.bos_token))
88
+
89
+ dataset = load_dataset("json", data_files={"train":os.path.join(datapath,"ltl_eng_train_mid_ascii_gptAuged.jsonl"),"test":os.path.join(datapath,"ltl_eng_test_mid_ascii_gptAuged.jsonl")})
90
+ print(dataset)
91
+
92
+ from datasets import concatenate_datasets
93
+ import numpy as np
94
+ # The maximum total input sequence length after tokenization.
95
+ # Sequences longer than this will be truncated, sequences shorter will be padded.
96
+ # tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["natural"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
97
+ # input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
98
+ # # take 85 percentile of max length for better utilization
99
+ # max_source_length = int(np.percentile(input_lenghts, 100))
100
+ # print(f"Max source length: {max_source_length}")
101
+
102
+ # # The maximum total sequence length for target text after tokenization.
103
+ # # Sequences longer than this will be truncated, sequences shorter will be padded."
104
+ # tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["raw_ltl"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
105
+ # target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
106
+ # # take 90 percentile of max length for better utilization
107
+ # max_target_length = int(np.percentile(target_lenghts, 100))
108
+ # print(f"Max target length: {max_target_length}")
109
+
110
+
111
+ # # %%
112
+ # def translateAscii2Eng(input):
113
+
114
+ # def preprocess_function(sample,padding="max_length"):
115
+ # # add prefix to the input for t5
116
+ # # print(sample[0])
117
+ # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
118
+
119
+ # sample["complete_text"] = inputs
120
+ # return sample
121
+
122
+
123
+ def preprocess_function(sample,padding="max_length"):
124
+ # add prefix to the input for t5
125
+ # print(sample[0])
126
+ inputs=[
127
+ f"""### Instruction:
128
+ translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'
129
+
130
+ ### Natural Language Task:
131
+ {sample['natural'][i].strip()}
132
+
133
+ ### Logic Translation:
134
+ {explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))]}
135
+
136
+ ### linear temproal logic:
137
+ {sample['raw_ltl'][i].strip()}
138
+ </s>""".lower()
139
+ # NOTE it seems the eos is needed, the bos is not needed(the bos will be automatically added)
140
+ for i in (range(len(sample['natural'])))]
141
+ # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
142
+
143
+ sample["complete_text"] = inputs
144
+ return sample
145
+
146
+ tokenized_dataset = dataset.map(preprocess_function, batched=True)
147
+ print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
148
+
149
+ # save datasets to disk for later easy loading
150
+ # tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
151
+ # tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
152
+
153
+
154
+ base_model = AutoModelForCausalLM.from_pretrained(
155
+ base_model_name,
156
+ from_tf=bool(".ckpt" in base_model_name),
157
+ # quantization_config=bnb_config,
158
+ device_map=device_map,
159
+ trust_remote_code=True,
160
+ use_auth_token=True
161
+ )
162
+ base_model.config.use_cache = False
163
+
164
+ # More info: https://github.com/huggingface/transformers/pull/24906
165
+ base_model.config.pretraining_tp = 1
166
+
167
+ peft_config = LoraConfig(
168
+ lora_alpha=16,
169
+ lora_dropout=0.1,
170
+ r=64,
171
+ bias="none",
172
+ task_type="CAUSAL_LM",
173
+ )
174
+
175
+
176
+
177
+
178
+ training_args = TrainingArguments(
179
+ output_dir=output_dir,
180
+ per_device_train_batch_size=8,
181
+ gradient_accumulation_steps=4,
182
+ learning_rate=2e-4,
183
+ logging_steps=10,
184
+ num_train_epochs=3,
185
+ # max_steps=500
186
+ )
187
+
188
+ max_seq_length = 512
189
+
190
+ trainer = SFTTrainer(
191
+ model=base_model,
192
+ train_dataset=tokenized_dataset['train'],
193
+ peft_config=peft_config,
194
+ dataset_text_field="complete_text",
195
+ max_seq_length=max_seq_length,
196
+ tokenizer=tokenizer,
197
+ args=training_args,
198
+ # device_map=device_map
199
+ )
200
+
201
+ import os
202
+ output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'aug1')
203
+
204
+ trainer.train()
205
+ trainer.model.save_pretrained(output_dir)
206
+ # trainer.model.save_pretrained(output_dir)
207
+ tokenizer.save_pretrained(output_dir)
208
+
209
+
210
+ # check
211
+
212
+ from peft import AutoPeftModelForCausalLM
213
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
214
+ model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map=device_map, torch_dtype=torch.bfloat16)
215
+
216
+
217
+ def evaluate_model(input_text):
218
+ input_text =f"""### ### Instruction:
219
+ translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'
220
+
221
+ ### Natural Language Task:
222
+ {input_text}
223
+
224
+ """
225
+ # "### [instruction]: translate natural description in to LTL: \n\n ### [natural language]:" + input_text+'### [LTL]:'
226
+ inputs = tokenizer(input_text, return_tensors="pt").to(device)
227
+ outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)
228
+
229
+ return tokenizer.decode(outputs[0], skip_special_tokens=True)
230
+
231
+ # if __name__=='__main__':
232
+ import evaluate
233
+ import numpy as np
234
+ from datasets import load_from_disk
235
+ from tqdm import tqdm
236
+
237
+ # Metric
238
+ metric = evaluate.load("rouge")
239
+
240
+
241
+ # load test dataset from distk
242
+ # test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
243
+
244
+
245
+ # run predictions
246
+ # this can take ~45 minutes
247
+ import re
248
+ pattern=re.compile("### linear temproal logic:\n([\S ]*)\n")  # single colon, matching the prompt format above
249
+ predictions, references,input_sentence,output_sentence=[], [] , [], []
250
+ for idx in range(len(tokenized_dataset['test']['natural'])):
251
+ # print(sample)
252
+ nl=tokenized_dataset['test']['natural'][idx]
253
+ p = evaluate_model(nl)
254
+ # print(p,l)
255
+ input_sentence.append(nl)
256
+
257
+ transLTL=pattern.findall(p)
258
+ # print(p)
259
+ predictions.append(transLTL[0].strip())
260
+ output_sentence.append(p)
261
+ # input_sentence.append(p)  # removed: appending the output here made input_sentence twice as long as predictions/references
262
+ references.append(tokenized_dataset['test']['raw_ltl'][idx].strip())
264
+ print(input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
265
+
266
+ # compute metric
267
+ rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
268
+
269
+ # print results
270
+ print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
271
+ print(f"rouge2: {rogue['rouge2']* 100:2f}%")
272
+ print(f"rougeL: {rogue['rougeL']* 100:2f}%")
273
+ print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
274
+ eval_output=np.array([input_sentence,predictions,references]).T
275
+ import pandas as pd
276
+ eval_output=pd.DataFrame(eval_output)
277
+ pd.DataFrame.to_csv(eval_output,output_dir+'/output')
278
+ # Rogue1: 50.386161%
279
+ # rouge2: 24.842412%
280
+ # rougeL: 41.370130%
281
+ # rougeLsum: 41.394230%
finetune/Llama2_13b/llama_lora_fintune_ver3_qlora.py ADDED
@@ -0,0 +1,336 @@
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer
2
+ # device = "cuda" # the device to load the model onto
3
+ # from huggingface_hub import login
4
+ # login()
5
+ import json
6
+ import numpy as np
7
+ import sys,os
8
+ from datasets import load_dataset
9
+ import torch
10
+ from transformers import (AutoModelForCausalLM,
11
+ AutoTokenizer,
12
+ BitsAndBytesConfig,
13
+ TrainingArguments,
14
+ pipeline,
15
+ logging,
16
+ TrainerCallback)
17
+ from peft import LoraConfig, PeftConfig, prepare_model_for_kbit_training, get_peft_model
18
+ from trl import SFTTrainer
19
+ from accelerate import infer_auto_device_map,init_empty_weights
20
+ import wandb
21
+ from datasets import concatenate_datasets
22
+ import numpy as np
23
+ # sys.path.append('../../../')
24
+ # sys.path.append('../../')
25
+ # sys.path.append('../')
26
+ # os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
27
+ # os.environ['CUDA_VISIBLE_DEVICES'] = "5,6,7"
28
+ # device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
29
+ sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
30
+ # import utils.util as util
31
+ # Load dataset from the hub
32
+ # dataset = load_dataset("samsum")
33
+ np.random.seed(42)
34
+ output_dir = "model_weight/"
35
+ datapath='NL2TL-dataset/collect2'
36
+ exp_name="ascii"
37
+ explainer_files=['LTLexplain_0.json','LTLexplain_1.json','LTLexplain_2.json','LTLexplain_3.json']
38
+ explainer_dic={}
39
+ for path in explainer_files:
40
+ with open(os.path.join(datapath,path)) as f:
41
+ LTLlist=json.load(f)
42
+ for key in LTLlist.keys():
43
+ if isinstance(LTLlist[key],dict):
44
+ if not (key in explainer_dic):
45
+ explainer_dic[key]=[]
46
+ explainer_dic[key].append(LTLlist[key]['translate'])
47
+ sp=LTLlist[key]['explain'].split("means that")
48
+ if len(sp)>1:
49
+ explainer_dic[key].append(sp[1])
50
+
51
+ base_model_name = "meta-llama/Llama-2-13b-hf"
52
+ bnb_config = BitsAndBytesConfig(
53
+ load_in_4bit = True,
54
+ bnb_4bit_use_double_quant = False,
55
+ bnb_4bit_quant_type = 'nf4',
56
+ bnb_4bit_compute_dtype = getattr(torch, "float16")
57
+ )
58
+
59
+ import os
60
+ os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
61
+ os.environ['CUDA_VISIBLE_DEVICES']='0'
62
+ device_map="auto"
63
+ # torch.cuda.set_device(7)
64
+ # device_map={'':torch.cuda.current_device()}
65
+ # device_map = {'':'cuda:7'}
66
+ # model_dir is the model path or name
67
+ # config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
68
+ # with init_empty_weights():
69
+ # base_model = AutoModelForCausalLM.from_pretrained(
70
+ # base_model_name,
71
+ # from_tf=bool(".ckpt" in base_model_name),
72
+ # quantization_config=bnb_config,
73
+ # device_map=device_map,
74
+ # trust_remote_code=True,
75
+ # use_auth_token=True
76
+ # )
77
+
78
+ # map_list = {5:"15GB", 6:"15GB",7:"15GB"} # per-GPU memory limit, keyed by device index
79
+ # map_list = {7:"15GB",} # per-GPU memory limit, keyed by device index
80
+ # no_split_modules = base_model._no_split_modules
81
+ # device_map = infer_auto_device_map(base_model, max_memory=map_list, no_split_module_classes=no_split_modules)
82
+
83
+
84
+ dataset = load_dataset("json", data_files={"train":os.path.join(datapath,"ltl_eng_train_mid_ascii_gptAuged.jsonl"),"test":os.path.join(datapath,"ltl_eng_test_mid_ascii_gptAuged.jsonl")})
85
+ print(dataset)
86
+
87
+
88
+
89
+ tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
90
+ # tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
91
+ # NOTE it is unclear whether add_eos_token is needed; without an eos token, generation keeps going until max_new_tokens is reached
92
+ # with add_eos_token=True, training always failed
93
+ # and the fine-tuned model then generated unrelated text
94
+ tokenizer.pad_token = tokenizer.eos_token
95
+ tokenizer.padding_side = 'right'
96
+ print(tokenizer.eos_token_id)
97
+ # 2
98
+ print(tokenizer.bos_token_id)
99
+ # 1
100
+ # print(tokenizer._convert_token_to_id(tokenizer.bos_token))
101
+
102
+ def preprocess_function(sample,padding="max_length"):
103
+ # add prefix to the input for t5
104
+ # print(sample[0])
105
+ inputs=[
106
+ f"""### Instruction:
107
+ translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'
108
+
109
+ ### Natural Language Task:
110
+ {sample['natural'][i].strip()}
111
+
112
+ ### Logic Translation:
113
+ {explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))]}
114
+
115
+ ### linear temproal logic:
116
+ {sample['raw_ltl'][i].strip()}
117
+ </s>""".lower()
118
+ # NOTE it seems the eos is needed, the bos is not needed(the bos will be automatically added)
119
+ for i in (range(len(sample['natural'])))]
120
+ # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
121
+
122
+ sample["complete_text"] = inputs
123
+ return sample
124
+ # method1
125
+ # tokenized_dataset = dataset.map(preprocess_function, batched=True)
126
+ # method2
127
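+ # method2: build the training text via the tokenizer's chat template (system/user/assistant turns) instead of the hand-written '###' prompt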
+ def preprocess_function2(sample,padding="max_length"):
128
+ # add prefix to the input for t5
129
+ # print(sample[0])
130
+ inputs=[
131
+ tokenizer.apply_chat_template(
132
+ [
133
+ {"role": "system", "content": "translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'"},
134
+ {"role": "user", "content": "Natural Language Task: {}".format(sample['natural'][i].strip())},
135
+ {"role": "assistant", "content": "Logic Translation is {}, linear temproal logic is {}".format(
136
+ explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))],
137
+ sample['raw_ltl'][i].strip()
138
+ )
139
+ }
140
+ ],tokenize=False, add_generation_prompt=False)
141
+ # NOTE it seems the eos is needed, the bos is not needed(the bos will be automatically added)
142
+ for i in (range(len(sample['natural'])))]
143
+ # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
144
+
145
+ sample["complete_text"] = inputs
146
+ return sample
147
+ tokenized_dataset = dataset.map(preprocess_function2, batched=True)
148
+ print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
149
+
150
+ # save datasets to disk for later easy loading
151
+ # tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
152
+ # tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
153
+
154
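+ # At each checkpoint, keep only the LoRA adapter weights and delete the full pytorch_model.bin to save disk space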
+ class PeftSavingCallback(TrainerCallback):
155
+ def on_save(self, args, state, control, **kwargs):
156
+ checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
157
+ kwargs["model"].save_pretrained(checkpoint_path)
158
+
159
+ if "pytorch_model.bin" in os.listdir(checkpoint_path):
160
+ os.remove(os.path.join(checkpoint_path, "pytorch_model.bin"))
161
+ callbacks = [PeftSavingCallback]
162
+
163
+ peft_config = LoraConfig(
164
+ lora_alpha=16,
165
+ lora_dropout=0.05,
166
+ r=128,
167
+ bias="none",
168
+ task_type="CAUSAL_LM",
169
+ target_modules=["q_proj", "v_proj"]
170
+ )
171
+
172
+
173
+ training_arguments = TrainingArguments(
174
+ output_dir=output_dir,
175
+ logging_dir = os.path.join(output_dir,"logs"),
176
+ per_device_train_batch_size=32,
177
+ num_train_epochs=3,
178
+ gradient_accumulation_steps=1,
179
+ optim="paged_adamw_32bit",
180
+ save_strategy='epoch',
181
+ logging_steps=25,
182
+ learning_rate=2e-4,
183
+ weight_decay=0.001,
184
+ fp16=True,
185
+ bf16=False,
186
+ max_grad_norm=0.3,
187
+ max_steps=-1,
188
+ warmup_ratio = 0.05,
189
+ group_by_length=True,
190
+ lr_scheduler_type="cosine",
191
+ report_to="wandb",
192
+ evaluation_strategy="epoch",
193
+ do_eval=True,
194
+ run_name = base_model_name+exp_name,
195
+ disable_tqdm=False
196
+ )
197
+ import os
198
+ output_dir = os.path.join(output_dir, "llama2_13b"+exp_name+'aug1')
199
+
200
+ base_model = AutoModelForCausalLM.from_pretrained(
201
+ base_model_name,
202
+ from_tf=bool(".ckpt" in base_model_name),
203
+ quantization_config=bnb_config,
204
+ device_map=device_map,
205
+ trust_remote_code=True,
206
+ use_auth_token=True
207
+ )
208
+ base_model.config.use_cache = False
209
+
210
+ # More info: https://github.com/huggingface/transformers/pull/24906
211
+ base_model.config.pretraining_tp = 1
212
+
213
+ base_model.gradient_checkpointing_enable()
214
+ base_model = prepare_model_for_kbit_training(base_model)
215
+ base_model = get_peft_model(base_model, peft_config)
216
+
217
+ trainer = SFTTrainer(
218
+ model=base_model,
219
+ train_dataset=tokenized_dataset['train'],
220
+ eval_dataset=tokenized_dataset['test'],
221
+ peft_config=peft_config,
222
+ dataset_text_field="complete_text",
223
+ max_seq_length=512,
224
+ tokenizer=tokenizer,
225
+ args=training_arguments,
226
+ callbacks=callbacks,
227
+ packing=False,
228
+ )
229
+ # wandb.login()
230
+ # trainer.train()
231
+ # trainer.model.save_pretrained(output_dir)
232
+ # # trainer.model.save_pretrained(output_dir)
233
+ # tokenizer.save_pretrained(output_dir)
234
+
235
+ # wandb.finish()
236
+
237
+ # check
238
+ print('model dir',output_dir)
239
+ from peft import AutoPeftModelForCausalLM
240
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
241
+ model = AutoPeftModelForCausalLM.from_pretrained(output_dir,
242
+ from_tf=bool(".ckpt" in base_model_name),
243
+ quantization_config=bnb_config,
244
+ device_map=device_map,
245
+ trust_remote_code=True,
246
+ use_auth_token=True
247
+ )
248
+ tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)
249
+ print(tokenizer.default_chat_template)
250
+ def evaluate_model(input_text):
251
+ input_text =f"""### Instruction:
252
+ translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()' ### Natural Language Task:
253
+ {input_text}""".lower()
254
+ inputs = tokenizer(input_text, return_tensors="pt").to(device)
255
+ print(inputs)
256
+ outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
257
+
258
+ return tokenizer.decode(outputs[0], skip_special_tokens=True)
259
+
260
+ def evaluate_model2(input_text):
261
+ messages=[
262
+ {"role": "system", "content": "translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'"},
263
+ {"role": "user", "content": "Natural Language Task: {}".format(input_text)},
264
+ ]
265
+
266
+ encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
267
+ outputs = model.generate(encodeds, max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
268
+ # input_text =f"""### Instruction:
269
+ # translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()' ### Natural Language Task:
270
+ # {input_text}""".lower()
271
+ # inputs = tokenizer(input_text, return_tensors="pt").to(device)
272
+ # print(inputs)
273
+ # outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
274
+
275
+ return tokenizer.decode(outputs[0], skip_special_tokens=True)
276
+ # if __name__=='__main__':
277
+ import evaluate
278
+ import numpy as np
279
+ from datasets import load_from_disk
280
+ from tqdm import tqdm
281
+
282
+ # Metric
283
+ metric = evaluate.load("rouge")
284
+
285
+
286
+ # load test dataset from distk
287
+ # test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
288
+
289
+ # run predictions
290
+ # this can take ~45 minutes
291
+ import re
292
+ pattern=re.compile("linear temproal logic:\n([\S ]*)\n")
293
+ predictions, references,input_sentence,output_sentence=[], [] , [], []
294
+ for idx in range(len(tokenized_dataset['test']['natural'])):
295
+ # print(sample)
296
+ nl=tokenized_dataset['test']['natural'][idx]
297
+ p = evaluate_model2(nl)
298
+ # print(p,l)
299
+ input_sentence.append(nl)
300
+
301
+ transLTL=pattern.findall(p)
302
+ print(p)
303
+ predictions.append(transLTL[0].strip())
304
+ output_sentence.append(p)
305
+ # input_sentence.append(p)  # removed: appending the output here made input_sentence twice as long as predictions/references
306
+ references.append(tokenized_dataset['test']['raw_ltl'][idx].strip())
307
+ print(input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
308
+
309
+ # compute metric
310
+ rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
311
+
312
+ # print results
313
+ print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
314
+ print(f"rouge2: {rogue['rouge2']* 100:2f}%")
315
+ print(f"rougeL: {rogue['rougeL']* 100:2f}%")
316
+ print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
317
+ eval_output=np.array([input_sentence,predictions,references]).T
318
+ import pandas as pd
319
+ eval_output=pd.DataFrame(eval_output)
320
+ pd.DataFrame.to_csv(eval_output,output_dir+'/output')
321
+
322
+ exit()
323
+ messages = [
324
+ {"role": "user", "content": "What is your favourite condiment?"},
325
+ {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
326
+ {"role": "user", "content": "Do you have mayonnaise recipes?"}
327
+ ]
328
+
329
+ encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
330
+
331
+ model_inputs = encodeds.to(device)
332
+ model.to(device)
333
+
334
+ generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
335
+ decoded = tokenizer.batch_decode(generated_ids)
336
+ print(decoded[0])
finetune/Llama2_13b/llama_lora_test.py ADDED
@@ -0,0 +1,185 @@
1
+ # from huggingface_hub import login
2
+ # login()
3
+ import sys,os
4
+ from datasets import load_dataset
5
+ import torch
6
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
7
+ # from peft import LoraConfig
8
+ # from trl import SFTTrainer
9
+ # from accelerate import infer_auto_device_map,init_empty_weights
10
+
11
+ # sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
12
+ from NL2HLTLtaskPlanner.utils import Task2Preplacer
13
+ from NL2HLTLtaskPlanner.utils import LTLChecker
14
+ import re
15
+ from datasets import concatenate_datasets
16
+ import numpy as np
17
+ from peft import AutoPeftModelForCausalLM
18
+ os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
19
+ # os.environ['CUDA_VISIBLE_DEVICES']='3'
20
+
21
+
22
+
23
+ class Llama_NL2TL_translator():
24
+ def __init__(self,
25
+ output_dir = "/home/icl-mill19/xsj/model_weight",
26
+ tuned_model_name="llama2_13b__mid_asciiaug1",
27
+ # CUDA_device='0',
28
+ quat=True) -> None:
29
+ # os.environ['CUDA_VISIBLE_DEVICES']=CUDA_device
30
+ self.device_map="auto"
31
+ self.base_model_name = "meta-llama/Llama-2-13b-hf"
32
+ self.output_dir = os.path.join(output_dir, tuned_model_name)
33
+ # check
34
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
35
+ # AutoPeftModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf")
36
+
37
+
38
+ # quantconfig = BitsAndBytesConfig(
39
+ # load_in_8bit=True,
40
+ # bnb_8bit_quant_type="nf4",
41
+ # bnb_8bit_use_double_quant=True,
42
+ # bnb_8bit_compute_dtype=torch.bfloat16,
43
+ # )
44
+ if quat==False:
45
+ self.model = AutoPeftModelForCausalLM.from_pretrained(self.output_dir, device_map=self.device_map, torch_dtype=torch.bfloat16)
46
+ # the ICL 'superman' machine can run this without quantization
47
+ else:
48
+ self.model = AutoPeftModelForCausalLM.from_pretrained(self.output_dir,device_map=self.device_map, torch_dtype=torch.float16,
49
+ load_in_8bit=True)
50
+ # quantization_config=quantconfig)
51
+
52
+ self.tokenizer = AutoTokenizer.from_pretrained(self.output_dir, trust_remote_code=True)
53
+ # tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
54
+ # NOTE it is unclear whether add_eos_token is needed; without an eos token, generation keeps going until max_new_tokens is reached;
55
+ # at inference time do not set add_eos_token=True: the tokenizer would append </s> to the input and the output becomes irregular
56
+ # with add_eos_token=True, training always failed
57
+ self.tokenizer.pad_token = self.tokenizer.eos_token
58
+ self.tokenizer.padding_side = 'right'
59
+ print(self.tokenizer.eos_token_id)
60
+ # 2
61
+ print(self.tokenizer.bos_token_id)
62
+ # 1
63
+ # print(tokenizer._convert_token_to_id(tokenizer.bos_token))
64
+
65
+ print("NL2TL model loaded")
66
+ self.pattern=re.compile("\[Formal LTL\]:\n([\S ]*)\n")
67
+ self.replace=Task2Preplacer()
68
+ self.ltlChecker=LTLChecker()
69
+ pass
70
+
71
+ print('NL2TL llama translate test:')
72
+ self.translate("Task_1.1 must be done, and Task_1.2 should be finished before Task_1.1")
73
+ def evaluate_model(self,input_text):
74
+ input_text =f"""### [Instruction]:
75
+ translate natural description in to LTL, first translate into a logical way, and then translate into LTL,
76
+ using 'A' for 'And','O' for 'Or', 'I' for 'Imply','N' for 'Not','E' for 'Equally','F' for 'Finally','G' for 'Globally','U' for 'Until','X' for 'Next', pay specific attention to brackets '()'
77
+
78
+ ### [Natural Language Task]:
79
+ {input_text}
80
+ ### [Temporal Logic Translation]:
81
+ """
82
+ # "### [instruction]: translate natural description in to LTL: \n\n ### [natural language]:" + input_text+'### [LTL]:'
83
+ inputs = self.tokenizer(input_text, return_tensors="pt").to(self.device)
84
+ outputs = self.model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=128, pad_token_id=self.tokenizer.eos_token_id)
85
+
86
+ return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
87
+
88
+ def translate(self,input_prompt:str=""):
89
+ print(input_prompt)
90
+ input_prompt=self.replace.reTask2P(input_prompt)
91
+ # print(predicter( replace.reTask2P(input_prompt)))
92
+ # print(input_prompt)
93
+
94
+ p=self.evaluate_model(input_prompt)
95
+ # print(p)
96
+ transLTL=self.pattern.findall(p)[0]
97
+ print(transLTL)
98
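+ # re-query the model until the generated LTL passes the atomic-proposition correspondence and bracket checks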
+ while(not (self.ltlChecker.AP_CorrCheck(input_prompt,transLTL) and self.ltlChecker.brackets_Check(transLTL))):
99
+ p=self.evaluate_model(input_prompt)
100
+ # print(p)
101
+ transLTL=self.pattern.findall(p)[0]
102
+ print(transLTL)
103
+
104
+ return self.replace.reP2Task(transLTL)
105
+
106
+
107
+ if __name__=="__main__":
108
+ translater=Llama_NL2TL_translator()
109
+ # test_prompts=[
110
+ # "Task_1.1.1 must precede Task_1.1.2, which in turn should precede Task_1.1.3, ",
111
+ # "Task_1.1 must be completed before Task_1.2 starts, and Task_1.2 must be completed before Task_1.3 starts." ,
112
+ # "Task_1.1 can be executed independently, after which Task_1.2 can be executed.",
113
+ # "Task_1.2.4 must be completed first, followed by Task_1.2.2, then Task_1.2.3, and finally Task_1.2.1.",
114
+ # "Task_1.2.4 is always executed first, followed by Task_1.2.3, then Task_1.2.2, and finally Task_1.2.1.",
115
+ # "Task_1.2.1 and Task_1.2.2 can be executed independently, and both should eventually be completed.",
116
+ # ]
117
+ # for ret in test_prompts:
118
+ # print(translater.translate(ret))
119
+ # print('\n','-'*20,'\n')
120
+
121
+
122
+ import evaluate
123
+ import numpy as np
124
+ # from datasets import load_from_disk
125
+ from tqdm import tqdm
126
+
127
+ # Metric
128
+ metric = evaluate.load("rouge")
129
+ tokenized_dataset = load_dataset("json", data_files={"train":"LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl","test":"LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl"})
130
+ print(tokenized_dataset)
131
+ # run predictions
132
+ # this can take ~45 minutes
133
+ import re
134
+ pattern=re.compile("\[Formal LTL\]:\n([\S ]*)\n")
135
+ predictions, references,input_sentence,output_sentence=[], [] , [], []
136
+ # with open()
137
+ for idx in range(len(tokenized_dataset['test']['natural'])):
138
+ # print(sample)
139
+ nl=tokenized_dataset['test']['natural'][idx]
140
+ p = translater.evaluate_model(nl)
141
+ # print(p,l)
142
+ input_sentence.append(nl)
143
+
144
+ transLTL=pattern.findall(p)
145
+ # print(p)
146
+ predictions.append(transLTL[0])
147
+ output_sentence.append(p)
148
+ # input_sentence.append(nl)
149
+ references.append(tokenized_dataset['test']['raw_ltl'][idx])
150
+ print(idx,'\n',input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
151
+
152
+ # compute metric
153
+ rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
154
+
155
+ # print results
156
+ print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
157
+ print(f"rouge2: {rogue['rouge2']* 100:2f}%")
158
+ print(f"rougeL: {rogue['rougeL']* 100:2f}%")
159
+ print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
160
+ eval_output=np.array([input_sentence,predictions,references]).T
161
+ import pandas as pd
162
+ eval_output=pd.DataFrame(eval_output)
163
+ pd.DataFrame.to_csv(eval_output,"finetuned_model/llama2_13b__mid_asciiaug1"+'/output')
164
+ # Rouge1: 98.363321%
165
+ # rouge2: 95.987820%
166
+ # rougeL: 97.384820%
167
+ # rougeLsum: 97.382071%
168
+ exit()
169
+ flag=True
170
+ while flag:
171
+ lines=[""]
172
+ try:
173
+ lines.append(input())
174
+ while True:
175
+ lines.append(input())
176
+ except:
177
+ pass
178
+ ret ="".join(lines)
179
+ print(ret)
180
+ if ret=="":
181
+ flag=False
182
+
183
+ print(translater.translate(ret))
184
+
185
+
finetune/Llama2_13b/llama_test.py ADDED
@@ -0,0 +1,138 @@
1
+ # from huggingface_hub import login
2
+ # login()
3
+ from datasets import load_dataset
4
+ from random import randrange
5
+ import torch
6
+ import sys,os
7
+ # sys.path.append('../../../')
8
+ # sys.path.append('../../')
9
+ # sys.path.append('../')
10
+ os.environ['CUDA_VISIBLE_DEVICES'] = "2"
11
+ # device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
12
+ from ... import utils as util
13
+ from accelerate import init_empty_weights,infer_auto_device_map,load_checkpoint_in_model,dispatch_model
14
+ # Load dataset from the hub
15
+ # dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
16
+ output_dir = "finetuned_model/"
17
+ datapath='LTL_datasets/collect/'
18
+ exp_name="_mid_ascii"
19
+ base_model_name = "meta-llama/Llama-2-13b-hf"
20
+ dataset = load_dataset("json", data_files={"train":datapath+"ltl_eng_train"+exp_name+".jsonl","test":datapath+"ltl_eng_test"+exp_name+".jsonl"})
21
+
22
+ print(dataset)
23
+ # dataset size: 15011
24
+
25
+ def format_instruction(sample):
26
+ return f"""### [Instruction]:
27
+ translate natural description in to LTL
28
+
29
+ ### [Input]:
30
+ {sample['natural']}
31
+
32
+ ### [English Response]:
33
+ {util.reAsciiLTL2EngLTL(sample['raw_ltl'])}
34
+ ### [Formal Response]:
35
+ {sample['raw_ltl']}
36
+ """
37
+
38
+ def preprocess_function(sample,padding="max_length"):
39
+ # build the complete training prompt for each sample
40
+ # print(sample[0])
41
+ inputs=[
42
+ f"""### [Instruction]:
43
+ translate natural description in to LTL
44
+
45
+ ### [Input]:
46
+ {sample['natural'][i]}
47
+
48
+ ### [English Response]:
49
+ {util.reAsciiLTL2EngLTL(sample['raw_ltl'][i])}
50
+ ### [Formal Response]:
51
+ {sample['raw_ltl'][i]}
52
+ """
53
+ for i in (range(len(sample['natural'])))]
54
+ # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
55
+
56
+ sample["complete_text"] = inputs
57
+ return sample
58
+
59
+ tokenized_dataset = dataset.map(preprocess_function, batched=True)
60
+ print(tokenized_dataset)
61
+ import torch
62
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
63
+
64
+ use_flash_attention = False
65
+
66
+ # Hugging Face model id
67
+ model_id = base_model_name
68
+ # model_id = "meta-llama/Llama-2-7b-hf" # gated
69
+
70
+
71
+ output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'4')
72
+
73
+
74
+ if use_flash_attention:
75
+ # unpatch flash attention
76
+ from llama_dp2_patch import unplace_flash_attn_with_attn
77
+ unplace_flash_attn_with_attn()
78
+
79
+ import torch
80
+ from peft import AutoPeftModelForCausalLM
81
+ from transformers import AutoTokenizer
82
+
83
+ # load base LLM model and tokenizer
84
+ model = AutoPeftModelForCausalLM.from_pretrained(
85
+ output_dir,
86
+ low_cpu_mem_usage=True,
87
+ torch_dtype=torch.float16,
88
+ load_in_4bit=True,
89
+ )
90
+ tokenizer = AutoTokenizer.from_pretrained(output_dir)
91
+
92
+
93
+ from datasets import load_dataset
94
+ from random import randrange
95
+
96
+
97
+ # Load dataset from the hub and get a sample
98
+ # dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
99
+ # sample = dataset[randrange(len(dataset))]
100
+
101
+ # prompt = f"""### Instruction:
102
+ # Use the Input below to create an instruction, which could have been used to generate the input using an LLM.
103
+
104
+ # ### Input:
105
+ # {sample['response']}
106
+
107
+ # ### Response:
108
+ # """
109
+
110
+ # input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
111
+ # # with torch.inference_mode():
112
+ # outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.9)
113
+
114
+ # print(f"Prompt:\n{sample['response']}\n")
115
+ # print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
116
+ # print(f"Ground truth:\n{sample['instruction']}")
117
+
118
+
119
+
120
+
121
+
122
+ from peft import AutoPeftModelForCausalLM
123
+
124
+ model = AutoPeftModelForCausalLM.from_pretrained(
125
+ output_dir,
126
+ low_cpu_mem_usage=True,
127
+ )
128
+
129
+ # Merge LoRA and base model
130
+ merged_model = model.merge_and_unload()
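+ # merge_and_unload folds the LoRA deltas into the base weights so the result can be
+ # loaded without peft; the adapter is re-loaded above in full precision, presumably
+ # because merging into 4-bit quantized weights is problematic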
131
+
132
+ # Save the merged model
133
+ merged_model.save_pretrained("merged_model",safe_serialization=True)
134
+ tokenizer.save_pretrained("merged_model")
135
+
136
+ # push merged model to the hub
137
+ # merged_model.push_to_hub("user/repo")
138
+ # tokenizer.push_to_hub("user/repo")
finetune/MIT_NL2TL/NL2TL.py ADDED
@@ -0,0 +1,101 @@
1
+ # %%
2
+ from transformers import (AutoModelForSeq2SeqLM,
3
+ AutoTokenizer,
4
+ T5Tokenizer)
5
+ import torch
6
+ import pandas as pd
7
+ from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
8
+ from tqdm import tqdm
9
+
10
+ #import subprocess
11
+ import sys
12
+ import os
13
+ import argparse
14
+ # from IPython.core import error
15
+ import random
16
+ import numpy as np
17
+ import nltk
18
+ import json
19
+ import csv
20
+ import utils.util as util
21
+
22
+ # run under conda env minigpt4
23
+
24
+ class NL2TL():
25
+ def __init__(self,dirpath='outputdir/') -> None:
26
+ self.output_dir = dirpath
27
+
28
+ # Here you need to link this path in your Google drive to the place preserving your model weights, e.g., checkpoint-62500
29
+ # You can download it on the github page
30
+
31
+ self.model_checkpoint = "t5-base"
32
+ self.prefix = "Transform the following sentence into Signal Temporal logic: "
33
+
34
+ self.max_input_length = 1024
35
+ self.max_target_length = 128
36
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint, model_max_length=self.max_input_length)
37
+
38
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
39
+ self.tl_model = AutoModelForSeq2SeqLM.from_pretrained(self.output_dir+"checkpoint-62500").to(self.device)
40
+
41
+ # %%
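+ # warm-up translation: run one generation at init time so the weights and CUDA
+ # kernels are ready before the first real request, and report the latency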
42
+ import time
43
+ self.time_start = time.time()
44
+ self.inputs = [self.prefix + 'At some point (prop_1), and at some point (prop_2), and always do not (prop_4).']
45
+ self.inputs = self.tokenizer(self.inputs, max_length=self.max_input_length, truncation=True, return_tensors="pt").to(self.device)
46
+ self.output = self.tl_model.generate(**self.inputs, num_beams=8, do_sample=True, max_length=self.max_target_length)
47
+ self.decoded_output = self.tokenizer.batch_decode(self.output, skip_special_tokens=True)[0]
48
+ print(self.decoded_output)
49
+ self.time_end = time.time()
50
+ print('Translation time: ', self.time_end-self.time_start)
51
+ print('\nNL2TL init\n')
52
+ self.splitJSONfromTXT=util.splitJSONfromTXT
53
+ # %%
54
+ # Here are the example test sentences
55
+ pass
56
+ def translate(self,inputNLtxt:str=""):
57
+ inputNLtxt=inputNLtxt.replace("Task_","prop_")
58
+
59
+ sentence=inputNLtxt
60
+ self.inputs = [self.prefix + sentence]
61
+ self.inputs = self.tokenizer(self.inputs, max_length=self.max_input_length, truncation=True, return_tensors="pt").to(self.device)
62
+ self.output = self.tl_model.generate(**self.inputs, num_beams=8, do_sample=True, max_length=self.max_target_length)
63
+ self.decoded_output = self.tokenizer.batch_decode(self.output, skip_special_tokens=True)[0]
64
+ print('Input sentence: ', sentence)
65
+ print('Translated STL: ', self.decoded_output)
66
+ print('\n')
67
+
68
+ self.decoded_output=self.decoded_output.replace('prop_','Task_')
69
+ return self.decoded_output
70
+ def waiting(self):
71
+ retry=True
72
+ while retry:
73
+ inputNL=util.GPTinterface("continue next")
74
+ if inputNL!="q":
75
+ Json=self.splitJSONfromTXT(inputNL)
76
+ print(Json)
77
+ jsonTree=json.loads("{"+Json[-1]+"}")
78
+ input_NL=jsonTree["LTL_description"].replace("Task_","prop_")
79
+ output_TL=self.translate(input_NL)
80
+ output_TL=output_TL.replace('prop_','Task_')
81
+ print("\n",output_TL,"\n")
82
+ else:
83
+ retry =False
84
+ if __name__=="__main__":
85
+ # examples=['Stay at (prop_1) for 5 units in the future and stay at (prop_2) for 5 units in the future, and ensure that never (prop_3).',
86
+ # 'First (prop_1), and then (prop_2), and ensure that never (prop_3).',
87
+ # 'Start by (prop_1). Then, (prop_2). Lastly, (prop_3).',
88
+ # 'Guarantee that you (prop_1) and (prop_2)', # Input the natural sentence
89
+ # '( prop_1 ) and whenever ( prop_2 )',
90
+ # 'Sooner or later (prop_1)',
91
+ # 'Repeatedly (prop_1)',
92
+ # 'At some point, (prop_1).',
93
+ # 'Do prop_1 but not do prop_2',
94
+ # 'Do prop_1, do prop_2, do prop_3'] # Input the natural sentence
95
+ # interface=NL2TL()
96
+ # for txt in examples:
97
+ # interface.translate(txt)
98
+
99
+ interface=NL2TL()
100
+ interface.waiting()
101
+
finetune/T5_XXL/t5_lora_evaluate.py ADDED
@@ -0,0 +1,95 @@
1
+ import torch
2
+ from peft import PeftModel, PeftConfig
3
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
+
5
+ # Load peft config for pre-trained checkpoint etc.
6
+ exp_name="_mid_ascii"
7
+ peft_model_id="finetuned_model/results"+exp_name+'2'
8
+ max_target_length=128
9
+
10
+ config = PeftConfig.from_pretrained(peft_model_id)
11
+
12
+ # load base LLM model and tokenizer
13
+ model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
14
+ tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, device_map="auto")
15
+
16
+ # Load the Lora model
17
+ model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
18
+ model.eval()
19
+
20
+ print("Peft model loaded")
21
+
22
+ from datasets import load_dataset
23
+ from random import randrange
24
+
25
+
26
+ # Load dataset from the hub and get a sample
27
+ datapath='LTL_datasets/collect/'
28
+ dataset = load_dataset("json", data_files={"train":datapath+"ltl_eng_train"+exp_name+".jsonl","test":datapath+"ltl_eng_test"+exp_name+".jsonl"})
29
+ print(dataset)
30
+ sample = dataset['test'][randrange(len(dataset["test"]))]
31
+
32
+ input_ids = tokenizer(sample["natural"], return_tensors="pt", truncation=True).input_ids.cuda()
33
+ # with torch.inference_mode():
34
+ outputs = model.generate(input_ids=input_ids, max_new_tokens=max_target_length, do_sample=True, top_p=0.9)
35
+ print(f"input sentence: {sample['natural']}\n{'---'* 20}")
36
+
37
+ print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")
38
+
39
+
40
+ import evaluate
41
+ import numpy as np
42
+ from datasets import load_from_disk
43
+ from tqdm import tqdm
44
+
45
+ # Metric
46
+ metric = evaluate.load("rouge")
47
+
48
+ def evaluate_peft_model(sample,max_target_length=128):
49
+ # generate summary
50
+ outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
51
+ prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
52
+ # decode eval sample
53
+ # Replace -100 in the labels as we can't decode them.
54
+ labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
55
+ # print(labels)
56
+ labels = tokenizer.decode(labels, skip_special_tokens=True)
57
+ # print(labels)
58
+ # Some simple post-processing
59
+ input_sentence=" ".join(tokenizer.batch_decode(sample["input_ids"].detach().cpu().numpy(), skip_special_tokens=True))
60
+ print("input sentence: {}\n{}".format(input_sentence,'---'* 20))
61
+ # output_LTL=tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
62
+ # expect_LTL=labels
63
+ print(f"pre_LTL:\n{prediction}\nexp_LTL:\n{labels}")
64
+ return prediction, labels,input_sentence
65
+
66
+ # load test dataset from distk
67
+ test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
68
+
69
+ # run predictions
70
+ # this can take ~45 minutes
71
+ predictions, references,input_sentence= [] , [], []
72
+ for sample in tqdm(test_dataset):
73
+ # print(sample)
74
+ p,l,nl = evaluate_peft_model(sample)
75
+ # print(p,l)
76
+ input_sentence.append(nl)
77
+ predictions.append(p)
78
+ references.append(l)
79
+
80
+ # compute metric
81
+ rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
82
+
83
+ # print results
84
+ print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
85
+ print(f"rouge2: {rogue['rouge2']* 100:2f}%")
86
+ print(f"rougeL: {rogue['rougeL']* 100:2f}%")
87
+ print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
88
+ eval_output=np.array([input_sentence,predictions,references]).T
89
+ import pandas as pd
90
+ eval_output=pd.DataFrame(eval_output)
91
+ eval_output.to_csv(peft_model_id+'/output')
92
+ # Rouge1: 50.386161%
93
+ # rouge2: 24.842412%
94
+ # rougeL: 41.370130%
95
+ # rougeLsum: 41.394230%
finetune/T5_XXL/t5_lora_fintune.py ADDED
@@ -0,0 +1,238 @@
1
+ # %%
2
+ from datasets import load_dataset
3
+ import os
4
+ import sys
5
+ # Load dataset from the hub
6
+ # dataset = load_dataset("samsum")
7
+ # datapath='LTL_datasets/collect/'
8
+ exp_name="/tf-ltl_eng_test_mid_ascii_gptAuged"
9
+ # output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'aug1')
10
+ # dataset = load_dataset("json", data_files={"train":datapath+"ltl_eng_train"+exp_name+".jsonl","test":datapath+"ltl_eng_test"+exp_name+".jsonl"})
11
+ # print(dataset)
12
+ dataset = load_dataset("json", data_files={"train":"LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl","test":"LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl"})
13
+ print(dataset)
14
+
15
+ os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
16
+ os.environ['CUDA_VISIBLE_DEVICES']='4,5'
17
+
18
+
19
+ # %%
20
+
21
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
22
+
23
+ model_id="google/flan-t5-xxl"
24
+
25
+ # Load tokenizer of FLAN-t5-XL
26
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
27
+
28
+
29
+ # %%
30
+ from datasets import concatenate_datasets
31
+ import numpy as np
32
+ # The maximum total input sequence length after tokenization.
33
+ # Sequences longer than this will be truncated, sequences shorter will be padded.
34
+ tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["natural"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
35
+ input_lenghts = [len(x) for x in tokenized_inputs["input_ids"]]
36
+ # use the full maximum length (100th percentile) of the tokenized inputs
37
+ max_source_length = int(np.percentile(input_lenghts, 100))
38
+ print(f"Max source length: {max_source_length}")
39
+
40
+ # The maximum total sequence length for target text after tokenization.
41
+ # Sequences longer than this will be truncated, sequences shorter will be padded."
42
+ tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["raw_ltl"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
43
+ target_lenghts = [len(x) for x in tokenized_targets["input_ids"]]
44
+ # use the full maximum length (100th percentile) of the tokenized targets
45
+ max_target_length = int(np.percentile(target_lenghts, 100))
46
+ print(f"Max target length: {max_target_length}")
47
+
48
+
49
+ # %%
50
+ def preprocess_function(sample,padding="max_length"):
51
+ # add prefix to the input for t5
52
+ inputs = ["Generate LTL: " + item for item in sample["natural"]]
53
+
54
+ # tokenize inputs
55
+ model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
56
+
57
+ # Tokenize targets with the `text_target` keyword argument
58
+ labels = tokenizer(text_target=sample["raw_ltl"], max_length=max_target_length, padding=padding, truncation=True)
59
+
60
+ # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
61
+ # padding in the loss.
62
+ if padding == "max_length":
63
+ labels["input_ids"] = [
64
+ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
65
+ ]
66
+
67
+ model_inputs["labels"] = labels["input_ids"]
68
+ return model_inputs
69
+
70
+ tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["natural", "raw_ltl"])
71
+ print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
72
+
73
+ # save datasets to disk for later easy loading
74
+ tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
75
+ tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
76
+
77
+
78
+ # %%
79
+ from transformers import AutoModelForSeq2SeqLM
80
+ from peft import PeftModel, PeftConfig
81
+ # huggingface hub model id
82
+ model_id = "philschmid/flan-t5-xxl-sharded-fp16"
83
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
84
+
85
+
86
+
87
+ # peft_model_id="finetuned_model/results"+"_mid_ascii"
88
+ # config = PeftConfig.from_pretrained(peft_model_id)
89
+ # # load base LLM model and tokenizer
90
+ # model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
91
+ # tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, device_map="auto")
92
+
93
+ # # Load the Lora model
94
+ # model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
95
+ # # load model from the hub
96
+
97
+ print(model)
98
+ # exit()
99
+
100
+ # %%
101
+ from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
102
+
103
+ # Define LoRA Config
104
+ lora_config = LoraConfig(
105
+ r=16,
106
+ lora_alpha=32,
107
+ target_modules=["q", "v"],
108
+ lora_dropout=0.05,
109
+ bias="none",
110
+ task_type=TaskType.SEQ_2_SEQ_LM
111
+ )
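+ # r=16 is the adapter rank; target_modules ["q", "v"] attaches LoRA to the query and
+ # value projections of the T5 attention blocks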
112
+ # prepare int-8 model for training
113
+ model = prepare_model_for_int8_training(model)
114
+
115
+ # add LoRA adaptor
116
+ model = get_peft_model(model, lora_config)
117
+ model.print_trainable_parameters()
118
+
119
+ # trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817
120
+
121
+
122
+ # %%
123
+ from transformers import DataCollatorForSeq2Seq
124
+
125
+ # we want to ignore tokenizer pad token in the loss
126
+ label_pad_token_id = -100
127
+ # Data collator
128
+ data_collator = DataCollatorForSeq2Seq(
129
+ tokenizer,
130
+ model=model,
131
+ label_pad_token_id=label_pad_token_id,
132
+ pad_to_multiple_of=8
133
+ )
134
+
135
+
136
+ # %%
137
+ from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
138
+
139
+ output_dir="lora-flan-t5-xxl"
140
+
141
+ # Define training args
142
+ training_args = Seq2SeqTrainingArguments(
143
+ output_dir=output_dir,
144
+ auto_find_batch_size=True,
145
+ learning_rate=1e-3, # higher learning rate
146
+ num_train_epochs=5,
147
+ logging_dir=f"{output_dir}/logs",
148
+ logging_strategy="steps",
149
+ logging_steps=500,
150
+ save_strategy="no",
151
+ report_to="tensorboard",
152
+ )
153
+
154
+ # Create Trainer instance
155
+ trainer = Seq2SeqTrainer(
156
+ model=model,
157
+ args=training_args,
158
+ data_collator=data_collator,
159
+ train_dataset=tokenized_dataset["train"],
160
+ )
161
+ model.config.use_cache = False # silence the warnings. Please re-enable for inference!
162
+
163
+
164
+ # %%
165
+ # train model
166
+ trainer.train()
167
+
168
+
169
+ # %%
170
+ # Save our LoRA model & tokenizer results
171
+ peft_model_id="finetuned_model/"+exp_name
172
+ trainer.model.save_pretrained(peft_model_id)
173
+ tokenizer.save_pretrained(peft_model_id)
174
+ # if you want to save the base model to call
175
+ # trainer.model.base_model.save_pretrained(peft_model_id)
176
+
177
+
178
+
179
+
180
+ import evaluate
181
+ import numpy as np
182
+ from datasets import load_from_disk
183
+ from tqdm import tqdm
184
+
185
+ # Metric
186
+ metric = evaluate.load("rouge")
187
+
188
+ def evaluate_peft_model(sample,max_target_length=128):
189
+ # generate summary
190
+ outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
191
+ prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
192
+ # decode eval sample
193
+ # Replace -100 in the labels as we can't decode them.
194
+ labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
195
+ # print(labels)
196
+ labels = tokenizer.decode(labels, skip_special_tokens=True)
197
+ # print(labels)
198
+ # Some simple post-processing
199
+ input_sentence=" ".join(tokenizer.batch_decode(sample["input_ids"].detach().cpu().numpy(), skip_special_tokens=True))
200
+ # print("input sentence: {}\n{}".format(input_sentence,'---'* 20))
201
+ # output_LTL=tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
202
+ # expect_LTL=labels
203
+ # print(f"pre_LTL:\n{prediction}\nexp_LTL:\n{labels}")
204
+ return prediction, labels,input_sentence
205
+
206
+ # load test dataset from disk
207
+ test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
208
+
209
+ # run predictions
210
+ # this can take ~45 minutes
211
+ predictions, references,input_sentence= [] , [], []
212
+ idx=0
213
+ for sample in tqdm(test_dataset):
214
+ # print(sample)
215
+ p,l,nl = evaluate_peft_model(sample)
216
+ # print(p,l)
217
+ input_sentence.append(nl)
218
+ predictions.append(p)
219
+ references.append(l)
220
+ idx+=1
221
+ print(idx,'\n',input_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
222
+
223
+ # compute metric
224
+ rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
225
+
226
+ # print results
227
+ print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
228
+ print(f"rouge2: {rogue['rouge2']* 100:2f}%")
229
+ print(f"rougeL: {rogue['rougeL']* 100:2f}%")
230
+ print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
231
+ eval_output=np.array([input_sentence,predictions,references]).T
232
+ import pandas as pd
233
+ eval_output=pd.DataFrame(eval_output)
234
+ eval_output.to_csv(peft_model_id+'/output')
235
+ # Rouge1: 98.292692%
236
+ # rouge2: 95.766211%
237
+ # rougeL: 97.086188%
238
+ # rougeLsum: 97.084262%
finetune/T5_XXL/t5_realtime_evaluate.py ADDED
@@ -0,0 +1,69 @@
1
+ import torch
2
+ from peft import PeftModel, PeftConfig
3
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
+ import sys
5
+ # sys.path.append("..")
6
+ # sys.path.append("../../")
7
+ from ... import utils as util
8
+ # Load peft config for pre-trained checkpoint etc.
9
+
10
+ class T5XXL_NL2TL_translator():
11
+ def __init__(self) -> None:
12
+ # exp_name="_mid_ascii"
13
+ peft_model_id="model_weight/tf-ltl_eng_test_mid_ascii_gptAuged"
14
+ self.max_target_length=128
15
+
16
+ self.config = PeftConfig.from_pretrained(peft_model_id)
17
+
18
+ # load base LLM model and tokenizer
19
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(self.config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
20
+ self.tokenizer = AutoTokenizer.from_pretrained(self.config.base_model_name_or_path, device_map="auto")
21
+
22
+ # Load the Lora model
23
+ self.model = PeftModel.from_pretrained(self.model, peft_model_id, device_map="auto")
24
+ self.model.eval()
25
+
26
+ print("Peft model loaded")
27
+
28
+ pass
29
+ def translate(self,input:str=""):
30
+ input_prompt= "Generate LTL: " + input
31
+ replace=util.Task2Preplacer()
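+ # Task2Preplacer presumably rewrites "Task_x.y" identifiers into the placeholder
+ # tokens the finetuned model saw in training; reP2Task maps them back afterwards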
32
+
33
+ input_prompt=replace.reTask2P(input_prompt)
34
+ # print(predicter( replace.reTask2P(input_prompt)))
35
+ print(input_prompt)
36
+ input_ids = self.tokenizer(input_prompt, return_tensors="pt", truncation=True).input_ids.cuda()
37
+ outputs = self.model.generate(input_ids=input_ids, max_new_tokens=self.max_target_length, do_sample=True, top_p=0.9)
38
+ output_txt= self.tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
39
+ print(output_txt)
40
+ return replace.reP2Task(output_txt)
41
+
42
+
43
+ if __name__=="__main__":
44
+ test_prompts=[
45
+ "Task_1.1.1 must precede Task_1.1.2, which in turn should precede Task_1.1.3, ensuring that arranging fruits happens before preparing vegetables and prepping eggs and meats is done last.",
46
+ "Task_1.1 must be completed before Task_1.2 starts, and Task_1.2 must be completed before Task_1.3 starts."
47
+ ]
48
+
49
+ translater=T5XXL_NL2TL_translator()
50
+
51
+ for ret in test_prompts:
52
+ print(translater.translate(ret))
53
+
54
+ flag=True
55
+ while flag:
56
+ lines=[""]
57
+ try:
58
+ lines.append(input())
59
+ while True:
60
+ lines.append(input())
61
+ except:
62
+ pass
63
+ ret ="".join(lines)
64
+ print(ret)
65
+ if ret=="":
66
+ flag=False
67
+
68
+ print(translater.translate(ret))
69
+
finetune/__init__.py ADDED
File without changes
finetune/data_augmentation/GPTbasedAug.py ADDED
@@ -0,0 +1,100 @@
1
+ import json
2
+ import re
3
+ import sys,os
4
+ import numpy as np
5
+ # sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
6
+ from ... import utils as util
7
+ import time
8
+ l,r=40,2000
9
+ # has been added: 0,40
10
+ # the index range of the data to augment
11
+ batch_size=20
12
+ # number of sentences re-described per GPT request
13
+
14
+
15
+ dataPath="LTL_datasets/collect/"
16
+ data_eng_path=os.path.join(dataPath,"Cleaned_ENG.txt")
17
+ data_ltl_path=os.path.join(dataPath,"Cleaned_LTL.txt")
18
+
19
+ data_eng_output_path=os.path.join(dataPath,"eng_gpt_auged2.txt")
20
+ data_ltl_output_path=os.path.join(dataPath,"ltl_mid_order_ascii_gpt_auged2.txt")
21
+ data_src_output_path=os.path.join(dataPath,"idxsrc_gpt_auged2.txt")
22
+
23
+ with open(data_ltl_path) as txt:
24
+ content = txt.readlines()
25
+ txt.close()
26
+ ltl =np.array(content)
27
+ with open(data_eng_path) as txt:
28
+ content = txt.readlines()
29
+ txt.close()
30
+ eng =np.array(content)
31
+
32
+ print(len(ltl))
33
+
34
+ GPTinterface=util.GPTinterface(JSONlog=True,exp_PATH=dataPath)
35
+
36
+ import random
37
+ np.random.seed(42)
38
+
39
+ idx=np.arange(len(ltl))
40
+ np.random.shuffle(idx)
41
+
42
+ messages=[
43
+ {
44
+ "role": "system",
45
+ "content": """1. Herer are some one sentence examples in a way that is normally used to interpret the safe or co-safe property in linear temporal logic, please remember and imitate the language style in the examples below
46
+ P02 and P03 can occur independently and either may be executed without affecting the other.
47
+ P07 must precede P17, which in turn should precede P15, ensuring that P07 happens before P17 and P15 is done last.
48
+ Always (P08 precedes P09) and Eventually (P08 is executed) and Eventually (P09 is executed).
49
+ Globally, P02 should be completed before P03 eventually starts.
50
+ Eventually, P08 and P09 should both be completed, and they can be done in any order.
51
+ P02 and P04 can be executed concurrently, while P03 can only be executed once P02 has been completed.
52
+ P07 must be completed before P17, P15, and P02 can be started. P17 must be completed before P15 and P02 can be started. P15 must be completed before P02 can be started.
53
+ P08 is a prerequisite for P09, P09 is a prerequisite for P10, and P10 is a prerequisite for P16.
54
+ P06 must be completed before P14, P14 must be completed before P11, and P11 must be completed before P12.
55
+ P02 and P05 are always possible to be executed, while the possibility of executing P03 and P04 is contingent upon the completion of P02.
56
+ P07 must be executed, and only after P07 is completed can P17 be executed, and only after P17 is completed can P15 be executed.
57
+ P08 can be executed independently, after which P09 can be executed.
58
+ P06 must be completed first, followed by P14, then P11, and finally P12.
59
+ P19 is always executed first, followed by P13, then P18, and finally P05.
60
+ P15 and P14 can be executed independently, and both should eventually be completed.
61
+ P07 must be completed before P17, and P17 must be completed before P15
62
+ P06 must be completed before P14 begins, and P14 must be completed before P11 begins"""
63
+ },{
64
+ "role":"user",
65
+ "content":"first go to P01 and then go to P20, always avoiding P02"
66
+ }
67
+ ]
68
+ input_content="B. re describe this instruction using the style above\n"
69
+ input_LTL=""
70
+ input_idx=""
71
+ count=0
72
+ pattern=re.compile("[0-9]{2}\. ([\S ]*)\n")
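+ # the prompt asks GPT to return a numbered list ("01. ...", "02. ..."); this pattern
+ # extracts each re-described sentence, and a batch is written out only when exactly
+ # batch_size rewrites come back, keeping the ENG/LTL/idx output files aligned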
73
+
74
+ for i in range(l,r):
75
+ if count>=batch_size:
76
+ count=0
77
+ messages[1]["content"]=input_content
78
+ GPTreturn=GPTinterface.communicate(messages=messages)
79
+ reDescription=pattern.findall(GPTreturn+'\n')
80
+ if len(reDescription)==batch_size:
81
+ with open(data_eng_output_path ,"a") as f:
82
+ for j in reDescription:
83
+ f.write(j)
84
+ f.write('\n')
85
+ with open(data_ltl_output_path,"a") as f:
86
+ f.write(input_LTL)
87
+ with open(data_src_output_path,"a") as f:
88
+ f.write(input_idx)
89
+ input_content="B. re describe this instruction using the style above\n"
90
+ input_LTL=""
91
+ input_idx=""
92
+ time.sleep(np.random.random()*5)
93
+ else:
94
+ count+=1
95
+ input_content+="{:0>2d}. {}".format(count,eng[idx[i]])
96
+ input_LTL+="{}".format(ltl[idx[i]])
97
+ input_idx+="{}\n".format(idx[i])
98
+
99
+
100
+
finetune/data_augmentation/dataset_creator.py ADDED
@@ -0,0 +1,76 @@
1
+ import json
2
+ import numpy as np
3
+ import os,sys
4
+ # sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
5
+ from ... import utils as util
6
+ class DataPreprocess():
7
+ def __init__(self,data_path="LTL_datasets/collect") -> None:
8
+ self.data_path=data_path
9
+ self.train_valid_split=0.1
10
+ pass
11
+ def txtdataReader(self):
12
+ LTL_list=[
13
+ # 'ltl_mid_order_ascii.txt',
14
+ # 'ltl_mid_order_ascii_gpt_auged.txt',
15
+ # 'ltl_mid_order_ascii_gpt_auged2.txt',
16
+ 'Cleaned_LTL.txt'
17
+ ]
18
+ ENG_list=[
19
+ # 'eng.txt',
20
+ # 'eng_gpt_auged.txt',
21
+ # 'eng_gpt_auged2.txt'
22
+ 'Cleaned_ENG.txt'
23
+ ]
24
+ content=[]
25
+ for filename in LTL_list:
26
+ with open(os.path.join(self.data_path,filename)) as txt:
27
+ content += txt.readlines()
28
+ txt.close()
29
+ self.ltl =np.array(content)
30
+
31
+ content=[]
32
+ for filename in ENG_list:
33
+ with open(os.path.join(self.data_path,filename)) as txt:
34
+ content += txt.readlines()
35
+ txt.close()
36
+ self.eng =np.array(content)
37
+ print(len(self.ltl))
38
+
39
+ def JSONdataCreate(self):
40
+ self.txtdataReader()
41
+ self.JSONWriter()
42
+
43
+ def JSONWriter(self):
44
+ np.random.seed(42)
45
+ # idx=np.random.shuffle( np.arange(len(ltl)))
46
+ self.idx=np.arange(len(self.ltl))
47
+ np.random.shuffle(self.idx)
48
+ with open(self.data_path+"/ltl_eng_train_mid_ascii_gptAuged.jsonl","w") as f:
49
+ for i in range(int(len(self.ltl)*(1-self.train_valid_split))):
50
+ json.dump({"natural":self.eng[self.idx[i]],"raw_ltl":self.ltl[self.idx[i]],"id":str(self.idx[i])},f)
51
+ f.write('\n')
52
+ with open(self.data_path+"/ltl_eng_test_mid_ascii_gptAuged.jsonl","w") as f:
53
+ for i in range(int(len(self.ltl)*(1-self.train_valid_split)),len(self.ltl)):
54
+ json.dump({"natural":self.eng[self.idx[i]],"raw_ltl":self.ltl[self.idx[i]],"id":str(self.idx[i])},f)
55
+ f.write('\n')
56
+
57
+ def dataCheck(self):
58
+ self.txtdataReader()
59
+ checker=util.LTLChecker()
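+ # keep only pairs whose atomic propositions agree between the LTL formula and the
+ # English description; failing rows are logged to the UNCleaned_* files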
60
+ with open(os.path.join(self.data_path,"Cleaned_LTL.txt"),"a") as passed_LTL:
61
+ with open(os.path.join(self.data_path,"Cleaned_ENG.txt"),"a") as passed_ENG:
62
+ with open(os.path.join(self.data_path,"UNCleaned_num.txt"),"a") as unpassed_row:
63
+ with open(os.path.join(self.data_path,"UNCleaned_LTL.txt"),"a") as unpassed_LTL:
64
+ with open(os.path.join(self.data_path,"UNCleaned_ENG.txt"),"a") as unpassed_ENG:
65
+ for id in range(len(self.ltl)):
66
+ if checker.AP_CorrCheck(self.ltl[id],self.eng[id]):
67
+ passed_LTL.write(self.ltl[id])
68
+ passed_ENG.write(self.eng[id])
69
+ else:
70
+ unpassed_row.write("{}\n".format(id))
71
+ unpassed_LTL.write(self.ltl[id])
72
+ unpassed_ENG.write(self.eng[id])
73
+
74
+ if __name__=="__main__":
75
+ # DataPreprocess().dataCheck()
76
+ DataPreprocess().JSONdataCreate()
finetune/mistral7b/finetune.py ADDED
@@ -0,0 +1,353 @@
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer
2
+ # device = "cuda" # the device to load the model onto
3
+ # from huggingface_hub import login
4
+ # login()
5
+ import json
6
+ import numpy as np
7
+ import sys,os
8
+ from datasets import load_dataset
9
+ import torch
10
+ from transformers import (AutoModelForCausalLM,
11
+ AutoTokenizer,
12
+ BitsAndBytesConfig,
13
+ TrainingArguments,
14
+ pipeline,
15
+ logging,
16
+ TrainerCallback)
17
+ from peft import LoraConfig, PeftConfig, prepare_model_for_kbit_training, get_peft_model
18
+ from trl import SFTTrainer
19
+ from accelerate import infer_auto_device_map,init_empty_weights
20
+ import wandb
21
+ from datasets import concatenate_datasets
22
+ import numpy as np
23
+ # sys.path.append('../../../')
24
+ # sys.path.append('../../')
25
+ # sys.path.append('../')
26
+ # os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
27
+ # os.environ['CUDA_VISIBLE_DEVICES'] = "5,6,7"
28
+ # device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
29
+ sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
30
+ # import utils.util as util
31
+ # Load dataset from the hub
32
+ # dataset = load_dataset("samsum")
33
+ device='cuda'
34
+ np.random.seed(42)
35
+ output_dir = "/home/user/xsj/model_weight/"
36
+ datapath='/home/user/xsj/NL2TL-dataset/collect2'
37
+ exp_name="_mid_ascii_0327_eos_2"
38
+ explainer_files=['LTLexplain_0.json','LTLexplain_1.json','LTLexplain_2.json','LTLexplain_3.json']
39
+ explainer_dic={}
40
+ for path in explainer_files:
41
+ with open(os.path.join(datapath,path)) as f:
42
+ LTLlist=json.load(f)
43
+ for key in LTLlist.keys():
44
+ if isinstance(LTLlist[key],dict):
45
+ if not (key in explainer_dic):
46
+ explainer_dic[key]=[]
47
+ explainer_dic[key].append(LTLlist[key]['translate'])
48
+ sp=LTLlist[key]['explain'].split("means that")
49
+ if len(sp)>1:
50
+ explainer_dic[key].append(sp[1])
51
+
52
+ base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
53
+ bnb_config = BitsAndBytesConfig(
54
+ load_in_4bit = True,
55
+ bnb_4bit_use_double_quant = False,
56
+ bnb_4bit_quant_type = 'nf4',
57
+ bnb_4bit_compute_dtype = getattr(torch, "float16")
58
+ )
59
+ bnb_config = BitsAndBytesConfig(
60
+ load_in_8bit = True,
61
+ # llm_int8_threshold=200.0
62
+ # bnb_4bit_use_double_quant = False,
63
+ # bnb_4bit_quant_type = 'nf4',
64
+ # bnb_4bit_compute_dtype = getattr(torch, "float16")
65
+ )
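+ # NOTE: this second BitsAndBytesConfig overrides the 4-bit config above, so the
+ # model below is actually loaded in 8-bit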
66
+ import os
67
+ os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
68
+ # os.environ['CUDA_VISIBLE_DEVICES']='0'
69
+ device_map="auto"
70
+ # torch.cuda.set_device(7)
71
+ # device_map={'':torch.cuda.current_device()}
72
+ # device_map = {'':'cuda:7'}
73
+ # model_dir is the path or name of the model
74
+ # config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
75
+ # with init_empty_weights():
76
+ # base_model = AutoModelForCausalLM.from_pretrained(
77
+ # base_model_name,
78
+ # from_tf=bool(".ckpt" in base_model_name),
79
+ # quantization_config=bnb_config,
80
+ # device_map=device_map,
81
+ # trust_remote_code=True,
82
+ # use_auth_token=True
83
+ # )
84
+
85
+ # map_list = {5:"15GB", 6:"15GB",7:"15GB"} # 对应不同卡号限制的内存量
86
+ # map_list = {7:"15GB",} # 对应不同卡号限制的内存量
87
+ # no_split_modules = base_model._no_split_modules
88
+ # device_map = infer_auto_device_map(base_model, max_memory=map_list, no_split_module_classes=no_split_modules)
89
+
90
+
91
+ dataset = load_dataset("json", data_files={"train":os.path.join(datapath,"ltl_eng_train_mid_ascii_gptAuged.jsonl"),"test":os.path.join(datapath,"ltl_eng_test_mid_ascii_gptAuged.jsonl")})
92
+ print(dataset)
93
+
94
+
95
+
96
+ # tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
97
+ tokenizer = AutoTokenizer.from_pretrained(base_model_name)
98
+ # , add_eos_token=True,trust_remote_code=True)
99
+ # NOTE: it is unclear whether add_eos_token should be set; without an EOS token, generation keeps going until max_new_tokens is reached
100
+ # enabling add_eos_token made training fail consistently
101
+ # and when it is used the model tends to generate something other than the expected output
102
+ tokenizer.pad_token = tokenizer.eos_token
103
+ tokenizer.padding_side = 'right'
104
+ # print(tokenizer.eos_token_id)
105
+ # 2
106
+ # print(tokenizer.bos_token_id)
107
+ # 1
108
+ # print(tokenizer._convert_token_to_id(tokenizer.bos_token))
109
+
110
+ def preprocess_function(sample,padding="max_length"):
111
+ # build the complete training prompt for each sample
112
+ # print(sample[0])
113
+ inputs=[
114
+ f"""### Instruction:
115
+ translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()'
116
+
117
+ ### Natural Language Task:
118
+ {sample['natural'][i].strip()}
119
+
120
+ ### Logic Translation:
121
+ {explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))]}
122
+
123
+ ### linear temproal logic:
124
+ {sample['raw_ltl'][i].strip()}
125
+ </s>""".lower()
126
+ # NOTE: the eos token seems to be needed, while the bos is not (it is added automatically)
127
+ for i in (range(len(sample['natural'])))]
128
+ # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
129
+
130
+ sample["complete_text"] = inputs
131
+ return sample
132
+ # method1
133
+ # tokenized_dataset = dataset.map(preprocess_function, batched=True)
134
+ # method2
135
+ def preprocess_function2(sample,padding="max_length"):
136
+ # build the chat-formatted training text for each sample
137
+ # print(sample[0])
138
+ inputs=[
139
+ tokenizer.apply_chat_template(
140
+ [
141
+ {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, please pay specific attention to logic grammar, the natural language task is {}".format(sample['natural'][i].strip())},
142
+ {"role": "assistant", "content": "logic expression is {}, and LTL is {} .".format(
143
+ explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0,len(explainer_dic[sample['raw_ltl'][i].strip()]))],
144
+ sample['raw_ltl'][i].strip()
145
+ )
146
+ },
147
+ # {"role": "user", "content": " pay specific attention to brackets '()', linear temproal logic is"},
148
+ # {"role": "assistant", "content": "LTL is {} .".format(
149
+ # sample['raw_ltl'][i].strip()
150
+ # )
151
+ # }
152
+ ],tokenize=False)
153
+ # NOTE: the eos token seems to be needed, while the bos is not (it is added automatically)
154
+ for i in (range(len(sample['natural'])))]
155
+ # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
156
+
157
+ sample["complete_text"] = inputs
158
+ return sample
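+ # this variant relies on the Mistral-Instruct chat template: apply_chat_template wraps
+ # the user turn in [INST]...[/INST] and appends </s> after the assistant turn, so no
+ # manual EOS string is needed here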
159
+ tokenized_dataset = dataset.map(preprocess_function2, batched=True)
160
+ print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
161
+
162
+ # save datasets to disk for later easy loading
163
+ # tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
164
+ # tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
165
+
166
+ class PeftSavingCallback(TrainerCallback):
167
+ def on_save(self, args, state, control, **kwargs):
168
+ checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
169
+ kwargs["model"].save_pretrained(checkpoint_path)
170
+
171
+ if "pytorch_model.bin" in os.listdir(checkpoint_path):
172
+ os.remove(os.path.join(checkpoint_path, "pytorch_model.bin"))
173
+ callbacks = [PeftSavingCallback]
174
+
175
+ peft_config = LoraConfig(
176
+ lora_alpha=16,
177
+ lora_dropout=0.05,
178
+ r=128,
179
+ bias="none",
180
+ task_type="CAUSAL_LM",
181
+ target_modules=["q_proj", "v_proj"]
182
+ )
183
+
184
+
185
+ training_arguments = TrainingArguments(
186
+ output_dir=output_dir,
187
+ logging_dir = os.path.join(output_dir,"logs"),
188
+ per_device_train_batch_size=1,
189
+ num_train_epochs=3,
190
+ gradient_accumulation_steps=8,
191
+ optim="paged_adamw_32bit",
192
+ save_strategy='epoch',
193
+ logging_steps=25,
194
+ learning_rate=2e-4,
195
+ weight_decay=0.001,
196
+ fp16=True,
197
+ bf16=False,
198
+ max_grad_norm=0.3,
199
+ max_steps=-1,
200
+ warmup_ratio = 0.05,
201
+ group_by_length=True,
202
+ lr_scheduler_type="cosine",
203
+ report_to="wandb",
204
+ evaluation_strategy="epoch",
205
+ do_eval=True,
206
+ run_name = base_model_name+exp_name,
207
+ disable_tqdm=False
208
+ )
209
+ import os
210
+ output_dir = os.path.join(output_dir, "mistral7b"+exp_name+'aug1_quat8')
211
+
212
+ base_model = AutoModelForCausalLM.from_pretrained(
213
+ base_model_name,
214
+ from_tf=bool(".ckpt" in base_model_name),
215
+ quantization_config=bnb_config,
216
+ device_map=device_map,
217
+ trust_remote_code=True,
218
+ use_auth_token=True
219
+ )
220
+ base_model.config.use_cache = False
221
+
222
+ # More info: https://github.com/huggingface/transformers/pull/24906
223
+ base_model.config.pretraining_tp = 1
224
+
225
+ base_model.gradient_checkpointing_enable()
226
+ base_model = prepare_model_for_kbit_training(base_model)
227
+ base_model = get_peft_model(base_model, peft_config)
228
+
229
+ trainer = SFTTrainer(
230
+ model=base_model,
231
+ train_dataset=tokenized_dataset['train'],
232
+ eval_dataset=tokenized_dataset['test'],
233
+ peft_config=peft_config,
234
+ dataset_text_field="complete_text",
235
+ max_seq_length=512,
236
+ tokenizer=tokenizer,
237
+ args=training_arguments,
238
+ callbacks=callbacks,
239
+ packing=False,
240
+ )
241
+ wandb.login()
242
+ trainer.train()
243
+ trainer.model.save_pretrained(output_dir)
244
+ # trainer.model.save_pretrained(output_dir)
245
+ tokenizer.save_pretrained(output_dir)
246
+
247
+ wandb.finish()
248
+
249
+ # check
250
+ print('model dir',output_dir)
251
+ from peft import AutoPeftModelForCausalLM
252
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
253
+ model = AutoPeftModelForCausalLM.from_pretrained(output_dir,
254
+ from_tf=bool(".ckpt" in output_dir),
255
+ quantization_config=bnb_config,
256
+ device_map=device_map,
257
+ trust_remote_code=True,
258
+ use_auth_token=True
259
+ )
260
+ tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
261
+ tokenizer.pad_token = tokenizer.eos_token
262
+ print(tokenizer.default_chat_template)
263
+ def evaluate_model(input_text):
264
+ input_text =f"""### Instruction:
265
+ translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()' ### Natural Language Task:
266
+ {input_text}""".lower()
267
+ inputs = tokenizer(input_text, return_tensors="pt").to(device)
268
+ print(inputs)
269
+ outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
270
+
271
+ return tokenizer.decode(outputs[0], skip_special_tokens=True)
272
+
273
+ def evaluate_model2(input_text):
274
+ messages=[
275
+ {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()', natural language task: {}".format(input_text)},
276
+ ]
277
+
278
+ encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
279
+ outputs = model.generate(encodeds, max_new_tokens=512)
280
+ # , pad_token_id=tokenizer.eos_token_id)
281
+ # input_text =f"""### Instruction:
282
+ # translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()' ### Natural Language Task:
283
+ # {input_text}""".lower()
284
+ # inputs = tokenizer(input_text, return_tensors="pt").to(device)
285
+ # print(inputs)
286
+ # outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
287
+
288
+ return tokenizer.decode(outputs[0], skip_special_tokens=True)
289
+ # if __name__=='__main__':
290
+ import evaluate
291
+ import numpy as np
292
+ from datasets import load_from_disk
293
+ from tqdm import tqdm
294
+
295
+ # Metric
296
+ metric = evaluate.load("rouge")
297
+
298
+
299
+ # load test dataset from distk
300
+ # test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
301
+
302
+ # run predictions
303
+ # this can take ~45 minutes
304
+ import re
305
+ pattern=re.compile("linear temproal logic is ([\S ]*)")
306
+ predictions, references,input_sentence,output_sentence=[], [] , [], []
307
+ for idx in range(len(tokenized_dataset['test']['natural'])):
308
+ # print(sample)
309
+ nl=tokenized_dataset['test']['natural'][idx]
310
+ p = evaluate_model2(nl)
311
+ # print(p,l)
312
+ input_sentence.append(nl)
313
+
314
+ transLTL=pattern.findall(p)
315
+ print(p)
316
+ if transLTL[0][-1]=='.':
317
+ transLTL[0]=transLTL[0][:-1].strip()
318
+ else:
319
+ transLTL[0]=transLTL[0].strip()
320
+ predictions.append(transLTL[0])
321
+ output_sentence.append(p)
322
+ input_sentence.append(p)
323
+ references.append(tokenized_dataset['test']['raw_ltl'][idx].strip())
324
+ print(input_sentence[-1],'\nout::\n',output_sentence[-1],'\npre::\n',predictions[-1],'\nref::\n',references[-1],'\n','-'*20,'\n')
325
+
326
+ # compute metric
327
+ rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)
328
+
329
+ # print results
330
+ print(f"Rogue1: {rogue['rouge1']* 100:2f}%")
331
+ print(f"rouge2: {rogue['rouge2']* 100:2f}%")
332
+ print(f"rougeL: {rogue['rougeL']* 100:2f}%")
333
+ print(f"rougeLsum: {rogue['rougeLsum']* 100:2f}%")
334
+ eval_output=np.array([input_sentence,predictions,references]).T
335
+ import pandas as pd
336
+ eval_output=pd.DataFrame(eval_output)
337
+ eval_output.to_csv(output_dir+'/output')
338
+
339
+ exit()
340
+ messages = [
341
+ {"role": "user", "content": "What is your favourite condiment?"},
342
+ {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
343
+ {"role": "user", "content": "Do you have mayonnaise recipes?"}
344
+ ]
345
+
346
+ encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
347
+
348
+ model_inputs = encodeds.to(device)
349
+ model.to(device)
350
+
351
+ generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
352
+ decoded = tokenizer.batch_decode(generated_ids)
353
+ print(decoded[0])
finetune/mistral7b/prediction.py ADDED
@@ -0,0 +1,308 @@
1
+ # from huggingface_hub import login
2
+ # login()
3
+ import sys,os
4
+ from datasets import load_dataset
5
+ import torch
6
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
7
+ # from peft import LoraConfig
8
+ # from trl import SFTTrainer
9
+ # from accelerate import infer_auto_device_map,init_empty_weights
10
+
11
+ # sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
12
+ from NL2HLTLtaskPlanner.utils import Task2Preplacer
13
+ from NL2HLTLtaskPlanner.utils import LTLChecker
14
+ import re
15
+ from datasets import concatenate_datasets
16
+ import numpy as np
17
+ from peft import AutoPeftModelForCausalLM
18
+ os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
19
+ # os.environ['CUDA_VISIBLE_DEVICES']='3'
20
+
21
+
22
+
23
+ class Mistral_NL2TL_translator():
24
+ def __init__(self,
25
+ output_dir = "/home/user/xsj/model_weight",
26
+ tuned_model_name="mistral7b_mid_ascii_0327_eos_2aug1_quat8",
27
+ # CUDA_device='0',
28
+ quat=True,
29
+ replacer=Task2Preplacer) -> None:
30
+ # os.environ['CUDA_VISIBLE_DEVICES']=CUDA_device
31
+ self.device_map="auto"
32
+ self.model_dir = os.path.join(output_dir, tuned_model_name)
33
+ # check
34
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
35
+ # AutoPeftModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf")
36
+
37
+
38
+ # quantconfig = BitsAndBytesConfig(
39
+ # load_in_8bit=True,
40
+ # bnb_8bit_quant_type="nf4",
41
+ # bnb_8bit_use_double_quant=True,
42
+ # bnb_8bit_compute_dtype=torch.bfloat16,
43
+ # )
44
+ # if quat==False:
45
+ # self.model = AutoPeftModelForCausalLM.from_pretrained(self.output_dir, device_map=self.device_map, torch_dtype=torch.bfloat16)
46
+ # # ICL super man can be left unquantized
47
+ # else:
48
+ # self.model = AutoPeftModelForCausalLM.from_pretrained(self.output_dir,device_map=self.device_map, torch_dtype=torch.float16,
49
+ # load_in_8bit=True)
50
+ # # quantization_config=quantconfig)
51
+ self.bnb_config = BitsAndBytesConfig(
52
+ load_in_4bit = True,
53
+ bnb_4bit_use_double_quant = False,
54
+ bnb_4bit_quant_type = 'nf4',
55
+ bnb_4bit_compute_dtype = getattr(torch, "float16")
56
+ )
57
+ self.bnb_config = BitsAndBytesConfig(
58
+ load_in_8bit = True,
59
+ # llm_int8_threshold=200.0
60
+ # bnb_4bit_use_double_quant = False,
61
+ # bnb_4bit_quant_type = 'nf4',
62
+ # bnb_4bit_compute_dtype = getattr(torch, "float16")
63
+ )
64
+ # self.bnb_config = BitsAndBytesConfig(
65
+ # load_in_8bit = False,
66
+ # load_in_4bit = False,
67
+ # # llm_int8_threshold=200.0
68
+ # # bnb_4bit_use_double_quant = False,
69
+ # # bnb_4bit_quant_type = 'nf4',
70
+ # # bnb_4bit_compute_dtype = getattr(torch, "float16")
71
+ # )
72
+ self.model = AutoModelForCausalLM.from_pretrained(
73
+ self.model_dir,
74
+ from_tf=bool(".ckpt" in self.model_dir),
75
+ quantization_config=self.bnb_config,
76
+ device_map=self.device_map,
77
+ trust_remote_code=True,
78
+ use_auth_token=True
79
+ )
80
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
81
+ # , trust_remote_code=True,add_eos_token=True,)
82
+ # tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_eos_token=True,trust_remote_code=True)
83
+ # NOTE no one says whether the add eos token need to be added, but if we do not add this, the generate will continue until reach the max_new_tokens,
84
+ # when in predict model, do not use the add_eos_token=True, as the tokenizer will automatically add <\s> to the input, and thus the output will be inregular
85
+ # when add add_eos_token, it always failed
86
+ self.tokenizer.pad_token = self.tokenizer.eos_token
87
+ self.tokenizer.padding_side = 'right'
88
+ print(self.tokenizer.eos_token_id)
89
+ # 2
90
+ print(self.tokenizer.bos_token_id)
91
+ # 1
92
+ # print(tokenizer._convert_token_to_id(tokenizer.bos_token))
93
+
94
+ print("NL2TL model loaded")
95
+
96
+ self.replacer=replacer
97
+ self.ltlChecker=LTLChecker()
98
+ pass
99
+
100
+ # print('NL2TL llama translate test:')
101
+ # self.translate("Task_1.1 must be done, and Task_1.2 should be finished before Task_1.1")
102
+ def evaluate_model(self, input_text):
103
+ self.pattern=re.compile("linear temproal logic is ([\S ]*).")
104
+ messages=[
105
+ {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical way, and then translate into linear temproal logic, pay specific attention to brackets '()', natural language task: {}".format(input_text.strip())},
106
+ ]
107
+
108
+ encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
109
+ outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
110
+
111
+ p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
112
+ print('model output:',p)
113
+ transLTL=self.pattern.findall(p)[0]
114
+ if transLTL[-1]=='.':
115
+ transLTL=transLTL[:-1].strip()
116
+ else:
117
+ transLTL=transLTL.strip()
118
+ transLTL=self.ltlChecker.right_barkets_remover(transLTL)
119
+ print('transLTL:\n',transLTL)
120
+ return transLTL
121
+ def evaluate_model2(self, input_text):
122
+ self.pattern=re.compile("LTL is ([\S ]*).")
123
+ messages=[
124
+ {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, the natural language task is {}".format(input_text.strip())},
125
+ ]
126
+ encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
127
+ outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
128
+ p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
129
+ print('---model output 1:\n',p)
130
+ # messages=[
131
+ # {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, the natural language task is {}".format(input_text.strip())},
132
+ # {"role": "assistant", "content":p
133
+ # },
134
+ # {"role": "user", "content": " pay specific attention to brackets '()', given your linear temproal logic translation"},
135
+ # ]
136
+
137
+ # encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
138
+ # outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
139
+
140
+ # p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
141
+ # print('---model output 2:\n',p)
142
+ transLTL=self.pattern.findall(p)[0]
143
+ if transLTL[-1]=='.':
144
+ transLTL=transLTL[:-1].strip()
145
+ else:
146
+ transLTL=transLTL.strip()
147
+ transLTL=self.ltlChecker.right_barkets_remover(transLTL)
148
+ print('transLTL:\n',transLTL)
149
+ return transLTL
150
+ def evaluate_model3(self, input_text):
151
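+ # prompt v3: extraction stops at the first '.', and False is returned when no formula is found in the output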
+ # "LTL is a larger language model . . . . . . "
152
+ # self.pattern=re.compile("LTL is ([\S ]*)\.")
153
+ self.pattern=re.compile("LTL is ([^\.]*)\.")
154
+ messages=[
155
+ {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, please pay specific attention to logic grammar, the natural language task is {}".format(input_text.strip())},
156
+ ]
157
+ encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
158
+ outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
159
+ p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
160
+ print('---model output 1:\n',p)
161
+ # messages=[
162
+ # {"role": "user", "content": "translate natural description to linear temproal logic, first translate into a logical expression, and then translate into linear temproal logic, the natural language task is {}".format(input_text.strip())},
163
+ # {"role": "assistant", "content":p
164
+ # },
165
+ # {"role": "user", "content": " pay specific attention to brackets '()', given your linear temproal logic translation"},
166
+ # ]
167
+
168
+ # encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(self.device)
169
+ # outputs = self.model.generate(encodeds, max_new_tokens=512, pad_token_id=self.tokenizer.eos_token_id)
170
+
171
+ # p=self.tokenizer.decode(outputs[0], skip_special_tokens=True)
172
+ # print('---model output 2:\n',p)
173
+ transLTL=self.pattern.findall(p)
174
+ if len(transLTL)==0:
175
+ return False
176
+ transLTL=transLTL[0]
177
+ if transLTL[-1]=='.':
178
+ transLTL=transLTL[:-1].strip()
179
+ else:
180
+ transLTL=transLTL.strip()
181
+ transLTL=self.ltlChecker.right_barkets_remover(transLTL)
182
+ print('transLTL:\n',transLTL)
183
+ return transLTL
184
+ def translate(self,input_prompt:str=""):
185
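+ # end-to-end translation: map task names to placeholders, query the model (up to 10 attempts),
+ # validate atomic propositions and bracket balance with LTLChecker, then map placeholders back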
+ print('input_prompt:\n',input_prompt)
186
+ replacer=self.replacer()
187
+ input_prompt=replacer.reTask2P(input_prompt)
188
+ # print(predicter( replace.reTask2P(input_prompt)))
189
+ # print(input_prompt)
190
+
191
+
192
+ # print(p)
193
+ flag_check_false_count=0
194
+ flag_check=False
195
+ while not flag_check and flag_check_false_count<10:
196
+ flag_check_false_count+=1
197
+ flag_check=True
198
+ transLTL=self.evaluate_model3(input_prompt)
199
+ # evaluate_model3 returns False when no formula could be extracted; only post-process real strings
+ if not isinstance(transLTL,bool):
+ transLTL=transLTL.replace('Or','And')
200
+ transLTL=transLTL.replace('Globally','Finally')
201
+ if isinstance(transLTL,bool):
202
+ flag_check=False
203
+ elif not self.ltlChecker.AP_CorrCheck(input_prompt,transLTL):
204
+ print('AP_CorrCheck false')
205
+ flag_check=False
206
+ elif not self.ltlChecker.brackets_Check(transLTL):
207
+ print('brackets_Check false')
208
+ flag_check=False
209
+ # print(p)
210
+ return replacer.reP2Task(transLTL)
211
+
212
+
213
+ if __name__=="__main__":
214
+ # translater=Mistral_NL2TL_translator()
215
+ # test_prompts=[
216
+ # "Task_1.1.1 must precede Task_1.1.2, which in turn should precede Task_1.1.3, ",
217
+ # "Task_1.1 must be completed before Task_1.2 starts, and Task_1.2 must be completed before Task_1.3 starts." ,
218
+ # "Task_1.1 can be executed independently, after which Task_1.2 can be executed.",
219
+ # "Task_1.2.4 must be completed first, followed by Task_1.2.2, then Task_1.2.3, and finally Task_1.2.1.",
220
+ # "Task_1.2.4 is always executed first, followed by Task_1.2.3, then Task_1.2.2, and finally Task_1.2.1.",
221
+ # "Task_1.2.1 and Task_1.2.2 can be executed independently, and both should eventually be completed.",
222
+ # ]
223
+ # for ret in test_prompts:
224
+ # print(translater.translate(ret))
225
+ # print('\n','-'*20,'\n')
226
+ # exit()
227
+ class p2preplacer():
228
+ def reTask2P(self,input):
229
+ return input
230
+ def reP2Task(self,input):
231
+ return input
232
+ translater=Mistral_NL2TL_translator(replacer=p2preplacer)
233
+ import evaluate
234
+ import numpy as np
235
+ # from datasets import load_from_disk
236
+ from tqdm import tqdm
237
+
238
+ # Metric
239
+ metric = evaluate.load("rouge")
240
+ datapath='/home/user/xsj/NL2TL-dataset/collect2'
241
+ tokenized_dataset = load_dataset("json", data_files={"train":os.path.join(datapath,"ltl_eng_train_mid_ascii_gptAuged.jsonl"),"test":os.path.join(datapath,"ltl_eng_test_mid_ascii_gptAuged.jsonl")})
242
+ print(tokenized_dataset)
243
+ # run predictions
244
+ # this can take ~45 minutes
245
+ import re
246
+ # pattern=re.compile("\[Formal LTL\]:\n([\S ]*)\n")
247
+ predictions, references,input_sentence,output_sentence=[], [] , [], []
248
+ # with open()
249
+ for idx in range(len(tokenized_dataset['test']['natural'])):
250
+ # print(sample)
251
+ nl=tokenized_dataset['test']['natural'][idx]
252
+ transLTL=translater.translate(nl)
253
+ # p = translater.evaluate_model(nl)
254
+ # # print(p,l)
255
+ input_sentence.append(nl)
256
+
257
+ # transLTL=pattern.findall(p)
258
+ # # print(p)
259
+ predictions.append(transLTL)
260
+ # output_sentence.append(p)
261
+ # input_sentence.append(nl)
262
+ references.append(tokenized_dataset['test']['raw_ltl'][idx].strip())
263
+ print(idx,'\n',input_sentence[-1],
264
+ # '\nout::\n',output_sentence[-1],
265
+ '\npre::\n',predictions[-1],
266
+ '\nref::\n',references[-1],'\n','-'*20,'\n')
267
+
268
+ # compute metric
269
+ rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)
270
+
271
+ # print results
272
+ print(f"rouge1: {rouge['rouge1']* 100:2f}%")
273
+ print(f"rouge2: {rouge['rouge2']* 100:2f}%")
274
+ print(f"rougeL: {rouge['rougeL']* 100:2f}%")
275
+ print(f"rougeLsum: {rouge['rougeLsum']* 100:2f}%")
276
+ eval_output=np.array([input_sentence,predictions,references]).T
277
+ import pandas as pd
278
+ eval_output=pd.DataFrame(eval_output)
279
+ eval_output.to_csv("/home/user/xsj/model_weight/mistral7b_mid_ascii_0327_eos_2aug1_quat8"+'/output')
280
+ # out llama
281
+ # Rogue1: 98.363321%
282
+ # rouge2: 95.987820%
283
+ # rougeL: 97.384820%
284
+ # rougeLsum: 97.382071%
285
+
286
+ # this
287
+ # Rogue1: 98.543297%
288
+ # rouge2: 96.575248%
289
+ # rougeL: 97.720560%
290
+ # rougeLsum: 97.724880%
291
+ exit()
292
+ flag=True
293
+ while flag:
294
+ lines=[""]
295
+ try:
296
+ lines.append(input())
297
+ while True:
298
+ lines.append(input())
299
+ except EOFError:
300
+ pass
301
+ ret ="".join(lines)
302
+ print(ret)
303
+ if ret=="":
304
+ flag=False
305
+
306
+ print(translater.translate(ret))
307
+
308
+
finetune/mistral7b/test.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
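+ # sanity check: load Mistral-7B-Instruct-v0.2 in 4-bit and run one chat-template generation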
+ import torch
2
+ from transformers import (AutoModelForCausalLM,
3
+ AutoTokenizer,
4
+ BitsAndBytesConfig,
5
+ TrainingArguments,
6
+ pipeline,
7
+ logging,
8
+ TrainerCallback)
9
+ device = "cuda" # the device to load the model onto
10
+ bnb_config = BitsAndBytesConfig(
11
+ load_in_4bit = True,
12
+ bnb_4bit_use_double_quant = False,
13
+ bnb_4bit_quant_type = 'nf4',
14
+ bnb_4bit_compute_dtype = getattr(torch, "float16")
15
+ )
16
+ model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2",quantization_config=bnb_config,)
17
+ tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
18
+
19
+ messages = [
20
+ {"role": "user", "content": "What is your favourite condiment?"},
21
+ {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
22
+ {"role": "user", "content": "Do you have mayonnaise recipes?"}
23
+ ]
24
+
25
+ encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
26
+
27
+ model_inputs = encodeds.to(device)
28
+ # model.to(device)  # not needed: the quantized model is already placed on GPU when loaded with a quantization_config
29
+
30
+ generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
31
+ decoded = tokenizer.batch_decode(generated_ids)
32
+ print(decoded[0])
finetune/realtime_run.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from peft import PeftModel, PeftConfig
3
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
+
5
+ # Load peft config for pre-trained checkpoint etc.
6
+ peft_model_id="finetuned_model/results"
7
+ config = PeftConfig.from_pretrained(peft_model_id)
8
+
9
+ # load base LLM model and tokenizer
10
+ model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map={"":0})
11
+ tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
12
+
13
+ # Load the Lora model
14
+ model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
15
+ model.eval()
16
+
17
+ print("Peft model loaded")
18
+
19
+ from datasets import load_dataset
20
+ from random import randrange
21
+
22
+
23
+ import evaluate
24
+ import numpy as np
25
+ import datasets
26
+ from tqdm import tqdm
27
+
28
+ # Metric
29
+ metric = evaluate.load("rouge")
30
+
31
+ def evaluate_peft_model(sample,max_target_length=50):
32
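+ # expects a tokenized sample with torch-format "input_ids" and "labels" (as produced for the saved eval dataset)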
+ # generate summary
33
+ outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
34
+ prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
35
+ # decode eval sample
36
+ # Replace -100 in the labels as we can't decode them.
37
+ labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
38
+ labels = tokenizer.decode(labels, skip_special_tokens=True)
39
+
40
+ # Some simple post-processing
41
+ return prediction, labels
42
+
43
+ # load test dataset from disk
44
+ # test_dataset = load_from_disk("data/eval/").with_format("torch")
45
+ list_input = [{"natural": "go to P03 and then go to P04, remain in P04 until P05","raw_ltl":"0"}]
46
+ test_dataset = datasets.Dataset.from_list(list_input)
47
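+ # NOTE: evaluate_peft_model above expects tokenized "input_ids"/"labels", while list_input holds raw strings;
+ # a minimal sketch of the missing tokenization step (the plain tokenizer call is an assumption, not the exact
+ # preprocessing used to build the saved eval dataset):
+ # def tokenize_sample(sample):
+ #     model_inputs = tokenizer(sample["natural"], truncation=True)
+ #     labels = tokenizer(sample["raw_ltl"], truncation=True)
+ #     return {"input_ids": model_inputs["input_ids"], "labels": labels["input_ids"]}
+ # test_dataset = test_dataset.map(tokenize_sample, remove_columns=["natural", "raw_ltl"]).with_format("torch")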
+ # run predictions
48
+ # this can take ~45 minutes
49
+ predictions, references = [] , []
50
+ for sample in tqdm(test_dataset):
51
+ p,l = evaluate_peft_model(sample)
52
+ print(p,l)
53
+ predictions.append(p)
54
+ references.append(l)
finetune/test.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
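+ # smoke test: zero-shot sentiment classification with the T0pp seq2seq model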
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
2
+
3
+ tokenizer = AutoTokenizer.from_pretrained("bigscience/T0pp")
4
+ model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp")
5
+
6
+ inputs = tokenizer.encode("Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy", return_tensors="pt")
7
+ outputs = model.generate(inputs)
8
+ print(tokenizer.decode(outputs[0]))