from transformers import AutoModelForCausalLM, AutoTokenizer
# device = "cuda" # the device to load the model onto
# from huggingface_hub import login
# login()
import json
import numpy as np
import sys, os
from datasets import load_dataset
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments,
                          pipeline, logging, TrainerCallback)
from peft import LoraConfig, PeftConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from accelerate import infer_auto_device_map, init_empty_weights
import wandb
from datasets import concatenate_datasets
# sys.path.append('../../../')
# sys.path.append('../../')
# sys.path.append('../')
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ['CUDA_VISIBLE_DEVICES'] = "5,6,7"
# device = torch.device("cuda:0-6" if torch.cuda.is_available() else "cpu")
sys.path.append(os.path.join(os.path.dirname(__file__), '../../'))
# import utils.util as util

# Load dataset from the hub
# dataset = load_dataset("samsum")
device = 'cuda'
np.random.seed(42)

output_dir = os.path.join(os.path.dirname(__file__), '../')
datapath = os.path.join(os.path.dirname(__file__), '../NL2TL-dataset/collect2')
exp_name = "_mid_ascii_0327_eos_2"

# Build a dictionary mapping each raw LTL formula to its candidate logic explanations.
explainer_files = ['LTLexplain_0.json', 'LTLexplain_1.json', 'LTLexplain_2.json', 'LTLexplain_3.json']
explainer_dic = {}
for path in explainer_files:
    with open(os.path.join(datapath, path)) as f:
        LTLlist = json.load(f)
    for key in LTLlist.keys():
        if isinstance(LTLlist[key], dict):
            if key not in explainer_dic:
                explainer_dic[key] = []
            explainer_dic[key].append(LTLlist[key]['translate'])
            sp = LTLlist[key]['explain'].split("means that")
            if len(sp) > 1:
                explainer_dic[key].append(sp[1])

base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=getattr(torch, "float16")
)
# NOTE: this 8-bit config overrides the 4-bit config above, so the run actually uses 8-bit quantization.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # llm_int8_threshold=200.0
    # bnb_4bit_use_double_quant = False,
    # bnb_4bit_quant_type = 'nf4',
    # bnb_4bit_compute_dtype = getattr(torch, "float16")
)

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
# os.environ['CUDA_VISIBLE_DEVICES']='0'
device_map = "auto"
# torch.cuda.set_device(7)
# device_map={'':torch.cuda.current_device()}
# device_map = {'':'cuda:7'}

# model_dir is the path or name of the model
# config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
# with init_empty_weights():
#     base_model = AutoModelForCausalLM.from_pretrained(
#         base_model_name,
#         from_tf=bool(".ckpt" in base_model_name),
#         quantization_config=bnb_config,
#         device_map=device_map,
#         trust_remote_code=True,
#         use_auth_token=True
#     )
# map_list = {5:"15GB", 6:"15GB", 7:"15GB"}  # memory limit for each GPU index
# map_list = {7:"15GB",}  # memory limit for each GPU index
# no_split_modules = base_model._no_split_modules
# device_map = infer_auto_device_map(base_model, max_memory=map_list, no_split_module_classes=no_split_modules)

dataset = load_dataset(
    "json",
    data_files={
        "train": os.path.join(datapath, "ltl_eng_train_mid_ascii_gptAuged.jsonl"),
        "test": os.path.join(datapath, "ltl_eng_test_mid_ascii_gptAuged.jsonl"),
    },
)
print(dataset)

# tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)  # , add_eos_token=True, trust_remote_code=True)
# NOTE: nothing states clearly whether add_eos_token is needed, but without it generation keeps going
# until max_new_tokens is reached; with add_eos_token=True training always failed, and enabling it made
# the model generate something else.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
# print(tokenizer.eos_token_id)  # 2
# print(tokenizer.bos_token_id)  # 1
# print(tokenizer._convert_token_to_id(tokenizer.bos_token))
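# Hedged sanity check (not in the original script): with the settings above the Mistral tokenizer
# should not append the EOS token on its own, so any </s> in the training text comes from the chat
# template applied below. This probe just makes that behaviour visible before training.
_probe_ids = tokenizer("hello world")["input_ids"]
print("tokenizer appends EOS by itself:", _probe_ids[-1] == tokenizer.eos_token_id)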

def preprocess_function(sample, padding="max_length"):
    # add prefix to the input for t5
    # print(sample[0])
    inputs = [
        f"""### Instruction:
translate natural description to linear temporal logic, first translate into a logical way, and then translate into linear temporal logic, pay specific attention to brackets '()'
### Natural Language Task:
{sample['natural'][i].strip()}
### Logic Translation:
{explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0, len(explainer_dic[sample['raw_ltl'][i].strip()]))]}
### linear temporal logic:
{sample['raw_ltl'][i].strip()}
""".lower()
        # NOTE: it seems the EOS is needed; the BOS is not (it is added automatically)
        for i in range(len(sample['natural']))]
    # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
    sample["complete_text"] = inputs
    return sample

# method1
# tokenized_dataset = dataset.map(preprocess_function, batched=True)

# method2
def preprocess_function2(sample, padding="max_length"):
    # build one chat-formatted training string per example: the user turn carries the natural-language
    # task, the assistant turn carries a sampled logic explanation plus the target LTL formula
    # print(sample[0])
    inputs = [
        tokenizer.apply_chat_template(
            [
                {"role": "user", "content": "translate natural description to linear temporal logic, first translate into a logical expression, and then translate into linear temporal logic, please pay specific attention to logic grammar, the natural language task is {}".format(sample['natural'][i].strip())},
                {"role": "assistant", "content": "logic expression is {}, and LTL is {} .".format(
                    explainer_dic[sample['raw_ltl'][i].strip()][np.random.randint(0, len(explainer_dic[sample['raw_ltl'][i].strip()]))],
                    sample['raw_ltl'][i].strip()
                )},
                # {"role": "user", "content": " pay specific attention to brackets '()', linear temporal logic is"},
                # {"role": "assistant", "content": "LTL is {} .".format(sample['raw_ltl'][i].strip())}
            ],
            tokenize=False)
        # NOTE: it seems the EOS is needed; the BOS is not (it is added automatically)
        for i in range(len(sample['natural']))]
    # inputs = ["## [instruction]: translate natural description in to LTL: ### [natural language]:" + sample['natural'][i]+'### [LTL]:'+sample['raw_ltl'][i] for i in (range(len(sample['natural'])))]
    sample["complete_text"] = inputs
    return sample

tokenized_dataset = dataset.map(preprocess_function2, batched=True)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# save datasets to disk for later easy loading
# tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
# tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
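# Hedged sketch (not in the original script): print one formatted training sample so the chat-template
# output ([INST] markers and </s> placement) can be eyeballed before training; "complete_text" is the
# column created by preprocess_function2 above.
print(tokenized_dataset['train'][0]['complete_text'])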

class PeftSavingCallback(TrainerCallback):
    # keep only the PEFT adapter weights in each checkpoint and drop the full model dump
    def on_save(self, args, state, control, **kwargs):
        checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
        kwargs["model"].save_pretrained(checkpoint_path)
        if "pytorch_model.bin" in os.listdir(checkpoint_path):
            os.remove(os.path.join(checkpoint_path, "pytorch_model.bin"))

callbacks = [PeftSavingCallback]

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=128,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)

training_arguments = TrainingArguments(
    output_dir=output_dir,
    logging_dir=os.path.join(output_dir, "logs"),
    per_device_train_batch_size=1,
    num_train_epochs=3,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    save_strategy='epoch',
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.05,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="wandb",
    evaluation_strategy="epoch",
    do_eval=True,
    run_name=base_model_name + exp_name,
    disable_tqdm=False
)

# note: training checkpoints still go to the parent output_dir captured by TrainingArguments above;
# this subdirectory is where the final adapter and tokenizer are saved
output_dir = os.path.join(output_dir, "mistral7b" + exp_name + 'aug1_quat8')

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    from_tf=bool(".ckpt" in base_model_name),
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=True
)
base_model.config.use_cache = False
# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1
base_model.gradient_checkpointing_enable()
base_model = prepare_model_for_kbit_training(base_model)
base_model = get_peft_model(base_model, peft_config)
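# Hedged sketch (not in the original script): report how many parameters the LoRA adapter actually
# trains; print_trainable_parameters() is provided by PEFT on the model returned by get_peft_model above.
base_model.print_trainable_parameters()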

trainer = SFTTrainer(
    model=base_model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    peft_config=peft_config,
    dataset_text_field="complete_text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    callbacks=callbacks,
    packing=False,
)

wandb.login()
trainer.train()
trainer.model.save_pretrained(output_dir)
# trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
wandb.finish()

# check
print('model dir', output_dir)

from peft import AutoPeftModelForCausalLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    from_tf=bool(".ckpt" in output_dir),
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=True
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.default_chat_template)

def evaluate_model(input_text):
    # plain-prompt variant (kept for reference; evaluate_model2 below is the one actually used)
    input_text = f"""### Instruction:
translate natural description to linear temporal logic, first translate into a logical way, and then translate into linear temporal logic, pay specific attention to brackets '()'
### Natural Language Task:
{input_text}""".lower()
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    print(inputs)
    outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def evaluate_model2(input_text):
    messages = [
        {"role": "user", "content": "translate natural description to linear temporal logic, first translate into a logical way, and then translate into linear temporal logic, pay specific attention to brackets '()', natural language task: {}".format(input_text)},
    ]
    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
    outputs = model.generate(encodeds, max_new_tokens=512)  # , pad_token_id=tokenizer.eos_token_id)
    # input_text = f"""### Instruction:
    # translate natural description to linear temporal logic, first translate into a logical way, and then translate into linear temporal logic, pay specific attention to brackets '()'
    # ### Natural Language Task:
    # {input_text}""".lower()
    # inputs = tokenizer(input_text, return_tensors="pt").to(device)
    # print(inputs)
    # outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"), max_new_tokens=512, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# if __name__=='__main__':
import evaluate
from datasets import load_from_disk
from tqdm import tqdm

# Metric
metric = evaluate.load("rouge")

# load test dataset from disk
# test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")

# run predictions
# this can take ~45 minutes
import re

# extract the formula after "LTL is", matching the assistant format used in preprocess_function2
pattern = re.compile(r"LTL is ([\S ]*)")
predictions, references, input_sentence, output_sentence = [], [], [], []
for idx in range(len(tokenized_dataset['test']['natural'])):
    # print(sample)
    nl = tokenized_dataset['test']['natural'][idx]
    p = evaluate_model2(nl)
    # print(p,l)
    input_sentence.append(nl)
    transLTL = pattern.findall(p)
    print(p)
    if transLTL[0][-1] == '.':
        transLTL[0] = transLTL[0][:-1].strip()
    else:
        transLTL[0] = transLTL[0].strip()
    predictions.append(transLTL[0])
    output_sentence.append(p)
    references.append(tokenized_dataset['test']['raw_ltl'][idx].strip())
    print(input_sentence[-1], '\nout::\n', output_sentence[-1], '\npre::\n', predictions[-1], '\nref::\n', references[-1], '\n', '-' * 20, '\n')

# compute metric
rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# print results
print(f"rouge1: {rouge['rouge1'] * 100:.2f}%")
print(f"rouge2: {rouge['rouge2'] * 100:.2f}%")
print(f"rougeL: {rouge['rougeL'] * 100:.2f}%")
print(f"rougeLsum: {rouge['rougeLsum'] * 100:.2f}%")

import pandas as pd
eval_output = pd.DataFrame(np.array([input_sentence, predictions, references]).T,
                           columns=['natural', 'prediction', 'reference'])
eval_output.to_csv(os.path.join(output_dir, 'output'))
exit()

messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"}
]
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
model_inputs = encodeds.to(device)
model.to(device)
generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])