# NL2HLTL: NL2HLTLTranslator/T5_XXL/t5_lora_fintune.py
# %%
from datasets import load_dataset
import os
import sys
# Load dataset from the hub
# dataset = load_dataset("samsum")
# datapath='LTL_datasets/collect/'
exp_name="/tf-ltl_eng_test_mid_ascii_gptAuged"
# output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'aug1')
# dataset = load_dataset("json", data_files={"train":datapath+"ltl_eng_train"+exp_name+".jsonl","test":datapath+"ltl_eng_test"+exp_name+".jsonl"})
# print(dataset)
dataset = load_dataset("json", data_files={"train":"LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl","test":"LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl"})
print(dataset)
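# Sanity check (illustrative, not part of the original script): the
# preprocessing below assumes each JSONL record carries a "natural"
# (English instruction) field and a "raw_ltl" (target LTL formula) field.
assert {"natural", "raw_ltl"} <= set(dataset["train"].column_names)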
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES']='4,5'
# %%
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id="google/flan-t5-xxl"
# Load tokenizer of FLAN-t5-XL
tokenizer = AutoTokenizer.from_pretrained(model_id)
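# Quick illustrative check of the tokenizer (the sentence here is an arbitrary
# example, not taken from the dataset; this is not part of the training pipeline).
example_ids = tokenizer("visit the kitchen and then the bedroom")["input_ids"]
print(f"example token count: {len(example_ids)}")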
# %%
from datasets import concatenate_datasets
import numpy as np
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["natural"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
input_lengths = [len(x) for x in tokenized_inputs["input_ids"]]
# use the full (100th percentile) input length so no source sequence is truncated
max_source_length = int(np.percentile(input_lengths, 100))
print(f"Max source length: {max_source_length}")
# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["raw_ltl"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
target_lengths = [len(x) for x in tokenized_targets["input_ids"]]
# use the full (100th percentile) target length so no target sequence is truncated
max_target_length = int(np.percentile(target_lengths, 100))
print(f"Max target length: {max_target_length}")
# %%
def preprocess_function(sample, padding="max_length"):
    # add prefix to the input for t5
    inputs = ["Generate LTL: " + item for item in sample["natural"]]
    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["raw_ltl"], max_length=max_target_length, padding=padding, truncation=True)
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
    # when we want to ignore padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["natural", "raw_ltl"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
# %%
from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel, PeftConfig
# huggingface hub model id
model_id = "philschmid/flan-t5-xxl-sharded-fp16"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
# peft_model_id="finetuned_model/results"+"_mid_ascii"
# config = PeftConfig.from_pretrained(peft_model_id)
# # load base LLM model and tokenizer
# model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, device_map="auto")
# # Load the Lora model
# model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
# # load model from the hub
print(model)
# exit()
# %%
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
# Define LoRA Config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)
# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817
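# Optional sanity check (illustrative): list a few of the modules that received
# LoRA weights. In T5, the target names "q" and "v" match the query/value
# projections of every attention block.
lora_modules = [n for n, _ in model.named_modules() if "lora_" in n]
print(f"{len(lora_modules)} LoRA modules, e.g. {lora_modules[:2]}")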
# %%
from transformers import DataCollatorForSeq2Seq
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)
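# Illustrative check (not required for training): collate two tokenized examples
# and look at the padded tensor shapes; label padding uses -100 as configured above.
_batch = data_collator([tokenized_dataset["train"][i] for i in range(2)])
print({k: tuple(v.shape) for k, v in _batch.items()})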
# %%
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
output_dir="lora-flan-t5-xxl"
# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,  # higher learning rate
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="tensorboard",
)
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
# %%
# train model
trainer.train()
# %%
# Save our LoRA model & tokenizer results
peft_model_id="finetuned_model/"+exp_name
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# if you want to save the base model to call
# trainer.model.base_model.save_pretrained(peft_model_id)
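# To reload the adapter later for inference (illustrative sketch, mirroring the
# commented-out loading code earlier in this script; paths and flags are assumptions):
# from peft import PeftModel, PeftConfig
# config = PeftConfig.from_pretrained(peft_model_id)
# base = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
# model = PeftModel.from_pretrained(base, peft_model_id, device_map="auto")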
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
# Metric
metric = evaluate.load("rouge")
def evaluate_peft_model(sample, max_target_length=128):
    # generate the LTL prediction for one sample
    outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
    # decode eval sample
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
    # print(labels)
    labels = tokenizer.decode(labels, skip_special_tokens=True)
    # print(labels)
    # Some simple post-processing
    input_sentence = " ".join(tokenizer.batch_decode(sample["input_ids"].detach().cpu().numpy(), skip_special_tokens=True))
    # print("input sentence: {}\n{}".format(input_sentence, '---' * 20))
    # output_LTL=tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    # expect_LTL=labels
    # print(f"pre_LTL:\n{prediction}\nexp_LTL:\n{labels}")
    return prediction, labels, input_sentence
# load test dataset from disk
test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
# run predictions
# this can take ~45 minutes
predictions, references, input_sentence = [], [], []
idx = 0
for sample in tqdm(test_dataset):
    # print(sample)
    p, l, nl = evaluate_peft_model(sample)
    # print(p,l)
    input_sentence.append(nl)
    predictions.append(p)
    references.append(l)
    idx += 1
    print(idx, '\n', input_sentence[-1], '\npre::\n', predictions[-1], '\nref::\n', references[-1], '\n', '-'*20, '\n')
# compute metric
rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)
# print results
print(f"rouge1: {rouge['rouge1']* 100:2f}%")
print(f"rouge2: {rouge['rouge2']* 100:2f}%")
print(f"rougeL: {rouge['rougeL']* 100:2f}%")
print(f"rougeLsum: {rouge['rougeLsum']* 100:2f}%")
# dump (input sentence, prediction, reference) triples to CSV for inspection
eval_output = np.array([input_sentence, predictions, references]).T
import pandas as pd
eval_output = pd.DataFrame(eval_output)
eval_output.to_csv(peft_model_id + '/output')
# rouge1: 98.292692%
# rouge2: 95.766211%
# rougeL: 97.086188%
# rougeLsum: 97.084262%