# NL2HLTL: NL2HLTLTranslator/T5_XXL/t5_lora_fintune.py
# %%
from datasets import load_dataset
import os
import sys
# Load dataset from the hub
# dataset = load_dataset("samsum")
# datapath='LTL_datasets/collect/'
exp_name="/tf-ltl_eng_test_mid_ascii_gptAuged"
# output_dir = os.path.join(output_dir, "llama2_13b_"+exp_name+'aug1')
# dataset = load_dataset("json", data_files={"train":datapath+"ltl_eng_train"+exp_name+".jsonl","test":datapath+"ltl_eng_test"+exp_name+".jsonl"})
# print(dataset)
dataset = load_dataset("json", data_files={"train":"LTL_datasets/collect/ltl_eng_train_mid_ascii_gptAuged.jsonl","test":"LTL_datasets/collect/ltl_eng_test_mid_ascii_gptAuged.jsonl"})
print(dataset)
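# Sanity check (illustrative, not part of the original script): the
# preprocessing below assumes each JSONL record carries a "natural"
# (English instruction) field and a "raw_ltl" (target LTL formula) field.
assert {"natural", "raw_ltl"} <= set(dataset["train"].column_names)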
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES']='4,5'
# %%
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id="google/flan-t5-xxl"
# Load tokenizer of FLAN-t5-XL
tokenizer = AutoTokenizer.from_pretrained(model_id)
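# Quick illustrative check of the tokenizer (the sentence here is an arbitrary
# example, not taken from the dataset; this is not part of the training pipeline).
example_ids = tokenizer("visit the kitchen and then the bedroom")["input_ids"]
print(f"example token count: {len(example_ids)}")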
# %%
from datasets import concatenate_datasets
import numpy as np
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["natural"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
input_lengths = [len(x) for x in tokenized_inputs["input_ids"]]
# use the full (100th percentile) input length so no source sequence is truncated
max_source_length = int(np.percentile(input_lengths, 100))
print(f"Max source length: {max_source_length}")
# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["raw_ltl"], truncation=True), batched=True, remove_columns=["raw_ltl", "natural"])
target_lengths = [len(x) for x in tokenized_targets["input_ids"]]
# use the full (100th percentile) target length so no target sequence is truncated
max_target_length = int(np.percentile(target_lengths, 100))
print(f"Max target length: {max_target_length}")
# %%
def preprocess_function(sample, padding="max_length"):
    # add prefix to the input for t5
    inputs = ["Generate LTL: " + item for item in sample["natural"]]
    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["raw_ltl"], max_length=max_target_length, padding=padding, truncation=True)
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
    # when we want to ignore padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["natural", "raw_ltl"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk("data/train"+exp_name)
tokenized_dataset["test"].save_to_disk("data/eval"+exp_name)
# %%
from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel, PeftConfig
# huggingface hub model id
model_id = "philschmid/flan-t5-xxl-sharded-fp16"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
# peft_model_id="finetuned_model/results"+"_mid_ascii"
# config = PeftConfig.from_pretrained(peft_model_id)
# # load base LLM model and tokenizer
# model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, device_map="auto")
# # Load the Lora model
# model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
# # load model from the hub
print(model)
# exit()
# %%
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
# Define LoRA Config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)
# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817
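# Optional sanity check (illustrative): list a few of the modules that received
# LoRA weights. In T5, the target names "q" and "v" match the query/value
# projections of every attention block.
lora_modules = [n for n, _ in model.named_modules() if "lora_" in n]
print(f"{len(lora_modules)} LoRA modules, e.g. {lora_modules[:2]}")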
# %%
from transformers import DataCollatorForSeq2Seq
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)
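# Illustrative check (not required for training): collate two tokenized examples
# and look at the padded tensor shapes; label padding uses -100 as configured above.
_batch = data_collator([tokenized_dataset["train"][i] for i in range(2)])
print({k: tuple(v.shape) for k, v in _batch.items()})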
# %%
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
output_dir="lora-flan-t5-xxl"
# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,  # higher learning rate
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="tensorboard",
)
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
# %%
# train model
trainer.train()
# %%
# Save our LoRA model & tokenizer results
peft_model_id="finetuned_model/"+exp_name
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# if you want to save the base model to call
# trainer.model.base_model.save_pretrained(peft_model_id)
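# To reload the adapter later for inference (illustrative sketch, mirroring the
# commented-out loading code earlier in this script; paths and flags are assumptions):
# from peft import PeftModel, PeftConfig
# config = PeftConfig.from_pretrained(peft_model_id)
# base = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
# model = PeftModel.from_pretrained(base, peft_model_id, device_map="auto")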
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
# Metric
metric = evaluate.load("rouge")
def evaluate_peft_model(sample, max_target_length=128):
    # generate the LTL prediction for one sample
    outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
    # decode eval sample
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
    # print(labels)
    labels = tokenizer.decode(labels, skip_special_tokens=True)
    # print(labels)
    # Some simple post-processing
    input_sentence = " ".join(tokenizer.batch_decode(sample["input_ids"].detach().cpu().numpy(), skip_special_tokens=True))
    # print("input sentence: {}\n{}".format(input_sentence, '---' * 20))
    # output_LTL=tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    # expect_LTL=labels
    # print(f"pre_LTL:\n{prediction}\nexp_LTL:\n{labels}")
    return prediction, labels, input_sentence
# load test dataset from disk
test_dataset = load_from_disk("data/eval"+exp_name+'/').with_format("torch")
# run predictions
# this can take ~45 minutes
predictions, references, input_sentence = [], [], []
idx = 0
for sample in tqdm(test_dataset):
    # print(sample)
    p, l, nl = evaluate_peft_model(sample)
    # print(p,l)
    input_sentence.append(nl)
    predictions.append(p)
    references.append(l)
    idx += 1
    print(idx, '\n', input_sentence[-1], '\npre::\n', predictions[-1], '\nref::\n', references[-1], '\n', '-'*20, '\n')
# compute metric
rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)
# print results
print(f"rouge1: {rouge['rouge1']* 100:2f}%")
print(f"rouge2: {rouge['rouge2']* 100:2f}%")
print(f"rougeL: {rouge['rougeL']* 100:2f}%")
print(f"rougeLsum: {rouge['rougeLsum']* 100:2f}%")
# dump (input sentence, prediction, reference) triples to CSV for inspection
eval_output = np.array([input_sentence, predictions, references]).T
import pandas as pd
eval_output = pd.DataFrame(eval_output)
eval_output.to_csv(peft_model_id + '/output')
# rouge1: 98.292692%
# rouge2: 95.766211%
# rougeL: 97.086188%
# rougeLsum: 97.084262%