# NL2HLTL/NL2HLTLTranslator/T5_XXL/t5_lora_evaluate.py
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Experiment suffix and path to the fine-tuned LoRA checkpoint
exp_name = "_mid_ascii"
peft_model_id = "finetuned_model/results" + exp_name + '2'
max_target_length = 128

# Load the PEFT config of the checkpoint (it records the base model name)
config = PeftConfig.from_pretrained(peft_model_id)
# Load the base LLM (8-bit quantization via bitsandbytes, sharded automatically) and its tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
model.eval()
print("Peft model loaded")
from datasets import load_dataset
from random import randrange
# Load the train/test JSONL splits from local files
datapath = 'LTL_datasets/collect/'
dataset = load_dataset(
    "json",
    data_files={
        "train": datapath + "ltl_eng_train" + exp_name + ".jsonl",
        "test": datapath + "ltl_eng_test" + exp_name + ".jsonl",
    },
)
print(dataset)
# Pick a random test sample and run a single generation as a sanity check
sample = dataset['test'][randrange(len(dataset["test"]))]
input_ids = tokenizer(sample["natural"], return_tensors="pt", truncation=True).input_ids.cuda()
with torch.inference_mode():
    outputs = model.generate(input_ids=input_ids, max_new_tokens=max_target_length, do_sample=True, top_p=0.9)
print(f"input sentence: {sample['natural']}\n{'---' * 20}")
print(f"predicted LTL:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
# Metric
metric = evaluate.load("rouge")
def evaluate_peft_model(sample, max_target_length=128):
    # Generate the predicted LTL formula for one tokenized test sample
    outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
    # Decode the reference: replace -100 (the loss ignore index) with the pad token id so it can be decoded
    labels = sample["labels"].numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = tokenizer.decode(labels, skip_special_tokens=True)
    # Recover the natural-language input for logging
    input_sentence = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
    print("input sentence: {}\n{}".format(input_sentence, '---' * 20))
    print(f"pre_LTL:\n{prediction}\nexp_LTL:\n{labels}")
    return prediction, labels, input_sentence
# Load the preprocessed (tokenized) test dataset from disk
test_dataset = load_from_disk("data/eval" + exp_name + '/').with_format("torch")
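
# Optional sanity check (an assumption, not part of the original script): the
# preprocessed dataset is expected to expose the tokenized "input_ids" and
# "labels" columns that the evaluation loop below relies on.
assert {"input_ids", "labels"} <= set(test_dataset.column_names)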
# Run predictions over the full test set (this can take ~45 minutes)
predictions, references, input_sentences = [], [], []
for sample in tqdm(test_dataset):
    p, l, nl = evaluate_peft_model(sample)
    input_sentences.append(nl)
    predictions.append(p)
    references.append(l)
# Compute ROUGE between predicted and reference LTL formulas
rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)
# Print results
print(f"rouge1: {rouge['rouge1'] * 100:.2f}%")
print(f"rouge2: {rouge['rouge2'] * 100:.2f}%")
print(f"rougeL: {rouge['rougeL'] * 100:.2f}%")
print(f"rougeLsum: {rouge['rougeLsum'] * 100:.2f}%")
# Save (input sentence, prediction, reference) triples as CSV next to the checkpoint
import pandas as pd
eval_output = pd.DataFrame(
    np.array([input_sentences, predictions, references]).T,
    columns=["natural", "prediction", "reference"],
)
eval_output.to_csv(peft_model_id + '/output')
# Results from a previous run:
# rouge1: 50.386161%
# rouge2: 24.842412%
# rougeL: 41.370130%
# rougeLsum: 41.394230%