# NL2HLTL/NL2HLTLTranslator/T5_XXL/t5_lora_evaluate.py
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Experiment suffix and path to the fine-tuned LoRA checkpoint
exp_name = "_mid_ascii"
peft_model_id = "finetuned_model/results" + exp_name + '2'
max_target_length = 128

# Load the PEFT config of the checkpoint (it records the base model name)
config = PeftConfig.from_pretrained(peft_model_id)
# Load the base LLM (8-bit quantization via bitsandbytes, sharded automatically) and its tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
model.eval()
print("Peft model loaded")
from datasets import load_dataset
from random import randrange
# Load the train/test JSONL splits from local files
datapath = 'LTL_datasets/collect/'
dataset = load_dataset(
    "json",
    data_files={
        "train": datapath + "ltl_eng_train" + exp_name + ".jsonl",
        "test": datapath + "ltl_eng_test" + exp_name + ".jsonl",
    },
)
print(dataset)
# Pick a random test sample and run a single generation as a sanity check
sample = dataset['test'][randrange(len(dataset["test"]))]
input_ids = tokenizer(sample["natural"], return_tensors="pt", truncation=True).input_ids.cuda()
with torch.inference_mode():
    outputs = model.generate(input_ids=input_ids, max_new_tokens=max_target_length, do_sample=True, top_p=0.9)
print(f"input sentence: {sample['natural']}\n{'---' * 20}")
print(f"predicted LTL:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
# Metric
metric = evaluate.load("rouge")
def evaluate_peft_model(sample, max_target_length=128):
    # Generate the predicted LTL formula for one tokenized test sample
    outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).cuda(), do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
    # Decode the reference: replace -100 (the loss ignore index) with the pad token id so it can be decoded
    labels = sample["labels"].numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    labels = tokenizer.decode(labels, skip_special_tokens=True)
    # Recover the natural-language input for logging
    input_sentence = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
    print("input sentence: {}\n{}".format(input_sentence, '---' * 20))
    print(f"pre_LTL:\n{prediction}\nexp_LTL:\n{labels}")
    return prediction, labels, input_sentence
# Load the preprocessed (tokenized) test dataset from disk
test_dataset = load_from_disk("data/eval" + exp_name + '/').with_format("torch")
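
# Optional sanity check (an assumption, not part of the original script): the
# preprocessed dataset is expected to expose the tokenized "input_ids" and
# "labels" columns that the evaluation loop below relies on.
assert {"input_ids", "labels"} <= set(test_dataset.column_names)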
# Run predictions over the full test set (this can take ~45 minutes)
predictions, references, input_sentences = [], [], []
for sample in tqdm(test_dataset):
    p, l, nl = evaluate_peft_model(sample)
    input_sentences.append(nl)
    predictions.append(p)
    references.append(l)
# Compute ROUGE between predicted and reference LTL formulas
rouge = metric.compute(predictions=predictions, references=references, use_stemmer=True)
# Print results
print(f"rouge1: {rouge['rouge1'] * 100:.2f}%")
print(f"rouge2: {rouge['rouge2'] * 100:.2f}%")
print(f"rougeL: {rouge['rougeL'] * 100:.2f}%")
print(f"rougeLsum: {rouge['rougeLsum'] * 100:.2f}%")
# Save (input sentence, prediction, reference) triples as CSV next to the checkpoint
import pandas as pd
eval_output = pd.DataFrame(
    np.array([input_sentences, predictions, references]).T,
    columns=["natural", "prediction", "reference"],
)
eval_output.to_csv(peft_model_id + '/output')
# Results from a previous run:
# rouge1: 50.386161%
# rouge2: 24.842412%
# rougeL: 41.370130%
# rougeLsum: 41.394230%