File size: 3,625 Bytes

bacb17b

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Experiment settings. `exp_name` is the suffix shared by the adapter
# directory here and by the dataset paths used later in the script.
exp_name = "_mid_ascii"
peft_model_id = f"finetuned_model/results{exp_name}2"
max_target_length = 128

# The adapter config names the base checkpoint that is loaded below.
config = PeftConfig.from_pretrained(peft_model_id)

# Base seq2seq model (8-bit weights, automatic device placement) and its tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
)

# Wrap the base model with the LoRA adapter and put it in evaluation mode.
model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
model.eval()

print("Peft model loaded")

from datasets import load_dataset
from random import randrange


# Load the train/test JSON-lines files that belong to this experiment.
datapath = 'LTL_datasets/collect/'
data_files = {
    "train": f"{datapath}ltl_eng_train{exp_name}.jsonl",
    "test": f"{datapath}ltl_eng_test{exp_name}.jsonl",
}
dataset = load_dataset("json", data_files=data_files)
print(dataset)

# Pick one random test example and run it through the model.
test_split = dataset["test"]
sample = test_split[randrange(len(test_split))]

# Tokenize the natural-language field and move the ids to the GPU.
encoded = tokenizer(sample["natural"], return_tensors="pt", truncation=True)
input_ids = encoded.input_ids.cuda()

# Nucleus sampling (top_p=0.9), capped at max_target_length new tokens.
outputs = model.generate(
    input_ids=input_ids,
    max_new_tokens=max_target_length,
    do_sample=True,
    top_p=0.9,
)
print(f"input sentence: {sample['natural']}\n{'---'* 20}")

# Decode the generated batch and show its first (only) entry.
decoded = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
print(f"summary:\n{decoded[0]}")


import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# Metric: ROUGE, loaded through the `evaluate` library; `metric.compute`
# is called further down with the collected predictions and references.
metric = evaluate.load("rouge")

def evaluate_peft_model(sample, max_target_length=128):
    """Generate a prediction for one tokenized sample and decode it with its reference.

    Relies on the module-level ``model`` and ``tokenizer`` and sends the
    input ids to the GPU with ``.cuda()``. Generation uses sampling
    (``do_sample=True``, ``top_p=0.9``), so repeated calls on the same
    sample can return different predictions.

    Args:
        sample: Mapping with ``"input_ids"`` and ``"labels"`` entries.
            ``input_ids`` is treated as a single 1-D sequence (it is
            unsqueezed to a batch of one). ``labels`` may contain ``-100``
            placeholders, which are swapped for the pad token id before
            decoding.
        max_target_length: Maximum number of new tokens to generate.

    Returns:
        Tuple ``(prediction, labels, input_sentence)`` of decoded strings:
        the generated text, the reference text and the source sentence.
    """
    source_ids = sample["input_ids"]

    # generate summary
    outputs = model.generate(
        input_ids=source_ids.unsqueeze(0).cuda(),
        do_sample=True,
        top_p=0.9,
        max_new_tokens=max_target_length,
    )
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)

    # decode eval sample
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
    labels = tokenizer.decode(labels, skip_special_tokens=True)

    # Decode the source ids as ONE sequence. batch_decode on a 1-D array
    # would treat every token id as its own sequence, so joining the pieces
    # with spaces splits sub-word tokens and leaves stray spaces where
    # special tokens were skipped.
    input_sentence = tokenizer.decode(source_ids.detach().cpu().numpy(), skip_special_tokens=True)

    print("input sentence: {}\n{}".format(input_sentence,'---'* 20))
    print(f"pre_LTL:\n{prediction}\nexp_LTL:\n{labels}")
    return prediction, labels, input_sentence

# Load the saved evaluation dataset from disk, formatted as torch tensors.
test_dataset = load_from_disk(f"data/eval{exp_name}/").with_format("torch")

# Run the model over every test sample and collect the decoded strings.
# this can take ~45 minutes
predictions, references, input_sentence = [], [], []
for sample in tqdm(test_dataset):
    pred_text, ref_text, source_text = evaluate_peft_model(sample)
    input_sentence.append(source_text)
    predictions.append(pred_text)
    references.append(ref_text)

# ROUGE over all prediction/reference pairs.
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# Print each score as a percentage; labels are kept exactly as the script
# has always printed them.
score_labels = (
    ("Rogue1", "rouge1"),
    ("rouge2", "rouge2"),
    ("rougeL", "rougeL"),
    ("rougeLsum", "rougeLsum"),
)
for label, key in score_labels:
    print(f"{label}: {rogue[key]* 100:2f}%")

# One row per sample (input sentence, prediction, reference), written as
# CSV inside the adapter directory.
import pandas as pd
eval_output = pd.DataFrame(np.array([input_sentence, predictions, references]).T)
eval_output.to_csv(peft_model_id + '/output')
# Rogue1: 50.386161%
# rouge2: 24.842412%
# rougeL: 41.370130%
# rougeLsum: 41.394230%