import argparse
import json
import timeit

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('use transformers version = ', transformers.__version__)


def add_special_tokens(tokenizer):
    """Return the GPT-2 tokenizer after adding BOS, EOS, padding, and separator tokens."""
    special_tokens = {
        'bos_token': '<|startoftext|>',
        'eos_token': '<|endoftext|>',
        'pad_token': '<|pad|>',
        'sep_token': '<|summarize|>',
    }
    tokenizer.add_special_tokens(special_tokens)
    return tokenizer
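
# Example (a sketch, not part of the original script; it relies only on the
# function above): after adding the special tokens, '<|summarize|>' maps to a
# single token ID instead of being split into sub-word pieces, so it can be
# used as a prompt separator.
#
#   tok = add_special_tokens(GPT2Tokenizer.from_pretrained('gpt2'))
#   print(tok.convert_tokens_to_ids('<|summarize|>'))  # one new token ID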


class GPT21024Dataset(Dataset):
    """Pairs a list of articles (ctext) with their reference summaries (text)."""

    def __init__(self, text, ctext, tokenizer, source_len, summ_len):
        self.tokenizer = add_special_tokens(tokenizer)
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = text
        self.ctext = ctext

    def __len__(self):
        return len(self.ctext)

    def __getitem__(self, index):
        # Collapse runs of whitespace in the article and its reference summary.
        ctext = ' '.join(str(self.ctext[index]).split())
        text = ' '.join(str(self.text[index]).split())

        tok_data = {'article': ctext, 'summary': text}

        # Prompt format: <|startoftext|> article <|summarize|>
        input_text = '<|startoftext|>' + tok_data['article'] + '<|summarize|>'

        # Use the configured lengths rather than a hardcoded 512 for both fields.
        content = self.tokenizer.encode(
            input_text, max_length=self.source_len, padding='max_length', truncation=True)
        summary_target_ids = self.tokenizer.encode(
            tok_data['summary'], max_length=self.summ_len, padding='max_length', truncation=True)

        sample = {
            'article': torch.tensor(content),
            'actual_summary': torch.tensor(summary_target_ids),
            # Unpadded token length of the raw article (no special tokens).
            'sum_idx': len(self.tokenizer.encode(tok_data['article'])),
        }
        return sample
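
# Shape sketch (assumed values, for orientation only): with source_len=512 and
# summ_len=150, each sample looks like
#   {'article': LongTensor[512],        # <|startoftext|> article <|summarize|> + <|pad|>...
#    'actual_summary': LongTensor[150], # reference summary + padding
#    'sum_idx': int}                    # unpadded article length in tokens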


def gpt_eval(
        verbose=True,
        model_name_path=None,
        src_txt=None,
        tar_txt=None,
        gen_path=None,
        scor_path=None,
        batch_size=4,
):
    """Generate beam-search summaries for the articles in src_txt with the
    GPT-2 checkpoint at model_name_path and write them, one per line, to
    gen_path. tar_txt supplies the reference summaries, which are decoded
    alongside the predictions; scor_path is accepted but unused here."""
    predictions = []
    actuals = []

    model = GPT2LMHeadModel.from_pretrained(model_name_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name_path)

    # If the checkpoint was saved without the special tokens, extend the
    # vocabulary and resize the embedding matrix to match before decoding:
    # special_tokens = {'pad_token': '<|pad|>', 'sep_token': '<|summarize|>'}
    # tokenizer.add_special_tokens(special_tokens)
    # assert len(tokenizer) == 50261, "tokenizer size is not 50261"
    # model.resize_token_embeddings(len(tokenizer))

    model = model.to(device)
    model.eval()

    # Single-example encoding variant, kept for reference:
    # input_text = input_text + ' <|summarize|>'
    # input_token = tokenizer.encode(input_text)
    # input_token_torch = torch.tensor(input_token, dtype=torch.long)

    val_params = {
        'batch_size': batch_size,
        'shuffle': False,
        'num_workers': 0,
    }

    # One article per line in src_txt, one reference summary per line in tar_txt.
    with open(src_txt, 'r') as sp:
        src = sp.readlines()
    with open(tar_txt, 'r') as tp:
        tar = tp.readlines()

    val_set = GPT21024Dataset(text=tar, ctext=src, tokenizer=tokenizer,
                              source_len=512, summ_len=150)
    val_loader = DataLoader(val_set, **val_params)

    beam_size = 4

    with torch.no_grad():
        for step, data in enumerate(val_loader, 0):
            target_output = data['actual_summary'].to(device, dtype=torch.long)
            input_ids = data['article'].to(device)

            if verbose:
                print(f'Length of the input context: {len(input_ids[0])}')
                print(f'BEAM SIZE: {beam_size}')

            # The prompt already ends with the <|summarize|> separator, so no
            # decoder_start_token_id is needed (GPT-2 is decoder-only, and that
            # argument expects a token ID, not a string, in any case).
            # max_length / min_length count the 512 prompt tokens, so this asks
            # beam search for roughly 50-70 new tokens per article.
            generated_output = model.generate(
                input_ids=input_ids,
                # Mask out <|pad|> positions so beam search ignores padding.
                attention_mask=input_ids.ne(tokenizer.pad_token_id).long(),
                max_length=582,
                min_length=562,
                temperature=1.0,
                pad_token_id=tokenizer.pad_token_id,
                num_beams=beam_size,
                num_return_sequences=1)

            preds = []
            target = []
            # Decode only the tokens generated after the prompt.
            for g in generated_output:
                preds.append(tokenizer.decode(g[len(input_ids[0]):], skip_special_tokens=True))
            for t in target_output:
                target.append(tokenizer.decode(t, skip_special_tokens=True))

            if step % 100 == 0:
                print(f'Completed {step}')

            predictions.extend(preds)
            actuals.extend(target)

    with open(gen_path, 'w') as gp:
        for pre in predictions:
            gp.write(pre + "\n")
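

# A minimal command-line entry point. This is a sketch, not part of the original
# script: the flag names are assumptions chosen to mirror gpt_eval's parameters
# (the file imports argparse but never builds a parser).
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate summaries with a fine-tuned GPT-2 checkpoint.')
    parser.add_argument('--model_name_path', required=True,
                        help='Path to the fine-tuned GPT-2 checkpoint.')
    parser.add_argument('--src_txt', required=True,
                        help='File with one article per line.')
    parser.add_argument('--tar_txt', required=True,
                        help='File with one reference summary per line.')
    parser.add_argument('--gen_path', required=True,
                        help='Output file for the generated summaries.')
    parser.add_argument('--scor_path', default=None,
                        help='Reserved for a scores file (unused by gpt_eval).')
    parser.add_argument('--batch_size', type=int, default=4)
    args = parser.parse_args()

    gpt_eval(model_name_path=args.model_name_path,
             src_txt=args.src_txt,
             tar_txt=args.tar_txt,
             gen_path=args.gen_path,
             scor_path=args.scor_path,
             batch_size=args.batch_size)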