# -*- coding: utf-8 -*-
"""evaluate.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_WZN6_5mgwRgg484xzXMSwCXBQXfr8Vj
"""

# print("**************OUTPUT FILE PATH UPDATED FOR SEED 42 hinglish ******************")

import torch
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The padding= and truncation= keyword arguments used below require
# transformers >= 3.0 (they did not exist in 2.6.0).
print('use transformers version =', transformers.__version__)


def add_special_tokens(tokenizer):
    """Return the GPT-2 tokenizer after adding BOS/EOS, padding and separator tokens."""
    special_tokens = {'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>',
                      'pad_token': '<|pad|>', 'sep_token': '<|summarize|>'}
    tokenizer.add_special_tokens(special_tokens)
    return tokenizer


class GPT21024Dataset(Dataset):
    """Pairs each dialogue (ctext) with its reference summary (text), tokenized for GPT-2."""

    def __init__(self, text, ctext, tokenizer, source_len, summ_len):
        self.tokenizer = add_special_tokens(tokenizer)
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = text    # reference summaries (e.g. 'summary-hinglish')
        self.ctext = ctext  # articles/dialogues to summarize (e.g. 'dialogue-hinglish')

    def __len__(self):
        return len(self.ctext)

    def __getitem__(self, index):
        # Normalize whitespace in the article and the reference summary.
        ctext = ' '.join(str(self.ctext[index]).split())
        text = ' '.join(str(self.text[index]).split())

        # Prompt format: <|startoftext|> article <|summarize|>
        input_text = '<|startoftext|>' + ctext + '<|summarize|>'

        content = self.tokenizer.encode(input_text, max_length=512,
                                        padding='max_length', truncation=True)
        summary_target_ids = self.tokenizer.encode(text, max_length=512,
                                                   padding='max_length', truncation=True)

        return {'article': torch.tensor(content),
                'actual_summary': torch.tensor(summary_target_ids),
                'sum_idx': len(self.tokenizer.encode(ctext))}
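
# A minimal sketch (not part of the original pipeline) showing what one dataset
# item looks like. The base 'gpt2' tokenizer and the toy dialogue/summary
# strings are assumptions for illustration only.
def _demo_dataset_item():
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    ds = GPT21024Dataset(text=['Speaker2 ko hello bola gaya.'],
                         ctext=['Speaker1: hello! Speaker2: hi, kaise ho?'],
                         tokenizer=tokenizer, source_len=512, summ_len=150)
    sample = ds[0]
    print(sample['article'].shape)         # torch.Size([512]): right-padded prompt ids
    print(sample['actual_summary'].shape)  # torch.Size([512]): right-padded target ids
    print(sample['sum_idx'])               # unpadded token length of the article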
def gpt_eval(verbose=True, model_name_path=None, src_txt=None, tar_txt=None,
             gen_path=None, scor_path=None, batch_size=4):
    """Generate summaries for the articles in src_txt with a fine-tuned GPT-2
    checkpoint and write them, one per line, to gen_path.

    The checkpoint at model_name_path is expected to already contain the added
    special tokens (resize_token_embeddings was done at fine-tuning time), so
    no vocabulary resizing is needed here. scor_path is accepted for a
    downstream scoring step; this function does not write to it.
    """
    predictions = []
    actuals = []

    model = GPT2LMHeadModel.from_pretrained(model_name_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name_path)

    model = model.to(device)
    model.eval()

    val_params = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 0}

    with open(src_txt, 'r') as sp:
        src = sp.readlines()
    with open(tar_txt, 'r') as tp:
        tar = tp.readlines()

    val_set = GPT21024Dataset(tar, src, tokenizer, 512, 150)
    val_loader = DataLoader(val_set, **val_params)

    with torch.no_grad():
        for step, data in enumerate(val_loader):
            target_output = data['actual_summary'].to(device, dtype=torch.long)
            input_ids = data['article'].to(device)

            if verbose:
                print(f'Length of the input context: {len(input_ids[0])}')
                print('BEAM SIZE: 4')

            # The prompt is right-padded to 512 tokens, so max_length=582 allows
            # up to 70 generated tokens and min_length=562 forces at least 50.
            # decoder_start_token_id applies only to encoder-decoder models and
            # expects a token id, not a string, so it is dropped for GPT-2;
            # pad_token_id is passed explicitly to silence the warning that
            # generate() otherwise emits.
            generated_output = model.generate(
                input_ids=input_ids,
                max_length=582,
                min_length=562,
                temperature=1.0,
                pad_token_id=tokenizer.pad_token_id,
                num_beams=4,
                num_return_sequences=1)

            preds = []
            target = []
            # Strip the prompt: keep only the tokens generated after the input.
            for g in generated_output:
                preds.append(tokenizer.decode(g[len(input_ids[0]):], skip_special_tokens=True))
            for t in target_output:
                target.append(tokenizer.decode(t, skip_special_tokens=True))

            if step % 100 == 0:
                print(f'Completed {step}')

            predictions.extend(preds)
            actuals.extend(target)

    # actuals is kept alongside predictions for a downstream scoring step.
    with open(gen_path, 'w') as gp:
        for pre in predictions:
            gp.write(pre + "\n")
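
# One plausible use of scor_path, assuming the rouge_score package
# (https://pypi.org/project/rouge-score/) is installed; the original script
# never writes scores itself, so this helper is an illustrative sketch only.
def score_generations(gen_path, tar_txt, scor_path):
    from rouge_score import rouge_scorer

    with open(gen_path, 'r') as gp:
        preds = [line.strip() for line in gp]
    with open(tar_txt, 'r') as tp:
        refs = [line.strip() for line in tp]

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # Average ROUGE F1 over aligned (reference, prediction) pairs.
    pairs = list(zip(refs, preds))
    totals = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    for ref, pred in pairs:
        scores = scorer.score(ref, pred)
        for key in totals:
            totals[key] += scores[key].fmeasure
    n = max(len(pairs), 1)
    with open(scor_path, 'w') as sp:
        for key in totals:
            sp.write(f'{key}: {totals[key] / n:.4f}\n')


# Hedged usage sketch: the checkpoint directory and the Hinglish source/target
# file paths below are placeholders, not paths from the original experiment.
if __name__ == '__main__':
    gpt_eval(verbose=True,
             model_name_path='./gpt2-hinglish-checkpoint',  # assumed path
             src_txt='./data/test.dialogue-hinglish.txt',   # assumed path
             tar_txt='./data/test.summary-hinglish.txt',    # assumed path
             gen_path='./outputs/generated_summaries.txt',
             scor_path='./outputs/scores.txt',
             batch_size=4)
    score_generations('./outputs/generated_summaries.txt',
                      './data/test.summary-hinglish.txt',
                      './outputs/scores.txt')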