# File size: 4,054 Bytes

# a0c33a2

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pickle
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from nltk.tokenize import sent_tokenize


import io








    
# NOTE(review): this switches TensorFlow to graph (non-eager) mode, yet the
# summarisation model below is loaded through the Pegasus classes that are
# later moved with `.to(device)` and fed `return_tensors='pt'` tensors —
# confirm this call is still needed.
tf.compat.v1.disable_eager_execution()
# Load the pretrained financial-summarisation Pegasus checkpoint and its
# tokenizer once at import time; pegasus() reads `tokenizer` and `model2`
# as module-level globals on every call.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model2 = PegasusForConditionalGeneration.from_pretrained(model_name)
    

#tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


import nltk
from finbert_embedding.embedding import FinbertEmbedding
import pandas as pd
from nltk.cluster import KMeansClusterer
import numpy as np
import os
from scipy.spatial import distance_matrix
from tensorflow.python.lib.io import file_io
import pickle

# Fetch the NLTK 'punkt' resource at import time; pegasus() splits its input
# with sent_tokenize, which presumably relies on this data being present.
nltk.download('punkt')


def _chunk_sentences(sentences, max_chars=512):
    '''Group consecutive sentences into chunks of fewer than ``max_chars`` characters.

    The budget is measured in characters, not model tokens. A single sentence
    that is itself ``max_chars`` or longer gets a chunk of its own (the
    tokenizer's ``truncation=True`` deals with any overflow later).

    Args:
        sentences: iterable of sentence strings, in document order.
        max_chars: cumulative character count at which a new chunk is started.

    Returns:
        A list of non-empty lists of stripped, non-empty sentences.
    '''
    chunks = []
    current = []
    current_len = 0
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        # Start a new chunk once adding this sentence would reach the budget,
        # but never flush an empty chunk (happens when the very first
        # sentence is already over the limit).
        if current and current_len + len(sentence) >= max_chars:
            chunks.append(current)
            current = []
            current_len = 0
        current.append(sentence)
        current_len += len(sentence)
    if current:
        chunks.append(current)
    return chunks


def pegasus(text):
    '''Summarise ``text`` chunk by chunk with the module-level Pegasus model.

    The text is split into sentences, the sentences are grouped into chunks
    of under 512 characters, each chunk is summarised independently, and the
    chunk summaries are joined with single spaces.

    Generation uses ``do_sample=True``, so the output is not deterministic.

    Args:
        text: the document to summarise. ``None``, empty, or whitespace-only
            input yields an empty summary.

    Returns:
        The concatenated summary as a single string (``""`` for empty input).
    '''
    # Guard first: nothing to summarise, and no reason to touch the model.
    if not text or not text.strip():
        return ""

    # The 'punkt' data needed for sentence splitting is downloaded once at
    # module import, so it is not re-requested on every call.
    chunks = _chunk_sentences(sent_tokenize(text))

    # Use GPU if available; move the model once rather than once per chunk.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model2.to(device)

    # Kept local (not global) so concurrent calls cannot mix their results.
    summaries = []
    for chunk in chunks:
        # Encode the chunk; truncation caps it at the tokenizer's max length.
        inputs = tokenizer.encode(
            ' '.join(chunk), truncation=True, return_tensors='pt'
        ).to(device)
        # Beam search combined with top-k / nucleus sampling.
        generated = model.generate(
            inputs,
            do_sample=True,
            max_length=50,
            top_k=50,
            top_p=0.95,
            num_beams=5,
            early_stopping=True,
        )
        # Decode every returned sequence back to plain text.
        summaries.extend(
            tokenizer.decode(
                sequence,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            for sequence in generated
        )

    # Separate chunk summaries with a space so sentences do not run together.
    return " ".join(summaries)


import gradio as gr



                     
# Build the two text components first, then wire them to pegasus() and start
# the app. Note that `interface1` is bound to the value returned by
# `.launch()`, not to the `gr.Interface` object itself.
_input_box = gr.inputs.Textbox(
    lines=15,
    placeholder="Enter your text !!",
    label='Input-10k Sections',
)
_output_box = gr.outputs.Textbox(label='Output- Pegasus')
interface1 = gr.Interface(
    fn=pegasus,
    inputs=_input_box,
    outputs=_output_box,
).launch()