from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pickle
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from nltk.tokenize import sent_tokenize


import io


#contents = pickle.load(f) becomes...
#contents = CPU_Unpickler(f).load()
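
# Hedged sketch (not part of the original file; relies on the pickle, io and torch
# imports above): the CPU_Unpickler referenced in the comments is commonly implemented
# by overriding pickle.Unpickler.find_class so that torch storages pickled on a GPU
# machine are remapped to the CPU at load time.
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        # Redirect torch's byte-level storage loader so tensors land on the CPU
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)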


model_path = "finbert.sav"

# Load the pickled FinBERT embedding model from disk
with open(model_path, "rb") as f:
    model1 = pickle.load(f)


tf.compat.v1.disable_eager_execution()
# Load the Pegasus financial-summarization model and its tokenizer
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model2 = PegasusForConditionalGeneration.from_pretrained(model_name)

#tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


import nltk
from finbert_embedding.embedding import FinbertEmbedding
import pandas as pd
from nltk.cluster import KMeansClusterer
import numpy as np
import os
from scipy.spatial import distance_matrix
from tensorflow.python.lib.io import file_io
import pickle

nltk.download('punkt')


def finbert(word):
    # Create a scratch directory for the input text file
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"
    # Write the input text to disk as a txt file
    with open(input_, "w") as file:
        file.write(word)
    # Read the written txt back into a variable to start clustering
    with open(input_, 'r') as f:
        text = f.read()
    # Split the text into sentence tokens
    tokens = nltk.sent_tokenize(text)
    # Strip leading and trailing whitespace from each sentence
    sentences = [sentence.strip() for sentence in tokens]
    # Create a DataFrame from the sentences
    data = pd.DataFrame(sentences)
    # Name the column containing the sentence tokens
    data.columns = ['Sentences']
    
    # Function to create numerical embeddings for each sentence token in the dataframe
    def get_sentence_embeddings():
        # Create empty list for sentence embeddings
        sentence_list = []
        # Loop through all sentences and append sentence embeddings to list
        for i in tokens:
            sentence_embedding = model1.sentence_vector(i)
            sentence_list.append(sentence_embedding)
        # Create empty list for ndarray
        sentence_array=[]
        # Loop through sentence list and change data type from tensor to array
        for i in sentence_list:
            sentence_array.append(i.numpy())
        # return sentence embeddings as list
        return sentence_array

    # Apply get_sentence_embeddings to dataframe to create column Embeddings
    data['Embeddings'] = get_sentence_embeddings()
    
    # Number of clusters = number of sentences in the extractive summary
    NUM_CLUSTERS = 10
    iterations = 8
    # Convert Embeddings into an array and store in variable X
    X = np.array(data['Embeddings'].to_list())
    
    #Build k-means cluster algorithm
    Kclusterer = KMeansClusterer(
                                NUM_CLUSTERS,
                                distance = nltk.cluster.util.cosine_distance,
                                repeats = iterations, avoid_empty_clusters = True)

    # if length of text is too short, K means would return an error
    # use the try except block to return the text as result if it is too short.
    try:
        
        assigned_clusters = Kclusterer.cluster(X,assign_clusters=True)

        # Apply Kmean Cluster to DataFrame and create new columns Clusters and Centroid
        data['Cluster'] = pd.Series(assigned_clusters, index = data.index)
        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
    
    # Return the text unchanged if the clustering algorithm raises an exception and move on to the next text file
    except ValueError:
        return text

    # function that computes the distance of each embeddings from the centroid of the cluster
    def distance_from_centroid(row):
        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]
    
    # apply distance_from_centroid function to data
    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis =1)
    
    ## Return Final Summary
    summary = " ".join(data.sort_values(
                'Distance_From_Centroid',
                ascending = True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
    # Clean up stray newlines and trailing whitespace in the summary
    import re
    words = list()
    for w in summary.split():
        w = re.sub(r'\n', '', w)
        w = re.sub(r'\s$', '', w)
        words.append(w)
    summary = " ".join(words)

    return (summary, "    Length of Input:---->" + str(len(word)), "  Length of Output:----> " + str(len(summary)))

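# Hedged usage sketch (illustrative only; the sample string below is hypothetical):
# summary, input_len, output_len = finbert("Revenue rose 12% year over year. Operating margins widened. ...")
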
def pegasus(text):
    '''A function to obtain summaries for each tokenized sentence.
    It returns a summarized document as output''' 

    import nltk
    nltk.download('punkt')

    import os
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"

    with open(input_, "w") as file:
        file.write(text)
    # read the written txt into a variable
    with open(input_ , 'r') as f:
        text_ = f.read()

    def tokenized_sentences(file):
        '''A function to split the text into chunks of sentences that
        each fit within the model's input limit. Returns a list of chunks.'''
        # Create empty lists for the chunks and the current chunk
        tokenized_sentences = []
        sentences = []
        length = 0
        for sentence in sent_tokenize(file):
            length += len(sentence)
            # Pegasus accepts at most 512 input tokens; the character count is used here as a rough proxy
            if length < 512:
                sentences.append(sentence)
            else:
                tokenized_sentences.append(sentences)
                sentences = [sentence]
                length = len(sentence)

        sentences = [sentence.strip() for sentence in sentences]
        # Append the final (possibly partial) chunk and return all chunks
        if sentences:
            tokenized_sentences.append(sentences)
        return tokenized_sentences

    tokenized = tokenized_sentences(text_)
    # Use GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    global summary
    # Create an empty array for all summaries
    summary = []
    # Loop to encode tokens, to generate abstractive summary and finally decode tokens
    for token in tokenized:
        # Encoding
        inputs = tokenizer.encode(' '.join(token), truncation=True, return_tensors='pt')
        # Use CPU or GPU
        inputs = inputs.to(device)
        # Get summaries from transformer model
        all_summary = model2.to(device).generate(inputs,do_sample=True, 
                                                max_length=50, top_k=50, top_p=0.95,
                                                num_beams = 5, early_stopping=True)
#                                                 num_return_sequences=5)
#                                                 length_penalty=0.2, no_repeat_ngram_size=2
#                                                 min_length=10,
#                                                 max_length=50)
        # Decoding
        output = [tokenizer.decode(each_summary, skip_special_tokens=True, clean_up_tokenization_spaces=False) for each_summary in all_summary]
        # Append each output to array
        summary.append(output)
    # Flatten the per-chunk outputs and join them with spaces into the final summary
    summary = [sentence for each in summary for sentence in each]
    final = " ".join(summary)
    
    return final

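# Hedged usage sketch (illustrative only): pegasus() chunks the input text, generates an
# abstractive summary for each chunk with the Pegasus model, and joins the pieces.
# condensed = pegasus("The company reported third-quarter earnings on Tuesday. ...")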

import gradio as gr



interface1 = gr.Interface(fn=finbert,
                     inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!", label='Input-10k Sections'),
                     outputs=gr.outputs.Textbox(label='Output')).launch()
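
# Hedged sketch (assumption, not shown in this file): the pegasus() summarizer could be
# exposed through a second Gradio interface in the same way, e.g.
# interface2 = gr.Interface(fn=pegasus,
#                      inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!", label='Input'),
#                      outputs=gr.outputs.Textbox(label='Output')).launch()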