Shivam29rathore committed
Commit 890482c · 1 Parent(s): f2143d1

Pegasus addition

Files changed (1)
  1. app.py +104 -7
app.py CHANGED
@@ -1,6 +1,9 @@
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import pickle
 import torch
+from transformers import PegasusTokenizer, PegasusForConditionalGeneration
+import tensorflow as tf
+from tensorflow.python.lib.io import file_io
 
 
 import io
@@ -14,7 +17,14 @@ model_path = "finbert.sav"
 
 #load model from drive
 with open(model_path, "rb") as f:
-    model= pickle.load(f)
+    model1= pickle.load(f)
+
+
+tf.compat.v1.disable_eager_execution()
+# Let's load the model and the tokenizer
+model_name = "human-centered-summarization/financial-summarization-pegasus"
+tokenizer = PegasusTokenizer.from_pretrained(model_name)
+model2 = PegasusForConditionalGeneration.from_pretrained(model_name)
 
 
 #tokenizer = AutoTokenizer.from_pretrained(checkpoint)
@@ -34,7 +44,7 @@ import pickle
 nltk.download('punkt')
 
 
-def make_extractive_summary(word):
+def finbert(word):
     # Instantiate path to store each text Datafile in dataframe
     data_path = "/tmp/"
     if not os.path.exists(data_path):
@@ -61,7 +71,7 @@ def make_extractive_summary(word):
     sentence_list = []
     # Loop through all sentences and append sentence embeddings to list
     for i in tokens:
-        sentence_embedding = model.sentence_vector(i)
+        sentence_embedding = model1.sentence_vector(i)
         sentence_list.append(sentence_embedding)
     # Create empty list for ndarray
     sentence_array=[]
@@ -115,9 +125,96 @@ def make_extractive_summary(word):
 
 
 
+def pegasus(text):
+    '''A function to obtain summaries for each tokenized sentence.
+    It returns a summarized document as output'''
+
+    import nltk
+    nltk.download('punkt')
+
+    import os
+    data_path = "/tmp/"
+    if not os.path.exists(data_path):
+        os.makedirs(data_path)
+    input_ = "/tmp/input.txt"
+
+    with open(input_, "w") as file:
+        file.write(text)
+    # read the written txt into a variable
+    with open(input_, 'r') as f:
+        text_ = f.read()
+
+    def tokenized_sentences(file):
+        '''A function to generate chunks of sentences and texts.
+        Returns tokenized texts'''
+        # Create empty arrays
+        tokenized_sentences = []
+        sentences = []
+        length = 0
+        for sentence in sent_tokenize(file):
+            length += len(sentence)
+            # 512 is the maximum input length for the Pegasus model
+            if length < 512:
+                sentences.append(sentence)
+            else:
+                tokenized_sentences.append(sentences)
+                sentences = [sentence]
+                length = len(sentence)
+
+        sentences = [sentence.strip() for sentence in sentences]
+        # Append all tokenized sentences
+        if sentences:
+            tokenized_sentences.append(sentences)
+        return tokenized_sentences
+
+    tokenized = tokenized_sentences(text_)
+    # Use GPU if available
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    global summary
+    # Create an empty array for all summaries
+    summary = []
+    # Loop to encode tokens, to generate abstractive summary and finally decode tokens
+    for token in tokenized:
+        # Encoding
+        inputs = tokenizer.encode(' '.join(token), truncation=True, return_tensors='pt')
+        # Use CPU or GPU
+        inputs = inputs.to(device)
+        # Get summaries from transformer model
+        all_summary = model2.to(device).generate(inputs, do_sample=True,
+                                                 max_length=50, top_k=50, top_p=0.95,
+                                                 num_beams=5, early_stopping=True)
+        # num_return_sequences=5)
+        # length_penalty=0.2, no_repeat_ngram_size=2
+        # min_length=10,
+        # max_length=50)
+        # Decoding
+        output = [tokenizer.decode(each_summary, skip_special_tokens=True, clean_up_tokenization_spaces=False) for each_summary in all_summary]
+        # Append each output to array
+        summary.append(output)
+    # Get final summary
+    summary = [sentence for each in summary for sentence in each]
+    final = "".join(summary)
+
+    return ("Pegasus MODEL OUTPUT:--->"+final, " Length of Input:---->"+str(len(text)), " Length of Output:----> "+str(len(final)))
+
+
 import gradio as gr
 
-iface = gr.Interface(fn= make_extractive_summary,
-                     inputs =gr.inputs.Textbox(lines=15,placeholder="Enter your text !!"),
-                     outputs="text",title="Document Summarizer",description ="An AI that makes your life easier by helping you summarise long texts.")
-iface.launch(enable_queue=True)
+from gradio.mix import Parallel
+
+interface1 = gr.Interface(fn=finbert,
+                          inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!", label='Input-10k Sections'),
+                          outputs=gr.outputs.Textbox(label='Output- finBERT'))
+
+interface2 = gr.Interface(fn=pegasus,
+                          inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!", label='Input-10k Sections'),
+                          outputs=gr.outputs.Textbox(label='Output- Pegasus'))
+
+
+Parallel(
+    interface1,
+    interface2,
+    title="Document Summarizer",
+    inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!", label='Input- 10k sections')
+).launch(enable_queue=True, debug=True)
+
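
Note: inside pegasus(), the inner tokenized_sentences() helper packs whole sentences into chunks of fewer than 512 characters. The comment calls 512 the model's maximum input length, but the loop counts characters rather than tokens, so the truncation=True flag in the later encode call is what actually enforces the model's limit. Below is a minimal, self-contained sketch of the same packing logic; the chunk_sentences name and the sample string are illustrative, not part of the commit.

# Standalone sketch of the sentence-packing step in tokenized_sentences() above.
# chunk_sentences and the sample text are assumptions made for illustration.
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

def chunk_sentences(text, limit=512):
    chunks, current, length = [], [], 0
    for sentence in sent_tokenize(text):
        length += len(sentence)
        if length < limit:
            current.append(sentence)
        else:
            # Current chunk is full: start a new one with this sentence
            chunks.append(current)
            current = [sentence]
            length = len(sentence)
    if current:
        chunks.append([s.strip() for s in current])
    return chunks

sample = "Revenue rose 12% in the quarter. Operating costs stayed flat. " * 20
print(len(chunk_sentences(sample)), "chunks")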
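Note: the encode/generate/decode loop in pegasus() can be exercised on a single passage without the Gradio app. The sketch below reuses the checkpoint and the decoding settings from the commit (do_sample, max_length=50, top_k=50, top_p=0.95, num_beams=5, early_stopping); the example sentence is invented.

# Minimal sketch of the summarization step used by pegasus(); assumes transformers,
# sentencepiece and torch are installed. The input text is made up for illustration.
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

text = "Net revenue for the quarter grew 12% year over year, driven by strong loan demand."
inputs = tokenizer.encode(text, truncation=True, return_tensors="pt").to(device)

# Same decoding settings the commit passes to model2.generate()
summary_ids = model.generate(inputs, do_sample=True, max_length=50,
                             top_k=50, top_p=0.95, num_beams=5, early_stopping=True)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))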
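Note: gr.inputs.Textbox, gr.outputs.Textbox, enable_queue and gradio.mix.Parallel belong to the older Gradio 2.x/3.x API and were later deprecated and removed, so this file depends on whatever Gradio version the Space used at the time. A rough equivalent of the side-by-side layout on a current Gradio release might look like the sketch below; the Blocks layout, button wiring, and the stub functions are assumptions, not part of the commit.

# Hedged sketch: approximating the Parallel() layout with the newer gr.Blocks API.
# finbert_stub / pegasus_stub stand in for the real finbert() and pegasus() above.
import gradio as gr

def finbert_stub(text):
    return "finBERT summary of: " + text[:80]

def pegasus_stub(text):
    return "Pegasus summary of: " + text[:80]

with gr.Blocks(title="Document Summarizer") as demo:
    inp = gr.Textbox(lines=15, placeholder="Enter your text !!", label="Input- 10k sections")
    with gr.Row():
        out1 = gr.Textbox(label="Output- finBERT")
        out2 = gr.Textbox(label="Output- Pegasus")
    btn = gr.Button("Summarize")
    # One shared input feeds both models, mirroring what gradio.mix.Parallel did
    btn.click(finbert_stub, inputs=inp, outputs=out1)
    btn.click(pegasus_stub, inputs=inp, outputs=out2)

demo.queue().launch()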