from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pickle
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from nltk.tokenize import sent_tokenize
import io
#contents = pickle.load(f) becomes...
#contents = CPU_Unpickler(f).load()
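# Sketch of the CPU_Unpickler helper hinted at in the comments above; it is not
# defined anywhere in this file. This is an assumed implementation of the common
# pattern for unpickling a model that was saved on a GPU machine onto a CPU-only
# host: intercept torch's storage loader and force map_location='cpu'.
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        # torch pickles tensor storages via torch.storage._load_from_bytes;
        # remap them to CPU while unpickling
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)
# Usage (instead of pickle.load below): model1 = CPU_Unpickler(f).load()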
model_path = "finbert.sav"
#load model from drive
with open(model_path, "rb") as f:
    model1 = pickle.load(f)
tf.compat.v1.disable_eager_execution()
# Let's load the model and the tokenizer
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model2 = PegasusForConditionalGeneration.from_pretrained(model_name)
#tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
import nltk
from finbert_embedding.embedding import FinbertEmbedding
import pandas as pd
from nltk.cluster import KMeansClusterer
import numpy as np
import os
from scipy.spatial import distance_matrix
from tensorflow.python.lib.io import file_io
import pickle
nltk.download('punkt')
def finbert(word):
    # Path used to store each input text as a file before loading it into a dataframe
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"
    # Write the input to disk so each datapoint is available as a txt file
    with open(input_, "w") as file:
        file.write(word)
    # Read the written txt back into a variable to start clustering
    with open(input_, 'r') as f:
        text = f.read()
    # Split the text into sentence tokens
    tokens = nltk.sent_tokenize(text)
    # Strip leading and trailing whitespace from each sentence
    sentences = [sentence.strip() for sentence in tokens]
    # Create a DataFrame from the sentences
    data = pd.DataFrame(sentences)
    # Name the column containing the sentence tokens
    data.columns = ['Sentences']

    # Function to create numerical embeddings for each sentence in the dataframe
    def get_sentence_embeddings():
        # Create an empty list for sentence embeddings
        sentence_list = []
        # Loop through all sentences and append their embeddings to the list
        for i in tokens:
            sentence_embedding = model1.sentence_vector(i)
            sentence_list.append(sentence_embedding)
        # Create an empty list for the ndarrays
        sentence_array = []
        # Convert each embedding from a tensor to a NumPy array
        for i in sentence_list:
            sentence_array.append(i.numpy())
        # Return the sentence embeddings as a list of arrays
        return sentence_array

    # Apply get_sentence_embeddings to the dataframe to create the Embeddings column
    data['Embeddings'] = get_sentence_embeddings()
    # Number of clusters, i.e. the number of sentences expected in the summary
    NUM_CLUSTERS = 10
    iterations = 8
    # Convert the embeddings into an array and store it in X
    X = np.array(data['Embeddings'].to_list())
    # Build the k-means clustering algorithm
    Kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=iterations, avoid_empty_clusters=True)
    # If the text is too short, k-means would return an error;
    # use a try/except block to return the text itself in that case.
    try:
        assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)
        # Apply the k-means result to the DataFrame and create the Cluster and Centroid columns
        data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
    # Return the text if the clustering algorithm raises an exception and move on to the next text
    except ValueError:
        return text

    # Compute the distance of each embedding from the centroid of its cluster
    def distance_from_centroid(row):
        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]

    # Apply distance_from_centroid to the data
    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)
    # Return the final summary: the sentence closest to each centroid, in document order
    summary = " ".join(data.sort_values(
        'Distance_From_Centroid',
        ascending=True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
    # Clean up stray newlines and trailing whitespace in the summary
    import re
    words = list()
    for text in summary.split():
        text = re.sub(r'\n', '', text)
        text = re.sub(r'\s$', '', text)
        words.append(text)
    summary = " ".join(words)
    return (summary,
            " Length of Input:---->" + str(len(word)),
            " Length of Output:----> " + str(len(summary)))
def pegasus(text):
    '''A function to obtain summaries for each tokenized sentence.
    It returns a summarized document as output'''
    import nltk
    nltk.download('punkt')
    import os
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"
    with open(input_, "w") as file:
        file.write(text)
    # Read the written txt back into a variable
    with open(input_, 'r') as f:
        text_ = f.read()

    def tokenized_sentences(file):
        '''A function to generate chunks of sentences and texts.
        Returns tokenized texts'''
        # Create empty lists
        tokenized_sentences = []
        sentences = []
        length = 0
        for sentence in sent_tokenize(file):
            length += len(sentence)
            # 512 is the maximum input length for the Pegasus model
            # (counted in characters here as a rough proxy; the tokenizer
            # below truncates to the actual token limit)
            if length < 512:
                sentences.append(sentence)
            else:
                tokenized_sentences.append(sentences)
                sentences = [sentence]
                length = len(sentence)
        sentences = [sentence.strip() for sentence in sentences]
        # Append the remaining sentences as the last chunk
        if sentences:
            tokenized_sentences.append(sentences)
        return tokenized_sentences

    tokenized = tokenized_sentences(text_)
    # Use GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    global summary
    # Create an empty list for all chunk summaries
    summary = []
    # Loop to encode the chunks, generate an abstractive summary, and decode the tokens
    for token in tokenized:
        # Encoding
        inputs = tokenizer.encode(' '.join(token), truncation=True, return_tensors='pt')
        # Move the inputs to CPU or GPU
        inputs = inputs.to(device)
        # Get summaries from the transformer model
        all_summary = model2.to(device).generate(inputs, do_sample=True,
                                                 max_length=50, top_k=50, top_p=0.95,
                                                 num_beams=5, early_stopping=True)
        # num_return_sequences=5)
        # length_penalty=0.2, no_repeat_ngram_size=2
        # min_length=10,
        # max_length=50)
        # Decoding
        output = [tokenizer.decode(each_summary, skip_special_tokens=True,
                                   clean_up_tokenization_spaces=False)
                  for each_summary in all_summary]
        # Append each output to the list
        summary.append(output)
    # Flatten the chunk summaries and join them into the final summary
    summary = [sentence for each in summary for sentence in each]
    final = " ".join(summary)
    return final
import gradio as gr

# Gradio UI for the extractive FinBERT summarizer
interface1 = gr.Interface(fn=finbert,
                          inputs=gr.Textbox(lines=15, placeholder="Enter your text !!", label='Input-10k Sections'),
                          outputs=gr.Textbox(label='Output'))
interface1.launch()
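# A possible extension (not part of the original file): the `pegasus` function
# above is never wired into the UI. This is a hedged sketch, assuming a Gradio
# 3.x+ API, of exposing both summarizers side by side with gr.TabbedInterface;
# uncomment it and remove the launch() call above to try it.
# interface2 = gr.Interface(fn=pegasus,
#                           inputs=gr.Textbox(lines=15, placeholder="Enter your text !!", label='Input-10k Sections'),
#                           outputs=gr.Textbox(label='Output-Abstractive Summary'))
# gr.TabbedInterface([interface1, interface2],
#                    ["Extractive (FinBERT)", "Abstractive (Pegasus)"]).launch()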