import io
import os
import pickle

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import torch
from finbert_embedding.embedding import FinbertEmbedding
from nltk.cluster import KMeansClusterer
from scipy.spatial import distance_matrix

# The model checkpoint may have been pickled on a GPU machine, in which case
# plain contents = pickle.load(f) fails on CPU-only hardware and becomes
# contents = CPU_Unpickler(f).load() instead.
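# A minimal CPU_Unpickler sketch (assumes the pickle contains torch tensors):
# subclass pickle.Unpickler and route torch storage deserialization through
# torch.load(..., map_location="cpu") so GPU-saved weights load on a CPU box.
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        # torch serializes tensor storages via _load_from_bytes; wrapping it
        # forces every storage onto the CPU at load time.
        if module == "torch.storage" and name == "_load_from_bytes":
            return lambda b: torch.load(io.BytesIO(b), map_location="cpu")
        return super().find_class(module, name)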
model_path = "finbert.sav"
# Load the pickled FinBERT embedding model, forcing its tensors onto the CPU.
with open(model_path, "rb") as f:
    model = CPU_Unpickler(f).load()

# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize below.
nltk.download('punkt')
def make_extractive_summary(word):
    """Cluster FinBERT sentence embeddings and keep the sentence closest
    to each cluster centroid as an extractive summary."""
    # Stage the text on disk so each datapoint round-trips through a txt file
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"
    # Write the input to disk
    with open(input_, "w") as file:
        file.write(word)
    # Read the written txt back into a variable to start clustering
    with open(input_, "r") as f:
        text = f.read()
    # Split the text into sentence tokens
    tokens = nltk.sent_tokenize(text)
    # Strip leading and trailing whitespace from each sentence
    # (use a fresh loop variable rather than shadowing the parameter `word`)
    sentences = [sent.strip() for sent in tokens]
    # Build a DataFrame with one sentence per row
    data = pd.DataFrame(sentences, columns=['Sentences'])
    # Compute a numerical FinBERT embedding for every sentence in the DataFrame
    def get_sentence_embeddings():
        # Embed each stripped sentence; model.sentence_vector returns a tensor
        sentence_list = [model.sentence_vector(sent) for sent in sentences]
        # Convert the tensors to NumPy arrays so pandas and scipy can use them
        return [vec.numpy() for vec in sentence_list]
    # Apply get_sentence_embeddings to create the Embeddings column
    data['Embeddings'] = get_sentence_embeddings()
    # Number of clusters = number of sentences in the final summary
    NUM_CLUSTERS = 10
    iterations = 8
    # Stack the embeddings into a single (n_sentences, dim) array
    X = np.array(data['Embeddings'].to_list())
    # Build the k-means clusterer over cosine distance
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=iterations,
        avoid_empty_clusters=True)
    # k-means raises an error when the text yields fewer sentences than
    # clusters, so fall back to returning the input text when it is too short.
    try:
        assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
        # Record each sentence's cluster label and that cluster's centroid
        data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
        data['Centroid'] = data['Cluster'].apply(lambda x: kclusterer.means()[x])
    except ValueError:
        # Clustering caught an exception; return the raw text and move on
        return text
    # Distance of each sentence embedding from its cluster centroid
    def distance_from_centroid(row):
        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]
    # Apply distance_from_centroid to every row
    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)
    # Final summary: the sentence closest to each centroid, in document order
    summary = " ".join(data.sort_values(
        'Distance_From_Centroid',
        ascending=True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
    # Gradio has a single text output, so return one string rather than a tuple
    return ("FinBERT MODEL OUTPUT:---> " + summary
            + " | Length of Input:----> " + str(len(word))
            + " | Length of Output:----> " + str(len(summary)))
iface = gr.Interface(
    fn=make_extractive_summary,
    inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!"),
    outputs="text",
    title="Document Summarizer",
    description="An AI that makes your life easier by helping you summarise long texts.")
iface.launch(auth=("corterie", "ai@labs"))
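
# Optional smoke test before launching the Space; the sample below is a
# hypothetical placeholder, any multi-sentence plain text works:
#   sample = "Revenue grew 4% in Q3. Margins compressed. ..."
#   print(make_extractive_summary(sample))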