								import nltk
from finbert_embedding.embedding import FinbertEmbedding
import pandas as pd
from nltk.cluster import KMeansClusterer
import numpy as np
import os
from scipy.spatial import distance_matrix
nltk.download('punkt')
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "Shivam29rathore/finBert_10k"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
# AutoModelForSeq2SeqLM does not expose a sentence_vector() method, so the
# sentence embeddings below are produced with a FinbertEmbedding instance.
finbert = FinbertEmbedding()
def make_extractive_summary(word):
    # Temporary directory used to store the input text as a file
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"
    # Write the text to disk so it can be processed as a txt file
    with open(input_, "w") as file:
        file.write(word)
    # read the written txt into a variable to start clustering
    with open(input_ , 'r') as f:
        text = f.read()
    # Split the text into sentences
    tokens = nltk.sent_tokenize(text)
    # Strip leading and trailing whitespace from each sentence
    sentences = [sent.strip() for sent in tokens]
    #Create a DataFrame from the tokens 
    data = pd.DataFrame(sentences)
    # Assign name Sentences to the column containing text tokens
    data.columns = ['Sentences']
    
    # Function to create a numerical embedding for each sentence in the dataframe
    def get_sentence_embeddings():
        # Create empty list for sentence embeddings
        sentence_list = []
        # Loop through all sentences and append each sentence embedding to the list
        for i in tokens:
            sentence_embedding = finbert.sentence_vector(i)
            sentence_list.append(sentence_embedding)
        # Create empty list for ndarray
        sentence_array=[]
        # Loop through sentence list and change data type from tensor to array
        for i in sentence_list:
            sentence_array.append(i.numpy())
        # return sentence embeddings as list
        return sentence_array
    # Apply get_sentence_embeddings to dataframe to create column Embeddings
    data['Embeddings'] = get_sentence_embeddings()
    
    # Number of clusters, i.e. the number of sentences kept in the final summary
    NUM_CLUSTERS = 15
    iterations = 25
    # Convert Embeddings into an array and store in variable X
    X = np.array(data['Embeddings'].to_list())
    
    #Build k-means cluster algorithm
    Kclusterer = KMeansClusterer(
                                NUM_CLUSTERS,
                                distance = nltk.cluster.util.cosine_distance,
                                repeats = iterations, avoid_empty_clusters = True)
    # if the text has too few sentences, k-means raises an error;
    # use the try/except block to return the original text in that case.
    try:
        
        assigned_clusters = Kclusterer.cluster(X,assign_clusters=True)
        # Add the assigned cluster and its centroid as new columns in the DataFrame
        data['Cluster'] = pd.Series(assigned_clusters, index = data.index)
        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
    
    # return the text if the clustering algorithm raises an exception and move on to the next text file
    except ValueError:
        return text
    # function that computes the distance of each embedding from the centroid of its cluster
    def distance_from_centroid(row):
        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]
    
    # apply distance_from_centroid function to data
    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis =1)
    
    ## Return Final Summary
    summary = " ".join(data.sort_values(
                'Distance_From_Centroid',
                ascending = True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
    return summary
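
# Illustrative usage sketch (an assumption, not part of the original Space): the
# summarizer can also be called directly on a string. Texts with fewer sentences
# than NUM_CLUSTERS fall through the ValueError handler and are returned unchanged.
# Kept commented out so that running this file still only launches the Gradio app:
#   report_text = open("sample_10k_extract.txt").read()   # hypothetical local file
#   print(make_extractive_summary(report_text))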
import gradio as gr
iface = gr.Interface(fn=make_extractive_summary,
                     inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!"),
                     outputs="text",
                     title="Document Summarizer",
                     description="An AI that makes your life easier by helping you summarise long texts.")
iface.launch(auth=("hamoye", "docai"))