Shivam29rathore committed
Commit 57fa22a · 1 Parent(s): bfbd040

Update app.py

Files changed (1)
  1. app.py +27 -91
app.py CHANGED
@@ -1,104 +1,40 @@
- import nltk
- from finbert_embedding.embedding import FinbertEmbedding
- import pandas as pd
- from nltk.cluster import KMeansClusterer
- import numpy as np
- import os
- from scipy.spatial import distance_matrix
- from tensorflow.python.lib.io import file_io
- import pickle
-
-
- nltk.download('punkt')
-
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
- checkpoint = "Shivam29rathore/finBert_10k"
- model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
-
-
- def make_extractive_summary(word):
-     # Instantiate path to store each text datafile as a dataframe
      data_path = "/tmp/"
      if not os.path.exists(data_path):
          os.makedirs(data_path)
      input_ = "/tmp/input.txt"
-     # Write the file to disk so we can convert each datapoint to a txt file
      with open(input_, "w") as file:
          file.write(word)
-     # Read the written txt into a variable to start clustering
      with open(input_, 'r') as f:
-         text = f.read()
-     # Create tokens from the txt file
-     tokens = nltk.sent_tokenize(text)
-     # Strip leading and trailing whitespace from the tokens
-     sentences = [word.strip() for word in tokens]
-     # Create a DataFrame from the tokens
-     data = pd.DataFrame(sentences)
-     # Name the column containing the text tokens 'Sentences'
-     data.columns = ['Sentences']
-
-     # Function to create numerical embeddings for each text token in the dataframe
-     def get_sentence_embeddings():
-         # Create an empty list for sentence embeddings
-         sentence_list = []
-         # Loop through all sentences and append each sentence embedding to the list
-         for i in tokens:
-             sentence_embedding = model.sentence_vector(i)
-             sentence_list.append(sentence_embedding)
-         # Create an empty list for the ndarrays
-         sentence_array = []
-         # Loop through the sentence list and change the data type from tensor to array
-         for i in sentence_list:
-             sentence_array.append(i.numpy())
-         # Return the sentence embeddings as a list
-         return sentence_array
-
-     # Apply get_sentence_embeddings to the dataframe to create the column Embeddings
-     data['Embeddings'] = get_sentence_embeddings()
-
-     # Number of expected sentences
-     NUM_CLUSTERS = 15
-     iterations = 25
-     # Convert Embeddings into an array and store it in variable X
-     X = np.array(data['Embeddings'].to_list())
-
-     # Build the k-means clusterer
-     Kclusterer = KMeansClusterer(
-         NUM_CLUSTERS,
-         distance = nltk.cluster.util.cosine_distance,
-         repeats = iterations, avoid_empty_clusters = True)
-
-     # If the text is too short, k-means would raise an error;
-     # use the try/except block to return the text itself as the result.
-     try:
-         assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)
-
-         # Apply the k-means clusters to the DataFrame and create the columns Cluster and Centroid
-         data['Cluster'] = pd.Series(assigned_clusters, index = data.index)
-         data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
-
-     # Return the text if the clustering algorithm catches an exception and move on to the next text file
-     except ValueError:
-         return text
-
-     # Function that computes the distance of each embedding from the centroid of its cluster
-     def distance_from_centroid(row):
-         return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]

-     # Apply the distance_from_centroid function to the data
-     data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)

-     ## Return the final summary
-     summary = " ".join(data.sort_values(
-         'Distance_From_Centroid',
-         ascending = True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
-     return summary
-
  import gradio as gr

- iface = gr.Interface(fn=make_extractive_summary,
      inputs =gr.inputs.Textbox(lines=15,placeholder="Enter your text !!"),
      outputs="text",title="Document Summarizer",description ="An AI that makes your life easier by helping you summarise long texts.")
- iface.launch(auth=("hamoye","docai"))
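
The removed make_extractive_summary pipeline embeds each sentence, clusters the embeddings with k-means, and keeps the sentence closest to each centroid. A minimal, self-contained sketch of that idea follows; it assumes the finbert_embedding package's FinbertEmbedding.sentence_vector() is the intended embedder, since the removed code calls model.sentence_vector() on an AutoModelForSeq2SeqLM, which has no such method.

# Sketch of the removed extractive approach, not the committed code.
# Assumption: FinbertEmbedding.sentence_vector() from finbert_embedding
# is the embedder the removed imports point at.
import nltk
import numpy as np
import pandas as pd
from finbert_embedding.embedding import FinbertEmbedding
from nltk.cluster import KMeansClusterer
from scipy.spatial import distance_matrix

nltk.download('punkt')
embedder = FinbertEmbedding()

def extractive_summary(text, num_clusters=15, iterations=25):
    # Split into sentences and embed each one with FinBERT
    sentences = [s.strip() for s in nltk.sent_tokenize(text)]
    data = pd.DataFrame({'Sentences': sentences})
    data['Embeddings'] = [embedder.sentence_vector(s).numpy() for s in sentences]
    X = np.array(data['Embeddings'].to_list())
    clusterer = KMeansClusterer(num_clusters,
                                distance=nltk.cluster.util.cosine_distance,
                                repeats=iterations, avoid_empty_clusters=True)
    try:
        data['Cluster'] = clusterer.cluster(X, assign_clusters=True)
    except ValueError:
        # Too few sentences for the requested number of clusters
        return text
    data['Centroid'] = data['Cluster'].apply(lambda c: clusterer.means()[c])
    data['Distance'] = data.apply(
        lambda row: distance_matrix([row['Embeddings']],
                                    [row['Centroid'].tolist()])[0][0], axis=1)
    # Keep the sentence closest to each centroid, restored to document order
    picked = (data.sort_values('Distance').groupby('Cluster').head(1)
              .sort_index()['Sentences'].tolist())
    return " ".join(picked)

With 15 clusters and cosine distance, as in the removed code, this yields roughly a 15-sentence extract in document order.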
 
+ def summarize(word):
+     import os
      data_path = "/tmp/"
      if not os.path.exists(data_path):
          os.makedirs(data_path)
      input_ = "/tmp/input.txt"
+
      with open(input_, "w") as file:
          file.write(word)
+     # read the written txt into a variable
      with open(input_, 'r') as f:
+         text_ = f.read()
+
+     def clean_data(texts):
+         import re
+         words = list()
+         for text in texts.split():
+             text = re.sub(r'\n', '', text)
+             text = re.sub(r'\s$', '', text)
+             words.append(text)
+
+         return "summarize " + " ".join(words)
+     text = clean_data(text_)
+
+     final_summary = []
+     for x in range(0, len(text)-1256, 1256):
+         text_to_summarize = text[x:x+1256]
+         final_summary.append(t5.predict(text_to_summarize))
+
+     final_list = list(itertools.chain.from_iterable(final_summary))
+     final_list = ''.join(final_list)
+     return final_list

  import gradio as gr

+ iface = gr.Interface(fn=summarize,
      inputs =gr.inputs.Textbox(lines=15,placeholder="Enter your text !!"),
      outputs="text",title="Document Summarizer",description ="An AI that makes your life easier by helping you summarise long texts.")
+ iface.launch(auth=("docai","ailabs"))
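
As committed, the new summarize() refers to t5 and itertools, which are neither imported nor defined anywhere in the 40-line file, so the function cannot run as shown. The sketch below is a hedged, self-contained reconstruction of the same chunk-and-summarize flow, not the author's code: the commit never shows how t5 is built (the .predict() call hints at a simpletransformers-style wrapper), so it swaps in a Hugging Face summarization pipeline over the repository's own Shivam29rathore/finBert_10k checkpoint and assumes that checkpoint ships a compatible tokenizer; the itertools flattening is replaced by a plain string join.

# Hedged sketch only: `t5` is undefined in the commit, so a transformers
# summarization pipeline over the same checkpoint is assumed here.
import re
from transformers import pipeline

t5 = pipeline("summarization", model="Shivam29rathore/finBert_10k")

def clean_data(texts):
    # Drop newlines and trailing whitespace, then prepend the T5 task prefix
    words = [re.sub(r'\s$', '', re.sub(r'\n', '', w)) for w in texts.split()]
    return "summarize " + " ".join(words)

def summarize(word):
    text = clean_data(word)
    pieces = []
    # Walk over the cleaned text in 1256-character windows, as in the commit;
    # max(..., 1) keeps at least one window, whereas the committed
    # range(0, len(text)-1256, 1256) silently skips inputs shorter than 1256 characters.
    for start in range(0, max(len(text) - 1256, 1), 1256):
        chunk = text[start:start + 1256]
        pieces.append(t5(chunk)[0]["summary_text"])
    return " ".join(pieces)

Note that fixed 1256-character windows are kept from the commit even though they can split sentences mid-word; a token-based splitter would be the more careful choice.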