nickmuchi committed on
Commit
364cdb0
·
1 Parent(s): 8436206

Upload app.py

Application File

Files changed (1)
  1. app.py +222 -0
app.py ADDED
@@ -0,0 +1,222 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # In[1]:
+
+
+ import nltk
+ import validators, re
+ import time
+ from fake_useragent import UserAgent
+ from bs4 import BeautifulSoup
+ import streamlit as st
+ from transformers import pipeline
+ import base64
+ import requests
+ import docx2txt
+ from io import StringIO
+ from PyPDF2 import PdfFileReader
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ nltk.download('punkt')
+
+
+ # In[2]:
+
+
+ # Functions
+
+ def article_text_extractor(url: str):
+
+     '''Extract text from a url and divide the text into chunks if it is more than 500 words long'''
+
+     ua = UserAgent()
+
+     headers = {'User-Agent': str(ua.chrome)}
+
+     r = requests.get(url, headers=headers)
+
+     soup = BeautifulSoup(r.text, "html.parser")
+     title_text = soup.find_all(["h1"])
+     para_text = soup.find_all(["p"])
+     article_text = [result.text for result in para_text]
+     article_header = [result.text for result in title_text][0]
+     article = " ".join(article_text)
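+     # Insert an explicit <eos> marker after sentence-ending punctuation so the
+     # article can be split into sentences with a simple string split below.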
44
+ article = article.replace(".", ".<eos>")
45
+ article = article.replace("!", "!<eos>")
46
+ article = article.replace("?", "?<eos>")
47
+ sentences = article.split("<eos>")
48
+
49
+ current_chunk = 0
50
+ chunks = []
51
+
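+     # Greedily pack sentences into chunks, starting a new chunk once the
+     # current one would exceed 500 words.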
+     for sentence in sentences:
+         if len(chunks) == current_chunk + 1:
+             if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
+                 chunks[current_chunk].extend(sentence.split(" "))
+             else:
+                 current_chunk += 1
+                 chunks.append(sentence.split(" "))
+         else:
+             chunks.append(sentence.split(" "))
+
+     for chunk_id in range(len(chunks)):
+         chunks[chunk_id] = " ".join(chunks[chunk_id])
+
+     return article_header, chunks
+
+ def preprocess_plain_text(x):
+
+     x = x.encode("ascii", "ignore").decode()  # drop non-ascii characters
+     x = re.sub(r"https*\S+", " ", x)  # urls
+     x = re.sub(r"@\S+", " ", x)  # mentions
+     x = re.sub(r"#\S+", " ", x)  # hashtags
+     x = re.sub(r"\s{2,}", " ", x)  # extra whitespace
+     x = re.sub("[^.,!?A-Za-z0-9]+", " ", x)  # special characters except .,!?
+
+     return x
+
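+ # Note: this uses PyPDF2's legacy (pre-2.0) reader API; newer releases renamed
+ # PdfFileReader/getPage/extractText to PdfReader/pages/extract_text.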
+ def extract_pdf(file):
+
+     '''Extract text from a PDF file'''
+
+     pdfReader = PdfFileReader(file)
+     count = pdfReader.numPages
+     all_text = ""
+     for i in range(count):
+         page = pdfReader.getPage(i)
+         all_text += page.extractText()
+
+     return all_text
+
+
+ def extract_text_from_file(file):
+
+     '''Extract text from an uploaded file'''
+
+     # read text file
+     if file.type == "text/plain":
+         # convert to a string-based IO
+         stringio = StringIO(file.getvalue().decode("utf-8"))
+
+         # read file as a string
+         file_text = stringio.read()
+
+     # read pdf file
+     elif file.type == "application/pdf":
+         file_text = extract_pdf(file)
+
+     # read docx file
+     elif (
+         file.type
+         == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+     ):
+         file_text = docx2txt.process(file)
+
+     return file_text
+
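+ # Serve the summary as a base64-encoded data URI so it can be downloaded
+ # through a plain HTML link rendered by st.markdown.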
+ def summary_downloader(raw_text):
+
+     timestr = time.strftime("%Y%m%d-%H%M%S")  # timestamp to make the filename unique
+     b64 = base64.b64encode(raw_text.encode()).decode()
+     new_filename = "new_text_file_{}_.txt".format(timestr)
+     st.markdown("#### Download Summary as a File ####")
+     href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!</a>'
+     st.markdown(href, unsafe_allow_html=True)
+
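+ # Cache the pipeline so the model is loaded only once, not on every
+ # Streamlit script rerun.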
+ @st.cache(allow_output_mutation=True)
+ def pipeline_model():
+
+     summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
+     return summarizer
+
+ # Streamlit App
+
+ st.title("Article Text and Link Summarizer using the Facebook BART-large-CNN Transformer Model 📝")
+
+ st.markdown(
+     "Model Source: [Facebook-Bart-large-CNN](https://huggingface.co/facebook/bart-large-cnn)"
+ )
+
+ st.markdown(
+     """The app performs abstractive summarization, generating a concise summary that captures the salient information in the document.
+ For documents or text longer than 500 words, the app divides the text into chunks and summarizes each chunk.
+ Please note that longer documents will take more time to summarize."""
+ )
+
+ st.markdown(
+     "The app accepts the following input formats:"
+ )
+ st.markdown(
+     """- Raw text entered in the text box
+ - URL of an article to be summarized
+ - Documents in .txt, .pdf or .docx file format"""
+ )
+
+ st.markdown("---")
+
+ url_text = st.text_input("Please enter a URL here")
+
+
+ st.markdown(
+     "<h3 style='text-align: center; color: red;'>OR</h3>",
+     unsafe_allow_html=True,
+ )
+
+ plain_text = st.text_input("Please paste/enter plain text here")
+
+ st.markdown(
+     "<h3 style='text-align: center; color: red;'>OR</h3>",
+     unsafe_allow_html=True,
+ )
+
+ upload_doc = st.file_uploader(
+     "Upload a .txt, .pdf or .docx file for summarization"
+ )
+
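+ # A valid URL takes precedence over an uploaded file, which in turn takes
+ # precedence over pasted plain text.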
+ is_url = validators.url(url_text)
+
+ if is_url:
+     # article title and chunks of text to summarize (for long articles)
+     article_title, chunks = article_text_extractor(url=url_text)
+
+ elif upload_doc:
+     clean_text = preprocess_plain_text(extract_text_from_file(upload_doc))
+
+ else:
+     clean_text = preprocess_plain_text(plain_text)
+
+ if is_url:
+     # show the extracted article title
+     st.markdown(f"Article title: {article_title}")
+
+ summarize = st.button("Summarize")
+
+ # runs when the [Summarize] button is clicked
+ if summarize:
+     if is_url:
+         text_to_summarize = chunks
+     else:
+         text_to_summarize = clean_text
+
+     # run the summarization pipeline
+     with st.spinner(
+         text="Extracting summary. This might take a few seconds depending on the length of your document/text ..."
+     ):
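+         # max_length/min_length bound the generated summary (in tokens) for
+         # each chunk; the per-chunk summaries are then joined into one output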
+         summarizer_model = pipeline_model()
+         summarized_text = summarizer_model(text_to_summarize, max_length=100, min_length=30)
+         summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])
+
+     # final summarized output
+     st.subheader("Summarized text")
+     st.info(summarized_text)
+
+     summary_downloader(summarized_text)
+
+
+ # In[ ]:
+
+
+