Paula Leonova
committed on
Commit
·
7055ca6
1
Parent(s):
d855e09
Add a table for keywords for all uploaded text
Browse files
app.py
CHANGED
@@ -42,11 +42,12 @@ with st.form(key='my_form'):
|
|
42 |
|
43 |
text_csv_expander = st.expander(label=f'Want to upload multiple texts at once? Expand to upload your text files below.', expanded=False)
|
44 |
with text_csv_expander:
|
45 |
-
st.
|
|
|
46 |
uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
|
47 |
accept_multiple_files=True, key = 'text_uploader',
|
48 |
type = 'txt')
|
49 |
-
st.write("
|
50 |
uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with columns: "title" and "text"',
|
51 |
accept_multiple_files=False, key = 'csv_text_uploader',
|
52 |
type = 'csv')
|
@@ -57,12 +58,12 @@ with st.form(key='my_form'):
|
|
57 |
|
58 |
st.text("\n\n\n")
|
59 |
st.markdown("##### Step 2: Enter Labels")
|
60 |
-
labels = st.text_input('Enter possible topic labels, which can be either keywords and/or general themes (comma-separated):',input_labels, max_chars=
|
61 |
labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
|
62 |
|
63 |
labels_csv_expander = st.expander(label=f'Prefer to upload a list of labels instead? Click here to upload your CSV file.',expanded=False)
|
64 |
with labels_csv_expander:
|
65 |
-
uploaded_labels_file = st.file_uploader("
|
66 |
key='labels_uploader')
|
67 |
|
68 |
gen_keywords = st.radio(
|
@@ -72,16 +73,17 @@ with st.form(key='my_form'):
|
|
72 |
|
73 |
st.text("\n\n\n")
|
74 |
st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
|
75 |
-
glabels = st.text_input('If available, enter ground truth topic labels to evaluate results, otherwise leave blank (comma-separated):',input_glabels, max_chars=
|
76 |
glabels = list(set([x.strip() for x in glabels.strip().split(',') if len(x.strip()) > 0]))
|
77 |
|
78 |
|
79 |
glabels_csv_expander = st.expander(label=f'Have a file with labels for the text? Click here to upload your CSV file.', expanded=False)
|
80 |
with glabels_csv_expander:
|
81 |
-
st.
|
|
|
82 |
uploaded_onetext_glabels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
83 |
key = 'onetext_glabels_uploader')
|
84 |
-
st.write("
|
85 |
uploaded_multitext_glabels_file = st.file_uploader('Or Choose a CSV file with two columns "title" and "label", with the cells in the title column matching the name of the files uploaded in step #1.',
|
86 |
key = 'multitext_glabels_uploader')
|
87 |
|
@@ -116,8 +118,10 @@ if submit_button or example_button:
|
|
116 |
st.error("Enter some text to generate a summary")
|
117 |
else:
|
118 |
|
|
|
119 |
if uploaded_text_files is not None:
|
120 |
st.markdown("### Text Inputs")
|
|
|
121 |
file_names = []
|
122 |
raw_texts = []
|
123 |
for uploaded_file in uploaded_text_files:
|
@@ -125,63 +129,79 @@ if submit_button or example_button:
|
|
125 |
raw_texts.append(text)
|
126 |
title_file_name = uploaded_file.name.replace('.txt','')
|
127 |
file_names.append(title_file_name)
|
128 |
-
|
129 |
'text': raw_texts})
|
130 |
-
st.dataframe(
|
131 |
st.download_button(
|
132 |
label="Download data as CSV",
|
133 |
-
data=
|
134 |
-
file_name='
|
135 |
mime='title_text/csv',
|
136 |
)
|
|
|
137 |
|
138 |
|
139 |
with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
|
140 |
# For each body of text, create text chunks of a certain token size required for the transformer
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
keywords_list = md.keyword_gen(kw_model, text_chunk)
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
-
top_kw_df = top_kw_df.sort_values('score', ascending = False).reset_index().drop(['index'], axis=1)
|
160 |
-
st.dataframe(top_kw_df.head(10))
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
|
186 |
if len(text_input) == 0 or len(labels) == 0:
|
187 |
st.error('Enter some text and at least one possible topic to see label predictions.')
|
|
|
42 |
|
43 |
text_csv_expander = st.expander(label=f'Want to upload multiple texts at once? Expand to upload your text files below.', expanded=False)
|
44 |
with text_csv_expander:
|
45 |
+
st.markdown('##### Choose one of the options below:')
|
46 |
+
st.write("__Option A:__")
|
47 |
uploaded_text_files = st.file_uploader(label="Upload file(s) that end with the .txt suffix",
|
48 |
accept_multiple_files=True, key = 'text_uploader',
|
49 |
type = 'txt')
|
50 |
+
st.write("__Option B:__")
|
51 |
uploaded_csv_text_files = st.file_uploader(label='Upload a CSV file with columns: "title" and "text"',
|
52 |
accept_multiple_files=False, key = 'csv_text_uploader',
|
53 |
type = 'csv')
|
|
|
58 |
|
59 |
st.text("\n\n\n")
|
60 |
st.markdown("##### Step 2: Enter Labels")
|
61 |
+
labels = st.text_input('Enter possible topic labels, which can be either keywords and/or general themes (comma-separated):',input_labels, max_chars=2000)
|
62 |
labels = list(set([x.strip() for x in labels.strip().split(',') if len(x.strip()) > 0]))
|
63 |
|
64 |
labels_csv_expander = st.expander(label=f'Prefer to upload a list of labels instead? Click here to upload your CSV file.',expanded=False)
|
65 |
with labels_csv_expander:
|
66 |
+
uploaded_labels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
67 |
key='labels_uploader')
|
68 |
|
69 |
gen_keywords = st.radio(
|
|
|
73 |
|
74 |
st.text("\n\n\n")
|
75 |
st.markdown("##### Step 3: Provide Ground Truth Labels (_Optional_)")
|
76 |
+
glabels = st.text_input('If available, enter ground truth topic labels to evaluate results, otherwise leave blank (comma-separated):',input_glabels, max_chars=2000)
|
77 |
glabels = list(set([x.strip() for x in glabels.strip().split(',') if len(x.strip()) > 0]))
|
78 |
|
79 |
|
80 |
glabels_csv_expander = st.expander(label=f'Have a file with labels for the text? Click here to upload your CSV file.', expanded=False)
|
81 |
with glabels_csv_expander:
|
82 |
+
st.markdown('##### Choose one of the options below:')
|
83 |
+
st.write("__Option A:__")
|
84 |
uploaded_onetext_glabels_file = st.file_uploader("Choose a CSV file with one column and no header, where each cell is a separate label",
|
85 |
key = 'onetext_glabels_uploader')
|
86 |
+
st.write("__Option B:__")
|
87 |
uploaded_multitext_glabels_file = st.file_uploader('Or Choose a CSV file with two columns "title" and "label", with the cells in the title column matching the name of the files uploaded in step #1.',
|
88 |
key = 'multitext_glabels_uploader')
|
89 |
|
|
|
118 |
st.error("Enter some text to generate a summary")
|
119 |
else:
|
120 |
|
121 |
+
# OPTION A:
|
122 |
if uploaded_text_files is not None:
|
123 |
st.markdown("### Text Inputs")
|
124 |
+
st.write('Files concatenated into a dataframe:')
|
125 |
file_names = []
|
126 |
raw_texts = []
|
127 |
for uploaded_file in uploaded_text_files:
|
|
|
129 |
raw_texts.append(text)
|
130 |
title_file_name = uploaded_file.name.replace('.txt','')
|
131 |
file_names.append(title_file_name)
|
132 |
+
text_df = pd.DataFrame({'title': file_names,
|
133 |
'text': raw_texts})
|
134 |
+
st.dataframe(text_df.head())
|
135 |
st.download_button(
|
136 |
label="Download data as CSV",
|
137 |
+
data=text_df.to_csv().encode('utf-8'),
|
138 |
+
file_name='title_text.csv',
|
139 |
mime='title_text/csv',
|
140 |
)
|
141 |
+
# OPTION B: [TO DO: DIRECT CSV UPLOAD INSTEAD]
|
142 |
|
143 |
|
144 |
with st.spinner('Breaking up text into more reasonable chunks (transformers cannot exceed a 1024 token max)...'):
|
145 |
# For each body of text, create text chunks of a certain token size required for the transformer
|
146 |
+
|
147 |
+
text_chunks_lib = dict()
|
148 |
+
for i in range(0, len(text_df)):
|
149 |
+
nested_sentences = md.create_nest_sentences(document=text_df['text'][i], token_max_length=1024)
|
150 |
+
|
151 |
+
# For each chunk of sentences (within the token max)
|
152 |
+
text_chunks = []
|
153 |
+
for n in range(0, len(nested_sentences)):
|
154 |
+
tc = " ".join(map(str, nested_sentences[n]))
|
155 |
+
text_chunks.append(tc)
|
156 |
+
title_entry = text_df['title'][i]
|
157 |
+
text_chunks_lib[title_entry] = text_chunks
|
158 |
+
|
159 |
+
if gen_keywords == 'Yes':
|
160 |
+
st.markdown("### Top Keywords")
|
161 |
+
with st.spinner("Generating keywords from text..."):
|
162 |
+
|
163 |
+
kw_dict = dict()
|
164 |
+
for key in text_chunks_lib:
|
165 |
+
for text_chunk in text_chunks_lib[key]:
|
166 |
keywords_list = md.keyword_gen(kw_model, text_chunk)
|
167 |
+
kw_dict[key] = dict(keywords_list)
|
168 |
+
|
169 |
+
kw_df0 = pd.DataFrame.from_dict(kw_dict).reset_index()
|
170 |
+
kw_df0.rename(columns={'index': 'keyword'}, inplace=True)
|
171 |
+
kw_df = pd.melt(kw_df0, id_vars=['keyword'], var_name='title', value_name='score').dropna()
|
172 |
+
kw_df = kw_df[kw_df['score'] > 0.1][['title', 'keyword', 'score']].reset_index().drop(columns='index').sort_values(['title', 'score'], ascending=False)
|
173 |
+
st.dataframe(kw_df)
|
174 |
+
st.download_button(
|
175 |
+
label="Download data as CSV",
|
176 |
+
data=kw_df.to_csv().encode('utf-8'),
|
177 |
+
file_name='title_kewyords.csv',
|
178 |
+
mime='title_kewyords/csv',
|
179 |
+
)
|
180 |
|
|
|
|
|
181 |
|
182 |
+
st.markdown("### Summary")
|
183 |
+
with st.spinner(f'Generating summaries for {len(text_chunks)} text chunks (this may take a minute)...'):
|
184 |
+
|
185 |
+
my_expander = st.expander(label=f'Expand to see intermediate summary generation details for {len(text_chunks)} text chunks')
|
186 |
+
with my_expander:
|
187 |
+
summary = []
|
188 |
+
|
189 |
+
st.markdown("_Once the original text is broken into smaller chunks (totaling no more than 1024 tokens, \
|
190 |
+
with complete sentences), each block of text is then summarized separately using BART NLI \
|
191 |
+
and then combined at the very end to generate the final summary._")
|
192 |
+
|
193 |
+
for num_chunk, text_chunk in enumerate(text_chunks):
|
194 |
+
st.markdown(f"###### Original Text Chunk {num_chunk+1}/{len(text_chunks)}" )
|
195 |
+
st.markdown(text_chunk)
|
196 |
+
|
197 |
+
chunk_summary = md.summarizer_gen(summarizer, sequence=text_chunk, maximum_tokens = 300, minimum_tokens = 20)
|
198 |
+
summary.append(chunk_summary)
|
199 |
+
st.markdown(f"###### Partial Summary {num_chunk+1}/{len(text_chunks)}")
|
200 |
+
st.markdown(chunk_summary)
|
201 |
+
# Combine all the summaries into a list and compress into one document, again
|
202 |
+
final_summary = " \n\n".join(list(summary))
|
203 |
+
|
204 |
+
st.markdown(final_summary)
|
205 |
|
206 |
if len(text_input) == 0 or len(labels) == 0:
|
207 |
st.error('Enter some text and at least one possible topic to see label predictions.')
|