Update app.py

app.py (CHANGED)
Original version (removed lines are marked "-"; lines cut off by the diff viewer are left truncated):

@@ -1,8 +1,10 @@
 import streamlit as st
 import time
 import json
 from gensim.models import Word2Vec
 import pandas as pd
 import matplotlib.pyplot as plt
 import squarify
 import numpy as np
@@ -12,12 +14,13 @@ import random
 import plotly.express as px

 st.set_page_config(
-    page_title="
     page_icon=":microscope:",
     layout="wide",  # centered
     initial_sidebar_state="auto",
     menu_items={
-        'About': "
     }
 )
@@ -44,38 +47,70 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)

-    (lines 47-64: blank lines removed)
-st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")
-
-st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
 st.markdown("---")

 st.header(f":blue[{database_name} Pubmed corpus.]")
 text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
 query = text_input_value
 query = query.lower()
-query = re.sub("[,.?!&*;:
-
-
-
-#
 if query:
     bar = st.progress(0)
     time.sleep(.05)
@@ -86,313 +121,839 @@ if query:
     time.sleep(.1)

     # try:
-    model = Word2Vec.load(model_used)  # you can continue training with the loaded model!
     words = list(model.wv.key_to_index)
     X = model.wv[model.wv.key_to_index]
     model2 = model.wv[query]
     df = pd.DataFrame(X)

     # except:
     #     st.error("Term occurrence is too low - please try another term")
     #     st.stop()
     st.markdown("---")
-    # def findRelationships(query, df):
-
     table = model.wv.most_similar_cosmul(query, topn=10000)
     table = (pd.DataFrame(table))
     table.index.name = 'Rank'
     table.columns = ['Word', 'SIMILARITY']

-    # print()
-    # print("Similarity to " + str(query))
     pd.set_option('display.max_rows', None)
     table2 = table.copy()
-    # print(table.head(50))
-    # table.head(10).to_csv("clotting_sim1.csv", index=True)
-    # short_table = table.head(50)
-    # print(table)

-    #

     st.markdown(
-        f"<b><p style='font-family: Arial; font-size: 20px;'>
-        f"
-        f"
-        f"
         unsafe_allow_html=True)
-    (lines 124-142: blank lines removed)
-    short_table.
-    # label = short_table.index.tolist()
-    # print(short_table.index)
-    table2["SIMILARITY"] = 'Similarity Score ' + table2.head(10)["SIMILARITY"].round(2).astype(str)
-    rank_num = list(short_table.index.tolist())
-    # avg_size = sum(sizes) / len(short_table.index)
-    df = short_table
-    try:
-        # Define the `text` column for labels and `href` column for links
-        df['text'] = short_table.index
-
-        df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
-                      '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
-        df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
-
-        df.loc[:, 'database'] = database_name
-
-        # print(sizes)
-        # '{0} in {1}'.format(unicode(self.author, 'utf-8'), unicode(self.publication, 'utf-8'))
-        # Create the treemap using `px.treemap`
-        fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                          hover_name=(table2.head(value_word)['SIMILARITY']))

-    (lines 167-183: blank lines removed)
-    # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")

-    (lines 186-190: blank lines removed)
-        f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
-    # st.write(short_table)
     #

     # print()
     # print("Human genes similar to " + str(query))
     df1 = table.copy()
-    df2 = pd.read_csv('
-    m = df1.Word.isin(df2.
-    df1 = df1[m]
-    df1.rename(columns={'Word': '
-
     # print(df1.head(50))
     # print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100

-
-
-        f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
-        f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
-        unsafe_allow_html=True)
-    value_gene = st.slider("Gene", 0, 100, step=5)
-    if value_gene > 0:
-        # st.subheader(f"Top {value} genes closely related to {query}: "
-        #              f"Click on the Pubmed and NCBI links for more gene information")
-
-        st.markdown(
-            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
-            f"</span>genes similar to "
-            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and NCBI links for more gene information</span></p></b>",
-            unsafe_allow_html=True)
-
-        df10 = df1.head(value_gene).copy()
-        df10.index = (1 / df10.index)*10000
-        sizes = df10.index.tolist()
-        df10.set_index('Human Gene', inplace=True)
-
-        df3 = df1.copy()
-        df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str)
-        df3.reset_index(inplace=True)
-        df3 = df3.rename(columns={'Human Gene': 'symbol2'})
-        # Use df.query to get a subset of df1 based on ids in df2
-        subset = df3.head(value_gene).query('symbol2 in @df2.symbol2')
-        # Use merge to join the two DataFrames on id
-        result = pd.merge(subset, df2, on='symbol2')
-        # Show the result
-        # print(result)
-        # label = df10.index.tolist()
-        # df2 = df10
-        # print(df2)
-        try:
-            # Define the `text` column for labels and `href` column for links
-            df10['text'] = df10.index
-            df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
-                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']]
-            df10['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10['text']]
-
-            df10['name'] = [c for c in result['Approved name']]
-            assert isinstance(df10, object)
-            df10.loc[:, 'database'] = database_name
-
-            # print(df['name'])
-
-            # Create the treemap using `px.treemap`
-            fig = px.treemap(df10, path=[df10['text']], values=sizes,
-                             custom_data=['href', 'name', 'database', 'href2', 'text'], hover_name=(df3.head(value_gene)['SIMILARITY']))
-
-            fig.update(layout_coloraxis_showscale=False)
-            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
-            fig.update_annotations(visible=False)
-            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
-                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
-                              texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br><span "
-                                           "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
-                                           "<a href='%{customdata[0]}'>PubMed"
-                                           "</a><br><a href='%{customdata[3]}'>NCBI"
-                                           "</span></a>")
-            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
-            # # display the treemap in Streamlit
-            # with treemap2:
-
-            # st.pyplot(fig2)
-            st.plotly_chart(fig, use_container_width=True)
-
-            st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
-            st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
-
-            csv = df1.head(value_gene).to_csv().encode('utf-8')
-            st.download_button(label=f"download top {value_gene} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
-                               mime='text/csv')
-
-        except:
-            st.warning(
-                f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus, please choose a lower number")
-    st.markdown("---")

-    # st.
-    #

     # print()
     # print("Human genes similar to " + str(query))
     df1 = table.copy()
-    df2 = pd.read_csv('
-    m = df1.Word.isin(df2.
     df1 = df1[m]
-    df1.rename(columns={'Word': '
-    # print(df1)
     df_len = len(df1)
-    # df1["
     # print(df1.head(50))
     # print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100

     st.markdown(
-        f"<b><p style='font-family: Arial; font-size: 20px;'>
-        f"
-        f"
-        f"
         unsafe_allow_html=True)
-    (lines 319-343: blank lines removed)
-    #
-
-
-    #
-    (lines 348-361: blank lines removed)
-        hover_name=(
-    (lines 363-367: blank lines removed)
                       hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
-        texttemplate="<
                       "<a href='%{customdata[0]}'>PubMed"
-                      "</a><br><a href='%{customdata[2]}'>Wikipedia"
                       "</span></a>")
-    fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
-    # # display the treemap in Streamlit
-    # with treemap2:

-
-

-
-

-
-    st.download_button(label=f"download top {value_protein} proteins (csv)", data=csv, file_name=f'{database_name}_genes.csv',
-                       mime='text/csv')

-
-
     st.markdown("---")

     st.subheader("Cancer-related videos")
 if query:
-    idlist=[]
     search_keyword = {query}
     html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer")
     html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer")
@@ -418,15 +979,30 @@

     c1, c2, c3 = st.columns(3)

-
     with c1:
-
     with c2:
-
     with c3:
-
     st.markdown("---")

Updated version (added lines are marked "+"):

@@ -1,8 +1,10 @@
 import streamlit as st
 import time
+import concurrent.futures
 import json
 from gensim.models import Word2Vec
 import pandas as pd
+import threading
 import matplotlib.pyplot as plt
 import squarify
 import numpy as np
@@ -12,12 +14,13 @@ import random
 import plotly.express as px

 st.set_page_config(
+    page_title="Abstractalytics",
     page_icon=":microscope:",
     layout="wide",  # centered
     initial_sidebar_state="auto",
     menu_items={
+        'About': "Abstractalytics is a Natural Language Processing (NLP) app that harnesses Word2Vec to mine"
+                 " insight from PubMed abstracts. Created by Jimmie E. Fata, PhD"
     }
 )
@@ -44,38 +47,70 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)

+st.header(":red[*Abstractalytics*]")
+
+st.subheader("*A web app designed to explore :red[*PubMed abstracts*] for deeper understanding and fresh insights, driven "
+             "by Natural Language Processing (NLP) techniques.*")
+
+def custom_subheader(text, identifier, font_size):
+    st.markdown(f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>", unsafe_allow_html=True)
+
+custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden "
+                 "within PubMed abstracts! To begin, simply select a corpus that interests you. Next, enter a single keyword "
+                 "you wish to explore within the corpus. Abstractalytics' powerful Natural Language "
+                 "Processing (NLP) algorithms will analyze the chosen corpus and present you with a list of top words, "
+                 "genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
+                 "to your input. This advanced text-mining technique enables you to explore and understand complex "
+                 "relationships, uncovering new discoveries and connections in your field of research across a massive "
+                 "amount of abstracts. Dive in and enjoy the exploration! More oncology-related corpora coming soon.", "unique-id", 18)
+
 st.markdown("---")

+# Define the correct password
+# CORRECT_PASSWORD = "123"
+
+# Define a function to check if the password is correct
+# def authenticate(password):
+#     if password == CORRECT_PASSWORD:
+#         return True
+#     else:
+#         return False
+#
+# # Create a Streamlit input field for the password
+# password = st.text_input("Enter password:", type="password")
+#
+# # If the password is correct, show the app content
+# if authenticate(password):
+opt = st.sidebar.radio("Select a PubMed Corpus",
+                       options=(
+                           'Breast Cancer corpus', 'Lung Cancer corpus'))
+# if opt == "Clotting corpus":
+#     model_used = ("pubmed_model_clotting")
+#     num_abstracts = 45493
+#     database_name = "Clotting"
+# if opt == "Neuroblastoma corpus":
+#     model_used = ("pubmed_model_neuroblastoma")
+#     num_abstracts = 29032
+#     database_name = "Neuroblastoma"
+if opt == "Breast Cancer corpus":
+    model_used = ("pubmed_model_breast_cancer2")
+    num_abstracts = 290320
+    database_name = "Breast_cancer"
+if opt == "Lung Cancer corpus":
+    model_used = ("lung_cancer_pubmed_model")
+    num_abstracts = 210320
+    database_name = "Lung_cancer"
+
 st.header(f":blue[{database_name} Pubmed corpus.]")
 text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
 query = text_input_value
 query = query.lower()
+query = re.sub("[,.?!&*;:]", "", query)
+query = re.sub(" ", "-", query)
+# matches = [" "]
+# if any([x in query for x in matches]):
+#     st.write("Please only enter one term or a term without spaces")
+# # query = input ("Enter your keyword(s):")
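
Note: a quick sanity check of the query normalization added above (a minimal sketch; the character class and the space-to-hyphen rule come straight from the two re.sub calls, the example input is hypothetical):

    import re

    def normalize(query: str) -> str:
        query = query.lower()
        query = re.sub("[,.?!&*;:]", "", query)  # strip the listed punctuation
        query = re.sub(" ", "-", query)          # join multi-word phrases with hyphens
        return query

    print(normalize("Breast Cancer?"))  # -> "breast-cancer"
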
 if query:
     bar = st.progress(0)
     time.sleep(.05)

     time.sleep(.1)

     # try:
+    model = Word2Vec.load(f"{model_used}")  # you can continue training with the loaded model!
     words = list(model.wv.key_to_index)
     X = model.wv[model.wv.key_to_index]
+    # print(model.wv['bfgf'])
     model2 = model.wv[query]
+    # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
     df = pd.DataFrame(X)

+    def get_compound_ids(compound_names):
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            compound_ids = list(executor.map(get_compound_id, compound_names))
+        return compound_ids
+
+    import requests
+
+    def get_compound_id(compound_name):
+        url = f"http://rest.kegg.jp/find/compound/{compound_name}"
+        response = requests.get(url)
+        if response.status_code == 200:
+            result = response.text.split('\n')
+            if result[0]:
+                compound_id = result[0].split('\t')[0]
+                return compound_id
+        return None
+
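
Note: the two helpers above resolve compound names to KEGG IDs over the REST API, fanning the per-name HTTP requests out to a thread pool. A minimal usage sketch (the compound names are hypothetical; the split('\t') relies on KEGG's /find/compound endpoint returning tab-separated "id<TAB>names" lines):

    # Resolve a few names to KEGG IDs in parallel; unresolvable names come back as None.
    names = ["quercetin", "curcumin", "not-a-real-compound"]
    ids = get_compound_ids(names)
    for name, cid in zip(names, ids):
        print(name, "->", cid)
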
     # except:
     #     st.error("Term occurrence is too low - please try another term")
     #     st.stop()
     st.markdown("---")

     table = model.wv.most_similar_cosmul(query, topn=10000)
     table = (pd.DataFrame(table))
     table.index.name = 'Rank'
     table.columns = ['Word', 'SIMILARITY']

     pd.set_option('display.max_rows', None)
     table2 = table.copy()

+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
+    #             f"<span style='color:red; font-style: italic;'>words</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #             unsafe_allow_html=True)
+
+    # Set the max number of words to display
+    value_word = min(100, len(table2))

     st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
+        f"</span>words contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+        f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>",
         unsafe_allow_html=True)
+
+    short_table = table2.head(value_word).round(2)
+    short_table.index += 1
+    short_table.index = (1 / short_table.index) * 10
+    sizes = short_table.index.tolist()
+
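
Note: the three added lines above turn ranks into treemap areas: rank r maps to 10/r, so rank 1 gets area 10, rank 2 gets 5, rank 10 gets 1, and lower-ranked words shrink hyperbolically. A tiny standalone check of the same arithmetic:

    import pandas as pd

    t = pd.DataFrame({"Word": ["a", "b", "c"], "SIMILARITY": [0.9, 0.8, 0.7]})
    t.index += 1                  # ranks 1..3 instead of 0..2
    t.index = (1 / t.index) * 10  # areas 10.0, 5.0, 3.33...
    print(t.index.tolist())       # [10.0, 5.0, 3.3333333333333335]
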
+    short_table.set_index('Word', inplace=True)
+    table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
+    rank_num = list(short_table.index.tolist())
+
+    df = short_table
+    try:
+        df['text'] = short_table.index
+        df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                      '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
+        df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
+
+        df.loc[:, 'database'] = database_name
+
+        fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                          hover_name=(table2.head(value_word)['SIMILARITY']))

+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<br><span "
+                                       "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
+                                       "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
+                                       "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
+
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
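
Note: the texttemplate indexes into the custom_data list passed to px.treemap, so customdata[0] is 'href' (PubMed), [1] is 'text', and [3] is 'href2' (Wikipedia). A stripped-down sketch of the same wiring (toy data, illustrative only):

    import plotly.express as px

    data = {"text": ["alpha", "beta"], "href": ["https://example.org/a", "https://example.org/b"]}
    fig = px.treemap(data, path=["text"], values=[10, 5], custom_data=["href", "text"])
    # %{customdata[N]} pulls the N-th custom_data column into each tile's label
    fig.update_traces(texttemplate="%{customdata[1]}<br><a href='%{customdata[0]}'>link</a>")
    fig.show()
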
+
+        # st.caption(
+        #     "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+        # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
+
+        csv = table2.head(value_word).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_word} words (csv)", data=csv,
+                           file_name=f'{database_name}_words.csv', mime='text/csv')
+    except:
+        st.warning(
+            f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
+
+    # st.markdown("---")
+    # # st.write(short_table)
+    # #
+    #
+    # # print()
+    # # print("Human genes similar to " + str(query))
+    # df1 = table.copy()
+    # df2 = pd.read_csv('Human Genes.csv')
+    # m = df1.Word.isin(df2.symbol)
+    # df1 = df1[m]
+    # df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
+    # df1["Human Gene"] = df1["Human Gene"].str.upper()
+    # # print(df1.head(50))
+    # # print()
+    # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+    # # time.sleep(2)
+    # # Create the slider with increments of 5 up to 100
+    #
+    # # Set the maximum number of genes to display up to 100
+    # value_gene = min(len(df1), 100)
+    #
+    # if value_gene > 0:
+    #     # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Treemap visualization of "
+    #     #             f"<span style='color:red; font-style: italic;'>genes</span> contextually "
+    #     #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #     #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #     #             unsafe_allow_html=True)
+    #
+    #     st.markdown(
+    #         f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
+    #         f"</span>genes contextually and semantically similar to "
+    #         f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. "
+    #         f"Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
+    #         unsafe_allow_html=True)
+    #
+    #     df10 = df1.head(value_gene).copy()
+    #     df10.index = (1 / df10.index) * 100000
+    #     sizes = df10.index.tolist()
+    #     df10.set_index('Human Gene', inplace=True)
+    #
+    #     df3 = df1.copy()
+    #     df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str)
+    #     df3.reset_index(inplace=True)
+    #     df3 = df3.rename(columns={'Human Gene': 'symbol2'})
+    #     # Use df.query to get a subset of df1 based on ids in df2
+    #     subset = df3.head(value_gene).query('symbol2 in @df2.symbol2')
+    #     # Use merge to join the two DataFrames on id
+    #     result = pd.merge(subset, df2, on='symbol2')
+    #     # Show the result
+    #     # print(result)
+    #     # label = df10.index.tolist()
+    #     # df2 = df10
+    #     # print(df2)
+    #     try:
+    #         # Define the `text` column for labels and `href` column for links
+    #         df10['text'] = df10.index
+    #         df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+    #                         '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']]
+    #         df10['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df10['text']]
+    #
+    #         df10['name'] = [c for c in result['Approved name']]
+    #         assert isinstance(df10, object)
+    #         df10.loc[:, 'database'] = database_name
+    #
+    #         # print(df['name'])
+    #
+    #         # Create the treemap using `px.treemap`
+    #         fig = px.treemap(df10, path=[df10['text']], values=sizes,
+    #                          custom_data=['href', 'name', 'database', 'href2', 'text'],
+    #                          hover_name=(df3.head(value_gene)['SIMILARITY']))
+    #
+    #         fig.update(layout_coloraxis_showscale=False)
+    #         fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+    #         fig.update_annotations(visible=False)
+    #         fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+    #                           hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+    #                           texttemplate="<br><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}<br><br>"
+    #                                        "%{customdata[1]}<br><br>"
+    #                                        "<a href='%{customdata[0]}'>PubMed"
+    #                                        "</a><br><br><a href='%{customdata[3]}'>GeneCard"
+    #                                        "</span></a>")
+    #         fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
+    #         # # display the treemap in Streamlit
+    #         # with treemap2:
+    #
+    #         # st.pyplot(fig2)
+    #         st.plotly_chart(fig, use_container_width=True)
+    #
+    #         st.caption(
+    #             "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+    #         st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
+    #         st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+    #
+    #         csv = df1.head(value_gene).to_csv().encode('utf-8')
+    #         st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
+    #                            file_name=f'{database_name}_genes.csv', mime='text/csv')
+    #
+    #
+    #     except:
+    #         st.warning(f"No similar genes related to {query} within the {database_name} corpus were found.")

+    st.markdown("---")
+
+    df1 = table.copy()
+    df2 = pd.read_csv('Human Genes.csv')
+    m = df1.Word.isin(df2.symbol)
+    df1 = df1[m]
+    df1.rename(columns={'Word': 'Genes'}, inplace=True)
+    df_len = len(df1)
+    print(len(df1))
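
Note: the block above keeps only the similar words that are also HGNC gene symbols, by masking df1 against the symbol column of Human Genes.csv. The same isin-filter pattern in isolation (toy frames):

    import pandas as pd

    words = pd.DataFrame({"Word": ["brca1", "coffee", "tp53"], "SIMILARITY": [0.9, 0.5, 0.8]})
    genes = pd.DataFrame({"symbol": ["brca1", "tp53"]})
    mask = words.Word.isin(genes.symbol)  # boolean Series: [True, False, True]
    print(words[mask])                    # only the gene rows survive
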
+
+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
+    #             f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #             unsafe_allow_html=True)
+
+    # Set the number of proteins to display
+    value_gene = min(df_len, 100)
+
+    st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
+        f"</span>human genes contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name} </span>corpus. Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
+        unsafe_allow_html=True)
+
+    df11 = df1.head(value_gene).copy()
+
+    df11.index = (1 / df11.index) * 10000
+    sizes = df11.index.tolist()
+
+    df11.set_index('Genes', inplace=True)
+
+    df4 = df1.copy()
+    # print(df4.head(10))
+    df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str)
+    df4.reset_index(inplace=True)
+    # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # print(df4)
+    # # Use df.query to get a subset of df1 based on ids in df2
+    # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # Use merge to join the two DataFrames on id
+    # result = pd.merge(subset, df2b, on='symbol2')
+    # print(result)
+    if value_gene <= df_len:
+        # Define the `text` column for labels and `href` column for links
+        df11['text'] = df11.index
+        df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
+        df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
+        assert isinstance(df11, object)
+        df11['database'] = database_name
+
+        # df11['name'] = [c for c in result['Approved name']]
+
+        # Create the treemap using `px.treemap`
+        fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                         hover_name=(df4.head(value_gene)['SIMILARITY']))
+
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                       "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><br><a href='%{customdata[2]}'>GeneCard"
+                                       "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
+        # # display the treemap in Streamlit
+        # with treemap2:
+
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
+
+        # st.caption(
+        #     "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
+        # st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+        st.caption("Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+        st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
+        st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+
+        csv = df1.head(value_gene).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
+                           file_name=f'{database_name}_genes.csv', mime='text/csv')
+
+
+    else:
+        st.warning(
+            f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus, please choose a lower number")
+    st.markdown("---")
+    # print()
+    # print("Human genes similar to " + str(query))
+    df1 = table.copy()
+    df2 = pd.read_csv('kegg_drug_list_lowercase.csv')
+    m = df1.Word.isin(df2.drugs)
+    df1 = df1[m]
+    df1.rename(columns={'Word': 'Drugs'}, inplace=True)
+    df_len = len(df1)
+    # print(len(df1))
+    # df1["Human Gene"] = df1["Human Gene"].str.upper()
+    # print(df1.head(50))
+    # print()
+    # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+    # time.sleep(2)
+    # Create the slider with increments of 5 up to 100

+    # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    value_drug = min(df1.shape[0], 100)

+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #             unsafe_allow_html=True)

+    st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_drug} "
+        f"</span>Drugs contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
+        unsafe_allow_html=True)
+
+    df13 = df1.head(value_drug).copy()
+
+    df13.index = (1 / df13.index) * 10000
+    sizes = df13.index.tolist()
+
+    df13.set_index('Drugs', inplace=True)
+
+    df6 = df1.copy()
+    # print(df4.head(10))
+    df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str)
+    df6.reset_index(inplace=True)
+    # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # print(df4)
+    # # Use df.query to get a subset of df1 based on ids in df2
+    # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # Use merge to join the two DataFrames on id
+    # result = pd.merge(subset, df2b, on='symbol2')
+    # print(result)
+    if value_drug <= df_len:
+        # Define the `text` column for labels and `href` column for links
+        # Reset the index
+        df13.reset_index(inplace=True)
+
+        # Replace hyphens with spaces in the 'text' column
+        df13['Drugs'] = df13['Drugs'].str.replace('-', ' ')
+
+        # Set the 'text' column back as the index
+        df13.set_index('Drugs', inplace=True)
+        df13['text'] = df13.index
+        df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
+        df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
+        assert isinstance(df13, object)
+        df13['database'] = database_name
+
+        # df11['name'] = [c for c in result['Approved name']]
+
+        # Create the treemap using `px.treemap`
+        fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                         hover_name=(df6.head(value_drug)['SIMILARITY']))
+
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                       "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                       "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
+        # # display the treemap in Streamlit
+        # with treemap2:
+
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
+
+        st.caption(
+            "Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
+
+        csv = df1.head(value_drug).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
+                           file_name=f'{database_name}_drugs.csv', mime='text/csv')
+
+
+    else:
+        st.warning(
+            f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
     #
+    # st.markdown("---")
+    # # print()
+    # # print("Human genes similar to " + str(query))
+    # df1 = table.copy()
+    # df2 = pd.read_csv('diseasesKegg.csv')
+    # m = df1.Word.isin(df2.disease)
+    # df1 = df1[m]
+    # df1.rename(columns={'Word': 'Disease'}, inplace=True)
+    # df_len = len(df1)
+    # # print(len(df1))
+    # # df1["Human Gene"] = df1["Human Gene"].str.upper()
+    # # print(df1.head(50))
+    # # print()
+    # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+    # # time.sleep(2)
+    # # Create the slider with increments of 5 up to 100
+    #
+    # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    # value_disease = min(df1.shape[0], 100)
+    #
+    # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    # #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    # #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    # #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    # #             unsafe_allow_html=True)
+    #
+    # st.markdown(
+    #     f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_disease} "
+    #     f"</span>Diseases contextually and semantically similar to "
+    #     f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
+    #     unsafe_allow_html=True)
+    #
+    # df14 = df1.head(value_disease).copy()
+    #
+    # df14.index = (1 / df14.index) * 10000
+    # sizes = df14.index.tolist()
+    #
+    # df14.set_index('Disease', inplace=True)
+    #
+    # df7 = df1.copy()
+    # # print(df4.head(10))
+    # df7["SIMILARITY"] = 'Similarity Score ' + df7.head(value_disease)["SIMILARITY"].round(2).astype(str)
+    # df7.reset_index(inplace=True)
+    # # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # # print(df4)
+    # # # Use df.query to get a subset of df1 based on ids in df2
+    # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # # Use merge to join the two DataFrames on id
+    # # result = pd.merge(subset, df2b, on='symbol2')
+    # # print(result)
+    # if value_disease <= df_len:
+    #     # Define the `text` column for labels and `href` column for links
+    #     # Reset the index
+    #     df14.reset_index(inplace=True)
+    #
+    #     # Replace hyphens with spaces in the 'text' column
+    #     df14['Disease'] = df14['Disease'].str.replace('-', ' ')
+    #
+    #     # Set the 'text' column back as the index
+    #     df14.set_index('Disease', inplace=True)
+    #     df14['text'] = df14.index
+    #     df14['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+    #                     '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df14['text']]
+    #     df14['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df14['text']]
+    #     assert isinstance(df14, object)
+    #     df14['database'] = database_name
+    #
+    #     # df11['name'] = [c for c in result['Approved name']]
+    #
+    #     # Create the treemap using `px.treemap`
+    #     fig = px.treemap(df14, path=[df14['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+    #                      hover_name=(df7.head(value_disease)['SIMILARITY']))
+    #
+    #     fig.update(layout_coloraxis_showscale=False)
+    #     fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+    #     fig.update_annotations(visible=False)
+    #     fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+    #                       hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+    #                       texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+    #                                    "<a href='%{customdata[0]}'>PubMed"
+    #                                    "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+    #                                    "</span></a>")
+    #     fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["PaleGoldenRod"])
+    #     # # display the treemap in Streamlit
+    #     # # with treemap2:
+    #
+    #     # st.pyplot(fig2)
+    #     st.plotly_chart(fig, use_container_width=True)
+    #
+    #     st.caption("Disease designation and database provided by KEGG: https://www.genome.jp/kegg/disease/")
+    #
+    #     csv = df1.head(value_disease).to_csv().encode('utf-8')
+    #     st.download_button(label=f"download top {value_disease} diseases (csv)", data=csv,
+    #                        file_name=f'{database_name}_disease.csv', mime='text/csv')
+    #
+    #
+    # else:
+    #     st.warning(
+    #         f"This selection exceeds the number of similar diseases related to {query} within the {database_name} corpus, please choose a lower number")
+    # st.markdown("---")
+
+    # st.markdown("---")
+    # # print()
+    # # print("Human genes similar to " + str(query))
+    # df1 = table.copy()
+    # df2 = pd.read_csv('pathwaysKegg.csv')
+    # m = df1.Word.isin(df2.pathway)
+    # df1 = df1[m]
+    # df1.rename(columns={'Word': 'Pathway'}, inplace=True)
+    # df_len = len(df1)
+    # # print(len(df1))
+    # # df1["Human Gene"] = df1["Human Gene"].str.upper()
+    # # print(df1.head(50))
+    # # print()
+    # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+    # # time.sleep(2)
+    # # Create the slider with increments of 5 up to 100
+    #
+    # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    # value_pathway = min(df1.shape[0], 100)
+    #
+    # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    # #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    # #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    # #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    # #             unsafe_allow_html=True)
+    #
+    # st.markdown(
+    #     f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_pathway} "
+    #     f"</span>Pathways contextually and semantically similar to "
+    #     f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
+    #     unsafe_allow_html=True)
+    #
+    # df16 = df1.head(value_pathway).copy()
+    #
+    # df16.index = (1 / df16.index) * 10000
+    # sizes = df16.index.tolist()
+    #
+    # df16.set_index('Pathway', inplace=True)
+    #
+    # df9 = df1.copy()
+    # # print(df4.head(10))
+    # df9["SIMILARITY"] = 'Similarity Score ' + df9.head(value_pathway)["SIMILARITY"].round(2).astype(str)
+    # df9.reset_index(inplace=True)
+    # # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # # print(df4)
+    # # # Use df.query to get a subset of df1 based on ids in df2
+    # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # # Use merge to join the two DataFrames on id
+    # # result = pd.merge(subset, df2b, on='symbol2')
+    # # print(result)
+    # if value_pathway <= df_len:
+    #     # Define the `text` column for labels and `href` column for links
+    #     # Reset the index
+    #     df16.reset_index(inplace=True)
+    #
+    #     # Replace hyphens with spaces in the 'text' column
+    #     df16['Pathway'] = df16['Pathway'].str.replace('-', ' ')
+    #
+    #     # Set the 'text' column back as the index
+    #     df16.set_index('Pathway', inplace=True)
+    #     df16['text'] = df16.index
+    #     df16['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+    #                     '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df16['text']]
+    #     df16['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df16['text']]
+    #     assert isinstance(df16, object)
+    #     df16['database'] = database_name
+    #
+    #     # df11['name'] = [c for c in result['Approved name']]
+    #
+    #     # Create the treemap using `px.treemap`
+    #     fig = px.treemap(df16, path=[df16['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+    #                      hover_name=(df9.head(value_pathway)['SIMILARITY']))
+    #
+    #     fig.update(layout_coloraxis_showscale=False)
+    #     fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+    #     fig.update_annotations(visible=False)
+    #     fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+    #                       hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+    #                       texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+    #                                    "<a href='%{customdata[0]}'>PubMed"
+    #                                    "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+    #                                    "</span></a>")
+    #     fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["FloralWhite"])
+    #     # # display the treemap in Streamlit
+    #     # # with treemap2:
+    #
+    #     # st.pyplot(fig2)
+    #     st.plotly_chart(fig, use_container_width=True)
+    #
+    #     st.caption("Pathway designation and database provided by KEGG: https://www.genome.jp/kegg/pathway.html")
+    #
+    #     csv = df1.head(value_pathway).to_csv().encode('utf-8')
+    #     st.download_button(label=f"download top {value_pathway} pathways (csv)", data=csv,
+    #                        file_name=f'{database_name}_pathways.csv', mime='text/csv')
+    #
+    #
+    # else:
+    #     st.warning(
+    #         f"This selection exceeds the number of similar pathways related to {query} within the {database_name} corpus, please choose a lower number")
+    # st.markdown("---")

+    st.markdown("---")
     # print()
     # print("Human genes similar to " + str(query))
     df1 = table.copy()
+    df2 = pd.read_csv('phytochemicals.csv')
+    m = df1.Word.isin(df2.phyto)
+    df1 = df1[m]
+    df1.rename(columns={'Word': 'Phytochemical'}, inplace=True)
+    df_len = len(df1)
+    # print(len(df1))
+    # df1["Human Gene"] = df1["Human Gene"].str.upper()
     # print(df1.head(50))
     # print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100

+    # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    value_phyto = min(df1.shape[0], 100)

+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #             unsafe_allow_html=True)
+
+    st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_phyto} "
+        f"</span>Phytochemicals contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+        f"Click on the squares to expand and also the Pubmed and Wikipedia links for more compound information</span></p></b>",
+        unsafe_allow_html=True)
+
+    df15 = df1.head(value_phyto).copy()
+
+    df15.index = (1 / df15.index) * 10000
+    sizes = df15.index.tolist()
+
+    df15.set_index('Phytochemical', inplace=True)
+
+    df8 = df1.copy()
+    # print(df4.head(10))
+    df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str)
+    df8.reset_index(inplace=True)
+    # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # print(df4)
+    # # Use df.query to get a subset of df1 based on ids in df2
+    # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # Use merge to join the two DataFrames on id
+    # result = pd.merge(subset, df2b, on='symbol2')
+    # print(result)
+    if value_phyto <= df_len:
+        # Define the `text` column for labels and `href` column for links
+        # Reset the index
+        df15.reset_index(inplace=True)
+
+        # Replace hyphens with spaces in the 'text' column
+        df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ')
+
+        # Set the 'text' column back as the index
+        df15.set_index('Phytochemical', inplace=True)
+        df15['text'] = df15.index
+        df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']]
+        df15['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df15['text']]
+        assert isinstance(df15, object)
+        df15['database'] = database_name
+
+        # df11['name'] = [c for c in result['Approved name']]
+
+        # Create the treemap using `px.treemap`
+        fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                         hover_name=(df8.head(value_phyto)['SIMILARITY']))
+
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                       "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                       "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightSeaGreen"])
+        # # display the treemap in Streamlit
+        # with treemap2:
+
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
+
+        st.caption("Phytochemical designation and database provided by PhytoHub: https://phytohub.eu/")
+
+        csv = df1.head(value_phyto).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_phyto} phytochemicals (csv)", data=csv,
+                           file_name=f'{database_name}_phytochemicals.csv', mime='text/csv')
+
+
+    else:
+        st.warning(
+            f"This selection exceeds the number of similar phytochemicals related to {query} within the {database_name} corpus, please choose a lower number")
+    st.markdown("---")

     # print()
     # print("Human genes similar to " + str(query))
     df1 = table.copy()
+    df2 = pd.read_csv('kegg_compounds_lowercase.csv')
+    m = df1.Word.isin(df2.compound)
     df1 = df1[m]
+    df1.rename(columns={'Word': 'Compounds'}, inplace=True)
     df_len = len(df1)
+    # df1["Human Gene"] = df1["Human Gene"].str.upper()
     # print(df1.head(50))
     # print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100

+    # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    value_compound = min(df1.shape[0], 100)
+
+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #             unsafe_allow_html=True)
+
     st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_compound} "
+        f"</span>Compounds contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+        f"Click on the squares to expand and the Pubmed, Wikipedia, and KEGG links for more compound information (may take time to load)</span></p></b>",
         unsafe_allow_html=True)
+
+    df12 = df1.head(value_compound).copy()
+
+    df12.index = (1 / df12.index) * 10000
+    sizes = df12.index.tolist()
+
+    df12.set_index('Compounds', inplace=True)
+
+    df5 = df1.copy()
+    # print(df4.head(10))
+    df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str)
+    df5.reset_index(inplace=True)
+    # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # print(df4)
+    # # Use df.query to get a subset of df1 based on ids in df2
+    # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # Use merge to join the two DataFrames on id
+    # result = pd.merge(subset, df2b, on='symbol2')
+    # print(result)
+
+    if value_compound <= df_len:
+        # Define the `text` column for labels and `href` column for links
+        # Reset the index
+        df12.reset_index(inplace=True)
+
+        # Replace hyphens with spaces in the 'text' column
+        df12['Compounds'] = df12['Compounds'].str.replace('-', ' ')
+
+        # Set the 'text' column back as the index
+        df12.set_index('Compounds', inplace=True)
+        df12['text'] = df12.index
+        df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']]
+        df12['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df12['text']]
+        df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in get_compound_ids(df12['text'])]
+        assert isinstance(df12, object)
+        df12['database'] = database_name
+
+        # df11['name'] = [c for c in result['Approved name']]
+
+        # Create the treemap using `px.treemap`
+        fig = px.treemap(df12, path=[df12['text']], values=sizes,
+                         custom_data=['href', 'database', 'href2', 'text', 'href3'],
+                         hover_name=(df5.head(value_compound)['SIMILARITY']))
+
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                           hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                        "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                       "</a><br><br><a href='%{customdata[4]}'>KEGG Compound Page"
                                        "</span></a>")

+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightYellow"])
+        # # display the treemap in Streamlit
+        # with treemap2:

+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)

+        st.caption("Compound designation and database provided by KEGG: https://www.kegg.jp/kegg/compound/")

+        csv = df1.head(value_compound).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_compound} compounds (csv)", data=csv,
+                           file_name=f'{database_name}_compounds.csv', mime='text/csv')

+
+    else:
+        st.warning(
+            f"This selection exceeds the number of similar compounds related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")


+def save_comment(comment):
+    with open('comments.txt', 'a') as f:
+        f.write(f'{comment}\n')
+
+
+def save_comment_threaded(comment):
+    t = threading.Thread(target=save_comment, args=(comment,))
+    t.start()
+
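
Note: save_comment_threaded appends to comments.txt on a background thread so the Streamlit UI does not block on disk I/O. Concurrent submissions could interleave writes, so a module-level lock is one way to harden it (a sketch only, not part of the diff; save_comment_safe is a hypothetical name):

    import threading

    _comments_lock = threading.Lock()

    def save_comment_safe(comment: str, path: str = "comments.txt") -> None:
        # Serialize appends so two submit threads cannot interleave lines.
        with _comments_lock:
            with open(path, "a") as f:
                f.write(f"{comment}\n")
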
+
+st.title("Abstractalytics Web App")
+st.write("We appreciate your feedback!")
+
+user_comment = st.text_area("Please send us your anonymous remarks/suggestions about the Abstractalytics Web App: "
+                            "(app will pause while we save your comments)")
+
+if st.button("Submit"):
+    if user_comment:
+        save_comment_threaded(user_comment)
+        st.success("Your comment has been saved. Thank you for your feedback!")
+    else:
+        st.warning("Please enter a comment before submitting.")
+
+st.markdown("---")
+
 st.subheader("Cancer-related videos")
 if query:
+    idlist = []
     search_keyword = {query}
     html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer")
     html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer")
@@ -418,15 +979,30 @@

     c1, c2, c3 = st.columns(3)

     with c1:
+        st.video("https://www.youtube.com/watch?v=" + video_ids[0])
     with c2:
+        st.video("https://www.youtube.com/watch?v=" + video_ids[1])
     with c3:
+        st.video("https://www.youtube.com/watch?v=" + video_ids[2])
     st.markdown("---")
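
Note: the hunk elides the scraping between the urlopen calls and the columns, but video_ids is evidently a list of YouTube watch IDs harvested from those channel pages. One common way to pull IDs out of raw page HTML (a hypothetical sketch; the app's actual extraction code is not shown in this hunk):

    import re
    import urllib.request

    html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer").read().decode()
    # Watch IDs are 11 URL-safe characters following "watch?v="
    video_ids = re.findall(r"watch\?v=([\w-]{11})", html)
    print(video_ids[:3])
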

+# else:
+#     st.error("The password you entered is incorrect.")