Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,8 @@ import streamlit as st
|
|
2 |
import time
|
3 |
import concurrent.futures
|
4 |
import json
|
|
|
|
|
5 |
from gensim.models import Word2Vec
|
6 |
import pandas as pd
|
7 |
import threading
|
@@ -12,7 +14,7 @@ import re
|
|
12 |
import urllib.request
|
13 |
import random
|
14 |
import plotly.express as px
|
15 |
-
|
16 |
from streamlit.components.v1 import html
|
17 |
|
18 |
st.set_page_config(page_title="OncoDigger", page_icon=":microscope:", layout="wide", # centered
|
@@ -162,7 +164,6 @@ if query:
|
|
162 |
# print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
|
163 |
df = pd.DataFrame(X)
|
164 |
|
165 |
-
|
166 |
def get_compound_ids(compound_names):
|
167 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
168 |
compound_ids = list(executor.map(get_compound_id, compound_names))
|
@@ -197,8 +198,6 @@ if query:
|
|
197 |
pd.set_option('display.max_rows', None)
|
198 |
table2 = table.copy()
|
199 |
|
200 |
-
|
201 |
-
|
202 |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
|
203 |
# f"<span style='color:red; font-style: italic;'>words</span> contextually "
|
204 |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
@@ -208,6 +207,58 @@ if query:
|
|
208 |
# Set the max number of words to display
|
209 |
value_word = min(100, len(table2))
|
210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
st.markdown(
|
212 |
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
|
213 |
f"</span>words contextually and semantically similar to "
|
@@ -265,106 +316,62 @@ if query:
|
|
265 |
except KeyError:
|
266 |
st.warning(
|
267 |
"This word is not found in the corpus, it could be because it is not spelled correctly or could be that it does not have enough representation within the corpus, please try again")
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
#
|
272 |
-
#
|
273 |
-
#
|
274 |
-
#
|
275 |
-
#
|
276 |
-
#
|
277 |
-
#
|
278 |
-
#
|
279 |
-
# df1["Human Gene"] = df1["Human Gene"].str.upper()
|
280 |
-
# # print(df1.head(50))
|
281 |
-
# # print()
|
282 |
-
# # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
|
283 |
-
# # time.sleep(2)
|
284 |
-
# # Create the slider with increments of 5 up to 100
|
285 |
-
#
|
286 |
-
# # Set the maximum number of genes to display up to 100
|
287 |
-
# value_gene = min(len(df1), 100)
|
288 |
-
#
|
289 |
-
# if value_gene > 0:
|
290 |
-
# # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Treemap visualization of "
|
291 |
-
# # f"<span style='color:red; font-style: italic;'>genes</span> contextually "
|
292 |
-
# # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
293 |
-
# # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
|
294 |
-
# # unsafe_allow_html=True)
|
295 |
-
#
|
296 |
-
# st.markdown(
|
297 |
-
# f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
|
298 |
-
# f"</span>genes contextually and semantically similar to "
|
299 |
-
# f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. "
|
300 |
-
# f"Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
|
301 |
-
# unsafe_allow_html=True)
|
302 |
#
|
303 |
-
#
|
304 |
-
#
|
305 |
-
#
|
306 |
-
#
|
307 |
#
|
308 |
-
# df3 = df1.copy()
|
309 |
-
# df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str)
|
310 |
-
# df3.reset_index(inplace=True)
|
311 |
-
# df3 = df3.rename(columns={'Human Gene': 'symbol2'})
|
312 |
-
# # Use df.query to get a subset of df1 based on ids in df2
|
313 |
-
# subset = df3.head(value_gene).query('symbol2 in @df2.symbol2')
|
314 |
-
# # Use merge to join the two DataFrames on id
|
315 |
-
# result = pd.merge(subset, df2, on='symbol2')
|
316 |
-
# # Show the result
|
317 |
-
# # print(result)
|
318 |
-
# # label = df10.index.tolist()
|
319 |
-
# # df2 = df10
|
320 |
-
# # print(df2)
|
321 |
-
# try:
|
322 |
-
# # Define the `text` column for labels and `href` column for links
|
323 |
-
# df10['text'] = df10.index
|
324 |
-
# df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
325 |
-
# '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']]
|
326 |
-
# df10['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df10['text']]
|
327 |
#
|
328 |
-
#
|
329 |
-
#
|
330 |
-
#
|
331 |
#
|
332 |
-
#
|
|
|
333 |
#
|
334 |
-
#
|
335 |
-
#
|
336 |
-
#
|
337 |
-
#
|
338 |
#
|
339 |
-
#
|
340 |
-
#
|
341 |
-
#
|
342 |
-
#
|
343 |
-
#
|
344 |
-
# texttemplate="<br><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}<br><br>"
|
345 |
-
# "%{customdata[1]}<br><br>"
|
346 |
-
# "<a href='%{customdata[0]}'>PubMed"
|
347 |
-
# "</a><br><br><a href='%{customdata[3]}'>GeneCard"
|
348 |
-
# "</span></a>")
|
349 |
-
# fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
|
350 |
-
# # # display the treemap in Streamlit
|
351 |
-
# # with treemap2:
|
352 |
#
|
353 |
-
#
|
354 |
-
#
|
|
|
355 |
#
|
356 |
-
#
|
357 |
-
#
|
358 |
-
#
|
359 |
-
#
|
360 |
#
|
361 |
-
#
|
362 |
-
#
|
363 |
-
#
|
|
|
|
|
364 |
#
|
|
|
|
|
|
|
365 |
#
|
366 |
-
#
|
367 |
-
# st.warning(f"No similar genes related to {query} within the {database_name} corpus were found.")
|
368 |
|
369 |
st.markdown("---")
|
370 |
|
@@ -375,7 +382,7 @@ if query:
|
|
375 |
df1 = df1[m]
|
376 |
df1.rename(columns={'Word': 'Genes'}, inplace=True)
|
377 |
df_len = len(df1)
|
378 |
-
print(len(df1))
|
379 |
|
380 |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
|
381 |
# f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
|
@@ -457,6 +464,70 @@ if query:
|
|
457 |
else:
|
458 |
st.warning(
|
459 |
f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
460 |
st.markdown("---")
|
461 |
# print()
|
462 |
# print("Human genes similar to " + str(query))
|
@@ -476,7 +547,7 @@ if query:
|
|
476 |
|
477 |
# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
|
478 |
value_drug = min(df1.shape[0], 100)
|
479 |
-
|
480 |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
|
481 |
# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
|
482 |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
@@ -556,208 +627,65 @@ if query:
|
|
556 |
else:
|
557 |
st.warning(
|
558 |
f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
|
559 |
-
|
560 |
-
#
|
561 |
-
#
|
562 |
-
#
|
563 |
-
#
|
564 |
-
#
|
565 |
-
#
|
566 |
-
#
|
567 |
-
#
|
568 |
-
#
|
569 |
-
#
|
570 |
-
#
|
571 |
-
# # df1["Human Gene"] = df1["Human Gene"].str.upper()
|
572 |
-
# # print(df1.head(50))
|
573 |
-
# # print()
|
574 |
-
# # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
|
575 |
-
# # time.sleep(2)
|
576 |
-
# # Create the slider with increments of 5 up to 100
|
577 |
-
#
|
578 |
-
# # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
|
579 |
-
# value_disease = min(df1.shape[0], 100)
|
580 |
-
#
|
581 |
-
# # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
|
582 |
-
# # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
|
583 |
-
# # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
584 |
-
# # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
|
585 |
-
# # unsafe_allow_html=True)
|
586 |
-
#
|
587 |
-
# st.markdown(
|
588 |
-
# f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_disease} "
|
589 |
-
# f"</span>Diseases contextually and semantically similar to "
|
590 |
-
# f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
|
591 |
-
# unsafe_allow_html=True)
|
592 |
-
#
|
593 |
-
# df14 = df1.head(value_disease).copy()
|
594 |
-
#
|
595 |
-
# df14.index = (1 / df14.index) * 10000
|
596 |
-
# sizes = df14.index.tolist()
|
597 |
-
#
|
598 |
-
# df14.set_index('Disease', inplace=True)
|
599 |
-
#
|
600 |
-
# df7 = df1.copy()
|
601 |
-
# # print(df4.head(10))
|
602 |
-
# df7["SIMILARITY"] = 'Similarity Score ' + df7.head(value_disease)["SIMILARITY"].round(2).astype(str)
|
603 |
-
# df7.reset_index(inplace=True)
|
604 |
-
# # df4 = df4.rename(columns={'Protein': 'symbol2'})
|
605 |
-
# # print(df4)
|
606 |
-
# # # Use df.query to get a subset of df1 based on ids in df2
|
607 |
-
# # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
|
608 |
-
# # # Use merge to join the two DataFrames on id
|
609 |
-
# # result = pd.merge(subset, df2b, on='symbol2')
|
610 |
-
# # print(result)
|
611 |
-
# if value_disease <= df_len:
|
612 |
-
# # Define the `text` column for labels and `href` column for links
|
613 |
-
# # Reset the index
|
614 |
-
# df14.reset_index(inplace=True)
|
615 |
-
#
|
616 |
-
# # Replace hyphens with spaces in the 'text' column
|
617 |
-
# df14['Disease'] = df14['Disease'].str.replace('-', ' ')
|
618 |
-
#
|
619 |
-
# # Set the 'text' column back as the index
|
620 |
-
# df14.set_index('Disease', inplace=True)
|
621 |
-
# df14['text'] = df14.index
|
622 |
-
# df14['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
623 |
-
# '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df14['text']]
|
624 |
-
# df14['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df14['text']]
|
625 |
-
# assert isinstance(df14, object)
|
626 |
-
# df14['database'] = database_name
|
627 |
-
#
|
628 |
-
# # df11['name'] = [c for c in result['Approved name']]
|
629 |
-
#
|
630 |
-
# # Create the treemap using `px.treemap`
|
631 |
-
# fig = px.treemap(df14, path=[df14['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
|
632 |
-
# hover_name=(df7.head(value_disease)['SIMILARITY']))
|
633 |
-
#
|
634 |
-
# fig.update(layout_coloraxis_showscale=False)
|
635 |
-
# fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
636 |
-
# fig.update_annotations(visible=False)
|
637 |
-
# fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
638 |
-
# hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
|
639 |
-
# texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
|
640 |
-
# "<a href='%{customdata[0]}'>PubMed"
|
641 |
-
# "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
|
642 |
-
# "</span></a>")
|
643 |
-
# fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["PaleGoldenRod"])
|
644 |
-
# # # display the treemap in Streamlit
|
645 |
-
# # with treemap2:
|
646 |
-
#
|
647 |
-
# # st.pyplot(fig2)
|
648 |
-
# st.plotly_chart(fig, use_container_width=True)
|
649 |
-
#
|
650 |
-
# st.caption("Disease designation and database provided by KEGG: https://www.genome.jp/kegg/disease/")
|
651 |
-
#
|
652 |
-
# csv = df1.head(value_disease).to_csv().encode('utf-8')
|
653 |
-
# st.download_button(label=f"download top {value_disease} diseases (csv)", data=csv,
|
654 |
-
# file_name=f'{database_name}_disease.csv', mime='text/csv')
|
655 |
-
#
|
656 |
#
|
657 |
-
# else:
|
658 |
-
# st.warning(
|
659 |
-
# f"This selection exceeds the number of similar diseases related to {query} within the {database_name} corpus, please choose a lower number")
|
660 |
-
# st.markdown("---")
|
661 |
-
|
662 |
-
# st.markdown("---")
|
663 |
-
# # print()
|
664 |
-
# # print("Human genes similar to " + str(query))
|
665 |
-
# df1 = table.copy()
|
666 |
-
# df2 = pd.read_csv('pathwaysKegg.csv')
|
667 |
-
# m = df1.Word.isin(df2.pathway)
|
668 |
-
# df1 = df1[m]
|
669 |
-
# df1.rename(columns={'Word': 'Pathway'}, inplace=True)
|
670 |
-
# df_len = len(df1)
|
671 |
-
# # print(len(df1))
|
672 |
-
# # df1["Human Gene"] = df1["Human Gene"].str.upper()
|
673 |
-
# # print(df1.head(50))
|
674 |
-
# # print()
|
675 |
-
# # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
|
676 |
-
# # time.sleep(2)
|
677 |
-
# # Create the slider with increments of 5 up to 100
|
678 |
-
#
|
679 |
-
# # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
|
680 |
-
# value_pathway = min(df1.shape[0], 100)
|
681 |
-
#
|
682 |
-
# # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
|
683 |
-
# # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
|
684 |
-
# # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
685 |
-
# # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
|
686 |
-
# # unsafe_allow_html=True)
|
687 |
-
#
|
688 |
-
# st.markdown(
|
689 |
-
# f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_pathway} "
|
690 |
-
# f"</span>Pathways contextually and semantically similar to "
|
691 |
-
# f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
|
692 |
-
# unsafe_allow_html=True)
|
693 |
-
#
|
694 |
-
# df16 = df1.head(value_pathway).copy()
|
695 |
-
#
|
696 |
-
# df16.index = (1 / df16.index) * 10000
|
697 |
-
# sizes = df16.index.tolist()
|
698 |
-
#
|
699 |
-
# df16.set_index('Pathway', inplace=True)
|
700 |
#
|
701 |
-
#
|
702 |
-
#
|
703 |
-
#
|
704 |
-
# df9.reset_index(inplace=True)
|
705 |
-
# # df4 = df4.rename(columns={'Protein': 'symbol2'})
|
706 |
-
# # print(df4)
|
707 |
-
# # # Use df.query to get a subset of df1 based on ids in df2
|
708 |
-
# # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
|
709 |
-
# # # Use merge to join the two DataFrames on id
|
710 |
-
# # result = pd.merge(subset, df2b, on='symbol2')
|
711 |
-
# # print(result)
|
712 |
-
# if value_pathway <= df_len:
|
713 |
-
# # Define the `text` column for labels and `href` column for links
|
714 |
-
# # Reset the index
|
715 |
-
# df16.reset_index(inplace=True)
|
716 |
#
|
717 |
-
#
|
718 |
-
#
|
|
|
719 |
#
|
720 |
-
#
|
721 |
-
#
|
722 |
-
# df16['text'] = df16.index
|
723 |
-
# df16['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
724 |
-
# '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df16['text']]
|
725 |
-
# df16['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df16['text']]
|
726 |
-
# assert isinstance(df16, object)
|
727 |
-
# df16['database'] = database_name
|
728 |
#
|
729 |
-
#
|
|
|
|
|
|
|
730 |
#
|
731 |
-
#
|
732 |
-
#
|
733 |
-
#
|
|
|
|
|
|
|
734 |
#
|
735 |
-
#
|
736 |
-
#
|
737 |
-
#
|
738 |
-
#
|
739 |
-
#
|
740 |
-
# texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
|
741 |
-
# "<a href='%{customdata[0]}'>PubMed"
|
742 |
-
# "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
|
743 |
-
# "</span></a>")
|
744 |
-
# fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["FloralWhite"])
|
745 |
-
# # # display the treemap in Streamlit
|
746 |
-
# # with treemap2:
|
747 |
#
|
748 |
-
#
|
749 |
-
#
|
|
|
|
|
750 |
#
|
751 |
-
#
|
|
|
|
|
|
|
|
|
752 |
#
|
753 |
-
#
|
754 |
-
#
|
755 |
-
#
|
756 |
#
|
757 |
-
#
|
758 |
-
# else:
|
759 |
-
# st.warning(
|
760 |
-
# f"This selection exceeds the number of similar pathways related to {query} within the {database_name} corpus, please choose a lower number")
|
761 |
# st.markdown("---")
|
762 |
|
763 |
st.markdown("---")
|
@@ -860,8 +788,70 @@ if query:
|
|
860 |
else:
|
861 |
st.warning(
|
862 |
f"This selection exceeds the number of similar pythochemicals related to {query} within the {database_name} corpus, please choose a lower number")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
863 |
st.markdown("---")
|
864 |
|
|
|
865 |
# print()
|
866 |
# print("Human genes similar to " + str(query))
|
867 |
df1 = table.copy()
|
@@ -966,8 +956,65 @@ if query:
|
|
966 |
else:
|
967 |
st.warning(
|
968 |
f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
|
969 |
-
st.markdown("---")
|
970 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
971 |
# import os
|
972 |
|
973 |
# from datasets import Dataset
|
@@ -1088,6 +1135,7 @@ if query:
|
|
1088 |
5. [Word2Vec: How to Implement Word2Vec in Python](https://www.youtube.com/watch?v=ISPId9Lhc1g&t=6s) - A YouTube video by Data Talks demonstrating how to implement Word2Vec in Python using the Gensim library.
|
1089 |
""")
|
1090 |
|
|
|
1091 |
# else:
|
1092 |
# st.error("The password you entered is incorrect.")
|
1093 |
|
|
|
2 |
import time
|
3 |
import concurrent.futures
|
4 |
import json
|
5 |
+
|
6 |
+
# import tensorflow
|
7 |
from gensim.models import Word2Vec
|
8 |
import pandas as pd
|
9 |
import threading
|
|
|
14 |
import urllib.request
|
15 |
import random
|
16 |
import plotly.express as px
|
17 |
+
import plotly.graph_objs as go
|
18 |
from streamlit.components.v1 import html
|
19 |
|
20 |
st.set_page_config(page_title="OncoDigger", page_icon=":microscope:", layout="wide", # centered
|
|
|
164 |
# print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
|
165 |
df = pd.DataFrame(X)
|
166 |
|
|
|
167 |
def get_compound_ids(compound_names):
|
168 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
169 |
compound_ids = list(executor.map(get_compound_id, compound_names))
|
|
|
198 |
pd.set_option('display.max_rows', None)
|
199 |
table2 = table.copy()
|
200 |
|
|
|
|
|
201 |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
|
202 |
# f"<span style='color:red; font-style: italic;'>words</span> contextually "
|
203 |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
|
|
207 |
# Set the max number of words to display
|
208 |
value_word = min(100, len(table2))
|
209 |
|
210 |
+
try:
|
211 |
+
value_word = min(50, len(table2))
|
212 |
+
# Get the 10000 words most similar to the query (matches topn below)
|
213 |
+
top_words = model.wv.most_similar_cosmul(query, topn=10000)
|
214 |
+
words = [word for word, sim in top_words]
|
215 |
+
words = [word.replace(' ', '-') for word in words]
|
216 |
+
sims = [sim for word, sim in top_words]
|
217 |
+
X_top = model.wv[words]
|
218 |
+
|
219 |
+
# Reuse the similarity scores returned by most_similar_cosmul as the color values
|
220 |
+
sims_query_top = sims # print(sims_query_top)
|
221 |
+
except Exception as e:
|
222 |
+
print("Error:", e)
|
223 |
+
|
224 |
+
# Generate a 2D scatter plot of word embeddings using Plotly
|
225 |
+
fig = px.scatter(x=X_top[:, 0], y=X_top[:, 1], color=sims_query_top, color_continuous_scale="RdYlGn", )
|
226 |
+
|
227 |
+
# Set the plot background color to light cyan (#CCFFFF)
|
228 |
+
fig.update_layout(plot_bgcolor='#CCFFFF')
|
229 |
+
|
230 |
+
# Draw the grid lines in light cyan (#CCFFFF) and the axis text in blue
|
231 |
+
fig.update_layout(xaxis=dict(gridcolor='#CCFFFF', color='blue'),
|
232 |
+
yaxis=dict(gridcolor='#CCFFFF', color='blue'))
|
233 |
+
|
234 |
+
# fig.update_traces(hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
|
235 |
+
fig.update_layout(title=dict(
|
236 |
+
text=f"Top 10000 words in an interactive embedding map for {query} in {database_name} PubMed corpus"
|
237 |
+
f": Zoom in to the black diamond to find {query}", x=0.5, y=1, xanchor='center', yanchor='top',
|
238 |
+
font=dict(color='black')))
|
239 |
+
fig.update_coloraxes(colorbar_title="Similarity with query")
|
240 |
+
|
241 |
+
# Represent query as a large black diamond
|
242 |
+
fig.add_trace(go.Scatter(x=[model.wv[query][0]], y=[model.wv[query][1]], mode='markers',
|
243 |
+
marker=dict(size=12, color='black', symbol='diamond'), name=query, hovertext=query,
|
244 |
+
showlegend=False))
|
245 |
+
|
246 |
+
# Add label for the query above the diamond
|
247 |
+
fig.add_trace(go.Scatter(x=[model.wv[query][0]], y=[model.wv[query][1]], mode='text', text=[query],
|
248 |
+
textposition='top right', textfont=dict(color='blue', size=10), hoverinfo='none',
|
249 |
+
showlegend=False))
|
250 |
+
|
251 |
+
# Add circles for all of the retrieved similar words
|
252 |
+
fig.add_trace(go.Scatter(x=X_top[:, 0], y=X_top[:, 1], mode='markers',
|
253 |
+
marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
|
254 |
+
text=words, customdata=sims, name=''))
|
255 |
+
|
256 |
+
fig.update(layout_coloraxis_showscale=True)
|
257 |
+
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
258 |
+
fig.update_annotations(visible=False)
|
259 |
+
|
260 |
+
st.plotly_chart(fig, use_container_width=True)
|
261 |
+
|
262 |
st.markdown(
|
263 |
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
|
264 |
f"</span>words contextually and semantically similar to "
|
|
|
316 |
except KeyError:
|
317 |
st.warning(
|
318 |
"This word is not found in the corpus, it could be because it is not spelled correctly or could be that it does not have enough representation within the corpus, please try again")
|
319 |
+
|
320 |
+
|
321 |
+
|
322 |
+
# try:
|
323 |
+
# value_word = min(50, len(table2))
|
324 |
+
# # Get the top 50 similar words to the query
|
325 |
+
# top_words = model.wv.most_similar_cosmul(query, topn=value_word)
|
326 |
+
# words = [word for word, sim in top_words]
|
327 |
+
# words = [word.replace(' ', '-') for word in words]
|
328 |
+
# sims = [sim for word, sim in top_words]
|
329 |
+
# X_top = model.wv[words]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
330 |
#
|
331 |
+
# # Compute similarities between query and top 100 words
|
332 |
+
# sims_query_top = sims # print(sims_query_top)
|
333 |
+
# except Exception as e:
|
334 |
+
# print("Error:", e)
|
335 |
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
336 |
#
|
337 |
+
# # Generate a 3D scatter plot of word embeddings using Plotly
|
338 |
+
# fig = px.scatter_3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], text=words, color=sims_query_top,
|
339 |
+
# color_continuous_scale="RdYlGn", hover_name=words, hover_data={"color": sims_query_top})
|
340 |
#
|
341 |
+
# # Change background color to black
|
342 |
+
# fig.update_layout(scene=dict(bgcolor='#CCFFFF'))
|
343 |
#
|
344 |
+
# # Change color of text to white
|
345 |
+
# fig.update_layout(scene=dict(xaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
346 |
+
# yaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
347 |
+
# zaxis=dict(backgroundcolor='#CCFFFF', color='blue')))
|
348 |
#
|
349 |
+
# fig.update_traces(hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
|
350 |
+
# fig.update_layout(title=dict(text=f"Word embedding map for {query} in {database_name} PubMed corpus", x=0.5, y=0.95,
|
351 |
+
# xanchor='center', yanchor='top', font=dict(color='black')),
|
352 |
+
# scene=dict(xaxis_title="Dimension 1", yaxis_title="Dimension 2", zaxis_title="Dimension 3"))
|
353 |
+
# fig.update_coloraxes(colorbar_title="Similarity with query")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
354 |
#
|
355 |
+
# # Represent query as a large red diamond
|
356 |
+
# fig.add_trace(go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='markers',
|
357 |
+
# marker=dict(size=7, color='black', symbol='diamond'), name=query, hovertext=query, showlegend=False))
|
358 |
#
|
359 |
+
# # Add label for the query above the diamond
|
360 |
+
# fig.add_trace(
|
361 |
+
# go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='text', text=[query],
|
362 |
+
# textposition='bottom center', textfont=dict(color='blue', size=10), hoverinfo='none', showlegend=False))
|
363 |
#
|
364 |
+
# # Add circles for the top 50 similar words
|
365 |
+
# fig.add_trace(go.Scatter3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], mode='markers',
|
366 |
+
# marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
|
367 |
+
# hovertemplate='<b>%{text}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>',
|
368 |
+
# text=words, customdata=sims, name=''))
|
369 |
#
|
370 |
+
# fig.update(layout_coloraxis_showscale=True)
|
371 |
+
# fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
372 |
+
# fig.update_annotations(visible=False)
|
373 |
#
|
374 |
+
# st.plotly_chart(fig, use_container_width=True)
|
|
|
375 |
|
376 |
st.markdown("---")
|
377 |
|
|
|
382 |
df1 = df1[m]
|
383 |
df1.rename(columns={'Word': 'Genes'}, inplace=True)
|
384 |
df_len = len(df1)
|
385 |
+
# print(len(df1))
|
386 |
|
387 |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
|
388 |
# f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
|
|
|
464 |
else:
|
465 |
st.warning(
|
466 |
f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
|
467 |
+
|
468 |
+
try:
|
469 |
+
# Get the top 50 similar genes to the query
|
470 |
+
value_gene = min(df_len, 50)
|
471 |
+
top_words = model.wv.most_similar_cosmul(query, topn=value_gene)
|
472 |
+
words = df11.head(value_gene).index
|
473 |
+
words = [word.replace(' ', '-') for word in words]
|
474 |
+
# print(words)
|
475 |
+
sims = df4.head(value_gene)["SIMILARITY"].tolist()
|
476 |
+
# print(sims)
|
477 |
+
X_top = model.wv[words] # print(X_top)
|
478 |
+
except Exception as e:
|
479 |
+
print("Error:", e)
|
480 |
+
|
481 |
+
|
482 |
+
# Remove the text "Similarity Score" from each element in the sims list
|
483 |
+
sims_query_top = [float(sim.split()[-1]) for sim in sims]
|
484 |
+
# print(sims_query_top)
|
485 |
+
|
486 |
+
# Generate a 3D scatter plot of word embeddings using Plotly
|
487 |
+
fig2 = px.scatter_3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], text=words, color=sims_query_top,
|
488 |
+
color_continuous_scale="RdYlGn", hover_name=words,
|
489 |
+
hover_data={"color": sims_query_top})
|
490 |
+
|
491 |
+
# Set the 3D scene background color to light cyan (#CCFFFF)
|
492 |
+
fig2.update_layout(scene=dict(bgcolor='#CCFFFF'))
|
493 |
+
|
494 |
+
# Give each axis a light cyan (#CCFFFF) background and blue text
|
495 |
+
fig2.update_layout(scene=dict(xaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
496 |
+
yaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
497 |
+
zaxis=dict(backgroundcolor='#CCFFFF', color='blue')))
|
498 |
+
|
499 |
+
fig2.update_traces(
|
500 |
+
hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
|
501 |
+
fig2.update_layout(
|
502 |
+
title=dict(text=f"Word embedding map for {query} in {database_name} PubMed corpus", x=0.5, y=0.95,
|
503 |
+
xanchor='center', yanchor='top', font=dict(color='black')),
|
504 |
+
scene=dict(xaxis_title="Dimension 1", yaxis_title="Dimension 2", zaxis_title="Dimension 3"))
|
505 |
+
fig2.update_coloraxes(colorbar_title="Similarity with query")
|
506 |
+
|
507 |
+
# Represent query as a large black diamond
|
508 |
+
fig2.add_trace(
|
509 |
+
go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='markers',
|
510 |
+
marker=dict(size=7, color='black', symbol='diamond'), name=query, hovertext=query,
|
511 |
+
showlegend=False))
|
512 |
+
|
513 |
+
# Add label for the query below the diamond
|
514 |
+
fig2.add_trace(
|
515 |
+
go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='text',
|
516 |
+
text=[query], textposition='bottom center', textfont=dict(color='blue', size=10),
|
517 |
+
hoverinfo='none', showlegend=False))
|
518 |
+
|
519 |
+
# Add circles for the top similar genes (at most 50, see value_gene)
|
520 |
+
fig2.add_trace(go.Scatter3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], mode='markers',
|
521 |
+
marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
|
522 |
+
hovertemplate='<b>%{text}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>',
|
523 |
+
text=words, customdata=sims, name=''))
|
524 |
+
|
525 |
+
fig2.update(layout_coloraxis_showscale=True)
|
526 |
+
fig2.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
527 |
+
fig2.update_annotations(visible=False)
|
528 |
+
|
529 |
+
st.plotly_chart(fig2, use_container_width=True)
|
530 |
+
|
531 |
st.markdown("---")
|
532 |
# print()
|
533 |
# print("Human genes similar to " + str(query))
|
|
|
547 |
|
548 |
# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
|
549 |
value_drug = min(df1.shape[0], 100)
|
550 |
+
# print(value_drug)
|
551 |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
|
552 |
# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
|
553 |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
|
|
627 |
else:
|
628 |
st.warning(
|
629 |
f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
|
630 |
+
# try:
|
631 |
+
# value_drug = min(df_len, 50)
|
632 |
+
# top_words = model.wv.most_similar_cosmul(query, topn=value_drug)
|
633 |
+
# # print(top_words)
|
634 |
+
# words = df13.head(value_drug).index
|
635 |
+
# words = [word.replace(' ', '-') for word in words]
|
636 |
+
# # print(words)
|
637 |
+
# sims = df6.head(value_drug)["SIMILARITY"].tolist()
|
638 |
+
# # print(sims)
|
639 |
+
# X_top = model.wv[words]
|
640 |
+
# except Exception as e:
|
641 |
+
# print("Error:", e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
642 |
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
643 |
#
|
644 |
+
# # Remove the text "Similarity Score" from each element in the sims list
|
645 |
+
# sims_query_top = [float(sim.split()[-1]) for sim in sims]
|
646 |
+
# # print(sims_query_top)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
647 |
#
|
648 |
+
# # Generate a 3D scatter plot of word embeddings using Plotly
|
649 |
+
# fig4 = px.scatter_3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], text=words, color=sims_query_top,
|
650 |
+
# color_continuous_scale="RdYlGn", hover_name=words, hover_data={"color": sims_query_top})
|
651 |
#
|
652 |
+
# # Set the 3D scene background color to #CCFFFF (light cyan)
|
653 |
+
# fig4.update_layout(scene=dict(bgcolor='#CCFFFF'))
|
|
|
|
|
|
|
|
|
|
|
|
|
654 |
#
|
655 |
+
# # Set each axis background to #CCFFFF and the axis color to blue
|
656 |
+
# fig4.update_layout(scene=dict(xaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
657 |
+
# yaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
658 |
+
# zaxis=dict(backgroundcolor='#CCFFFF', color='blue')))
|
659 |
#
|
660 |
+
# fig4.update_traces(hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
|
661 |
+
# fig4.update_layout(
|
662 |
+
# title=dict(text=f"Word embedding map for {query} in {database_name} PubMed corpus", x=0.5, y=0.95,
|
663 |
+
# xanchor='center', yanchor='top', font=dict(color='black')),
|
664 |
+
# scene=dict(xaxis_title="Dimension 1", yaxis_title="Dimension 2", zaxis_title="Dimension 3"))
|
665 |
+
# fig4.update_coloraxes(colorbar_title="Similarity with query")
|
666 |
#
|
667 |
+
# # Represent query as a large black diamond
|
668 |
+
# fig4.add_trace(
|
669 |
+
# go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='markers',
|
670 |
+
# marker=dict(size=7, color='black', symbol='diamond'), name=query, hovertext=query,
|
671 |
+
# showlegend=False))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
672 |
#
|
673 |
+
# # Add label for the query above the diamond
|
674 |
+
# fig4.add_trace(go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='text',
|
675 |
+
# text=[query], textposition='bottom center', textfont=dict(color='blue', size=10),
|
676 |
+
# hoverinfo='none', showlegend=False))
|
677 |
#
|
678 |
+
# # Add circles for the top 50 similar words
|
679 |
+
# fig4.add_trace(go.Scatter3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], mode='markers',
|
680 |
+
# marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
|
681 |
+
# hovertemplate='<b>%{text}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>',
|
682 |
+
# text=words, customdata=sims, name=''))
|
683 |
#
|
684 |
+
# fig4.update(layout_coloraxis_showscale=True)
|
685 |
+
# fig4.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
686 |
+
# fig4.update_annotations(visible=False)
|
687 |
#
|
688 |
+
# st.plotly_chart(fig4, use_container_width=True)
|
|
|
|
|
|
|
689 |
# st.markdown("---")
|
690 |
|
691 |
st.markdown("---")
|
|
|
788 |
else:
|
789 |
st.warning(
|
790 |
f"This selection exceeds the number of similar pythochemicals related to {query} within the {database_name} corpus, please choose a lower number")
|
791 |
+
|
792 |
+
# try:
|
793 |
+
# value_phyto = min(df_len, 50)
|
794 |
+
# top_words = model.wv.most_similar_cosmul(query, topn=value_phyto)
|
795 |
+
# words = df15.head(value_phyto).index
|
796 |
+
# words = [word.replace(' ', '-') for word in words]
|
797 |
+
# # print(words)
|
798 |
+
# sims = df8.head(value_phyto)["SIMILARITY"].tolist()
|
799 |
+
# # print(sims)
|
800 |
+
# X_top = model.wv[words] # print(X_top)
|
801 |
+
# except Exception as e:
|
802 |
+
# print("Error:", e)
|
803 |
+
#
|
804 |
+
# # Remove the text "Similarity Score" from each element in the sims list
|
805 |
+
# sims_query_top = [float(sim.split()[-1]) for sim in sims]
|
806 |
+
# # print(sims_query_top)
|
807 |
+
#
|
808 |
+
# # Generate a 3D scatter plot of word embeddings using Plotly
|
809 |
+
# fig4 = px.scatter_3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], text=words, color=sims_query_top,
|
810 |
+
# color_continuous_scale="RdYlGn", hover_name=words, hover_data={"color": sims_query_top})
|
811 |
+
#
|
812 |
+
# # Set the 3D scene background color to #CCFFFF (light cyan)
|
813 |
+
# fig4.update_layout(scene=dict(bgcolor='#CCFFFF'))
|
814 |
+
#
|
815 |
+
# # Set each axis background to #CCFFFF and the axis color to blue
|
816 |
+
# fig4.update_layout(scene=dict(xaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
817 |
+
# yaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
818 |
+
# zaxis=dict(backgroundcolor='#CCFFFF', color='blue')))
|
819 |
+
#
|
820 |
+
# fig4.update_traces(hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
|
821 |
+
# fig4.update_layout(
|
822 |
+
# title=dict(text=f"Word embedding map for {query} in {database_name} PubMed corpus", x=0.5, y=0.95,
|
823 |
+
# xanchor='center', yanchor='top', font=dict(color='black')),
|
824 |
+
# scene=dict(xaxis_title="Dimension 1", yaxis_title="Dimension 2", zaxis_title="Dimension 3"))
|
825 |
+
# fig4.update_coloraxes(colorbar_title="Similarity with query")
|
826 |
+
#
|
827 |
+
# # Represent query as a large black diamond
|
828 |
+
# fig4.add_trace(
|
829 |
+
# go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='markers',
|
830 |
+
# marker=dict(size=7, color='black', symbol='diamond'), name=query, hovertext=query,
|
831 |
+
# showlegend=False))
|
832 |
+
#
|
833 |
+
# # Add label for the query above the diamond
|
834 |
+
# fig4.add_trace(go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='text',
|
835 |
+
# text=[query], textposition='bottom center', textfont=dict(color='blue', size=10),
|
836 |
+
# hoverinfo='none', showlegend=False))
|
837 |
+
#
|
838 |
+
# # Add circles for the top 50 similar words
|
839 |
+
# fig4.add_trace(go.Scatter3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], mode='markers',
|
840 |
+
# marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
|
841 |
+
# hovertemplate='<b>%{text}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>',
|
842 |
+
# text=words, customdata=sims, name=''))
|
843 |
+
#
|
844 |
+
# fig4.update(layout_coloraxis_showscale=True)
|
845 |
+
# fig4.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
846 |
+
# fig4.update_annotations(visible=False)
|
847 |
+
#
|
848 |
+
# st.plotly_chart(fig4, use_container_width=True)
|
849 |
+
# st.markdown("---")
|
850 |
+
|
851 |
+
|
852 |
st.markdown("---")
|
853 |
|
854 |
+
|
855 |
# print()
|
856 |
# print("Human genes similar to " + str(query))
|
857 |
df1 = table.copy()
|
|
|
956 |
else:
|
957 |
st.warning(
|
958 |
f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
|
|
|
959 |
|
960 |
+
# try:
|
961 |
+
# value_compound = min(df_len, 50)
|
962 |
+
# top_words = model.wv.most_similar_cosmul(query, topn=value_compound)
|
963 |
+
# words = df12.head(value_compound).index
|
964 |
+
# words = [word.replace(' ', '-') for word in words]
|
965 |
+
#
|
966 |
+
# sims = df5.head(value_compound)["SIMILARITY"].tolist()
|
967 |
+
# # print(sims)
|
968 |
+
# X_top = model.wv[words] # print(X_top)
|
969 |
+
# except Exception as e:
|
970 |
+
# print("Error:", e)
|
971 |
+
#
|
972 |
+
# # Remove the text "Similarity Score" from each element in the sims list
|
973 |
+
# sims_query_top = [float(sim.split()[-1]) for sim in sims]
|
974 |
+
# # print(sims_query_top)
|
975 |
+
#
|
976 |
+
# # Generate a 3D scatter plot of word embeddings using Plotly
|
977 |
+
# fig5 = px.scatter_3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], text=words, color=sims_query_top,
|
978 |
+
# color_continuous_scale="RdYlGn", hover_name=words, hover_data={"color": sims_query_top})
|
979 |
+
#
|
980 |
+
# # Set the 3D scene background color to #CCFFFF (light cyan)
|
981 |
+
# fig5.update_layout(scene=dict(bgcolor='#CCFFFF'))
|
982 |
+
#
|
983 |
+
# # Set each axis background to #CCFFFF and the axis color to blue
|
984 |
+
# fig5.update_layout(scene=dict(xaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
985 |
+
# yaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
|
986 |
+
# zaxis=dict(backgroundcolor='#CCFFFF', color='blue')))
|
987 |
+
#
|
988 |
+
# fig5.update_traces(hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
|
989 |
+
# fig5.update_layout(
|
990 |
+
# title=dict(text=f"Word embedding map for {query} in {database_name} PubMed corpus", x=0.5, y=0.95,
|
991 |
+
# xanchor='center', yanchor='top', font=dict(color='black')),
|
992 |
+
# scene=dict(xaxis_title="Dimension 1", yaxis_title="Dimension 2", zaxis_title="Dimension 3"))
|
993 |
+
# fig5.update_coloraxes(colorbar_title="Similarity with query")
|
994 |
+
#
|
995 |
+
# # Represent query as a large black diamond
|
996 |
+
# fig5.add_trace(
|
997 |
+
# go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='markers',
|
998 |
+
# marker=dict(size=7, color='black', symbol='diamond'), name=query, hovertext=query,
|
999 |
+
# showlegend=False))
|
1000 |
+
#
|
1001 |
+
# # Add label for the query above the diamond
|
1002 |
+
# fig5.add_trace(go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='text',
|
1003 |
+
# text=[query], textposition='bottom center', textfont=dict(color='blue', size=10),
|
1004 |
+
# hoverinfo='none', showlegend=False))
|
1005 |
+
#
|
1006 |
+
# # Add circles for the top 50 similar words
|
1007 |
+
# fig5.add_trace(go.Scatter3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], mode='markers',
|
1008 |
+
# marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
|
1009 |
+
# hovertemplate='<b>%{text}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>',
|
1010 |
+
# text=words, customdata=sims, name=''))
|
1011 |
+
#
|
1012 |
+
# fig5.update(layout_coloraxis_showscale=True)
|
1013 |
+
# fig5.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
1014 |
+
# fig5.update_annotations(visible=False)
|
1015 |
+
#
|
1016 |
+
# st.plotly_chart(fig5, use_container_width=True)
|
1017 |
+
# st.markdown("---")
|
1018 |
# import os
|
1019 |
|
1020 |
# from datasets import Dataset
|
|
|
1135 |
5. [Word2Vec: How to Implement Word2Vec in Python](https://www.youtube.com/watch?v=ISPId9Lhc1g&t=6s) - A YouTube video by Data Talks demonstrating how to implement Word2Vec in Python using the Gensim library.
|
1136 |
""")
|
1137 |
|
1138 |
+
|
1139 |
# else:
|
1140 |
# st.error("The password you entered is incorrect.")
|
1141 |
|