Spaces:

jfataphd
/

OncoDigger

Running

App Files Files Community

jfataphd committed on Apr 26, 2023

Commit

1edc895

1 Parent(s): 78a2dc3

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -78

app.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import streamlit as st
 import time
 import concurrent.futures
-import json
 # import tensorflow
 from gensim.models import Word2Vec
@@ -18,11 +20,9 @@ import plotly.graph_objs as go
 from streamlit.components.v1 import html
 st.set_page_config(page_title="OncoDigger", page_icon=":microscope:", layout="wide",  # centered
-                   initial_sidebar_state="auto",
-                   menu_items={'About': "OncoDigger is a Natural Language Processing (NLP) that harnesses Word2Vec to mine"
-                         " insight from pubmed abstracts. Created by Jimmie E. Fata, PhD, [email protected]"})
 analytics_code = '''
 <head>
@@ -40,7 +40,6 @@ analytics_code = '''
 html(analytics_code, height=0)
 # Define the HTML and CSS styles
 st.markdown("""
 <style>
@@ -78,8 +77,7 @@ def custom_subheader(text, identifier, font_size):
 custom_subheader("To begin, simply select a cancer corpus from the left sidebar and enter a keyword "
                  "you wish to explore within the corpus. OncoDigger will determine the top words, "
                  "genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
-                 "to your input, both directly and indirectly. Dive in and enjoy the exploration!",
-                 "unique-id", 18)
 st.markdown("---")
@@ -98,10 +96,9 @@ st.markdown("---")
 #
 # # If the password is correct, show the app content
 # if authenticate(password):
-opt = st.sidebar.radio("Select a PubMed Corpus", options=('Breast Cancer corpus', 'Lung Cancer corpus',
-                                                          'Skin Cancer corpus', 'Colorectal Cancer corpus',
-                                                          'Prostate Cancer corpus', 'Lymphoma Cancer corpus', 'Urinary Cancer corpus',
-                                                         'Kidney Cancer corpus'))
 # if opt == "Clotting corpus":
 #     model_used = ("pubmed_model_clotting")
 #     num_abstracts = 45493
@@ -141,8 +138,7 @@ if opt == "Urinary Cancer corpus":
 if opt == "Kidney Cancer corpus":
     model_used = ("kidney_cancer_pubmed_model")
     num_abstracts = 39016
-    database_name = "Kidney_cancer"
 st.header(f":blue[{database_name} Pubmed corpus.]")
 text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
@@ -173,11 +169,12 @@ if query:
         model2 = model.wv[query]
         # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
         df = pd.DataFrame(X)
-        if 'melanin' in model.wv.key_to_index:
-            print("The term 'melanin' is present in the model.")
-        else:
-            print("The term 'melanin' is not present in the model.")
         def get_compound_ids(compound_names):
             with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -199,9 +196,9 @@ if query:
             return None
-    # except:
-    #     st.error("Term occurrence is too low - please try another term")
-    #     st.stop()
         st.markdown("---")
         try:
@@ -213,10 +210,9 @@ if query:
             pd.set_option('display.max_rows', None)
             table2 = table.copy()
-            st.markdown(
-                f"<h2 style='text-align: center; font-family: Arial; font-size: 20px; font-weight: bold;'>"
-                f"Top <span style='color:red; font-style: italic;'>10000</span> words in an interactive embedding map most similar to <span style='color:red; font-style: italic;'>{query}</span> in <span style='color:red; font-style: italic;'>{database_name}</span> "
-                f"PubMed corpus: Zoom in to the black diamond to find <span style='color:red; font-style: italic;'>{query}</span></h2>",
                 unsafe_allow_html=True)
             # Set the max number of words to display
@@ -224,12 +220,24 @@ if query:
             try:
                 value_word = min(50, len(table2))
-                # Get the top 50 similar words to the query
-                top_words = model.wv.most_similar_cosmul(query, topn=10000)
                 words = [word for word, sim in top_words]
                 words = [word.replace(' ', '-') for word in words]
                 sims = [sim for word, sim in top_words]
-                X_top = model.wv[words]
                 # Compute similarities between query and top 100 words
                 sims_query_top = sims  # print(sims_query_top)
@@ -237,7 +245,8 @@ if query:
                 print("Error:", e)
             # Generate a 2D scatter plot of word embeddings using Plotly
-            fig = px.scatter(x=X_top[:, 0], y=X_top[:, 1], color=sims_query_top, color_continuous_scale="RdYlGn", )
             # Change background color to black
             fig.update_layout(plot_bgcolor='#CCFFFF')
@@ -248,19 +257,19 @@ if query:
             # fig.update_traces(hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
             # fig.update_layout(title=dict(
-                # text=f"Top 10000 words in an interactive embedding map for {query} in {database_name} PubMed corpus"
-                #      f": Zoom in to the black diamond to find {query}", x=0.5, y=1, xanchor='center', yanchor='top',
-                # font=dict(color='black')))
             fig.update_coloraxes(colorbar_title=f"Similarity with {query}")
             # Represent query as a large red diamond
             fig.add_trace(go.Scatter(x=[model.wv[query][0]], y=[model.wv[query][1]], mode='markers',
-                                     marker=dict(size=12, color='black', symbol='diamond'), name=query, hovertext=query,
                                      showlegend=False))
             # Add label for the query above the diamond
             fig.add_trace(go.Scatter(x=[model.wv[query][0]], y=[model.wv[query][1]], mode='text', text=[query],
-                                     textposition='top right', textfont=dict(color='blue', size=10), hoverinfo='none',
                                      showlegend=False))
             # Add circles for the top 50 similar words
@@ -274,14 +283,12 @@ if query:
             st.plotly_chart(fig, use_container_width=True)
             st.markdown(
-            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
-            f"</span>words contextually and semantically similar to "
-            f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
-            f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>",
-            unsafe_allow_html=True)
             short_table = table2.head(value_word).round(2)
             short_table.index += 1
@@ -294,26 +301,28 @@ if query:
             df = short_table
             df['text'] = short_table.index
             df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
-                      '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
             df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
             df.loc[:, 'database'] = database_name
-            fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
-                            hover_name=(table2.head(value_word)['SIMILARITY']))
             fig.update(layout_coloraxis_showscale=False)
             fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
             fig.update_annotations(visible=False)
             fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
-                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="<br><span "
-                                                                                                         "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
-                                                                                                         "<a href='%{customdata[0]}'>PubMed"
-                                                                                                         "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
-                                                                                                         "</span></a>")
             fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
             # st.pyplot(fig2)
@@ -325,7 +334,7 @@ if query:
             csv = table2.head(value_word).to_csv().encode('utf-8')
             st.download_button(label=f"download top {value_word} words (csv)", data=csv,
-                           file_name=f'{database_name}_words.csv', mime='text/csv')
         except:
             st.warning(
@@ -334,8 +343,6 @@ if query:
         st.warning(
             "This word is not found in the corpus, it could be because it is not spelled correctly or could be that it does not have enough representation within the corpus, please try again")
     # try:
     #     value_word = min(50, len(table2))
     #     # Get the top 50 similar words to the query
@@ -472,7 +479,8 @@ if query:
                 "Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
             st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
             st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
-            st.caption("In some cases genes may represent abbreviations of words and not genes, use pubmed link to confirm output is a gene")
             csv = df1.head(value_gene).to_csv().encode('utf-8')
             st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
@@ -502,49 +510,44 @@ if query:
         except Exception as e:
             print("Error:", e)
         # Remove the text "Similarity Score" from each element in the sims list
         sims_query_top = [float(sim.split()[-1]) for sim in sims]
         # print(sims_query_top)
         # Generate a 3D scatter plot of word embeddings using Plotly
         fig2 = px.scatter_3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], text=words, color=sims_query_top,
-                                 color_continuous_scale="RdYlGn", hover_name=words,
-                                 hover_data={"color": sims_query_top})
         # Change background color to black
         fig2.update_layout(scene=dict(bgcolor='#CCFFFF'))
         # Change color of text to white
         fig2.update_layout(scene=dict(xaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
-                                          yaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
-                                          zaxis=dict(backgroundcolor='#CCFFFF', color='blue')))
-        fig2.update_traces(
-                hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
         fig2.update_layout(
-                title=dict(text=f"", x=0.5, y=0.95,
-                           xanchor='center', yanchor='top', font=dict(color='black')),
-                scene=dict(xaxis_title="Dimension 1", yaxis_title="Dimension 2", zaxis_title="Dimension 3"))
         fig2.update_coloraxes(colorbar_title=f"Similarity with {query}")
         # Represent query as a large red diamond
         fig2.add_trace(
-                go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='markers',
-                             marker=dict(size=7, color='black', symbol='diamond'), name=query, hovertext=query,
-                             showlegend=False))
         # Add label for the query above the diamond
-        fig2.add_trace(
-                go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='text',
-                             text=[query], textposition='bottom center', textfont=dict(color='blue', size=10),
-                             hoverinfo='none', showlegend=False))
         # Add circles for the top 50 similar words
         fig2.add_trace(go.Scatter3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], mode='markers',
-                                        marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
-                                        hovertemplate='<b>%{text}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>',
-                                        text=words, customdata=sims, name=''))
         fig2.update(layout_coloraxis_showscale=True)
         fig2.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
@@ -872,10 +875,8 @@ if query:
         # st.plotly_chart(fig4, use_container_width=True)
         # st.markdown("---")
         st.markdown("---")
         # print()
         # print("Human genes similar to " + str(query))
         df1 = table.copy()
@@ -1159,7 +1160,6 @@ if query:
             6. [Cosine Similarity Calculator](https://www.omnicalculator.com/math/cosine-similarity) - A calculator for computing cosine similarity, a common metric used in measuring similarity between vectors.
             """)
 # else:
 #     st.error("The password you entered is incorrect.")

 import streamlit as st
 import time
 import concurrent.futures
+# import json
+from sklearn.manifold import TSNE
+# import umap
 # import tensorflow
 from gensim.models import Word2Vec
 from streamlit.components.v1 import html
 st.set_page_config(page_title="OncoDigger", page_icon=":microscope:", layout="wide",  # centered
+                   initial_sidebar_state="auto", menu_items={
+        'About': "OncoDigger is a Natural Language Processing (NLP) that harnesses Word2Vec to mine"
+                 " insight from pubmed abstracts. Created by Jimmie E. Fata, PhD, [email protected]"})
 analytics_code = '''
 <head>
 html(analytics_code, height=0)
 # Define the HTML and CSS styles
 st.markdown("""
 <style>
 custom_subheader("To begin, simply select a cancer corpus from the left sidebar and enter a keyword "
                  "you wish to explore within the corpus. OncoDigger will determine the top words, "
                  "genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
+                 "to your input, both directly and indirectly. Dive in and enjoy the exploration!", "unique-id", 18)
 st.markdown("---")
 #
 # # If the password is correct, show the app content
 # if authenticate(password):
+opt = st.sidebar.radio("Select a PubMed Corpus", options=(
+'Breast Cancer corpus', 'Lung Cancer corpus', 'Skin Cancer corpus', 'Colorectal Cancer corpus',
+'Prostate Cancer corpus', 'Lymphoma Cancer corpus', 'Urinary Cancer corpus', 'Kidney Cancer corpus'))
 # if opt == "Clotting corpus":
 #     model_used = ("pubmed_model_clotting")
 #     num_abstracts = 45493
 if opt == "Kidney Cancer corpus":
     model_used = ("kidney_cancer_pubmed_model")
     num_abstracts = 39016
+    database_name = "Kidney_cancer"
 st.header(f":blue[{database_name} Pubmed corpus.]")
 text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
         model2 = model.wv[query]
         # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
         df = pd.DataFrame(X)
+        #
+        # if 'melanin' in model.wv.key_to_index:
+        #     print("The term 'melanin' is present in the model.")
+        # else:
+        #     print("The term 'melanin' is not present in the model.")
         def get_compound_ids(compound_names):
             with concurrent.futures.ThreadPoolExecutor() as executor:
             return None
+        # except:
+        #     st.error("Term occurrence is too low - please try another term")
+        #     st.stop()
         st.markdown("---")
         try:
             pd.set_option('display.max_rows', None)
             table2 = table.copy()
+            st.markdown(f"<h2 style='text-align: center; font-family: Arial; font-size: 20px; font-weight: bold;'>"
+                        f"Top <span style='color:red; font-style: italic;'>500</span> words in a dimension-reduced embedding map showing similarity to <span style='color:red; font-style: italic;'>{query}</span> in <span style='color:red; font-style: italic;'>{database_name}</span> "
+                        f"corpus</span></h2>",
                 unsafe_allow_html=True)
             # Set the max number of words to display
             try:
                 value_word = min(50, len(table2))
+                # Get the top 500 similar words to the query
+                top_words = model.wv.most_similar_cosmul(query, topn=500)
                 words = [word for word, sim in top_words]
                 words = [word.replace(' ', '-') for word in words]
                 sims = [sim for word, sim in top_words]
+                X = model.wv[words]
+                # Add the query to the list of words and the embeddings array
+                words_with_query = [query] + words
+                X_with_query = np.vstack((model.wv[[query]], X))
+                # Perform t-SNE
+                tsne = TSNE(n_components=2, random_state=42)
+                X_tsne = tsne.fit_transform(X_with_query)
+                # Extract the t-SNE-transformed coordinates of the query and the top words
+                query_tsne = X_tsne[0]
+                X_top = X_tsne[1:]
                 # Compute similarities between query and top 100 words
                 sims_query_top = sims  # print(sims_query_top)
                 print("Error:", e)
             # Generate a 2D scatter plot of word embeddings using Plotly
+            fig = px.scatter(x=X_top[:, 0], y=X_top[:, 1], color=sims, color_continuous_scale="RdYlGn")
             # Set the plot background color to light cyan (#CCFFFF)
             fig.update_layout(plot_bgcolor='#CCFFFF')
             # fig.update_traces(hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
             # fig.update_layout(title=dict(
+            # text=f"Top 10000 words in an interactive embedding map for {query} in {database_name} PubMed corpus"
+            #      f": Zoom in to the black diamond to find {query}", x=0.5, y=1, xanchor='center', yanchor='top',
+            # font=dict(color='black')))
             fig.update_coloraxes(colorbar_title=f"Similarity with {query}")
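             # Represent query as a black diamond marker
             # NOTE(review): the marker is placed at raw embedding components 0 and 1, while the scatter
             # points use t-SNE coordinates; query_tsne looks like the intended position - confirm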
             fig.add_trace(go.Scatter(x=[model.wv[query][0]], y=[model.wv[query][1]], mode='markers',
+                                     marker=dict(size=7, color='black', symbol='diamond'), name=query, hovertext=query,
                                      showlegend=False))
             # Add label for the query above the diamond
             fig.add_trace(go.Scatter(x=[model.wv[query][0]], y=[model.wv[query][1]], mode='text', text=[query],
+                                     textposition='top right', textfont=dict(color='blue', size=12), hoverinfo='none',
                                      showlegend=False))
             # Add circles for the top 50 similar words
             st.plotly_chart(fig, use_container_width=True)
             st.markdown(
+                f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
+                f"</span>words contextually and semantically similar to "
+                f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+                f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>",
+                unsafe_allow_html=True)
             short_table = table2.head(value_word).round(2)
             short_table.index += 1
             df = short_table
             df['text'] = short_table.index
             df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                          '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in
+                          short_table.index]
             df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
             df.loc[:, 'database'] = database_name
+            fig = px.treemap(df, path=[short_table.index], values=sizes,
+                             custom_data=['href', 'text', 'database', 'href2'],
+                             hover_name=(table2.head(value_word)['SIMILARITY']))
             fig.update(layout_coloraxis_showscale=False)
             fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
             fig.update_annotations(visible=False)
             fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<br><span "
+                                           "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
+                                           "</span></a>")
             fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
             # st.pyplot(fig2)
             csv = table2.head(value_word).to_csv().encode('utf-8')
             st.download_button(label=f"download top {value_word} words (csv)", data=csv,
+                               file_name=f'{database_name}_words.csv', mime='text/csv')
         except:
             st.warning(
         st.warning(
             "This word is not found in the corpus, it could be because it is not spelled correctly or could be that it does not have enough representation within the corpus, please try again")
     # try:
     #     value_word = min(50, len(table2))
     #     # Get the top 50 similar words to the query
                 "Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
             st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
             st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+            st.caption(
+                "In some cases genes may represent abbreviations of words and not genes, use pubmed link to confirm output is a gene")
             csv = df1.head(value_gene).to_csv().encode('utf-8')
             st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
         except Exception as e:
             print("Error:", e)
         # Remove the text "Similarity Score" from each element in the sims list
         sims_query_top = [float(sim.split()[-1]) for sim in sims]
         # print(sims_query_top)
         # Generate a 3D scatter plot of word embeddings using Plotly
         fig2 = px.scatter_3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], text=words, color=sims_query_top,
+                             color_continuous_scale="RdYlGn", hover_name=words, hover_data={"color": sims_query_top})
         # Change background color to black
         fig2.update_layout(scene=dict(bgcolor='#CCFFFF'))
         # Change color of text to white
         fig2.update_layout(scene=dict(xaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
+                                      yaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
+                                      zaxis=dict(backgroundcolor='#CCFFFF', color='blue')))
+        fig2.update_traces(hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
         fig2.update_layout(
+            title=dict(text=f"", x=0.5, y=0.95, xanchor='center', yanchor='top', font=dict(color='black')),
+            scene=dict(xaxis_title="Dimension 1", yaxis_title="Dimension 2", zaxis_title="Dimension 3"))
         fig2.update_coloraxes(colorbar_title=f"Similarity with {query}")
         # Represent query as a large red diamond
         fig2.add_trace(
+            go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='markers',
+                         marker=dict(size=7, color='black', symbol='diamond'), name=query, hovertext=query,
+                         showlegend=False))
         # Add label for the query above the diamond
+        fig2.add_trace(go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='text',
+                                    text=[query], textposition='bottom center', textfont=dict(color='blue', size=10),
+                                    hoverinfo='none', showlegend=False))
         # Add circles for the top 50 similar words
         fig2.add_trace(go.Scatter3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], mode='markers',
+                                    marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
+                                    hovertemplate='<b>%{text}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>',
+                                    text=words, customdata=sims, name=''))
         fig2.update(layout_coloraxis_showscale=True)
         fig2.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
         # st.plotly_chart(fig4, use_container_width=True)
         # st.markdown("---")
         st.markdown("---")
         # print()
         # print("Human genes similar to " + str(query))
         df1 = table.copy()
             6. [Cosine Similarity Calculator](https://www.omnicalculator.com/math/cosine-similarity) - A calculator for computing cosine similarity, a common metric used in measuring similarity between vectors.
             """)
 # else:
 #     st.error("The password you entered is incorrect.")