Spaces:

jfataphd
/

OncoDigger

Running

App Files Files Community

jfataphd committed on Mar 2, 2023

Commit

e32c352

1 Parent(s): f67304b

Update app.py

Browse files

Files changed (1) hide show

app.py +201 -100

app.py CHANGED Viewed

@@ -85,16 +85,16 @@ if query:
         bar.progress((i + 1) * 10)
         time.sleep(.1)
-    try:
-        model = Word2Vec.load(model_used)  # you can continue training with the loaded model!
-        words = list(model.wv.key_to_index)
-        X = model.wv[model.wv.key_to_index]
-        model2 = model.wv[query]
-        df = pd.DataFrame(X)
-    except:
-        st.error("Term occurrence is too low - please try another term")
-        st.stop()
     st.markdown("---")
     # def findRelationships(query, df):
@@ -133,62 +133,61 @@ if query:
             unsafe_allow_html=True)
-    # calculate the sizes of the squares in the treemap
-    short_table = table2.head(value_word).round(2)
-    short_table.index += 1
-    short_table.index = (1 / short_table.index)*10
-    sizes = short_table.index.tolist()
-    short_table.set_index('Word', inplace=True)
-    # label = short_table.index.tolist()
-    print(short_table.index)
-    table2["SIMILARITY"] = 'Similarity Score ' + table2.head(10)["SIMILARITY"].round(2).astype(str)
-    rank_num = list(short_table.index.tolist())
-    # avg_size = sum(sizes) / len(short_table.index)
-    df = short_table
-    try:
-        # Define the `text` column for labels and `href` column for links
-        df['text'] = short_table.index
-        df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
                   '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
-        df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
-        df['database'] = database_name
-    # print(sizes)
-    # '{0} in {1}'.format(unicode(self.author, 'utf-8'), unicode(self.publication, 'utf-8'))
-        # Create the treemap using `px.treemap`
-        fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                          hover_name=(table2.head(value_word)['SIMILARITY']))
-        fig.update(layout_coloraxis_showscale=False)
-        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
-        fig.update_annotations(visible=False)
-        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                           hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                           texttemplate="</b><br><span "
                                        "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                        "<a href='%{customdata[0]}'>PubMed"
                                        "</a><br><a href='%{customdata[3]}'>Wikipedia"
                                        "</span></a>")
-        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
-        # st.pyplot(fig2)
-        st.plotly_chart(fig, use_container_width=True)
-        # st.caption(
-        #     "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
-        # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
-        csv = table2.head(value_word).to_csv().encode('utf-8')
-        st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv',
                            mime='text/csv')
-    except:
-        st.warning(
             f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus")
     st.markdown("---")
@@ -204,7 +203,7 @@ if query:
     df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
     df1["Human Gene"] = df1["Human Gene"].str.upper()
     # print(df1.head(50))
-    print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100
@@ -214,82 +213,184 @@ if query:
                 f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
                 f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
                     unsafe_allow_html=True)
-    value = st.slider("Gene", 0, 100, step=5)
-    if value > 0:
         # st.subheader(f"Top {value} genes closely related to {query}: "
         #              f"Click on the Pubmed and NCBI links for more gene information")
         st.markdown(
-            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value} "
             f"</span>genes similar to "
             f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and NCBI links for more gene information</span></p></b>",
             unsafe_allow_html=True)
-    df10 = df1.head(value)
-    df10.index = (1 / df10.index)*10000
-    sizes = df10.index.tolist()
-    df10.set_index('Human Gene', inplace=True)
-    df3 = df1.copy()
-    df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value)["SIMILARITY"].round(2).astype(str)
-    df3.reset_index(inplace=True)
-    df3 = df3.rename(columns={'Human Gene': 'symbol2'})
-    # Use df.query to get a subset of df1 based on ids in df2
-    subset = df3.head(value).query('symbol2 in @df2.symbol2')
-    # Use merge to join the two DataFrames on id
-    result = pd.merge(subset, df2, on='symbol2')
-    # Show the result
-    # print(result)
-    # label = df10.index.tolist()
-    df2 = df10
-    try:
-        # Define the `text` column for labels and `href` column for links
-        df2['text'] = df10.index
-        df2['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
-                  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10.index]
-        df2['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10.index]
-        df2['name'] = [c for c in result['Approved name']]
-        df2['database'] = database_name
-        # print(df['name'])
-        # Create the treemap using `px.treemap`
-        fig = px.treemap(df2, path=[df10.index], values=sizes,
-                     custom_data=['href', 'name', 'database', 'href2', 'text'], hover_name=(df3.head(value)['SIMILARITY']))
-        fig.update(layout_coloraxis_showscale=False)
-        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
-        fig.update_annotations(visible=False)
-        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                       hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                       texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br><span "
                                    "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                    "<a href='%{customdata[0]}'>PubMed"
                                    "</a><br><a href='%{customdata[3]}'>NCBI"
                                    "</span></a>")
-        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
-        # # display the treemap in Streamlit
-        # with treemap2:
-        # st.pyplot(fig2)
-        st.plotly_chart(fig, use_container_width=True)
-        st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
-        st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
-        csv = df1.head(value).to_csv().encode('utf-8')
-        st.download_button(label=f"download top {value} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
                        mime='text/csv')
-    except:
-        st.warning(
             f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")
     st.markdown("---")
     st.subheader("Cancer-related videos")
     if query:
         idlist=[]

         bar.progress((i + 1) * 10)
         time.sleep(.1)
+    # try:
+    model = Word2Vec.load(model_used)  # you can continue training with the loaded model!
+    words = list(model.wv.key_to_index)
+    X = model.wv[model.wv.key_to_index]
+    model2 = model.wv[query]
+    df = pd.DataFrame(X)
+    # except:
+    #     st.error("Term occurrence is too low - please try another term")
+    #     st.stop()
     st.markdown("---")
     # def findRelationships(query, df):
             unsafe_allow_html=True)
+        # calculate the sizes of the squares in the treemap
+        short_table = table2.head(value_word).round(2)
+        short_table.index += 1
+        short_table.index = (1 / short_table.index)*10
+        sizes = short_table.index.tolist()
+        short_table.set_index('Word', inplace=True)
+        # label = short_table.index.tolist()
+        # print(short_table.index)
+        table2["SIMILARITY"] = 'Similarity Score ' + table2.head(10)["SIMILARITY"].round(2).astype(str)
+        rank_num = list(short_table.index.tolist())
+        # avg_size = sum(sizes) / len(short_table.index)
+        df = short_table
+        try:
+            # Define the `text` column for labels and `href` column for links
+            df['text'] = short_table.index
+            df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
                   '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
+            df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
+            df['database'] = database_name
+            # print(sizes)
+            # '{0} in {1}'.format(unicode(self.author, 'utf-8'), unicode(self.publication, 'utf-8'))
+            # Create the treemap using `px.treemap`
+            fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                          hover_name=(table2.head(value_word)['SIMILARITY']))
+            fig.update(layout_coloraxis_showscale=False)
+            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+            fig.update_annotations(visible=False)
+            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                           hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                           texttemplate="</b><br><span "
                                        "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                        "<a href='%{customdata[0]}'>PubMed"
                                        "</a><br><a href='%{customdata[3]}'>Wikipedia"
                                        "</span></a>")
+            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
+            # st.pyplot(fig2)
+            st.plotly_chart(fig, use_container_width=True)
+            # st.caption(
+            #     "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+            # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
+            csv = table2.head(value_word).to_csv().encode('utf-8')
+            st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv',
                            mime='text/csv')
+        except:
+            st.warning(
             f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus")
     st.markdown("---")
     df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
     df1["Human Gene"] = df1["Human Gene"].str.upper()
     # print(df1.head(50))
+    # print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100
                 f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
                 f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
                     unsafe_allow_html=True)
+    value_gene = st.slider("Gene", 0, 100, step=5)
+    if value_gene > 0:
         # st.subheader(f"Top {value} genes closely related to {query}: "
         #              f"Click on the Pubmed and NCBI links for more gene information")
         st.markdown(
+            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
             f"</span>genes similar to "
             f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and NCBI links for more gene information</span></p></b>",
             unsafe_allow_html=True)
+        df10 = df1.head(value_gene)
+        df10.index = (1 / df10.index)*10000
+        sizes = df10.index.tolist()
+        df10.set_index('Human Gene', inplace=True)
+        df3 = df1.copy()
+        df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str)
+        df3.reset_index(inplace=True)
+        df3 = df3.rename(columns={'Human Gene': 'symbol2'})
+        # Use df.query to get a subset of df1 based on ids in df2
+        subset = df3.head(value_gene).query('symbol2 in @df2.symbol2')
+        # Use merge to join the two DataFrames on id
+        result = pd.merge(subset, df2, on='symbol2')
+        # Show the result
+        # print(result)
+        # label = df10.index.tolist()
+        # df2 = df10
+        # print(df2)
+        try:
+            # Define the `text` column for labels and `href` column for links
+            df10['text'] = df10.index
+            df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']]
+            df10['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10['text']]
+            df10['name'] = [c for c in result['Approved name']]
+            df10['database'] = database_name
+            # print(df['name'])
+            # Create the treemap using `px.treemap`
+            fig = px.treemap(df10, path=[df10['text']], values=sizes,
+                     custom_data=['href', 'name', 'database', 'href2', 'text'], hover_name=(df3.head(value_gene)['SIMILARITY']))
+            fig.update(layout_coloraxis_showscale=False)
+            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+            fig.update_annotations(visible=False)
+            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                       hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                       texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br><span "
                                    "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                    "<a href='%{customdata[0]}'>PubMed"
                                    "</a><br><a href='%{customdata[3]}'>NCBI"
                                    "</span></a>")
+            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
+            # # display the treemap in Streamlit
+            # with treemap2:
+            # st.pyplot(fig2)
+            st.plotly_chart(fig, use_container_width=True)
+            st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+            st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
+            csv = df1.head(value_gene).to_csv().encode('utf-8')
+            st.download_button(label=f"download top {value_gene} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
                        mime='text/csv')
+        except:
+            st.warning(
             f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")
     st.markdown("---")
+    # st.write(short_table)
+    #
+    # print()
+    # print("Human genes similar to " + str(query))
+    df1 = table
+    df2 = pd.read_csv('protein.csv')
+    m = df1.Word.isin(df2.protein)
+    df1 = df1[m]
+    df1.rename(columns={'Word': 'Protein'}, inplace=True)
+    # print(df1)
+    df_len = len(df1)
+    # df1["Protein"] = df1["Protein"].str.upper()
+    # print(df1.head(50))
+    # print()
+    # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+    # time.sleep(2)
+    # Create the slider with increments of 5 up to 100
+    st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
+        f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
+        f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        unsafe_allow_html=True)
+    value_protein = st.slider("Protein", 0, 100, step=5)
+    # print(value_protein)
+    if value_protein > 0:
+        # st.subheader(f"Top {value} genes closely related to {query}: "
+        #              f"Click on the Pubmed and NCBI links for more gene information")
+        st.markdown(
+            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_protein} "
+            f"</span>proteins similar to "
+            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and Wikipedia links for more protein information</span></p></b>",
+            unsafe_allow_html=True)
+        df11 = df1.head(value_protein)
+        print(df11)
+        df11.index = (1 / df11.index) * 10000
+        sizes = df11.index.tolist()
+        df11.set_index('Protein', inplace=True)
+        df4 = df1.copy()
+        # print(df4.head(10))
+        df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_protein)["SIMILARITY"].round(2).astype(str)
+        df4.reset_index(inplace=True)
+        # df4 = df4.rename(columns={'Protein': 'symbol2'})
+        # print(df4)
+        # # Use df.query to get a subset of df1 based on ids in df2
+        # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+        # # Use merge to join the two DataFrames on id
+        # result = pd.merge(subset, df2b, on='symbol2')
+        # print(result)
+        if value_protein <= df_len:
+            # Define the `text` column for labels and `href` column for links
+            df11['text'] = df11.index
+            df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                       '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
+            df11['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df11['text']]
+            df11['database'] = database_name
+            # df11['name'] = [c for c in result['Approved name']]
+            # Create the treemap using `px.treemap`
+            fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                         hover_name=(df4.head(value_protein)['SIMILARITY']))
+            fig.update(layout_coloraxis_showscale=False)
+            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+            fig.update_annotations(visible=False)
+            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}</span></b><br>"
+                                       "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><a href='%{customdata[2]}'>Wikipedia"
+                                       "</span></a>")
+            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
+            # # display the treemap in Streamlit
+            # with treemap2:
+            # st.pyplot(fig2)
+            st.plotly_chart(fig, use_container_width=True)
+            st.caption(
+            "Protein designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+            csv = df1.head(value_protein).to_csv().encode('utf-8')
+            st.download_button(label=f"download top {value_protein} proteins (csv)", data=csv, file_name=f'{database_name}_genes.csv',
+                           mime='text/csv')
+        else:
+            st.warning(f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus")
+    st.markdown("---")
     st.subheader("Cancer-related videos")
     if query:
         idlist=[]