Spaces:

jfataphd
/

OncoDigger

Sleeping

File size: 8,387 Bytes

1699569
 
 
 
 
e5a12b8
 
 
62faff0
 
2267bcd
66e83e0
1699569
c6e3011
 
 
ff86fbf
62faff0
c6e3011
 
 
 
 
1699569
62faff0
 
 
 
 
 
 
f21967a
a6d026f
f192d73
62faff0
3559da9
f192d73
afb8bf9
62faff0
3559da9
afb8bf9
a6d026f
f21967a
2bba935
90c2875
 
e48b5b5
90c2875
e48b5b5
90c2875
 
 
e48b5b5
1699569
66e83e0
105ed33
66e83e0
c6e3011
e48b5b5
105ed33
62faff0
2bba935
b2912c4
62faff0
 
 
 
e5a12b8
1699569
e48b5b5
2267bcd
105ed33
62faff0
e48b5b5
 
 
f21967a
 
e48b5b5
f21967a
 
 
 
 
 
 
 
105ed33
f21967a
1699569
 
 
 
 
 
 
e5a12b8
f658f80
e5a12b8
 
8eb1090
e5a12b8
 
50cfb9e
e5a12b8
 
 
 
 
 
 
 
f21967a
66e83e0
e5a12b8
 
66e83e0
 
e5a12b8
62faff0
e5a12b8
ff86fbf
 
 
 
 
 
 
 
 
 
 
8eb1090
 
 
ff86fbf
 
 
e5a12b8
8eb1090
 
f658f80
e5a12b8
1699569
4b2cc15
1699569
 
 
4b2cc15
1699569
 
 
e5a12b8
 
1699569
f658f80
e5a12b8
8eb1090
e5a12b8
f658f80
f21967a
f658f80
e5a12b8
 
 
 
b1a4aa9
f21967a
 
e5a12b8
 
 
 
 
62faff0
e5a12b8
ff86fbf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5a12b8
8eb1090
 
ff86fbf
 
e5a12b8
8eb1090
 
c6e3011
8eb1090
 
62faff0
fb7bdf2
62faff0
fb7bdf2
 
 
 
 
62faff0
fb7bdf2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2267bcd
 
8eb1090
 
2267bcd
fb7bdf2
2267bcd
fb7bdf2
2267bcd
fb7bdf2
8eb1090
66e83e0
ff86fbf
 
 
66e83e0

import streamlit as st
import time
import json
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import squarify
import numpy as np
import re
import urllib.request
import random
import plotly.express as px

# Text for the "About" entry passed to Streamlit's menu_items.
about_text = "FATA4 Science is a Natural Language Processing (NLP) that ...."

# Page-wide settings: tab title and icon, wide layout (the alternative noted
# by the original author is "centered"), default sidebar state, menu text.
page_settings = {
    "page_title": "FATA4 Science",
    "page_icon": ":microscope:",
    "layout": "wide",
    "initial_sidebar_state": "auto",
    "menu_items": {'About': about_text},
}
st.set_page_config(**page_settings)

# Custom CSS injected as raw HTML: one rule set colours the sidebar, the other
# colours the page body and the main app container.
sidebar_css = """
<style>
    [data-testid=stSidebar] {
        background-color: #99CCFF;
    }
</style>
"""

# NOTE(review): the "# color" lines below look like disabled declarations, but
# "#" does not start a CSS comment - confirm whether /* ... */ was intended.
page_css = """
    <style>
    body {
        background-color: #CCFFFF;
        # color: #ffffff;
    }
    .stApp {
        background-color: #CCFFFF;
        # color: #ffffff;
    }
    </style>
    """

# Each snippet is emitted through its own markdown call, in the same order.
for css_snippet in (sidebar_css, page_css):
    st.markdown(css_snippet, unsafe_allow_html=True)

# Sidebar radio button that picks the corpus to search.
corpus_labels = ('Clotting corpus', 'Neuroblastoma corpus')
opt = st.sidebar.radio("Select a PubMed Corpus", options=corpus_labels)

# For each choice: (name passed to Word2Vec.load, abstract count shown in the
# search caption, display name used in headings and download file names).
corpus_settings = {
    "Clotting corpus": ("pubmed_model_clotting", 45493, "Clotting"),
    "Neuroblastoma corpus": ("pubmed_model_neuroblastoma", 29032, "Neuroblastoma"),
}

# As before, the three names are only bound when the choice is a known corpus.
if opt in corpus_settings:
    model_used, num_abstracts, database_name = corpus_settings[opt]

# Page banner and the single-term search box for the selected corpus.
st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")

st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
st.markdown("---")

st.header(f":blue[{database_name} Pubmed corpus.]")
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")

# Normalise the term: lowercase it, drop punctuation, and trim surrounding
# whitespace so an accidental leading/trailing space does not make an
# otherwise valid single term look like several terms.
query = re.sub("[,.?!&*;:]", "", text_input_value.lower()).strip()

# Whitespace that remains means several terms were entered. Stop the run here:
# previously the search went ahead anyway and ended with the unrelated
# "Term occurrence is too low" error.
if any(ch.isspace() for ch in query):
    st.write("Please only enter one term or a term without spaces")
    st.stop()
# query = input ("Enter your keyword(s):")
if query:
    # Progress bar plus a caption describing the corpus being searched.
    bar = st.progress(0)
    time.sleep(.05)
    st.caption(f"Searching {num_abstracts} {database_name} PubMed abstracts covering 1990-2022")

    # The bar is advanced on a fixed timer (10 steps of 0.1 s); it is not tied
    # to the actual loading work below.
    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)

    # Load the saved Word2Vec model for the selected corpus. A load failure is
    # reported as such instead of being presented as a problem with the term.
    try:
        model = Word2Vec.load(model_used)
    except Exception:  # UI boundary: any load failure ends this run cleanly
        st.error(f"Could not load the {database_name} model - please try again later")
        st.stop()

    # Only terms present in the model vocabulary have a vector; anything else
    # cannot be searched.
    if query not in model.wv.key_to_index:
        st.error("Term occurrence is too low - please try another term")
        st.stop()
    st.markdown("---")
    # Up to 10,000 terms most similar to the query (gensim's multiplicative
    # combination similarity). The default 0-based row index is the rank.
    table = pd.DataFrame(model.wv.most_similar_cosmul(query, topn=10000),
                         columns=['Word', 'SIMILARITY'])
    table.index.name = 'Rank'

    # Treemap input: the ten closest words, indexed by the word itself.
    # `table` is left untouched because the gene section reuses it.
    short_table = table.head(10).set_index('Word')

    # Tile weight is the reciprocal of the 1-based rank (1, 1/2, 1/3, ...), so
    # closer words get larger tiles.
    sizes = [1 / rank for rank in range(1, len(short_table) + 1)]
    avg_size = sum(sizes) / len(short_table.index)

    # Only the Plotly treemap is built. The former squarify/matplotlib version
    # was never displayed (its st.pyplot call was commented out) yet kept
    # drawing onto pyplot's shared current figure on every run.
    fig = px.treemap(short_table, path=[short_table.index], values=sizes, color=sizes, color_continuous_scale='greens',
                     color_continuous_midpoint=avg_size)
    fig.update(layout_coloraxis_showscale=False)
    fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF")

    # Two side-by-side columns: words on the left, genes (filled in later) on
    # the right.
    treemap1, treemap2 = st.columns(2)
    with treemap1:
        st.subheader(f"Top 10 Words closely related to {query}")
        st.plotly_chart(fig, use_container_width=True)

        # Offer the 100 closest words as a CSV download.
        csv = table.head(100).to_csv().encode('utf-8')
        st.download_button(label="download top 100 words (csv)", data=csv, file_name=f'{database_name}_words.csv', mime='text/csv')

    # st.write(short_table)
    #

    # Keep only the similar terms whose word appears in the `symbol` column of
    # Human_Genes.csv. Work on an explicit copy so the column edits below do
    # not trigger pandas' SettingWithCopyWarning.
    human_genes = pd.read_csv('Human_Genes.csv')
    df1 = table.loc[table.Word.isin(human_genes.symbol)].copy()
    df1["Word"] = df1["Word"].str.upper()
    df1 = df1.rename(columns={'Word': 'Human Gene'})

    # Ten closest genes. The row index still holds each gene's 0-based rank in
    # the full word table, so shift it to 1-based before inverting: a gene at
    # rank 0 would otherwise get an infinite tile size.
    df10 = df1.head(10)
    sizes = (1 / (df10.index + 1)).tolist()
    df10 = df10.set_index('Human Gene')

    # Only the Plotly treemap is built. The former squarify/matplotlib version
    # was never displayed (its st.pyplot call was commented out) yet kept
    # drawing onto pyplot's shared current figure on every run.
    with treemap2:
        st.subheader(f"Top 10 Genes closely related to {query}")
        if df10.empty:
            # No related term is a known gene symbol; averaging the (empty)
            # sizes below would divide by zero.
            st.write(f"No human genes were found among the words related to {query}")
        else:
            avg_size = sum(sizes) / len(sizes)
            fig = px.treemap(path=[df10.index], values=sizes, color=sizes, color_continuous_scale='greens',
                             color_continuous_midpoint=avg_size)
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", uniformtext_mode="hide", plot_bgcolor="#fff")
            fig.update_traces(root_color='rgba(0,0,0,0)')
            st.plotly_chart(fig, use_container_width=True)

            # Offer the 100 closest genes as a CSV download.
            csv = df1.head(100).to_csv().encode('utf-8')
            st.download_button(label="download top 100 genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
                               mime='text/csv')
    st.markdown("---")
    st.subheader("Cancer-related videos")

    # Channel search pages to scrape. The search term is fixed to "cancer";
    # the user's query is not used for the videos.
    channel_search_urls = (
        "https://www.youtube.com/@NCIgov/search?query=cancer",
        "https://www.youtube.com/@CancerCenter/search?query=cancer",
        "https://www.youtube.com/@NorthwesternMedicine/search?query=cancer",
        "https://www.youtube.com/@TEDEd/search?query=cancer",
        "https://www.youtube.com/@CancerResearchUK/search?query=cancer",
    )

    video_ids = []
    for channel_url in channel_search_urls:
        try:
            # The context manager closes the connection; the timeout keeps one
            # slow channel from hanging the whole page.
            with urllib.request.urlopen(channel_url, timeout=10) as response:
                page_html = response.read().decode(errors="ignore")
        except OSError:
            # URLError, HTTPError and socket timeouts are all OSError
            # subclasses; an unreachable channel is skipped, not fatal.
            continue
        # Video ids are the 11 characters following "watch?v=" in the page.
        video_ids.extend(re.findall(r"watch\?v=(\S{11})", page_html))

    # The same id appears several times per page; drop repeats so one video is
    # not shown twice, then randomise which ones are picked.
    video_ids = list(dict.fromkeys(video_ids))
    random.shuffle(video_ids)

    if video_ids:
        # zip stops at the shorter input, so fewer than three ids simply
        # leaves the remaining columns empty instead of raising IndexError.
        for column, video_id in zip(st.columns(3), video_ids):
            with column:
                st.video("https://www.youtube.com/watch?v=" + video_id)
    else:
        st.write("Videos are currently unavailable")
    st.markdown("---")

    # fig = plt.figure()