OncoDigger / app.py
jfataphd's picture
Update app.py
6337933
raw
history blame
19.4 kB
import streamlit as st
import time
import json
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import squarify
import numpy as np
import re
import urllib.request
import random
import plotly.express as px
# Configure the Streamlit page: wide layout, microscope favicon, and an
# "About" entry in the hamburger menu. Must be the first Streamlit call.
st.set_page_config(
page_title="FATA4 Science",
page_icon=":microscope:",
layout="wide", #centered
initial_sidebar_state="auto",
menu_items={
'About': "FATA4 Science is a Natural Language Processing (NLP) that ...."
}
)
# Define the HTML and CSS styles
# Inject CSS to color the sidebar light blue. unsafe_allow_html is required
# so the raw <style> markup is rendered rather than escaped.
st.markdown("""
<style>
[data-testid=stSidebar] {
background-color: #99CCFF;
}
</style>
""", unsafe_allow_html=True)
st.markdown("""
<style>
body {
background-color: #CCFFFF;
# color: #ffffff;
# font-size: 1px
}
.stApp {
background-color: #CCFFFF;
# color: #ffffff;
# font-size: 1px
}
</style>
""", unsafe_allow_html=True)
# Corpus registry: sidebar label -> (Word2Vec model file, abstract count,
# display name). Additional corpora (e.g. breast cancer, mammary gland) can
# be re-enabled by adding an entry here once their models are available.
_CORPORA = {
    'Clotting corpus': ("pubmed_model_clotting", 45493, "Clotting"),
    'Neuroblastoma corpus': ("pubmed_model_neuroblastoma", 29032, "Neuroblastoma"),
}
opt = st.sidebar.radio("Select a PubMed Corpus", options=tuple(_CORPORA))
model_used, num_abstracts, database_name = _CORPORA[opt]
st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")
st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
st.markdown("---")
st.header(f":blue[{database_name} Pubmed corpus.]")
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
query = text_input_value
query = query.lower()
query = re.sub("[,.?!&*;: ]", "", query)
matches = [" "]
if any([x in query for x in matches]):
st.write("Please only enter one term or a term without spaces")
# query = input ("Enter your keyword(s):")
if query:
    # Cosmetic progress animation; the Word2Vec load below is the real work.
    bar = st.progress(0)
    time.sleep(.05)
    st.caption(f"Searching {num_abstracts} {database_name} PubMed abstracts covering 1990-2022")
    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)
    # BUG FIX: the original had this try/except commented out, so a term
    # absent from the corpus vocabulary crashed the app with an unhandled
    # KeyError. Restored with a *narrow* except (not a bare one).
    try:
        model = Word2Vec.load(model_used)  # load the pre-trained corpus model
        words = list(model.wv.key_to_index)
        X = model.wv[model.wv.key_to_index]  # full embedding matrix
        model2 = model.wv[query]  # raises KeyError if term not in vocabulary
        df = pd.DataFrame(X)
    except KeyError:
        st.error("Term occurrence is too low - please try another term")
        st.stop()
st.markdown("---")
# def findRelationships(query, df):
table = model.wv.most_similar_cosmul(query, topn=10000)
table = (pd.DataFrame(table))
table.index.name = 'Rank'
table.columns = ['Word', 'SIMILARITY']
# print()
# print("Similarity to " + str(query))
pd.set_option('display.max_rows', None)
table2 = table.copy()
# print(table.head(50))
# table.head(10).to_csv("clotting_sim1.csv", index=True)
# short_table = table.head(50)
# print(table)
# Create the slider with increments of 5 up to 100
st.markdown(
f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
f"<span style='color:red; font-style: italic;'>words</span> contextually "
f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
unsafe_allow_html=True)
value_word = st.slider("Words", 0, 100, step=5)
if query:
    # Word treemap: square area is proportional to 1/rank so the best match
    # dominates; each square deep-links to PubMed and Wikipedia.
    if value_word > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
            f"</span>words similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Wikipaedia links for more word information</span></p></b>",
            unsafe_allow_html=True)
        short_table = table2.head(value_word).round(2)
        short_table.index += 1  # 1-based ranks so the reciprocal is finite
        short_table.index = (1 / short_table.index) * 10
        sizes = short_table.index.tolist()
        short_table.set_index('Word', inplace=True)
        # BUG FIX: the original used head(10) here, so only the first ten
        # squares ever got a hover score regardless of the slider. Use the
        # slider value, as the gene and protein sections already do.
        table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
        df = short_table
        try:
            # `text` column for labels, `href`/`href2` for per-square links.
            df['text'] = short_table.index
            df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                          '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
            df['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in short_table.index]
            df.loc[:, 'database'] = database_name
            fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                             hover_name=(table2.head(value_word)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="</b><br><span "
                                           "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                           "<a href='%{customdata[0]}'>PubMed"
                                           "</a><br><a href='%{customdata[3]}'>Wikipedia"
                                           "</span></a>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
            st.plotly_chart(fig, use_container_width=True)
            csv = table2.head(value_word).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv',
                               mime='text/csv')
        except Exception:
            # BUG FIX: narrowed from a bare `except:` (which also swallowed
            # SystemExit / KeyboardInterrupt); user message unchanged.
            st.warning(
                f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
    st.markdown("---")
if query:
    # ---- Gene section: intersect the similar-word ranking with the HGNC
    # human gene symbol list loaded from Human_Genes.csv. ----
    df1 = table.copy()
    df2 = pd.read_csv('Human_Genes.csv')
    gene_mask = df1.Word.isin(df2.symbol)
    df1 = df1[gene_mask].loc[:, :]
    df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
    # Gene symbols are conventionally upper-case.
    df1["Human Gene"] = df1["Human Gene"].str.upper()
    # Prompt for the gene treemap below.
    st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
                f"<span style='color:red; font-style: italic;'>genes</span> contextually "
                f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
                f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
                unsafe_allow_html=True)
    value_gene = st.slider("Gene", 0, 100, step=5)
if query:
    # Gene treemap: same 1/rank sizing scheme as the word treemap; each square
    # links to PubMed and the NCBI gene database.
    if value_gene > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
            f"</span>genes similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and NCBI links for more gene information</span></p></b>",
            unsafe_allow_html=True)
        df10 = df1.head(value_gene).copy()
        # BUG FIX: shift to 1-based ranks before the reciprocal, as the word
        # section does; a rank-0 row otherwise produced an infinite size.
        df10.index = (1 / (df10.index + 1)) * 10000
        sizes = df10.index.tolist()
        df10.set_index('Human Gene', inplace=True)
        df3 = df1.copy()
        df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str)
        df3.reset_index(inplace=True)
        df3 = df3.rename(columns={'Human Gene': 'symbol2'})
        # Join the ranked symbols with HGNC metadata to recover approved names.
        # NOTE(review): assumes Human_Genes.csv also carries a 'symbol2'
        # column (only 'symbol' is read above) -- confirm against the file.
        subset = df3.head(value_gene).query('symbol2 in @df2.symbol2')
        result = pd.merge(subset, df2, on='symbol2')
        try:
            # `text` for labels, `href`/`href2` for links, `name` for hover.
            df10['text'] = df10.index
            df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']]
            df10['href2'] = ['https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10['text']]
            df10['name'] = list(result['Approved name'])
            df10.loc[:, 'database'] = database_name
            fig = px.treemap(df10, path=[df10['text']], values=sizes,
                             custom_data=['href', 'name', 'database', 'href2', 'text'], hover_name=(df3.head(value_gene)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br><span "
                                           "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                           "<a href='%{customdata[0]}'>PubMed"
                                           "</a><br><a href='%{customdata[3]}'>NCBI"
                                           "</span></a>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
            st.plotly_chart(fig, use_container_width=True)
            st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
            st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
            csv = df1.head(value_gene).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value_gene} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
                               mime='text/csv')
        except Exception:
            # BUG FIX: narrowed from a bare `except:`; also removed the
            # always-true `assert isinstance(df10, object)`.
            st.warning(
                f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus, please choose a lower number")
    st.markdown("---")
if query:
    # ---- Protein section: intersect the similar-word ranking with the
    # protein list loaded from protein.csv. ----
    df1 = table.copy()
    df2 = pd.read_csv('protein.csv')
    protein_mask = df1.Word.isin(df2.protein)
    df1 = df1[protein_mask]
    df1.rename(columns={'Word': 'Protein'}, inplace=True)
    # Number of protein matches available; used to validate the slider choice.
    df_len = len(df1)
    # Prompt for the protein treemap below.
    st.markdown(
        f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
        f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
        f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
        f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
        unsafe_allow_html=True)
    value_protein = st.slider("Protein", 0, 100, step=5)
if query:
    # Protein treemap: 1/rank sizing; each square links to PubMed and
    # Wikipedia. Uses an explicit length check instead of try/except.
    if value_protein > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_protein} "
            f"</span>proteins similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and Wikipedia links for more protein information</span></p></b>",
            unsafe_allow_html=True)
        df11 = df1.head(value_protein).copy()
        # BUG FIX: shift to 1-based ranks before the reciprocal, as the word
        # section does; a rank-0 row otherwise produced an infinite size.
        df11.index = (1 / (df11.index + 1)) * 10000
        sizes = df11.index.tolist()
        df11.set_index('Protein', inplace=True)
        df4 = df1.copy()
        df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_protein)["SIMILARITY"].round(2).astype(str)
        df4.reset_index(inplace=True)
        if value_protein <= df_len:
            # `text` for labels, `href`/`href2` for per-square links.
            df11['text'] = df11.index
            df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
            df11['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df11['text']]
            df11['database'] = database_name
            fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
                             hover_name=(df4.head(value_protein)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}</span></b><br>"
                                           "<a href='%{customdata[0]}'>PubMed"
                                           "</a><br><a href='%{customdata[2]}'>Wikipedia"
                                           "</span></a>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
            st.plotly_chart(fig, use_container_width=True)
            st.caption(
                "Protein designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
            csv = df1.head(value_protein).to_csv().encode('utf-8')
            # BUG FIX: download filename previously said "_genes.csv" for the
            # protein table; corrected to "_proteins.csv".
            st.download_button(label=f"download top {value_protein} proteins (csv)", data=csv,
                               file_name=f'{database_name}_proteins.csv', mime='text/csv')
        else:
            st.warning(f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
    st.markdown("---")
st.subheader("Cancer-related videos")
if query:
idlist=[]
search_keyword = {query}
html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer")
html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer")
html3 = urllib.request.urlopen("https://www.youtube.com/@NorthwesternMedicine/search?query=cancer")
html4 = urllib.request.urlopen("https://www.youtube.com/@TEDEd/search?query=cancer")
html5 = urllib.request.urlopen("https://www.youtube.com/@CancerResearchUK/search?query=cancer")
video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
video_ids2 = re.findall(r"watch\?v=(\S{11})", html2.read().decode())
video_ids3 = re.findall(r"watch\?v=(\S{11})", html3.read().decode())
video_ids4 = re.findall(r"watch\?v=(\S{11})", html4.read().decode())
video_ids5 = re.findall(r"watch\?v=(\S{11})", html5.read().decode())
for i in video_ids2:
video_ids.append(i)
for i in video_ids3:
video_ids.append(i)
for i in video_ids4:
video_ids.append(i)
for i in video_ids5:
video_ids.append(i)
random.shuffle(video_ids)
c1, c2, c3 = st.columns(3)
with c1:
st.video("https://www.youtube.com/watch?v=" + video_ids[0])
with c2:
st.video("https://www.youtube.com/watch?v=" + video_ids[1])
with c3:
st.video("https://www.youtube.com/watch?v=" + video_ids[2])
st.markdown("---")