Spaces:

jfataphd
/

OncoDigger

Running

App Files Files Community

OncoDigger / app.py

jfataphd

Update app.py

fb7bdf2 over 2 years ago

raw

history blame

7.4 kB

	import streamlit as st
	import time
	import json
	from gensim.models import Word2Vec
	import pandas as pd
	import matplotlib.pyplot as plt
	import squarify
	import numpy as np
	import re
	import urllib.request
	import random

	st.set_page_config(
	page_title="FATA4 Science",
	page_icon=":microscope:",
	layout="wide",
	initial_sidebar_state="auto",
	menu_items={
	'About': "FATA4 Science is a Natural Language Processing (NLP) that ...."
	}
	)

	# Define the HTML and CSS styles
	st.markdown("""
	<style>
	[data-testid=stSidebar] {
	background-color: #99CCFF;
	}
	</style>
	""", unsafe_allow_html=True)
	st.markdown("""
	<style>
	body {
	background-color: #CCFFFF;
	# color: #ffffff;
	}
	.stApp {
	background-color: #CCFFFF;
	# color: #ffffff;
	}
	</style>
	""", unsafe_allow_html=True)

	opt=st.sidebar.radio("Select a PubMed Corpus", options=('Clotting corpus', 'Neuroblastoma corpus'))
	if opt == "Clotting corpus":
	model_used = ("pubmed_model_clotting")
	num_abstracts = 45493
	database_name = "Clotting"
	if opt == "Neuroblastoma corpus":
	model_used = ("pubmed_model_neuroblastoma")
	num_abstracts = 29032
	database_name = "Neuroblastoma"

	st.title(":red[Fast Acting Text Analysis (FATA) 4 Science]")
	st.markdown("---")
	st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")

	st.header(f"{database_name} Pubmed corpus.")
	text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
	query = text_input_value
	query = query.lower()
	query = re.sub("[,.?!&*;:]", "", query)
	matches = [" "]
	if any([x in query for x in matches]):
	st.write("Please only enter one term or a term without spaces")
	# query = input ("Enter your keyword(s):")
	if query:
	bar = st.progress(0)
	time.sleep(.05)
	st.caption(f":LightSkyBlue[searching {num_abstracts} {database_name} PubMed abstracts] covering 1990-2022")

	for i in range(10):
	bar.progress((i + 1) * 10)
	time.sleep(.1)

	try:
	model = Word2Vec.load(model_used) # you can continue training with the loaded model!
	words = list(model.wv.key_to_index)
	X = model.wv[model.wv.key_to_index]
	model2 = model.wv[query]
	df = pd.DataFrame(X)

	except:
	st.error("Term occurrence is too low - please try another term")
	st.stop()

	# def findRelationships(query, df):
	table = model.wv.most_similar_cosmul(query, topn=10000)
	table = (pd.DataFrame(table))
	table.index.name = 'Rank'
	table.columns = ['Word', 'SIMILARITY']
	print()
	print("Similarity to " + str(query))
	pd.set_option('display.max_rows', None)
	print(table.head(50))
	# table.head(10).to_csv("clotting_sim1.csv", index=True)
	# short_table = table.head(50)
	# print(table)
	st.subheader(f"Top 10 Words closely related to {query}")

	# calculate the sizes of the squares in the treemap
	short_table = table.head(10)
	short_table.index += 1
	short_table.index = 1 / short_table.index
	sizes = short_table.index.tolist()

	cmap = plt.cm.Greens(np.linspace(0.05, .5, len(sizes)))
	color = [cmap[i] for i in range(len(sizes))]

	short_table.set_index('Word', inplace=True)
	squarify.plot(sizes=sizes, label=short_table.index.tolist(), color=color, edgecolor="#EBF5FB",
	text_kwargs={'fontsize': 10})
	# # plot the treemap using matplotlib
	plt.axis('off')
	fig = plt.gcf()
	fig.patch.set_facecolor('#CCFFFF')
	# # display the treemap in Streamlit
	st.pyplot(fig)
	plt.clf()

	csv = table.head(100).to_csv().encode('utf-8')
	st.download_button(label="download top 100 words (csv)", data=csv, file_name=f'{database_name}_words.csv', mime='text/csv')

	# st.write(short_table)
	#

	print()
	print("Human genes similar to " + str(query))
	df1 = table
	df2 = pd.read_csv('Human_Genes.csv')
	m = df1.Word.isin(df2.symbol)
	df1 = df1[m]
	df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
	df1["Human Gene"] = df1["Human Gene"].str.upper()
	print(df1.head(50))
	print()
	# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
	# time.sleep(2)
	st.subheader(f"Top 10 Genes closely related to {query}")

	df10 = df1.head(10)
	df10.index = 1 / df10.index
	sizes = df10.index.tolist()

	cmap2 = plt.cm.Blues(np.linspace(0.05, .5, len(sizes)))
	color2 = [cmap2[i] for i in range(len(sizes))]

	df10.set_index('Human Gene', inplace=True)
	squarify.plot(sizes=sizes, label=df10.index.tolist(), color=color2, edgecolor="#EBF5FB",
	text_kwargs={'fontsize': 12})
	#
	# # plot the treemap using matplotlib

	plt.axis('off')
	fig2 = plt.gcf()
	fig2.patch.set_facecolor('#CCFFFF')
	# plt.show()
	#
	# # display the treemap in Streamlit
	st.pyplot(fig2)

	csv = df1.head(100).to_csv().encode('utf-8')
	st.download_button(label="download top 100 genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
	mime='text/csv')

	if query:
	idlist=[]
	search_keyword = {query}
	html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer")
	html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer")
	html3 = urllib.request.urlopen("https://www.youtube.com/@NorthwesternMedicine/search?query=cancer")
	html4 = urllib.request.urlopen("https://www.youtube.com/@TEDEd/search?query=cancer")
	html5 = urllib.request.urlopen("https://www.youtube.com/@CancerResearchUK/search?query=cancer")
	video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
	video_ids2 = re.findall(r"watch\?v=(\S{11})", html2.read().decode())
	video_ids3 = re.findall(r"watch\?v=(\S{11})", html3.read().decode())
	video_ids4 = re.findall(r"watch\?v=(\S{11})", html4.read().decode())
	video_ids5 = re.findall(r"watch\?v=(\S{11})", html5.read().decode())

	for i in video_ids2:
	video_ids.append(i)
	for i in video_ids3:
	video_ids.append(i)
	for i in video_ids4:
	video_ids.append(i)
	for i in video_ids5:
	video_ids.append(i)

	random.shuffle(video_ids)

	c1, c2, c3 = st.columns(3)
	with c1:
	st.video("https://www.youtube.com/watch?v=" + video_ids[0])
	with c2:
	st.video("https://www.youtube.com/watch?v=" + video_ids[1])
	with c3:
	st.video("https://www.youtube.com/watch?v=" + video_ids[2])
	# width = 40
	# side = 30 # (100-width)/2
	# _, container, _ = st.columns([side, width, side])
	# container.video(data=VIDEO_DATA)

	# VIDEO_DATA = "https://www.youtube.com/watch?v=" + video_ids[1]
	# width = 40
	# side = 30 # (100-width)/2
	# _, container, _ = st.columns([side, width, side])
	# container.video(data=VIDEO_DATA)

	# VIDEO_DATA = "https://www.youtube.com/watch?v=" + video_ids[2]
	# width = 40
	# side = 30 # (100-width)/2
	# _, container, _ = st.columns([side, width, side])
	# container.video(data=VIDEO_DATA)