# regular imports
import os
import sys
import collections
import pandas as pd
import streamlit as st
import json
import gc
import requests
from PIL import Image
from io import BytesIO
from io import StringIO
from datasets import load_dataset
st.set_page_config(
page_title="Ligand Discovery 3: Protein-set Enrichment Analysis",
page_icon=":home:",
layout="wide", # "centered",
initial_sidebar_state="expanded"
)
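# Note: the CSS selector below targets one of Streamlit's auto-generated
# internal class names, which can change between Streamlit versions.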
st.markdown("""
<style>
.css-13sdm1b.e16nr0p33 {
margin-top: -75px;
}
</style>
""", unsafe_allow_html=True)
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
#header {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
proteins_set = None
ROOT = os.path.abspath(os.path.dirname(__file__))
# TMP = os.path.join(ROOT, "tmp")
# if not os.path.exists(TMP):
# os.mkdir(TMP)
MIN_SET_SIZE = 1
PROFILE_TYPE = "Fragment"
OVERVIEW_PVALUE_CUTOFF = 0.05
# relative imports
# sys.path.append(os.path.join(ROOT, "../src/"))
# from util import listdir_util
def listdir_util(path):
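    """Yield the entries of path, skipping any whose name starts with an underscore."""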
for d in os.listdir(path):
if d.startswith("_"):
continue
else:
yield d
# import metadata
from proteome_meta import task_suf
from proteome_meta import annotation_type_dict
from proteome_meta import annotation_dict
from proteome_meta import universe_dict
# path to results and original data
PATH = os.path.abspath(os.path.join(ROOT, "../results/proteins/"))
DATA = os.path.abspath(os.path.join(ROOT, "../data"))
DATA2 = 'ligdis/data'
CACHE = os.path.abspath(os.path.join(ROOT, "../cache"))
# generic inputs
# protein id to gene name
dataset = load_dataset(DATA2, data_files={"general/pid2name_primary.tsv"}, delimiter='\t')
df = dataset['train'].to_pandas()
pid2name = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))
name2pid = dict(zip(df.iloc[:, 1], df.iloc[:, 0]))
del dataset, df  # free the raw dataset and dataframe once the lookup dicts are built
gc.collect()
def pid2gene(x):
if x in pid2name:
return pid2name[x]
else:
return x
def gene2pid(x):
if x in name2pid:
return name2pid[x]
else:
return x
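# Both helpers fall back to returning the input unchanged when no mapping
# exists, so unknown identifiers pass through rather than raising a KeyError.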
def pretty_term(x):
x = x.title()
if x.endswith("]"):
x = x.split(" [")[0]
return x
def hf_tsv_2_pandas_df(hf_repo, data_file, myHeader):
    url = '/'.join(("https://huggingface.co/datasets", hf_repo, "resolve/main", data_file))
    response = requests.get(url)
    if response.status_code == 200:
        tsv_data = StringIO(response.text)  # treat the response body as a file-like object
        df = pd.read_csv(tsv_data, sep='\t', header=myHeader)  # load the TSV into a DataFrame
    else:
        df = pd.DataFrame()
        st.write("Error loading dataset from hf_repo: ", hf_repo, " and data_file: ", data_file)
    return df
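# Example usage (same repo/file pair as the pid2name table loaded above;
# myHeader=None assumes the file has no header row):
#   df = hf_tsv_2_pandas_df(hf_repo="ligdis/data",
#                           data_file="general/pid2name_primary.tsv",
#                           myHeader=None)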
def load_hf_json(json_url):
    out = None  # returned unchanged if the request fails
    response = requests.get(json_url)
    if response.status_code == 200:
        out = response.json()
    else:
        print("Failed to retrieve ", json_url, " file. HTTP Status Code: ", response.status_code)
    return out
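# Example usage (the metrics URL assembled in the Detailed view below follows
# this shape; <repo> is a placeholder):
#   metrics = load_hf_json("https://huggingface.co/datasets/<repo>/resolve/main/metrics.json")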
def load_hf_image(image_url):
    img = None  # returned unchanged if the request fails
    response = requests.get(image_url)
    if response.status_code == 200:
        img = Image.open(BytesIO(response.content))
    else:
        print("Failed to retrieve image. HTTP Status Code:", response.status_code)
    return img
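# Example usage (<repo> and <plot> are placeholders):
#   img = load_hf_image("https://huggingface.co/datasets/<repo>/resolve/main/<plot>.png")
#   st.image(img)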
# side bar
st.sidebar.title("Ligand Discovery 3: Protein-set Enrichment Analysis")
# signatures (aka profiles)
st.sidebar.header("Select a fragment")
profile_type = PROFILE_TYPE
profile_type_subfolder = profile_type.lower()
# @st.cache_data
# def get_sorted_fids():
# fids = []
# for fid in listdir_util(os.path.join(DATA, "signatures", "proteins", "fragment")):
# fids += [fid]
# fids = sorted(fids)
# return fids
with open("fid.txt", "r") as file:
lines = file.readlines()
# Remove the newline characters (\n) from each line
fids = [line.strip() for line in lines]
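# fid.txt is expected to hold one fragment identifier per line (replacing the
# commented get_sorted_fids() scan above; assumed to be pre-sorted).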
# fids = get_sorted_fids()
profile = st.sidebar.selectbox("Fragment identifier", options=fids)
profile_subfolder = profile
all_cases = fids
draw_fragment = True
st.sidebar.header("Choose a type of analysis")
type_of_analysis = st.sidebar.radio(
"Type of analysis", options=["Overview", "Detailed"]
)
# OVERVIEW TYPE OF ANALYSIS
if type_of_analysis == "Overview":
st.header("Enrichment overview for {0} {1}".format(profile_type.lower(), profile))
view = st.sidebar.radio("Select View", options=["Table", "Plot"])
df = hf_tsv_2_pandas_df(hf_repo="ligdis/cache_overview", data_file="{0}.tsv".format(profile), myHeader=0)
# df = pd.read_csv(os.path.join(CACHE, "overview", "{0}.tsv".format(profile)), sep="\t")
if view == "Table":
columns = st.columns(4)
prot2idx = collections.defaultdict(list)
for i,r in enumerate(list(df["edge"])):
for x in r.split(","):
gn = pid2gene(x)
prot2idx[gn] += [i]
all_proteins_ = sorted(prot2idx.keys())
ann2idx = collections.defaultdict(list)
for i,r in enumerate(df["term"]):
ann2idx[r] += [i]
all_annotations_ = sorted(ann2idx.keys())
type2idx = collections.defaultdict(list)
for i,r in enumerate(list(df["type"])):
type2idx[r] += [i]
all_types_ = sorted(type2idx.keys())
subtype2idx = collections.defaultdict(list)
for i,r in enumerate(list(df["subtype"])):
subtype2idx[r] += [i]
all_subtypes_ = sorted(subtype2idx.keys())
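        # Each *2idx dict maps a display value to the overview-table row indices
        # that contain it, so the filters below can be combined by index.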
selected_proteins = columns[0].multiselect("Filter by proteins in leading edge ({0} unique proteins)".format(len(all_proteins_)), options=all_proteins_)
selected_annotations = columns[1].multiselect("Select annotations", options=all_annotations_)
selected_subtypes = columns[2].multiselect("Filter by annotation subtype", options=all_subtypes_)
selected_types = columns[3].multiselect("Filter by annotation type", options=all_types_)
keep_idxs = []
        if selected_proteins:
            for x in selected_proteins:
                for idx in prot2idx[x]:
                    keep_idxs += [idx]
        if selected_annotations:
            for x in selected_annotations:
                for idx in ann2idx[x]:
                    keep_idxs += [idx]
        if selected_subtypes:
            for x in selected_subtypes:
                for idx in subtype2idx[x]:
                    keep_idxs += [idx]
        if selected_types:
            for x in selected_types:
                for idx in type2idx[x]:
                    keep_idxs += [idx]
if keep_idxs:
keep_idxs = sorted(set(keep_idxs))
df = df.iloc[keep_idxs]
df["edge_genes"] = [" ".join([pid2gene(x) for x in r.split(",")]) for r in list(df["edge"])]
df_view = df[["term", "overlap", "setsize", "score", "pval", "edge_genes", "subtype", "type"]]
df_view = df_view.rename(columns = {
"term": "Term",
"overlap": "Edge size",
"setsize": "Set size",
"score": "Score",
"pval": "P-value",
"edge_genes": "Leading edge",
"subtype": "Category subtype",
"type": "Category type"
})
df_view["rank"] = [i+1 for i in range(df_view.shape[0])]
df_view = df_view.set_index("rank")
st.dataframe(df_view.reset_index(drop=True), height=2000)
else:
# st.image(os.path.join(CACHE, "overview", "{0}.png".format(profile)))
        image_url = ''.join(("https://huggingface.co/datasets/ligdis/cache_overview/resolve/main/", "{0}.png".format(profile), "?download=true"))
st.image(image_url)
## DETAILED TYPE OF ANALYSIS
else:
def annotations_selector():
st.sidebar.header("Select protein annotation category")
annotation_types = [
"Sequence",
"Functions",
"Processes and pathways",
"Localization",
"Drugs and Diseases",
]
annotation_type = st.sidebar.radio("Type of annotation", annotation_types)
annotations = annotation_type_dict[annotation_type]
annotation = st.sidebar.selectbox("Annotation source", options=annotations)
annotation_subfolder = annotation_dict[annotation]
return annotation, annotation_subfolder, annotation_type, annotations
def universe_selector():
preselected="HEK293T Core"
universe = preselected
universe_subfolder = universe_dict[universe]
return universe, universe_subfolder
annotation, annotation_subfolder, annotation_type, annotations = (
annotations_selector()
)
universe, universe_subfolder = universe_selector()
st.header("Fragment: {0} & Category: {2} ({1})".format(profile_subfolder, annotation_type, annotation))
# cache_folder = os.path.join(CACHE, "detailed", profile_subfolder, annotation_subfolder)
    cache_folder = '/'.join(("https://huggingface.co/datasets/ligdis", '_'.join(("cache_detailed", profile_subfolder)), "resolve/main", annotation_subfolder))
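    # e.g. https://huggingface.co/datasets/ligdis/cache_detailed_<fragment>/resolve/main/<annotation_subfolder>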
# read metrics
metrics_json_url = '/'.join((cache_folder, "metrics.json"))
metrics = load_hf_json(metrics_json_url)
# with open(os.path.join(cache_folder, "metrics.json"), "r") as f:
# metrics = json.load(f)
metric_cols = st.columns(3)
metric_cols[0].metric(
"{0} profile: {1}".format(profile_type, profile),
value="{0} proteins".format(metrics["signature_size"]),
)
metric_cols[1].metric(
"{0}: {1}".format(annotation_type, annotation),
value="{0} categories".format(metrics["annotations_size"]),
)
metric_cols[2].metric(metrics["title"], value=round(metrics["value"], 2))
columns = st.columns(6)
view = columns[0].radio("View", options=["Tables", "Basic plots", "Advanced plots"])
if view == "Tables":
p_value_cutoff = columns[2].number_input("P-value cutoff", value=0.05, min_value=0., max_value=1., format="%.3f")
min_edge_size = columns[3].number_input("Minimum leading edge size", value=5, min_value=0, max_value=10000)
max_edge_size = columns[4].number_input("Maximum leading edge size", value=5000, min_value=1, max_value=10000)
        protein_label = "Gene Name"
        convert_to_gene = protein_label == "Gene Name"  # show gene names instead of UniProt ACs
# available_selections = json.load(open(os.path.join(cache_folder, "selections.json"), "r"))
selections_json_url = '/'.join((cache_folder, "selections.json"))
available_selections = load_hf_json(selections_json_url)
all_annotations = available_selections["all_annotations"]
available_proteins = available_selections["available_proteins"]
select_columns = st.columns(3)
        selected_annotations = select_columns[2].multiselect(
            "Select annotation categories", options=all_annotations
        )
selected_proteins = select_columns[0].multiselect(
"Filter by proteins found in at least one annotation term ({0})".format(
len(available_proteins)
),
options=available_proteins,
)
task_filename = ''.join((profile, "_val_log2fc.tsv"))
ligdis_annotations_repo = '/'.join(('ligdis', annotation_subfolder))
annotations_json = '/'.join((profile_type_subfolder, profile_subfolder, task_filename.split(".tsv")[0], 'annotations.json'))
annotations_json_url = ''.join(("https://huggingface.co/datasets/", ligdis_annotations_repo, "/resolve/main/", annotations_json))
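        # e.g. https://huggingface.co/datasets/ligdis/<annotation_subfolder>/resolve/main/fragment/<fragment>/<fragment>_val_log2fc/annotations.json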
annotations_ = load_hf_json(annotations_json_url)
if selected_proteins:
if convert_to_gene:
selected_proteins = [gene2pid(x) for x in selected_proteins]
selected_proteins = set(selected_proteins)
if not selected_annotations:
for k, v in annotations_.items():
if len(selected_proteins.intersection(v)) > 0:
selected_annotations += [k]
if not selected_annotations:
st.warning(
"No available annotations for any of your proteins of interest..."
)
# result = pd.read_csv(os.path.join(cache_folder, "result.tsv"), sep="\t")
ligdis_cache_detailed_fragment_repo = '_'.join(("ligdis/cache_detailed", profile_subfolder))
result_file = '/'.join((annotation_subfolder, "result.tsv"))
result = hf_tsv_2_pandas_df(hf_repo = ligdis_cache_detailed_fragment_repo, data_file = result_file, myHeader=0)
        result = result[result["pval"] <= p_value_cutoff]  # apply the user-selected p-value cutoff
        result = result[result["leading_edge_size"] >= min_edge_size]
        result = result[result["leading_edge_size"] <= max_edge_size]
        result = result.reset_index(drop=True)
leading_proteins = available_selections["leading_proteins"]
selected_leading_proteins = select_columns[1].multiselect(
"Filter by proteins found in at least one leading edge",
options = leading_proteins)
if selected_leading_proteins:
prot2idx = collections.defaultdict(list)
for i, r in enumerate(list(result["leading_edge"])):
if str(r) == "nan":
continue
for x in r.split(","):
prot2idx[pid2gene(x)] += [i]
idxs = []
for v in selected_leading_proteins:
for x in prot2idx[v]:
idxs += [x]
idxs = sorted(set(idxs))
result = result.iloc[idxs]
# df_merge = pd.read_csv(os.path.join(cache_folder, "df_merge.tsv"), sep="\t")
df_merge_file = '/'.join((annotation_subfolder, "df_merge.tsv"))
df_merge = hf_tsv_2_pandas_df(hf_repo=ligdis_cache_detailed_fragment_repo, data_file=df_merge_file, myHeader=0)
        type_of_task = metrics["type_of_task"]
        if type_of_task == "ranksum":
            sort_by = "NES"
        else:
            sort_by = "P-value"  # fall back to p-value sorting for other task types
        sort_by_nes = sort_by == "NES"
        direction = "Up"
        is_up = direction == "Up"
df = result.copy()
df = df.rename(columns = {"Term": "term"})
df_merge = df_merge[["term", "score_mean"]]
df = df.merge(df_merge, how="left", on="term")
df = df[df["leading_edge"].notnull()]
df["edge_genes"] = [" ".join([pid2gene(x) for x in r.split(",")]) for r in list(df["leading_edge"])]
df = df[["term","leading_edge_size", "geneset_size", "nes", "pval", "fdr", "score_mean", "edge_genes", "leading_edge"]]
if selected_annotations:
df = df[df["term"].isin(selected_annotations)]
if is_up:
df = df[df["nes"] >= 0]
else:
df = df[df["nes"] < 0]
if sort_by_nes:
if is_up:
df = df.sort_values(by="nes", ascending=False)
else:
df = df.sort_values(by="nes", ascending=True)
else:
df = df.sort_values(by="pval")
df = df.reset_index(drop=True)
df = df.rename(columns = {
"term": "Term",
"leading_edge_size": "Edge size",
"geneset_size": "Set size",
"nes": "Score",
"pval": "P-value",
"fdr": "FDR",
"score_mean": "Mean score",
"edge_genes": "Leading edge",
})
st.dataframe(df[[c for c in list(df.columns)[:-1] if c != "Mean score"]].reset_index(drop=True))
term = st.selectbox("Explore term...", df["Term"])
if term is not None:
# signature_ori = pd.read_csv(os.path.join(results_path, "signature.tsv"), delimiter="\t", header=None)
ligdis_ontology_repo = '/'.join(("ligdis", annotation_subfolder))
ontology_signature_file = '/'.join((profile_type_subfolder, profile_subfolder, task_filename.split(".tsv")[0], "signature.tsv"))
signature_ = hf_tsv_2_pandas_df(hf_repo=ligdis_ontology_repo, data_file=ontology_signature_file, myHeader=None )
# signature_file = os.path.abspath(os.path.join(DATA,"signatures","proteins",profile_type_subfolder,profile_subfolder,task_filename))
ligdis_data_repo = '/'.join(("ligdis", "data"))
fragment_signature_file = '/'.join(("signatures/proteins/fragment", profile_subfolder, task_filename))
# Explore term
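            # t_values holds the ranking scores (Z-scores) from the cached signature;
            # o_values holds the original log2 fold-changes loaded just below.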
t_values = {}
for r in signature_.values:
t_values[r[0]] = r[1]
o_values = {}
# signature_original = pd.read_csv(signature_file, delimiter="\t", header=None)
signature_original = hf_tsv_2_pandas_df(hf_repo=ligdis_data_repo, data_file=fragment_signature_file, myHeader=None)
for r in signature_original.values:
o_values[r[0]] = r[1]
cols = st.columns([0.15, 1])
col = cols[0]
annotations_size = len(annotations_[term])
signature_size = len(signature_)
df_filt = df[df["Term"] == term]
leading_edge = list(df_filt["leading_edge"])[0]
if str(leading_edge) == "nan":
leading_edge = []
else:
leading_edge = leading_edge.split(",")
display_proteins = col.radio(
"Display proteins",
[
"Leading edge ({0})".format(len(leading_edge)),
"In category ({0})".format(annotations_size),
"Full profile ({0})".format(signature_size),
],
)
if "Leading" in display_proteins:
proteins = leading_edge
elif "category" in display_proteins:
proteins = annotations_[term]
else:
                proteins = list(signature_[0])  # full profile: first column of the cached signature
o_values = [o_values[pid] for pid in proteins]
t_values = [t_values[pid] for pid in proteins]
proteins_set = set(proteins)
            genes = [pid2gene(x) for x in proteins]
            if convert_to_gene:
                label = "Gene Name"
            else:
                label = "UniProt AC"
            dl = pd.DataFrame(
                {"Gene Name": genes, "UniProt AC": proteins, "Log2FC": o_values, "Z-score": t_values}
            )
sort_by = col.radio(
"Sort proteins", ["By Z-score", "Alphabetically"]
)
if sort_by != "Alphabetically":
if is_up:
dl = dl.sort_values("Z-score", ascending=False)
else:
dl = dl.sort_values("Z-score", ascending=True)
else:
dl = dl.sort_values(label)
dl = dl.reset_index(drop=True)
col = cols[1]
col.dataframe(dl.reset_index(drop=True))
if view == "Basic plots":
top_plots_number = columns[1].number_input("Maximum number of plots", value=12, min_value=1, max_value=50)
plot_columns = st.columns(4)
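        # Lay out the cached per-term plots in a four-column grid, capped at
        # top_plots_number images.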
# with open(os.path.join(cache_folder, "basic", "idx2term.json"), "r") as f:
# idx2term = json.load(f)
idx2term_json_url = '/'.join((cache_folder, "basic", "idx2term.json"))
idx2term = load_hf_json(idx2term_json_url)
        idxs = list(range(len(idx2term)))
i = 0
j = 0
for idx in idxs:
if i == len(plot_columns):
i = 0
col = plot_columns[i]
if j == top_plots_number:
break
# col.image(os.path.join(cache_folder, "basic", "plot_{0}.png".format(idx)))
image_url = '/'.join((cache_folder, "basic", "plot_{0}.png".format(idx)))
col.image(image_url) # Show the image
i += 1
j += 1
if view == "Advanced plots":
top_plots_number = columns[1].number_input("Maximum number of plots", value=5, min_value=1, max_value=10)
# with open(os.path.join(cache_folder, "advanced", "idx2term.json"), "r") as f:
# idx2term = json.load(f)
idx2term_json_url = '/'.join((cache_folder, "advanced", "idx2term.json"))
idx2term = load_hf_json(idx2term_json_url)
        idxs = list(range(len(idx2term)))
j = 0
for idx in idxs:
if j == top_plots_number:
break
# st.image(os.path.join(cache_folder, "advanced", "plot_{0}.png".format(idx)))
image_url = '/'.join((cache_folder, "advanced", "plot_{0}.png".format(idx)))
st.image(image_url) # Show the image
j += 1