|
|
|
import os |
|
import sys |
|
import csv |
|
import collections |
|
import pandas as pd |
|
import streamlit as st |
|
import json |
|
import gc |
|
import requests |
|
from PIL import Image |
|
from io import BytesIO |
|
from io import StringIO |
|
from datasets import load_dataset |
|
|
|
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(
    page_title="Ligand Discovery 3: Protein-set Enrichment Analysis",
    page_icon=":home:",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Pull the main content upwards by overriding a Streamlit-generated CSS class.
# NOTE(review): ".css-13sdm1b.e16nr0p33" is an auto-generated class name and
# may break on a Streamlit upgrade — confirm after bumping the version.
st.markdown("""
<style>
.css-13sdm1b.e16nr0p33 {
    margin-top: -75px;
}
</style>
""", unsafe_allow_html=True)

# Hide the default Streamlit chrome (hamburger menu, footer, header).
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
#header {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
|
|
# Set of proteins currently displayed; reassigned later in the "Detailed" view.
proteins_set = None

# Absolute path of the directory containing this script.
ROOT = os.path.abspath(os.path.dirname(__file__))

# NOTE(review): the three constants below are not referenced anywhere in this
# chunk — confirm they are used elsewhere before removing.
MIN_SET_SIZE = 1
# Only fragment-level profiles are exposed by this app.
PROFILE_TYPE = "Fragment"
OVERVIEW_PVALUE_CUTOFF = 0.05
|
|
|
|
|
|
|
|
|
|
|
def listdir_util(path):
    """Yield directory entries of *path*, skipping names starting with '_'."""
    yield from (entry for entry in os.listdir(path) if not entry.startswith("_"))
|
|
|
|
|
from proteome_meta import task_suf |
|
from proteome_meta import annotation_type_dict |
|
from proteome_meta import annotation_dict |
|
from proteome_meta import universe_dict |
|
|
|
|
|
|
|
# Local results/data locations, resolved relative to this script's directory.
PATH = os.path.abspath(os.path.join(ROOT, "../results/proteins/"))
DATA = os.path.abspath(os.path.join(ROOT, "../data"))
# Hugging Face dataset repository id holding the shared app data.
DATA2 = 'ligdis/data'
mySeparator = "/"  # NOTE(review): appears unused in this chunk — confirm before removing.
CACHE = os.path.abspath(os.path.join(ROOT, "../cache"))
|
|
|
|
|
|
|
|
|
|
|
# Load the UniProt-accession <-> primary-gene-name mapping table from the
# "ligdis/data" Hugging Face dataset.
# NOTE(review): data_files is passed as a *set* literal here; load_dataset
# normally takes a str/list/dict — confirm this works on the pinned
# `datasets` version.
dataset = load_dataset('ligdis/data', data_files={"general/pid2name_primary.tsv"}, delimiter='\t')
df = dataset['train'].to_pandas()
# Column 0 is assumed to be the UniProt AC and column 1 the gene name — TODO confirm.
pid2name = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))
name2pid = dict(zip(df.iloc[:, 1], df.iloc[:, 0]))
# Drop the dataset objects; only the two lookup dicts are kept in memory.
del dataset, df
gc.collect()
|
|
|
def pid2gene(x):
    """Translate a UniProt accession to its gene name, falling back to the input."""
    return pid2name.get(x, x)
|
|
|
|
|
def gene2pid(x):
    """Translate a gene name back to its UniProt accession, falling back to the input."""
    return name2pid.get(x, x)
|
|
|
|
|
def pretty_term(x):
    """Title-case a term and drop a trailing bracketed identifier (e.g. " [GO:...]")."""
    titled = x.title()
    if not titled.endswith("]"):
        return titled
    return titled.split(" [")[0]
|
|
|
def hf_tsv_2_pandas_df(hf_repo, data_file, myHeader):
    """Download a TSV file from a Hugging Face dataset repo into a DataFrame.

    Parameters
    ----------
    hf_repo : str
        Dataset repository id, e.g. "ligdis/data".
    data_file : str
        Path of the TSV file inside the repository.
    myHeader : int or None
        Forwarded to pandas as ``header`` (0 = first row is the header,
        None = no header row).

    Returns
    -------
    pandas.DataFrame
        The parsed table, or an empty DataFrame (with an on-page error
        message) when the file cannot be retrieved.
    """
    url = '/'.join(("https://huggingface.co/datasets", hf_repo, "resolve/main", data_file))
    # A timeout prevents a stuck connection from hanging the whole app.
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        df = pd.read_csv(StringIO(response.text), sep='\t', header=myHeader)
    else:
        df = pd.DataFrame()
        st.write("Error loading dataset from hf_repo: ", hf_repo, " and data_file: ", data_file)
    return df
|
|
|
def load_hf_json(json_url):
    """Fetch and decode a JSON file from *json_url*.

    Returns the decoded object, or ``None`` when the request fails (a message
    is printed to stdout).  The previous version left ``out`` unbound on a
    non-200 response and raised ``UnboundLocalError`` at the return statement;
    initialising it makes the failure mode explicit while keeping the success
    path identical.
    """
    out = None
    # Timeout keeps a hung request from blocking the Streamlit script run.
    response = requests.get(json_url, timeout=60)
    if response.status_code == 200:
        out = response.json()
    else:
        print("Failed to retrieve ", json_url, " file. HTTP Status Code: ", response.status_code)
    return out
|
|
|
def load_hf_image(image_url):
    """Fetch an image from *image_url* and open it with PIL.

    Returns a ``PIL.Image.Image``, or ``None`` when the request fails (a
    message is printed to stdout).  The previous version only bound ``img``
    on success, so a failed download raised ``UnboundLocalError`` instead of
    reporting the HTTP error.
    """
    img = None
    # Timeout keeps a hung request from blocking the Streamlit script run.
    response = requests.get(image_url, timeout=60)
    if response.status_code == 200:
        img = Image.open(BytesIO(response.content))
    else:
        print("Failed to retrieve image. HTTP Status Code:", response.status_code)
    return img
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Sidebar: fragment selection and type of analysis
# ---------------------------------------------------------------------------
st.sidebar.title("Ligand Discovery 3: Protein-set Enrichment Analysis")

st.sidebar.header("Select a fragment")

profile_type = PROFILE_TYPE
profile_type_subfolder = profile_type.lower()

# Fragment identifiers, one per line.
# NOTE(review): "fid.txt" is resolved against the process working directory,
# unlike the other data paths which are resolved against ROOT — confirm this
# is intentional.
with open("fid.txt", "r") as file:
    lines = file.readlines()

fids = [line.strip() for line in lines]

profile = st.sidebar.selectbox("Fragment identifier", options=fids)
profile_subfolder = profile
all_cases = fids
draw_fragment = True  # NOTE(review): appears unused in this chunk.

st.sidebar.header("Choose a type of analysis")

type_of_analysis = st.sidebar.radio(
    "Type of analysis", options=["Overview", "Detailed"]
)
|
|
|
|
|
|
|
if type_of_analysis == "Overview":
    # -----------------------------------------------------------------------
    # Overview: one pre-computed enrichment table / plot per fragment.
    # -----------------------------------------------------------------------
    st.header("Enrichment overview for {0} {1}".format(profile_type.lower(), profile))
    view = st.sidebar.radio("Select View", options=["Table", "Plot"])

    # Pre-computed overview table for this fragment, cached on Hugging Face.
    df = hf_tsv_2_pandas_df(hf_repo="ligdis/cache_overview", data_file="{0}.tsv".format(profile), myHeader=0)

    if view == "Table":
        columns = st.columns(4)

        # Index table rows by each gene appearing in the leading edge.
        prot2idx = collections.defaultdict(list)
        for i, r in enumerate(list(df["edge"])):
            for x in r.split(","):
                gn = pid2gene(x)
                prot2idx[gn] += [i]
        all_proteins_ = sorted(prot2idx.keys())

        # Index table rows by annotation term.
        ann2idx = collections.defaultdict(list)
        for i, r in enumerate(df["term"]):
            ann2idx[r] += [i]
        all_annotations_ = sorted(ann2idx.keys())

        # Index table rows by annotation type.
        type2idx = collections.defaultdict(list)
        for i, r in enumerate(list(df["type"])):
            type2idx[r] += [i]
        all_types_ = sorted(type2idx.keys())

        # Index table rows by annotation subtype.
        subtype2idx = collections.defaultdict(list)
        for i, r in enumerate(list(df["subtype"])):
            subtype2idx[r] += [i]
        all_subtypes_ = sorted(subtype2idx.keys())

        selected_proteins = columns[0].multiselect("Filter by proteins in leading edge ({0} unique proteins)".format(len(all_proteins_)), options=all_proteins_)
        selected_annotations = columns[1].multiselect("Select annotations", options=all_annotations_)
        selected_subtypes = columns[2].multiselect("Filter by annotation subtype", options=all_subtypes_)
        selected_types = columns[3].multiselect("Filter by annotation type", options=all_types_)

        # Collect the union of row indices matched by any selected filter
        # value (filters are OR-combined across the four widgets).
        # NOTE(review): st.multiselect returns a list (never None), so the
        # "is not None" guards below are always true; iterating the empty
        # lists is a harmless no-op.
        keep_idxs = []
        if selected_proteins is not None:
            for x in selected_proteins:
                for idx in prot2idx[x]:
                    keep_idxs += [idx]

        if selected_annotations is not None:
            for x in selected_annotations:
                for idx in ann2idx[x]:
                    keep_idxs += [idx]

        if selected_subtypes is not None:
            for x in selected_subtypes:
                for idx in subtype2idx[x]:
                    keep_idxs += [idx]

        if selected_types is not None:
            for x in selected_types:
                for idx in type2idx[x]:
                    keep_idxs += [idx]

        # Subset only when at least one filter matched something.
        if keep_idxs:
            keep_idxs = sorted(set(keep_idxs))
            df = df.iloc[keep_idxs]

        # Human-readable leading edge: space-separated gene names.
        df["edge_genes"] = [" ".join([pid2gene(x) for x in r.split(",")]) for r in list(df["edge"])]

        df_view = df[["term", "overlap", "setsize", "score", "pval", "edge_genes", "subtype", "type"]]
        df_view = df_view.rename(columns={
            "term": "Term",
            "overlap": "Edge size",
            "setsize": "Set size",
            "score": "Score",
            "pval": "P-value",
            "edge_genes": "Leading edge",
            "subtype": "Category subtype",
            "type": "Category type"
        })
        # 1-based rank used as index.
        # NOTE(review): reset_index(drop=True) below discards this rank from
        # the rendered table — confirm whether it should be displayed.
        df_view["rank"] = [i + 1 for i in range(df_view.shape[0])]
        df_view = df_view.set_index("rank")

        st.dataframe(df_view.reset_index(drop=True), height=2000)

    else:
        # Pre-rendered overview plot stored alongside the table on Hugging Face.
        image_url = ''.join(("https://huggingface.co/datasets/ligdis/cache_overview/resolve/main/", "{0}.png".format(profile), "?download=true"))
        st.image(image_url)
|
|
|
|
|
|
|
else:
    # -----------------------------------------------------------------------
    # Detailed: per-annotation-source enrichment for the selected fragment.
    # -----------------------------------------------------------------------

    def annotations_selector():
        """Render sidebar widgets to pick an annotation category and source.

        Returns (annotation, annotation_subfolder, annotation_type, annotations),
        where the subfolder comes from the proteome_meta lookup tables.
        """
        st.sidebar.header("Select protein annotation category")

        annotation_types = [
            "Sequence",
            "Functions",
            "Processes and pathways",
            "Localization",
            "Drugs and Diseases",
        ]
        annotation_type = st.sidebar.radio("Type of annotation", annotation_types)

        # Annotation sources available for the chosen category.
        annotations = annotation_type_dict[annotation_type]

        annotation = st.sidebar.selectbox("Annotation source", options=annotations)
        annotation_subfolder = annotation_dict[annotation]

        return annotation, annotation_subfolder, annotation_type, annotations

    def universe_selector():
        """Return the (hard-coded) background universe and its data subfolder."""
        preselected = "HEK293T Core"
        universe = preselected
        universe_subfolder = universe_dict[universe]
        return universe, universe_subfolder

    annotation, annotation_subfolder, annotation_type, annotations = (
        annotations_selector()
    )

    universe, universe_subfolder = universe_selector()

    st.header("Fragment: {0} & Category: {2} ({1})".format(profile_subfolder, annotation_type, annotation))

    # Base URL of the pre-computed cache for this fragment + annotation source.
    cache_folder = '/'.join(("https://huggingface.co/datasets/ligdis", '_'.join(("cache_detailed", profile_subfolder)), "resolve/main", annotation_subfolder))

    # Headline metrics: signature size, number of categories, summary value.
    metrics_json_url = '/'.join((cache_folder, "metrics.json"))
    metrics = load_hf_json(metrics_json_url)

    metric_cols = st.columns(3)
    metric_cols[0].metric(
        "{0} profile: {1}".format(profile_type, profile),
        value="{0} proteins".format(metrics["signature_size"]),
    )
    metric_cols[1].metric(
        "{0}: {1}".format(annotation_type, annotation),
        value="{0} categories".format(metrics["annotations_size"]),
    )
    metric_cols[2].metric(metrics["title"], value=round(metrics["value"], 2))

    columns = st.columns(6)
    view = columns[0].radio("View", options=["Tables", "Basic plots", "Advanced plots"])

    if view == "Tables":
        # User-tunable filters for the result table.
        # NOTE(review): p_value_cutoff is collected but never applied below —
        # confirm whether the table should also be filtered by P-value.
        p_value_cutoff = columns[2].number_input("P-value cutoff", value=0.05, min_value=0., max_value=1., format="%.3f")
        min_edge_size = columns[3].number_input("Minimum leading edge size", value=5, min_value=0, max_value=10000)
        max_edge_size = columns[4].number_input("Maximum leading edge size", value=5000, min_value=1, max_value=10000)
        # Protein display label is hard-coded, so convert_to_gene is always True.
        protein_label = "Gene Name"
        if protein_label == "Gene Name":
            convert_to_gene = True
        else:
            convert_to_gene = False

        # Pre-computed selectable values (annotation terms and proteins).
        selections_json_url = '/'.join((cache_folder, "selections.json"))
        available_selections = load_hf_json(selections_json_url)

        all_annotations = available_selections["all_annotations"]
        available_proteins = available_selections["available_proteins"]

        select_columns = st.columns(3)
        # NOTE(review): this widget is labelled "annotation categories" but is
        # fed available_proteins, while all_annotations is loaded above and
        # never used — options=all_annotations was likely intended. Confirm.
        selected_annotations = select_columns[2].multiselect(
            "Select annotation categories", options=available_proteins
        )

        selected_proteins = select_columns[0].multiselect(
            "Filter by proteins found in at least one annotation term ({0})".format(
                len(available_proteins)
            ),
            options=available_proteins,
        )

        # term -> member proteins mapping for this fragment's signature.
        task_filename = ''.join((profile, "_val_log2fc.tsv"))

        ligdis_annotations_repo = '/'.join(('ligdis', annotation_subfolder))
        annotations_json = '/'.join((profile_type_subfolder, profile_subfolder, task_filename.split(".tsv")[0], 'annotations.json'))
        annotations_json_url = ''.join(("https://huggingface.co/datasets/", ligdis_annotations_repo, "/resolve/main/", annotations_json))

        annotations_ = load_hf_json(annotations_json_url)

        if selected_proteins:
            # Protein filtering operates on UniProt ACs internally.
            if convert_to_gene:
                selected_proteins = [gene2pid(x) for x in selected_proteins]
            selected_proteins = set(selected_proteins)
            # When no explicit terms are chosen, expand the protein filter
            # into every annotation term containing a selected protein.
            if not selected_annotations:
                for k, v in annotations_.items():
                    if len(selected_proteins.intersection(v)) > 0:
                        selected_annotations += [k]
                if not selected_annotations:
                    st.warning(
                        "No available annotations for any of your proteins of interest..."
                    )

        # Full enrichment result table for this fragment + annotation source.
        ligdis_cache_detailed_fragment_repo = '_'.join(("ligdis/cache_detailed", profile_subfolder))
        result_file = '/'.join((annotation_subfolder, "result.tsv"))

        result = hf_tsv_2_pandas_df(hf_repo=ligdis_cache_detailed_fragment_repo, data_file=result_file, myHeader=0)

        # Keep rows whose leading-edge size falls within the requested bounds.
        result = result[result["leading_edge_size"] >= min_edge_size]
        result = result[result["leading_edge_size"] <= max_edge_size]
        result = result.reset_index(drop=True)

        leading_proteins = available_selections["leading_proteins"]

        selected_leading_proteins = select_columns[1].multiselect(
            "Filter by proteins found in at least one leading edge",
            options=leading_proteins)

        if selected_leading_proteins:
            # Map each gene to the result rows whose leading edge contains it.
            prot2idx = collections.defaultdict(list)
            for i, r in enumerate(list(result["leading_edge"])):
                if str(r) == "nan":
                    # Missing leading edge (NaN float) — skip the row.
                    continue
                for x in r.split(","):
                    prot2idx[pid2gene(x)] += [i]

            idxs = []
            for v in selected_leading_proteins:
                for x in prot2idx[v]:
                    idxs += [x]
            idxs = sorted(set(idxs))
            result = result.iloc[idxs]

        # Per-term mean scores, merged into the result table below.
        df_merge_file = '/'.join((annotation_subfolder, "df_merge.tsv"))
        df_merge = hf_tsv_2_pandas_df(hf_repo=ligdis_cache_detailed_fragment_repo, data_file=df_merge_file, myHeader=0)

        type_of_task = metrics["type_of_task"]
        if type_of_task == "ranksum":
            # Display choices are hard-coded: sort by NES, show the "Up" side.
            sort_by = "NES"
            if sort_by == "NES":
                sort_by_nes = True
            else:
                sort_by_nes = False

            direction = "Up"
            if direction == "Up":
                is_up = True
            else:
                is_up = False

            df = result.copy()
            df = df.rename(columns={"Term": "term"})

            df_merge = df_merge[["term", "score_mean"]]

            df = df.merge(df_merge, how="left", on="term")

            # Drop terms without a leading edge.
            df = df[df["leading_edge"].notnull()]

            # Space-separated gene names for display.
            df["edge_genes"] = [" ".join([pid2gene(x) for x in r.split(",")]) for r in list(df["leading_edge"])]

            df = df[["term", "leading_edge_size", "geneset_size", "nes", "pval", "fdr", "score_mean", "edge_genes", "leading_edge"]]

            if selected_annotations:
                df = df[df["term"].isin(selected_annotations)]

            # Keep only the requested direction of enrichment, then sort.
            if is_up:
                df = df[df["nes"] >= 0]
            else:
                df = df[df["nes"] < 0]
            if sort_by_nes:
                if is_up:
                    df = df.sort_values(by="nes", ascending=False)
                else:
                    df = df.sort_values(by="nes", ascending=True)
            else:
                df = df.sort_values(by="pval")

            df = df.reset_index(drop=True)

            df = df.rename(columns={
                "term": "Term",
                "leading_edge_size": "Edge size",
                "geneset_size": "Set size",
                "nes": "Score",
                "pval": "P-value",
                "fdr": "FDR",
                "score_mean": "Mean score",
                "edge_genes": "Leading edge",
            })

            # Render all columns except the raw "leading_edge" (last column)
            # and the "Mean score" helper column.
            st.dataframe(df[[c for c in list(df.columns)[:-1] if c != "Mean score"]].reset_index(drop=True))

            # Drill-down into a single enriched term.
            term = st.selectbox("Explore term...", df["Term"])

            if term is not None:

                # Per-term signature values from the annotation repository.
                ligdis_ontology_repo = '/'.join(("ligdis", annotation_subfolder))
                ontology_signature_file = '/'.join((profile_type_subfolder, profile_subfolder, task_filename.split(".tsv")[0], "signature.tsv"))
                signature_ = hf_tsv_2_pandas_df(hf_repo=ligdis_ontology_repo, data_file=ontology_signature_file, myHeader=None)

                # Original fragment signature (log2 fold-changes).
                ligdis_data_repo = '/'.join(("ligdis", "data"))
                fragment_signature_file = '/'.join(("signatures/proteins/fragment", profile_subfolder, task_filename))

                # Column 0 is assumed to be the protein id and column 1 the
                # value in both signature files — TODO confirm.
                t_values = {}
                for r in signature_.values:
                    t_values[r[0]] = r[1]
                o_values = {}

                signature_original = hf_tsv_2_pandas_df(hf_repo=ligdis_data_repo, data_file=fragment_signature_file, myHeader=None)

                for r in signature_original.values:
                    o_values[r[0]] = r[1]

                cols = st.columns([0.15, 1])

                col = cols[0]

                annotations_size = len(annotations_[term])
                signature_size = len(signature_)

                # Leading edge of the selected term (may be absent/NaN).
                df_filt = df[df["Term"] == term]
                leading_edge = list(df_filt["leading_edge"])[0]
                if str(leading_edge) == "nan":
                    leading_edge = []
                else:
                    leading_edge = leading_edge.split(",")
                display_proteins = col.radio(
                    "Display proteins",
                    [
                        "Leading edge ({0})".format(len(leading_edge)),
                        "In category ({0})".format(annotations_size),
                        "Full profile ({0})".format(signature_size),
                    ],
                )
                if "Leading" in display_proteins:
                    proteins = leading_edge
                elif "category" in display_proteins:
                    proteins = annotations_[term]
                else:
                    # Full profile: column 0 of the signature table.
                    proteins = signature_[0]
                # Align value lists with the chosen protein list.
                o_values = [o_values[pid] for pid in proteins]
                t_values = [t_values[pid] for pid in proteins]

                proteins_set = set(proteins)
                if convert_to_gene:
                    genes = [pid2gene(x) for x in proteins]
                    label = "Gene Name"
                else:
                    # NOTE(review): 'genes' is only bound in the branch above,
                    # and 'label' here ("UniProtAC") does not match the column
                    # name "UniProt AC" below — this branch would raise if
                    # convert_to_gene were ever False (currently it never is).
                    label = "UniProtAC"
                dl = pd.DataFrame(
                    {"Gene Name": genes, "UniProt AC": proteins, "Log2FC": o_values, "Z-score": t_values}
                )

                sort_by = col.radio(
                    "Sort proteins", ["By Z-score", "Alphabetically"]
                )
                if sort_by != "Alphabetically":
                    # Sort best-first in the displayed direction.
                    if is_up:
                        dl = dl.sort_values("Z-score", ascending=False)
                    else:
                        dl = dl.sort_values("Z-score", ascending=True)
                else:
                    dl = dl.sort_values(label)
                dl = dl.reset_index(drop=True)

                col = cols[1]
                col.dataframe(dl.reset_index(drop=True))

    if view == "Basic plots":
        # Grid of pre-rendered per-term plots, 4 per row.
        top_plots_number = columns[1].number_input("Maximum number of plots", value=12, min_value=1, max_value=50)
        plot_columns = st.columns(4)

        idx2term_json_url = '/'.join((cache_folder, "basic", "idx2term.json"))
        idx2term = load_hf_json(idx2term_json_url)

        idxs = [i for i in range(len(idx2term))]

        i = 0  # column cursor; wraps every len(plot_columns) plots
        j = 0  # number of plots rendered so far

        for idx in idxs:
            if i == len(plot_columns):
                i = 0
            col = plot_columns[i]

            if j == top_plots_number:
                break

            image_url = '/'.join((cache_folder, "basic", "plot_{0}.png".format(idx)))
            col.image(image_url)
            i += 1
            j += 1

    if view == "Advanced plots":
        # Full-width pre-rendered plots, stacked vertically.
        top_plots_number = columns[1].number_input("Maximum number of plots", value=5, min_value=1, max_value=10)

        idx2term_json_url = '/'.join((cache_folder, "advanced", "idx2term.json"))
        idx2term = load_hf_json(idx2term_json_url)

        idxs = [i for i in range(len(idx2term))]

        j = 0
        for idx in idxs:
            if j == top_plots_number:
                break

            image_url = '/'.join((cache_folder, "advanced", "plot_{0}.png".format(idx)))
            st.image(image_url)
            j += 1