# regular imports import os import sys import csv import collections import pandas as pd import streamlit as st import json import gc import requests from PIL import Image from io import BytesIO from io import StringIO from datasets import load_dataset st.set_page_config( page_title="Ligand Discovery 3: Protein-set Enrichment Analysis", page_icon=":home:", layout="wide", # "centered", initial_sidebar_state="expanded" ) st.markdown(""" """, unsafe_allow_html=True) hide_streamlit_style = """ """ st.markdown(hide_streamlit_style, unsafe_allow_html=True) proteins_set = None ROOT = os.path.abspath(os.path.dirname(__file__)) # TMP = os.path.join(ROOT, "tmp") # if not os.path.exists(TMP): # os.mkdir(TMP) MIN_SET_SIZE = 1 PROFILE_TYPE = "Fragment" OVERVIEW_PVALUE_CUTOFF = 0.05 # relative imports # sys.path.append(os.path.join(ROOT, "../src/")) # from util import listdir_util def listdir_util(path): for d in os.listdir(path): if d.startswith("_"): continue else: yield d # import metadata from proteome_meta import task_suf from proteome_meta import annotation_type_dict from proteome_meta import annotation_dict from proteome_meta import universe_dict # path to results and original data PATH = os.path.abspath(os.path.join(ROOT, "../results/proteins/")) DATA = os.path.abspath(os.path.join(ROOT, "../data")) DATA2 = 'ligdis/data' mySeparator = "/" CACHE = os.path.abspath(os.path.join(ROOT, "../cache")) # generic inputs # protein id to gene name dataset = load_dataset('ligdis/data', data_files={"general/pid2name_primary.tsv"}, delimiter='\t') df = dataset['train'].to_pandas() pid2name = dict(zip(df.iloc[:, 0], df.iloc[:, 1])) name2pid = dict(zip(df.iloc[:, 1], df.iloc[:, 0])) del dataset, df # Delete the variable gc.collect() def pid2gene(x): if x in pid2name: return pid2name[x] else: return x def gene2pid(x): if x in name2pid: return name2pid[x] else: return x def pretty_term(x): x = x.title() if x.endswith("]"): x = x.split(" [")[0] return x def hf_tsv_2_pandas_df(hf_repo, data_file, myHeader): url = '/'.join(("https://huggingface.co/datasets", hf_repo, "resolve/main", data_file)) response = requests.get(url) if response.status_code == 200: tsv_data = StringIO(response.text) # Use StringIO to treat the string content as a file-like object df = pd.read_csv(tsv_data, sep='\t', header = myHeader) # Load the TSV file into a pandas DataFrame else: df = pd.DataFrame() st.write("Error loading dataset from hf_repo: ", hf_repo, " and data_file: ", data_file) return(df) def load_hf_json(json_url): response = requests.get(json_url) if response.status_code == 200: out = response.json() else: print("Failed to retrieve ", json_url, " file. HTTP Status Code: ", response.status_code) return(out) def load_hf_image(image_url): response = requests.get(image_url) if response.status_code == 200: img = Image.open(BytesIO(response.content)) else: print("Failed to retrieve image. HTTP Status Code:", response.status_code) return(img) # side bar st.sidebar.title("Ligand Discovery 3: Protein-set Enrichment Analysis") # signatures (aka profiles) st.sidebar.header("Select a fragment") profile_type = PROFILE_TYPE profile_type_subfolder = profile_type.lower() # @st.cache_data # def get_sorted_fids(): # fids = [] # for fid in listdir_util(os.path.join(DATA, "signatures", "proteins", "fragment")): # fids += [fid] # fids = sorted(fids) # return fids with open("fid.txt", "r") as file: lines = file.readlines() # Remove the newline characters (\n) from each line fids = [line.strip() for line in lines] # fids = get_sorted_fids() profile = st.sidebar.selectbox("Fragment identifier", options=fids) profile_subfolder = profile all_cases = fids draw_fragment = True st.sidebar.header("Choose a type of analysis") type_of_analysis = st.sidebar.radio( "Type of analysis", options=["Overview", "Detailed"] ) # OVERVIEW TYPE OF ANALYSYS if type_of_analysis == "Overview": st.header("Enrichment overview for {0} {1}".format(profile_type.lower(), profile)) view = st.sidebar.radio("Select View", options=["Table", "Plot"]) df = hf_tsv_2_pandas_df(hf_repo="ligdis/cache_overview", data_file="{0}.tsv".format(profile), myHeader=0) # df = pd.read_csv(os.path.join(CACHE, "overview", "{0}.tsv".format(profile)), sep="\t") if view == "Table": columns = st.columns(4) prot2idx = collections.defaultdict(list) for i,r in enumerate(list(df["edge"])): for x in r.split(","): gn = pid2gene(x) prot2idx[gn] += [i] all_proteins_ = sorted(prot2idx.keys()) ann2idx = collections.defaultdict(list) for i,r in enumerate(df["term"]): ann2idx[r] += [i] all_annotations_ = sorted(ann2idx.keys()) type2idx = collections.defaultdict(list) for i,r in enumerate(list(df["type"])): type2idx[r] += [i] all_types_ = sorted(type2idx.keys()) subtype2idx = collections.defaultdict(list) for i,r in enumerate(list(df["subtype"])): subtype2idx[r] += [i] all_subtypes_ = sorted(subtype2idx.keys()) selected_proteins = columns[0].multiselect("Filter by proteins in leading edge ({0} unique proteins)".format(len(all_proteins_)), options=all_proteins_) selected_annotations = columns[1].multiselect("Select annotations", options=all_annotations_) selected_subtypes = columns[2].multiselect("Filter by annotation subtype", options=all_subtypes_) selected_types = columns[3].multiselect("Filter by annotation type", options=all_types_) keep_idxs = [] if selected_proteins is not None: for x in selected_proteins: for idx in prot2idx[x]: keep_idxs += [idx] if selected_annotations is not None: for x in selected_annotations: for idx in ann2idx[x]: keep_idxs += [idx] if selected_subtypes is not None: for x in selected_subtypes: for idx in subtype2idx[x]: keep_idxs += [idx] if selected_types is not None: for x in selected_types: for idx in type2idx[x]: keep_idxs += [idx] if keep_idxs: keep_idxs = sorted(set(keep_idxs)) df = df.iloc[keep_idxs] df["edge_genes"] = [" ".join([pid2gene(x) for x in r.split(",")]) for r in list(df["edge"])] df_view = df[["term", "overlap", "setsize", "score", "pval", "edge_genes", "subtype", "type"]] df_view = df_view.rename(columns = { "term": "Term", "overlap": "Edge size", "setsize": "Set size", "score": "Score", "pval": "P-value", "edge_genes": "Leading edge", "subtype": "Category subtype", "type": "Category type" }) df_view["rank"] = [i+1 for i in range(df_view.shape[0])] df_view = df_view.set_index("rank") st.dataframe(df_view.reset_index(drop=True), height=2000) else: # st.image(os.path.join(CACHE, "overview", "{0}.png".format(profile))) image_url = ''.join(("https://huggingface.co/datasets/ligdis/cache_overview/resolve/main/", "{0}.png".format(profile), "?download=true")) # Replace with actual URL st.image(image_url) ## DETAILED TYPE OF ANALYSIS else: def annotations_selector(): st.sidebar.header("Select protein annotation category") annotation_types = [ "Sequence", "Functions", "Processes and pathways", "Localization", "Drugs and Diseases", ] annotation_type = st.sidebar.radio("Type of annotation", annotation_types) annotations = annotation_type_dict[annotation_type] annotation = st.sidebar.selectbox("Annotation source", options=annotations) annotation_subfolder = annotation_dict[annotation] return annotation, annotation_subfolder, annotation_type, annotations def universe_selector(): preselected="HEK293T Core" universe = preselected universe_subfolder = universe_dict[universe] return universe, universe_subfolder annotation, annotation_subfolder, annotation_type, annotations = ( annotations_selector() ) universe, universe_subfolder = universe_selector() st.header("Fragment: {0} & Category: {2} ({1})".format(profile_subfolder, annotation_type, annotation)) # cache_folder = os.path.join(CACHE, "detailed", profile_subfolder, annotation_subfolder) cache_folder = '/'.join(("https://huggingface.co/datasets/ligdis", '_'.join(("cache_detailed", profile_subfolder)), "resolve/main", annotation_subfolder )) # read metrics metrics_json_url = '/'.join((cache_folder, "metrics.json")) metrics = load_hf_json(metrics_json_url) # with open(os.path.join(cache_folder, "metrics.json"), "r") as f: # metrics = json.load(f) metric_cols = st.columns(3) metric_cols[0].metric( "{0} profile: {1}".format(profile_type, profile), value="{0} proteins".format(metrics["signature_size"]), ) metric_cols[1].metric( "{0}: {1}".format(annotation_type, annotation), value="{0} categories".format(metrics["annotations_size"]), ) metric_cols[2].metric(metrics["title"], value=round(metrics["value"], 2)) columns = st.columns(6) view = columns[0].radio("View", options=["Tables", "Basic plots", "Advanced plots"]) if view == "Tables": p_value_cutoff = columns[2].number_input("P-value cutoff", value=0.05, min_value=0., max_value=1., format="%.3f") min_edge_size = columns[3].number_input("Minimum leading edge size", value=5, min_value=0, max_value=10000) max_edge_size = columns[4].number_input("Maximum leading edge size", value=5000, min_value=1, max_value=10000) protein_label = "Gene Name" if protein_label == "Gene Name": convert_to_gene = True else: convert_to_gene = False # available_selections = json.load(open(os.path.join(cache_folder, "selections.json"), "r")) selections_json_url = '/'.join((cache_folder, "selections.json")) available_selections = load_hf_json(selections_json_url) all_annotations = available_selections["all_annotations"] available_proteins = available_selections["available_proteins"] select_columns = st.columns(3) selected_annotations = select_columns[2].multiselect( "Select annotation categories", options=available_proteins ) selected_proteins = select_columns[0].multiselect( "Filter by proteins found in at least one annotation term ({0})".format( len(available_proteins) ), options=available_proteins, ) task_filename = ''.join((profile, "_val_log2fc.tsv")) ligdis_annotations_repo = '/'.join(('ligdis', annotation_subfolder)) annotations_json = '/'.join((profile_type_subfolder, profile_subfolder, task_filename.split(".tsv")[0], 'annotations.json')) annotations_json_url = ''.join(("https://huggingface.co/datasets/", ligdis_annotations_repo, "/resolve/main/", annotations_json)) annotations_ = load_hf_json(annotations_json_url) if selected_proteins: if convert_to_gene: selected_proteins = [gene2pid(x) for x in selected_proteins] selected_proteins = set(selected_proteins) if not selected_annotations: for k, v in annotations_.items(): if len(selected_proteins.intersection(v)) > 0: selected_annotations += [k] if not selected_annotations: st.warning( "No available annotations for any of your proteins of interest..." ) # result = pd.read_csv(os.path.join(cache_folder, "result.tsv"), sep="\t") ligdis_cache_detailed_fragment_repo = '_'.join(("ligdis/cache_detailed", profile_subfolder)) result_file = '/'.join((annotation_subfolder, "result.tsv")) result = hf_tsv_2_pandas_df(hf_repo = ligdis_cache_detailed_fragment_repo, data_file = result_file, myHeader=0) result = result[result["leading_edge_size"] >= min_edge_size] result = result[result["leading_edge_size"] <= max_edge_size] result = result.reset_index(drop=True) leading_proteins = available_selections["leading_proteins"] selected_leading_proteins = select_columns[1].multiselect( "Filter by proteins found in at least one leading edge", options = leading_proteins) if selected_leading_proteins: prot2idx = collections.defaultdict(list) for i, r in enumerate(list(result["leading_edge"])): if str(r) == "nan": continue for x in r.split(","): prot2idx[pid2gene(x)] += [i] idxs = [] for v in selected_leading_proteins: for x in prot2idx[v]: idxs += [x] idxs = sorted(set(idxs)) result = result.iloc[idxs] # df_merge = pd.read_csv(os.path.join(cache_folder, "df_merge.tsv"), sep="\t") df_merge_file = '/'.join((annotation_subfolder, "df_merge.tsv")) df_merge = hf_tsv_2_pandas_df(hf_repo=ligdis_cache_detailed_fragment_repo, data_file=df_merge_file, myHeader=0) type_of_task = metrics["type_of_task"] if type_of_task == "ranksum": sort_by = "NES" if sort_by == "NES": sort_by_nes = True else: sort_by_nes = False direction = "Up" if direction == "Up": is_up = True else: is_up = False df = result.copy() df = df.rename(columns = {"Term": "term"}) df_merge = df_merge[["term", "score_mean"]] df = df.merge(df_merge, how="left", on="term") df = df[df["leading_edge"].notnull()] df["edge_genes"] = [" ".join([pid2gene(x) for x in r.split(",")]) for r in list(df["leading_edge"])] df = df[["term","leading_edge_size", "geneset_size", "nes", "pval", "fdr", "score_mean", "edge_genes", "leading_edge"]] if selected_annotations: df = df[df["term"].isin(selected_annotations)] if is_up: df = df[df["nes"] >= 0] else: df = df[df["nes"] < 0] if sort_by_nes: if is_up: df = df.sort_values(by="nes", ascending=False) else: df = df.sort_values(by="nes", ascending=True) else: df = df.sort_values(by="pval") df = df.reset_index(drop=True) df = df.rename(columns = { "term": "Term", "leading_edge_size": "Edge size", "geneset_size": "Set size", "nes": "Score", "pval": "P-value", "fdr": "FDR", "score_mean": "Mean score", "edge_genes": "Leading edge", }) st.dataframe(df[[c for c in list(df.columns)[:-1] if c != "Mean score"]].reset_index(drop=True)) term = st.selectbox("Explore term...", df["Term"]) if term is not None: # signature_ori = pd.read_csv(os.path.join(results_path, "signature.tsv"), delimiter="\t", header=None) ligdis_ontology_repo = '/'.join(("ligdis", annotation_subfolder)) ontology_signature_file = '/'.join((profile_type_subfolder, profile_subfolder, task_filename.split(".tsv")[0], "signature.tsv")) signature_ = hf_tsv_2_pandas_df(hf_repo=ligdis_ontology_repo, data_file=ontology_signature_file, myHeader=None ) # signature_file = os.path.abspath(os.path.join(DATA,"signatures","proteins",profile_type_subfolder,profile_subfolder,task_filename)) ligdis_data_repo = '/'.join(("ligdis", "data")) fragment_signature_file = '/'.join(("signatures/proteins/fragment", profile_subfolder, task_filename)) # Explore term t_values = {} for r in signature_.values: t_values[r[0]] = r[1] o_values = {} # signature_original = pd.read_csv(signature_file, delimiter="\t", header=None) signature_original = hf_tsv_2_pandas_df(hf_repo=ligdis_data_repo, data_file=fragment_signature_file, myHeader=None) for r in signature_original.values: o_values[r[0]] = r[1] cols = st.columns([0.15, 1]) col = cols[0] annotations_size = len(annotations_[term]) signature_size = len(signature_) df_filt = df[df["Term"] == term] leading_edge = list(df_filt["leading_edge"])[0] if str(leading_edge) == "nan": leading_edge = [] else: leading_edge = leading_edge.split(",") display_proteins = col.radio( "Display proteins", [ "Leading edge ({0})".format(len(leading_edge)), "In category ({0})".format(annotations_size), "Full profile ({0})".format(signature_size), ], ) if "Leading" in display_proteins: proteins = leading_edge elif "category" in display_proteins: proteins = annotations_[term] else: proteins = signature_[0] o_values = [o_values[pid] for pid in proteins] t_values = [t_values[pid] for pid in proteins] proteins_set = set(proteins) if convert_to_gene: genes = [pid2gene(x) for x in proteins] label = "Gene Name" else: label = "UniProtAC" dl = pd.DataFrame( {"Gene Name": genes, "UniProt AC": proteins, "Log2FC": o_values, "Z-score": t_values} ) sort_by = col.radio( "Sort proteins", ["By Z-score", "Alphabetically"] ) if sort_by != "Alphabetically": if is_up: dl = dl.sort_values("Z-score", ascending=False) else: dl = dl.sort_values("Z-score", ascending=True) else: dl = dl.sort_values(label) dl = dl.reset_index(drop=True) col = cols[1] col.dataframe(dl.reset_index(drop=True)) if view == "Basic plots": top_plots_number = columns[1].number_input("Maximum number of plots", value=12, min_value=1, max_value=50) plot_columns = st.columns(4) # with open(os.path.join(cache_folder, "basic", "idx2term.json"), "r") as f: # idx2term = json.load(f) idx2term_json_url = '/'.join((cache_folder, "basic", "idx2term.json")) idx2term = load_hf_json(idx2term_json_url) idxs = [i for i in range(len(idx2term))] i = 0 j = 0 for idx in idxs: if i == len(plot_columns): i = 0 col = plot_columns[i] if j == top_plots_number: break # col.image(os.path.join(cache_folder, "basic", "plot_{0}.png".format(idx))) image_url = '/'.join((cache_folder, "basic", "plot_{0}.png".format(idx))) col.image(image_url) # Show the image i += 1 j += 1 if view == "Advanced plots": top_plots_number = columns[1].number_input("Maximum number of plots", value=5, min_value=1, max_value=10) # with open(os.path.join(cache_folder, "advanced", "idx2term.json"), "r") as f: # idx2term = json.load(f) idx2term_json_url = '/'.join((cache_folder, "advanced", "idx2term.json")) idx2term = load_hf_json(idx2term_json_url) idxs = [i for i in range(len(idx2term))] j = 0 for idx in idxs: if j == top_plots_number: break # st.image(os.path.join(cache_folder, "advanced", "plot_{0}.png".format(idx))) image_url = '/'.join((cache_folder, "advanced", "plot_{0}.png".format(idx))) st.image(image_url) # Show the image j += 1