Create app.py
app.py
ADDED
@@ -0,0 +1,578 @@
# regular imports
import os
import sys
import csv
import collections
import pandas as pd
import streamlit as st
import json
import gc
import requests
from PIL import Image
from io import BytesIO
from io import StringIO
from datasets import load_dataset

proteins_set = None

ROOT = os.path.abspath(os.path.dirname(__file__))
# TMP = os.path.join(ROOT, "tmp")
# if not os.path.exists(TMP):
#     os.mkdir(TMP)

MIN_SET_SIZE = 1
PROFILE_TYPE = "Fragment"
OVERVIEW_PVALUE_CUTOFF = 0.05

# relative imports
# sys.path.append(os.path.join(ROOT, "../src/"))
# from util import listdir_util

def listdir_util(path):
    for d in os.listdir(path):
        if d.startswith("_"):
            continue
        else:
            yield d

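# For illustration (hypothetical directory names): given entries ["results", "_tmp"],
# list(listdir_util(path)) would yield ["results"], skipping underscore-prefixed names.
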
# import metadata
from proteome_meta import task_suf
from proteome_meta import annotation_type_dict
from proteome_meta import annotation_dict
from proteome_meta import universe_dict

# set page layout
st.set_page_config(layout="wide", page_title="Ligand Discovery Protein Set Enrichment Analysis")

# paths to results and original data
PATH = os.path.abspath(os.path.join(ROOT, "../results/proteins/"))
DATA = os.path.abspath(os.path.join(ROOT, "../data"))
DATA2 = 'ligdis/data'
mySeparator = "/"
CACHE = os.path.abspath(os.path.join(ROOT, "../cache"))

# generic inputs

# protein id to gene name

dataset = load_dataset('ligdis/data', data_files={"general/pid2name_primary.tsv"}, delimiter='\t')
df = dataset['train'].to_pandas()
pid2name = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))
name2pid = dict(zip(df.iloc[:, 1], df.iloc[:, 0]))
del dataset, df  # free the raw dataset once the lookup dicts are built
gc.collect()

def pid2gene(x):
    if x in pid2name:
        return pid2name[x]
    else:
        return x


def gene2pid(x):
    if x in name2pid:
        return name2pid[x]
    else:
        return x


def pretty_term(x):
    x = x.title()
    if x.endswith("]"):
        x = x.split(" [")[0]
    return x

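# For illustration (assuming the accession appears in pid2name_primary.tsv):
#   pid2gene("P04637") -> "TP53"; unknown inputs are returned unchanged.
#   pretty_term("dna repair [GO:0006281]") -> "Dna Repair"
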
def hf_tsv_2_pandas_df(hf_repo, data_file, myHeader):

    url = '/'.join(("https://huggingface.co/datasets", hf_repo, "resolve/main", data_file))
    response = requests.get(url)

    if response.status_code == 200:
        tsv_data = StringIO(response.text)  # use StringIO to treat the string content as a file-like object
        df = pd.read_csv(tsv_data, sep='\t', header=myHeader)  # load the TSV file into a pandas DataFrame
    else:
        df = pd.DataFrame()
        st.write("Error loading dataset from hf_repo: ", hf_repo, " and data_file: ", data_file)
    return df

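# For illustration (this file is also loaded via load_dataset above), the call
#   hf_tsv_2_pandas_df("ligdis/data", "general/pid2name_primary.tsv", myHeader=0)
# would fetch:
#   https://huggingface.co/datasets/ligdis/data/resolve/main/general/pid2name_primary.tsv
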
def load_hf_json(json_url):
    response = requests.get(json_url)
    if response.status_code == 200:
        out = response.json()
    else:
        out = None  # avoid an UnboundLocalError when the request fails
        print("Failed to retrieve ", json_url, " file. HTTP Status Code: ", response.status_code)
    return out

def load_hf_image(image_url):
    response = requests.get(image_url)
    if response.status_code == 200:
        img = Image.open(BytesIO(response.content))
    else:
        img = None  # avoid an UnboundLocalError when the request fails
        print("Failed to retrieve image. HTTP Status Code:", response.status_code)
    return img


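# For illustration, with hypothetical fragment id "C000001" and annotation
# subfolder "goprocess", the app calls load_hf_json on URLs of the form:
#   https://huggingface.co/datasets/ligdis/cache_detailed_C000001/resolve/main/goprocess/metrics.json
# returning the parsed dict on HTTP 200, or None on failure.
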
# side bar

st.sidebar.title("Ligand Discovery Proteome Set Enrichment Analysis")

# signatures (aka profiles)
st.sidebar.header("Select a fragment")

profile_type = PROFILE_TYPE
profile_type_subfolder = profile_type.lower()

# @st.cache_data
# def get_sorted_fids():
#     fids = []
#     for fid in listdir_util(os.path.join(DATA, "signatures", "proteins", "fragment")):
#         fids += [fid]
#     fids = sorted(fids)
#     return fids

with open("fid.txt", "r") as file:
    lines = file.readlines()
# Remove the newline characters (\n) from each line
fids = [line.strip() for line in lines]
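# fid.txt is expected to hold one fragment identifier per line, e.g. (hypothetical ids):
#   C000001
#   C000002
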
# fids = get_sorted_fids()
profile = st.sidebar.selectbox("Fragment identifier", options=fids)
profile_subfolder = profile
all_cases = fids
draw_fragment = True

st.sidebar.header("Choose a type of analysis")

type_of_analysis = st.sidebar.radio(
    "Type of analysis", options=["Overview", "Detailed"]
)

# OVERVIEW TYPE OF ANALYSIS

if type_of_analysis == "Overview":

    st.header("Enrichment overview for {0} {1}".format(profile_type.lower(), profile))
    view = st.sidebar.radio("Select View", options=["Table", "Plot"])

    df = hf_tsv_2_pandas_df(hf_repo="ligdis/cache_overview", data_file="{0}.tsv".format(profile), myHeader=0)

    # df = pd.read_csv(os.path.join(CACHE, "overview", "{0}.tsv".format(profile)), sep="\t")

    if view == "Table":

        columns = st.columns(4)

        # index table rows by leading-edge protein, term, type and subtype
        prot2idx = collections.defaultdict(list)
        for i, r in enumerate(list(df["edge"])):
            for x in r.split(","):
                gn = pid2gene(x)
                prot2idx[gn] += [i]
        all_proteins_ = sorted(prot2idx.keys())
        ann2idx = collections.defaultdict(list)
        for i, r in enumerate(df["term"]):
            ann2idx[r] += [i]
        all_annotations_ = sorted(ann2idx.keys())

        type2idx = collections.defaultdict(list)
        for i, r in enumerate(list(df["type"])):
            type2idx[r] += [i]
        all_types_ = sorted(type2idx.keys())

        subtype2idx = collections.defaultdict(list)
        for i, r in enumerate(list(df["subtype"])):
            subtype2idx[r] += [i]
        all_subtypes_ = sorted(subtype2idx.keys())

        selected_proteins = columns[0].multiselect("Filter by proteins in leading edge ({0} unique proteins)".format(len(all_proteins_)), options=all_proteins_)
        selected_annotations = columns[1].multiselect("Select annotations", options=all_annotations_)
        selected_subtypes = columns[2].multiselect("Filter by annotation subtype", options=all_subtypes_)
        selected_types = columns[3].multiselect("Filter by annotation type", options=all_types_)

        keep_idxs = []
        if selected_proteins:
            for x in selected_proteins:
                for idx in prot2idx[x]:
                    keep_idxs += [idx]

        if selected_annotations:
            for x in selected_annotations:
                for idx in ann2idx[x]:
                    keep_idxs += [idx]

        if selected_subtypes:
            for x in selected_subtypes:
                for idx in subtype2idx[x]:
                    keep_idxs += [idx]

        if selected_types:
            for x in selected_types:
                for idx in type2idx[x]:
                    keep_idxs += [idx]

        if keep_idxs:
            keep_idxs = sorted(set(keep_idxs))
            df = df.iloc[keep_idxs]
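        # Note: the filters above combine as a union (OR): a row is kept if it matches
        # any selected protein, annotation, subtype, or type; with no selections the
        # table is left unfiltered.
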
        df["edge_genes"] = [" ".join([pid2gene(x) for x in r.split(",")]) for r in list(df["edge"])]

        df_view = df[["term", "overlap", "setsize", "score", "pval", "edge_genes", "subtype", "type"]]
        df_view = df_view.rename(columns={
            "term": "Term",
            "overlap": "Edge size",
            "setsize": "Set size",
            "score": "Score",
            "pval": "P-value",
            "edge_genes": "Leading edge",
            "subtype": "Category subtype",
            "type": "Category type"
        })
        df_view["rank"] = [i + 1 for i in range(df_view.shape[0])]
        df_view = df_view.set_index("rank")

        st.dataframe(df_view.reset_index(drop=True), height=2000)

    else:
        # st.image(os.path.join(CACHE, "overview", "{0}.png".format(profile)))
        image_url = ''.join(("https://huggingface.co/datasets/ligdis/cache_overview/resolve/main/", "{0}.png".format(profile), "?download=true"))
        st.image(image_url)
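        # For illustration, with a hypothetical fragment id "C000001" the image URL resolves to:
        #   https://huggingface.co/datasets/ligdis/cache_overview/resolve/main/C000001.png?download=true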

## DETAILED TYPE OF ANALYSIS

else:

    def annotations_selector():
        st.sidebar.header("Select protein annotation category")

        annotation_types = [
            "Sequence",
            "Functions",
            "Processes and pathways",
            "Localization",
            "Drugs and Diseases",
        ]
        annotation_type = st.sidebar.radio("Type of annotation", annotation_types)

        annotations = annotation_type_dict[annotation_type]

        annotation = st.sidebar.selectbox("Annotation source", options=annotations)
        annotation_subfolder = annotation_dict[annotation]

        return annotation, annotation_subfolder, annotation_type, annotations

    def universe_selector():
        preselected = "HEK293T Core"
        universe = preselected
        universe_subfolder = universe_dict[universe]
        return universe, universe_subfolder

    annotation, annotation_subfolder, annotation_type, annotations = (
        annotations_selector()
    )

    universe, universe_subfolder = universe_selector()

    st.header("Fragment: {0} & Category: {2} ({1})".format(profile_subfolder, annotation_type, annotation))

    # cache_folder = os.path.join(CACHE, "detailed", profile_subfolder, annotation_subfolder)
    cache_folder = '/'.join(("https://huggingface.co/datasets/ligdis", '_'.join(("cache_detailed", profile_subfolder)), "resolve/main", annotation_subfolder))
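    # For illustration, with hypothetical fragment id "C000001" and annotation
    # subfolder "goprocess", cache_folder resolves to:
    #   https://huggingface.co/datasets/ligdis/cache_detailed_C000001/resolve/main/goprocess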

    # read metrics

    metrics_json_url = '/'.join((cache_folder, "metrics.json"))
    metrics = load_hf_json(metrics_json_url)

    # with open(os.path.join(cache_folder, "metrics.json"), "r") as f:
    #     metrics = json.load(f)

    metric_cols = st.columns(3)
    metric_cols[0].metric(
        "{0} profile: {1}".format(profile_type, profile),
        value="{0} proteins".format(metrics["signature_size"]),
    )
    metric_cols[1].metric(
        "{0}: {1}".format(annotation_type, annotation),
        value="{0} categories".format(metrics["annotations_size"]),
    )
    metric_cols[2].metric(metrics["title"], value=round(metrics["value"], 2))

    columns = st.columns(6)
    view = columns[0].radio("View", options=["Tables", "Basic plots", "Advanced plots"])

    if view == "Tables":

        p_value_cutoff = columns[2].number_input("P-value cutoff", value=0.05, min_value=0., max_value=1., format="%.3f")
        min_edge_size = columns[3].number_input("Minimum leading edge size", value=5, min_value=0, max_value=10000)
        max_edge_size = columns[4].number_input("Maximum leading edge size", value=5000, min_value=1, max_value=10000)
        protein_label = "Gene Name"
        if protein_label == "Gene Name":
            convert_to_gene = True
        else:
            convert_to_gene = False

        # available_selections = json.load(open(os.path.join(cache_folder, "selections.json"), "r"))
        selections_json_url = '/'.join((cache_folder, "selections.json"))
        available_selections = load_hf_json(selections_json_url)

        all_annotations = available_selections["all_annotations"]
        available_proteins = available_selections["available_proteins"]

        select_columns = st.columns(3)
        selected_annotations = select_columns[2].multiselect(
            "Select annotation categories", options=all_annotations
        )

        selected_proteins = select_columns[0].multiselect(
            "Filter by proteins found in at least one annotation term ({0})".format(
                len(available_proteins)
            ),
            options=available_proteins,
        )

        task_filename = ''.join((profile, "_val_log2fc.tsv"))

        ligdis_annotations_repo = '/'.join(('ligdis', annotation_subfolder))
        annotations_json = '/'.join((profile_type_subfolder, profile_subfolder, task_filename.split(".tsv")[0], 'annotations.json'))
        annotations_json_url = ''.join(("https://huggingface.co/datasets/", ligdis_annotations_repo, "/resolve/main/", annotations_json))

        annotations_ = load_hf_json(annotations_json_url)
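        # For illustration, with hypothetical fragment id "C000001" and annotation
        # subfolder "goprocess", annotations_json_url resolves to:
        #   https://huggingface.co/datasets/ligdis/goprocess/resolve/main/fragment/C000001/C000001_val_log2fc/annotations.json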

        if selected_proteins:

            if convert_to_gene:
                selected_proteins = [gene2pid(x) for x in selected_proteins]
            selected_proteins = set(selected_proteins)
            if not selected_annotations:
                for k, v in annotations_.items():
                    if len(selected_proteins.intersection(v)) > 0:
                        selected_annotations += [k]
            if not selected_annotations:
                st.warning(
                    "No available annotations for any of your proteins of interest..."
                )

        # result = pd.read_csv(os.path.join(cache_folder, "result.tsv"), sep="\t")

        ligdis_cache_detailed_fragment_repo = '_'.join(("ligdis/cache_detailed", profile_subfolder))
        result_file = '/'.join((annotation_subfolder, "result.tsv"))

        result = hf_tsv_2_pandas_df(hf_repo=ligdis_cache_detailed_fragment_repo, data_file=result_file, myHeader=0)

        result = result[result["leading_edge_size"] >= min_edge_size]
        result = result[result["leading_edge_size"] <= max_edge_size]
        result = result.reset_index(drop=True)

        leading_proteins = available_selections["leading_proteins"]

        selected_leading_proteins = select_columns[1].multiselect(
            "Filter by proteins found in at least one leading edge",
            options=leading_proteins)

        if selected_leading_proteins:

            prot2idx = collections.defaultdict(list)
            for i, r in enumerate(list(result["leading_edge"])):
                if str(r) == "nan":
                    continue
                for x in r.split(","):
                    prot2idx[pid2gene(x)] += [i]

            idxs = []
            for v in selected_leading_proteins:
                for x in prot2idx[v]:
                    idxs += [x]
            idxs = sorted(set(idxs))
            result = result.iloc[idxs]

        # df_merge = pd.read_csv(os.path.join(cache_folder, "df_merge.tsv"), sep="\t")
        df_merge_file = '/'.join((annotation_subfolder, "df_merge.tsv"))
        df_merge = hf_tsv_2_pandas_df(hf_repo=ligdis_cache_detailed_fragment_repo, data_file=df_merge_file, myHeader=0)

        type_of_task = metrics["type_of_task"]
        if type_of_task == "ranksum":

            sort_by = "NES"
            if sort_by == "NES":
                sort_by_nes = True
            else:
                sort_by_nes = False

            direction = "Up"
            if direction == "Up":
                is_up = True
            else:
                is_up = False

            df = result.copy()
            df = df.rename(columns={"Term": "term"})

            df_merge = df_merge[["term", "score_mean"]]

            df = df.merge(df_merge, how="left", on="term")

            df = df[df["leading_edge"].notnull()]

            df["edge_genes"] = [" ".join([pid2gene(x) for x in r.split(",")]) for r in list(df["leading_edge"])]

            df = df[["term", "leading_edge_size", "geneset_size", "nes", "pval", "fdr", "score_mean", "edge_genes", "leading_edge"]]

            if selected_annotations:
                df = df[df["term"].isin(selected_annotations)]

            if is_up:
                df = df[df["nes"] >= 0]
            else:
                df = df[df["nes"] < 0]
            if sort_by_nes:
                if is_up:
                    df = df.sort_values(by="nes", ascending=False)
                else:
                    df = df.sort_values(by="nes", ascending=True)
            else:
                df = df.sort_values(by="pval")

            df = df.reset_index(drop=True)

            df = df.rename(columns={
                "term": "Term",
                "leading_edge_size": "Edge size",
                "geneset_size": "Set size",
                "nes": "Score",
                "pval": "P-value",
                "fdr": "FDR",
                "score_mean": "Mean score",
                "edge_genes": "Leading edge",
            })

            st.dataframe(df[[c for c in list(df.columns)[:-1] if c != "Mean score"]].reset_index(drop=True))

            term = st.selectbox("Explore term...", df["Term"])

            if term is not None:

                # signature_ori = pd.read_csv(os.path.join(results_path, "signature.tsv"), delimiter="\t", header=None)
                ligdis_ontology_repo = '/'.join(("ligdis", annotation_subfolder))
                ontology_signature_file = '/'.join((profile_type_subfolder, profile_subfolder, task_filename.split(".tsv")[0], "signature.tsv"))
                signature_ = hf_tsv_2_pandas_df(hf_repo=ligdis_ontology_repo, data_file=ontology_signature_file, myHeader=None)

                # signature_file = os.path.abspath(os.path.join(DATA, "signatures", "proteins", profile_type_subfolder, profile_subfolder, task_filename))
                ligdis_data_repo = '/'.join(("ligdis", "data"))
                fragment_signature_file = '/'.join(("signatures/proteins/fragment", profile_subfolder, task_filename))

                # Explore term

                t_values = {}
                for r in signature_.values:
                    t_values[r[0]] = r[1]
                o_values = {}
                # signature_original = pd.read_csv(signature_file, delimiter="\t", header=None)
                signature_original = hf_tsv_2_pandas_df(hf_repo=ligdis_data_repo, data_file=fragment_signature_file, myHeader=None)

                for r in signature_original.values:
                    o_values[r[0]] = r[1]

                cols = st.columns([0.15, 1])

                col = cols[0]

                annotations_size = len(annotations_[term])
                signature_size = len(signature_)

                df_filt = df[df["Term"] == term]
                leading_edge = list(df_filt["leading_edge"])[0]
                if str(leading_edge) == "nan":
                    leading_edge = []
                else:
                    leading_edge = leading_edge.split(",")
                display_proteins = col.radio(
                    "Display proteins",
                    [
                        "Leading edge ({0})".format(len(leading_edge)),
                        "In category ({0})".format(annotations_size),
                        "Full profile ({0})".format(signature_size),
                    ],
                )
                if "Leading" in display_proteins:
                    proteins = leading_edge
                elif "category" in display_proteins:
                    proteins = annotations_[term]
                else:
                    proteins = signature_[0]
                o_values = [o_values[pid] for pid in proteins]
                t_values = [t_values[pid] for pid in proteins]

                proteins_set = set(proteins)
                if convert_to_gene:
                    genes = [pid2gene(x) for x in proteins]
                    label = "Gene Name"
                else:
                    genes = list(proteins)  # keep accessions when gene conversion is off
                    label = "UniProt AC"
                dl = pd.DataFrame(
                    {"Gene Name": genes, "UniProt AC": proteins, "Log2FC": o_values, "Z-score": t_values}
                )

                sort_by = col.radio(
                    "Sort proteins", ["By Z-score", "Alphabetically"]
                )
                if sort_by != "Alphabetically":
                    if is_up:
                        dl = dl.sort_values("Z-score", ascending=False)
                    else:
                        dl = dl.sort_values("Z-score", ascending=True)
                else:
                    dl = dl.sort_values(label)
                dl = dl.reset_index(drop=True)

                col = cols[1]
                col.dataframe(dl.reset_index(drop=True))

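    # Plots in the grid below are laid out round-robin across four columns: i cycles
    # through the columns while j counts rendered plots, stopping at the user's limit.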
    if view == "Basic plots":
        top_plots_number = columns[1].number_input("Maximum number of plots", value=12, min_value=1, max_value=50)
        plot_columns = st.columns(4)

        # with open(os.path.join(cache_folder, "basic", "idx2term.json"), "r") as f:
        #     idx2term = json.load(f)
        idx2term_json_url = '/'.join((cache_folder, "basic", "idx2term.json"))
        idx2term = load_hf_json(idx2term_json_url)

        idxs = [i for i in range(len(idx2term))]

        i = 0
        j = 0

        for idx in idxs:

            if i == len(plot_columns):
                i = 0
            col = plot_columns[i]

            if j == top_plots_number:
                break

            # col.image(os.path.join(cache_folder, "basic", "plot_{0}.png".format(idx)))

            image_url = '/'.join((cache_folder, "basic", "plot_{0}.png".format(idx)))
            col.image(image_url)  # show the image
            i += 1
            j += 1


    if view == "Advanced plots":
        top_plots_number = columns[1].number_input("Maximum number of plots", value=5, min_value=1, max_value=10)

        # with open(os.path.join(cache_folder, "advanced", "idx2term.json"), "r") as f:
        #     idx2term = json.load(f)

        idx2term_json_url = '/'.join((cache_folder, "advanced", "idx2term.json"))
        idx2term = load_hf_json(idx2term_json_url)

        idxs = [i for i in range(len(idx2term))]

        j = 0
        for idx in idxs:
            if j == top_plots_number:
                break

            # st.image(os.path.join(cache_folder, "advanced", "plot_{0}.png".format(idx)))
            image_url = '/'.join((cache_folder, "advanced", "plot_{0}.png".format(idx)))
            st.image(image_url)  # show the image
            j += 1