KashyapiNagaHarshitha committed on
Commit d16951c · verified · 1 Parent(s): 86f21b4

Upload Quality_Control.py

Files changed (1)
  1. Quality_Control.py +1688 -0
Quality_Control.py ADDED
@@ -0,0 +1,1688 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+ 
+ import warnings
+ import os
+ import json
+ import random
+ import asyncio
+ import numpy as np
+ import pandas as pd
+ import seaborn as sb
+ import plotly.express as px
+ import panel as pn
+ import holoviews as hv
+ import hvplot.pandas
+ import matplotlib.pyplot as plt
+ from bokeh.plotting import figure
+ from bokeh.io import push_notebook, show, curdoc
+ from bokeh.io.export import export_png
+ from bokeh.resources import INLINE
+ from bokeh.embed import file_html
+ from bokeh.models import Span, Label, ColumnDataSource, Button
+ from my_modules import *
+ from datasets import load_dataset
+ 
+ # Silence FutureWarnings & UserWarnings
+ warnings.filterwarnings('ignore', category=FutureWarning)
+ warnings.filterwarnings('ignore', category=UserWarning)
+ 
+ #input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
+ present_dir = os.path.dirname(os.path.realpath(__file__))
+ # Construct the full path to the stored_variables.json file
+ json_path = os.path.join(present_dir, 'stored_variables.json')
+ with open(json_path, 'r') as file:
+     stored_vars = json.load(file)
+ directory = stored_vars['base_dir']
+ input_path = os.path.join(present_dir, directory)
+ set_path = stored_vars['set_path']
+ selected_metadata_files = stored_vars['selected_metadata_files']
+ ls_samples = stored_vars['ls_samples']
+ base_dir = input_path
+ 
+ #input_path = '/Users/harshithakolipaka/Desktop/CycIF/wetransfer_data-zip_2024-05-17_1431'
+ #set_path = 'test'
+ #selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
+ #ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']
+ pn.extension()
+ 
+ update_button = pn.widgets.Button(name='CSV Files', button_type='primary')
+ def update_samples(event):
+     with open(json_path, 'r') as file:
+         stored_vars = json.load(file)
+     print(stored_vars)
+     ls_samples = stored_vars['ls_samples']
+     return f'CSV Files Selected: {ls_samples}'
+ update_button.on_click(update_samples)
+ 
+ csv_files_button = pn.widgets.Button(icon="clipboard", button_type="primary")
+ indicator = pn.indicators.LoadingSpinner(value=False, size=25)
+ 
+ def handle_click(clicks):
+     with open(json_path, 'r') as file:
+         stored_vars = json.load(file)
+     print(stored_vars)
+     #ls_samples = stored_vars['ls_samples']
+     #return f'CSV Files Selected: {ls_samples}'
+ 
+ # pn.Row(csv_files_button, pn.bind(handle_click, csv_files_button.param.clicks))
+ 
+ 
+ # ## I.2. *DIRECTORIES
+ 
+ #set_path = 'test'
+ 
+ # Set base directory
+ directorio_actual = os.getcwd()
+ print(directorio_actual)
+ 
+ ##### MAC WORKSTATION #####
+ #base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
+ ###########################
+ 
+ ##### WINDOWS WORKSTATION #####
+ #base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
+ ###############################
+ input_path = base_dir
+ 
+ ##### LOCAL WORKSTATION #####
+ #base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/'
+ base_dir = input_path
+ print(base_dir)
+ #############################
+ 
+ #set_name = 'Set_A'
+ #set_name = 'test'
+ set_name = set_path
+ 
+ project_name = set_name              # Project name
+ step_suffix = 'qc_eda'               # Current step (here, part I)
+ previous_step_suffix_long = ""       # Previous step (here, empty)
+ 
+ # Initial input data directory
+ input_data_dir = os.path.join(base_dir, project_name + "_data")
+ 
+ # QC/EDA output directories
+ # global output
+ output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
+ # images subdirectory
+ output_images_dir = os.path.join(output_data_dir, "images")
+ 
+ # Data and Metadata directories
+ # global data
+ metadata_dir = os.path.join(base_dir, project_name + "_metadata")
+ # images subdirectory
+ metadata_images_dir = os.path.join(metadata_dir, "images")
+ 
+ # Create directories if they don't already exist
+ for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
+     if not os.path.exists(d):
+         print("Creating the", d, "directory...")
+         os.makedirs(d)
+     else:
+         print("The", d, "directory already exists!")
+ 
+ os.chdir(input_data_dir)
+ with open(json_path, 'r') as file:
+     stored_vars = json.load(file)
+     # ls_samples = stored_vars['ls_samples']
+     selected_metadata_files = stored_vars['selected_metadata_files']
+ 
+ directories = [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]
+ 
+ def print_directories(directories):
+     label_path = []
+     labels = [
+         "base_dir",
+         "input_data_dir",
+         "output_data_dir",
+         "output_images_dir",
+         "metadata_dir",
+         "metadata_images_dir"
+     ]
+     for label, path in zip(labels, directories):
+         label_path.append(f"{label} : {path}")
+     return label_path
+ 
+ for line in print_directories(directories):
+     print(line)
+ 
+ # Verify paths
+ print('base_dir :', base_dir)
+ print('input_data_dir :', input_data_dir)
+ print('output_data_dir :', output_data_dir)
+ print('output_images_dir :', output_images_dir)
+ print('metadata_dir :', metadata_dir)
+ print('metadata_images_dir :', metadata_images_dir)
+ 
+ 
+ # ## I.3. FILES
+ 
+ # Listing all the .csv files in the metadata/data directory
+ # Don't forget to move the csv files into the proj_data directory;
+ # if the data dir is empty, this will not work.
+ #ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith(".csv")]
+ print("The following CSV files were detected:\n\n", ls_samples, "\n\nin", input_data_dir, "directory.")
+ 
+ def combine_and_save_metadata_files(metadata_dir, selected_metadata_files):
+     if len(selected_metadata_files) == 0:
+         warnings.warn("No Ashlar file uploaded. Please upload a valid file.", UserWarning)
+         return
+ 
+     elif len(selected_metadata_files) > 1:
+         combined_metadata_df = pd.DataFrame()
+         for file in selected_metadata_files:
+             file_path = os.path.join(metadata_dir, file)
+             df = pd.read_csv(file_path)
+             combined_metadata_df = pd.concat([combined_metadata_df, df], ignore_index=True)
+         combined_metadata_df.to_csv(os.path.join(metadata_dir, "combined_metadata.csv"), index=False)
+         print(f"Combined metadata file saved as 'combined_metadata.csv' in {metadata_dir}")
+         return combined_metadata_df
+ 
+     else:
+         # Exactly one metadata file selected
+         combined_metadata_path = os.path.join(metadata_dir, 'combined_metadata.csv')
+         if os.path.exists(combined_metadata_path):
+             print(f"Combined metadata file already exists: {combined_metadata_path}")
+             combined_metadata_df = pd.read_csv(combined_metadata_path)
+         else:
+             combined_metadata_df = pd.DataFrame()
+             for file in selected_metadata_files:
+                 file_path = os.path.join(metadata_dir, file)
+                 metadata_df = pd.read_csv(file_path)
+                 combined_metadata_df = pd.concat([combined_metadata_df, metadata_df], ignore_index=True)
+             combined_metadata_df.to_csv(combined_metadata_path, index=False)
+             print(f"Combined metadata saved to: {combined_metadata_path}")
+         return combined_metadata_df
+ 
+ print(combine_and_save_metadata_files(metadata_dir, selected_metadata_files))
+ 
+ ls_samples
+ 
+ # First gather information on expected headers using the first file in ls_samples:
+ # read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
+ path = os.path.join(input_data_dir, ls_samples[0])
+ #df = load_dataset('csv', data_files = path)
+ df = pd.read_csv(path, index_col=0, nrows=1)
+ df.head(10)
+ 
+ # Make sure the file was imported correctly
+ print("df :\n", df.head(), "\n")
+ print("df's columns :\n", df.columns, "\n")
+ print("df's index :\n", df.index, "\n")
+ print("df's index name :\n", df.index.name)
+ 
+ df.head()
+ 
+ # Verify that the ID column in the input file became the index
+ # Verify that the index name is "ID"; if not, rename it
+ if df.index.name != "ID":
+     print("Expected the first column in input file (index_col = 0) to be 'ID'. \n"
+           "This column will be used to set the index names (cell number for each sample). \n"
+           "It appears that the column '" + df.index.name + "' was actually imported as the index column.")
+     #df.index.name = 'ID'
+     print("A new index name (first column) will be given ('ID') to replace the current one '" + df.index.name + "'\n")
+ 
+ # Apply the changes to the headers as specified with the apply_header_changes() function (in my_modules.py)
+ # Apply the changes to the dataframe rows as specified with the apply_df_changes() function (in my_modules.py)
+ #df = apply_header_changes(df)
+ print(df.index)
+ df.index = df.index.str.replace(r'@1$', '', regex=True)
+ df = apply_df_changes(df)
+ 
+ # Set variable to hold default header values
+ expected_headers = df.columns.values
+ expected_header = True
+ print(expected_header)
+ 
+ intial_dataframe = df
+ # Make sure the file is now formatted correctly
+ print("\ndf :\n", df.head(), "\n")
+ print("df's columns :\n", df.columns, "\n")
+ print("df's index :\n", df.index, "\n")
+ print("df's index name :\n", df.index.name)
+ 
+ df.head()
+ 
+ print("Used " + ls_samples[0] + " to determine the expected and corrected headers for all files.\n")
+ print("These headers are: \n" + ", ".join([h for h in expected_headers]))
+ 
+ corrected_headers = True
+ 
+ for sample in ls_samples:
+     file_path = os.path.join(input_data_dir, sample)
+     print(file_path)
+ 
+ # Import all the other files
+ dfs = {}
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ errors = []
+ 
+ # Iterate over a copy of ls_samples so that removing a bad sample is safe
+ for sample in list(ls_samples):
+     file_path = os.path.join(input_data_dir, sample)
+ 
+     try:
+         # Read the CSV file
+         df = pd.read_csv(file_path, index_col=0)
+         # Check if the DataFrame is empty; if so, don't continue trying to process it and remove it
+ 
+         if not df.empty:
+             # Manipulations necessary for concatenation
+             df = apply_header_changes(df)
+             df = apply_df_changes(df)
+             # Reorder the columns to match the expected headers list
+             #df = df.reindex(columns=expected_headers)
+             print(df.head(1))
+             print(sample, "file is processed !\n")
+             #print(df)
+ 
+             # Compare df's headers against what is expected
+             compare_headers(expected_headers, df.columns.values, sample)
+             #print(df.columns.values)
+             # Add a new column to identify the csv file (sample) the df comes from
+             df['Sample_ID'] = sample
+ 
+     except pd.errors.EmptyDataError:
+         errors.append(f'\nEmpty data error in {sample} file. Removing from analysis...')
+         print(f'\nEmpty data error in {sample} file. Removing from analysis...')
+         ls_samples.remove(sample)
+         continue
+ 
+     # Add df to dfs
+     dfs[sample] = df
+ 
+ print(dfs)
+ 
+ dfs.values()
+ 
+ # Merge dfs into one df
+ df = pd.concat(dfs.values(), ignore_index=False, sort=False)
+ del dfs
+ merge = True
+ merged_dataframe = df
+ df.head()
+ 
+ # Set index to Sample_ID + cell number:
+ # create a new custom index for df based on the sample names and integer cell numbers, and then remove the temporary columns 'level_0' and 'index' that were introduced during the operations
+ 
+ # Create a copy of the DataFrame df and reset its index without creating a new column for the old index
+ # This essentially removes the old index column and replaces it with a default integer index
+ df = df.copy().reset_index(drop=True)
+ 
+ #print(df)
+ 
+ # Initialize an empty list to store the new index labels for the DataFrame
+ index = []
+ 
+ for sample in ls_samples:
+     # Extract the rows of the original df where the 'Sample_ID' column matches the current sample name
+     # This chunk is stored in the df_chunk df, which is a subset of the original data for that specific sample
+     df_chunk = df.loc[df['Sample_ID'] == sample, :].copy()
+     old_index = df_chunk.index
+     # Reset the index of the df_chunk df, removing the old index and replacing it with a default integer index
+     df_chunk = df_chunk.reset_index(drop=True)
+     # A new index is created for the df_chunk df. It combines the sample name with 'Cell_' and the integer index values, converting them to strings
+     # This new index will have labels like 'SampleName_Cell_0', 'SampleName_Cell_1', and so on.
+     sample = sample.split('.')[0]
+     df_chunk = df_chunk.set_index(f'{sample}_Cell_' + df_chunk.index.astype(str))
+     # The index values of df_chunk are then added to the index list
+     index = index + df_chunk.index.values.tolist()
+ 
+ # After processing all the samples in the loop, assign the index list as the new index of the original df.
+ df.index = index
+ # Remove the 'level_0' and 'index' columns from df
+ df = df.loc[:, ~df.columns.isin(['level_0', 'index'])]
+ assigned_new_index = True
+ df.head()
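+ 
+ # A minimal sketch (toy data, not part of the pipeline) of what the loop above
+ # produces, using the sample 'DD3S1.csv' with two hypothetical rows:
+ #
+ #     toy = pd.DataFrame({'Sample_ID': ['DD3S1.csv', 'DD3S1.csv']})
+ #     toy = toy.reset_index(drop=True)
+ #     toy.index = 'DD3S1_Cell_' + toy.index.astype(str)
+ #     # toy.index is now ['DD3S1_Cell_0', 'DD3S1_Cell_1']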
+ 
+ # ### I.3.2. NOT_INTENSITIES
+ 
+ # not_intensities is the list of the columns unrelated to the markers' fluorescence intensities.
+ # Can include items that aren't in a given header, e.g.:
+ #not_intensities = ['Nuc_X', 'Nuc_X_Inv', 'Nuc_Y', 'Nuc_Y_Inv', 'Nucleus_Roundness', 'Nucleus_Size', 'Cell_Size',
+ #                   'ROI_index', 'Sample_ID', 'replicate_ID', 'Cell_ID', 'cell_type', 'cell_subtype', 'cluster', 'ID',
+ #                   'Cytoplasm_Size', 'immune_checkpoint', 'Unique_ROI_index', 'Patient', 'Primary_chem(1)_vs_surg(0)']
+ 
+ # Get all column names
+ all_columns = df.columns.tolist()
+ 
+ # Create lists to store non-intensity and intensity column names
+ not_intensities = []
+ intensity_columns = []
+ # Iterate over each column name
+ for column in all_columns:
+     # Check if the column name contains 'Intensity_Average'
+     if 'Intensity_Average' not in column:
+         not_intensities.append(column)
+     else:
+         intensity_columns.append(column)
+ 
+ # Create a new DataFrame with non-intensity columns
+ not_intensities_df = pd.DataFrame(not_intensities)
+ print("Non-intensity columns:")
+ print(not_intensities)
+ 
+ print("Non-intensity DataFrame:")
+ print(not_intensities_df)
+ #print(len(intensity_columns))
+ 
+ path_not_intensities = os.path.join(metadata_dir, "not_intensities.csv")
+ 
+ # If this file already exists, add only the not_intensities items not already present in the file
+ if os.path.exists(path_not_intensities):
+     print("'not_intensities.csv' already exists.")
+     print("Reconciling file and Jupyter notebook lists.")
+     with open(path_not_intensities, "r") as file_not_intensities:
+         file_ni = file_not_intensities.read().splitlines()
+     # Set difference to identify items not already in the file
+     to_add = set(not_intensities) - set(file_ni)
+     # We want not_intensities to be a complete list
+     not_intensities = list(set(file_ni) | set(not_intensities))
+     with open(path_not_intensities, "a") as file_not_intensities:
+         for item in to_add:
+             file_not_intensities.write(item + "\n")
+ else:
+     # The file does not yet exist
+     print("Could not find " + path_not_intensities + ". Creating now.")
+     with open(path_not_intensities, "w") as file_not_intensities:
+         for item in not_intensities:
+             file_not_intensities.write(item + "\n")
+ 
+ not_intensities_df = pd.read_csv(path_not_intensities)
+ not_intensities_df
+ 
+ # Columns we want to keep: not_intensities, and any intensity column that contains 'Intensity_Average' (drop any intensity marker column that is not a mean intensity)
+ to_keep = not_intensities + [x for x in df.columns.values[~df.columns.isin(not_intensities)] if 'Intensity_Average' in x]
+ 
+ to_keep
+ 
+ print(len(to_keep) - 1)
+ 
+ # However, our to_keep list contains items that might not be in our df headers!
+ # These items come from our not_intensities list, so keep only those items from to_keep that are actually found in the df's headers (columns).
+ # This ensures that we only keep columns that exist in the df, avoiding any potential issues with non-existent column names.
+ # The result is a df containing only the specified columns.
+ df = df[[x for x in to_keep if x in df.columns.values]]
+ 
+ df.head()
+ 
+ # Assuming you have a DataFrame named 'df'
+ # df = pd.read_csv('your_file.csv')
+ 
+ # Load or create the stored_variables.json file
+ json_file_path = os.path.join(present_dir, "stored_variables.json")
+ 
+ if os.path.exists(json_file_path):
+     with open(json_file_path, "r") as file:
+         stored_variables = json.load(file)
+ else:
+     stored_variables = {}
+ 
+ # Get all column names
+ all_columns = df.columns.tolist()
+ 
+ # Create an empty list to store intensity markers
+ intensity_marker = []
+ 
+ # Iterate over each column name
+ for column in all_columns:
+     # Check if the column name contains 'Intensity_Average'
+     if 'Intensity_Average' in column:
+         # Split the column name by underscore and keep the word before the first underscore
+         marker = column.split('_')[0]
+         # Add the marker to the intensity_marker list
+         intensity_marker.append(marker)
+ 
+ # Remove duplicates from the intensity_marker list
+ intensity_marker = list(set(intensity_marker))
+ 
+ print("Intensity Markers:")
+ print(intensity_marker)
+ 
+ # Create a DataFrame with the intensity markers and default values
+ marker_options_df = pd.DataFrame({
+     'Marker': intensity_marker,
+     'Cell': [True] * len(intensity_marker),
+     'Cytoplasm': [False] * len(intensity_marker),
+     'Nucleus': [False] * len(intensity_marker)
+ })
+ 
+ # Define formatters for the Tabulator widget
+ tabulator_formatters = {
+     'Cell': {'type': 'tickCross'},
+     'Cytoplasm': {'type': 'tickCross'},
+     'Nucleus': {'type': 'tickCross'}
+ }
+ 
+ # Create the Tabulator widget
+ tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
+ 
+ # Create a DataFrame to store the initial intensities
+ new_data = [{'Description': f"{marker}_Cell_Intensity_Average"} for marker in intensity_marker]
+ new_data_df = pd.DataFrame(new_data)
+ 
+ # Create a widget to display the new data as a DataFrame
+ new_data_table = pn.widgets.Tabulator(new_data_df, name='New Data Table', sizing_mode='stretch_width')
+ 
+ # Create a button to start the update process
+ run_button = pn.widgets.Button(name="Save Selection", button_type='primary')
+ 
+ # Function to update stored_variables.json
+ def update_stored_variables(selected_columns):
+     stored_variables["selected_intensities"] = selected_columns
+     with open(json_file_path, "w") as file:
+         json.dump(stored_variables, file, indent=4)
+ 
+ # Define the update_intensities function
+ def update_intensities(event=None):
+     global new_data, new_data_df
+     new_data = []
+     selected_columns = []
+     for _, row in tabulator.value.iterrows():
+         marker = row['Marker']
+         if row['Cell']:
+             new_data.append({'Description': f"{marker}_Cell_Intensity_Average"})
+             selected_columns.append(f"{marker}_Cell_Intensity_Average")
+         if row['Cytoplasm']:
+             new_data.append({'Description': f"{marker}_Cytoplasm_Intensity_Average"})
+             selected_columns.append(f"{marker}_Cytoplasm_Intensity_Average")
+         if row['Nucleus']:
+             new_data.append({'Description': f"{marker}_Nucleus_Intensity_Average"})
+             selected_columns.append(f"{marker}_Nucleus_Intensity_Average")
+     new_data_df = pd.DataFrame(new_data)
+     new_data_table.value = new_data_df
+     update_stored_variables(selected_columns)
+     print("Updated intensities DataFrame:")
+     print(new_data_df)
+ 
+ # Define the runner function
+ async def runner(event):
+     update_intensities()
+ 
+ # Bind the runner function to the button
+ run_button.on_click(runner)
+ 
+ # Attach the update_intensities function to changes in the Tabulator widget
+ tabulator.param.watch(update_intensities, 'value')
+ 
+ # Layout
+ updated_intensities = pn.Column(tabulator, run_button, new_data_table, sizing_mode="stretch_width")
+ 
+ # Serve the layout
+ #updated_intensities.servable()
+ 
+ intensities_df = new_data_table
+ intensities_df = pn.pane.DataFrame(intensities_df)
+ print(intensities_df)
+ 
+ # ## I.4. QC CHECKS
+ 
+ def quality_check_results(check_index, check_shape, check_no_null, check_zero_intensities):
+     results = [
+         f"Check Index: {check_index}",
+         f"Check Shape: {check_shape}",
+         f"Check No Null: {check_no_null}",
+         f"Check Zero Intensities: {check_zero_intensities}"
+     ]
+     return pn.Column(*[pn.Row(result) for result in results], sizing_mode="stretch_width")
+ 
+ print(ls_samples)
+ 
+ def check_index_format(index_str, ls_samples):
+     """
+     Checks if the given index string follows the specified format.
+ 
+     Args:
+         index_str (str): The index string to be checked.
+         ls_samples (list): A list of valid sample names.
+ 
+     Returns:
+         bool: True if the index string follows the format, False otherwise.
+     """
+     # Split the index string into parts
+     parts = index_str.split('_')
+ 
+     # Check if there are exactly 3 parts
+     if len(parts) != 3:
+         print(len(parts))
+         return False
+ 
+     # Check if the first part is in ls_samples
+     sample_name = parts[0]
+     if f'{sample_name}.csv' not in ls_samples:
+         print(sample_name)
+         return False
+ 
+     # Check if the second part is one of ['Cell', 'Cytoplasm', 'Nucleus']
+     location = parts[1]
+     valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
+     if location not in valid_locations:
+         print(location)
+         return False
+ 
+     # Check if the third part is a number
+     try:
+         index = int(parts[2])
+     except ValueError:
+         print(parts[2])
+         return False
+ 
+     # If all checks pass, return True
+     return True
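+ 
+ # A quick illustration (hypothetical calls, not part of the pipeline): with
+ # ls_samples == ['DD3S1.csv'], an index built by the earlier loop passes,
+ # while a malformed one fails:
+ #
+ #     check_index_format('DD3S1_Cell_0', ['DD3S1.csv'])   # True
+ #     check_index_format('DD3S1_cell_0', ['DD3S1.csv'])   # False ('cell' is not a valid location)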
+ 
+ # Let's take a look at a few features to make sure our dataframe is as expected
+ df.index
+ def check_format_ofindex(index):
+     for idx in index:
+         check_index = check_index_format(idx, ls_samples)
+         if check_index is False:
+             index_format = "Bad"
+             return index_format
+     index_format = "Good"
+     return index_format
+ print(check_format_ofindex(df.index))
+ 
+ df.shape
+ check_index = df.index
+ check_shape = df.shape
+ print(check_shape)
+ 
+ # Check for NaN entries (should not be any unless columns do not align)
+ # False means no NaN entries
+ # True means NaN entries
+ df.isnull().any().any()
+ 
+ check_no_null = df.isnull().any().any()
+ 
+ # Check that all expected files were imported into the final dataframe
+ if sorted(df.Sample_ID.unique()) == sorted(ls_samples):
+     print("All expected filenames are present in big df Sample_ID column.")
+     check_all_expected_files_present = "All expected filenames are present in big df Sample_ID column."
+ else:
+     compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")
+     check_all_expected_files_present = compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")
+ 
+ print(df.Sample_ID)
+ 
+ # Delete rows that have 0-value mean intensities for intensity columns
+ print("df.shape before removing 0 mean values: ", df.shape)
+ 
+ # Take the row-wise mean over every column not listed in not_intensities
+ # (i.e., only the intensity columns are considered for the mean intensity calculation).
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ # Calculate mean intensity excluding 'not_intensities' columns
+ mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+ 
+ # Check if there are any 0 mean intensity values
+ if (mean_intensity == 0).any():
+     df = df.loc[mean_intensity > 0, :]
+     print("Shape after removing 0 mean values: ", df.shape)
+     check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
+ else:
+     print("No zero intensity values.")
+     check_zero_intensities = "No zero intensity values found in the DataFrame."
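+ 
+ # A minimal sketch (toy data, not part of the pipeline) of the zero-mean filter above:
+ #
+ #     toy = pd.DataFrame({'A_Cell_Intensity_Average': [0.0, 5.0],
+ #                         'Sample_ID': ['s1', 's1']})
+ #     m = toy.loc[:, ~toy.columns.isin(['Sample_ID'])].mean(axis=1)
+ #     toy = toy.loc[m > 0, :]   # drops the first row, whose mean intensity is 0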
+ 
+ # Get quantiles (5th, 50th, 95th)
+ # List of nucleus size percentiles to extract
+ #qs = [0.05, 0.50, 0.95]
+ #df["Nucleus_Size"].quantile(q=qs)
+ 
+ quality_control_df = df
+ quality_control_df.head()
+ 
+ # Function to perform quality checks
+ def perform_quality_checks(df, ls_samples, not_intensities):
+     results = {}
+     # Check index
+     results['index'] = df.index
+ 
+     # Check shape
+     results['shape'] = df.shape
+ 
+     # Check for NaN entries
+     results['nan_entries'] = df.isnull().any().any()
+ 
+     # Remove rows with 0 mean intensity values
+     mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+     if (mean_intensity == 0).any():
+         df = df.loc[mean_intensity > 0, :]
+         results['zero_intensity_removal'] = f"Zero intensity entries were found and removed. Shape after removing: {df.shape}"
+     else:
+         results['zero_intensity_removal'] = "No zero intensity values found in the DataFrame."
+ 
+     return results
+ 
+ # Example usage of the function
+ quality_check_outcome = perform_quality_checks(df, ls_samples, not_intensities)
+ 
+ # Print results
+ for key, value in quality_check_outcome.items():
+     print(f"{key}: {value}")
+ 
+ def quality_check(file, not_intensities):
+     # Load the output file
+     df = file
+ 
+     # Check Index
+     check_index = check_format_ofindex(df.index)
+ 
+     # Check Shape
+     check_shape = df.shape
+ 
+     # Check for NaN entries
+     check_no_null = df.isnull().any().any()
+ 
+     mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+     if (mean_intensity == 0).any():
+         df = df.loc[mean_intensity > 0, :]
+         print("df.shape after removing 0 mean values: ", df.shape)
+         check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
+     else:
+         print("No zero intensity values found in the DataFrame.")
+         check_zero_intensities = "No zero intensities."
+ 
+     # Create a quality check results table
+     quality_check_results_table = pd.DataFrame({
+         'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
+         'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
+     })
+ 
+     # Create a quality check results component
+     quality_check_results_component = pn.Card(
+         pn.pane.DataFrame(quality_check_results_table),
+         title="Quality Control Results",
+         header_background="#2196f3",
+         header_color="white",
+     )
+ 
+     return quality_check_results_component
+ 
+ quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+ 
+ # Function to calculate quantile values
+ def calculate_quantiles(quantile):
+     quantile_value_intensity = df["AF555_Cell_Intensity_Average"].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantile_value_intensity
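+ 
+ # For reference (toy numbers, not pipeline output): Series.quantile with a list
+ # of probabilities returns a Series indexed by the requested quantiles, e.g.
+ #
+ #     pd.Series([1, 2, 3, 4]).quantile(q=[0.05, 0.50, 0.95])
+ #     # 0.05    1.15
+ #     # 0.50    2.50
+ #     # 0.95    3.85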
+ 
+ # Function to create the Panel app
+ def create_app(quantile):
+     quantiles = calculate_quantiles(quantile)
+     output = pd.DataFrame(quantiles)
+ 
+     # Create a DataFrame pane to display the output
+     output_widget = pn.pane.DataFrame(output)
+ 
+     return output_widget
+ 
+ # Bind the create_app function to the quantile slider
+ quantile_output_app = pn.bind(create_app, quantile_slider.param.value)
+ #pn.Column(quantile_slider, quantile_output_app).servable()
+ 
+ # Function to create the line graph plot using Bokeh
+ def create_line_graph2(quantile):
+     # Calculate histogram
+     hist, edges = np.histogram(df['Nucleus_Size'], bins=30)
+ 
+     # Calculate the midpoints of bins for plotting
+     midpoints = (edges[:-1] + edges[1:]) / 2
+ 
+     # Calculate quantiles
+     qs = [quantile, 0.50, 1.00 - quantile]
+     quantiles = df['Nucleus_Size'].quantile(q=qs).values
+ 
+     # Create Bokeh line graph plot
+     p = figure(title='Frequency vs. Nucleus_Size',
+                x_axis_label='Nucleus_Size',
+                y_axis_label='Frequency',
+                width=800, height=400)
+ 
+     # Plot the histogram
+     p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
+            fill_color='skyblue', line_color='black', alpha=0.6)
+ 
+     # Plot the line graph
+     p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)
+ 
+     # Add quantile lines
+     for q in quantiles:
+         span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
+         p.add_layout(span)
+         p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))
+ 
+     return p
+ 
+ # Bind the create_line_graph function to the quantile slider
+ nucleus_size_line_graph_with_histogram = pn.bind(create_line_graph2, quantile=quantile_slider.param.value)
+ 
+ # Clean the 'Nucleus_Size' column by removing NaN and infinite values
+ df = df[np.isfinite(df['Nucleus_Size'])]  # This will keep only finite values
+ 
+ # Check if the DataFrame is not empty after cleaning
+ if df.empty:
+     raise ValueError("No valid data available after cleaning.")
+ else:
+     # Calculate the histogram
+     hist, edges = np.histogram(df['Nucleus_Size'], bins=30)
+     print("Histogram calculated successfully.")
+     print("Histogram:", hist)
+     print("Edges:", edges)
+     plot1 = pn.Column(quantile_slider, pn.pane.Bokeh(nucleus_size_line_graph_with_histogram))
+ 
+ # Removing cells based on nucleus size
+ quantile = quantile_slider.value
+ qs = [quantile, 0.50, 1.00 - quantile]
+ quantiles = df['Nucleus_Size'].quantile(q=qs).values
+ threshold = quantiles[2]
+ 
+ print(threshold)
+ 
+ # Function to update the threshold and display the number of cells removed
+ def update_threshold_and_display(quantile):
+     qs = [quantile, 0.50, 1.00 - quantile]
+     quantiles = df['Nucleus_Size'].quantile(q=qs).values
+     threshold = quantiles[2]
+ 
+     # Filter the DataFrame based on the new threshold
+     # (42 is the fixed lower nucleus-size cutoff used throughout this step)
+     df_filtered = df.loc[(df['Nucleus_Size'] > 42) & (df['Nucleus_Size'] < threshold)]
+ 
+     # Calculate the number of cells removed
+     cells_before_filter = df.shape[0]
+     cells_after_filter = df_filtered.shape[0]
+     cells_removed = cells_before_filter - cells_after_filter
+ 
+     # Display the results
+     results = pn.Column(
+         f"Number of cells before filtering: {cells_before_filter}",
+         f"Number of cells after filtering on nucleus size: {cells_after_filter}",
+         f"Number of cells removed: {cells_removed}"
+     )
+ 
+     return results
+ 
+ # Bind the update function to the quantile slider
+ results_display = pn.bind(update_threshold_and_display, quantile_slider)
+ 
+ # Layout the components in a Panel app
+ layout2 = results_display
+ 
+ print("Number of cells before filtering :", df.shape[0])
+ cells_before_filter = f"Number of cells before filtering :{df.shape[0]}"
+ # Delete small cells and objects w/ high AF555 signal (RBCs)
+ # We usually use the 95th percentile calculated during QC_EDA
+ df = df.loc[(df['Nucleus_Size'] > 42)]
+ df = df.loc[(df['Nucleus_Size'] < threshold)]
+ cells_after_filter_nucleus_shape = df.shape[0]
+ print("Number of cells after filtering on nucleus size:", df.shape[0])
+ 
+ df = df.loc[(df['AF555_Cell_Intensity_Average'] < 2000)]
+ print("Number of cells after filtering on AF555 intensity:", df.shape[0])
+ cells_after_filter_intensity_shape = df.shape[0]
+ cells_after_filter_nucleus = f"Number of cells after filtering on nucleus size: {cells_after_filter_nucleus_shape}"
+ cells_after_filter_intensity = f"Number of cells after filtering on AF555 intensity: {cells_after_filter_intensity_shape}"
+ 
+ num_of_cell_removal_intensity = cells_after_filter_intensity
+ 
+ print(num_of_cell_removal_intensity)
+ 
+ num_of_cell_removal = pn.Column(cells_before_filter, cells_after_filter_nucleus)
+ 
+ # Using the DataFrame 'df' with the intensity columns
+ intensities = df.filter(like='Intensity').columns.tolist()
+ 
+ # Create a ColumnDataSource from the DataFrame
+ source = ColumnDataSource(df)
+ 
+ # Function to calculate quantile values for a given column
+ def calculate_quantiles(column, quantile):
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantiles
+ 
+ # Create the dropdown menu
+ column_dropdown = pn.widgets.Select(name='Select Column', options=intensities)
+ 
+ quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+ 
+ # Function to create the Bokeh plot
+ def create_intensity_plot(column, quantile):
+     quantiles = calculate_quantiles(column, quantile)
+     hist, edges = np.histogram(df[column], bins=30)
+     # Calculate the midpoints of bins for plotting
+     midpoints = (edges[:-1] + edges[1:]) / 2
+ 
+     # Create Bokeh plot
+     p = figure(title=f'Distribution of {column} with Quantiles',
+                x_axis_label=f'{column} Values',
+                y_axis_label='Frequency',
+                width=800, height=400)
+ 
+     p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
+            fill_color='skyblue', line_color='black', alpha=0.7)
+ 
+     # Plot the line graph
+     p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)
+ 
+     # Add quantile lines
+     for q in quantiles:
+         span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
+         p.add_layout(span)
+         p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))
+ 
+     return p
+ 
+ # Bind the create_intensity_plot function to the column dropdown and quantile slider
+ marker_intensity_with_histogram = pn.bind(create_intensity_plot, column_dropdown.param.value, quantile_slider.param.value)
+ 
+ # Create the button
+ generate_plot_button = Button(label='Generate Plot', button_type='primary')
+ 
+ def update_plot(column, quantile):
+     plot = create_intensity_plot(column, quantile)
+     plot.renderers[0].data_source = source  # Update the data source for the renderer
+     return plot
+ 
+ # Display the dropdown menu, quantile slider, button, and plot
+ #plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)
+ 
+ def generate_plot(event):
+     updated_plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)
+     #pn.Column(pn.Row(column_dropdown, generate_plot_button), quantile_slider, updated_plot).servable()
+ 
+ generate_plot_button.on_click(generate_plot)
+ selected_marker_plot = pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram)))
+ #pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram), generate_plot_button)).servable()
+ 
+ # Bind the create_line_graph function to the quantile slider
+ #nucleus_size_line_graph = pn.bind(create_line_graph, quantile=quantile_slider.param.value)
+ 
+ # Layout the components in a Panel app
+ #nucleus_size_graph = pn.Column(nucleus_size_line_graph)
+ 
+ len(intensities)
+ 
+ df
+ 
+ def calculate_cytoplasm_quantiles(column, quantile):
+     # Print the columns of the DataFrame
+     print("DataFrame columns:", df.columns)
+ 
+     # Check if the column exists in the DataFrame
+     if column not in df.columns:
+         raise KeyError(f"Column '{column}' does not exist in the DataFrame.")
+ 
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantiles
+ 
+ def create_cytoplasm_intensity_df(column, quantile):
+     quantiles = calculate_cytoplasm_quantiles(column, quantile)
+     output = pd.DataFrame(quantiles)
+     # Create a DataFrame pane to display the output
+     output_widget = pn.pane.DataFrame(output)
+     return output_widget
+ 
+ # Bind the create_cytoplasm_intensity_df function to the quantile slider
+ cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column=df.columns[10], quantile=quantile_slider.param.value)
+ 
+ pn.Column(quantile_slider, cytoplasm_quantile_output_app)
+ 
+ # ## I.5. COLUMNS OF INTERESTS
+ 
+ # Remove columns containing "DAPI"
+ df = df[[x for x in df.columns.values if 'DAPI' not in x]]
+ 
+ print("Columns are now...")
+ print([c for c in df.columns.values])
+ 
+ # Create lists of full names and shortened names to use in plotting
+ full_to_short_names, short_to_full_names = \
+     shorten_feature_names(df.columns.values[~df.columns.isin(not_intensities)])
+ 
+ short_to_full_names
+ 
+ # Save this data to a metadata file
+ filename = os.path.join(metadata_dir, "full_to_short_column_names.csv")
+ with open(filename, "w") as fh:
+     fh.write("full_name,short_name\n")
+     for k, v in full_to_short_names.items():
+         fh.write(k + "," + v + "\n")
+ print("The full_to_short_column_names.csv file was created !")
+ 
+ # Save this data to a metadata file
+ filename = os.path.join(metadata_dir, "short_to_full_column_names.csv")
+ with open(filename, "w") as fh:
+     fh.write("short_name,full_name\n")
+     for k, v in short_to_full_names.items():
+         fh.write(k + "," + v + "\n")
+ print("The short_to_full_column_names.csv file was created !")
+ 
+ # ## I.6. EXPOSURE TIME
+ 
+ # Import the Ashlar analysis file
+ file_path = os.path.join(metadata_dir, 'combined_metadata.csv')
+ ashlar_analysis = pd.read_csv(file_path)
+ ashlar_analysis
+ 
+ # Extract and rename columns
+ new_df = ashlar_analysis[['Name', 'Cycle', 'ChannelIndex', 'ExposureTime']].copy()
+ new_df.rename(columns={
+     'Name': 'Target',
+     'Cycle': 'Round',
+     'ChannelIndex': 'Channel'
+ }, inplace=True)
+ 
+ # Apply prefixes to the round and channel values
+ new_df['Round'] = 'R' + new_df['Round'].astype(str)
+ new_df['Channel'] = 'c' + new_df['Channel'].astype(str)
+ 
+ # Save to CSV
+ new_df.to_csv('Ashlar_Exposure_Time.csv', index=False)
+ 
+ # Print the new dataframe
+ print(new_df)
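+ 
+ # With a hypothetical Ashlar metadata row (Name='CD45', Cycle=1,
+ # ChannelIndex=2, ExposureTime=50), the renaming and prefixing above yields:
+ #     Target='CD45', Round='R1', Channel='c2', ExposureTime=50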
+ 
+ # Here, we want to end up with a data structure that incorporates metadata on each intensity marker column used in our big dataframe in an easy-to-use format.
+ # This is going to include the full name of the intensity marker columns in the big data frame,
+ # the corresponding round and channel,
+ # the target protein (e.g., CD45),
+ # and the segmentation localization information (cell, cytoplasm, nucleus).
+ 
+ # We can use this data structure to assign unique colors to all channels and rounds, for example, for use in later visualizations.
+ # Exposure_Time file from the ASHLAR analysis
+ filename = "Exposure_Time.csv"
+ filename = os.path.join(metadata_dir, filename)
+ exp_df = pd.read_csv(filename)
+ 
+ print(exp_df)
+ 
+ # Verify the file imported correctly
+ # File length
+ print("df's shape: ", exp_df.shape)
+ # Headers
+ expected_headers = ['Round', 'Target', 'Exp', 'Channel']
+ compare_headers(expected_headers, exp_df.columns.values, "Imported metadata file")
+ 
+ # Missingness
+ if exp_df.isnull().any().any():
+     print("\nexp_df has null value(s) in row(s):")
+     print(exp_df[exp_df.isna().any(axis=1)])
+ else:
+     print("\nNo null values detected.")
+ 
+ if len(exp_df['Target']) > len(exp_df['Target'].unique()):
+     print("One or more non-unique Target values in exp_df. Currently not supported.")
+     exp_df = exp_df.drop_duplicates(subset='Target').reset_index(drop=True)
+ 
+ # Sort exp_df by the values in the 'Target' column in ascending order, then retrieve the first few rows of the sorted df
+ exp_df.sort_values(by=['Target']).head()
+ 
+ # Create lowercase version of target
+ exp_df['target_lower'] = exp_df['Target'].str.lower()
+ exp_df.head()
+ 
+ # Create df that contains the marker intensity columns in our df that aren't in not_intensities
+ intensities = pd.DataFrame({'full_column': df.columns.values[~df.columns.isin(not_intensities)]})
+ 
+ intensities
+ 
+ # Extract the marker information from the `full_column`, which corresponds to the full column name in the big dataframe.
+ # The regex below captures the leading run of word characters other than underscore
+ # (i.e., everything before the first underscore), which is the marker name.
+ intensities['marker'] = intensities['full_column'].str.extract(r'([^\W_]+)')
+ # convert to lowercase
+ intensities['marker_lower'] = intensities['marker'].str.lower()
+ 
+ intensities
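+ 
+ # For instance (hypothetical column name), the extraction above maps
+ # 'CD45_Cell_Intensity_Average' -> marker 'CD45' -> marker_lower 'cd45',
+ # which is what target_lower is matched against in the merge below.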
1293
+ # Subset the intensities df to exclude any column pertaining to DAPI
1294
+ intensities = intensities.loc[intensities['marker_lower'] != 'dapi']
1295
+
1296
+ intensities.head()
1297
+ # Merge the intensities andexp_df together to create metadata
1298
+ metadata = pd.merge(exp_df, intensities, how = 'left', left_on = 'target_lower',right_on = 'marker_lower')
1299
+ metadata = metadata.drop(columns = ['marker_lower'])
1300
+ metadata = metadata.dropna()
1301
+
1302
+ # Target is the capitalization from the Exposure_Time.csv
1303
+ # target_lower is Target in small caps
1304
+ # marker is the extracted first component of the full column in segmentation data, with corresponding capitalization
1305
+ metadata
1306
+ # Add a column to signify marker target localisation.
1307
+ # Use a lambda to determine segmented location of intensity marker column and update metadata accordingly
1308
+ # Using the add_metadata_location() function in my_modules.py
1309
+ metadata['localisation'] = metadata.apply(
1310
+ lambda row: add_metadata_location(row), axis = 1)
1311
+
1312
+ mlid = metadata
1313
+
1314
+ # Save this data structure to the metadata folder
1315
+ # don't want to add color in because that's better off treating color the same for round, channel, and sample
1316
+ filename = "marker_intensity_metadata.csv"
1317
+ filename = os.path.join(metadata_dir, filename)
1318
+ metadata.to_csv(filename, index = False)
1319
+ print("The marker_intensity_metadata.csv file was created !")
1320
+
1321
+
1322
+
+
+ # ## I.7. COLORS WORKFLOW
+
+ # ### I.7.1. CHANNELS COLORS
+
+
+ # We want colors that are categorical, since Channel is a non-ordered category (yes, the channels are numbered, but arbitrarily).
+ # A categorical color palette will have dissimilar colors.
+ # Get those unique colors
+ if len(metadata.Channel.unique()) > 10:
+     print("WARNING: There are more unique channel values than \
+ there are colors to choose from. Select a different palette, e.g., \
+ continuous palette 'husl'.")
+ channel_color_values = sb.color_palette("bright", n_colors = len(metadata.Channel.unique()))
+ # chose 'bright' because it is categorical and we're unlikely to have > 10 channels
+
+
+ # You can customize the colors for each channel here
+ custom_colors = {
+     'c2': 'lightgreen',
+     'c3': 'tomato',
+     'c4': 'pink',
+     'c5': 'turquoise'
+ }
+
+ # sb.palplot() draws the palette and returns None, so there is no value to store
+ sb.palplot(sb.color_palette([custom_colors.get(ch, 'blue') for ch in metadata.Channel.unique()]))
+
+ # Display those unique custom colors
+ print("Unique channels are:", metadata.Channel.unique())
+ sb.palplot(sb.color_palette(channel_color_values))
1351
+
1352
+ # Function to create a palette plot with custom colors
1353
+ def create_palette_plot():
1354
+ # Get unique channels
1355
+ unique_channels = metadata.Channel.unique()
1356
+
1357
+ # Define custom colors for each channel
1358
+ custom_colors = {
1359
+ 'c2': 'lightgreen',
1360
+ 'c3': 'tomato',
1361
+ 'c4': 'pink',
1362
+ 'c5': 'turquoise'
1363
+ }
1364
+
1365
+ # Get custom colors for each channel
1366
+ colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]
1367
+
1368
+ # Create a palette plot (palplot)
1369
+ palette_plot = sb.palplot(sb.color_palette(colors))
1370
+ channel_color_values = sb.color_palette("bright",n_colors = len(metadata.Channel.unique()))
1371
+ channel_color_values = sb.palplot(channel_color_values)
1372
+ return palette_plot, channel_color_values
1373
+
1374
+
1375
+ # Create the palette plot directly
1376
+ palette_plot = create_palette_plot()
1377
+
1378
+ # Define the Panel app layout
1379
+ app_palette_plot = pn.Column(
1380
+ pn.pane.Markdown("### Custom Color Palette"),
1381
+ palette_plot,
1382
+ )
+
+ # Parameterized version: supersedes the no-argument definition above and
+ # builds the palette plot from the supplied color mapping
+ def create_palette_plot(custom_colors):
+     # Get unique channels
+     unique_channels = metadata.Channel.unique()
+
+     # Get custom colors for each channel, falling back to blue
+     colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]
+
+     # Create a palette plot (palplot) and return the drawn figure
+     sb.palplot(sb.color_palette(colors))
+     return plt.gcf()
+
+ # Display those unique custom colors
+ # (custom_colors was defined above for each channel)
+ print("Unique channels are:", metadata.Channel.unique())
+ # Bind create_palette_plot to the custom colors
+ app_palette_plot = create_palette_plot(custom_colors)
+
+
+ #app_palette_plot.servable()
+
+
+ # Store in a dictionary
+ channel_color_dict = dict(zip(metadata.Channel.unique(), channel_color_values))
+ channel_color_dict
+ # Cast each RGB component to float64, element-wise
+ # (passing a whole tuple to np.float64 relies on deprecated NumPy behavior)
+ for k, v in channel_color_dict.items():
+     channel_color_dict[k] = tuple(np.float64(x) for x in v)
+
+ channel_color_dict
+
+ color_df_channel = color_dict_to_df(channel_color_dict, "Channel")
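+ # color_dict_to_df() is defined in my_modules.py; as an assumption (not the
+ # module's actual code), it flattens a {key: (R, G, B)} mapping into a
+ # dataframe with one row per key, roughly:
+ # pd.DataFrame([{'Channel': k, 'color': v} for k, v in channel_color_dict.items()])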
+
+ # Save to file in the metadata directory
+ filename = "channel_color_data.csv"
+ filename = os.path.join(metadata_dir, filename)
+ color_df_channel.to_csv(filename, index = False)
+
+ color_df_channel
+
+ # Legend of channel info only
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
+ g.axis('off')
+ handles = []
+ for item in channel_color_dict.keys():
+     h = g.bar(0, 0, color = channel_color_dict[item],
+               label = item, linewidth = 0)
+     handles.append(h)
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Channel')
+ # bbox_to_anchor=(10,10),
+ # bbox_transform=plt.gcf().transFigure)
+
+ filename = "Channel_legend.png"
+ filename = os.path.join(metadata_images_dir, filename)
+ plt.savefig(filename, bbox_inches = 'tight')
+
+
+ # ### I.7.2. ROUNDS COLORS
+
+
+ # We want colors that are sequential, since Round is an ordered category.
+ # We can still generate colors that are easy to distinguish. Also, many of the categorical palettes cap at about 10 unique colors and repeat from there.
+ # We do not want any repeats!
+ round_color_values = sb.cubehelix_palette(
+     len(metadata.Round.unique()), start=1, rot=-0.75, dark=0.19, light=.85, reverse=True)
+ # round_color_values = sb.color_palette("cubehelix",n_colors = len(metadata.Round.unique()))
+ # chose 'cubehelix' because it is sequential, and round is a continuous process
+ # each color value is a tuple of three values: (R, G, B)
+ print(metadata.Round.unique())
+
+ sb.palplot(sb.color_palette(round_color_values))
+
+ # cubehelix parameters: start sets the starting hue, rot the amount of rotation
+ # through hue space, dark/light the lightness of the darkest and lightest colors,
+ # and reverse=True makes the palette run from dark to light
+
+ # Store in a dictionary
+ round_color_dict = dict(zip(metadata.Round.unique(), round_color_values))
+
+ # Cast each RGB component to float64, element-wise
+ for k, v in round_color_dict.items():
+     round_color_dict[k] = tuple(np.float64(x) for x in v)
+
+ round_color_dict
+
+ color_df_round = color_dict_to_df(round_color_dict, "Round")
+
+ # Save to file in the metadata directory
+ filename = "round_color_data.csv"
+ filename = os.path.join(metadata_dir, filename)
+ color_df_round.to_csv(filename, index = False)
+
+ color_df_round
+
+ # Legend of round info only
+
+ round_legend = plt.figure(figsize = (1,1)).add_subplot(111)
+ round_legend.axis('off')
+ handles = []
+ for item in round_color_dict.keys():
+     h = round_legend.bar(0, 0, color = round_color_dict[item],
+                          label = item, linewidth = 0)
+     handles.append(h)
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Round')
+ # bbox_to_anchor=(10,10),
+ # bbox_transform=plt.gcf().transFigure)
+
+ filename = "Round_legend.png"
+ filename = os.path.join(metadata_images_dir, filename)
+ plt.savefig(filename, bbox_inches = 'tight')
+
+
+ # ### I.7.3. SAMPLES COLORS
+
+ # We want colors that are neither sequential nor categorical.
+ # Categorical would be ideal if we could generate an arbitrary number of colors, but I do not think that we can.
+ # Hence, we will choose `n` colors from a continuous palette. First we will generate the right number of colors. Later, we will assign TMA samples to gray.
+
+ # Get those unique colors ('hls' is an alternative palette)
+ color_values = sb.color_palette("husl", n_colors = len(ls_samples))
+ # each color value is a tuple of three values: (R, G, B)
+
+ # Display those unique colors
+ sb.palplot(sb.color_palette(color_values))
+
+ TMA_samples = [s for s in df.Sample_ID.unique() if 'TMA' in s]
+ TMA_color_values = sb.color_palette(n_colors = len(TMA_samples), palette = "gray")
+ sb.palplot(sb.color_palette(TMA_color_values))
+
+ # Store in a dictionary
+ color_dict = dict(zip(df.Sample_ID.unique(), color_values))
+
+ # Replace all TMA samples' colors with gray
+ i = 0
+ for key in color_dict.keys():
+     if 'TMA' in key:
+         color_dict[key] = TMA_color_values[i]
+         i += 1
+
+ color_dict
+
+ color_df_sample = color_dict_to_df(color_dict, "Sample_ID")
+
+ # Save to file in the metadata directory
+ filename = "sample_color_data.csv"
+ filename = os.path.join(metadata_dir, filename)
+ color_df_sample.to_csv(filename, index = False)
+
+ color_df_sample
+
+
+ # Legend of sample info only
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
+ g.axis('off')
+ handles = []
+ for item in color_dict.keys():
+     h = g.bar(0, 0, color = color_dict[item],
+               label = item, linewidth = 0)
+     handles.append(h)
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Sample')
+
+ filename = "Sample_legend.png"
+ filename = os.path.join(metadata_images_dir, filename)
+ plt.savefig(filename, bbox_inches = 'tight')
+
+
+ # ### I.7.4. CLUSTERS COLORS
+
+ '''if 'cluster' in df.columns:
+     cluster_color_values = sb.color_palette("hls", n_colors = len(df.cluster.unique()))
+
+     #print(sorted(test_df.cluster.unique()))
+     # Display those unique colors
+     sb.palplot(sb.color_palette(cluster_color_values))
+
+     cluster_color_dict = dict(zip(sorted(test_df.cluster.unique()), cluster_color_values))
+     print(cluster_color_dict)
+
+     # Create dataframe
+     cluster_color_df = color_dict_to_df(cluster_color_dict, "cluster")
+     cluster_color_df.head()
+
+     # Save to file in the metadata directory
+     filename = "cluster_color_data.csv"
+     filename = os.path.join(metadata_dir, filename)
+     cluster_color_df.to_csv(filename, index = False)
+
+
+ # Legend of cluster info only
+
+ if 'cluster' in df.columns:
+     g = plt.figure(figsize = (1,1)).add_subplot(111)
+     g.axis('off')
+     handles = []
+     for item in sorted(cluster_color_dict.keys()):
+         h = g.bar(0, 0, color = cluster_color_dict[item],
+                   label = item, linewidth = 0)
+         handles.append(h)
+     first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cluster')
+
+     filename = "Clustertype_legend.png"
+     filename = os.path.join(metadata_images_dir, filename)
+     plt.savefig(filename, bbox_inches = 'tight')'''
+
+ mlid.head()
+
+
+ metadata
+
+
+
+ import io
+ import panel as pn
+ pn.extension()
+
+ file_input = pn.widgets.FileInput()
+
+ file_input
+
+
+ def transform_data(variable, window, sigma):
+     """Calculates the rolling average and identifies outliers"""
+     avg = metadata[variable].rolling(window=window).mean()
+     residual = metadata[variable] - avg
+     std = residual.rolling(window=window).std()
+     outliers = np.abs(residual) > std * sigma
+     return avg, avg[outliers]
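+ # Outlier rule illustrated: with window=30 and sigma=10, a value x is flagged
+ # when |x - rolling_mean| > 10 * rolling_std over the same 30-point window.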
+
+
+ def get_plot(variable="Exp", window=30, sigma=10):
+     """Plots the rolling average and the outliers"""
+     avg, highlight = transform_data(variable, window, sigma)
+     return avg.hvplot(
+         height=300, legend=False,
+     ) * highlight.hvplot.scatter(padding=0.1, legend=False)
+
+
+ variable_widget = pn.widgets.Select(name="Target", value="Exp", options=list(metadata.columns))
+ window_widget = pn.widgets.IntSlider(name="window", value=30, start=1, end=60)
+ sigma_widget = pn.widgets.IntSlider(name="sigma", value=10, start=0, end=20)
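+ # The three widgets above are not wired to get_plot anywhere in this script.
+ # A minimal sketch of how they could drive it interactively with Panel's
+ # standard pn.bind API (note the hvplot accessor used in get_plot requires an
+ # `import hvplot.pandas` earlier in the pipeline):
+ # bound_plot = pn.bind(get_plot, variable=variable_widget, window=window_widget, sigma=sigma_widget)
+ # pn.Column(variable_widget, window_widget, sigma_widget, bound_plot)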
+
+ # Function to save files
+ def save_files(event):
+     for sample in ls_samples:
+         sample_id = sample.split('.csv')[0]
+         filename = os.path.join(output_data_dir, sample_id + "_" + step_suffix + ".csv")
+
+         df_save = df.loc[df['Sample_ID'] == sample, :]
+         if os.path.exists(filename):
+             df_save.to_csv(filename, index=True, index_label='ID', mode='w')  # Overwrite the existing file
+             print(f"File {filename} was overwritten!")
+         else:
+             df_save.to_csv(filename, index=True, index_label='ID')  # Save normally if the file doesn't exist
+             print(f"File {filename} was created and saved!")
+
+ # Button to download files
+ download_button = pn.widgets.Button(name='Download Files', button_type='primary')
+ download_button.on_click(save_files)
+
+ app = pn.template.GoldenTemplate(
+     site="Cyc-IF",
+     title="Quality Control",
+     main=[
+         pn.Tabs(
+             ("Dataframes", pn.Column(
+                 pn.Row(csv_files_button, pn.bind(handle_click, csv_files_button.param.clicks)),
+                 pn.pane.Markdown("### The Dataframe uploaded:"), pn.pane.DataFrame(intial_dataframe),
+                 #pn.pane.Markdown("### The Exposure time DataFrame is :"), pn.pane.DataFrame(exp_df.head()),
+                 pn.pane.Markdown("### The DataFrame after merging CycIF data x metadata :"), pn.pane.DataFrame(merged_dataframe.head(25)),
+             )),
+             ("Quality Control", pn.Column(
+                 quality_check(quality_control_df, not_intensities)
+                 #pn.pane.Markdown("### The Quality check results are:"), quality_check_results(check_shape, check_no_null, check_all_expected_files_present, check_zero_intensities)
+             )),
+             ("Intensities", pn.Column(
+                 pn.pane.Markdown("### The Not Intensities DataFrame after processing is :"), pn.pane.DataFrame(not_intensities_df, height=250),
+                 pn.pane.Markdown("### Select Intensities to be included"), updated_intensities,
+                 #pn.pane.Markdown("### The Intensities DataFrame"), intensities_df,
+                 #pn.pane.Markdown("### The metadata obtained that specifies the localisation:"), pn.pane.DataFrame(mlid.head())
+             )),
+             ("Plots", pn.Column(
+                 #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(nucleus_size_line_graph_with_histogram, num_of_cell_removal),
+                 pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(plot1, layout2),
+                 #pn.pane.Markdown("### Nucleus Distribution Plot:"), pn.Column(nucleus_size_plot, nucleus_size_graph),
+                 pn.pane.Markdown(" ### Intensity Average Plot:"), pn.Row(selected_marker_plot, num_of_cell_removal_intensity),
+                 #pn.Column(pn.Column(column_dropdown, generate_plot_button), quantile_slider, plot),
+                 #pn.pane.Markdown("### Cytoplasm Intensity Plot:"), cytoplasm_intensity_plot,
+                 #pn.pane.Markdown("### AF555_Cell_Intensity_Average:"), quantile_output_app,
+                 #pn.pane.Markdown("### Distribution of AF555_Cell_Intensity_Average with Quantiles:"), quantile_intensity_plot),
+                 pn.Column(download_button),
+             )),
+         ),
+     ])
+
+ app.servable()
+
+ if __name__ == "__main__":
+     pn.serve(app, port=5007)