KashyapiNagaHarshitha commited on
Commit
dc47c5c
·
verified ·
1 Parent(s): 31a4d07

Upload Background_Substraction.py

Browse files
Files changed (1) hide show
  1. Background_Substraction.py +1121 -0
Background_Substraction.py ADDED
@@ -0,0 +1,1121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+
5
+ # In[1]:
6
+ import os
7
+ import random
8
+ import re
9
+ import pandas as pd
10
+ import numpy as np
11
+ import seaborn as sb
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib.colors as mplc
14
+ import subprocess
15
+ import warnings
16
+ from scipy import signal
17
+ import plotly.figure_factory as ff
18
+ import plotly
19
+ import plotly.graph_objs as go
20
+ from plotly.offline import download_plotlyjs, plot
21
+ import plotly.express as px
22
+ from my_modules import *
23
+ os.getcwd()
24
+ # In[2]:
25
+
26
+
27
+ #Silence FutureWarnings & UserWarnings
28
+ warnings.filterwarnings('ignore', category= FutureWarning)
29
+ warnings.filterwarnings('ignore', category= UserWarning)
30
+
31
+
32
+ # ## II.2. *DIRECTORIES
33
+
34
+ # In[5]:
35
+
36
+
37
+ # Set base directory
38
+
39
+ ##### MAC WORKSTATION #####
40
+ #base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
41
+ ###########################
42
+
43
+ ##### WINDOWS WORKSTATION #####
44
+ #base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
45
+ ###############################
46
+
47
+ ##### LOCAL WORKSTATION #####
48
+ input_path = '/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/'
49
+ #############################
50
+
51
+ #set_name = 'Set_A'
52
+ #set_name = 'test'
53
+
54
+
55
+ #present_dir = os.path.dirname(os.path.realpath(__file__))
56
+
57
+ #input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
58
+ base_dir = input_path
59
+ '''
60
+ # Function to change permissions recursively with error handling
61
+ def change_permissions_recursive(path, mode):
62
+ for root, dirs, files in os.walk(path):
63
+ for dir in dirs:
64
+ try:
65
+ os.chmod(os.path.join(root, dir), mode)
66
+ except Exception as e:
67
+ print(f"An error occurred while changing permissions for directory {os.path.join(root, dir)}: {e}")
68
+ for file in files:
69
+ try:
70
+ os.chmod(os.path.join(root, file), mode)
71
+ except Exception as e:
72
+ print(f"An error occurred while changing permissions for file {os.path.join(root, file)}: {e}")
73
+
74
+
75
+ change_permissions_recursive(base_dir, 0o777)
76
+ change_permissions_recursive('/code', 0o777)
77
+ '''
78
+ set_path = 'test'
79
+ selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
80
+ ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']
81
+
82
+ set_name = set_path
83
+
84
+
85
+ # In[7]:
86
+
87
+
88
+ project_name = set_name # Project name
89
+ step_suffix = 'bs' # Curent part (here part II)
90
+ previous_step_suffix_long = "_qc_eda" # Previous part (here QC/EDA NOTEBOOK)
91
+
92
+ # Initial input data directory
93
+ input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)
94
+
95
+ # BS output directories
96
+ output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
97
+ # BS images subdirectory
98
+ output_images_dir = os.path.join(output_data_dir,"images")
99
+
100
+ # Data and Metadata directories
101
+ # Metadata directories
102
+ metadata_dir = os.path.join(base_dir, project_name + "_metadata")
103
+ # images subdirectory
104
+ metadata_images_dir = os.path.join(metadata_dir,"images")
105
+
106
+ # Create directories if they don't already exist
107
+ for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
108
+ if not os.path.exists(d):
109
+ print("Creation of the" , d, "directory...")
110
+ os.makedirs(d)
111
+ else :
112
+ print("The", d, "directory already exists !")
113
+
114
+ os.chdir(input_data_dir)
115
+
116
+
117
+ # In[8]:
118
+
119
+
120
+ # Verify paths
121
+ print('base_dir :', base_dir)
122
+ print('input_data_dir :', input_data_dir)
123
+ print('output_data_dir :', output_data_dir)
124
+ print('output_images_dir :', output_images_dir)
125
+ print('metadata_dir :', metadata_dir)
126
+ print('metadata_images_dir :', metadata_images_dir)
127
+
128
+ # ## II.3. FILES
129
+ #Don't forget to put your data in the projname_data directory !
130
+ # ### II.3.1. METADATA
131
+
132
+ # In[9]:
133
+
134
+
135
+ # Import all metadata we need from the QC/EDA chapter
136
+
137
+ # METADATA
138
+ filename = "marker_intensity_metadata.csv"
139
+ filename = os.path.join(metadata_dir, filename)
140
+
141
+ # Check file exists
142
+ if not os.path.exists(filename):
143
+ print("WARNING: Could not find desired file: "+filename)
144
+ else :
145
+ print("The",filename,"file was imported for further analysis!")
146
+
147
+ # Open, read in information
148
+ metadata = pd.read_csv(filename)
149
+
150
+ # Verify size with verify_line_no() function in my_modules.py
151
+ #verify_line_no(filename, metadata.shape[0] + 1)
152
+
153
+ # Verify headers
154
+ exp_cols = ['Round','Target','Channel','target_lower','full_column','marker','localisation']
155
+ compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")
156
+
157
+ metadata = metadata.dropna()
158
+ metadata.head()
159
+
160
+ # ### II.3.2. NOT_INTENSITIES
161
+
162
+ # In[10]:
163
+
164
+
165
+ # NOT_INTENSITIES
166
+ filename = "not_intensities.csv"
167
+ filename = os.path.join(metadata_dir, filename)
168
+
169
+ # Check file exists
170
+ if not os.path.exists(filename):
171
+ print("WARNING: Could not find desired file: "+filename)
172
+ else :
173
+ print("The",filename,"file was imported for further analysis!")
174
+
175
+ # Open, read in information
176
+ #not_intensities = []
177
+ with open(filename, 'r') as fh:
178
+ not_intensities = fh.read().strip().split("\n")
179
+ # take str, strip whitespace, split on new line character
180
+
181
+ not_intensities = ['Nuc_X', 'Nuc_X_Inv', 'Nuc_Y', 'Nuc_Y_Inv', 'Nucleus_Roundness', 'Nucleus_Size', 'Cell_Size',
182
+ 'ROI_index', 'Sample_ID', 'replicate_ID', 'Cell_ID','cell_type', 'cell_subtype', 'cluster','ID',
183
+ 'Cytoplasm_Size', 'immune_checkpoint', 'Unique_ROI_index', 'Patient', 'Primary_chem(1)_vs_surg(0)']
184
+
185
+ # Verify size
186
+ print("Verifying data read from file is the correct length...\n")
187
+ verify_line_no(filename, len(not_intensities))
188
+
189
+ # Print to console
190
+ print("not_intensities =\n", not_intensities)
191
+
192
+ import os
193
+ import pandas as pd
194
+
195
+ # Function to compare headers (assuming you have this function defined in your my_modules.py)
196
def compare_headers(expected, actual, description):
    """Report which expected column names are absent from the actual headers.

    Prints a warning listing the missing names, or a confirmation message
    when every expected name is found. Nothing is returned.

    Args:
        expected: iterable of column names that should be present.
        actual: container of the column names that were actually found.
        description: label of the table/file, used in the printed message.
    """
    missing = []
    for col in expected:
        if col in actual:
            continue
        missing.append(col)

    if not missing:
        print(f"All expected columns are present in {description}.")
        return
    print(f"WARNING: Missing expected columns in {description}: {missing}")
202
+
203
+ # Get the current script directory
204
+ present_dir = os.path.dirname(os.path.realpath(__file__))
205
+
206
+ # Define the input path
207
+ input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
208
+ base_dir = input_path
209
+ set_path = 'test'
210
+
211
+ # Project and step names
212
+ project_name = set_path # Project name
213
+ previous_step_suffix_long = "_qc_eda" # Previous part (here QC/EDA NOTEBOOK)
214
+
215
+ # Initial input data directory
216
+ input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)
217
+
218
+ # Metadata directories
219
+ metadata_dir = os.path.join(base_dir, project_name + "_metadata")
220
+ metadata_images_dir = os.path.join(metadata_dir, "images")
221
+
222
+ # Define writable directory
223
+ writable_directory = '/tmp'
224
+
225
+ # Check and read metadata file
226
+ filename = "marker_intensity_metadata.csv"
227
+ filename = os.path.join(metadata_dir, filename)
228
+
229
+ # Check if the file exists
230
+ if not os.path.exists(filename):
231
+ print("WARNING: Could not find desired file: " + filename)
232
+ else:
233
+ print("The", filename, "file was imported for further analysis!")
234
+
235
+ # Open, read in information
236
+ metadata = pd.read_csv(filename)
237
+
238
+ # Verify headers
239
+ exp_cols = ['Round', 'Target', 'Channel', 'target_lower', 'full_column', 'marker', 'localisation']
240
+ compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")
241
+
242
+ metadata = metadata.dropna()
243
+ print(metadata.head())
244
+
245
+ # Example of writing to the writable directory
246
+ output_file_path = os.path.join(writable_directory, 'processed_metadata.csv')
247
+ try:
248
+ metadata.to_csv(output_file_path, index=False)
249
+ print(f"Processed metadata written successfully to {output_file_path}")
250
+ except PermissionError as e:
251
+ print(f"Permission denied: Unable to write the file at {output_file_path}. Error: {e}")
252
+ except Exception as e:
253
+ print(f"An error occurred: {e}")
254
+
255
+ # ### II.3.3. FULL_TO_SHORT_COLUMN_NAMES
256
+
257
+ # In[11]:
258
+
259
+
260
+ # FULL_TO_SHORT_COLUMN_NAMES
261
+ filename = "full_to_short_column_names.csv"
262
+ filename = os.path.join(metadata_dir, filename)
263
+
264
+ # Check file exists
265
+ if not os.path.exists(filename):
266
+ print("WARNING: Could not find desired file: " + filename)
267
+ else :
268
+ print("The",filename,"file was imported for further analysis!")
269
+
270
+ # Open, read in information
271
+ df = pd.read_csv(filename, header = 0)
272
+
273
+ # Verify size
274
+ print("Verifying data read from file is the correct length...\n")
275
+ #verify_line_no(filename, df.shape[0] + 1)
276
+
277
+ # Turn into dictionary
278
+ full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]
279
+
280
+ # Print information
281
+ print('full_to_short_names =\n',full_to_short_names)
282
+
283
+
284
+ # ### II.3.4. SHORT_TO_FULL_COLUMN_NAMES
285
+
286
+ # In[12]:
287
+
288
+
289
+ # SHORT_TO_FULL_COLUMN_NAMES
290
+ filename = "short_to_full_column_names.csv"
291
+ filename = os.path.join(metadata_dir, filename)
292
+
293
+ # Check file exists
294
+ if not os.path.exists(filename):
295
+ print("WARNING: Could not find desired file: " + filename)
296
+ else :
297
+ print("The",filename,"file was imported for further analysis!")
298
+
299
+ # Open, read in information
300
+ df = pd.read_csv(filename, header = 0)
301
+
302
+ # Verify size
303
+ print("Verifying data read from file is the correct length...\n")
304
+ #verify_line_no(filename, df.shape[0] + 1)
305
+
306
+ # Turn into dictionary
307
+ short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]
308
+
309
+ # Print information
310
+ print('short_to_full_names =\n',short_to_full_names)
311
+
312
+
313
+ # ### II.3.5. SAMPLES COLORS
314
+
315
+ # In[13]:
316
+
317
+
318
+ # COLORS INFORMATION
319
+ filename = "sample_color_data.csv"
320
+ filename = os.path.join(metadata_dir, filename)
321
+
322
+ # Check file exists
323
+ if not os.path.exists(filename):
324
+ print("WARNING: Could not find desired file: " + filename)
325
+ else :
326
+ print("The",filename,"file was imported for further analysis!")
327
+
328
+ # Open, read in information
329
+ df = pd.read_csv(filename, header = 0)
330
+ df = df.drop(columns = ['hex'])
331
+
332
+
333
+ # our tuple of float values for rgb, (r, g, b) was read in
334
+ # as a string '(r, g, b)'. We need to extract the r-, g-, and b-
335
+ # substrings and convert them back into floats
336
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
337
+
338
+ # Verify size
339
+ print("Verifying data read from file is the correct length...\n")
340
+ #verify_line_no(filename, df.shape[0] + 1)
341
+
342
+ # Turn into dictionary
343
+ sample_color_dict = df.set_index('Sample_ID')['rgb'].to_dict()
344
+
345
+ # Print information
346
+ print('sample_color_dict =\n',sample_color_dict)
347
+ sample_color_dict = pd.DataFrame.from_dict(sample_color_dict, orient='index', columns=['R', 'G', 'B'])
348
+
349
+
350
+ # In[14]:
351
+
352
+
353
+ sample_color_dict
354
+
355
+
356
+ # ### II.3.6. CHANNELS COLORS
357
+
358
+ # In[15]:
359
+
360
+
361
+ # CHANNELS
362
+ filename = "channel_color_data.csv"
363
+ filename = os.path.join(metadata_dir, filename)
364
+
365
+ # Check file exists
366
+ if not os.path.exists(filename):
367
+ print("WARNING: Could not find desired file: "+filename)
368
+ else :
369
+ print("The",filename,"file was imported for further analysis!")
370
+
371
+ # Open, read in information
372
+ df = pd.read_csv(filename, header = 0)
373
+ df = df.drop(columns = ['hex'])
374
+
375
+ # our tuple of float values for rgb, (r, g, b) was read in
376
+ # as a string '(r, g, b)'. We need to extract the r-, g-, and b-
377
+ # substrings and convert them back into floats
378
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
379
+
380
+ # Verify size
381
+ print("Verifying data read from file is the correct length...\n")
382
+ #verify_line_no(filename, df.shape[0] + 1)
383
+
384
+ # Turn into dictionary
385
+ channel_color_dict = df.set_index('Channel')['rgb'].to_dict()
386
+
387
+ # Print information
388
+ print('channel_color_dict =\n',channel_color_dict)
389
+ channel_color_dict = pd.DataFrame.from_dict(channel_color_dict, orient='index', columns=['R', 'G', 'B'])
390
+
391
+
392
+ # In[16]:
393
+
394
+
395
+ channel_color_dict
396
+
397
+
398
+ # ### II.3.7. ROUNDS COLORS
399
+
400
+ # In[17]:
401
+
402
+
403
+ # ROUND
404
+ filename = "round_color_data.csv"
405
+ filename = os.path.join(metadata_dir, filename)
406
+
407
+ # Check file exists
408
+ if not os.path.exists(filename):
409
+ print("WARNING: Could not find desired file: "+filename)
410
+ else :
411
+ print("The",filename,"file was imported for further analysis!")
412
+
413
+ # Open, read in information
414
+ df = pd.read_csv(filename, header = 0)
415
+ df = df.drop(columns = ['hex'])
416
+
417
+ # our tuple of float values for rgb, (r, g, b) was read in
418
+ # as a string '(r, g, b)'. We need to extract the r-, g-, and b-
419
+ # substrings and convert them back into floats
420
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
421
+
422
+ # Verify size
423
+ print("Verifying data read from file is the correct length...\n")
424
+ #verify_line_no(filename, df.shape[0] + 1)
425
+
426
+ # Turn into dictionary
427
+ round_color_dict = df.set_index('Round')['rgb'].to_dict()
428
+
429
+ # Print information
430
+ print('round_color_dict =\n',round_color_dict)
431
+ round_color_dict = pd.DataFrame.from_dict(round_color_dict, orient='index', columns=['R', 'G', 'B'])
432
+
433
+
434
+ # In[18]:
435
+
436
+
437
+ round_color_dict
438
+
439
+
440
+ # ### II.3.8. DATA
441
+
442
+ # In[19]:
443
+
444
+
445
+ # DATA
446
+ # List files in the directory
447
+ # Check if the directory exists
448
+ if os.path.exists(input_data_dir):
449
+ ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_qc_eda.csv")]
450
+
451
+ print("The following CSV files were detected:")
452
+ print([sample for sample in ls_samples])
453
+ else:
454
+ print(f"The directory {input_data_dir} does not exist.")
455
+
456
+
457
+ # In[20]:
458
+
459
+
460
+ # Import all the others files
461
+ dfs = {}
462
+
463
+ # Set variable to hold default header values
464
+ # First gather information on expected headers using first file in ls_samples
465
+ # Read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
466
+ df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]) , index_col = 0, nrows = 1)
467
+ expected_headers = df.columns.values
468
+ print(expected_headers)
469
+
470
+ ###############################
471
+ # !! This may take a while !! #
472
+ ###############################
473
# Load every sample file into dfs, keyed by file name.
# Iterate over a snapshot of ls_samples: removing an entry from the very
# list being looped over makes the loop silently skip the following entry.
for sample in list(ls_samples):
    file_path = os.path.join(input_data_dir, sample)

    try:
        # Read the CSV file
        df = pd.read_csv(file_path, index_col=0)
    except pd.errors.EmptyDataError:
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)
        # Nothing was read for this sample: skip the bookkeeping below so a
        # dataframe left over from an earlier file is not stored under this
        # sample's name.
        continue

    if not df.empty:
        # Reorder the columns to match the expected headers list
        df = df.reindex(columns=expected_headers)
        print(sample, "file is processed !\n")

    # Add df to dfs
    dfs[sample] = df
493
+
494
+ #print(dfs)
495
+
496
+
497
+ # In[21]:
498
+
499
+
500
+ # Merge dfs into one df
501
+ df = pd.concat(dfs.values(), ignore_index=False , sort = False)
502
+ #del dfs
503
+ df.head()
504
+
505
+
506
+ # In[22]:
507
+
508
+
509
+ df.shape
510
+
511
+
512
+ # In[23]:
513
+
514
+
515
+ # Check for NaN entries (should not be any unless columns do not align)
516
+ # False means no NaN entries
517
+ # True means NaN entries
518
+ df.isnull().any().any()
519
+
520
+
521
+ # ## II.4. *FILTERING
522
+
523
+ # In[24]:
524
+
525
+
526
+ print("Number of cells before filtering :", df.shape[0])
527
+ cells_before_filter = f"Number of cells before filtering :{df.shape[0]}"
528
+
529
+
530
+ # In[25]:
531
+
532
+
533
+ #print(df)
534
+
535
+
536
+ # In[26]:
537
+
538
+
539
+ # Delete small cells and objects w/high AF555 Signal (RBCs)
540
+ # We usually use the 95th percentile calculated during QC_EDA
541
# Keep only cells whose nucleus size lies strictly between the two bounds.
df = df.loc[(df['Nucleus_Size'] > 42)]
df = df.loc[(df['Nucleus_Size'] < 216)]
print("Number of cells after filtering on nucleus size:", df.shape[0])
# Record this count now: once the intensity filter below has run, df.shape[0]
# no longer describes the nucleus-size step.
cells_after_filter_nucleus = f"Number of cells after filtering on nucleus size: {df.shape[0]}"

# Drop objects with a high AF555 signal.
df = df.loc[(df['AF555_Cell_Intensity_Average'] < 2000)]
print("Number of cells after filtering on AF555A ___ intensity:", df.shape[0])
cells_after_filter_intensity = f"Number of cells after filtering on AF555A ___ intensity: {df.shape[0]}"
549
+
550
+
551
+ # In[27]:
552
+
553
+
554
+ # Assign cell type
555
+ # Assign tumor cells at each row at first (random assigning here just for development purposes)
556
+ # Generate random values for cell_type column
557
+ random_values = np.random.randint(0, 10, size=len(df))
558
+
559
+ # Assign cell type based on random values
560
def assign_cell_type(n):
    """Pick a placeholder cell type at random.

    The argument is ignored; it only lets the function be mapped over an
    array, producing one random label per element.
    """
    candidate_types = ['STROMA', 'CANCER', 'IMMUNE', 'ENDOTHELIAL']
    picked = np.random.choice(candidate_types)
    return picked
562
+
563
+ df['cell_type'] = np.vectorize(assign_cell_type)(random_values)
564
+ df['cell_subtype'] = df['cell_type'].copy()
565
+
566
+
567
+ # In[28]:
568
+
569
+
570
+ filtered_dataframe = df
571
+ df.head()
572
+
573
+
574
+ # In[29]:
575
+
576
+
577
+ quality_control_df = filtered_dataframe
578
+
579
+
580
+ # In[30]:
581
+
582
+
583
def check_index_format(index_str, ls_samples):
    """
    Checks if the given index string follows the '<sample>_<location>_<number>' format.

    The string is split on '_' and must yield exactly three parts, so a
    sample name that itself contains an underscore is rejected. On failure
    the offending piece is printed before returning False.

    Args:
        index_str (str): The index string to be checked.
        ls_samples (list): Valid sample file names; the sample part is
            looked up as '<sample>_qc_eda.csv'.

    Returns:
        bool: True if the index string follows the format, False otherwise.
    """
    # Split the index string into parts
    parts = index_str.split('_')

    # Check if there are exactly 3 parts
    if len(parts) != 3:
        print(len(parts))
        return False

    sample_name, location, number = parts

    # Check if the first part corresponds to a file in ls_samples
    if f'{sample_name}_qc_eda.csv' not in ls_samples:
        print(sample_name)
        return False

    # Check if the second part is a known location (case-sensitive)
    valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
    if location not in valid_locations:
        print(location)
        return False

    # Check if the third part is a number
    try:
        int(number)
    except ValueError:
        # Print the raw text: no converted value exists when int() fails.
        print(number)
        return False

    # If all checks pass, return True
    return True
624
+
625
+
626
+ # In[31]:
627
+
628
+
629
+ # Let's take a look at a few features to make sure our dataframe is as expected
630
+ df.index
631
def check_format_ofindex(index):
    """Validate every label of the given index with check_index_format.

    Args:
        index: iterable of index labels (e.g. a DataFrame's index).

    Returns:
        str: "Bad" as soon as one label fails the format check, otherwise
        "Good" (also for an empty index).
    """
    # Walk the index that was passed in, not the module-level df, so callers
    # can validate any dataframe.
    for label in index:
        if check_index_format(label, ls_samples) is False:
            return "Bad"

    return "Good"
640
+ print(check_format_ofindex(df.index))
641
+
642
+
643
+ # In[32]:
644
+
645
+
646
+ import panel as pn
647
+ import pandas as pd
648
+
649
def quality_check(file, not_intensities):
    """Run basic sanity checks on a dataframe and wrap the results in a Panel card.

    Args:
        file: the dataframe to inspect (used directly, not loaded from disk).
        not_intensities: column names to exclude when averaging intensities.

    Returns:
        A pn.Card showing a four-row table: index format, shape, whether any
        NaN is present (True means NaNs exist), and the zero-intensity check.
    """
    df = file

    # Index format, shape and NaN presence are all measured on the input as given.
    index_status = check_format_ofindex(df.index)
    input_shape = df.shape
    has_nan = df.isnull().any().any()

    # Row-wise mean over every column that is not listed in not_intensities.
    intensity_columns = ~df.columns.isin(not_intensities)
    mean_intensity = df.loc[:, intensity_columns].mean(axis=1)

    if not (mean_intensity == 0).any():
        print("No zero intensity values.")
        check_zero_intensities = "No zero intensity values."
    else:
        # Only the local copy is filtered; it feeds the message below.
        df = df.loc[mean_intensity > 0, :]
        print("df.shape after removing 0 mean values: ", df.shape)
        check_zero_intensities = f'Shape after removing 0 mean values: {df.shape}'

    # Create a quality check results table
    check_names = ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities']
    check_results = [str(index_status), str(input_shape), str(has_nan), check_zero_intensities]
    quality_check_results_table = pd.DataFrame({
        'Check': check_names,
        'Result': check_results,
    })

    # Create a quality check results component
    return pn.Card(
        pn.pane.DataFrame(quality_check_results_table),
        title="Quality Control Results",
        header_background="#2196f3",
        header_color="white",
    )
686
+
687
+
688
+ # ## II.5. CELL TYPES COLORS
689
+ # Establish colors to use throughout workflow
690
+
691
+ # we want colors that are categorical, since Cell Type is a non-ordered category.
692
+ # A categorical color palette will have dissimilar colors.
693
+ # Get those unique colors
694
+ cell_types = ['STROMA','CANCER','IMMUNE','ENDOTHELIAL']
695
+ color_values = sb.color_palette("hls", n_colors = len(cell_types))
696
+ # each color value is a tuple of three values: (R, G, B)
697
+
698
+ print("Unique cell types are:",df.cell_type.unique())
699
+ # Display those unique colors
700
+ sb.palplot(sb.color_palette(color_values))
701
+ # In[33]:
702
+
703
+
704
+ # Define your custom colors for each cell type
705
+ custom_colors = {
706
+ 'CANCER': (0.1333, 0.5451, 0.1333),
707
+ 'STROMA': (0.4, 0.4, 0.4),
708
+ 'IMMUNE': (1, 1, 0),
709
+ 'ENDOTHELIAL': (0.502, 0, 0.502)
710
+ }
711
+
712
+ # Retrieve the list of cell types
713
+ cell_types = list(custom_colors.keys())
714
+
715
+ # Extract the corresponding colors from the dictionary
716
+ color_values = [custom_colors[cell] for cell in cell_types]
717
+
718
+ # Display the colors
719
+ sb.palplot(sb.color_palette(color_values))
720
+
721
+
722
+ # In[34]:
723
+
724
+
725
+ # Store in a dictionary
726
+ celltype_color_dict = dict(zip(cell_types, color_values))
727
+ celltype_color_dict
728
+
729
+
730
+ # In[35]:
731
+
732
+
733
+ celltype_color_df = pd.DataFrame.from_dict(celltype_color_dict, orient='index', columns=['R', 'G', 'B'])
734
+
735
+
736
+ # In[36]:
737
+
738
+
739
+ # Save color information (mapping and legend) to metadata directory
740
+ # Create dataframe
741
+ celltype_color_df = color_dict_to_df(celltype_color_dict, "cell_type")
742
+ celltype_color_df.head()
743
+
744
+ # Save to file in metadatadirectory
745
+ present_dir = os.path.dirname(os.path.realpath(__file__))
746
+ filename = os.path.join(present_dir, "celltype_color_data.csv")
747
+ #filename = "celltype_color_data.csv"
748
+ filename = os.path.join(metadata_dir, filename)
749
+ celltype_color_df.to_csv(filename, index = False)
750
+ print("File" + filename + " was created!")
751
+
752
+
753
+ # In[37]:
754
+
755
+
756
+ celltype_color_df.head()
757
+
758
+
759
+ # In[38]:
760
+
761
+
762
+ # Legend of cell type info only
763
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
764
+ g.axis('off')
765
+ handles = []
766
+ for item in celltype_color_dict.keys():
767
+ h = g.bar(0,0, color = celltype_color_dict[item],
768
+ label = item, linewidth =0)
769
+ handles.append(h)
770
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cell type'),
771
+
772
+
773
+ filename = "Celltype_legend.png"
774
+ filename = os.path.join(metadata_images_dir, filename)
775
+ plt.savefig(filename, bbox_inches = 'tight')
776
+
777
+
778
+ # In[39]:
779
+
780
+
781
+ metadata
782
+
783
+
784
+ # In[40]:
785
+
786
+
787
+ df.columns.values
788
+
789
+
790
+ # In[41]:
791
+
792
+
793
+ df.shape
794
+
795
+
796
+ # In[42]:
797
+
798
+
799
+ metadata.shape
800
+
801
+
802
+ # ## II.6. *CELL SUBTYPES COLORS
803
+
804
+ # In[43]:
805
+
806
+
807
+ # Establish colors to use throughout workflow
808
+
809
+ # we want colors that are categorical, since Cell Type is a non-ordered category.
810
+ # A categorical color palette will have dissimilar colors.
811
+ # Get those unique colors
812
+ cell_subtypes = ['DC','B', 'TCD4','TCD8','M1','M2','Treg', \
813
+ 'IMMUNE_OTHER', 'CANCER', 'αSMA_myCAF',\
814
+ 'STROMA_OTHER', 'ENDOTHELIAL']
815
+ color_values = sb.color_palette("Paired",n_colors = len(cell_subtypes))
816
+ # each color value is a tuple of three values: (R, G, B)
817
+
818
+ print("Unique cell types are:",df.cell_subtype.unique())
819
+ # Display those unique colors
820
+ sb.palplot(sb.color_palette(color_values))
821
+
822
+
823
+ # In[44]:
824
+
825
+
826
+ # Store in a dictionary
827
+ cellsubtype_color_dict = dict(zip(cell_subtypes, color_values))
828
+ cellsubtype_color_dict
829
+
830
+
831
+ # In[45]:
832
+
833
+
834
+ cellsubtype_color_df = pd.DataFrame.from_dict(cellsubtype_color_dict, orient='index', columns=['R', 'G', 'B'])
835
+
836
+
837
+ # In[46]:
838
+
839
+
840
+ # Save color information (mapping and legend) to metadata directory
841
+ # Create dataframe
842
+ cellsubtype_color_df = color_dict_to_df(cellsubtype_color_dict, "cell_subtype")
843
+
844
+ # Save to file in metadatadirectory
845
+ filename = "cellsubtype_color_data.csv"
846
+ filename = os.path.join(metadata_dir, filename)
847
+ cellsubtype_color_df.to_csv(filename, index = False)
848
+ print("File" + filename + " was created!")
849
+
850
+
851
+ # In[47]:
852
+
853
+
854
+ cellsubtype_color_df.head()
855
+
856
+
857
+ # In[48]:
858
+
859
+
860
+ # Legend of cell subtype info only
861
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
862
+ g.axis('off')
863
+ handles = []
864
+ for item in cellsubtype_color_dict.keys():
865
+ h = g.bar(0,0, color = cellsubtype_color_dict[item],
866
+ label = item, linewidth =0)
867
+ handles.append(h)
868
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cell subtype'),
869
+
870
+
871
+ filename = "Cellsubtype_legend.png"
872
+ filename = os.path.join(metadata_images_dir, filename)
873
+ plt.savefig(filename, bbox_inches = 'tight')
874
+
875
+
876
+ # ## II.7. IMMUNE CHECKPOINT COLORS
877
+
878
+ # In[49]:
879
+
880
+
881
+ # Assign IMMUNE SUBTYPES
882
+ df['cell_subtype'] = df['cell_type'].copy()
883
+ df['immune_checkpoint'] = 'none'
884
+ df
885
+
886
+ immune_checkpoint = ['B7H4', 'PDL1', 'PD1', 'None']
887
+ color_values = sb.color_palette("husl",n_colors=len(immune_checkpoint))
888
+ # each color value is a tuple of three values: (R, G, B)
889
+
890
+ print("Unique immune checkpoint are:",df.immune_checkpoint.unique())
891
+ # Display those unique colors
892
+ sb.palplot(sb.color_palette(color_values))
893
+ # In[50]:
894
+
895
+
896
+ immune_checkpoint = ['B7H4', 'PDL1', 'PD1', 'B7H4_PDL1', 'None']
897
+
898
+ # Base colors for the primary checkpoints
899
+ base_colors = sb.color_palette("husl", n_colors=3) # Three distinct colors
900
+
901
+ # Function to mix two RGB colors
902
def mix_colors(color1, color2):
    """Blend two colors by averaging them channel by channel.

    Returns a tuple with one value per paired channel; when the inputs
    differ in length, the surplus channels of the longer one are dropped.
    """
    blended = []
    for first, second in zip(color1, color2):
        blended.append((first + second) / 2)
    return tuple(blended)
904
+
905
+ # Generate mixed colors for the combinations of checkpoints
906
+ mixed_colors = [
907
+ mix_colors(base_colors[0], base_colors[1]), # Mix B7H4 and PDL1
908
+ # mix_colors(base_colors[0], base_colors[2]), # Mix B7H4 and PD1
909
+ # mix_colors(base_colors[1], base_colors[2]), # Mix PDL1 and PD1
910
+ tuple(np.mean(base_colors, axis=0)) # Mix B7H4, PDL1, and PD1
911
+ ]
912
+
913
+ # Adding the color for 'None'
914
+ #none_color = [(0.8, 0.8, 0.8)] # A shade of gray
915
+
916
+ # Combine all colors into one list
917
+ color_values = base_colors + mixed_colors #+ none_color
918
+
919
+ # Display unique immune checkpoint combinations
920
+ print("Unique immune checkpoint combinations are:", immune_checkpoint)
921
+ # Display the unique colors
922
+ sb.palplot(color_values)
923
+
924
+
925
+ # In[51]:
926
+
927
+
928
+ # Store in a dictionary
929
+ immunecheckpoint_color_dict = dict(zip(immune_checkpoint, color_values))
930
+ immunecheckpoint_color_dict
931
+
932
+
933
+ # In[52]:
934
+
935
+
936
+ # Save color information (mapping and legend) to metadata directory
937
+ # Create dataframe
938
+ immunecheckpoint_color_df = color_dict_to_df(immunecheckpoint_color_dict, "immune_checkpoint")
939
+ immunecheckpoint_color_df.head()
940
+
941
+ # Save to file in metadatadirectory
942
+ filename = "immunecheckpoint_color_data.csv"
943
+ filename = os.path.join(metadata_dir, filename)
944
+ immunecheckpoint_color_df.to_csv(filename, index = False)
945
+ print("File " + filename + " was created!")
946
+
947
+
948
+ # In[53]:
949
+
950
+
951
# Legend of immune checkpoint info only
g = plt.figure(figsize = (1,1)).add_subplot(111)
g.axis('off')
handles = []
# Draw one zero-sized bar per checkpoint: nothing visible is plotted, but each
# bar provides a colored handle for the legend.
for item, color in immunecheckpoint_color_dict.items():
    h = g.bar(0,0, color = color,
              label = item, linewidth =0)
    handles.append(h)
# NOTE: the trailing comma makes first_legend a 1-tuple holding the legend;
# it is kept in case later code depends on that shape.
first_legend = plt.legend(handles=handles, loc='upper right', title = 'Immune checkpoint'),


# Use a dedicated file name so this legend does not overwrite the cell-subtype
# legend ("Cellsubtype_legend.png") saved in the same directory.
filename = "Immunecheckpoint_legend.png"
filename = os.path.join(metadata_images_dir, filename)
plt.savefig(filename, bbox_inches = 'tight')
965
+
966
+
967
+ # ## II.7. BACKGROUND SUBTRACTION
968
+
969
+ # In[54]:
970
+
971
+
972
def do_background_sub(col, df, metadata):
    """Subtract the matching autofluorescence (AF) column from ``col``.

    The AF column is the one whose metadata row has the same ``Channel`` and
    ``localisation`` as ``col`` and whose ``target_lower`` looks like
    ``af`` followed by exactly three digits (e.g. ``af488``).

    Parameters
    ----------
    col : pandas.Series
        Intensity column; ``col.name`` must appear in
        ``metadata['full_column']``.
    df : pandas.DataFrame
        Table holding the AF column to subtract.
    metadata : pandas.DataFrame
        Needs the columns ``full_column``, ``localisation``, ``Channel`` and
        ``target_lower``.

    Returns
    -------
    pandas.Series
        ``col`` minus the AF column (an AF column passed in comes back as
        zeros, since it is subtracted from itself).

    Raises
    ------
    IndexError
        If ``col`` is not described in ``metadata`` or no AF column exists
        for its channel/localisation. The exception type is unchanged from
        the previous implementation; only the message is now explicit.
    """
    # Look the column up once and read both attributes from that result.
    own_rows = metadata.loc[metadata['full_column'] == col.name]
    if own_rows.empty:
        raise IndexError(
            f"Column {col.name!r} is not listed in metadata['full_column']"
        )
    location = own_rows['localisation'].values[0]
    channel = own_rows['Channel'].values[0]

    # na=False keeps the mask strictly boolean even when some targets are
    # missing; otherwise NaN entries can break the `&` combination below.
    is_af_target = metadata['target_lower'].str.contains(r'^af\d{3}$', na=False)
    af_columns = metadata.loc[
        (metadata['Channel'] == channel)
        & (metadata['localisation'] == location)
        & is_af_target,
        'full_column',
    ].values
    if len(af_columns) == 0:
        raise IndexError(
            f"No AF column found for column {col.name!r} "
            f"(channel={channel!r}, localisation={location!r})"
        )

    # If several AF columns match, the first one is used (as before).
    af_target = af_columns[0]
    return col - df.loc[:, af_target]
984
+
985
+
986
+ # In[55]:
987
+
988
+
989
# Keep a second name for the metadata table (it is displayed in the app
# under this name).
metadata_with_localisation = metadata
metadata_with_localisation


# In[56]:


# Normalization: every column not listed in `not_intensities` is passed,
# together with 'Exp' and the metadata table, to divide_exp_time, and the
# result is written back into df.
df.loc[:, ~df.columns.isin(not_intensities)] = (
    df.loc[:, ~df.columns.isin(not_intensities)]
    .apply(divide_exp_time, axis=0, args=('Exp', metadata))
)
1000
+
1001
+
1002
+ # In[57]:
1003
+
1004
+
1005
# Snapshot of the data right after normalization.
# This must be a copy: `df` is modified in place by the background
# subtraction that follows, and a plain alias (`normalization_df = df`)
# would silently show the background-subtracted values instead.
normalization_df = df.copy()
normalization_df.head()
1007
+
1008
+
1009
+ # In[58]:
1010
+
1011
+
1012
# Do background subtraction.
# Each column not listed in `not_intensities` is handed to
# do_background_sub together with the full dataframe and the metadata
# table (passed explicitly through `args`), and the result is written back.
df.loc[:, ~df.columns.isin(not_intensities)] = (
    df.loc[:, ~df.columns.isin(not_intensities)]
    .apply(do_background_sub, axis=0, args=(df, metadata))
)


# In[59]:


df
# Keep a handle on the background-subtracted table for display in the app
background_substraction_df = df
background_substraction_df.head()
1027
+
1028
+
1029
+ # In[60]:
1030
+
1031
+
1032
# Drop AF columns: keep only columns whose name does not start with "AF"
# followed by three digits. The pattern is a raw string so that `\d` reaches
# the regex engine untouched (in a normal string literal it is an invalid
# escape sequence and triggers a warning on recent Python versions).
df = df.filter(regex=r'^(?!AF\d{3}).*')
print(df.columns.values)


# In[61]:


# Marker intensity columns only (everything not listed in `not_intensities`)
intensities_df = df.loc[:, ~df.columns.isin(not_intensities)]
intensities_df


# In[62]:


normalization_df.head()
1048
+
1049
+
1050
+ # In[63]:
1051
+
1052
+
1053
# Metadata table used by the column-information widget below
metadata_df = metadata_with_localisation

# Names of all intensity columns, offered as choices in the selector
column_names = intensities_df.columns.tolist()

# Drop-down for picking one intensity column
column_selector = pn.widgets.Select(
    name='Select Column',
    options=column_names,
)

# Markdown pane that shows details about the picked column
column_info_md = pn.pane.Markdown(
    object='Select a column to view its information.',
    name='Column Information',
    width=400,
)
1064
+
1065
def update_column_info(event):
    """Refresh ``column_info_md`` when the column selector changes.

    Parameters
    ----------
    event
        Change event from the selector; ``event.new`` is the newly selected
        column name.

    The pane shows the column's intensity values together with the channel,
    localisation and exposure found in ``metadata_df``. An empty selection
    restores the placeholder text, and a column without a metadata row gets
    an explanatory message instead of raising inside the watcher.
    """
    selected_column = event.new
    if not selected_column:
        column_info_md.object = 'Select a column to view its information.'
        return

    # Intensity values of the selected column
    intensity = intensities_df[selected_column].values

    # Look the column up in the metadata once and reuse the result
    column_meta = metadata_df.loc[metadata_df['full_column'] == selected_column]
    if column_meta.empty:
        column_info_md.object = f"No metadata found for column `{selected_column}`."
        return

    channel = column_meta['Channel'].values[0]
    localization = column_meta['localisation'].values[0]
    exposure = column_meta['Exp'].values[0]

    # Render the details as Markdown
    column_info_md.object = f"**Intensity:** {intensity}\n\n**Channel:** {channel}\n\n**Localization:** {localization}\n\n**Exposure:** {exposure}"
1084
+
1085
# Re-run update_column_info whenever the selector's value changes
column_selector.param.watch(fn=update_column_info, parameter_names='value')

# Stack the selector above its information pane and mark it servable
bs_info = pn.Column(column_selector, column_info_md)
pn.extension()
bs_info.servable()

normalization_df.head()
1094
+
1095
+
1096
+ # In[65]:
1097
+
1098
+
1099
import panel as pn

df_widget = pn.widgets.DataFrame(metadata, name="MetaData")

# First tab: metadata, per-column details, and the normalized and
# background-subtracted tables, each under its own heading.
background_tab = pn.Column(
    pn.Column(
        pn.pane.Markdown("### The metadata obtained that specifies the localisation:"),
        metadata_with_localisation.head(8),
    ),
    pn.Column(
        pn.pane.Markdown("### The channels and exposure of each intensities column"),
        bs_info,
    ),
    pn.Column(
        pn.pane.Markdown("### Dataframe after perfroming normalization"),
        pn.pane.DataFrame(normalization_df.head(), width=1500),
    ),
    pn.Column(
        pn.pane.Markdown("### Dataframe after background Substraction"),
        pn.pane.DataFrame(background_substraction_df.head()),
    ),
)

# Second tab: whatever quality_check returns for the quality-control table
quality_tab = pn.Column(
    quality_check(quality_control_df, not_intensities),
)

# Assemble both tabs into the template and mark it servable
app2 = pn.template.GoldenTemplate(
    site="Cyc-IF",
    title=" Background-Substraction",
    main=[
        pn.Tabs(
            ("Background-Substraction", background_tab),
            ("Quality Control", quality_tab),
        )
    ],
)
app2.servable()