KashyapiNagaHarshitha committed on
Commit 554f382 · verified · 1 Parent(s): 8e6dd8c

Upload Background_Substraction.py

Files changed (1)
  1. Background_Substraction.py +1084 -0
Background_Substraction.py ADDED
@@ -0,0 +1,1084 @@
#!/usr/bin/env python
# coding: utf-8


# In[1]:
import os
import random
import re
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
import subprocess
import warnings

from scipy import signal

import plotly.figure_factory as ff
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
from IPython import get_ipython  # needed for the %store magics when running this notebook export as a plain script
init_notebook_mode(connected=True)

from my_modules import *
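# NOTE (assumption): the helpers used later in this script -- compare_headers(),
# verify_line_no(), rgb_tuple_from_str(), color_dict_to_df() and divide_exp_time()
# -- are expected to come from this star import of my_modules.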


# In[2]:


# Silence FutureWarnings & UserWarnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)


# In[3]:


get_ipython().run_line_magic('store', '-r base_dir')
get_ipython().run_line_magic('store', '-r set_path')
get_ipython().run_line_magic('store', '-r ls_samples')
get_ipython().run_line_magic('store', '-r selected_metadata_files')


# In[4]:


print(base_dir)
print(set_path)
print(ls_samples)
print(selected_metadata_files)


# ## II.2. *DIRECTORIES

# In[5]:


# Set base directory

##### MAC WORKSTATION #####
#base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
###########################

##### WINDOWS WORKSTATION #####
#base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
###############################

##### LOCAL WORKSTATION #####
#base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/'
#############################

#set_name = 'Set_A'
#set_name = 'test'

set_name = set_path


# In[7]:


project_name = set_name                # Project name
step_suffix = 'bs'                     # Current step (here part II: background subtraction)
previous_step_suffix_long = "_qc_eda"  # Previous step (here the QC/EDA notebook)

# Initial input data directory
input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)

# BS output directory
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
# BS images subdirectory
output_images_dir = os.path.join(output_data_dir, "images")

# Metadata directory
metadata_dir = os.path.join(base_dir, project_name + "_metadata")
# Metadata images subdirectory
metadata_images_dir = os.path.join(metadata_dir, "images")

# Create the directories if they don't already exist
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
    if not os.path.exists(d):
        print("Creating the", d, "directory...")
        os.makedirs(d)
    else:
        print("The", d, "directory already exists!")

os.chdir(input_data_dir)


# In[8]:


# Verify paths
print('base_dir :', base_dir)
print('input_data_dir :', input_data_dir)
print('output_data_dir :', output_data_dir)
print('output_images_dir :', output_images_dir)
print('metadata_dir :', metadata_dir)
print('metadata_images_dir :', metadata_images_dir)


# ## II.3. FILES
# Don't forget to put your data in the projname_data directory!
# ### II.3.1. METADATA

# In[9]:


# Import all the metadata we need from the QC/EDA chapter

# METADATA
filename = "marker_intensity_metadata.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
metadata = pd.read_csv(filename)

# Verify size with the verify_line_no() function in my_modules.py
#verify_line_no(filename, metadata.shape[0] + 1)

# Verify headers
exp_cols = ['Round','Target','Channel','target_lower','full_column','marker','localisation']
compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")

metadata = metadata.dropna()
metadata.head()


# ### II.3.2. NOT_INTENSITIES

# In[10]:


# NOT_INTENSITIES
filename = "not_intensities.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information:
# take the file contents as a string, strip whitespace, split on the newline character
with open(filename, 'r') as fh:
    not_intensities = fh.read().strip().split("\n")

# NOTE: the list read from the file is immediately overridden by this hardcoded list
not_intensities = ['Nuc_X', 'Nuc_X_Inv', 'Nuc_Y', 'Nuc_Y_Inv', 'Nucleus_Roundness', 'Nucleus_Size', 'Cell_Size',
                   'ROI_index', 'Sample_ID', 'replicate_ID', 'Cell_ID', 'cell_type', 'cell_subtype', 'cluster', 'ID',
                   'Cytoplasm_Size', 'immune_checkpoint', 'Unique_ROI_index', 'Patient', 'Primary_chem(1)_vs_surg(0)']

# Verify size
print("Verifying data read from file is the correct length...\n")
verify_line_no(filename, len(not_intensities))
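
# verify_line_no() is provided by my_modules; as a rough, hypothetical sketch of
# the assumed behaviour (not the actual implementation), it compares the number
# of lines in the file against an expected count and warns on mismatch:
def _verify_line_no_sketch(filename, expected_n_lines):
    with open(filename) as fh:
        n_lines = sum(1 for _ in fh)  # count the lines in the file
    if n_lines != expected_n_lines:
        print(f"WARNING: {filename} has {n_lines} lines; expected {expected_n_lines}.")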

# Print to console
print("not_intensities =\n", not_intensities)


# ### II.3.3. FULL_TO_SHORT_COLUMN_NAMES

# In[11]:


# FULL_TO_SHORT_COLUMN_NAMES
filename = "full_to_short_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]

# Print information
print('full_to_short_names =\n', full_to_short_names)


# ### II.3.4. SHORT_TO_FULL_COLUMN_NAMES

# In[12]:


# SHORT_TO_FULL_COLUMN_NAMES
filename = "short_to_full_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]

# Print information
print('short_to_full_names =\n', short_to_full_names)


# ### II.3.5. SAMPLES COLORS

# In[13]:


# COLORS INFORMATION
filename = "sample_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])


# Our tuple of float values for rgb, (r, g, b), was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats.
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)
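
# rgb_tuple_from_str() also comes from my_modules; a minimal, hypothetical sketch
# of the assumed parsing (the real helper may differ):
def _rgb_tuple_from_str_sketch(rgb_str):
    # "(0.1, 0.2, 0.3)" -> (0.1, 0.2, 0.3)
    return tuple(float(v) for v in rgb_str.strip("()").split(","))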

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
sample_color_dict = df.set_index('Sample_ID')['rgb'].to_dict()

# Print information
print('sample_color_dict =\n', sample_color_dict)
sample_color_dict = pd.DataFrame.from_dict(sample_color_dict, orient='index', columns=['R', 'G', 'B'])


# In[14]:


sample_color_dict


# ### II.3.6. CHANNELS COLORS

# In[15]:


# CHANNELS
filename = "channel_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Convert the rgb strings back into float tuples, as above
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
channel_color_dict = df.set_index('Channel')['rgb'].to_dict()

# Print information
print('channel_color_dict =\n', channel_color_dict)
channel_color_dict = pd.DataFrame.from_dict(channel_color_dict, orient='index', columns=['R', 'G', 'B'])


# In[16]:


channel_color_dict


# ### II.3.7. ROUNDS COLORS

# In[17]:


# ROUND
filename = "round_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Convert the rgb strings back into float tuples, as above
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
round_color_dict = df.set_index('Round')['rgb'].to_dict()

# Print information
print('round_color_dict =\n', round_color_dict)
round_color_dict = pd.DataFrame.from_dict(round_color_dict, orient='index', columns=['R', 'G', 'B'])


# In[18]:


round_color_dict


# ### II.3.8. DATA

# In[19]:


# DATA
# List the files in the input directory
# Check the directory exists first
if os.path.exists(input_data_dir):
    ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_qc_eda.csv")]

    print("The following CSV files were detected:")
    print([sample for sample in ls_samples])
else:
    print(f"The directory {input_data_dir} does not exist.")


# In[20]:


# Import all the other files
dfs = {}

# Set variable to hold default header values
# First gather information on expected headers using the first file in ls_samples
# Read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
expected_headers = df.columns.values
print(expected_headers)

###############################
# !! This may take a while !! #
###############################
# Iterate over a copy so that removing a sample doesn't skip the next one
for sample in ls_samples[:]:
    file_path = os.path.join(input_data_dir, sample)

    try:
        # Read the CSV file
        df = pd.read_csv(file_path, index_col=0)
    except pd.errors.EmptyDataError:
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)
        continue

    # If the DataFrame is empty, don't continue trying to process it
    if df.empty:
        continue

    # Reorder the columns to match the expected headers list
    df = df.reindex(columns=expected_headers)
    print(sample, "file is processed!\n")
    #print(df)

    # Add df to dfs
    dfs[sample] = df

#print(dfs)


# In[21]:


# Merge dfs into one df
df = pd.concat(dfs.values(), ignore_index=False, sort=False)
#del dfs
df.head()


# In[22]:


df.shape


# In[23]:


# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries
# True means NaN entries
df.isnull().any().any()


# ## II.4. *FILTERING

# In[24]:


print("Number of cells before filtering :", df.shape[0])
cells_before_filter = f"Number of cells before filtering :{df.shape[0]}"


# In[25]:


#print(df)


# In[26]:


# Delete small cells and objects with a high AF555 signal (RBCs)
# We usually use the 95th percentile calculated during QC/EDA
df = df.loc[(df['Nucleus_Size'] > 42)]
df = df.loc[(df['Nucleus_Size'] < 216)]
print("Number of cells after filtering on nucleus size:", df.shape[0])
cells_after_filter_nucleus = f"Number of cells after filtering on nucleus size: {df.shape[0]}"

df = df.loc[(df['AF555_Cell_Intensity_Average'] < 2000)]
print("Number of cells after filtering on AF555 intensity:", df.shape[0])
cells_after_filter_intensity = f"Number of cells after filtering on AF555 intensity: {df.shape[0]}"


# In[27]:


# Assign cell type
# Assign a cell type to each row at first (random assignment here just for development purposes)
# Generate random values for the cell_type column
random_values = np.random.randint(0, 10, size=len(df))

# Assign cell type based on random values
# (note: the argument n is currently unused; every call draws a fresh random type)
def assign_cell_type(n):
    return np.random.choice(['STROMA', 'CANCER', 'IMMUNE', 'ENDOTHELIAL'])

df['cell_type'] = np.vectorize(assign_cell_type)(random_values)
df['cell_subtype'] = df['cell_type'].copy()


# In[28]:


filtered_dataframe = df
df.head()


# In[29]:


quality_control_df = filtered_dataframe


# In[30]:


def check_index_format(index_str, ls_samples):
    """
    Checks whether the given index string follows the expected format.

    Args:
        index_str (str): The index string to be checked.
        ls_samples (list): A list of valid sample file names.

    Returns:
        bool: True if the index string follows the format, False otherwise.
    """
    # Split the index string into parts
    parts = index_str.split('_')

    # Check that there are exactly 3 parts
    if len(parts) != 3:
        print(len(parts))
        return False

    # Check that the first part is in ls_samples
    sample_name = parts[0]
    if f'{sample_name}_qc_eda.csv' not in ls_samples:
        print(sample_name)
        return False

    # Check that the second part is in ['Cell', 'Cytoplasm', 'Nucleus']
    location = parts[1]
    valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
    if location not in valid_locations:
        print(location)
        return False

    # Check that the third part is a number
    try:
        int(parts[2])
    except ValueError:
        print(parts[2])
        return False

    # If all checks pass, return True
    return True
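
# Example (hypothetical sample name): if 'SampleA_qc_eda.csv' is in ls_samples,
# an index like 'SampleA_Cell_42' passes, while 'SampleA_cell_42' fails on the
# capitalisation of the location part.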


# In[31]:


# Let's take a look at a few features to make sure our dataframe is as expected
df.index

def check_format_ofindex(index):
    for idx in index:
        check_index = check_index_format(idx, ls_samples)
        if check_index is False:
            index_format = "Bad"
            return index_format

    index_format = "Good"
    return index_format

print(check_format_ofindex(df.index))


# In[32]:


import panel as pn
import pandas as pd

def quality_check(file, not_intensities):
    # Load the output file
    df = file

    # Check index
    check_index = check_format_ofindex(df.index)

    # Check shape
    check_shape = df.shape

    # Check for NaN entries
    check_no_null = df.isnull().any().any()

    mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
    if (mean_intensity == 0).any():
        df = df.loc[mean_intensity > 0, :]
        print("df.shape after removing 0 mean values: ", df.shape)
        check_zero_intensities = f'Shape after removing 0 mean values: {df.shape}'
    else:
        print("No zero intensity values.")
        check_zero_intensities = "No zero intensity values."

    # Create a quality check results table
    quality_check_results_table = pd.DataFrame({
        'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
        'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
    })

    # Create a quality check results component
    quality_check_results_component = pn.Card(
        pn.pane.DataFrame(quality_check_results_table),
        title="Quality Control Results",
        header_background="#2196f3",
        header_color="white",
    )

    return quality_check_results_component


# ## II.5. CELL TYPES COLORS
# Establish colors to use throughout the workflow

# We want colors that are categorical, since cell type is a non-ordered category.
# A categorical color palette will have dissimilar colors.
# Get those unique colors
cell_types = ['STROMA', 'CANCER', 'IMMUNE', 'ENDOTHELIAL']
color_values = sb.color_palette("hls", n_colors=len(cell_types))
# each color value is a tuple of three values: (R, G, B)

print("Unique cell types are:", df.cell_type.unique())
# Display those unique colors
sb.palplot(sb.color_palette(color_values))

# In[33]:


# Define your custom colors for each cell type
custom_colors = {
    'CANCER': (0.1333, 0.5451, 0.1333),
    'STROMA': (0.4, 0.4, 0.4),
    'IMMUNE': (1, 1, 0),
    'ENDOTHELIAL': (0.502, 0, 0.502)
}

# Retrieve the list of cell types
cell_types = list(custom_colors.keys())

# Extract the corresponding colors from the dictionary
color_values = [custom_colors[cell] for cell in cell_types]

# Display the colors
sb.palplot(sb.color_palette(color_values))


# In[34]:


# Store in a dictionary
celltype_color_dict = dict(zip(cell_types, color_values))
celltype_color_dict


# In[35]:


celltype_color_df = pd.DataFrame.from_dict(celltype_color_dict, orient='index', columns=['R', 'G', 'B'])


# In[36]:


# Save color information (mapping and legend) to the metadata directory
# Create dataframe (this overwrites the R/G/B frame built above)
celltype_color_df = color_dict_to_df(celltype_color_dict, "cell_type")
celltype_color_df.head()
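
# color_dict_to_df() is a my_modules helper; a plausible sketch (assumption, not
# the actual implementation): flatten the dict into one row per entry, keyed by
# the column named in the second argument:
def _color_dict_to_df_sketch(color_dict, key_name):
    rows = [{key_name: k, 'color': v} for k, v in color_dict.items()]
    return pd.DataFrame(rows)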

# Save to file in the metadata directory
filename = "celltype_color_data.csv"
filename = os.path.join(metadata_dir, filename)
celltype_color_df.to_csv(filename, index=False)
print("File " + filename + " was created!")


# In[37]:


celltype_color_df.head()


# In[38]:


# Legend of cell type info only
g = plt.figure(figsize=(1, 1)).add_subplot(111)
g.axis('off')
handles = []
for item in celltype_color_dict.keys():
    h = g.bar(0, 0, color=celltype_color_dict[item],
              label=item, linewidth=0)
    handles.append(h)
first_legend = plt.legend(handles=handles, loc='upper right', title='Cell type')


filename = "Celltype_legend.png"
filename = os.path.join(metadata_images_dir, filename)
plt.savefig(filename, bbox_inches='tight')


# In[39]:


metadata


# In[40]:


df.columns.values


# In[41]:


df.shape


# In[42]:


metadata.shape


# ## II.6. *CELL SUBTYPES COLORS

# In[43]:


# Establish colors to use throughout the workflow

# We want colors that are categorical, since cell subtype is a non-ordered category.
# A categorical color palette will have dissimilar colors.
# Get those unique colors
cell_subtypes = ['DC', 'B', 'TCD4', 'TCD8', 'M1', 'M2', 'Treg',
                 'IMMUNE_OTHER', 'CANCER', 'αSMA_myCAF',
                 'STROMA_OTHER', 'ENDOTHELIAL']
color_values = sb.color_palette("Paired", n_colors=len(cell_subtypes))
# each color value is a tuple of three values: (R, G, B)

print("Unique cell subtypes are:", df.cell_subtype.unique())
# Display those unique colors
sb.palplot(sb.color_palette(color_values))


# In[44]:


# Store in a dictionary
cellsubtype_color_dict = dict(zip(cell_subtypes, color_values))
cellsubtype_color_dict


# In[45]:


cellsubtype_color_df = pd.DataFrame.from_dict(cellsubtype_color_dict, orient='index', columns=['R', 'G', 'B'])


# In[46]:


# Save color information (mapping and legend) to the metadata directory
# Create dataframe (this overwrites the R/G/B frame built above)
cellsubtype_color_df = color_dict_to_df(cellsubtype_color_dict, "cell_subtype")

# Save to file in the metadata directory
filename = "cellsubtype_color_data.csv"
filename = os.path.join(metadata_dir, filename)
cellsubtype_color_df.to_csv(filename, index=False)
print("File " + filename + " was created!")


# In[47]:


cellsubtype_color_df.head()


# In[48]:


# Legend of cell subtype info only
g = plt.figure(figsize=(1, 1)).add_subplot(111)
g.axis('off')
handles = []
for item in cellsubtype_color_dict.keys():
    h = g.bar(0, 0, color=cellsubtype_color_dict[item],
              label=item, linewidth=0)
    handles.append(h)
first_legend = plt.legend(handles=handles, loc='upper right', title='Cell subtype')


filename = "Cellsubtype_legend.png"
filename = os.path.join(metadata_images_dir, filename)
plt.savefig(filename, bbox_inches='tight')


# ## II.7. IMMUNE CHECKPOINT COLORS

# In[49]:


# Assign immune checkpoints
df['cell_subtype'] = df['cell_type'].copy()
df['immune_checkpoint'] = 'none'
df

immune_checkpoint = ['B7H4', 'PDL1', 'PD1', 'None']
color_values = sb.color_palette("husl", n_colors=len(immune_checkpoint))
# each color value is a tuple of three values: (R, G, B)

print("Unique immune checkpoints are:", df.immune_checkpoint.unique())
# Display those unique colors
sb.palplot(sb.color_palette(color_values))

# In[50]:


immune_checkpoint = ['B7H4', 'PDL1', 'PD1', 'B7H4_PDL1', 'None']

# Base colors for the primary checkpoints
base_colors = sb.color_palette("husl", n_colors=3)  # Three distinct colors

# Function to mix two RGB colors by averaging them channel-wise
def mix_colors(color1, color2):
    return tuple((c1 + c2) / 2 for c1, c2 in zip(color1, color2))
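
# e.g. mix_colors((1.0, 0.0, 0.0), (0.0, 0.0, 1.0)) returns (0.5, 0.0, 0.5)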

# Generate mixed colors for the combinations of checkpoints
mixed_colors = [
    mix_colors(base_colors[0], base_colors[1]),  # Mix B7H4 and PDL1
    # mix_colors(base_colors[0], base_colors[2]),  # Mix B7H4 and PD1
    # mix_colors(base_colors[1], base_colors[2]),  # Mix PDL1 and PD1
    tuple(np.mean(base_colors, axis=0))  # Mix B7H4, PDL1, and PD1
]

# Adding the color for 'None'
#none_color = [(0.8, 0.8, 0.8)]  # A shade of gray

# Combine all colors into one list
color_values = base_colors + mixed_colors  #+ none_color

# Display unique immune checkpoint combinations
print("Unique immune checkpoint combinations are:", immune_checkpoint)
# Display the unique colors
sb.palplot(color_values)


# In[51]:


# Store in a dictionary
immunecheckpoint_color_dict = dict(zip(immune_checkpoint, color_values))
immunecheckpoint_color_dict


# In[52]:


# Save color information (mapping and legend) to the metadata directory
# Create dataframe
immunecheckpoint_color_df = color_dict_to_df(immunecheckpoint_color_dict, "immune_checkpoint")
immunecheckpoint_color_df.head()

# Save to file in the metadata directory
filename = "immunecheckpoint_color_data.csv"
filename = os.path.join(metadata_dir, filename)
immunecheckpoint_color_df.to_csv(filename, index=False)
print("File " + filename + " was created!")


# In[53]:


# Legend of immune checkpoint info only
g = plt.figure(figsize=(1, 1)).add_subplot(111)
g.axis('off')
handles = []
for item in immunecheckpoint_color_dict.keys():
    h = g.bar(0, 0, color=immunecheckpoint_color_dict[item],
              label=item, linewidth=0)
    handles.append(h)
first_legend = plt.legend(handles=handles, loc='upper right', title='Immune checkpoint')


filename = "Immunecheckpoint_legend.png"
filename = os.path.join(metadata_images_dir, filename)
plt.savefig(filename, bbox_inches='tight')


# ## II.8. BACKGROUND SUBTRACTION

# In[54]:


def do_background_sub(col, df, metadata):
    # For each intensity column, find the autofluorescence (AF) column measured
    # on the same channel and localisation, and subtract it cell by cell
    #print(col.name)
    location = metadata.loc[metadata['full_column'] == col.name, 'localisation'].values[0]
    #print('location = ' + location)
    channel = metadata.loc[metadata['full_column'] == col.name, 'Channel'].values[0]
    #print('channel = ' + channel)
    af_target = metadata.loc[
        (metadata['Channel'] == channel)
        & (metadata['localisation'] == location)
        & (metadata['target_lower'].str.contains(r'^af\d{3}$')),
        'full_column'].values[0]
    return col - df.loc[:, af_target]
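
# Toy example with illustrative values and a hypothetical marker: if
# 'CK7_Cell_Intensity_Average' was measured on the same channel and localisation
# as 'AF555_Cell_Intensity_Average', then a cell with CK7 = 1200 and AF555 = 200
# comes out with a background-subtracted CK7 of 1000.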


# In[55]:


metadata_with_localisation = metadata
metadata_with_localisation


# In[56]:


# Normalization
df.loc[:, ~df.columns.isin(not_intensities)] = \
    df.loc[:, ~df.columns.isin(not_intensities)].apply(lambda column: divide_exp_time(column, 'Exp', metadata), axis=0)
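
# divide_exp_time() is another my_modules helper; a minimal, hypothetical sketch
# of the assumed behaviour (divide each intensity column by the exposure time
# recorded for it in the metadata, here in the 'Exp' column):
def _divide_exp_time_sketch(col, exp_col_name, metadata):
    exp_time = metadata.loc[metadata['full_column'] == col.name, exp_col_name].values[0]
    return col / exp_time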


# In[57]:


normalization_df = df
normalization_df.head()


# In[58]:


# Do background subtraction
# This uses a df (metadata) outside of the scope of the lambda...
# careful that this might break inside of a script...

df.loc[:, ~df.columns.isin(not_intensities)] = \
    df.loc[:, ~df.columns.isin(not_intensities)].apply(lambda column: do_background_sub(column, df, metadata), axis=0)


# In[59]:


df
background_substraction_df = df
background_substraction_df.head()


# In[60]:


# Drop the AF columns (no longer needed once subtraction is done)
df = df.filter(regex=r'^(?!AF\d{3}).*')
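# The negative lookahead keeps every column whose name does NOT start with 'AF'
# plus three digits: e.g. 'AF555_Cell_Intensity_Average' is dropped, while
# columns such as 'Nucleus_Size' are kept.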
print(df.columns.values)


# In[61]:


intensities_df = df.loc[:, ~df.columns.isin(not_intensities)]
intensities_df


# In[62]:


normalization_df.head()


# In[63]:


metadata_df = metadata_with_localisation
intensities_df = intensities_df  # Assuming you have loaded the intensities DataFrame

# Create a list of column names from the intensities DataFrame
column_names = intensities_df.columns.tolist()

# Create a Select widget for choosing a column
column_selector = pn.widgets.Select(name='Select Column', options=column_names)

# Create a Markdown widget to display the selected column's information
column_info_md = pn.pane.Markdown(name='Column Information', width=400, object='Select a column to view its information.')

# Define a function to update the column information
def update_column_info(event):
    selected_column = event.new
    if selected_column:
        # Get the selected column's intensity values
        intensity = intensities_df[selected_column].values

        # Get the corresponding channel, localisation, and exposure from the metadata
        channel = metadata_df.loc[metadata_df['full_column'] == selected_column, 'Channel'].values[0]
        localization = metadata_df.loc[metadata_df['full_column'] == selected_column, 'localisation'].values[0]
        exposure = metadata_df.loc[metadata_df['full_column'] == selected_column, 'Exp'].values[0]

        # Create a Markdown string with the column information
        column_info_text = f"**Intensity:** {intensity}\n\n**Channel:** {channel}\n\n**Localization:** {localization}\n\n**Exposure:** {exposure}"

        # Update the Markdown widget with the column information
        column_info_md.object = column_info_text
    else:
        column_info_md.object = 'Select a column to view its information.'

# Watch for changes in the column selector and update the column information
column_selector.param.watch(update_column_info, 'value')

# Create a Panel app and display the widgets
bs_info = pn.Column(column_selector, column_info_md)
pn.extension()
bs_info.servable()


# In[64]:


normalization_df.head()


# In[65]:


import panel as pn
df_widget = pn.widgets.DataFrame(metadata, name="MetaData")
app2 = pn.template.GoldenTemplate(
    site="Cyc-IF",
    title="Background Subtraction",
    main=[pn.Tabs(
        ("Background-Subtraction", pn.Column(
            #pn.Column(pn.pane.Markdown("### Celltype thresholds"), pn.pane.DataFrame(celltype_color_df)),
            #pn.Column(pn.pane.Markdown("### Cell Subtype thresholds"), pn.pane.DataFrame(cellsubtype_color_df)),
            #pn.Column(pn.pane.Markdown("### Cells Before Filtering"), pn.pane.Str(cells_before_filter)),
            #pn.Column(pn.pane.Markdown("### Cells After Filtering Nucleus"), pn.pane.Str(cells_after_filter_nucleus)),
            #pn.Column(pn.pane.Markdown("### Cells After Filtering Intensity"), pn.pane.Str(cells_after_filter_intensity)),
            #pn.Column(pn.pane.Markdown("### Dataframe after filtering"), pn.pane.DataFrame(filtered_dataframe.head())),
            pn.Column(pn.pane.Markdown("### The metadata obtained that specifies the localisation:"), metadata_with_localisation.head(8)),
            pn.Column(pn.pane.Markdown("### The channels and exposure of each intensity column"), bs_info),
            pn.Column(pn.pane.Markdown("### Dataframe after performing normalization"), pn.pane.DataFrame(normalization_df.head(), width=1500)),
            pn.Column(pn.pane.Markdown("### Dataframe after background subtraction"), pn.Feed(background_substraction_df.head())),
        )),
        ("Quality Control", pn.Column(
            quality_check(quality_control_df, not_intensities)
            #pn.pane.Markdown("### The Quality check results are:"), quality_check_results(check_shape, check_no_null, check_all_expected_files_present, check_zero_intensities)
        )),
    )],
)


# In[66]:


# Note: ports below 1024 usually require elevated privileges; pick a higher port if this fails
app2.show(port=1003)


# ## II.9. SAVE

# In[67]:


# Save the data by Sample_ID
# Check for the existence of the output file first
for sample in ls_samples:
    sample_id = sample.split('_')[0]
    filename = os.path.join(output_data_dir, sample_id + "_" + step_suffix + ".csv")
    if os.path.exists(filename):
        print("File by name " + filename + " already exists.")
    else:
        sample_id_csv = sample_id + '.csv'
        df_save = df.loc[df['Sample_ID'] == sample_id_csv, :]
        #print(df_save)
        df_save.to_csv(filename, index=True, index_label='ID')  # index=True retains the index column
        print("File " + filename + " was created!")