KashyapiNagaHarshitha committed on
Commit e70c547 · verified · 1 Parent(s): fbc6b4f

Upload Z_Score.py

Files changed (1)
  1. Z_Score.py +1128 -0
Z_Score.py ADDED
@@ -0,0 +1,1128 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ import os
+ import random
+ import re
+ import pandas as pd
+ import numpy as np
+ import seaborn as sb
+ import matplotlib.pyplot as plt
+ import matplotlib.colors as mplc
+ import subprocess
+ import warnings
+ from scipy import signal
+ from scipy.stats import pearsonr
+ import plotly.figure_factory as ff
+ import plotly
+ import plotly.graph_objs as go
+ from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
+ import plotly.express as px
+ from my_modules import *
+ import panel as pn
+
+ # Silence FutureWarnings & UserWarnings
+ warnings.filterwarnings('ignore', category=FutureWarning)
+ warnings.filterwarnings('ignore', category=UserWarning)
+
+
+ # ## III.2. DIRECTORIES
+
+ # In[4]:
+
+
+ # Set base directory
+
+ ##### MAC WORKSTATION #####
+ #base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
+ ###########################
+
+ ##### WINDOWS WORKSTATION #####
+ #base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
+ ###############################
+
+ ##### LOCAL WORKSTATION #####
+ base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
+ #############################
+
+ #set_name = 'Set_A'
+ set_name = 'test'
+
+
+ # In[5]:
+
+
+ base_dir = '/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
+ set_path = 'test'
+ selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
+ ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']
+ print(base_dir)
+ print(set_path)
+ print(ls_samples)
+ print(selected_metadata_files)
+
+ project_name = set_name  # Project name
+ step_suffix = 'zscore'  # Current part (here part III)
+ previous_step_suffix_long = "_bs"  # Previous part (here BS NOTEBOOK)
+
+ # Initial input data directory
+ input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)
+
+ # ZSCORE/LOG2 output directories
+ output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
+ # ZSCORE/LOG2 images subdirectory
+ output_images_dir = os.path.join(output_data_dir, "images")
+
+ # Data and Metadata directories
+ # Metadata directories
+ metadata_dir = os.path.join(base_dir, project_name + "_metadata")
+ # images subdirectory
+ metadata_images_dir = os.path.join(metadata_dir, "images")
+
+ # Create directories if they don't already exist
+ for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
+     if not os.path.exists(d):
+         print("Creation of the", d, "directory...")
+         os.makedirs(d)
+     else:
+         print("The", d, "directory already exists!")
+
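+ # Editorial note: an equivalent, more compact form of the loop above relies on
+ # os.makedirs' exist_ok flag, which silently skips directories that already exist:
+ #     os.makedirs(d, exist_ok=True)
+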
+ os.chdir(input_data_dir)
+
+
+ # In[7]:
+
+
+ # Verify paths
+ print('base_dir :', base_dir)
+ print('input_data_dir :', input_data_dir)
+ print('output_data_dir :', output_data_dir)
+ print('output_images_dir :', output_images_dir)
+ print('metadata_dir :', metadata_dir)
+ print('metadata_images_dir :', metadata_images_dir)
+
+
+ # ## III.3. FILES
+ # Don't forget to put your data in the projname_data directory!
+ # ### III.3.1. METADATA
+
+ # In[8]:
+
+
+ # Import all metadata we need from the BS chapter
+
+ # METADATA
+ filename = "marker_intensity_metadata.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ metadata = pd.read_csv(filename)
+
+ # Verify size with verify_line_no() function in my_modules.py
+ #verify_line_no(filename, metadata.shape[0] + 1)
+
+ # Verify headers
+ exp_cols = ['Round','Target','Channel','target_lower','full_column','marker','localisation']
+ compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")
+
+ metadata = metadata.dropna()
+ metadata.head()
+
+
+ # ### III.3.2. NOT_INTENSITIES
+
+ # In[9]:
+
+
+ filename = "not_intensities.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ not_intensities = []
+ with open(filename, 'r') as fh:
+     not_intensities = fh.read().strip().split("\n")
+     # take str, strip whitespace, split on new line character
+
+ # Verify size
+ print("Verifying data read from file is the correct length...\n")
+ #verify_line_no(filename, len(not_intensities))
+
+ # Print to console
+ print("not_intensities =\n", not_intensities)
+ pd.DataFrame(not_intensities)
+
+
+ # ### III.3.3. FULL_TO_SHORT_COLUMN_NAMES
+
+ # In[10]:
+
+
+ filename = "full_to_short_column_names.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ df = pd.read_csv(filename, header=0)
+
+ # Verify size
+ print("Verifying data read from file is the correct length...\n")
+ #verify_line_no(filename, df.shape[0] + 1)
+
+ # Turn into dictionary
+ full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]
+
+ # CD45 instead of CD45b
+ if project_name == 'Slide_A':
+     full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = full_to_short_names.pop('CD45b_Cytoplasm_Intensity_Average')
+     full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = 'CD45_Cytoplasm'
+
+ # Print information
+ print('full_to_short_names =\n', full_to_short_names)
+
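+ # Editorial sketch: the set_index().T.to_dict('records')[0] idiom above collapses a
+ # two-column mapping table into a plain dict; on a toy frame (illustrative names)
+ # it is equivalent to dict(zip(...)):
+ _demo = pd.DataFrame({'full_name': ['CD45_Cytoplasm_Intensity_Average'], 'short_name': ['CD45_Cytoplasm']})
+ assert _demo.set_index('full_name').T.to_dict('records')[0] == dict(zip(_demo['full_name'], _demo['short_name']))
+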
+ # ### III.3.4. SHORT_TO_FULL_COLUMN_NAMES
+
+ # In[11]:
+
+
+ filename = "short_to_full_column_names.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ df = pd.read_csv(filename, header=0)
+
+ # Verify size
+ print("Verifying data read from file is the correct length...\n")
+ #verify_line_no(filename, df.shape[0] + 1)
+
+ # Turn into dictionary
+ short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]
+
+ # CD45 instead of CD45b
+ if project_name == 'Slide_A':
+     short_to_full_names['CD45_Cytoplasm'] = short_to_full_names.pop('CD45b_Cytoplasm')
+     short_to_full_names['CD45_Cytoplasm'] = 'CD45_Cytoplasm_Intensity_Average'
+
+ # Print information
+ print('short_to_full_names =\n', short_to_full_names)
+
+
+ # ### III.3.5. SAMPLES COLORS
+
+ # In[12]:
+
+
+ filename = "sample_color_data.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ df = pd.read_csv(filename, header=0)
+ df = df.drop(columns=['hex'])
+
+ # Our tuple of float values for rgb, (r, g, b), was read in
+ # as a string '(r, g, b)'. We need to extract the r-, g-, and b-
+ # substrings and convert them back into floats
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)
+
+ # Verify size
+ print("Verifying data read from file is the correct length...\n")
+ #verify_line_no(filename, df.shape[0] + 1)
+
+ # Turn into dictionary
+ sample_color_dict = df.set_index('Sample_ID')['rgb'].to_dict()
+
+ # Print information
+ print('sample_color_dict =\n', sample_color_dict)
+
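+ # rgb_tuple_from_str is provided by my_modules (imported above, not shown in this
+ # file); a minimal sketch of what such a parser presumably does — hypothetical
+ # reference only, not the module's actual code:
+ def _rgb_tuple_from_str_sketch(rgb_str):
+     # "(0.1, 0.2, 0.3)" -> (0.1, 0.2, 0.3)
+     return tuple(float(x) for x in rgb_str.strip('() ').split(','))
+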
+ # ### III.3.6. CHANNELS COLORS
+
+ # In[13]:
+
+
+ filename = "channel_color_data.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ df = pd.read_csv(filename, header=0)
+ df = df.drop(columns=['hex'])
+
+ # Convert the 'rgb' strings back into float tuples
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)
+
+ # Verify size
+ print("Verifying data read from file is the correct length...\n")
+ #verify_line_no(filename, df.shape[0] + 1)
+
+ # Turn into dictionary
+ channel_color_dict = df.set_index('Channel')['rgb'].to_dict()
+
+ # Print information
+ print('channel_color_dict =\n', channel_color_dict)
+
+
+ # ### III.3.7. ROUNDS COLORS
+
+ # In[14]:
+
+
+ # ROUND
+ filename = "round_color_data.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ df = pd.read_csv(filename, header=0)
+ df = df.drop(columns=['hex'])
+
+ # Convert the 'rgb' strings back into float tuples
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)
+
+ # Verify size
+ print("Verifying data read from file is the correct length...\n")
+ #verify_line_no(filename, df.shape[0] + 1)
+
+ # Turn into dictionary
+ round_color_dict = df.set_index('Round')['rgb'].to_dict()
+
+ # Print information
+ print('round_color_dict =\n', round_color_dict)
+
+
+ # ### III.3.8. CELL TYPES COLORS
+
+ # In[15]:
+
+
+ data = pd.read_csv('/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata/celltype_color_data.csv')
+ data
+
+
+ # In[16]:
+
+
+ filename = "celltype_color_data.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ df = pd.read_csv(filename, header=0)
+ #df = df.drop(columns = ['hex'])
+
+ # Here the RGB values are already in separate columns 'R', 'G', 'B'
+ if all(col in df.columns for col in ['R', 'G', 'B']):
+     # Create the 'rgb' column as tuples
+     df['rgb'] = list(zip(df['R'], df['G'], df['B']))
+
+ # (The string-to-tuple conversion is not needed here)
+ #df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
+
+ # Verify size
+ print("Verifying data read from file is the correct length...\n")
+ #verify_line_no(filename, df.shape[0] + 1)
+
+ # Turn into dictionary
+ cell_type_color_dict = df.set_index('cell_type')['rgb'].to_dict()
+
+ # Print information
+ print('cell_type_color_dict =\n', cell_type_color_dict)
+
+
+ # ### III.3.9. CELL SUBTYPES COLORS
+
+ # In[17]:
+
+
+ df = pd.read_csv(filename)
+ df.head()
+
+
+ # In[18]:
+
+
+ filename = "cellsubtype_color_data.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ df = pd.read_csv(filename, header=0)
+ df = df.drop(columns=['hex'])
+
+ # Convert the 'rgb' strings back into float tuples
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)
+
+ # Verify size
+ print("Verifying data read from file is the correct length...\n")
+ #verify_line_no(filename, df.shape[0] + 1)
+
+ # Turn into dictionary
+ cell_subtype_color_dict = df.set_index('cell_subtype')['rgb'].to_dict()
+
+ # Print information
+ print('cell_subtype_color_dict =\n', cell_subtype_color_dict)
+
+
+ # In[19]:
+
+
+ df = pd.read_csv(filename)
+ df.head()
+
+
+ # ### III.3.10. IMMUNE CHECKPOINT COLORS
+
+ # In[20]:
+
+
+ metadata_dir = "/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata"
+ filename = "immunecheckpoint_color_data.csv"
+ filename = os.path.join(metadata_dir, filename)
+
+ # Check file exists
+ if not os.path.exists(filename):
+     print("WARNING: Could not find desired file: " + filename)
+ else:
+     print("The", filename, "file was imported for further analysis!")
+
+ # Open, read in information
+ df = pd.read_csv(filename, header=0)
+ df = df.drop(columns=['hex'])
+
+ # Convert the 'rgb' column from string to tuple
+ df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)
+
+ # Verify size
+ print("Verifying data read from file is the correct length...\n")
+ #verify_line_no(filename, df.shape[0] + 1)
+
+ # Turn into dictionary
+ immune_checkpoint_color_dict = df.set_index('immune_checkpoint')['rgb'].to_dict()
+
+ # Print information
+ print('immune_checkpoint_color_dict =\n', immune_checkpoint_color_dict)
+ immune_checkpoint_color_df = pd.DataFrame(immune_checkpoint_color_dict)
+ immune_checkpoint_color_df
+
+
+ # ### III.3.11. DATA
+
+ # In[21]:
+
+
+ # DATA
+ # Check if the directory exists, then list the sample files it contains
+ if os.path.exists(input_data_dir):
+     # List files in the directory
+     ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_bs.csv")]
+     print("The following CSV files were detected:")
+     print(ls_samples)
+ else:
+     print(f"The directory {input_data_dir} does not exist.")
+
+
+ # In[22]:
+
+
+ # Import all the other files
+ dfs = {}
+
+ # Set variable to hold default header values
+ # First gather information on expected headers using the first file in ls_samples:
+ # read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
+ df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
+ expected_headers = df.columns.values
+ #print(expected_headers)
+
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ # Iterate over a copy of the list so that removing a sample inside the loop is safe
+ for sample in list(ls_samples):
+     file_path = os.path.join(input_data_dir, sample)
+     print(file_path)
+     try:
+         # Read the CSV file
+         df = pd.read_csv(file_path, index_col=0)
+         # Check if the DataFrame is empty; if so, don't continue trying to process it
+         if not df.empty:
+             # Reorder the columns to match the expected headers list
+             df = df.reindex(columns=expected_headers)
+             print(sample, "file is processed!\n")
+             #print(df)
+     except pd.errors.EmptyDataError:
+         print(f'\nEmpty data error in {sample} file. Removing from analysis...')
+         ls_samples.remove(sample)
+         continue
+
+     # Add df to dfs
+     dfs[sample] = df
+
+ #print(dfs)
+
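+ # Editorial sketch of why the reindex above matters: columns missing from a file
+ # come back as all-NaN columns (caught by the isnull() checks below), and columns
+ # not in expected_headers are dropped. Toy example with illustrative names:
+ _demo = pd.DataFrame({'A': [1], 'C': [3]}).reindex(columns=['A', 'B'])
+ # _demo now has columns ['A', 'B'], where 'B' is NaN and 'C' was discarded.
+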
+ # In[23]:
+
+
+ # Merge dfs into one df
+ df = pd.concat(dfs.values(), ignore_index=False, sort=False)
+ del dfs
+ merged_df = df
+
+
+ # In[24]:
+
+
+ merged_df
+
+
+ # In[25]:
+
+
+ merged_df_shape = df.shape
+
+
+ # In[26]:
+
+
+ merged_df_index = df.index
+
+
+ # In[27]:
+
+
+ merged_df_col_values = df.columns.values
+
+
+ # In[28]:
+
+
+ # Check for NaN entries (should not be any unless columns do not align)
+ # False means no NaN entries
+ # True means NaN entries
+ merged_df_null_values = df.isnull().any().any()
+
+
+ # In[29]:
+
+
+ df.isnull().any().any()
+
+
+ # ## III.4. MARKERS
+
+ # In[30]:
+
+
+ # Listing all the markers of interest for downstream analyses
+ # !!TODO WITH MARILYNE!!
+ markers = [
+     '53BP1_Nucleus_Intensity_Average',
+     'AR_Nucleus_Intensity_Average',
+     'CCNB1_Cell_Intensity_Average',
+     'CCND1_Nucleus_Intensity_Average',
+     'CCNE_Nucleus_Intensity_Average',
+     'CD31_Cytoplasm_Intensity_Average',
+     'CKs_Cytoplasm_Intensity_Average',
+     'ERa_Nucleus_Intensity_Average',
+     'Ecad_Cytoplasm_Intensity_Average',
+     'GATA3_Nucleus_Intensity_Average',
+     'H3K27_Nucleus_Intensity_Average',
+     'H3K4me3_Nucleus_Intensity_Average',
+     'HER2_Cytoplasm_Intensity_Average',
+     'HSP90_Cell_Intensity_Average',
+     'Ki67_Nucleus_Intensity_Average',
+     'PAX8_Nucleus_Intensity_Average',
+     'PCNA_Nucleus_Intensity_Average',
+     'PRg_Nucleus_Intensity_Average',
+     'S100b_Cytoplasm_Intensity_Average',
+     'TP53_Cell_Intensity_Average',
+     'Vimentin_Cytoplasm_Intensity_Average',
+     'pAKT_Cytoplasm_Intensity_Average',
+     'pATM_Nucleus_Intensity_Average',
+     'pATR_Nucleus_Intensity_Average',
+     'pERK_Cell_Intensity_Average',
+     'pRB_Nucleus_Intensity_Average',
+     'pS6_Cytoplasm_Intensity_Average',
+     'AXL_Cytoplasm_Intensity_Average',
+     'B7H4_Cell_Intensity_Average',
+     'CD11c_Cytoplasm_Intensity_Average',
+     'CD163_Cytoplasm_Intensity_Average',
+     'CD20_Cytoplasm_Intensity_Average',
+     'CD44_Cytoplasm_Intensity_Average',
+     'CD45_Cytoplasm_Intensity_Average',
+     'CD45b_Cytoplasm_Intensity_Average',
+     'CD4_Cytoplasm_Intensity_Average',
+     'CD68_Cytoplasm_Intensity_Average',
+     'CD8_Cytoplasm_Intensity_Average',
+     'ColVI_Cytoplasm_Intensity_Average',
+     'Desmin_Cytoplasm_Intensity_Average',
+     'FOXP3_Nucleus_Intensity_Average',
+     'Fibronectin_Cytoplasm_Intensity_Average',
+     'HLA_Cytoplasm_Intensity_Average',
+     'MMP9_Cytoplasm_Intensity_Average',
+     'PD1_Cytoplasm_Intensity_Average',
+     'PDGFR_Cytoplasm_Intensity_Average',
+     'PDL1_Cytoplasm_Intensity_Average',
+     'Sting_Cytoplasm_Intensity_Average',
+     'aSMA_Cytoplasm_Intensity_Average'
+ ]
+
+
+ # In[31]:
+
+
+ # Check if all columns in the markers list are present in the DataFrame
+ missing_columns = [col for col in markers if col not in df.columns]
+ if missing_columns:
+     # Columns can be missing because those markers are present on the other slide
+     print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \n{missing_columns}\n")
+     # Keep only the columns that are in the markers list and also exist in the DataFrame
+     intersected_columns = list(set(markers).intersection(df.columns))
+     df_markers = df[intersected_columns]
+ else:
+     # Filter the DataFrame to keep only the columns in the markers list
+     df_markers = df[markers]
+
+ initial_df_marker = df_markers
+ df_markers.head()
+
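+ # Editorial note: set(markers).intersection(df.columns) above does not preserve
+ # the order of the markers list; if column order matters downstream, an
+ # order-preserving alternative is:
+ #     intersected_columns = [col for col in markers if col in df.columns]
+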
+ # In[32]:
+
+
+ # Rename CD45b into CD45 (Slide A!)
+ if project_name == 'Slide_A':
+     df_markers.rename(columns={"CD45b_Cytoplasm_Intensity_Average": "CD45_Cytoplasm_Intensity_Average"}, inplace=True)
+ df_markers.columns.values
+
+
+ # In[33]:
+
+
+ df_markers.shape
+
+
+ # In[34]:
+
+
+ min_values = df_markers.min().tolist()
+ min_values
+
+
+ # In[35]:
+
+
+ # Keep not_intensities and markers columns
+ # Combine both lists
+ combined_columns = list(set(markers) | set(not_intensities))
+
+ # Filter the DataFrame to keep only the combined columns present in both df and combined_columns
+ df_markers_not_intensities = df[df.columns.intersection(combined_columns)]
+
+
+ # In[36]:
+
+
+ df_markers_not_intensities
+
+
+ # In[37]:
+
+
+ df_markers_not_intensities.shape
+
+
+ # ## III.5. NORMALISATION
+
+ # In[38]:
+
+
+ df_markers.min().tolist()
+
+
+ # In[39]:
+
+
+ '''# LOG2 TRANSFORMATION
+ # Values need to be higher than 0 for log2 transformation.
+ print("df_marker.shape before normalisation: ", df_markers.shape)
+ df_marker_shape_before_norm = df_markers.shape
+
+ # Option 1
+ # This step might not be the best approach because it creates patterns in the data:
+ # set anything that is below 0 to 0, so that we can do the log transform, then +1 to all columns
+ #for f in df_markers.columns[~df_markers.columns.isin(not_intensities)]:
+ #    df_markers.loc[df_markers[f] < 0, f] = 0
+
+ # Option 2
+ # Add the min from min values (from above) + 1 to all columns
+ #df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
+ #    df_markers.loc[:, ~df_markers.columns.isin(not_intensities)].copy() + 1
+ # Add the minimum value + 1 to each column
+ # OR'''
+
+
+ # In[40]:
+
+
+ # Shift all values so the global minimum maps to 1, then log2-transform
+ min_value = df_markers.min().min()
+ print("min value = ", min_value)
+ df_markers = df_markers + np.abs(min_value)
+
+ # +1
+ df_markers = df_markers + 1
+ df_after_norm = df_markers
+ df_marker_shape_after_norm = df_markers.shape
+ print("df_markers.shape after normalisation: ", df_markers.shape)
+ df_markers.min().tolist()
+
+ # Apply log2
+ df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
+     np.log2(df_markers.loc[:, ~df_markers.columns.isin(not_intensities)])
+ print('log2 transform finished')
+
+ df_markers
+
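+ # Worked example (editorial) of the shift above: if the global minimum is -3.5,
+ # every value is shifted by +3.5 and then +1, so the smallest entry maps to
+ # log2(1) = 0 and everything is strictly positive before the transform:
+ _vals = np.array([-3.5, 0.0, 4.5])
+ _shifted = _vals + np.abs(_vals.min()) + 1   # [1.0, 4.5, 9.0]
+ _logged = np.log2(_shifted)                  # [0.0, ~2.17, ~3.17]
+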
+
+ # In[75]:
+
+
+ # main
+ pn.extension()
+
+ # (Resetting not_intensities here would override the list loaded in III.3.2;
+ # df_markers contains no not_intensities columns, so no reset is needed.)
+ #not_intensities = []  # Add columns to exclude from transformation, if any
+
+ # Define transformation functions
+ def modify(df):
+     # Shift so the global minimum maps to 1, then log2-transform
+     min_value = df.min().min()
+     df = df + np.abs(min_value)
+     df = df + 1
+     df.loc[:, ~df.columns.isin(not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(not_intensities)])
+     return df
+
+ def shift(df):
+     # Log2-transform without shifting
+     df.loc[:, ~df.columns.isin(not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(not_intensities)])
+     return df
+
+ # Define the panel widgets
+ operation = pn.widgets.RadioButtonGroup(name='Operation', options=['Modify', 'Shift'], button_type='success')
+
+ # Define a function to update the DataFrame based on the selected operation
+ def update_dataframe(operation):
+     df = df_markers.copy()
+     if operation == 'Modify':
+         modified_df = modify(df)
+     elif operation == 'Shift':
+         modified_df = shift(df)
+     return modified_df.head()
+
+ # Create a panel layout
+ layout = pn.Column(
+     pn.pane.Markdown("### Data Transformation"),
+     operation,
+     pn.pane.Markdown("### Transformed DataFrame"),
+     pn.bind(lambda op: update_dataframe(op), operation)
+ )
+
+ #df_after_norm
+
+ df_markers.columns.tolist()
+
+ # Check for NaN entries (should not be any unless columns do not align)
+ # False means no NaN entries
+ # True means NaN entries
+ df_markers.isnull().any().any()
+
+ count_nan_in_df_markers = df_markers.isnull().sum().sum()
+ print(count_nan_in_df_markers)
+
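+ # Editorial note: the layout above can be previewed on its own with pn.serve(layout)
+ # before it is embedded in the GoldenTemplate app at the end of this script.
+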
+
+ # ## III.6. Z-SCORE TRANSFORMATION
+
+ # In[49]:
+
+
+ # Filter the DataFrame df to keep only the columns specified in the not_intensities list
+ #df = df.loc[:, not_intensities]
+ #df
+
+ # Check if all columns in the not_intensities list are present in the DataFrame
+ missing_columns = [col for col in not_intensities if col not in df.columns]
+ if missing_columns:
+     print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \n{missing_columns}")
+     # Keep only the columns that are in the not_intensities list and also exist in the DataFrame
+     intersected_columns = list(set(not_intensities).intersection(df.columns))
+     df = df[intersected_columns]
+ else:
+     # Keep only the columns in the not_intensities list
+     df = df.loc[:, not_intensities]
+
+ df
+
+
+ # In[50]:
+
+
+ df
+
+
+ # In[51]:
+
+
+ df_merged = df_markers.merge(df, left_index=True, right_on='ID', how='left')
+ df_merged
+
+
+ # In[52]:
+
+
+ df_merged.columns.tolist()
+
+
+ # In[53]:
+
+
+ # Create a copy, just in case you need to restart the kernel
+ df_merged_copy = df_merged.copy()
+
+
+ # In[54]:
+
+
+ # Filter the rows of df_merged based on the values in the 'Sample_ID' column:
+ # df_subset will contain the rows of df_merged whose 'Sample_ID' matches the values in the list 'keep' ('TMA.csv' in this case)
+ keep = ['TMA.csv']
+ df_subset = df_merged.loc[df_merged['Sample_ID'].isin(keep), :].copy()
+ df_subset
+
+
+ # In[55]:
+
+ # Convert the DataFrame to numeric, forcing errors to NaN
+ df_numeric = df_subset.apply(pd.to_numeric, errors='coerce')
+
+ # Z-score normalization
+ # Z-score the rows (apply() with axis = 1, only performed on intensity data)
+ df_subset.loc[:, ~df_subset.columns.isin(not_intensities)] = \
+     df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
+         lambda row: (row - row.median()) / row.std(ddof=0), axis=1)
+ # Drop columns with all NaN values (if any)
+ df_subset.dropna(how='all', inplace=True, axis=1)
+
+ print('zscore rows finished')
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ '''df_subset.loc[:,~df_subset.columns.isin(not_intensities)] = \
+     df_subset.loc[:,~df_subset.columns.isin(not_intensities)].apply(
+         lambda row: (row - row.median())/(row.std(ddof=0)), axis = 1)
+ df_subset.dropna(how = 'all', inplace = True, axis = 1)
+ print('zscore rows finished')'''
+
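+ # Editorial sketch of the per-row transform above: each row is centered on its
+ # median and scaled by its population std (ddof=0), i.e. a median-centered z-score:
+ _row = pd.Series([1.0, 2.0, 3.0, 10.0])
+ _z = (_row - _row.median()) / _row.std(ddof=0)   # median 2.5, std ~3.54
+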
+
+ # In[56]:
+
+
+ df_subset
+ df_numeric = df_merged.apply(pd.to_numeric, errors='coerce')
+ # Z-score the rows (apply() with axis = 1, only performed on intensity data)
+
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
+     df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
+         lambda row: (row - row.median()) / (row.std(ddof=0)), axis=1)
+ df_merged.dropna(how='all', inplace=True, axis=1)
+ print('zscore rows finished')
+
+ '''# Z-score the rows (apply() with axis = 1, only performed on intensity data)
+
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ df_merged.loc[:,~df_merged.columns.isin(not_intensities)] = \
+     df_merged.loc[:,~df_merged.columns.isin(not_intensities)].apply(
+         lambda row: (row - row.median())/(row.std(ddof=0)), axis = 1)
+ df_merged.dropna(how = 'all', inplace = True, axis = 1)
+ print('zscore rows finished')'''
+
+
+ df_merged
+
+
+ # In[59]:
+
+
+ # Center every intensity column of df_merged on the TMA reference subset's column medians
+ df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
+     df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] - df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].median()
+ df_merged
+
+
+ # In[60]:
+
+
+ # Scale by the TMA subset's column stds; the chained assignment evaluates the RHS
+ # once and assigns it to both df_merged_zscore and the selected columns of df_merged
+ df_merged_zscore = df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
+     df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] / df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].std(ddof=0)
+ df_merged_zscore
+
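+ # Editorial note: taken together, the two cells above re-express every intensity
+ # column of df_merged on the TMA reference sample's scale — subtract the TMA
+ # column medians, then divide by the TMA column stds (ddof=0).
+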
+
+ # In[61]:
+
+
+ # Check for NaN entries (should not be any unless columns do not align)
+ # False means no NaN entries
+ # True means NaN entries
+ df.isnull().any().any()
+
+
+ # In[62]:
+
+
+ quality_control_df = df_merged_zscore
+
+
+ # In[63]:
+
+
+ def check_index_format(index_str, ls_samples):
+     """
+     Checks if the given index string follows the specified format.
+
+     Args:
+     index_str (str): The index string to be checked.
+     ls_samples (list): A list of valid sample names.
+
+     Returns:
+     bool: True if the index string follows the format, False otherwise.
+     """
+     # Split the index string into parts
+     parts = index_str.split('_')
+
+     # Check if there are exactly 3 parts
+     if len(parts) != 3:
+         print(len(parts))
+         return False
+
+     # Check if the first part is in ls_samples
+     sample_name = parts[0]
+     if f'{sample_name}_bs.csv' not in ls_samples:
+         print(sample_name)
+         return False
+
+     # Check if the second part is in ['Cell', 'Cytoplasm', 'Nucleus']
+     location = parts[1]
+     valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
+     if location not in valid_locations:
+         print(location)
+         return False
+
+     # Check if the third part is a number
+     try:
+         index = int(parts[2])
+     except ValueError:
+         print(parts[2])
+         return False
+
+     # If all checks pass, return True
+     return True
+
+
+ # Let's take a look at a few features to make sure our dataframe is as expected
+ def check_format_ofindex(index):
+     for index in df.index:
+         check_index = check_index_format(index, ls_samples)
+         if check_index is False:
+             index_format = "Bad"
+             return index_format
+
+     index_format = "Good"
+     return index_format
+
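+ # Usage sketch (editorial; the sample name is illustrative): an index such as
+ # "DD3S1_Cell_42" passes when "DD3S1_bs.csv" is present in ls_samples:
+ #     check_index_format("DD3S1_Cell_42", ls_samples)   # -> True
+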
+
+ # In[64]:
+
+
+ import panel as pn
+ import pandas as pd
+
+ def quality_check(file, not_intensities):
+     # Load the output file
+     df = file
+
+     # Check Index
+     check_index = check_format_ofindex(df.index)
+
+     # Check Shape
+     check_shape = df.shape
+
+     # Check for NaN entries
+     check_no_null = df.isnull().any().any()
+
+     mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+     if (mean_intensity == 0).any():
+         df = df.loc[mean_intensity > 0, :]
+         print("df.shape after removing 0 mean values: ", df.shape)
+         check_zero_intensities = f'Shape after removing 0 mean values: {df.shape}'
+     else:
+         print("No zero intensity values.")
+         check_zero_intensities = "No zero intensity values."
+
+     # Create a quality check results table
+     quality_check_results_table = pd.DataFrame({
+         'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
+         'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
+     })
+
+     # Create a quality check results component
+     quality_check_results_component = pn.Card(
+         pn.pane.DataFrame(quality_check_results_table),
+         title="Quality Control Results",
+         header_background="#2196f3",
+         header_color="white",
+     )
+
+     return quality_check_results_component
+
+
+ # In[76]:
+
+
+ import panel as pn
+
+ # DataFrames already defined above:
+ # metadata, merged_df, initial_df_marker, df_markers_not_intensities, df_after_norm,
+ # df_markers, df_subset, df_merged_zscore
+
+ # Create widgets and panes
+ df_widget = pn.widgets.DataFrame(metadata, name="MetaData")
+
+ # Define the tabs' content
+
+ metadata_tab = pn.Column(
+     pn.pane.Markdown("### Sample Metadata"),
+     pn.pane.DataFrame(metadata.head()),
+     pn.pane.Markdown("### Initial Dataframe"),
+     pn.pane.DataFrame(initial_df_marker.head(), width=1500),
+     pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(initial_df_marker.shape))),
+     pn.pane.Markdown("### Merged Dataframe"),
+     pn.pane.DataFrame(merged_df.head(), width=1500),
+     pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(merged_df.shape))),
+     pn.pane.Markdown("### Markers and not intensities Dataframe"),
+     pn.pane.DataFrame(df_markers_not_intensities.head(), width=1500),
+     pn.Row(pn.pane.Markdown("### Shape: "),
+            pn.pane.Markdown(str(df_markers_not_intensities.shape)))
+ )
+
+ normalization_tab = pn.Column(
+     #pn.pane.Markdown("### Normalisation performed"),
+     #pn.pane.DataFrame(df_after_norm.head()),
+     #pn.Row(pn.pane.Markdown("### Shape before normalization: "),
+     #       pn.pane.Markdown(str(df_marker_shape_before_norm))),
+     #pn.Row(pn.pane.Markdown("### Shape after normalization: "),
+     #       pn.pane.Markdown(str(df_marker_shape_after_norm))),
+     #pn.pane.Markdown("### Performed log 2 transformation"),
+     #pn.pane.DataFrame(df_markers.head())
+     layout
+ )
+
+ zscore_tab = pn.Column(
+     pn.pane.Markdown("### Performed Z-score transformation"),
+     pn.pane.DataFrame(df_subset.head(), width=1500),
+     pn.pane.Markdown("### Z-score transformation finished"),
+     pn.pane.DataFrame(df_merged_zscore.head(), width=1500)
+ )
+
+ quality_control_tab = pn.Column(
+     pn.pane.Markdown("### Quality Control"),
+     quality_check(quality_control_df, not_intensities)
+ )
+
+ # Create the GoldenTemplate
+ app3 = pn.template.GoldenTemplate(
+     site="Cyc-IF",
+     title="Z-Score Computation",
+     main=[
+         pn.Tabs(
+             ("Metadata", metadata_tab),
+             ("Normalization", normalization_tab),
+             ("Z-Score", zscore_tab),
+             ("Quality Control", quality_control_tab)
+         )
+     ]
+ )
+
+ app3.servable()
+
+ if __name__ == "__main__":
+     pn.serve(app3, port=5007)
+