KashyapiNagaHarshitha committed on
Commit 1604536 · verified · 1 Parent(s): 302d1f0

Delete Quality_Control.py

Files changed (1)
  1. Quality_Control.py +0 -1783
Quality_Control.py DELETED
@@ -1,1783 +0,0 @@
#!/usr/bin/env python
# coding: utf-8

import warnings
import os
import plotly  # note: not aliased as plt; matplotlib.pyplot uses that alias below
import seaborn as sb
import plotly.express as px
import panel as pn
import holoviews as hv
import hvplot.pandas
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from bokeh.plotting import figure
from bokeh.io import push_notebook, show
from bokeh.io.export import export_png
from bokeh.resources import INLINE
from bokeh.embed import file_html
from bokeh.io import curdoc
from bokeh.models import Span, Label
from bokeh.models import ColumnDataSource, Button
from my_modules import *
from datasets import load_dataset

os.getcwd()

# Silence FutureWarnings & UserWarnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)


#present_dir = os.path.dirname(os.path.realpath(__file__))
#input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
base_dir = '/code/wetransfer_data-zip_2024-05-17_1431'
set_path = 'test'
selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
ls_samples = ['DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']

pn.extension()

update_button = pn.widgets.Button(name='CSV Files', button_type='primary')
def update_samples(event):
    with open('stored_variables.json', 'r') as file:
        stored_vars = json.load(file)
        # ls_samples = stored_vars['ls_samples']
    print(ls_samples)
update_button.on_click(update_samples)

csv_files_button = pn.widgets.Button(icon="clipboard", button_type="primary")
indicator = pn.indicators.LoadingSpinner(value=False, size=25)

def handle_click(clicks):
    with open('stored_variables.json', 'r') as file:
        stored_vars = json.load(file)
        # ls_samples = stored_vars['ls_samples']
    return f'CSV Files Selected: {ls_samples}'

pn.Row(
    csv_files_button,
    pn.bind(handle_click, csv_files_button.param.clicks),
)
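
# Note: pn.bind re-runs handle_click whenever the button's click count changes,
# so the row above refreshes its 'CSV Files Selected' text on every click.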


# ## I.2. *DIRECTORIES

set_path = 'test'

# Set base directory

directorio_actual = os.getcwd()
print(directorio_actual)

##### MAC WORKSTATION #####
#base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
###########################

##### WINDOWS WORKSTATION #####
#base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
###############################
input_path = base_dir

##### LOCAL WORKSTATION #####
#base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/'
base_dir = input_path
print(base_dir)
#############################

#set_name = 'Set_A'
#set_name = 'test'
set_name = set_path

project_name = set_name          # Project name
step_suffix = 'qc_eda'           # Current part (here part I)
previous_step_suffix_long = ""   # Previous part (here empty)

# Initial input data directory
input_data_dir = os.path.join(base_dir, project_name + "_data")

# QC/EDA output directories
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
# images subdirectory
output_images_dir = os.path.join(output_data_dir, "images")

# Data and Metadata directories
metadata_dir = os.path.join(base_dir, project_name + "_metadata")
# images subdirectory
metadata_images_dir = os.path.join(metadata_dir, "images")

# Create directories if they don't already exist
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
    if not os.path.exists(d):
        print("Creating the", d, "directory...")
        os.makedirs(d)
    else:
        print("The", d, "directory already exists.")

os.chdir(input_data_dir)
with open('stored_variables.json', 'r') as file:
    stored_vars = json.load(file)
    # ls_samples = stored_vars['ls_samples']
    selected_metadata_files = stored_vars['selected_metadata_files']

directories = [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]

directories

def print_directories(directories):
    label_path = []
    labels = [
        "base_dir",
        "input_data_dir",
        "output_data_dir",
        "output_images_dir",
        "metadata_dir",
        "metadata_images_dir",
    ]
    for label, path in zip(labels, directories):
        label_path.append(f"{label} : {path}")
    return label_path

print(print_directories(directories))


# Verify paths
print('base_dir :', base_dir)
print('input_data_dir :', input_data_dir)
print('output_data_dir :', output_data_dir)
print('output_images_dir :', output_images_dir)
print('metadata_dir :', metadata_dir)
print('metadata_images_dir :', metadata_images_dir)


# ## I.3. FILES

# Listing all the .csv files in the metadata/data directory
# Don't forget to move the csv files into the proj_data directory;
# if the data dir is empty this will not work
#ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith(".csv")]
print("The following CSV files were detected:\n\n", ls_samples, "\n\nin", input_data_dir, "directory.")


# In[26]:


def combine_and_save_metadata_files(metadata_dir, selected_metadata_files):
    if not selected_metadata_files:
        warnings.warn("No Ashlar file uploaded. Please upload a valid file.", UserWarning)
        return pd.DataFrame()

    elif len(selected_metadata_files) > 1:
        combined_metadata_df = pd.DataFrame()

        for file in selected_metadata_files:
            file_path = os.path.join(metadata_dir, file)
            df = pd.read_csv(file_path)
            combined_metadata_df = pd.concat([combined_metadata_df, df], ignore_index=True)

        combined_metadata_df.to_csv(os.path.join(metadata_dir, "combined_metadata.csv"), index=False)
        print(f"Combined metadata file saved as 'combined_metadata.csv' in {metadata_dir}")
        return combined_metadata_df

    else:
        single_file_path = os.path.join(metadata_dir, selected_metadata_files[0])
        single_file_df = pd.read_csv(single_file_path)
        print(f"Only one file selected: {selected_metadata_files[0]}")
        return single_file_df


# In[27]:


print(combine_and_save_metadata_files(metadata_dir, selected_metadata_files))


# In[28]:


ls_samples


# In[29]:


path = os.path.join(input_data_dir, ls_samples[0])
#df = load_dataset('csv', data_files=path)
df = pd.read_csv(path, index_col=0, nrows=1)
df.head(10)


# In[30]:


# First gather information on expected headers using the first file in ls_samples
# Read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)

# Make sure the file was imported correctly
print("df :\n", df.head(), "\n")
print("df's columns :\n", df.columns, "\n")
print("df's index :\n", df.index, "\n")
print("df's index name :\n", df.index.name)


# In[31]:


df.head()


# In[32]:


# Verify that the ID column in the input file became the index
# Verify that the index name is "ID"; if not, rename it
if df.index.name != "ID":
    print("Expected the first column in input file (index_col = 0) to be 'ID'. \n"
          "This column will be used to set the index names (cell number for each sample). \n"
          "It appears that the column '" + df.index.name + "' was actually imported as the index column.")
    #df.index.name = 'ID'
    print("A new index name (first column) will be given ('ID') to replace the current one '" + df.index.name + "'\n")

# Apply the changes to the headers as specified with the apply_header_changes() function (in my_modules.py)
# Apply the changes to the dataframe rows as specified with the apply_df_changes() function (in my_modules.py)
#df = apply_header_changes(df)
print(df.index)
df.index = df.index.str.replace(r'@1$', '', regex=True)
df = apply_df_changes(df)

# Set variable to hold default header values
expected_headers = df.columns.values
expected_header = True
print(expected_header)

initial_dataframe = df
# Make sure the file is now formatted correctly
print("\ndf :\n", df.head(), "\n")
print("df's columns :\n", df.columns, "\n")
print("df's index :\n", df.index, "\n")
print("df's index name :\n", df.index.name)


# In[33]:


df.head()


# In[35]:


print("Used " + ls_samples[0] + " to determine the expected and corrected headers for all files.\n")
print("These headers are: \n" + ", ".join(expected_headers))

corrected_headers = True


# In[36]:


for sample in ls_samples:
    file_path = os.path.join(input_data_dir, sample)
    print(file_path)


# In[37]:


# Import all the other files
dfs = {}
###############################
# !! This may take a while !! #
###############################
errors = []

# Iterate over a copy of the list so that removing a sample inside the loop is safe
for sample in list(ls_samples):
    file_path = os.path.join(input_data_dir, sample)

    try:
        # Read the CSV file
        #df = load_dataset("csv", data_files=file_path)  # skipped: immediately overwritten by the read_csv below
        df = pd.read_csv(file_path, index_col=0)
        # Check if the DataFrame is empty; if so, don't continue trying to process it and remove it

        if not df.empty:
            # Manipulations necessary for concatenation
            df = apply_header_changes(df)
            df = apply_df_changes(df)
            # Reorder the columns to match the expected headers list
            #df = df.reindex(columns=expected_headers)
            print(df.head(1))
            print(sample, "file is processed !\n")

            # Compare df's headers against what is expected
            compare_headers(expected_headers, df.columns.values, sample)
            # Add a new column to identify the csv file (sample) the df comes from
            df['Sample_ID'] = sample

        # Add df to dfs (only when the file was read successfully)
        dfs[sample] = df

    except pd.errors.EmptyDataError:
        errors.append(f'\nEmpty data error in {sample} file. Removing from analysis...')
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)

print(dfs)


dfs.values()

# Merge dfs into one df
df = pd.concat(dfs.values(), ignore_index=False, sort=False)
del dfs
merge = True
merged_dataframe = df
df.head()

# Set index to Sample_ID + cell number:
# create a new custom index for df based on the sample names and integer cell numbers, then remove
# the temporary columns 'level_0' and 'index' that were introduced during the operations

# Create a copy of the DataFrame df and reset its index without creating a new column for the old index
# This essentially removes the old index column and replaces it with a default integer index
df = df.copy().reset_index(drop=True)

#print(df)

# Initialize an empty list to store the new index labels for the DataFrame
index = []

for sample in ls_samples:
    # Extract the rows of the original df where the 'Sample_ID' column matches the current sample name
    # This chunk is a subset of the original data for that specific sample
    df_chunk = df.loc[df['Sample_ID'] == sample, :].copy()
    old_index = df_chunk.index
    # Reset the index of df_chunk, removing the old index and replacing it with a default integer index
    df_chunk = df_chunk.reset_index(drop=True)
    # A new index is created for df_chunk: it combines the sample name with 'Cell_' and the integer index values,
    # converting them to strings, so the labels look like 'SampleName_Cell_0', 'SampleName_Cell_1', and so on.
    sample = sample.split('.')[0]
    df_chunk = df_chunk.set_index(f'{sample}_Cell_' + df_chunk.index.astype(str))
    # The index values of df_chunk are then added to the index list
    index = index + df_chunk.index.values.tolist()

# After processing all the samples in the loop, assign the index list as the new index of the original df
df.index = index
# Remove the 'level_0' and 'index' columns from df
df = df.loc[:, ~df.columns.isin(['level_0', 'index'])]
assigned_new_index = True
df.head()
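
# The rebuilt index labels look like 'DD3S1_Cell_0', 'DD3S1_Cell_1', ..., one block per sample,
# so every cell is uniquely identified by its sample of origin and its cell number.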


# ### I.3.2. NOT_INTENSITIES

# not_intensities is the list of the columns unrelated to the markers fluorescence intensities
# Can include items that aren't in a given header.
#not_intensities = ['Nuc_X', 'Nuc_X_Inv', 'Nuc_Y', 'Nuc_Y_Inv', 'Nucleus_Roundness', 'Nucleus_Size', 'Cell_Size',
#                   'ROI_index', 'Sample_ID', 'replicate_ID', 'Cell_ID', 'cell_type', 'cell_subtype', 'cluster', 'ID',
#                   'Cytoplasm_Size', 'immune_checkpoint', 'Unique_ROI_index', 'Patient', 'Primary_chem(1)_vs_surg(0)']

# Get all column names
all_columns = df.columns.tolist()

# Create lists to store non-intensity and intensity column names
not_intensities = []
intensity_columns = []
# Iterate over each column name
for column in all_columns:
    # Check if the column name contains 'Intensity_Average'
    if 'Intensity_Average' not in column:
        not_intensities.append(column)
    else:
        intensity_columns.append(column)


# Create a new DataFrame with non-intensity columns
not_intensities_df = pd.DataFrame(not_intensities)
print("Non-intensity columns:")
print(not_intensities)

print("Non-intensity DataFrame:")
not_intensities
#print(len(intensity_columns))


pd.DataFrame(not_intensities)

path_not_intensities = os.path.join(metadata_dir, "not_intensities.csv")

# If this file already exists, add only the not_intensities items not already present in the file
if os.path.exists(path_not_intensities):
    print("'not_intensities.csv' already exists.")
    print("Reconciling file and Jupyter notebook lists.")
    with open(path_not_intensities, "r") as file_not_intensities:
        file_ni = file_not_intensities.read().splitlines()
    # Set difference to identify items not already in file
    to_add = set(not_intensities) - set(file_ni)
    # We want not_intensities to be a complete list
    not_intensities = list(set(file_ni) | set(not_intensities))
    with open(path_not_intensities, "a") as file_not_intensities:
        for item in to_add:
            file_not_intensities.write(item + "\n")

else:
    # The file does not yet exist
    print("Could not find " + path_not_intensities + ". Creating now.")
    with open(path_not_intensities, "w") as file_not_intensities:
        for item in not_intensities:
            file_not_intensities.write(item + "\n")


# In[46]:


not_intensities_df = pd.read_csv(path_not_intensities)
not_intensities_df


# In[47]:


# Columns we want to keep: not_intensities, and any intensity column that contains 'Intensity_Average'
# (drop any intensity marker column that is not a mean intensity)
to_keep = not_intensities + [x for x in df.columns.values[~df.columns.isin(not_intensities)] if 'Intensity_Average' in x]

to_keep


# In[48]:


print(len(to_keep) - 1)


# In[49]:


# However, our to_keep list contains items that might not be in our df headers!
# These items come from our not_intensities list, so keep only those items of to_keep
# that are actually found in df's headers (columns).
# This ensures that we only keep columns that exist in the df, avoiding any issues with non-existent column names.
# The result is a df containing only the specified columns.
df = df[[x for x in to_keep if x in df.columns.values]]

df.head()


# In[50]:


# Get all column names
all_columns = df.columns.tolist()

# Create an empty list to store intensity markers
intensity_marker = []

# Iterate over each column name
for column in all_columns:
    # Check if the column name contains 'Intensity_Average'
    if 'Intensity_Average' in column:
        # Split the column name by underscore
        parts = column.split('_')

        # Extract the word before the first underscore
        marker = parts[0]

        # Add the marker to the intensity_marker list
        intensity_marker.append(marker)

# Remove duplicates from the intensity_marker list
intensity_marker = list(set(intensity_marker))

print("Intensity Markers:")
print(intensity_marker)

# Create a callback function to update the intensities array
def update_intensities(event):
    global intensities
    global intensities_df
    new_intensities = []
    selected_columns = []
    for marker, cell, cytoplasm, nucleus in zip(marker_options_df['Marker'], marker_options_df['Cell'], marker_options_df['Cytoplasm'], marker_options_df['Nucleus']):
        if cell:
            new_intensities.append(f"{marker}_Cell_Intensity_Average")
            selected_columns.append(f"{marker}_Cell_Intensity_Average")
        if cytoplasm:
            new_intensities.append(f"{marker}_Cytoplasm_Intensity_Average")
            selected_columns.append(f"{marker}_Cytoplasm_Intensity_Average")
        if nucleus:
            new_intensities.append(f"{marker}_Nucleus_Intensity_Average")
            selected_columns.append(f"{marker}_Nucleus_Intensity_Average")
    intensities = new_intensities
    if selected_columns:
        intensities_df = merged_dataframe[selected_columns]
    else:
        intensities_df = pd.DataFrame()
    print("Updated intensities DataFrame:")
    print(intensities_df)
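
# Panel passes an Event object to watch callbacks, hence the 'event' argument above;
# the callback rebuilds 'intensities' from whichever Cell/Cytoplasm/Nucleus boxes are ticked.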

# In[54]:


tabulator_formatters = {
    'bool': {'type': 'tickCross'}
}

# Create a DataFrame with the intensity markers and default values
marker_options_df = pd.DataFrame({
    'Marker': intensity_marker,
    'Cell': [False] * len(intensity_marker),
    'Cytoplasm': [False] * len(intensity_marker),
    'Nucleus': [False] * len(intensity_marker)
})

# Create the Tabulator widget and link the callback function
tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
tabulator.param.watch(update_intensities, 'value')

# Create a Panel layout with the Tabulator widget
marker_options_layout = pn.Column(tabulator, sizing_mode="stretch_width")

# Initialize the Panel extension with Tabulator
pn.extension('tabulator')

# Create a DataFrame with the intensity markers and default values (Cell selected by default)
marker_options_df = pd.DataFrame({
    'Marker': intensity_marker,
    'Cell': [True] * len(intensity_marker),
    'Cytoplasm': [False] * len(intensity_marker),
    'Nucleus': [False] * len(intensity_marker)
})

# Define formatters for the Tabulator widget
tabulator_formatters = {
    'Cell': {'type': 'tickCross'},
    'Cytoplasm': {'type': 'tickCross'},
    'Nucleus': {'type': 'tickCross'}
}

# Create the Tabulator widget
tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')

# Create a DataFrame to store the initial intensities (Cell is ticked by default)
new_data = [{'Description': f"{marker}_Cell_Intensity_Average"} for marker in intensity_marker]
new_data_df = pd.DataFrame(new_data)

# Create a widget to display the new data as a DataFrame
new_data_table = pn.widgets.Tabulator(new_data_df, name='New Data Table', sizing_mode='stretch_width')

# Create a button to start the update process
run_button = pn.widgets.Button(name="Save Selection", button_type='primary')

# Define the update_intensities function (overrides the earlier event-based version)
def update_intensities():
    global new_data, new_data_df
    new_data = []
    for _, row in tabulator.value.iterrows():
        marker = row['Marker']
        if row['Cell']:
            new_data.append({'Description': f"{marker}_Cell_Intensity_Average"})
        if row['Cytoplasm']:
            new_data.append({'Description': f"{marker}_Cytoplasm_Intensity_Average"})
        if row['Nucleus']:
            new_data.append({'Description': f"{marker}_Nucleus_Intensity_Average"})
    new_data_df = pd.DataFrame(new_data)
    new_data_table.value = new_data_df

# Define the runner function
async def runner(event):
    update_intensities()

# Bind the runner function to the button
run_button.on_click(runner)

# Layout
updated_intensities = pn.Column(tabulator, run_button, new_data_table, sizing_mode="stretch_width")

pn.extension()
# Serve the layout
#updated_intensities.servable()


intensities_df = new_data_table.value  # take the widget's underlying DataFrame, not the widget itself
intensities_df

intensities_df = pn.pane.DataFrame(intensities_df)
intensities_df

print(intensities_df)

# ## I.4. QC CHECKS

def quality_check_results(check_index, check_shape, check_no_null, check_zero_intensities):
    results = [
        f"Check Index: {check_index}",
        f"Check Shape: {check_shape}",
        f"Check No Null: {check_no_null}",
        f"Check Zero Intensities: {check_zero_intensities}"
    ]
    return pn.Column(*[pn.Row(result) for result in results], sizing_mode="stretch_width")

print(ls_samples)

def check_index_format(index_str, ls_samples):
    """
    Checks if the given index string follows the specified format.

    Args:
        index_str (str): The index string to be checked.
        ls_samples (list): A list of valid sample names.

    Returns:
        bool: True if the index string follows the format, False otherwise.
    """
    # Split the index string into parts
    parts = index_str.split('_')

    # Check if there are exactly 3 parts
    if len(parts) != 3:
        print(len(parts))
        return False

    # Check if the first part is in ls_samples
    sample_name = parts[0]
    if f'{sample_name}.csv' not in ls_samples:
        print(sample_name)
        return False

    # Check if the second part is in ['Cell', 'Cytoplasm', 'Nucleus']
    location = parts[1]
    valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
    if location not in valid_locations:
        print(location)
        return False

    # Check if the third part is a number
    try:
        index = int(parts[2])
    except ValueError:
        print(parts[2])
        return False

    # If all checks pass, return True
    return True
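
# A minimal usage sketch (hypothetical values, matching the 'Sample_Cell_N' index built above):
# check_index_format('DD3S1_Cell_0', ['DD3S1.csv'])     -> True
# check_index_format('DD3S1_Nucleus_x', ['DD3S1.csv'])  -> False (third part is not a number)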


# In[70]:


# Let's take a look at a few features to make sure our dataframe is as expected
df.index
def check_format_ofindex(index):
    for idx in index:
        check_index = check_index_format(idx, ls_samples)
        if check_index is False:
            index_format = "Bad"
            return index_format

    index_format = "Good"
    return index_format
print(check_format_ofindex(df.index))


# In[71]:


df.shape
check_index = df.index
check_shape = df.shape
print(check_shape)


# In[72]:


# Check for NaN entries (should not be any unless columns do not align)
# False means no NaN entries
# True means NaN entries
df.isnull().any().any()

check_no_null = df.isnull().any().any()


# In[73]:


# Check that all expected files were imported into the final dataframe
if sorted(df.Sample_ID.unique()) == sorted(ls_samples):
    print("All expected filenames are present in big df Sample_ID column.")
    check_all_expected_files_present = "All expected filenames are present in big df Sample_ID column."
else:
    compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")
    check_all_expected_files_present = compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")

print(df.Sample_ID)


# In[74]:


# Delete rows that have 0 value mean intensities for intensity columns
print("df.shape before removing 0 mean values: ", df.shape)

# Calculate the mean intensity for each row, excluding the columns listed in not_intensities
# (which are not to be considered for mean intensity calculations)
###############################
# !! This may take a while !! #
###############################
mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)

# Check if there are any 0 mean intensity values
if (mean_intensity == 0).any():
    df = df.loc[mean_intensity > 0, :]
    print("Shape after removing 0 mean values: ", df.shape)
    check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
else:
    print("No zero intensity values.")
    check_zero_intensities = "No zero intensity values found in the DataFrame."


# Get quantiles (5th, 50th, 95th)
# List of nucleus size percentiles to extract
#qs = [0.05, 0.50, 0.95]
#df["Nucleus_Size"].quantile(q=qs)


quality_control_df = df
quality_control_df.head()

# Function to perform quality checks
def perform_quality_checks(df, ls_samples, not_intensities):
    results = {}
    errors = []
    # Check index
    results['index'] = df.index

    # Check shape
    results['shape'] = df.shape

    # Check for NaN entries
    results['nan_entries'] = df.isnull().any().any()

    # Remove rows with 0 mean intensity values
    mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
    if (mean_intensity == 0).any():
        df = df.loc[mean_intensity > 0, :]
        results['zero_intensity_removal'] = f"Zero intensity entries were found and removed. Shape after removing: {df.shape}"
    else:
        results['zero_intensity_removal'] = "No zero intensity values found in the DataFrame."

    return results

# Example usage of the function
# (stored under a new name so it does not shadow the quality_check_results() helper defined above)
quality_results = perform_quality_checks(df, ls_samples, not_intensities)

# Print results
for key, value in quality_results.items():
    print(f"{key}: {value}")


# In[80]:


def quality_check(file, not_intensities):
    # Load the output file
    df = file

    # Check Index
    check_index = check_format_ofindex(df.index)

    # Check Shape
    check_shape = df.shape

    # Check for NaN entries
    check_no_null = df.isnull().any().any()

    mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
    if (mean_intensity == 0).any():
        df = df.loc[mean_intensity > 0, :]
        print("df.shape after removing 0 mean values: ", df.shape)
        check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
    else:
        print("No zero intensity values found in the DataFrame.")
        check_zero_intensities = "No zero intensities."

    # Create a quality check results table
    quality_check_results_table = pd.DataFrame({
        'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
        'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
    })

    # Create a quality check results component
    quality_check_results_component = pn.Card(
        pn.pane.DataFrame(quality_check_results_table),
        title="Quality Control Results",
        header_background="#2196f3",
        header_color="white",
    )

    return quality_check_results_component

quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)


# Function to calculate quantile values
def calculate_quantiles(quantile):
    quantile_value_intensity = df["AF555_Cell_Intensity_Average"].quantile(q=[quantile, 0.50, 1 - quantile])
    return quantile_value_intensity

# Function to create the Panel app
def create_app(quantile):
    quantiles = calculate_quantiles(quantile)
    output = pd.DataFrame(quantiles)

    # Create a DataFrame pane to display the output
    output_widget = pn.pane.DataFrame(output)

    return output_widget


# Bind the create_app function to the quantile slider
quantile_output_app = pn.bind(create_app, quantile_slider.param.value)
#pn.Column(quantile_slider, quantile_output_app).servable()

# Function to create the line graph plot using Bokeh
def create_line_graph2(quantile):
    # Calculate histogram
    hist, edges = np.histogram(df['Nucleus_Size'], bins=30)

    # Calculate the midpoints of bins for plotting
    midpoints = (edges[:-1] + edges[1:]) / 2

    # Calculate quantiles
    qs = [quantile, 0.50, 1.00 - quantile]
    quantiles = df['Nucleus_Size'].quantile(q=qs).values

    # Create Bokeh line graph plot
    p = figure(title='Frequency vs. Nucleus_Size',
               x_axis_label='Nucleus_Size',
               y_axis_label='Frequency',
               width=800, height=400)

    # Plot histogram
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color='skyblue', line_color='black', alpha=0.6)

    # Plot line graph
    p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)

    # Add quantile lines
    for q in quantiles:
        span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
        p.add_layout(span)
        p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))

    return p


# Bind the create_line_graph2 function to the quantile slider
nucleus_size_line_graph_with_histogram = pn.bind(create_line_graph2, quantile=quantile_slider.param.value)

# Clean the 'Nucleus_Size' column by removing NaN and infinite values
df = df[np.isfinite(df['Nucleus_Size'])]  # keep only finite values

# Check that the DataFrame is not empty after cleaning
if df.empty:
    raise ValueError("No valid data available after cleaning.")
else:
    # Calculate the histogram
    hist, edges = np.histogram(df['Nucleus_Size'], bins=30)
    print("Histogram calculated successfully.")
    print("Histogram:", hist)
    print("Edges:", edges)
plot1 = pn.Column(quantile_slider, pn.pane.Bokeh(nucleus_size_line_graph_with_histogram))

# Removing cells based on nucleus size

quantile = quantile_slider.value
qs = [quantile, 0.50, 1.00 - quantile]
quantiles = df['Nucleus_Size'].quantile(q=qs).values
threshold = quantiles[2]

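# With the slider's default value of 0.05, qs = [0.05, 0.50, 0.95], so 'threshold'
# is the 95th percentile of Nucleus_Size -- the cutoff referenced in the filtering below.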

# In[89]:


print(threshold)


# In[90]:


# Define the quantile slider
#quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)

# Function to update the threshold and display the number of cells removed
def update_threshold_and_display(quantile):
    qs = [quantile, 0.50, 1.00 - quantile]
    quantiles = df['Nucleus_Size'].quantile(q=qs).values
    threshold = quantiles[2]

    # Filter the DataFrame based on the new threshold
    df_filtered = df.loc[(df['Nucleus_Size'] > 42) & (df['Nucleus_Size'] < threshold)]

    # Calculate the number of cells removed
    cells_before_filter = df.shape[0]
    cells_after_filter = df_filtered.shape[0]
    cells_removed = cells_before_filter - cells_after_filter

    # Display the results
    results = pn.Column(
        f"Number of cells before filtering: {cells_before_filter}",
        f"Number of cells after filtering on nucleus size: {cells_after_filter}",
        f"Number of cells removed: {cells_removed}"
    )

    return results

# Bind the update function to the quantile slider
results_display = pn.bind(update_threshold_and_display, quantile_slider)

# Layout the components in a Panel app
layout2 = results_display


# In[91]:


print("Number of cells before filtering :", df.shape[0])
cells_before_filter = f"Number of cells before filtering :{df.shape[0]}"
# Delete small cells and objects w/ high AF555 signal (RBCs)
# We usually use the 95th percentile calculated during QC_EDA
df = df.loc[(df['Nucleus_Size'] > 42)]
df = df.loc[(df['Nucleus_Size'] < threshold)]
cells_after_filter_nucleus_shape = df.shape[0]
print("Number of cells after filtering on nucleus size:", df.shape[0])

df = df.loc[(df['AF555_Cell_Intensity_Average'] < 2000)]
print("Number of cells after filtering on AF555 intensity:", df.shape[0])
cells_after_filter_intensity_shape = df.shape[0]
cells_after_filter_nucleus = f"Number of cells after filtering on nucleus size: {cells_after_filter_nucleus_shape}"
cells_after_filter_intensity = f"Number of cells after filtering on AF555 intensity: {cells_after_filter_intensity_shape}"

num_of_cell_removal_intensity = cells_after_filter_intensity

print(num_of_cell_removal_intensity)

num_of_cell_removal = pn.Column(cells_before_filter, cells_after_filter_nucleus)


# Collect the intensity columns from df
intensities = df.filter(like='Intensity').columns.tolist()

# Create a ColumnDataSource from the DataFrame
source = ColumnDataSource(df)

# Function to calculate quantile values
def calculate_quantiles(column, quantile):
    quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile]).values
    return quantiles

# Create the dropdown menu
column_dropdown = pn.widgets.Select(name='Select Column', options=intensities)

quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)


# Function to create the Bokeh plot
def create_intensity_plot(column, quantile):
    quantiles = calculate_quantiles(column, quantile)
    hist, edges = np.histogram(df[column], bins=30)
    # Calculate the midpoints of bins for plotting
    midpoints = (edges[:-1] + edges[1:]) / 2

    # Create Bokeh plot
    p = figure(title=f'Distribution of {column} with Quantiles',
               x_axis_label=f'{column} Values',
               y_axis_label='Frequency',
               width=800, height=400)

    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color='skyblue', line_color='black', alpha=0.7)

    # Plot line graph
    p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)

    # Add quantile lines
    for q in quantiles:
        span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
        p.add_layout(span)
        p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))

    return p


# Bind the create_intensity_plot function to the column dropdown and quantile slider
# (watch=True was dropped: it registers a side-effect watcher instead of returning a displayable bound function)
marker_intensity_with_histogram = pn.bind(create_intensity_plot, column_dropdown.param.value, quantile_slider.param.value)

# Create the button
generate_plot_button = Button(label='Generate Plot', button_type='primary')

def update_plot(column, quantile):
    plot = create_intensity_plot(column, quantile)
    plot.renderers[0].data_source = source  # Update the data source for the renderer
    return plot

# Display the dropdown menu, quantile slider, button, and plot
#plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)

def generate_plot(event):
    updated_plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)
    #pn.Column(pn.Row(column_dropdown, generate_plot_button), quantile_slider, updated_plot).servable()

generate_plot_button.on_click(generate_plot)
selected_marker_plot = pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram)))
#pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram), generate_plot_button)).servable()


# In[105]:


quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)


# Bind the create_line_graph function to the quantile slider
#nucleus_size_line_graph = pn.bind(create_line_graph, quantile=quantile_slider.param.value)

# Layout the components in a Panel app
#nucleus_size_graph = pn.Column(nucleus_size_line_graph)


# In[106]:


#df["CKs_Cytoplasm_Intensity_Average"].quantile(q=qs)


# In[107]:


len(intensities)
if 'CKs_Cytoplasm_Intensity_Average' in intensities:
    print(1)


# In[108]:


df


# In[109]:


def calculate_cytoplasm_quantiles(column, quantile):
    # Print the columns of the DataFrame
    print("DataFrame columns:", df.columns)

    # Check if the column exists in the DataFrame
    if column not in df.columns:
        raise KeyError(f"Column '{column}' does not exist in the DataFrame.")

    quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
    return quantiles

def create_cytoplasm_intensity_df(column, quantile):
    quantiles = calculate_cytoplasm_quantiles(column, quantile)
    output = pd.DataFrame(quantiles)
    return pn.pane.DataFrame(output)

# Bind the create_cytoplasm_intensity_df function to the quantile slider
cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column='CKs_Cytoplasm_Intensity_Average', quantile=quantile_slider.param.value)

pn.Column(quantile_slider, cytoplasm_quantile_output_app)


# ## I.5. COLUMNS OF INTERESTS

# In[111]:


# Remove columns containing "DAPI"
df = df[[x for x in df.columns.values if 'DAPI' not in x]]

print("Columns are now...")
print([c for c in df.columns.values])


# In[112]:


# Create lists of full names and shortened names to use in plotting
full_to_short_names, short_to_full_names = \
    shorten_feature_names(df.columns.values[~df.columns.isin(not_intensities)])

short_to_full_names


# In[113]:


# Save this data to a metadata file
filename = os.path.join(metadata_dir, "full_to_short_column_names.csv")
with open(filename, "w") as fh:
    fh.write("full_name,short_name\n")
    for k, v in full_to_short_names.items():
        fh.write(k + "," + v + "\n")

print("The full_to_short_column_names.csv file was created !")


# In[114]:


# Save this data to a metadata file
filename = os.path.join(metadata_dir, "short_to_full_column_names.csv")
with open(filename, "w") as fh:
    fh.write("short_name,full_name\n")
    for k, v in short_to_full_names.items():
        fh.write(k + "," + v + "\n")

print("The short_to_full_column_names.csv file was created !")


# ## I.6. EXPOSURE TIME

# In[115]:


# Import the ashlar analysis file
file_path = os.path.join(metadata_dir, 'combined_metadata.csv')
ashlar_analysis = pd.read_csv(file_path)
ashlar_analysis


# In[116]:


# Extract and rename columns
new_df = ashlar_analysis[['Name', 'Cycle', 'ChannelIndex', 'ExposureTime']].copy()
new_df.rename(columns={
    'Name': 'Target',
    'Cycle': 'Round',
    'ChannelIndex': 'Channel'
}, inplace=True)

# Apply prefixes to the Round and Channel values
new_df['Round'] = 'R' + new_df['Round'].astype(str)
new_df['Channel'] = 'c' + new_df['Channel'].astype(str)

# Save to CSV
new_df.to_csv('Ashlar_Exposure_Time.csv', index=False)

# Print the new dataframe
print(new_df)


# In[117]:


# Here, we want to end up with a data structure that incorporates metadata on each intensity marker column
# used in our big dataframe in an easy-to-use format.
# This is going to include the full name of the intensity marker columns in the big data frame,
# the corresponding round and channel,
# the target protein (e.g., CD45),
# and the segmentation localization information (cell, cytoplasm, nucleus)

# We can use this data structure to assign unique colors to all channels and rounds, for example, for use in later visualizations
# Exposure_time file from ASHLAR analysis
filename = "Exposure_Time.csv"
filename = os.path.join(metadata_dir, filename)
exp_df = pd.read_csv(filename)

print(exp_df)

# Verify the file imported correctly
# File length
print("df's shape: ", exp_df.shape)
# Headers
expected_headers = ['Round', 'Target', 'Exp', 'Channel']
compare_headers(expected_headers, exp_df.columns.values, "Imported metadata file")

# Missingness
if exp_df.isnull().any().any():
    print("\nexp_df has null value(s) in row(s):")
    print(exp_df[exp_df.isna().any(axis=1)])
else:
    print("\nNo null values detected.")


# In[118]:


if len(exp_df['Target']) > len(exp_df['Target'].unique()):
    print("One or more non-unique Target values in exp_df. Currently not supported.")
    exp_df = exp_df.drop_duplicates(subset='Target').reset_index(drop=True)


# In[119]:


# Sort exp_df by the values in the 'Target' column in ascending order, then retrieve the first few rows of the sorted df
exp_df.sort_values(by=['Target']).head()


# In[120]:


# Create lowercase version of target
exp_df['target_lower'] = exp_df['Target'].str.lower()
exp_df.head()


# In[121]:


# Create df that contains the marker intensity columns in our df that aren't in not_intensities
intensities = pd.DataFrame({'full_column': df.columns.values[~df.columns.isin(not_intensities)]})

intensities


# In[122]:


# Extract the marker information from 'full_column', which corresponds to the full column name in the big dataframe
# The regex '([^\W_]+)' captures the leading run of alphanumeric characters, i.e. everything before the first underscore
intensities['marker'] = intensities['full_column'].str.extract(r'([^\W_]+)')
# Convert to lowercase
intensities['marker_lower'] = intensities['marker'].str.lower()

intensities
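
# For example, a full_column of 'CD45_Cell_Intensity_Average' yields marker 'CD45'
# and marker_lower 'cd45', which is what exp_df's target_lower is matched against below.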


# In[123]:


# Subset the intensities df to exclude any column pertaining to DAPI
intensities = intensities.loc[intensities['marker_lower'] != 'dapi']

intensities.head()


# In[124]:


# Merge intensities and exp_df together to create metadata
metadata = pd.merge(exp_df, intensities, how='left', left_on='target_lower', right_on='marker_lower')
metadata = metadata.drop(columns=['marker_lower'])
metadata = metadata.dropna()

# Target is the capitalization from the Exposure_Time.csv
# target_lower is Target in small caps
# marker is the extracted first component of the full column in segmentation data, with corresponding capitalization
metadata


# In[125]:


# Add a column to signify marker target localisation.
# Use a lambda to determine the segmented location of each intensity marker column and update metadata accordingly,
# using the add_metadata_location() function in my_modules.py
metadata['localisation'] = metadata.apply(
    lambda row: add_metadata_location(row), axis=1)


# In[126]:


mlid = metadata


# In[127]:


# Save this data structure to the metadata folder
# We don't want to add color in, because it's better to treat color the same for round, channel, and sample
filename = "marker_intensity_metadata.csv"
filename = os.path.join(metadata_dir, filename)
metadata.to_csv(filename, index=False)
print("The marker_intensity_metadata.csv file was created !")


# ## I.7. COLORS WORKFLOW

# ### I.7.1. CHANNELS COLORS


# We want colors that are categorical, since Channel is a non-ordered category (yes, they are numbered, but arbitrarily).
# A categorical color palette will have dissimilar colors.
# Get those unique colors
if len(metadata.Channel.unique()) > 10:
    print("WARNING: There are more unique channel values than "
          "there are colors to choose from. Select a different palette, e.g., "
          "continuous palette 'husl'.")
channel_color_values = sb.color_palette("bright", n_colors=len(metadata.Channel.unique()))
# chose a categorical palette ('bright') because we're unlikely to have > 10 channels

# You can customize the colors for each channel here
custom_colors = {
    'c2': 'lightgreen',
    'c3': 'tomato',
    'c4': 'pink',
    'c5': 'turquoise'
}

# Note: sb.palplot() only draws the palette; it returns None
sb.palplot(sb.color_palette([custom_colors.get(ch, 'blue') for ch in metadata.Channel.unique()]))

# Display those unique custom colors
print("Unique channels are:", metadata.Channel.unique())
sb.palplot(sb.color_palette(channel_color_values))


# In[131]:


# Function to create a palette plot with custom colors
def create_palette_plot():
    # Get unique channels
    unique_channels = metadata.Channel.unique()

    # Define custom colors for each channel
    custom_colors = {
        'c2': 'lightgreen',
        'c3': 'tomato',
        'c4': 'pink',
        'c5': 'turquoise'
    }

    # Get custom colors for each channel
    colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]

    # Create a palette plot (palplot draws the palette and returns None)
    palette_plot = sb.palplot(sb.color_palette(colors))
    channel_color_values = sb.color_palette("bright", n_colors=len(metadata.Channel.unique()))
    channel_color_values = sb.palplot(channel_color_values)
    return palette_plot, channel_color_values


# Create the palette plot directly
palette_plot = create_palette_plot()

# Define the Panel app layout
app_palette_plot = pn.Column(
    pn.pane.Markdown("### Custom Color Palette"),
    palette_plot,
)

# Function to create a palette plot with custom colors (overrides the version above)
def create_palette_plot(custom_colors):
    # Get unique channels
    unique_channels = metadata.Channel.unique()

    # Get custom colors for each channel
    colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]

    # Create a palette plot (palplot)
    palette_plot = sb.palplot(sb.color_palette(colors))

    return palette_plot

# Define custom colors for each channel
custom_colors = {
    'c2': 'lightgreen',
    'c3': 'tomato',
    'c4': 'pink',
    'c5': 'turquoise'
}

# Display those unique custom colors
print("Unique channels are:", metadata.Channel.unique())
app_palette_plot = create_palette_plot(custom_colors)


#app_palette_plot.servable()


# In[133]:


# Store in a dictionary
channel_color_dict = dict(zip(metadata.Channel.unique(), channel_color_values))
channel_color_dict
# Cast each (R, G, B) value to plain floats (calling np.float64 on a whole tuple is deprecated in recent NumPy)
for k, v in channel_color_dict.items():
    channel_color_dict[k] = tuple(float(x) for x in v)

channel_color_dict


# In[134]:


color_df_channel = color_dict_to_df(channel_color_dict, "Channel")

# Save to file in the metadata directory
filename = "channel_color_data.csv"
filename = os.path.join(metadata_dir, filename)
color_df_channel.to_csv(filename, index=False)

color_df_channel


# In[135]:


# Legend of channel info only
g = plt.figure(figsize=(1, 1)).add_subplot(111)
g.axis('off')
handles = []
for item in channel_color_dict.keys():
    h = g.bar(0, 0, color=channel_color_dict[item],
              label=item, linewidth=0)
    handles.append(h)
first_legend = plt.legend(handles=handles, loc='upper right', title='Channel')
# bbox_to_anchor=(10,10),
# bbox_transform=plt.gcf().transFigure)

filename = "Channel_legend.png"
filename = os.path.join(metadata_images_dir, filename)
plt.savefig(filename, bbox_inches='tight')

# ### I.7.2. ROUNDS COLORS


# We want colors that are sequential, since Round is an ordered category.
# We can still generate colors that are easy to distinguish. Also, many of the categorical palettes cap at about 10 unique colors, and repeat from there.
# We do not want any repeats!
round_color_values = sb.cubehelix_palette(
    len(metadata.Round.unique()), start=1, rot=-0.75, dark=0.19, light=.85, reverse=True)
# round_color_values = sb.color_palette("cubehelix", n_colors=len(metadata.Round.unique()))
# chose 'cubehelix' because it is sequential, and round is a continuous process
# each color value is a tuple of three values: (R, G, B)
print(metadata.Round.unique())

sb.palplot(sb.color_palette(round_color_values))

# cubehelix parameters: 'start' sets the starting hue, 'rot' the number of rotations
# through the hue space, 'dark'/'light' bound the lightness range, and 'reverse=True'
# flips the light-to-dark direction of the palette.


# In[137]:


# Store in a dictionary
round_color_dict = dict(zip(metadata.Round.unique(), round_color_values))

# Cast each (R, G, B) value to plain floats (as above)
for k, v in round_color_dict.items():
    round_color_dict[k] = tuple(float(x) for x in v)

round_color_dict


# In[138]:


color_df_round = color_dict_to_df(round_color_dict, "Round")

# Save to file in the metadata directory
filename = "round_color_data.csv"
filename = os.path.join(metadata_dir, filename)
color_df_round.to_csv(filename, index=False)

color_df_round

# Legend of round info only

round_legend = plt.figure(figsize=(1, 1)).add_subplot(111)
round_legend.axis('off')
handles = []
for item in round_color_dict.keys():
    h = round_legend.bar(0, 0, color=round_color_dict[item],
                         label=item, linewidth=0)
    handles.append(h)
first_legend = plt.legend(handles=handles, loc='upper right', title='Round')
# bbox_to_anchor=(10,10),
# bbox_transform=plt.gcf().transFigure)

filename = "Round_legend.png"
filename = os.path.join(metadata_images_dir, filename)
plt.savefig(filename, bbox_inches='tight')


# ### I.7.3. SAMPLES COLORS

# In[140]:


# We want colors that are neither sequential nor categorical.
# Categorical would be ideal if we could generate an arbitrary number of colors, but I do not think that we can.
# Hence, we will choose `n` colors from a continuous palette. First we will generate the right number of colors. Later, we will assign TMA samples to gray.

# Get those unique colors
color_values = sb.color_palette("husl", n_colors=len(ls_samples))  # 'HLS'
# each color value is a tuple of three values: (R, G, B)

# Display those unique colors
sb.palplot(sb.color_palette(color_values))


# In[141]:


TMA_samples = [s for s in df.Sample_ID.unique() if 'TMA' in s]
TMA_color_values = sb.color_palette(n_colors=len(TMA_samples), palette="gray")
sb.palplot(sb.color_palette(TMA_color_values))


# In[142]:


# Store in a dictionary
color_dict = dict(zip(df.Sample_ID.unique(), color_values))

# Replace all TMA samples' colors with gray
i = 0
for key in color_dict.keys():
    if 'TMA' in key:
        color_dict[key] = TMA_color_values[i]
        i += 1

color_dict

color_df_sample = color_dict_to_df(color_dict, "Sample_ID")

# Save to file in the metadata directory
filename = "sample_color_data.csv"
filename = os.path.join(metadata_dir, filename)
color_df_sample.to_csv(filename, index=False)

color_df_sample


# Legend of sample info only
g = plt.figure(figsize=(1, 1)).add_subplot(111)
g.axis('off')
handles = []
for item in color_dict.keys():
    h = g.bar(0, 0, color=color_dict[item],
              label=item, linewidth=0)
    handles.append(h)
first_legend = plt.legend(handles=handles, loc='upper right', title='Sample')

filename = "Sample_legend.png"
filename = os.path.join(metadata_images_dir, filename)
plt.savefig(filename, bbox_inches='tight')


# ### I.7.4. CLUSTERS COLORS

'''if 'cluster' in df.columns:
    cluster_color_values = sb.color_palette("hls", n_colors=len(df.cluster.unique()))

    #print(sorted(test_df.cluster.unique()))
    # Display those unique colors
    sb.palplot(sb.color_palette(cluster_color_values))

    cluster_color_dict = dict(zip(sorted(test_df.cluster.unique()), cluster_color_values))
    print(cluster_color_dict)

    # Create dataframe
    cluster_color_df = color_dict_to_df(cluster_color_dict, "cluster")
    cluster_color_df.head()

    # Save to file in the metadata directory
    filename = "cluster_color_data.csv"
    filename = os.path.join(metadata_dir, filename)
    cluster_color_df.to_csv(filename, index=False)


# Legend of cluster info only

if 'cluster' in df.columns:
    g = plt.figure(figsize=(1, 1)).add_subplot(111)
    g.axis('off')
    handles = []
    for item in sorted(cluster_color_dict.keys()):
        h = g.bar(0, 0, color=cluster_color_dict[item],
                  label=item, linewidth=0)
        handles.append(h)
    first_legend = plt.legend(handles=handles, loc='upper right', title='Cluster')

    filename = "Clustertype_legend.png"
    filename = os.path.join(metadata_images_dir, filename)
    plt.savefig(filename, bbox_inches='tight')'''

mlid.head()


metadata


import io
pn.extension()

file_input = pn.widgets.FileInput()

file_input


def transform_data(variable, window, sigma):
    """Calculates the rolling average and identifies outliers"""
    avg = metadata[variable].rolling(window=window).mean()
    residual = metadata[variable] - avg
    std = residual.rolling(window=window).std()
    outliers = np.abs(residual) > std * sigma
    return avg, avg[outliers]


def get_plot(variable="Exp", window=30, sigma=10):
    """Plots the rolling average and the outliers"""
    avg, highlight = transform_data(variable, window, sigma)
    return avg.hvplot(
        height=300, legend=False,
    ) * highlight.hvplot.scatter(padding=0.1, legend=False)


variable_widget = pn.widgets.Select(name="Target", value="Exp", options=list(metadata.columns))
window_widget = pn.widgets.IntSlider(name="window", value=30, start=1, end=60)
sigma_widget = pn.widgets.IntSlider(name="sigma", value=10, start=0, end=20)

app = pn.template.GoldenTemplate(
    site="Cyc-IF",
    title="Quality Control",
    main=[
        pn.Tabs(
            ("Dataframes", pn.Column(
                pn.Row(csv_files_button, pn.bind(handle_click, csv_files_button.param.clicks)),
                pn.pane.Markdown("### The Dataframe uploaded:"), pn.pane.DataFrame(initial_dataframe),
                #pn.pane.Markdown("### The Exposure time DataFrame is :"), pn.pane.DataFrame(exp_df.head()),
                pn.pane.Markdown("### The DataFrame after merging CycIF data x metadata :"), pn.pane.DataFrame(merged_dataframe.head()),
            )),
            ("Quality Control", pn.Column(
                quality_check(quality_control_df, not_intensities)
                #pn.pane.Markdown("### The Quality check results are:"), quality_check_results(check_index, check_shape, check_no_null, check_zero_intensities)
            )),
            ("Intensities", pn.Column(
                pn.pane.Markdown("### The Not Intensities DataFrame after processing is :"), pn.pane.DataFrame(not_intensities_df, height=250),
                pn.pane.Markdown("### Select Intensities to be included"), updated_intensities,
                #pn.pane.Markdown("### The Intensities DataFrame"), intensities_df,
                #pn.pane.Markdown("### The metadata obtained that specifies the localisation:"), pn.pane.DataFrame(mlid.head())
            )),
            ("Plots", pn.Column(
                #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(nucleus_size_line_graph_with_histogram, num_of_cell_removal),
                #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(plot1, layout2),
                #pn.pane.Markdown("### Nucleus Distribution Plot:"), pn.Column(nucleus_size_plot, nucleus_size_graph),
                pn.pane.Markdown(" ### Intensity Average Plot:"), pn.Row(selected_marker_plot, num_of_cell_removal_intensity),
                #pn.Column(pn.Column(column_dropdown, generate_plot_button), quantile_slider, plot),
                #pn.pane.Markdown("### Cytoplasm Intensity Plot:"), cytoplasm_intensity_plot,
                #pn.pane.Markdown("### AF555_Cell_Intensity_Average:"), quantile_output_app,
                #pn.pane.Markdown("### Distribution of AF555_Cell_Intensity_Average with Quantiles:"), quantile_intensity_plot)
            )),

        ),
    ])

app.servable()
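
# 'panel serve Quality_Control.py' picks up the app marked .servable() above;
# the __main__ guard below also allows a direct 'python Quality_Control.py' launch on port 5007.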

if __name__ == "__main__":
    pn.serve(app, port=5007)