KashyapiNagaHarshitha committed on
Commit 86f21b4 · verified · 1 Parent(s): b1dcdcc

Delete Quality_Control.py

Files changed (1)
  1. Quality_Control.py +0 -1688
Quality_Control.py DELETED
@@ -1,1688 +0,0 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
-
4
- import warnings
5
- import os
6
- import plotly
7
- import seaborn as sb
8
- import plotly.express as px
9
- import panel as pn
10
- import holoviews as hv
11
- import hvplot.pandas
12
- import pandas as pd
13
- import numpy as np
14
- import json
15
- import panel as pn
16
- import pandas as pd
17
- import random
18
- import asyncio
19
- import matplotlib.pyplot as plt
20
- from bokeh.plotting import figure
21
- from bokeh.io import push_notebook, show
22
- from bokeh.io.export import export_png
23
- from bokeh.resources import INLINE
24
- from bokeh.embed import file_html
25
- from bokeh.io import curdoc
26
- from bokeh.models import Span, Label
27
- from bokeh.models import ColumnDataSource, Button
28
- from my_modules import *
29
- from datasets import load_dataset
30
-
31
- #Silence FutureWarnings & UserWarnings
32
- warnings.filterwarnings('ignore', category= FutureWarning)
33
- warnings.filterwarnings('ignore', category= UserWarning)
34
-
35
- #input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
36
- present_dir = os.path.dirname(os.path.realpath(__file__))
37
- # Construct the full path to the stored_variables.json file
38
- json_path = os.path.join(present_dir, 'stored_variables.json')
39
- with open(json_path, 'r') as file:
40
- stored_vars = json.load(file)
41
- directory = stored_vars['base_dir']
42
- input_path = os.path.join(present_dir,directory)
43
- set_path = stored_vars['set_path']
44
- selected_metadata_files = stored_vars['selected_metadata_files']
45
- ls_samples = stored_vars['ls_samples']
46
- base_dir = input_path
47
-
48
- #input_path = '/Users/harshithakolipaka/Desktop/CycIF/wetransfer_data-zip_2024-05-17_1431'
49
- #set_path = 'test'
50
- #selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
51
- #ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']
52
- pn.extension()
53
-
54
- update_button = pn.widgets.Button(name='CSV Files', button_type='primary')
55
- def update_samples(event):
56
- with open(json_path, 'r') as file:
57
- stored_vars = json.load(file)
58
- print(stored_vars)
59
- ls_samples = stored_vars['ls_samples']
60
- return f'CSV Files Selected: {ls_samples}'
61
- update_button.on_click(update_samples)
62
-
63
- csv_files_button = pn.widgets.Button(icon="clipboard", button_type="primary")
64
- indicator = pn.indicators.LoadingSpinner(value=False, size=25)
65
-
66
- def handle_click(clicks):
67
- with open(json_path, 'r') as file:
68
- stored_vars = json.load(file)
69
- print(stored_vars)
70
- #ls_samples = stored_vars['ls_samples']
71
- #return f'CSV Files Selected: {ls_samples}'
72
-
73
- # pn.Row(csv_files_button,pn.bind(
74
- # , csv_files_button.param.clicks),)
75
-
76
-
77
- # ## I.2. *DIRECTORIES
78
-
79
- #set_path = 'test'
80
-
81
- # Set base directory
82
-
83
- directorio_actual = os.getcwd()
84
- print(directorio_actual)
85
-
86
- ##### MAC WORKSTATION #####
87
- #base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
88
- ###########################
89
-
90
- ##### WINDOWS WORKSTATION #####
91
- #base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
92
- ###############################
93
- input_path = base_dir
94
-
95
- ##### LOCAL WORKSTATION #####
96
- #base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/'
97
- base_dir = input_path
98
- print(base_dir)
99
- #############################
100
-
101
- #set_name = 'Set_A'
102
- #set_name = 'test'
103
- set_name = set_path
104
-
105
- project_name = set_name # Project name
106
- step_suffix = 'qc_eda' # Current part (here part I)
107
- previous_step_suffix_long = "" # Previous part (here empty)
108
-
109
- # Initial input data directory
110
- input_data_dir = os.path.join(base_dir, project_name + "_data")
111
-
112
- # QC/EDA output directories
113
- # global output
114
- output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
115
- # images subdirectory
116
- output_images_dir = os.path.join(output_data_dir,"images")
117
-
118
- # Data and Metadata directories
119
- # global data
120
- metadata_dir = os.path.join(base_dir, project_name + "_metadata")
121
- # images subdirectory
122
- metadata_images_dir = os.path.join(metadata_dir,"images")
123
-
124
- # Create directories if they don't already exist
125
- for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
126
- if not os.path.exists(d):
127
- print("Creation of the" , d, "directory...")
128
- os.makedirs(d)
129
- else :
130
- print("The", d, "directory already exists !")
131
-
132
- os.chdir(input_data_dir)
133
- with open(json_path, 'r') as file:
134
- stored_vars = json.load(file)
135
- # ls_samples = stored_vars['ls_samples']
136
- selected_metadata_files = stored_vars['selected_metadata_files']
137
-
138
- directories = []
139
- for i in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
140
- directories.append(i)
141
-
142
- directories
143
-
144
- def print_directories(directories):
145
-
146
- label_path = []
147
- labels = [
148
- "base_dir",
149
- "input_data_dir",
150
- "output_data_dir",
151
- "output_images_dir",
152
- "metadata_dir",
153
- "metadata_images_dir"
154
- ]
155
-
156
- for label, path in zip(labels, directories):
157
- label_path.append(f"{label} : {path}")
158
-
159
- return label_path
160
-
161
- print_directories
162
-
163
-
164
- # Verify paths
165
- print('base_dir :', base_dir)
166
- print('input_data_dir :', input_data_dir)
167
- print('output_data_dir :', output_data_dir)
168
- print('output_images_dir :', output_images_dir)
169
- print('metadata_dir :', metadata_dir)
170
- print('metadata_images_dir :', metadata_images_dir)
171
-
172
-
173
- # ## I.3. FILES
174
-
175
- # Listing all the .csv files in the metadata/data directory
176
- # Don't forget to move the csv files into the proj_data directory
177
- # if the data dir is empty it's not going to work
178
- #ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith(".csv")]
179
- print("The following CSV files were detected:\n\n",[sample for sample in ls_samples], "\n\nin", input_data_dir, "directory.")
180
-
181
-
182
- 
183
-
184
-
185
- import os
186
- import pandas as pd
187
-
188
- def combine_and_save_metadata_files(metadata_dir, selected_metadata_files):
189
- if len(selected_metadata_files) == 0:
191
- # No Ashlar metadata file was selected
192
- warnings.warn("No Ashlar file uploaded. Please upload a valid file.", UserWarning)
193
- return
193
-
194
- elif len(selected_metadata_files) > 1:
195
- combined_metadata_df = pd.DataFrame()
196
-
197
- for file in selected_metadata_files:
198
- file_path = os.path.join(metadata_dir, file)
199
- df = pd.read_csv(file_path)
200
- combined_metadata_df = pd.concat([combined_metadata_df, df], ignore_index=True)
201
-
202
- combined_metadata_df.to_csv(os.path.join(metadata_dir, "combined_metadata.csv"), index=False)
203
- print(f"Combined metadata file saved as 'combined_metadata.csv' in {metadata_dir}")
204
-
205
- return combined_metadata_df
206
-
207
- else:
208
- '''if selected_metadata_files:
209
- single_file_path = os.path.join(metadata_dir, selected_metadata_files[0])
210
- single_file_df = pd.read_csv(single_file_path)
211
- print(f"Only one file selected: {selected_metadata_files[0]}")
212
-
213
- return single_file_df'''
214
-
215
- if len(selected_metadata_files) == 1:
216
- combined_metadata_path = os.path.join(metadata_dir, 'combined_metadata.csv')
217
-
218
- if os.path.exists(combined_metadata_path):
219
- print(f"Combined metadata file already exists: {combined_metadata_path}")
220
- combined_metadata_df = pd.read_csv(combined_metadata_path)
221
- else:
222
- if selected_metadata_files:
223
- combined_metadata_df = pd.DataFrame()
224
- for file in selected_metadata_files:
225
- file_path = os.path.join(metadata_dir, file)
226
- metadata_df = pd.read_csv(file_path)
227
- combined_metadata_df = pd.concat([combined_metadata_df, metadata_df], ignore_index=True)
228
-
229
- combined_metadata_df.to_csv(combined_metadata_path, index=False)
230
- print(f"Combined metadata saved to: {combined_metadata_path}")
231
- else:
232
- print("No metadata files selected.")
233
- combined_metadata_df = pd.DataFrame()
234
-
235
- return combined_metadata_df
236
-
237
- print(combine_and_save_metadata_files(metadata_dir, selected_metadata_files))
238
-
239
- ls_samples
240
-
241
- path = os.path.join(input_data_dir, ls_samples[0])
242
- #df = load_dataset('csv', data_files = path )
243
- df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]),index_col = 0, nrows = 1)
244
- df.head(10)
245
-
246
- # First gather information on expected headers using first file in ls_samples
247
- # Read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
248
- df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]) , index_col = 0, nrows = 1)
249
-
250
-
251
- # Make sure the file was imported correctly
252
- print("df :\n", df.head(), "\n")
253
- print("df's columns :\n", df.columns, "\n")
254
- print("df's index :\n", df.index, "\n")
255
- print("df's index name :\n", df.index.name)
256
-
257
- df.head()
258
-
259
- # Verify that the ID column in input file became the index
260
- # Verify that the index name column is "ID", if not, rename it
261
- if df.index.name != "ID":
262
- print("Expected the first column in input file (index_col = 0) to be 'ID'. \n"
263
- "This column will be used to set the index names (cell number for each sample). \n"
264
- "It appears that the column '" + df.index.name + "' was actually the imported as the index column.")
265
- #df.index.name = 'ID'
266
- print("A new index name (first column) will be given ('ID') to replace the current one '" + df.index.name + "'\n")
267
-
268
- # Apply the changes to the headers as specified with apply_header_changes() function (in my_modules.py)
269
- # Apply the changes to the dataframe rows as specified with apply_df_changes() function (in my_modules.py)
270
- #df = apply_header_changes(df)
271
- print(df.index)
272
- df.index = df.index.str.replace(r'@1$', '', regex=True)
273
- df = apply_df_changes(df)
274
-
275
- # Set variable to hold default header values
276
- expected_headers = df.columns.values
277
- expected_header = True
278
- print(expected_header)
279
-
280
- initial_dataframe = df
281
- # Make sure the file is now formated correctly
282
- print("\ndf :\n", df.head(), "\n")
283
- print("df's columns :\n", df.columns, "\n")
284
- print("df's index :\n", df.index, "\n")
285
- print("df's index name :\n", df.index.name)
286
-
287
- df.head()
288
-
289
-
290
- df.head()
291
-
292
- print("Used " + ls_samples[0] + " to determine the expected and corrected headers for all files.\n")
293
- print("These headers are: \n" + ", ".join([h for h in expected_headers]))
294
-
295
- corrected_headers = True
296
-
297
- for sample in ls_samples:
298
- file_path = os.path.join(input_data_dir,sample)
299
- print(file_path)
300
-
301
- # Import all the others files
302
- dfs = {}
303
- ###############################
304
- # !! This may take a while !! #
305
- ###############################
306
- errors = []
307
-
308
- for sample in list(ls_samples):  # iterate over a copy so samples can safely be removed from ls_samples below
309
- file_path = os.path.join(input_data_dir,sample)
310
-
311
- try:
312
- # Read the CSV file
313
- #df = load_dataset("csv", data_files = file_path)  # redundant: immediately overwritten by pd.read_csv below
314
- df = pd.read_csv(file_path, index_col=0)
315
- # Check if the DataFrame is empty, if so, don't continue trying to process df and remove it
316
-
317
- if not df.empty:
318
- # Manipulations necessary for concatenation
319
- df = apply_header_changes(df)
320
- df = apply_df_changes(df)
321
- # Reorder the columns to match the expected headers list
322
- #df = df.reindex(columns=expected_headers)
323
- print(df.head(1))
324
- print(sample, "file is processed !\n")
325
- #print(df)
326
-
327
- # Compare df's header df against what is expected
328
- compare_headers(expected_headers, df.columns.values, sample)
329
- #print(df.columns.values)
330
- # Add a new column to identify the csv file (sample) where the df comes from
331
- df['Sample_ID'] = sample
332
-
333
- except pd.errors.EmptyDataError:
334
- errors.append(f'\nEmpty data error in {sample} file. Removing from analysis...')
335
- print(f'\nEmpty data error in {sample} file. Removing from analysis...')
336
- ls_samples.remove(sample)
337
- continue
338
- # Add df to dfs
339
- dfs[sample] = df
340
-
341
- print(dfs)
342
-
343
-
344
- dfs.values()
345
-
346
- # Merge dfs into one df
347
- df = pd.concat(dfs.values(), ignore_index=False , sort = False)
348
- del dfs
349
- merge = True
350
- merged_dataframe = df
351
- df.head()
352
-
353
- # Set index to Sample_ID + cell number :
354
- # create a new custom index for df based on the sample names and integer cell numbers, and then remove the temporary columns 'level_0' and 'index' that were introduced during the operations
355
-
356
- # Creates a copy of the DataFrame df and resets its index without creating a new column for the old index
357
- # This essentially removes the old index column and replaces it with a default integer index
358
- df = df.copy().reset_index(drop=True)
359
-
360
- #print(df)
361
-
362
- # Initializing an empty list index to store the new index labels for the DataFrame
363
- index = []
364
-
365
- for sample in ls_samples:
366
- # Extract a chunk of data from the original df where the 'Sample_ID' column matches the current sample name
367
- # This chunk is stored in the df_chunk df, which is a subset of the original data for that specific sample
368
- df_chunk = df.loc[df['Sample_ID'] == sample,:].copy()
369
- old_index = df_chunk.index
370
- # Reset the index of the df_chunk df, removing the old index and replacing it with a default integer index
371
- df_chunk = df_chunk.reset_index(drop=True)
372
- # A new index is created for the df_chunk df. It combines the sample name with 'Cell_' and the integer index values, converting them to strings
373
- # This new index will have labels like 'SampleName_Cell_0', 'SampleName_Cell_1', and so on.
374
- sample = sample.split('.')[0]
375
- df_chunk = df_chunk.set_index(f'{sample}_Cell_' + df_chunk.index.astype(str))
376
- # The index values of df_chunk are then added to the index list
377
- index = index + df_chunk.index.values.tolist()
378
-
379
- # After processing all the samples in the loop, assign the index list as the new index of the original df.
380
- df.index = index
381
- # Remove the 'level_0' and 'index' columns from df
382
- df = df.loc[:,~df.columns.isin(['level_0','index'])]
383
- assigned_new_index = True
384
- df.head()
385
-
386
-
387
- # ### I.3.2. NOT_INTENSITIES
388
-
389
- # not_intensities is the list of the columns unrelated to the markers fluorescence intensities
390
- # Can include items that aren't in a given header.
391
- #not_intensities = ['Nuc_X', 'Nuc_X_Inv', 'Nuc_Y', 'Nuc_Y_Inv', 'Nucleus_Roundness', 'Nucleus_Size', 'Cell_Size',
398
- # 'ROI_index', 'Sample_ID', 'replicate_ID', 'Cell_ID','cell_type', 'cell_subtype', 'cluster','ID',
399
- # 'Cytoplasm_Size', 'immune_checkpoint', 'Unique_ROI_index', 'Patient', 'Primary_chem(1)_vs_surg(0)']
400
-
401
- # Get all column names
402
- all_columns = df.columns.tolist()
403
-
404
- # Create a list to store non-intensity column names
405
- not_intensities = []
406
- intensity_columns = []
407
- # Iterate over each column name
408
- for column in all_columns:
409
- # Check if the column name contains 'Intensity_Average'
410
- if 'Intensity_Average' not in column:
411
- print(not_intensities)
412
- not_intensities.append(column)
413
- else:
414
- intensity_columns.append(column)
415
-
416
-
417
- # Create a new DataFrame with non-intensity columns
418
- not_intensities_df = pd.DataFrame(not_intensities)
419
- print("Non-intensity columns:")
420
- print(not_intensities)
421
-
422
- print("non-intensity DataFrame:")
423
- not_intensities
424
- #print(len(intensity_columns))
425
-
426
-
427
- pd.DataFrame(not_intensities)
428
-
429
- path_not_intensities = os.path.join(metadata_dir,"not_intensities.csv")
430
-
431
- # If this file already exists, add only not_intensities items of the list not already present in file
432
- if os.path.exists(path_not_intensities):
433
- print("'not_intensities.csv' already exists.")
434
- print("Reconciling file and Jupyter notebook lists.")
435
- file_not_intensities = open(path_not_intensities, "r")
436
- file_ni = file_not_intensities.read().splitlines()
437
- # Set difference to identify items not already in file
438
- to_add = set(not_intensities) - set(file_ni)
439
- # We want not_intensities to be a complete list
440
- not_intensities = list(set(file_ni) | set(not_intensities))
441
- file_not_intensities.close()
442
- file_not_intensities = open(path_not_intensities, "a")
443
- for item in to_add:
444
- file_not_intensities.write(item +"\n")
445
- file_not_intensities.close()
446
-
447
- else:
448
- # The file does not yet exist
449
- print("Could not find " + path_not_intensities + ". Creating now.")
450
- file_not_intensities = open(path_not_intensities, "w")
451
- for item in not_intensities:
452
- file_not_intensities.write(item + "\n")
453
- file_not_intensities.close()
454
-
455
- not_intensities_df = pd.read_csv(path_not_intensities)
456
- not_intensities_df
457
-
458
- # Columns we want to keep: not_intensities, and any intensity column that contains 'Intensity_Average' (drop any intensity marker column that is not a mean intensity)
459
- to_keep = not_intensities + [x for x in df.columns.values[~df.columns.isin(not_intensities)] if 'Intensity_Average' in x]
460
-
461
- to_keep
462
-
463
- print(len(to_keep) - 1)
464
-
465
- # However, our to_keep list contains items that might not be in our df headers!
466
- # These items are from our not_intensities list. So let's ask for only those items from to_keep that are actually found in our df
467
- # Retains only the columns from the to_keep list that are found in the df's headers (columns).
468
- # This ensures that we are only keeping the columns that exist in your df, avoiding any potential issues with non-existent column names.
469
- # The result is a df containing only the specified columns.
470
- df = df[[x for x in to_keep if x in df.columns.values]]
471
-
472
- df.head()
473
-
474
- # Assuming you have a DataFrame named 'df'
475
- # df = pd.read_csv('your_file.csv')
476
-
477
- # Load or create the stored_variables.json file
478
- json_file_path = os.path.join(present_dir,"stored_variables.json")
479
-
480
- if os.path.exists(json_file_path):
481
- with open(json_file_path, "r") as file:
482
- stored_variables = json.load(file)
483
- else:
484
- stored_variables = {}
485
-
486
- # Get all column names
487
- all_columns = df.columns.tolist()
488
-
489
- # Create an empty list to store intensity markers
490
- intensity_marker = []
491
-
492
- # Iterate over each column name
493
- for column in all_columns:
494
- # Check if the column name contains 'Intensity_Average'
495
- if 'Intensity_Average' in column:
496
- # Split the column name by underscore
497
- parts = column.split('_')
498
-
499
- # Extract the word before the first underscore
500
- marker = parts[0]
501
-
502
- # Add the marker to the intensity_marker list
503
- intensity_marker.append(marker)
504
-
505
- # Remove duplicates from the intensity_marker list
506
- intensity_marker = list(set(intensity_marker))
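- # Note: set() removes duplicates but does not preserve the original column order of the markers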
507
-
508
- print("Intensity Markers:")
509
- print(intensity_marker)
510
-
511
- # Create a DataFrame with the intensity markers and default values
512
- marker_options_df = pd.DataFrame({
513
- 'Marker': intensity_marker,
514
- 'Cell': [True] * len(intensity_marker),
515
- 'Cytoplasm': [False] * len(intensity_marker),
516
- 'Nucleus': [False] * len(intensity_marker)
517
- })
518
-
519
- # Define formatters for the Tabulator widget
520
- tabulator_formatters = {
521
- 'Cell': {'type': 'tickCross'},
522
- 'Cytoplasm': {'type': 'tickCross'},
523
- 'Nucleus': {'type': 'tickCross'}
524
- }
525
-
526
- # Create the Tabulator widget
527
- tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
528
-
529
- # Create a DataFrame to store the initial intensities
530
- new_data = [{'Description': f"{marker}_Cell_Intensity_Average"} for marker in intensity_marker]
531
- new_data_df = pd.DataFrame(new_data)
532
-
533
- # Create a widget to display the new data as a DataFrame
534
- new_data_table = pn.widgets.Tabulator(new_data_df, name='New Data Table', sizing_mode='stretch_width')
535
-
536
- # Create a button to start the update process
537
- run_button = pn.widgets.Button(name="Save Selection", button_type='primary')
538
-
539
- # Function to update stored_variables.json
540
- def update_stored_variables(selected_columns):
541
- stored_variables["selected_intensities"] = selected_columns
542
- with open(json_file_path, "w") as file:
543
- json.dump(stored_variables, file, indent=4)
544
-
545
- # Define the update_intensities function
546
- def update_intensities(event=None):
547
- global new_data, new_data_df
548
- new_data = []
549
- selected_columns = []
550
- for _, row in tabulator.value.iterrows():
551
- marker = row['Marker']
552
- if row['Cell']:
553
- new_data.append({'Description': f"{marker}_Cell_Intensity_Average"})
554
- selected_columns.append(f"{marker}_Cell_Intensity_Average")
555
- if row['Cytoplasm']:
556
- new_data.append({'Description': f"{marker}_Cytoplasm_Intensity_Average"})
557
- selected_columns.append(f"{marker}_Cytoplasm_Intensity_Average")
558
- if row['Nucleus']:
559
- new_data.append({'Description': f"{marker}_Nucleus_Intensity_Average"})
560
- selected_columns.append(f"{marker}_Nucleus_Intensity_Average")
561
- new_data_df = pd.DataFrame(new_data)
562
- new_data_table.value = new_data_df
563
- update_stored_variables(selected_columns)
564
- print("Updated intensities DataFrame:")
565
- print(new_data_df)
566
-
567
- # Define the runner function
568
- async def runner(event):
569
- update_intensities()
570
-
571
- # Bind the runner function to the button
572
- run_button.on_click(runner)
573
-
574
- # Attach the update_intensities function to changes in the Tabulator widget
575
- tabulator.param.watch(update_intensities, 'value')
576
-
577
- # Layout
578
- updated_intensities = pn.Column(tabulator, run_button, new_data_table, sizing_mode="stretch_width")
579
-
580
- '''
581
- # Iterate over each column name
582
- for column in all_columns:
583
- # Check if the column name contains 'Intensity_Average'
584
- if 'Intensity_Average' in column:
585
- # Split the column name by underscore
586
- parts = column.split('_')
587
-
588
- # Extract the word before the first underscore
589
- marker = parts[0]
590
-
591
- # Add the marker to the intensity_marker list
592
- intensity_marker.append(marker)
593
-
594
- # Remove duplicates from the intensity_marker list
595
- intensity_marker = list(set(intensity_marker))
596
-
597
- print("Intensity Markers:")
598
- print(intensity_marker)
599
-
600
- # Create a callback function to update the intensities array
601
- def update_intensities(event):
602
- global intensities
603
- global intensities_df
604
- new_intensities = []
605
- selected_columns = []
606
- for marker, cell, cytoplasm, nucleus in zip(marker_options_df['Marker'], marker_options_df['Cell'], marker_options_df['Cytoplasm'], marker_options_df['Nucleus']):
607
- if cell:
608
- new_intensities.append(f"{marker}_Cell_Intensity_Average")
609
- selected_columns.append(f"{marker}_Cell_Intensity_Average")
610
- if cytoplasm:
611
- new_intensities.append(f"{marker}_Cytoplasm_Intensity_Average")
612
- selected_columns.append(f"{marker}_Cytoplasm_Intensity_Average")
613
- if nucleus:
614
- new_intensities.append(f"{marker}_Nucleus_Intensity_Average")
615
- selected_columns.append(f"{marker}_Nucleus_Intensity_Average")
616
- intensities = new_intensities
617
- if selected_columns:
618
- intensities_df = merged_dataframe[selected_columns]
619
- else:
620
- intensities_df = pd.DataFrame()
621
- print("Updated intensities DataFrame:")
622
- print(intensities_df)
623
-
624
- tabulator_formatters = {
625
- 'bool': {'type': 'tickCross'}
626
- }
627
-
628
- # Create a DataFrame with the intensity markers and default values
629
- marker_options_df = pd.DataFrame({
630
- 'Marker': intensity_marker,
631
- 'Cell': [False] * len(intensity_marker),
632
- 'Cytoplasm': [False] * len(intensity_marker),
633
- 'Nucleus': [False] * len(intensity_marker)
634
- })
635
-
636
- # Create the Tabulator widget and link the callback function
637
- tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
638
- tabulator.param.watch(update_intensities,'value')
639
-
640
- # Create a Panel layout with the Tabulator widget
641
- marker_options_layout = pn.Column(tabulator, sizing_mode="stretch_width")
642
- # Initialize the Panel extension with Tabulator
643
- pn.extension('tabulator')
644
-
645
- # Create a DataFrame with the intensity markers and default values
646
- marker_options_df = pd.DataFrame({
647
- 'Marker': intensity_marker,
648
- 'Cell': [True] * len(intensity_marker),
649
- 'Cytoplasm': [False] * len(intensity_marker),
650
- 'Nucleus': [False] * len(intensity_marker)
651
- })
652
-
653
- # Define formatters for the Tabulator widget
654
- tabulator_formatters = {
655
- 'Cell': {'type': 'tickCross'},
656
- 'Cytoplasm': {'type': 'tickCross'},
657
- 'Nucleus': {'type': 'tickCross'}
658
- }
659
-
660
- # Create the Tabulator widget
661
- tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
662
-
663
- # Create a DataFrame to store the initial intensities
664
- new_data = [{'Description': f"{marker}_Cell_Intensity_Average"} for marker in intensity_marker if True]
665
- new_data_df = pd.DataFrame(new_data)
666
-
667
- # Create a widget to display the new data as a DataFrame
668
- new_data_table = pn.widgets.Tabulator(new_data_df, name='New Data Table', sizing_mode='stretch_width')
669
-
670
- # Create a button to start the update process
671
- run_button = pn.widgets.Button(name="Save Selection", button_type='primary')
672
-
673
- # Define the update_intensities function
674
- def update_intensities():
675
- global new_data, new_data_df
676
- new_data = []
677
- for _, row in tabulator.value.iterrows():
678
- marker = row['Marker']
679
- if row['Cell']:
680
- new_data.append({'Description': f"{marker}_Cell_Intensity_Average"})
681
- if row['Cytoplasm']:
682
- new_data.append({'Description': f"{marker}_Cytoplasm_Intensity_Average"})
683
- if row['Nucleus']:
684
- new_data.append({'Description': f"{marker}_Nucleus_Intensity_Average"})
685
- new_data_df = pd.DataFrame(new_data)
686
- new_data_table.value = new_data_df
687
-
688
- # Define the runner function
689
- async def runner(event):
690
- update_intensities()
691
-
692
- # Bind the runner function to the button
693
- run_button.on_click(runner)
694
-
695
- # Layout
696
- updated_intensities = pn.Column(tabulator, run_button, new_data_table, sizing_mode="stretch_width")
697
-
698
- pn.extension()'''
699
- # Serve the layout
700
- #updated_intensities.servable()
701
-
702
-
703
- intensities_df = new_data_table
704
- intensities_df = pn.pane.DataFrame(intensities_df)
705
- print(intensities_df)
706
- # ## I.4. QC CHECKS
707
-
708
- def quality_check_results(check_index, check_shape, check_no_null, check_zero_intensities):
709
- results = [
710
- f"Check Index: {check_index}",
711
- f"Check Shape: {check_shape}",
712
- f"Check No Null: {check_no_null}",
713
- f"Check Zero Intensities: {check_zero_intensities}"
714
- ]
715
- return pn.Column(*[pn.Row(result) for result in results], sizing_mode="stretch_width")
716
-
717
- print(ls_samples)
718
-
719
- def check_index_format(index_str, ls_samples):
720
- """
721
- Checks if the given index string follows the specified format.
722
-
723
- Args:
724
- index_str (str): The index string to be checked.
725
- ls_samples (list): A list of valid sample names.
726
-
727
- Returns:
728
- bool: True if the index string follows the format, False otherwise.
729
- """
730
- # Split the index string into parts
731
- parts = index_str.split('_')
732
-
733
- # Check if there are exactly 3 parts
734
- if len(parts) != 3:
735
- print(len(parts))
736
- return False
737
-
738
- # Check if the first part is in ls_samples
739
- sample_name = parts[0]
740
- if f'{sample_name}.csv' not in ls_samples:
741
- print(sample_name)
742
- return False
743
-
744
- # Check if the second part is in ['cell', 'cytoplasm', 'nucleus']
745
- location = parts[1]
746
- valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
747
- if location not in valid_locations:
748
- print(location)
749
- return False
750
-
751
- # Check if the third part is a number
752
- try:
753
- index = int(parts[2])
754
- except ValueError:
755
- print(parts[2])
756
- return False
757
-
758
- # If all checks pass, return True
759
- return True
760
-
761
- # Let's take a look at a few features to make sure our dataframe is as expected
762
- df.index
763
- def check_format_ofindex(index):
764
- for idx in index:
765
- check_index = check_index_format(idx, ls_samples)
766
- if check_index is False:
767
- index_format = "Bad"
768
- return index_format
769
-
770
- index_format = "Good"
771
- return index_format
772
- print(check_format_ofindex(df.index))
773
-
774
- df.shape
775
- check_index = df.index
776
- check_shape = df.shape
777
- print(check_shape)
778
-
779
- # Check for NaN entries (should not be any unless columns do not align)
780
- # False means no NaN entries
781
- # True means NaN entries
782
- df.isnull().any().any()
783
-
784
- check_no_null = df.isnull().any().any()
785
-
786
- # Check that all expected files were imported into final dataframe
787
- if sorted(df.Sample_ID.unique()) == sorted(ls_samples):
788
- print("All expected filenames are present in big df Sample_ID column.")
789
- check_all_expected_files_present = "All expected filenames are present in big df Sample_ID column."
790
- else:
791
- compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")
792
- check_all_expected_files_present = compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")
793
-
794
- print(df.Sample_ID)
795
-
796
- # Delete rows that have 0 value mean intensities for intensity columns
797
- print("df.shape before removing 0 mean values: ", df.shape)
798
-
799
- # We use the apply method on df to calculate the mean intensity for each row. It's done this by applying a lambda function to each row.
800
- # The lambda function excludes the columns listed in the not_intensities list (which are not to be considered for mean intensity calculations)
801
- # and calculates the mean of the remaining values in each row.
802
- ###############################
803
- # !! This may take a while !! #
804
- ###############################
805
- # Calculate mean intensity excluding 'not_intensities' columns
806
- mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
807
-
808
- # Check if there are any 0 mean intensity values
809
- if (mean_intensity == 0).any():
810
- df = df.loc[mean_intensity > 0, :]
811
- print("Shape after removing 0 mean values: ", df.shape)
812
- check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
813
- else:
814
- print("No zero intensity values.")
815
- check_zero_intensities = " No zero intensity values found in the DataFrame."
816
-
817
-
818
-
819
- # Get quantiles (5th, 50th, 95th)
820
- # List of nucleus size percentiles to extract
821
- #qs = [0.05,0.50,0.95]
822
-
823
-
824
-
825
- #df["Nucleus_Size"].quantile(q=qs)
826
-
827
-
828
- quality_control_df = df
829
- quality_control_df.head()
830
-
831
- # Function to perform quality checks
832
- def perform_quality_checks(df, ls_samples, not_intensities):
833
- results = {}
834
- errors = []
835
- # Check index
836
- results['index'] = df.index
837
-
838
- # Check shape
839
- results['shape'] = df.shape
840
-
841
- # Check for NaN entries
842
- results['nan_entries'] = df.isnull().any().any()
843
-
844
- # Remove rows with 0 mean intensity values
845
- mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
846
- if (mean_intensity == 0).any():
847
- df = df.loc[mean_intensity > 0, :]
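- # Note: reassigning df here only rebinds the local variable; the caller's DataFrame is not modified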
848
- results['zero_intensity_removal'] = f"Zero intensity entries were found and removed. Shape after removing: {df.shape}"
849
- else:
850
- results['zero_intensity_removal'] = "No zero intensity values found in the DataFrame."
851
-
852
- return results
853
-
854
- # Example usage of the function
855
- quality_check_results = perform_quality_checks(df, ls_samples, not_intensities)
856
-
857
- # Print results
858
- for key, value in quality_check_results.items():
859
- print(f"{key}: {value}")
860
-
861
-
862
- import panel as pn
863
- import pandas as pd
864
-
865
- def quality_check(file, not_intensities):
866
- # Load the output file
867
- df = file
868
-
869
- # Check Index
870
- check_index = check_format_ofindex(df.index)
871
-
872
- # Check Shape
873
- check_shape = df.shape
874
-
875
- # Check for NaN entries
876
- check_no_null = df.isnull().any().any()
877
-
878
- mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
879
- if (mean_intensity == 0).any():
880
- df = df.loc[mean_intensity > 0, :]
881
- print("df.shape after removing 0 mean values: ", df.shape)
882
- check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
883
- else:
884
- print("No zero intensity values found in the DataFrame.")
885
- check_zero_intensities = "No zero intensities."
886
-
887
- # Create a quality check results table
888
- quality_check_results_table = pd.DataFrame({
889
- 'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
890
- 'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
891
- })
892
-
893
- # Create a quality check results component
894
- quality_check_results_component = pn.Card(
895
- pn.pane.DataFrame(quality_check_results_table),
896
- title="Quality Control Results",
897
- header_background="#2196f3",
898
- header_color="white",
899
- )
900
-
901
- return quality_check_results_component
902
-
903
- quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
904
-
905
-
906
- # Function to calculate quantile values
907
- def calculate_quantiles(quantile):
908
- quantile_value_intensity = df["AF555_Cell_Intensity_Average"].quantile(q=[quantile, 0.50, 1 - quantile])
909
- return quantile_value_intensity
910
-
911
- # Function to create the Panel app
912
- def create_app(quantile = quantile_slider.param.value):
913
- quantiles = calculate_quantiles(quantile)
914
- output = pd.DataFrame(quantiles)
915
-
916
- # Create a Markdown widget to display the output
917
- output_widget = pn.pane.DataFrame(output)
918
-
919
- return output_widget
920
-
921
-
922
- # Bind the create_app function to the quantile slider
923
- quantile_output_app = pn.bind(create_app, quantile_slider.param.value)
924
- #pn.Column(quantile_slider,quantile_output_app).servable()
925
-
926
- # Function to create the line graph plot using Bokeh
927
- def create_line_graph2(quantile):
928
- # Calculate histogram
929
- hist, edges = np.histogram(df['Nucleus_Size'], bins=30)
930
-
931
- # Calculate the midpoints of bins for plotting
932
- midpoints = (edges[:-1] + edges[1:]) / 2
933
-
934
- # Calculate quantiles
935
- qs = [quantile, 0.50, 1.00 - quantile]
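- # qs holds the lower quantile, the median, and the upper quantile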
936
- quantiles = df['Nucleus_Size'].quantile(q=qs).values
937
-
938
- # Create Bokeh line graph plot
939
- p = figure(title='Frequency vs. Nucleus_Size',
940
- x_axis_label='Nucleus_Size',
941
- y_axis_label='Frequency',
942
- width=800, height=400)
943
-
944
- # Plotting histogram
945
- p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
946
- fill_color='skyblue', line_color='black', alpha=0.6)
947
-
948
- # Plotting line graph
949
- p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)
950
-
951
- # Add quantile lines
952
- for q in quantiles:
953
- span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
954
- p.add_layout(span)
955
- p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))
956
-
957
- return p
958
-
959
- # Bind the create_line_graph function to the quantile slider
960
- nucleus_size_line_graph_with_histogram = pn.bind(create_line_graph2, quantile=quantile_slider.param.value)
961
-
962
- # Clean the 'Nucleus_Size' column by removing NaN and infinite values
963
- df = df[np.isfinite(df['Nucleus_Size'])] # This will keep only finite values
964
-
965
- # Check if the DataFrame is not empty after cleaning
966
- if df.empty:
967
- raise ValueError("No valid data available after cleaning.")
968
- else:
969
- # Calculate the histogram
970
- hist, edges = np.histogram(df['Nucleus_Size'], bins=30)
971
- print("Histogram calculated successfully.")
972
- print("Histogram:", hist)
973
- print("Edges:", edges)
974
- plot1 = pn.Column(quantile_slider, pn.pane.Bokeh(nucleus_size_line_graph_with_histogram))
975
-
976
- #Removing cells based on nucleus size
977
-
978
- quantile = quantile_slider.value
979
- qs = [quantile, 0.50, 1.00 - quantile]
980
- quantiles = df['Nucleus_Size'].quantile(q=qs).values
981
- threshold = quantiles[2]
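- # quantiles[2] is the upper quantile (e.g., the 95th percentile when the slider is at 0.05) and is used below as the upper Nucleus_Size cutoff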
982
-
983
- print(threshold)
984
-
985
- import panel as pn
986
- import pandas as pd
987
- import numpy as np
988
- from bokeh.plotting import figure
989
- from bokeh.models import Span, Label
990
- # Define the quantile slider
991
- #quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
992
-
993
- # Function to update the threshold and display number of cells removed
994
- def update_threshold_and_display(quantile):
995
- qs = [quantile, 0.50, 1.00 - quantile]
996
- quantiles = df['Nucleus_Size'].quantile(q=qs).values
997
- threshold = quantiles[2]
998
-
999
- # Filter the DataFrame based on the new threshold
1000
- df_filtered = df.loc[(df['Nucleus_Size'] > 42) & (df['Nucleus_Size'] < threshold)]
1001
-
1002
- # Calculate the number of cells removed
1003
- cells_before_filter = df.shape[0]
1004
- cells_after_filter = df_filtered.shape[0]
1005
- cells_removed = cells_before_filter - cells_after_filter
1006
-
1007
- # Display the results
1008
- results = pn.Column(
1009
- f"Number of cells before filtering: {cells_before_filter}",
1010
- f"Number of cells after filtering on nucleus size: {cells_after_filter}",
1011
- f"Number of cells removed: {cells_removed}"
1012
- )
1013
-
1014
- return results
1015
-
1016
- # Bind the update function to the quantile slider
1017
- results_display = pn.bind(update_threshold_and_display, quantile_slider)
1018
-
1019
- # Layout the components in a Panel app
1020
- layout2 = results_display
1021
-
1022
- print("Number of cells before filtering :", df.shape[0])
1023
- cells_before_filter = f"Number of cells before filtering :{df.shape[0]}"
1024
- # Delete small cells and objects w/high AF555 Signal (RBCs)
1025
- # We usually use the 95th percentile calculated during QC_EDA
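- # The value 42 below is a hard-coded lower bound on nucleus size (presumably in the pixel units produced by segmentation)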
1026
- df = df.loc[(df['Nucleus_Size'] > 42 )]
1027
- df = df.loc[(df['Nucleus_Size'] < threshold)]
1028
- cells_after_filter_nucleus_shape = df.shape[0]
1029
- print("Number of cells after filtering on nucleus size:", df.shape[0])
1030
-
1031
- df = df.loc[(df['AF555_Cell_Intensity_Average'] < 2000)]
1032
- print("Number of cells after filtering on AF555A ___ intensity:", df.shape[0])
1033
- cells_after_filter_intensity_shape = df.shape[0]
1034
- cells_after_filter_nucleus = f"Number of cells after filtering on nucleus size: {cells_after_filter_nucleus_shape}"
1035
- cells_after_filter_intensity = f"Number of cells after filtering on AF555_Cell_Intensity_Average: {cells_after_filter_intensity_shape}"
1036
-
1037
- num_of_cell_removal_intensity = cells_after_filter_intensity
1038
-
1039
- print(num_of_cell_removal_intensity )
1040
-
1041
- num_of_cell_removal = pn.Column(cells_before_filter, cells_after_filter_nucleus)
1042
-
1043
-
1044
- # Assuming you have a DataFrame 'df' with the intensity columns
1045
- intensities = df.filter(like='Intensity').columns.tolist()
1046
-
1047
- # Create a ColumnDataSource from the DataFrame
1048
- source = ColumnDataSource(df)
1049
-
1050
- # Function to calculate quantile values
1051
- def calculate_quantiles(column, quantile):
1052
- quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile]).values
1053
- return quantiles
1054
-
1055
- # Create the dropdown menu
1056
- column_dropdown = pn.widgets.Select(name='Select Column', options=intensities)
1057
-
1058
- quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
1059
-
1060
-
1061
- # Function to create the Bokeh plot
1062
- def create_intensity_plot(column, quantile):
1063
- quantiles = calculate_quantiles(column, quantile)
1064
- hist, edges = np.histogram(df[column], bins = 30)
1065
- # Calculate the midpoints of bins for plotting
1066
- midpoints = (edges[:-1] + edges[1:]) / 2
1067
-
1068
- # Create Bokeh plot
1069
- p = figure(title=f'Distribution of {column} with Quantiles',
1070
- x_axis_label=f'{column} Values',
1071
- y_axis_label='Frequency',
1072
- width=800, height=400)
1073
-
1074
-
1075
- p.quad(top=hist, bottom=0, left=edges[:-1], right= edges[1:],
1076
- fill_color='skyblue', line_color='black', alpha=0.7)
1077
-
1078
- # Plotting line graph
1079
- p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)
1080
-
1081
- # Add quantile lines
1082
- for q in quantiles:
1083
- span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
1084
- p.add_layout(span)
1085
- p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))
1086
-
1087
- return p
1088
-
1089
-
1090
- # Bind the create_plot function to the quantile slider, column dropdown, and button click
1091
- marker_intensity_with_histogram = pn.bind(create_intensity_plot,column_dropdown.param.value, quantile_slider.param.value, watch=True)
1092
-
1093
- # Create the button
1094
- generate_plot_button = Button(label='Generate Plot', button_type='primary')
1095
-
1096
- def update_plot(column, quantile):
1097
- plot = create_intensity_plot(column, quantile)
1098
- plot.renderers[0].data_source = source # Update the data source for the renderer
1099
- return plot
1100
-
1101
- #Display the dropdown menu, quantile slider, button, and plot
1102
- #plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)
1103
-
1104
- def generate_plot(event):
1105
- updated_plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)
1106
- #pn.Column(pn.Row(column_dropdown, generate_plot_button), quantile_slider, updated_plot).servable()
1107
-
1108
- generate_plot_button.on_click(generate_plot)
1109
- selected_marker_plot = pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram )))
1110
- #pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram ), generate_plot_button)).servable()
1111
-
1112
- import panel as pn
1113
- import numpy as np
1114
- import pandas as pd
1115
- from bokeh.plotting import figure
1116
- from bokeh.models import ColumnDataSource, Button, Span, Label
1117
-
1118
- # Assuming you have a DataFrame 'df' with the intensity columns
1119
- intensities = df.filter(like='Intensity').columns.tolist()
1120
-
1121
- # Create a ColumnDataSource from the DataFrame
1122
- source = ColumnDataSource(df)
1123
-
1124
- # Function to calculate quantile values
1125
- def calculate_quantiles(column, quantile):
1126
- quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
1127
- return quantiles
1128
-
1129
- quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
1130
-
1131
-
1132
- # Bind the create_line_graph function to the quantile slider
1133
- #nucleus_size_line_graph = pn.bind(create_line_graph, quantile=quantile_slider.param.value)
1134
-
1135
- # Layout the components in a Panel app
1136
- #nucleus_size_graph = pn.Column(nucleus_size_line_graph)
1137
-
1138
- len(intensities)
1139
-
1140
- df
1141
-
1142
- def calculate_cytoplasm_quantiles(column, quantile):
1143
- # Print the columns of the DataFrame
1144
- print("DataFrame columns:", df.columns)
1145
-
1146
- # Check if the column exists in the DataFrame
1147
- if column not in df.columns:
1148
- raise KeyError(f"Column '{column}' does not exist in the DataFrame.")
1149
-
1150
- quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
1151
- return quantiles
1152
-
1153
- def create_cytoplasm_intensity_df(column, quantile):
1154
- quantiles = calculate_cytoplasm_quantiles(column, quantile)
1155
- output = pd.DataFrame(quantiles)
1156
- return pn.pane.DataFrame(output)
1157
-
1158
- # Bind the create_app function to the quantile slider
1159
- cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column=df.columns[10], quantile=quantile_slider.param.value)
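- # df.columns[10] selects a column purely by position; it is assumed here to be one of the cytoplasm intensity columns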
1160
-
1161
- pn.Column(quantile_slider, cytoplasm_quantile_output_app)
1162
- def calculate_cytoplasm_quantiles(column, quantile):
1163
- quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
1164
- return quantiles
1165
-
1166
- def create_cytoplasm_intensity_df(column, quantile):
1167
- quantiles = calculate_cytoplasm_quantiles(column, quantile)
1168
- output = pd.DataFrame(quantiles)
1169
- # Create a Dataframe widget to display the output
1170
- output_widget = pn.pane.DataFrame(output)
1171
- return output_widget
1172
-
1173
-
1174
- # Bind the create_app function to the quantile slider
1175
- cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column=df.columns[10], quantile = quantile_slider.param.value)
1176
- pn.Column(quantile_slider,cytoplasm_quantile_output_app)
1177
-
1178
-
1179
- # ## I.5. COLUMNS OF INTERESTS
1180
-
1181
- # Remove columns containing "DAPI"
1182
- df = df[[x for x in df.columns.values if 'DAPI' not in x]]
1183
-
1184
- print("Columns are now...")
1185
- print([c for c in df.columns.values])
1186
-
1187
-
1188
- # Create lists of full names and shortened names to use in plotting
1189
- full_to_short_names, short_to_full_names = \
1190
- shorten_feature_names(df.columns.values[~df.columns.isin(not_intensities)])
1191
-
1192
- short_to_full_names
1193
-
1194
-
1195
- # Save this data to a metadata file
1196
- filename = os.path.join(metadata_dir, "full_to_short_column_names.csv")
1197
- fh = open(filename, "w")
1198
- fh.write("full_name,short_name\n")
1199
- for k,v in full_to_short_names.items():
1200
- fh.write(k + "," + v + "\n")
1201
-
1202
- fh.close()
1203
- print("The full_to_short_column_names.csv file was created !")
1204
-
1205
- # Save this data to a metadata file
1206
- filename = os.path.join(metadata_dir, "short_to_full_column_names.csv")
1207
- fh = open(filename, "w")
1208
- fh.write("short_name,full_name\n")
1209
- for k,v in short_to_full_names.items():
1210
- fh.write(k + "," + v + "\n")
1211
-
1212
- fh.close()
1213
- print("The short_to_full_column_names.csv file was created !")
1214
-
1215
-
1216
- # ## I.6. EXPOSURE TIME
1217
-
1218
-
1219
- #import the ashlar analysis file
1220
- file_path = os.path.join(metadata_dir, 'combined_metadata.csv')
1221
- ashlar_analysis = pd.read_csv(file_path)
1222
- ashlar_analysis
1223
- # Extracting and renaming columns
1224
- new_df = ashlar_analysis[['Name', 'Cycle', 'ChannelIndex', 'ExposureTime']].copy()
1225
- new_df.rename(columns={
1226
- 'Name': 'Target',
1227
- 'Cycle': 'Round',
1228
- 'ChannelIndex': 'Channel'
1229
- }, inplace=True)
1230
-
1231
- # Applying suffixes to the columns
1232
- new_df['Round'] = 'R' + new_df['Round'].astype(str)
1233
- new_df['Channel'] = 'c' + new_df['Channel'].astype(str)
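- # e.g., Cycle 1 becomes 'R1' and ChannelIndex 2 becomes 'c2', matching the Round/Channel labels used later in the metadata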
1234
-
1235
- # Save to CSV
1236
- new_df.to_csv('Ashlar_Exposure_Time.csv', index=False)
1237
-
1238
- # Print the new dataframe
1239
- print(new_df)
1240
-
1241
- # Here, we want to end up with a data structure that incorporates metadata on each intensity marker column used in our big dataframe in an easy-to-use format.
1242
- # This is going to include the full name of the intensity marker columns in the big data frame,
1243
- # the corresponding round and channel,
1244
- # the target protein (e.g., CD45),
1245
- # and the segmentation localization information (cell, cytoplasm, nucleus)
1246
-
1247
- # We can use this data structure to assign unique colors to all channels and rounds, for example, for use in later visualizations
1248
- # Exposure_time file from ASHLAR analysis
1249
- filename = "Exposure_Time.csv"
1250
- filename = os.path.join(metadata_dir, filename)
1251
- exp_df = pd.read_csv(filename)
1252
-
1253
- print(exp_df)
1254
-
1255
- # Verify file imported correctly
1256
- # File length
1257
- print("df's shape: ", exp_df.shape)
1258
- # Headers
1259
- expected_headers =['Round','Target','Exp','Channel']
1260
- compare_headers(expected_headers, exp_df.columns.values, "Imported metadata file")
1261
-
1262
- # Missingness
1263
- if exp_df.isnull().any().any():
1264
- print("\nexp_df has null value(s) in row(s):")
1265
- print(exp_df[exp_df.isna().any(axis=1)])
1266
- else:
1267
- print("\nNo null values detected.")
1268
-
1269
-
1270
- if len(exp_df['Target']) > len(exp_df['Target'].unique()):
1271
- print("One or more non-unique Target values in exp_df. Currently not supported.")
1272
- exp_df = exp_df.drop_duplicates(subset = 'Target').reset_index(drop=True)
1273
-
1274
- # sort exp_df by the values in the 'Target' column in ascending order and then retrieve the first few rows of the sorted df
1275
- exp_df.sort_values(by = ['Target']).head()
1276
-
1277
- # Create lowercase version of target
1278
- exp_df['target_lower'] = exp_df['Target'].str.lower()
1279
- exp_df.head()
1280
-
1281
- # Create df that contains marker intensity columns in our df that aren't in not_intensities
1282
- intensities = pd.DataFrame({'full_column':df.columns.values[~df.columns.isin(not_intensities)]})
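- # Note: this reuses the name 'intensities' (previously a list of intensity column names) for a metadata DataFrame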
1283
-
1284
- intensities
1285
- # Extract the marker information from the `full_column`, which corresponds to full column in big dataframe
1286
- # Use a regular expression to capture the leading run of alphanumeric characters in the full column name
1287
- # ([^\W_]+ matches word characters excluding the underscore, i.e., everything before the first underscore)
1288
- intensities['marker'] = intensities['full_column'].str.extract(r'([^\W_]+)')
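- # e.g., 'CD45_Cell_Intensity_Average' -> 'CD45'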
1289
- # convert to lowercase
1290
- intensities['marker_lower'] = intensities['marker'].str.lower()
1291
-
1292
- intensities
1293
- # Subset the intensities df to exclude any column pertaining to DAPI
1294
- intensities = intensities.loc[intensities['marker_lower'] != 'dapi']
1295
-
1296
- intensities.head()
1297
- # Merge the intensities and exp_df together to create metadata
1298
- metadata = pd.merge(exp_df, intensities, how = 'left', left_on = 'target_lower',right_on = 'marker_lower')
1299
- metadata = metadata.drop(columns = ['marker_lower'])
1300
- metadata = metadata.dropna()
1301
-
1302
- # Target is the capitalization from the Exposure_Time.csv
1303
- # target_lower is Target in small caps
1304
- # marker is the extracted first component of the full column in segmentation data, with corresponding capitalization
1305
- metadata
1306
- # Add a column to signify marker target localisation.
1307
- # Use a lambda to determine segmented location of intensity marker column and update metadata accordingly
1308
- # Using the add_metadata_location() function in my_modules.py
1309
- metadata['localisation'] = metadata.apply(
1310
- lambda row: add_metadata_location(row), axis = 1)
1311
-
1312
- mlid = metadata
1313
-
1314
- # Save this data structure to the metadata folder
1315
- # don't want to add color in because that's better off treating color the same for round, channel, and sample
1316
- filename = "marker_intensity_metadata.csv"
1317
- filename = os.path.join(metadata_dir, filename)
1318
- metadata.to_csv(filename, index = False)
1319
- print("The marker_intensity_metadata.csv file was created !")
1320
-
1321
-
1322
-
1323
- # ## I.7. COLORS WORKFLOW
1324
-
1325
- # ### I.7.1. CHANNELS COLORS
1326
-
1327
-
1328
- # we want colors that are categorical, since Channel is a non-ordered category (yes, they are numbered, but arbitrarily).
1329
- # A categorical color palette will have dissimilar colors.
1330
- # Get those unique colors
1331
- if len(metadata.Channel.unique()) > 10:
1332
- print("WARNING: There are more unique channel values than \
1333
- there are colors to choose from. Select different palette, e.g., \
1334
- continuous palette 'husl'.")
1335
- channel_color_values = sb.color_palette("bright",n_colors = len(metadata.Channel.unique()))
1336
- # chose 'bright' because it is categorical and we're unlikely to have > 10 channels
1337
-
1338
- # You can customize the colors for each channel here
1339
- custom_colors = {
1340
- 'c2': 'lightgreen',
1341
- 'c3': 'tomato',
1342
- 'c4': 'pink',
1343
- 'c5': 'turquoise'
1344
- }
1345
-
1346
- custom_colors_values = sb.palplot(sb.color_palette([custom_colors.get(ch, 'blue') for ch in metadata.Channel.unique()]))
1347
-
1348
- # Display those unique custom colors
1349
- print("Unique channels are:", metadata.Channel.unique())
1350
- sb.palplot(sb.color_palette(channel_color_values))
1351
-
1352
- # Function to create a palette plot with custom colors
1353
- def create_palette_plot():
1354
- # Get unique channels
1355
- unique_channels = metadata.Channel.unique()
1356
-
1357
- # Define custom colors for each channel
1358
- custom_colors = {
1359
- 'c2': 'lightgreen',
1360
- 'c3': 'tomato',
1361
- 'c4': 'pink',
1362
- 'c5': 'turquoise'
1363
- }
1364
-
1365
- # Get custom colors for each channel
1366
- colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]
1367
-
1368
- # Create a palette plot (palplot)
1369
- palette_plot = sb.palplot(sb.color_palette(colors))
1370
- channel_color_values = sb.color_palette("bright",n_colors = len(metadata.Channel.unique()))
1371
- channel_color_values = sb.palplot(channel_color_values)
1372
- return palette_plot, channel_color_values
1373
-
1374
-
1375
- # Create the palette plot directly
1376
- palette_plot = create_palette_plot()
1377
-
1378
- # Define the Panel app layout
1379
- app_palette_plot = pn.Column(
1380
- pn.pane.Markdown("### Custom Color Palette"),
1381
- palette_plot,
1382
- )
1383
-
1384
- # Function to create a palette plot with custom colors
1385
- def create_palette_plot(custom_colors):
1386
- # Get unique channels
1387
- unique_channels = metadata.Channel.unique()
1388
-
1389
- # Get custom colors for each channel
1390
- colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]
1391
-
1392
- # Create a palette plot (palplot)
1393
- palette_plot = sb.palplot(sb.color_palette(colors))
1394
-
1395
- return palette_plot
1396
-
1397
- # Define custom colors for each channel
1398
- custom_colors = {
1399
- 'c2': 'lightgreen',
1400
- 'c3': 'tomato',
1401
- 'c4': 'pink',
1402
- 'c5': 'turquoise'
1403
- }
1404
-
1405
- # Display those unique custom colors
1406
- print("Unique channels are:", metadata.Channel.unique())
1407
- # Function to bind create_palette_plot
1408
- app_palette_plot = create_palette_plot(custom_colors)
1409
-
1410
-
1411
- #app_palette_plot.servable()
1412
-
1413
-
1414
- # Store in a dictionary
1415
- channel_color_dict = dict(zip(metadata.Channel.unique(), channel_color_values))
1416
- channel_color_dict
1417
- for k,v in channel_color_dict.items():
1418
- channel_color_dict[k] = np.float64(v)
1419
-
1420
- channel_color_dict
1421
-
1422
- color_df_channel = color_dict_to_df(channel_color_dict, "Channel")
1423
-
1424
- # Save to file in metadata directory
1425
- filename = "channel_color_data.csv"
1426
- filename = os.path.join(metadata_dir, filename)
1427
- color_df_channel.to_csv(filename, index = False)
1428
-
1429
- color_df_channel
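Editor's sketch (not part of the original file): color_dict_to_df is imported from
my_modules at the top of the script and its source is not shown here. Assuming it
simply flattens a {value: (R, G, B)} mapping into a tidy dataframe, an equivalent
helper might look like this (the name and column layout are assumptions):

    def _color_dict_to_df_sketch(color_dict, column_name):
        """Hypothetical stand-in: one row per key, with R/G/B columns."""
        out = pd.DataFrame.from_dict(color_dict, orient='index', columns=['R', 'G', 'B'])
        out.index.name = column_name
        return out.reset_index()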
1430
-
1431
- # Legend of channel info only
1432
- g = plt.figure(figsize = (1,1)).add_subplot(111)
1433
- g.axis('off')
1434
- handles = []
1435
- for item in channel_color_dict.keys():
1436
- h = g.bar(0,0, color = channel_color_dict[item],
1437
- label = item, linewidth =0)
1438
- handles.append(h)
1439
- first_legend = plt.legend(handles=handles, loc='upper right', title='Channel')
1440
- # bbox_to_anchor=(10,10),
1441
- # bbox_transform=plt.gcf().transFigure)
1442
-
1443
- filename = "Channel_legend.png"
1444
- filename = os.path.join(metadata_images_dir, filename)
1445
- plt.savefig(filename, bbox_inches = 'tight')
1446
-
1447
- # ### I.7.2. ROUNDS COLORS
1448
-
1449
-
1450
- # we want colors that are sequential, since Round is an ordered category.
1451
- # We can still generate colors that are easy to distinguish. Also, many of the categorical palettes cap at about 10 or so unique colors, and repeat from there.
1452
- # We do not want any repeats!
1453
- round_color_values = sb.cubehelix_palette(
1454
- len(metadata.Round.unique()), start=1, rot= -0.75, dark=0.19, light=.85, reverse=True)
1455
- # round_color_values = sb.color_palette("cubehelix",n_colors = len(metadata.Round.unique()))
1456
- # chose 'cubehelix' because it is sequential, and round is a continuous process
1457
- # each color value is a tuple of three values: (R, G, B)
1458
- print(metadata.Round.unique())
1459
-
1460
- sb.palplot(sb.color_palette(round_color_values))
1461
-
1462
- ## cubehelix_palette parameters: start sets the starting hue, rot the amount of rotation around the hue wheel, dark/light the intensity of the darkest and lightest colors, and reverse=True reverses the palette order.
1463
-
1464
- # Store in a dictionary
1465
- round_color_dict = dict(zip(metadata.Round.unique(), round_color_values))
1466
-
1467
- for k, v in round_color_dict.items():
1468
- round_color_dict[k] = np.asarray(v, dtype=np.float64)  # store each RGB tuple as a float64 array
1469
-
1470
- round_color_dict
1471
-
1472
- color_df_round = color_dict_to_df(round_color_dict, "Round")
1473
-
1474
- # Save to a file in the metadata directory
1475
- filename = "round_color_data.csv"
1476
- filename = os.path.join(metadata_dir, filename)
1477
- color_df_round.to_csv(filename, index = False)
1478
-
1479
- color_df_round
1480
-
1481
- # Legend of round info only
1482
-
1483
- round_legend = plt.figure(figsize = (1,1)).add_subplot(111)
1484
- round_legend.axis('off')
1485
- handles = []
1486
- for item in round_color_dict.keys():
1487
- h = round_legend.bar(0,0, color = round_color_dict[item],
1488
- label = item, linewidth =0)
1489
- handles.append(h)
1490
- first_legend = plt.legend(handles=handles, loc='upper right', title='Round')
1491
- # bbox_to_anchor=(10,10),
1492
- # bbox_transform=plt.gcf().transFigure)
1493
-
1494
- filename = "Round_legend.png"
1495
- filename = os.path.join(metadata_images_dir, filename)
1496
- plt.savefig(filename, bbox_inches = 'tight')
1497
-
1498
-
1499
- # ### I.7.3. SAMPLES COLORS
1500
-
1501
- # we want a distinct color per sample; Sample_ID is an unordered category, but there may be more samples than a categorical palette can supply.
1502
- # Categorical would be ideal if we could generate an arbitrary number of colors, but most categorical palettes cannot.
1503
- # Hence, we will choose `n` colors from a continuous palette. First we will generate the right number of colors. Later, we will assign TMA samples to gray.
1504
-
1505
- # Get those unique colors
1506
- color_values = sb.color_palette("husl",n_colors = len(ls_samples))#'HLS'
1507
- # each color value is a tuple of three values: (R, G, B)
1508
-
1509
- # Display those unique colors
1510
- sb.palplot(sb.color_palette(color_values))
1511
-
1512
- TMA_samples = [s for s in df.Sample_ID.unique() if 'TMA' in s]
1513
- TMA_color_values = sb.color_palette(n_colors = len(TMA_samples),palette = "gray")
1514
- sb.palplot(sb.color_palette(TMA_color_values))
1515
-
1516
- # Store in a dictionary
1517
- color_dict = dict()
1518
- color_dict = dict(zip(df.Sample_ID.unique(), color_values))
1519
-
1520
- # Replace all TMA samples' colors with gray
1521
- i = 0
1522
- for key in color_dict.keys():
1523
- if 'TMA' in key:
1524
- color_dict[key] = TMA_color_values[i]
1525
- i +=1
1526
-
1527
- color_dict
1528
-
1529
- color_df_sample = color_dict_to_df(color_dict, "Sample_ID")
1530
-
1531
- # Save to a file in the metadata directory
1532
- filename = "sample_color_data.csv"
1533
- filename = os.path.join(metadata_dir, filename)
1534
- color_df_sample.to_csv(filename, index = False)
1535
-
1536
- color_df_sample
1537
-
1538
-
1539
- # Legend of sample info only
1540
- g = plt.figure(figsize = (1,1)).add_subplot(111)
1541
- g.axis('off')
1542
- handles = []
1543
- for item in color_dict.keys():
1544
- h = g.bar(0,0, color = color_dict[item],
1545
- label = item, linewidth =0)
1546
- handles.append(h)
1547
- first_legend = plt.legend(handles=handles, loc='upper right', title = 'Sample')
1548
-
1549
- filename = "Sample_legend.png"
1550
- filename = os.path.join(metadata_images_dir, filename)
1551
- plt.savefig(filename, bbox_inches = 'tight')
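Editor's sketch (not part of the original file): the Channel, Round and Sample legend
blocks above repeat the same figure/handles/savefig pattern. A small helper along these
lines could factor it out (the function name is illustrative):

    def _save_color_legend_sketch(color_dict, title, out_path):
        """Draw a standalone legend for a {label: color} mapping and save it as a PNG."""
        ax = plt.figure(figsize=(1, 1)).add_subplot(111)
        ax.axis('off')
        handles = [ax.bar(0, 0, color=color, label=label, linewidth=0)
                   for label, color in color_dict.items()]
        plt.legend(handles=handles, loc='upper right', title=title)
        plt.savefig(out_path, bbox_inches='tight')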
1552
-
1553
-
1554
- # ### I.7.4. CLUSTERS COLORS
1555
-
1556
- '''if 'cluster' in df.columns:
1557
- cluster_color_values = sb.color_palette("hls",n_colors = len(df.cluster.unique()))
1558
-
1559
- #print(sorted(test_df.cluster.unique()))
1560
- # Display those unique colors
1561
- sb.palplot(sb.color_palette(cluster_color_values))
1562
-
1563
- cluster_color_dict = dict(zip(sorted(test_df.cluster.unique()), cluster_color_values))
1564
- print(cluster_color_dict)
1565
-
1566
- # Create dataframe
1567
- cluster_color_df = color_dict_to_df(cluster_color_dict, "cluster")
1568
- cluster_color_df.head()
1569
-
1570
- # Save to file in metadatadirectory
1571
- filename = "cluster_color_data.csv"
1572
- filename = os.path.join(metadata_dir, filename)
1573
- cluster_color_df.to_csv(filename, index = False)
1574
-
1575
-
1576
-
1577
- # Legend of cluster info only
1578
-
1579
- if 'cluster' in df.columns:
1580
- g = plt.figure(figsize = (1,1)).add_subplot(111)
1581
- g.axis('off')
1582
- handles = []
1583
- for item in sorted(cluster_color_dict.keys()):
1584
- h = g.bar(0,0, color = cluster_color_dict[item],
1585
- label = item, linewidth =0)
1586
- handles.append(h)
1587
- first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cluster'),
1588
-
1589
-
1590
- filename = "Clustertype_legend.png"
1591
- filename = os.path.join(metadata_images_dir, filename)
1592
- plt.savefig(filename, bbox_inches = 'tight')'''
1593
-
1594
- mlid.head()
1595
-
1596
-
1597
- metadata
1598
-
1599
-
1600
-
1601
- import io
1602
- import panel as pn
1603
- pn.extension()
1604
-
1605
- file_input = pn.widgets.FileInput()
1606
-
1607
- file_input
1608
-
1609
-
1610
- def transform_data(variable, window, sigma):
1611
- """Calculates the rolling average and identifies outliers"""
1612
- avg = metadata[variable].rolling(window=window).mean()
1613
- residual = metadata[variable] - avg
1614
- std = residual.rolling(window=window).std()
1615
- outliers = np.abs(residual) > std * sigma
1616
- return avg, avg[outliers]
1617
-
1618
-
1619
- def get_plot(variable="Exp", window=30, sigma=10):
1620
- """Plots the rolling average and the outliers"""
1621
- avg, highlight = transform_data(variable, window, sigma)
1622
- return avg.hvplot(
1623
- height=300, legend=False,
1624
- ) * highlight.hvplot.scatter(padding=0.1, legend=False)
1625
-
1626
-
1627
- variable_widget = pn.widgets.Select(name="Target", value="Exp", options=list(metadata.columns))
1628
- window_widget = pn.widgets.IntSlider(name="window", value=30, start=1, end=60)
1629
- sigma_widget = pn.widgets.IntSlider(name="sigma", value=10, start=0, end=20)
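Editor's sketch (not part of the original file): the three widgets above are not wired
to get_plot anywhere in this section. The usual Panel pattern for doing so would be
something like the following (bound_plot and qc_plot_panel are illustrative names):

    bound_plot = pn.bind(get_plot, variable=variable_widget, window=window_widget, sigma=sigma_widget)
    qc_plot_panel = pn.Column(variable_widget, window_widget, sigma_widget, bound_plot)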
1630
-
1631
- # Function to save files
1632
- def save_files(event):
1633
- for sample in ls_samples:
1634
- sample_id = sample.split('.csv')[0]
1635
- filename = os.path.join(output_data_dir, sample_id + "_" + step_suffix + ".csv")
1636
-
1637
- df_save = df.loc[df['Sample_ID'] == sample, :]
1638
- if os.path.exists(filename):
1639
- df_save.to_csv(filename, index=True, index_label='ID', mode='w') # Overwrite by default
1640
- print(f"File {filename} was overwritten!")
1641
- else:
1642
- df_save.to_csv(filename, index=True, index_label='ID') # Save normally if the file doesn't exist
1643
- print(f"File {filename} was created and saved!")
1644
-
1645
- # Button to save the per-sample output files on the server (labelled 'Download Files' in the UI)
1646
- download_button = pn.widgets.Button(name='Download Files', button_type='primary')
1647
- download_button.on_click(save_files)
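Editor's sketch (not part of the original file): save_files writes the CSVs on the
server side, so the button does not trigger a browser download. If an actual download
is wanted, Panel's FileDownload widget is the usual route; a minimal, hedged example
for a single combined CSV (the names are illustrative):

    def _combined_csv_sketch():
        """Return an in-memory CSV of the full dataframe for FileDownload."""
        buffer = io.StringIO()
        df.to_csv(buffer, index=True, index_label='ID')
        return io.BytesIO(buffer.getvalue().encode('utf-8'))

    file_download_sketch = pn.widgets.FileDownload(
        callback=_combined_csv_sketch, filename='quality_control_output.csv')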
1648
-
1649
- app = pn.template.GoldenTemplate(
1650
- site="Cyc-IF",
1651
- title="Quality Control",
1652
- main=[
1653
- pn.Tabs(
1654
- ("Dataframes", pn.Column(
1655
- pn.Row(csv_files_button,pn.bind(handle_click, csv_files_button.param.clicks), ),
1656
- pn.pane.Markdown("### The Dataframe uploaded:"), pn.pane.DataFrame(intial_dataframe),
1657
- #pn.pane.Markdown("### The Exposure time DataFrame is :"), pn.pane.DataFrame(exp_df.head()),
1658
- pn.pane.Markdown("### The DataFrame after merging CycIF data x metadata :"), pn.pane.DataFrame(merged_dataframe.head(25)),
1659
- )),
1660
- ("Quality Control", pn.Column(
1661
- quality_check(quality_control_df, not_intensities)
1662
- #pn.pane.Markdown("### The Quality check results are:"), quality_check_results(check_shape, check_no_null, check_all_expected_files_present, check_zero_intensities)
1663
- )),
1664
- ("Intensities", pn.Column(
1665
- pn.pane.Markdown("### The Not Intensities DataFrame after processing is :"), pn.pane.DataFrame(not_intensities_df, height=250),
1666
- pn.pane.Markdown("### Select Intensities to be included"), updated_intensities,
1667
- #pn.pane.Markdown("### The Intensities DataFrame"), intensities_df,
1668
- #pn.pane.Markdown("### The metadata obtained that specifies the localisation:"), pn.pane.DataFrame(mlid.head())
1669
- )),
1670
- ("Plots", pn.Column(
1671
- #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(nucleus_size_line_graph_with_histogram, num_of_cell_removal),
1672
- pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(plot1,layout2),
1673
- #pn.pane.Markdown("### Nucleus Distribution Plot:"), pn.Column(nucleus_size_plot, nucleus_size_graph),
1674
- pn.pane.Markdown(" ### Intensity Average Plot:"), pn.Row(selected_marker_plot,num_of_cell_removal_intensity ),
1675
- #pn.Column(pn.Column(column_dropdown, generate_plot_button), quantile_slider, plot),
1676
- #pn.pane.Markdown("### Cytoplasm Intensity Plot:"), cytoplasm_intensity_plot,
1677
- #pn.pane.Markdown("### AF555_Cell_Intensity_Average:"), quantile_output_app,
1678
- #pn.pane.Markdown("### Distribution of AF555_Cell_Intensity_Average with Quantiles:"), quantile_intensity_plot),
1679
- pn.Column(download_button),
1680
- )),
1681
-
1682
- ),
1683
- ])
1684
-
1685
- app.servable()
1686
-
1687
- if __name__ == "__main__":
1688
- pn.serve(app, port=5007)