#!/usr/bin/env python
# coding: utf-8

import os
import random
import re
import subprocess
import warnings

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
from scipy import signal
from scipy.stats import pearsonr

import plotly.figure_factory as ff
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import panel as pn

from my_modules import *

# Silence FutureWarnings & UserWarnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)


# ## III.2. DIRECTORIES

# In[4]:

# Set base directory

##### MAC WORKSTATION #####
#base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
###########################

##### WINDOWS WORKSTATION #####
#base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
###############################

##### LOCAL WORKSTATION #####
#base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
#############################

present_dir = os.path.dirname(os.path.realpath(__file__))
input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
base_dir = input_path

#set_name = 'Set_A'
set_name = 'test'


# In[5]:

set_path = set_name
selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']

print(base_dir)
print(set_path)
print(ls_samples)
print(selected_metadata_files)

project_name = set_name              # Project name
step_suffix = 'zscore'               # Current part (here, part III)
previous_step_suffix_long = "_bs"    # Previous part (here, the BS notebook)

# Initial input data directory
input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)

# ZSCORE/LOG2 output directory
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)

# ZSCORE/LOG2 images subdirectory
output_images_dir = os.path.join(output_data_dir, "images")

# Metadata directory
metadata_dir = os.path.join(base_dir, project_name + "_metadata")

# Metadata images subdirectory
metadata_images_dir = os.path.join(metadata_dir, "images")

# Create directories if they don't already exist
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
    if not os.path.exists(d):
        print("Creating the", d, "directory...")
        os.makedirs(d)
    else:
        print("The", d, "directory already exists!")

os.chdir(input_data_dir)


# In[7]:

# Verify paths
print('base_dir            :', base_dir)
print('input_data_dir      :', input_data_dir)
print('output_data_dir     :', output_data_dir)
print('output_images_dir   :', output_images_dir)
print('metadata_dir        :', metadata_dir)
print('metadata_images_dir :', metadata_images_dir)


# ## III.3. FILES
# Don't forget to put your data in the projname_data directory!
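# The metadata loaders below each contain a (commented-out) call to
# verify_line_no() from my_modules.py. For readers without that module, here is
# a minimal sketch of the assumed behaviour -- compare a file's raw line count
# against the number of records parsed from it. This is an illustration, not
# the original implementation:

def verify_line_no_sketch(filename, n_rows):
    """Illustrative stand-in for my_modules.verify_line_no() (assumption)."""
    with open(filename, 'r') as fh:
        n_lines = sum(1 for _ in fh)
    if n_lines != n_rows:
        print(f"WARNING: {filename} has {n_lines} lines but {n_rows} records were read.")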
# ### III.3.1. METADATA

# In[8]:

# Import all the metadata we need from the BS chapter

# METADATA
filename = "marker_intensity_metadata.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
metadata = pd.read_csv(filename)

# Verify size with the verify_line_no() function in my_modules.py
#verify_line_no(filename, metadata.shape[0] + 1)

# Verify headers
exp_cols = ['Round', 'Target', 'Channel', 'target_lower', 'full_column', 'marker', 'localisation']
compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")

metadata = metadata.dropna()
metadata.head()


# ### III.3.2. NOT_INTENSITIES

# In[9]:

filename = "not_intensities.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information: take the file contents as a string,
# strip whitespace, and split on the newline character
not_intensities = []
with open(filename, 'r') as fh:
    not_intensities = fh.read().strip().split("\n")

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, len(not_intensities))

# Print to console
print("not_intensities =\n", not_intensities)
pd.DataFrame(not_intensities)


# ### III.3.3. FULL_TO_SHORT_COLUMN_NAMES

# In[10]:

filename = "full_to_short_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping full column names to short ones
full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]

# CD45 instead of CD45b (Slide A only)
if project_name == 'Slide_A':
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = \
        full_to_short_names.pop('CD45b_Cytoplasm_Intensity_Average')
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = 'CD45_Cytoplasm'

# Print information
print('full_to_short_names =\n', full_to_short_names)


# ### III.3.4. SHORT_TO_FULL_COLUMN_NAMES

# In[11]:

filename = "short_to_full_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping short column names to full ones
short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]

# CD45 instead of CD45b (Slide A only)
if project_name == 'Slide_A':
    short_to_full_names['CD45_Cytoplasm'] = short_to_full_names.pop('CD45b_Cytoplasm')
    short_to_full_names['CD45_Cytoplasm'] = 'CD45_Cytoplasm_Intensity_Average'

# Print information
print('short_to_full_names =\n', short_to_full_names)
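# Worked example of the set_index(...).T.to_dict('records')[0] idiom used in
# the two cells above: a two-column lookup table collapses into a plain dict.
demo = pd.DataFrame({'full_name': ['CKs_Cytoplasm_Intensity_Average'],
                     'short_name': ['CKs_Cytoplasm']})
demo.set_index('full_name').T.to_dict('records')[0]
# -> {'CKs_Cytoplasm_Intensity_Average': 'CKs_Cytoplasm'}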
# ### III.3.5. SAMPLES COLORS

# In[12]:

filename = "sample_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Our tuple of float values for rgb, (r, g, b), was read in as a string
# '(r, g, b)'. We need to extract the r-, g- and b- substrings and convert
# them back into floats.
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a Sample_ID -> rgb mapping (a pandas Series keyed by Sample_ID)
sample_color_dict = df.set_index('Sample_ID')['rgb']

# Print information
print('sample_color_dict =\n', sample_color_dict)


# ### III.3.6. CHANNELS COLORS

# In[13]:

filename = "channel_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Convert the 'rgb' column from string back to a tuple of floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a Channel -> rgb mapping
channel_color_dict = df.set_index('Channel')['rgb']

# Print information
print('channel_color_dict =\n', channel_color_dict)


# ### III.3.7. ROUNDS COLORS

# In[14]:

# ROUND
filename = "round_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Convert the 'rgb' column from string back to a tuple of floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a Round -> rgb mapping
round_color_dict = df.set_index('Round')['rgb']

# Print information
print('round_color_dict =\n', round_color_dict)


# ### III.3.8. CELL TYPES COLORS

# In[15]:

data = pd.read_csv(os.path.join(metadata_dir, 'celltype_color_data.csv'))
data
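# rgb_tuple_from_str() comes from my_modules.py. For readers without that
# module, a minimal sketch of the assumed behaviour -- parse a "(r, g, b)"
# string back into a tuple of floats (assumption, not the original code):

def rgb_tuple_from_str_sketch(rgb_str):
    return tuple(float(x) for x in rgb_str.strip('()').split(','))

# rgb_tuple_from_str_sketch('(0.1, 0.2, 0.3)')  ->  (0.1, 0.2, 0.3)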
# In[16]:

filename = "celltype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)
#df = df.drop(columns = ['hex'])

# Here the RGB values are assumed to already sit in separate 'R', 'G', 'B'
# columns, so we zip them into tuples of floats instead of parsing an
# "(r, g, b)" string as in the previous cells.
if all(col in df.columns for col in ['R', 'G', 'B']):
    # Create the 'rgb' column as tuples of floats
    df['rgb'] = list(zip(df['R'], df['G'], df['B']))
    # Legacy string-parsing path, kept for reference:
    #df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a cell_type -> rgb mapping
cell_type_color_dict = df.set_index('cell_type')['rgb']

# Print information
print('cell_type_color_dict =\n', cell_type_color_dict)


# ### III.3.9. CELL SUBTYPES COLORS

# In[17]:

df = pd.read_csv(filename)
df.head()


# In[18]:

filename = "cellsubtype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Convert the 'rgb' column from string back to a tuple of floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
cell_subtype_color_dict = df.set_index('cell_subtype')['rgb'].to_dict()

# Print information
print('cell_subtype_color_dict =\n', cell_subtype_color_dict)


# In[19]:

df = pd.read_csv(filename)
df.head()


# ### III.3.10. IMMUNE CHECKPOINT COLORS

# In[20]:

filename = "immunecheckpoint_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open, read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Convert the 'rgb' column from string to tuple
df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
immune_checkpoint_color_dict = df.set_index('immune_checkpoint')['rgb'].to_dict()

# Print information
print('immune_checkpoint_color_dict =\n', immune_checkpoint_color_dict)

immune_checkpoint_color_df = pd.DataFrame(immune_checkpoint_color_dict)
immune_checkpoint_color_df


# ### III.3.11. DATA

# In[21]:

# DATA
# Check that the input directory exists, then list the background-subtracted
# CSV files it contains
if os.path.exists(input_data_dir):
    ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_bs.csv")]
    print("The following CSV files were detected:")
    print(ls_samples)
else:
    print(f"The directory {input_data_dir} does not exist.")


# In[22]:

# Import all the other files
dfs = {}

# Gather information on the expected headers using the first file in
# ls_samples: read in only the first row of that file (index = 0)
df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
expected_headers = df.columns.values
#print(expected_headers)
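# Note on df.reindex(columns=expected_headers), used in the loading loop below:
# reindex never raises on a missing column -- it creates it, filled with NaN --
# so per-file header drift shows up as NaN columns rather than as an error.
demo = pd.DataFrame({'A': [1], 'B': [2]})
demo.reindex(columns=['A', 'B', 'C'])   # 'C' is added as an all-NaN column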
###############################
# !! This may take a while !! #
###############################
for sample in list(ls_samples):   # iterate over a copy so removal below is safe
    file_path = os.path.join(input_data_dir, sample)
    print(file_path)

    try:
        # Read the CSV file
        df = pd.read_csv(file_path, index_col=0)
        # Only keep non-empty DataFrames for further processing
        if not df.empty:
            # Reorder the columns to match the expected headers list
            df = df.reindex(columns=expected_headers)
            print(sample, "file is processed!\n")
            #print(df)
            # Add df to dfs
            dfs[sample] = df
    except pd.errors.EmptyDataError:
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)

#print(dfs)


# In[23]:

# Merge the per-sample dfs into one DataFrame
df = pd.concat(dfs.values(), ignore_index=False, sort=False)
del dfs
merged_df = df


# In[24]:

merged_df


# In[25]:

merged_df_shape = df.shape


# In[26]:

merged_df_index = df.index


# In[27]:

merged_df_col_values = df.columns.values


# In[28]:

# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries; True means NaN entries
merged_df_null_values = df.isnull().any().any()


# In[29]:

df.isnull().any().any()


# ## III.4. MARKERS

# In[30]:

# Listing all the markers of interest for downstream analyses
# !!TODO WITH MARILYNE!!
markers = [
    '53BP1_Nucleus_Intensity_Average', 'AR_Nucleus_Intensity_Average', 'CCNB1_Cell_Intensity_Average',
    'CCND1_Nucleus_Intensity_Average', 'CCNE_Nucleus_Intensity_Average', 'CD31_Cytoplasm_Intensity_Average',
    'CKs_Cytoplasm_Intensity_Average', 'ERa_Nucleus_Intensity_Average', 'Ecad_Cytoplasm_Intensity_Average',
    'GATA3_Nucleus_Intensity_Average', 'H3K27_Nucleus_Intensity_Average', 'H3K4me3_Nucleus_Intensity_Average',
    'HER2_Cytoplasm_Intensity_Average', 'HSP90_Cell_Intensity_Average', 'Ki67_Nucleus_Intensity_Average',
    'PAX8_Nucleus_Intensity_Average', 'PCNA_Nucleus_Intensity_Average', 'PRg_Nucleus_Intensity_Average',
    'S100b_Cytoplasm_Intensity_Average', 'TP53_Cell_Intensity_Average', 'Vimentin_Cytoplasm_Intensity_Average',
    'pAKT_Cytoplasm_Intensity_Average', 'pATM_Nucleus_Intensity_Average', 'pATR_Nucleus_Intensity_Average',
    'pERK_Cell_Intensity_Average', 'pRB_Nucleus_Intensity_Average', 'pS6_Cytoplasm_Intensity_Average',
    'AXL_Cytoplasm_Intensity_Average', 'B7H4_Cell_Intensity_Average', 'CD11c_Cytoplasm_Intensity_Average',
    'CD163_Cytoplasm_Intensity_Average', 'CD20_Cytoplasm_Intensity_Average', 'CD31_Cytoplasm_Intensity_Average',
    'CD44_Cytoplasm_Intensity_Average', 'CD45_Cytoplasm_Intensity_Average', 'CD45b_Cytoplasm_Intensity_Average',
    'CD4_Cytoplasm_Intensity_Average', 'CD68_Cytoplasm_Intensity_Average', 'CD8_Cytoplasm_Intensity_Average',
    'CKs_Cytoplasm_Intensity_Average', 'ColVI_Cytoplasm_Intensity_Average', 'Desmin_Cytoplasm_Intensity_Average',
    'Ecad_Cytoplasm_Intensity_Average', 'FOXP3_Nucleus_Intensity_Average', 'Fibronectin_Cytoplasm_Intensity_Average',
    'GATA3_Nucleus_Intensity_Average', 'HLA_Cytoplasm_Intensity_Average', 'Ki67_Nucleus_Intensity_Average',
    'MMP9_Cytoplasm_Intensity_Average', 'PD1_Cytoplasm_Intensity_Average', 'PDGFR_Cytoplasm_Intensity_Average',
    'PDL1_Cytoplasm_Intensity_Average', 'Sting_Cytoplasm_Intensity_Average', 'Vimentin_Cytoplasm_Intensity_Average',
    'aSMA_Cytoplasm_Intensity_Average'
]
# Several markers (CD31, CKs, Ecad, GATA3, Ki67, Vimentin) appear twice in the
# list above; de-duplicate while preserving order so that df[markers] does not
# create duplicate columns.
markers = list(dict.fromkeys(markers))
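# Quick illustration of the de-duplication idiom above: unlike set(),
# dict.fromkeys() keeps the first occurrence of each key, in order.
list(dict.fromkeys(['CD31', 'CKs', 'CD31']))   # -> ['CD31', 'CKs']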
# In[31]:

# Check if all columns in the markers list are present in the DataFrame
missing_columns = [col for col in markers if col not in df.columns]

if missing_columns:
    # Columns can be missing simply because those markers belong to the other slide
    print(f"The following columns are not present in the DataFrame "
          f"({len(missing_columns)} columns missing):\n{missing_columns}\n")
    # Keep only the marker columns that actually exist in the DataFrame,
    # preserving the order of the markers list
    intersected_columns = [col for col in markers if col in df.columns]
    df_markers = df[intersected_columns]
else:
    # Filter the DataFrame to keep only the columns in the markers list
    df_markers = df[markers]

initial_df_marker = df_markers
df_markers.head()


# In[32]:

# Rename CD45b into CD45 (Slide A only!)
if project_name == 'Slide_A':
    df_markers = df_markers.rename(columns={"CD45b_Cytoplasm_Intensity_Average": "CD45_Cytoplasm_Intensity_Average"})
df_markers.columns.values


# In[33]:

df_markers.shape


# In[34]:

min_values = df_markers.min().tolist()
min_values


# In[35]:

# Keep the not_intensities and markers columns: combine both lists, then
# filter the DataFrame down to the combined columns it actually contains
combined_columns = list(set(markers) | set(not_intensities))
df_markers_not_intensities = df[df.columns.intersection(combined_columns)]


# In[36]:

df_markers_not_intensities


# In[37]:

df_markers_not_intensities.shape


# ## III.5. NORMALISATION

# In[38]:

df_markers.min().tolist()


# In[39]:

'''# LOG2 TRANSFORMATION
# Values need to be higher than 0 for log2 transformation.
print("df_marker.shape before normalisation: ", df_markers.shape)
df_marker_shape_before_norm = df_markers.shape

# Option 1
# This step might not be the best approach because it creates patterns in the data:
# set anything below 0 to 0 so that we can do the log transform, then add 1 to all columns
#for f in df_markers.columns[~df_markers.columns.isin(not_intensities)]:
#    df_markers.loc[df_markers[f] < 0, f] = 0

# Option 2
# Add the minimum of the min values (from above) + 1 to all columns
#df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
#    df_markers.loc[:, ~df_markers.columns.isin(not_intensities)].copy() + 1
# OR'''


# In[40]:

# Shift every value so the global minimum becomes +1, then log2-transform
min_value = df_markers.min().min()
print("min value = ", min_value)

df_markers = df_markers + np.abs(min_value)   # shift so the global minimum is 0
df_markers = df_markers + 1                   # +1 so log2 is defined everywhere

df_after_norm = df_markers
df_marker_shape_after_norm = df_markers.shape
print("df_markers.shape after normalisation: ", df_markers.shape)

df_markers.min().tolist()

# Apply log2 to the intensity columns
df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
    np.log2(df_markers.loc[:, ~df_markers.columns.isin(not_intensities)])
print('log2 transform finished')

df_markers


# In[75]:

# Interactive normalisation preview (Panel)
pn.extension()

# Columns to exclude from the transformation, if any. Note: use a separate name
# here so we do not clobber the not_intensities list loaded in III.3.2, which
# the z-score cells below still rely on.
excluded_columns = []

# Define transformation functions
def modify(df):
    # Shift all values to be strictly positive, then log2-transform
    min_value = df.min().min()
    df = df + np.abs(min_value)
    df = df + 1
    df.loc[:, ~df.columns.isin(excluded_columns)] = \
        np.log2(df.loc[:, ~df.columns.isin(excluded_columns)])
    return df

def shift(df):
    # Log2-transform without shifting first
    df.loc[:, ~df.columns.isin(excluded_columns)] = \
        np.log2(df.loc[:, ~df.columns.isin(excluded_columns)])
    return df

# Define the panel widgets
operation = pn.widgets.RadioButtonGroup(name='Operation', options=['Modify', 'Shift'], button_type='success')

# Update the DataFrame based on the selected operation
def update_dataframe(operation):
    df = df_markers.copy()
    if operation == 'Modify':
        modified_df = modify(df)
    elif operation == 'Shift':
        modified_df = shift(df)
    return modified_df.head(30)

# Create a panel layout
layout = pn.Column(
    pn.pane.Markdown("### Data Transformation"),
    operation,
    pn.pane.Markdown("### Transformed DataFrame"),
    pn.bind(update_dataframe, operation)
)
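# To preview the widget above on its own (outside the dashboard assembled at
# the end of this notebook), Panel can serve the layout directly. Optional,
# shown for reference:
#   layout.servable()    # when launched with `panel serve`
#   pn.serve(layout)     # when run as a plain script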
#df_after_norm
df_markers.columns.tolist()

# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries; True means NaN entries
df_markers.isnull().any().any()

count_nan_in_df_markers = df_markers.isnull().sum().sum()
print(count_nan_in_df_markers)


# ## III.6. Z-SCORE TRANSFORMATION

# In[49]:

# Filter the DataFrame df to keep only the columns specified in the not_intensities list
#df = df.loc[:, not_intensities]
#df

# Check if all columns in the not_intensities list are present in the DataFrame
missing_columns = [col for col in not_intensities if col not in df.columns]

if missing_columns:
    print(f"The following columns are not present in the DataFrame "
          f"({len(missing_columns)} columns missing):\n{missing_columns}")
    # Keep only the not_intensities columns that actually exist in the DataFrame
    intersected_columns = [col for col in not_intensities if col in df.columns]
    df = df[intersected_columns]
else:
    # Filter the DataFrame to keep only the columns in the not_intensities list
    df = df.loc[:, not_intensities]
df


# In[50]:

df


# In[51]:

df_merged = df_markers.merge(df, left_index=True, right_on='ID', how='left')
df_merged


# In[52]:

df_merged.columns.tolist()


# In[53]:

# Create a copy, just in case you need to restart the kernel
df_merged_copy = df_merged.copy()


# In[54]:

# Filter the rows of df_merged on the 'Sample_ID' column: df_subset keeps only
# the rows whose 'Sample_ID' is in the list 'keep' ('TMA.csv' in this case)
keep = ['TMA.csv']
df_subset = df_merged.loc[df_merged['Sample_ID'].isin(keep), :].copy()
df_subset


# In[55]:

# Convert the DataFrame to numeric, coercing parse errors to NaN
df_numeric = df_subset.apply(pd.to_numeric, errors='coerce')

###############################
# !! This may take a while !! #
###############################
# Z-score normalisation, row by row (apply() with axis=1), performed on the
# intensity columns only: centre each row on its median and divide by its
# population standard deviation (ddof=0)
df_subset.loc[:, ~df_subset.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / row.std(ddof=0), axis=1)

# Drop columns with all NaN values (if any)
df_subset.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')

# Earlier version, kept for reference:
'''df_subset.loc[:,~df_subset.columns.isin(not_intensities)] = \
df_subset.loc[:,~df_subset.columns.isin(not_intensities)].apply(
    lambda row: (row - row.median())/(row.std(ddof=0)), axis = 1)
df_subset.dropna(how = 'all', inplace = True, axis = 1)
print('zscore rows finished')'''


# In[56]:

df_subset

df_numeric = df_merged.apply(pd.to_numeric, errors='coerce')

###############################
# !! This may take a while !! #
###############################
# Z-score the rows of the full merged DataFrame the same way (intensity columns only)
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / row.std(ddof=0), axis=1)

df_merged.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')

# Earlier version, kept for reference:
'''# Z-score the rows (apply() with axis = 1, only perform on intensity data)
###############################
# !! This may take a while !! #
###############################
df_merged.loc[:,~df_merged.columns.isin(not_intensities)] = \
df_merged.loc[:,~df_merged.columns.isin(not_intensities)].apply(
    lambda row: (row - row.median())/(row.std(ddof=0)), axis = 1)
df_merged.dropna(how = 'all', inplace = True, axis = 1)
print('zscore rows finished')'''
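# Worked example of the row-wise score used above: centre on the row median,
# scale by the population standard deviation (ddof=0).
row = pd.Series([1.0, 2.0, 4.0])
(row - row.median()) / row.std(ddof=0)   # -> approx. [-0.80, 0.00, 1.60]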
df_merged


# In[59]:

# Centre the intensity columns of df_merged on the median of the TMA reference
# subset computed above
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] - \
    df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].median()
df_merged


# In[60]:

# Scale by the standard deviation of the TMA reference subset, then keep the
# intensity columns as the final z-scored table (the original chained
# assignment is split into two statements for clarity)
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] / \
    df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].std(ddof=0)
df_merged_zscore = df_merged.loc[:, ~df_merged.columns.isin(not_intensities)]
df_merged_zscore


# In[61]:

# Check for NaN entries (there should not be any unless columns do not align)
# False means no NaN entries; True means NaN entries
df.isnull().any().any()


# In[62]:

quality_control_df = df_merged_zscore


# In[63]:

def check_index_format(index_str, ls_samples):
    """
    Check whether the given index string follows the expected
    '<sample>_<location>_<number>' format.

    Args:
        index_str (str): The index string to be checked.
        ls_samples (list): A list of valid sample file names.

    Returns:
        bool: True if the index string follows the format, False otherwise.
    """
    # Split the index string into parts
    parts = index_str.split('_')

    # There must be exactly 3 parts
    if len(parts) != 3:
        print(len(parts))
        return False

    # The first part must correspond to a sample in ls_samples
    sample_name = parts[0]
    if f'{sample_name}_bs.csv' not in ls_samples:
        print(sample_name)
        return False

    # The second part must be one of the valid locations
    location = parts[1]
    valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
    if location not in valid_locations:
        print(location)
        return False

    # The third part must be a number
    try:
        int(parts[2])
    except ValueError:
        print(parts[2])
        return False

    # All checks passed
    return True


# Let's take a look at the whole index to make sure our DataFrame is as expected
def check_format_ofindex(index):
    for index_str in index:
        if check_index_format(index_str, ls_samples) is False:
            return "Bad"
    return "Good"


# In[64]:

def quality_check(file, not_intensities):
    # Load the output file
    df = file

    # Check index
    check_index = check_format_ofindex(df.index)

    # Check shape
    check_shape = df.shape

    # Check for NaN entries
    check_no_null = df.isnull().any().any()

    # Check for rows with zero mean intensity, and drop them if present
    mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
    if (mean_intensity == 0).any():
        df = df.loc[mean_intensity > 0, :]
        print("df.shape after removing 0 mean values: ", df.shape)
        check_zero_intensities = f'Shape after removing 0 mean values: {df.shape}'
    else:
        print("No zero intensity values.")
        check_zero_intensities = "No zero intensity values."
    # Create a quality check results table
    quality_check_results_table = pd.DataFrame({
        'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
        'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
    })

    # Create a quality check results component
    quality_check_results_component = pn.Card(
        pn.pane.DataFrame(quality_check_results_table),
        title="Quality Control Results",
        header_background="#2196f3",
        header_color="white",
    )

    return quality_check_results_component


# In[76]:

# Assemble the dashboard. The DataFrames used here were all defined above:
# metadata, merged_df, initial_df_marker, df_markers_not_intensities,
# df_after_norm, df_markers, df_subset, df_merged_zscore

# Create widgets and panes
df_widget = pn.widgets.DataFrame(metadata, name="MetaData")

# Define the content of the four tabs
metadata_tab = pn.Column(
    pn.pane.Markdown("### Sample Metadata"),
    pn.pane.DataFrame(metadata.head()),
    pn.pane.Markdown("### Initial Dataframe"),
    pn.pane.DataFrame(initial_df_marker.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(initial_df_marker.shape))),
    pn.pane.Markdown("### Merged Dataframe"),
    pn.pane.DataFrame(merged_df.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(merged_df.shape))),
    pn.pane.Markdown("### Markers and not intensities Dataframe"),
    pn.pane.DataFrame(df_markers_not_intensities.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(df_markers_not_intensities.shape)))
)

normalization_tab = pn.Column(
    #pn.pane.Markdown("### Normalisation performed"),
    #pn.pane.DataFrame(df_after_norm.head()),
    #pn.Row(pn.pane.Markdown("### Shape before normalization: "), pn.pane.Markdown(str(df_marker_shape_before_norm))),
    #pn.Row(pn.pane.Markdown("### Shape after normalization: "), pn.pane.Markdown(str(df_marker_shape_after_norm))),
    #pn.pane.Markdown("### Performed log 2 transformation"),
    #pn.pane.DataFrame(df_markers.head()),
    layout
)

zscore_tab = pn.Column(
    #pn.pane.Markdown("### Performed Z-score transformation"),
    #pn.pane.DataFrame(df_subset.head(), width = 1500),
    pn.pane.Markdown("### Z-score transformation finished"),
    pn.pane.DataFrame(df_merged_zscore.head(30), width=1500)
)

quality_control_tab = pn.Column(
    pn.pane.Markdown("### Quality Control"),
    quality_check(quality_control_df, not_intensities)
)

# Create the GoldenTemplate
app3 = pn.template.GoldenTemplate(
    site="Cyc-IF",
    title="Z-Score Computation",
    main=[
        pn.Tabs(
            ("Metadata", metadata_tab),
            ("Normalization", normalization_tab),
            ("Z-Score", zscore_tab),
            ("Quality Control", quality_control_tab)
        )
    ]
)

app3.servable()

if __name__ == "__main__":
    pn.serve(app3, port=5007)
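# Usage note: running this file directly (python <script>.py) starts the
# dashboard on http://localhost:5007; alternatively, `panel serve <script>.py`
# picks up the app3.servable() call above (script name left generic on purpose).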