#!/usr/bin/env python
# coding: utf-8
import os
import random
import re
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
import subprocess
import warnings
from scipy import signal
from scipy.stats import pearsonr
import plotly.figure_factory as ff
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
from my_modules import *
import panel as pn
# Silence FutureWarnings & UserWarnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
# ## III.2. DIRECTORIES
# In[4]:
# Set base directory
##### MAC WORKSTATION #####
#base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
###########################
##### WINDOWS WORKSTATION #####
#base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
###############################
##### LOCAL WORKSTATION #####
#base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
#############################
present_dir = os.path.dirname(os.path.realpath(__file__))
input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
base_dir = input_path
#set_name = 'Set_A'
set_name = 'test'
# In[5]:
set_path = set_name
selected_metadata_files = "['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']"
ls_samples = "['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']"
print(base_dir)
print(set_path)
print(ls_samples)
print(selected_metadata_files)
project_name = set_name  # Project name
step_suffix = 'zscore'  # Current step (here, part III)
previous_step_suffix_long = "_bs"  # Previous step (here, the BS notebook)
# Initial input data directory
input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)
# ZSCORE/LOG2 output directories
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
# ZSCORE/LOG2 images subdirectory
output_images_dir = os.path.join(output_data_dir,"images")
# Data and Metadata directories
# Metadata directories
metadata_dir = os.path.join(base_dir, project_name + "_metadata")
# images subdirectory
metadata_images_dir = os.path.join(metadata_dir,"images")
# Create directories if they don't already exist
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
    if not os.path.exists(d):
        print("Creating the", d, "directory...")
        os.makedirs(d)
    else:
        print("The", d, "directory already exists!")
os.chdir(input_data_dir)
# In[7]:
# Verify paths
print('base_dir :', base_dir)
print('input_data_dir :', input_data_dir)
print('output_data_dir :', output_data_dir)
print('output_images_dir :', output_images_dir)
print('metadata_dir :', metadata_dir)
print('metadata_images_dir :', metadata_images_dir)
# ## III.3. FILES
# Don't forget to put your data in the projname_data directory!
# ### III.3.1. METADATA
# In[8]:
# Import all metadata we need from the BS chapter
# METADATA
filename = "marker_intensity_metadata.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: "+filename)
else :
print("The",filename,"file was imported for further analysis!")
# Open, read in information
metadata = pd.read_csv(filename)
# Verify size with verify_line_no() function in my_modules.py
#verify_line_no(filename, metadata.shape[0] + 1)
# Verify headers
exp_cols = ['Round','Target','Channel','target_lower','full_column','marker','localisation']
compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")
metadata = metadata.dropna()
metadata.head()
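# compare_headers is provided by my_modules; a minimal sketch of the behaviour this
# step relies on (hypothetical re-implementation under a different name, for
# illustration only; the real helper may differ):
def _compare_headers_sketch(expected, found, name):
    missing = set(expected) - set(found)
    if missing:
        print(f"WARNING: {name} is missing expected columns: {sorted(missing)}")
_compare_headers_sketch(exp_cols, metadata.columns.values, "Marker metadata file")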
# ### III.3.2. NOT_INTENSITIES
# In[9]:
filename = "not_intensities.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: "+filename)
else :
print("The",filename,"file was imported for further analysis!")
# Open, read in information
not_intensities = []
with open(filename, 'r') as fh:
not_intensities = fh.read().strip().split("\n")
# take str, strip whitespace, split on new line character
# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, len(not_intensities))
# Print to console
print("not_intensities =\n", not_intensities)
pd.DataFrame(not_intensities)
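# For reference, the not_intensities file is expected to be a plain newline-separated
# list of column names; a toy example of the parsing above (hypothetical contents):
_example_contents = "Sample_ID\nNuc_X\nCell_Size\n"
print(_example_contents.strip().split("\n"))
# -> ['Sample_ID', 'Nuc_X', 'Cell_Size']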
# ### III.3.3. FULL_TO_SHORT_COLUMN_NAMES
# In[10]:
filename = "full_to_short_column_names.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: " + filename)
else :
print("The",filename,"file was imported for further analysis!")
# Open, read in information
df = pd.read_csv(filename, header = 0)
# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)
# Turn into dictionary
full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]
# CD45 instead of CD45b
if project_name == 'Slide_A' :
full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = full_to_short_names.pop('CD45b_Cytoplasm_Intensity_Average')
full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = 'CD45_Cytoplasm'
# Print information
print('full_to_short_names =\n',full_to_short_names)
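# Illustrative sketch (not part of the pipeline): the .set_index(...).T.to_dict('records')[0]
# idiom above turns a two-column CSV into a {full_name: short_name} mapping.
# Toy input, assuming the metadata file has 'full_name' and 'short_name' columns:
_example = pd.DataFrame({'full_name': ['AR_Nucleus_Intensity_Average'],
                         'short_name': ['AR_Nucleus']})
print(_example.set_index('full_name').T.to_dict('records')[0])
# -> {'AR_Nucleus_Intensity_Average': 'AR_Nucleus'}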
# ### III.3.4. SHORT_TO_FULL_COLUMN_NAMES
# In[11]:
filename = "short_to_full_column_names.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: " + filename)
else :
print("The",filename,"file was imported for further analysis!")
# Open, read in information
df = pd.read_csv(filename, header = 0)
# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)
# Turn into dictionary
short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]
# CD45 instead of CD45b
if project_name == 'Slide_A' :
short_to_full_names['CD45_Cytoplasm'] = short_to_full_names.pop('CD45b_Cytoplasm')
short_to_full_names['CD45_Cytoplasm'] = 'CD45_Cytoplasm_Intensity_Average'
# Print information
print('short_to_full_names =\n',short_to_full_names)
# ### III.3.5. SAMPLES COLORS
# In[12]:
filename = "sample_color_data.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: " + filename)
else :
print("The",filename,"file was imported for further analysis!")
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])
# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)
# Turn into dictionary
sample_color_dict = df.set_index('Sample_ID')['rgb'].to_dict()
# Print information
print('sample_color_dict =\n',sample_color_dict)
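# rgb_tuple_from_str is provided by my_modules; a minimal sketch of the behaviour this
# step assumes (hypothetical re-implementation, for illustration only, kept under a
# different name so the real import is not shadowed):
def _rgb_tuple_from_str_sketch(rgb_str):
    # "(0.1, 0.2, 0.3)" -> (0.1, 0.2, 0.3)
    return tuple(float(x) for x in rgb_str.strip('() ').split(','))
print(_rgb_tuple_from_str_sketch('(0.1, 0.2, 0.3)'))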
# ### III.3.6. CHANNELS COLORS
# In[13]:
filename = "channel_color_data.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: "+filename)
else :
print("The",filename,"file was imported for further analysis!")
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])
# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)
# Turn into dictionary
channel_color_dict = df.set_index('Channel')['rgb'].to_dict()
# Print information
print('channel_color_dict =\n',channel_color_dict)
# ### III.3.7. ROUNDS COLORS
# In[14]:
# ROUND
filename = "round_color_data.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: "+filename)
else :
print("The",filename,"file was imported for further analysis!")
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])
# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)
# Turn into dictionary
round_color_dict = df.set_index('Round')['rgb'].to_dict()
# Print information
print('round_color_dict =\n',round_color_dict)
# ### III.3.8. CELL TYPES COLORS
# In[15]:
data = pd.read_csv(os.path.join(metadata_dir, 'celltype_color_data.csv'))
data
# In[16]:
filename = "celltype_color_data.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: "+filename)
else :
print("The",filename,"file was imported for further analysis!")
# Open, read in information
df = pd.read_csv(filename, header=0)
#df = df.drop(columns = ['hex'])
# Here the RGB values are already stored in separate 'R', 'G' and 'B' columns,
# so they just need to be zipped into (r, g, b) tuples
if all(col in df.columns for col in ['R', 'G', 'B']):
    # Create the 'rgb' column as tuples of floats
    df['rgb'] = list(zip(df['R'], df['G'], df['B']))
# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)
# Turn into dictionary
cell_type_color_dict = df.set_index('cell_type')['rgb'].to_dict()
# Print information
print('cell_type_color_dict =\n',cell_type_color_dict)
# ### III.3.9. CELL SUBTYPES COLORS
# In[17]:
df = pd.read_csv(filename)
df.head()
# In[18]:
filename = "cellsubtype_color_data.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: "+filename)
else :
print("The",filename,"file was imported for further analysis!")
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])
# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)
# Turn into dictionary
cell_subtype_color_dict = df.set_index('cell_subtype')['rgb'].to_dict()
# Print information
print('cell_subtype_color_dict =\n',cell_subtype_color_dict)
# In[19]:
df = pd.read_csv(filename)
df.head()
# ### III.3.10. IMMUNE CHECKPOINT COLORS
# In[20]:
filename = "immunecheckpoint_color_data.csv"
filename = os.path.join(metadata_dir, filename)
# Check file exists
if not os.path.exists(filename):
print("WARNING: Could not find desired file: "+filename)
else:
print("The", filename, "file was imported for further analysis!")
# Open, read in information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])
# Convert the 'rgb' column from string to tuple
df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)
# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)
# Turn into dictionary
immune_checkpoint_color_dict = df.set_index('immune_checkpoint')['rgb'].to_dict()
# Print information
print('immune_checkpoint_color_dict =\n', immune_checkpoint_color_dict)
immune_checkpoint_color_df = pd.DataFrame(immune_checkpoint_color_dict)
immune_checkpoint_color_df
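# Note: building a DataFrame from a dict of (r, g, b) tuples puts one checkpoint per
# column with three rows (r, g, b); a toy example (illustration only, hypothetical key):
print(pd.DataFrame({'PD1': (0.1, 0.2, 0.3)}))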
# ### III.3.10. DATA
# In[21]:
# DATA
# Check if the input directory exists, then list the *_bs.csv files it contains
if os.path.exists(input_data_dir):
    ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_bs.csv")]
    print("The following CSV files were detected:")
    print(ls_samples)
else:
    print(f"The directory {input_data_dir} does not exist.")
# In[22]:
# Import all the other files
dfs = {}
# Gather the expected headers from the first file in ls_samples:
# read only the first row to learn the column names
df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
expected_headers = df.columns.values
#print(expected_headers)
###############################
# !! This may take a while !! #
###############################
# Iterate over a copy of ls_samples: removing an item from a list while
# iterating over it would skip the following element
for sample in list(ls_samples):
    file_path = os.path.join(input_data_dir, sample)
    print(file_path)
    try:
        # Read the CSV file
        df = pd.read_csv(file_path, index_col=0)
        # Skip empty DataFrames instead of carrying them into the merge
        if df.empty:
            print(f'\n{sample} file is empty. Removing from analysis...')
            ls_samples.remove(sample)
            continue
        # Reorder the columns to match the expected headers list
        df = df.reindex(columns=expected_headers)
        print(sample, "file is processed!\n")
        #print(df)
        # Add df to dfs
        dfs[sample] = df
    except pd.errors.EmptyDataError:
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)
#print(dfs)
# In[23]:
# Merge dfs into one df
df = pd.concat(dfs.values(), ignore_index=False , sort = False)
del dfs
merged_df = df
# In[24]:
merged_df
# In[25]:
merged_df_shape = df.shape
# In[26]:
merged_df_index =df.index
# In[27]:
merged_df_col_values = df.columns.values
# In[28]:
# Check for NaN entries (should not be any unless columns do not align)
# False means no NaN entries
# True means NaN entries
merged_df_null_values = df.isnull().any().any()
# In[29]:
df.isnull().any().any()
# ## III.4. MARKERS
# In[30]:
# Listing all the markers of interest for downstream analyses
# !!TODO WITH MARILYNE!!
markers = [
'53BP1_Nucleus_Intensity_Average',
'AR_Nucleus_Intensity_Average',
'CCNB1_Cell_Intensity_Average',
'CCND1_Nucleus_Intensity_Average',
'CCNE_Nucleus_Intensity_Average',
'CD31_Cytoplasm_Intensity_Average',
'CKs_Cytoplasm_Intensity_Average',
'ERa_Nucleus_Intensity_Average',
'Ecad_Cytoplasm_Intensity_Average',
'GATA3_Nucleus_Intensity_Average',
'H3K27_Nucleus_Intensity_Average',
'H3K4me3_Nucleus_Intensity_Average',
'HER2_Cytoplasm_Intensity_Average',
'HSP90_Cell_Intensity_Average',
'Ki67_Nucleus_Intensity_Average',
'PAX8_Nucleus_Intensity_Average',
'PCNA_Nucleus_Intensity_Average',
'PRg_Nucleus_Intensity_Average',
'S100b_Cytoplasm_Intensity_Average',
'TP53_Cell_Intensity_Average',
'Vimentin_Cytoplasm_Intensity_Average',
'pAKT_Cytoplasm_Intensity_Average',
'pATM_Nucleus_Intensity_Average',
'pATR_Nucleus_Intensity_Average',
'pERK_Cell_Intensity_Average',
'pRB_Nucleus_Intensity_Average',
'pS6_Cytoplasm_Intensity_Average',
'AXL_Cytoplasm_Intensity_Average',
'B7H4_Cell_Intensity_Average',
'CD11c_Cytoplasm_Intensity_Average',
'CD163_Cytoplasm_Intensity_Average',
'CD20_Cytoplasm_Intensity_Average',
'CD31_Cytoplasm_Intensity_Average',
'CD44_Cytoplasm_Intensity_Average',
'CD45_Cytoplasm_Intensity_Average',
'CD45b_Cytoplasm_Intensity_Average',
'CD4_Cytoplasm_Intensity_Average',
'CD68_Cytoplasm_Intensity_Average',
'CD8_Cytoplasm_Intensity_Average',
'CKs_Cytoplasm_Intensity_Average',
'ColVI_Cytoplasm_Intensity_Average',
'Desmin_Cytoplasm_Intensity_Average',
'Ecad_Cytoplasm_Intensity_Average',
'FOXP3_Nucleus_Intensity_Average',
'Fibronectin_Cytoplasm_Intensity_Average',
'GATA3_Nucleus_Intensity_Average',
'HLA_Cytoplasm_Intensity_Average',
'Ki67_Nucleus_Intensity_Average',
'MMP9_Cytoplasm_Intensity_Average',
'PD1_Cytoplasm_Intensity_Average',
'PDGFR_Cytoplasm_Intensity_Average',
'PDL1_Cytoplasm_Intensity_Average',
'Sting_Cytoplasm_Intensity_Average',
'Vimentin_Cytoplasm_Intensity_Average',
'aSMA_Cytoplasm_Intensity_Average'
]
# Several markers appear twice in the list above (CD31, CKs, Ecad, GATA3, Ki67,
# Vimentin); deduplicate while preserving order so df[markers] cannot create
# duplicated columns
markers = list(dict.fromkeys(markers))
# In[31]:
# Check if all columns in the markers list are present in the DataFrame
missing_columns = [col for col in markers if col not in df.columns]
if missing_columns:
# If columns are missing that can be because the markers may be present in the other slide
print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \n{missing_columns}\n")
    # Keep only the markers that actually exist in the DataFrame, preserving the
    # list order (a set intersection would scramble the column order)
    intersected_columns = [col for col in markers if col in df.columns]
    df_markers = df[intersected_columns]
else:
# Filter the DataFrame to keep only the columns in the markers list
df_markers = df[markers]
initial_df_marker = df_markers
df_markers.head()
# In[32]:
# Rename CD45b into CD45 (Slide A only)
if project_name == 'Slide_A':
    # reassign rather than renaming in place on a slice of df
    df_markers = df_markers.rename(columns={"CD45b_Cytoplasm_Intensity_Average": "CD45_Cytoplasm_Intensity_Average"})
df_markers.columns.values
# In[33]:
df_markers.shape
# In[34]:
min_values = df_markers.min().tolist()
min_values
# In[35]:
# Keep not_intensities and markers columns
# Combine both lists
combined_columns = list(set(markers) | set(not_intensities))
# Filter the DataFrame to keep only the combined columns present in both df and combined_columns
df_markers_not_intensities = df[df.columns.intersection(combined_columns)]
# In[36]:
df_markers_not_intensities
# In[37]:
df_markers_not_intensities.shape
# ## III.5. NORMALISATION
# In[38]:
df_markers.min().tolist()
# In[39]:
'''# LOG2 TRANSFORMATION
# Values need to be greater than 0 for the log2 transformation.
print("df_marker.shape before normalisation: ", df_markers.shape)
df_marker_shape_before_norm = df_markers.shape
# Option 1
# This approach might not be the best one because it creates patterns in the data:
# set anything below 0 to 0 so that the log transform is possible, then add 1 to all columns
#for f in df_markers.columns[~df_markers.columns.isin(not_intensities)]:
#    df_markers.loc[df_markers[f] < 0, f] = 0
# Option 2
# Add the minimum of the min values (from above) + 1 to all columns
#df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
#    df_markers.loc[:, ~df_markers.columns.isin(not_intensities)].copy() + 1
# OR'''
# In[40]:
# Shift all values so the global minimum maps to 1, making log2 defined everywhere
# (this assumes the global minimum is <= 0; if it is already positive, the data is
# still shifted by the same recipe)
min_value = df_markers.min().min()
print("min value = ", min_value)
df_markers = df_markers + np.abs(min_value)
# +1 so the smallest value becomes 1 (log2(1) = 0)
df_markers = df_markers + 1
df_after_norm = df_markers
df_marker_shape_after_norm = df_markers.shape
print("df_markers.shape after normalisation: ", df_markers.shape)
df_markers.min().tolist()
# Apply log2
df_markers.loc[:,~df_markers.columns.isin(not_intensities)] = \
np.log2(df_markers.loc[:, ~df_markers.columns.isin(not_intensities)])
print('log2 transform finished')
df_markers
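# Worked toy example of the shift-then-log2 normalisation above (illustration only):
# with a global minimum of -3, every value is shifted by |-3| + 1 = 4 before the log2,
# so the smallest value maps to log2(1) = 0
_toy = pd.DataFrame({'m1': [-3.0, 1.0], 'm2': [0.0, 5.0]})
_toy = _toy + np.abs(_toy.min().min()) + 1
print(np.log2(_toy))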
# In[75]:
# main: interactive Panel demo of the two transformation options
pn.extension()
# Columns to exclude from the demo transformation; df_markers only contains intensity
# columns, so this can stay empty. Kept under its own name so the not_intensities list
# loaded from the metadata above is not clobbered (the original reassigned it to [])
demo_not_intensities = []
# Define transformation functions
def modify(df):
    # Shift all values positive (add |global min| + 1), then log2-transform the
    # intensity columns
    min_value = df.min().min()
    df = df + np.abs(min_value)
    df = df + 1
    df.loc[:, ~df.columns.isin(demo_not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(demo_not_intensities)])
    return df

def shift(df):
    # log2-transform only (assumes the data is already strictly positive)
    df.loc[:, ~df.columns.isin(demo_not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(demo_not_intensities)])
    return df
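# Quick check of the two options on a toy frame (illustration only): 'Modify' shifts
# all values positive before the log2; 'Shift' applies log2 directly and therefore
# assumes the data is already strictly positive
_demo = pd.DataFrame({'m1': [1.0, 3.0]})
print(modify(_demo.copy()))
print(shift(_demo.copy()))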
# Define the panel widgets
operation = pn.widgets.RadioButtonGroup(name='Operation', options=['Modify', 'Shift'], button_type='success')
# Update the DataFrame based on the selected operation
def update_dataframe(operation):
    df = df_markers.copy()
    if operation == 'Modify':
        df = modify(df)
    else:  # 'Shift'
        df = shift(df)
    return df.head(30)
# Create a panel layout
layout = pn.Column(
pn.pane.Markdown("### Data Transformation"),
operation,
pn.pane.Markdown("### Transformed DataFrame"),
pn.bind(lambda op: update_dataframe(op), operation)
)
#df_after_norm
df_markers.columns.tolist()
# Check for NaN entries (should not be any unless columns do not align)
# False means no NaN entries
# True means NaN entries
df_markers.isnull().any().any()
count_nan_in_df_markers = df_markers.isnull().sum().sum()
print(count_nan_in_df_markers)
# ## III.6. Z-SCORE TRANSFORMATION
# In[49]:
# Keep only the not_intensities (metadata/bookkeeping) columns of df; these are merged
# back onto the marker intensities below (this assumes not_intensities includes the
# 'ID' and 'Sample_ID' columns used for the merge and the subsetting)
missing_columns = [col for col in not_intensities if col not in df.columns]
if missing_columns:
    print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \n{missing_columns}")
    # Keep only the not_intensities columns that actually exist in the DataFrame
    intersected_columns = [col for col in not_intensities if col in df.columns]
    df = df[intersected_columns]
else:
    # Keep only the columns in the not_intensities list (the original selection was
    # never assigned back to df, making it a no-op)
    df = df.loc[:, not_intensities]
df
# In[50]:
df
# In[51]:
# Join the marker intensities (indexed by cell ID) back onto the bookkeeping columns,
# matching the df_markers index against the 'ID' column of df
df_merged = df_markers.merge(df, left_index=True, right_on='ID', how='left')
df_merged
# In[52]:
df_merged.columns.tolist()
# In[53]:
# Keep a real copy, in case you need to restart from this point
# (plain assignment would only create a second reference, not a copy)
df_merged_copy = df_merged.copy()
# In[54]:
# Keep only the rows of df_merged whose 'Sample_ID' is in the `keep` list
# ('TMA.csv' here): the TMA sample serves as the reference for the re-anchoring below
keep = ['TMA.csv']
df_subset = df_merged.loc[df_merged['Sample_ID'].isin(keep),:].copy()
df_subset
# In[55]:
# Convert to numeric, coercing anything non-numeric to NaN
df_numeric = df_subset.apply(pd.to_numeric, errors='coerce')
# Row-wise scaling (apply() with axis=1, intensity columns only): note that each row is
# centered on its *median*, so this is a median-centered variant of a z-score
df_subset.loc[:, ~df_subset.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / row.std(ddof=0), axis=1)
# Drop columns that are entirely NaN (if any)
df_subset.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')
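# Toy illustration of the row-wise scaling above: each row is centered on its own
# median and divided by its population standard deviation (illustration only)
_row = pd.Series([1.0, 2.0, 3.0, 10.0])
print((_row - _row.median()) / _row.std(ddof=0))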
# In[56]:
df_subset
df_numeric = df_merged.apply(pd.to_numeric, errors='coerce')
# Row-wise scaling of the full merged frame (median-centered, intensity columns only)
###############################
# !! This may take a while !! #
###############################
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / row.std(ddof=0), axis=1)
df_merged.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')
df_merged
# In[59]:
# Re-anchor every sample to the TMA reference: first subtract the TMA subset's
# per-column medians...
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] - df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].median()
df_merged
# In[60]:
# ...then divide by the TMA subset's per-column standard deviations
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] / df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].std(ddof=0)
# The original chained assignment bound df_merged_zscore to the scaled intensity
# columns only; keep that behaviour explicitly
df_merged_zscore = df_merged.loc[:, ~df_merged.columns.isin(not_intensities)]
df_merged_zscore
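# Toy illustration of the column-wise re-anchoring above: subtracting a Series of
# per-column medians and dividing by per-column stds broadcasts along columns, so
# every sample ends up expressed relative to the TMA reference (illustration only)
_all = pd.DataFrame({'m1': [1.0, 3.0, 5.0]})
_ref = pd.DataFrame({'m1': [2.0, 4.0]})
print((_all - _ref.median()) / _ref.std(ddof=0))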
# In[61]:
# Check for NaN entries (should not be any unless columns do not align)
# False means no NaN entries
# True means NaN entries
df.isnull().any().any()
# In[62]:
quality_control_df = df_merged_zscore
# In[63]:
def check_index_format(index_str, ls_samples):
"""
Checks if the given index string follows the specified format.
Args:
index_str (str): The index string to be checked.
ls_samples (list): A list of valid sample names.
Returns:
bool: True if the index string follows the format, False otherwise.
"""
# Split the index string into parts
parts = index_str.split('_')
# Check if there are exactly 3 parts
if len(parts) != 3:
print(len(parts))
return False
# Check if the first part is in ls_samples
sample_name = parts[0]
if f'{sample_name}_bs.csv' not in ls_samples:
print(sample_name)
return False
# Check if the second part is in ['cell', 'cytoplasm', 'nucleus']
location = parts[1]
valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
if location not in valid_locations:
print(location)
return False
    # Check if the third part is a number
    try:
        int(parts[2])
    except ValueError:
        print(parts[2])
        return False
# If all checks pass, return True
return True
# Check every value in the given index; report "Bad" on the first malformed entry
def check_format_ofindex(index):
    for ix in index:
        if check_index_format(ix, ls_samples) is False:
            return "Bad"
    return "Good"
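# Example usage (hypothetical index value, for illustration only): returns True only
# if 'DD3S1_bs.csv' is in ls_samples, the location is Cell/Cytoplasm/Nucleus, and the
# last part parses as an integer
print(check_index_format('DD3S1_Nucleus_42', ls_samples))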
# In[64]:
def quality_check(file, not_intensities):
# Load the output file
df = file
# Check Index
check_index = check_format_ofindex(df.index)
# Check Shape
check_shape = df.shape
# Check for NaN entries
check_no_null = df.isnull().any().any()
mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
if (mean_intensity == 0).any():
df = df.loc[mean_intensity > 0, :]
print("df.shape after removing 0 mean values: ", df.shape)
check_zero_intensities = f'Shape after removing 0 mean values: {df.shape}'
else:
print("No zero intensity values.")
check_zero_intensities = "No zero intensity values."
# Create a quality check results table
quality_check_results_table = pd.DataFrame({
'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
})
# Create a quality check results component
quality_check_results_component = pn.Card(
pn.pane.DataFrame(quality_check_results_table),
title="Quality Control Results",
header_background="#2196f3",
header_color="white",
)
return quality_check_results_component
# In[76]:
# Assuming your DataFrames are already defined as:
# metadata, merged_df, initial_df_marker, df_markers_not_intensities, df_after_norm,
# df_markers, df_subset, df_merged_zscore
# Create widgets and panes
df_widget = pn.widgets.DataFrame(metadata, name="MetaData")
# Define the three tabs content
metadata_tab = pn.Column(
    pn.pane.Markdown("### Sample Metadata"),
    pn.pane.DataFrame(metadata.head()),
    pn.pane.Markdown("### Initial Dataframe"),
    pn.pane.DataFrame(initial_df_marker.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(initial_df_marker.shape))),
    pn.pane.Markdown("### Merged Dataframe"),
    pn.pane.DataFrame(merged_df.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(merged_df.shape))),
    pn.pane.Markdown("### Markers and not intensities Dataframe"),
    pn.pane.DataFrame(df_markers_not_intensities.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "),
           pn.pane.Markdown(str(df_markers_not_intensities.shape)))
)
normalization_tab = pn.Column(
#pn.pane.Markdown("### Normalisation performed"),
#pn.pane.DataFrame(df_after_norm.head()),
#pn.Row(pn.pane.Markdown("### Shape before normalization: ")),
#pn.pane.Markdown(str(df_marker_shape_before_norm))),
#pn.Row(pn.pane.Markdown("### Shape after normalization: ")),
#pn.pane.Markdown(str(df_marker_shape_after_norm))),
#pn.pane.Markdown("### Performed log 2 transformation"),
#pn.pane.DataFrame(df_markers.head())
layout
)
zscore_tab = pn.Column(
#pn.pane.Markdown("### Performed Z-score transformation"),
#pn.pane.DataFrame(df_subset.head(), width = 1500),
pn.pane.Markdown("### Z-score transformation finished"),
pn.pane.DataFrame(df_merged_zscore.head(30), width = 1500)
)
quality_control_tab = pn.Column(
pn.pane.Markdown("### Quality Control"),
quality_check(quality_control_df, not_intensities)
)
# Create the GoldenTemplate
app3 = pn.template.GoldenTemplate(
site="Cyc-IF",
title="Z-Score Computation",
main=[
pn.Tabs(
("Metadata", metadata_tab),
("Normalization", normalization_tab),
("Z-Score", zscore_tab),
("Quality Control", quality_control_tab)
)
]
)
app3.servable()
if __name__ == "__main__":
pn.serve(app3, port=5007)