#!/usr/bin/env python
# coding: utf-8

import os
import random
import re
import subprocess
import warnings

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
from scipy import signal
from scipy.stats import pearsonr
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import panel as pn

from my_modules import *

# Silence FutureWarnings & UserWarnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
# ## III.2. DIRECTORIES

# In[4]:

# Set base directory

##### MAC WORKSTATION #####
#base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
###########################

##### WINDOWS WORKSTATION #####
#base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
###############################

##### LOCAL WORKSTATION #####
#base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
#############################

# Resolve the base directory relative to this script's location
present_dir = os.path.dirname(os.path.realpath(__file__))
input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
base_dir = input_path

#set_name = 'Set_A'
set_name = 'test'
# In[5]:

set_path = set_name
# Initial file lists (note: ls_samples is rebuilt below from the contents of
# the input directory, so these entries are only defaults)
selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']

print(base_dir)
print(set_path)
print(ls_samples)
print(selected_metadata_files)
project_name = set_name            # Project name
step_suffix = 'zscore'             # Current part (here part III)
previous_step_suffix_long = "_bs"  # Previous part (here the BS notebook)

# Initial input data directory
input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)

# ZSCORE/LOG2 output directory
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)

# ZSCORE/LOG2 images subdirectory
output_images_dir = os.path.join(output_data_dir, "images")

# Metadata directory
metadata_dir = os.path.join(base_dir, project_name + "_metadata")

# Metadata images subdirectory
metadata_images_dir = os.path.join(metadata_dir, "images")

# Create directories if they don't already exist
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
    if not os.path.exists(d):
        print("Creating the", d, "directory...")
        os.makedirs(d)
    else:
        print("The", d, "directory already exists!")
os.chdir(input_data_dir)

# In[7]:

# Verify paths
print('base_dir            :', base_dir)
print('input_data_dir      :', input_data_dir)
print('output_data_dir     :', output_data_dir)
print('output_images_dir   :', output_images_dir)
print('metadata_dir        :', metadata_dir)
print('metadata_images_dir :', metadata_images_dir)
# ## III.3. FILES

# Don't forget to put your data in the projname_data directory!

# ### III.3.1. METADATA

# In[8]:

# Import all metadata we need from the BS chapter

# METADATA
filename = "marker_intensity_metadata.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information
metadata = pd.read_csv(filename)

# Verify size with the verify_line_no() function in my_modules.py
#verify_line_no(filename, metadata.shape[0] + 1)

# Verify headers
exp_cols = ['Round', 'Target', 'Channel', 'target_lower', 'full_column', 'marker', 'localisation']
compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")

metadata = metadata.dropna()
metadata.head()
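# compare_headers() is pulled in via `from my_modules import *` and its source
# is not shown here. A minimal sketch of what such a header check might look
# like (an assumption, not the actual my_modules.py implementation):
#
# def compare_headers(expected, actual, name):
#     missing = set(expected) - set(actual)
#     if missing:
#         print(f"WARNING: {name} is missing expected columns: {sorted(missing)}")
#     else:
#         print(f"{name} contains all expected columns.")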
# ### III.3.2. NOT_INTENSITIES

# In[9]:

filename = "not_intensities.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information:
# take the file contents as a string, strip whitespace, split on newlines
not_intensities = []
with open(filename, 'r') as fh:
    not_intensities = fh.read().strip().split("\n")

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, len(not_intensities))

# Print to console
print("not_intensities =\n", not_intensities)
pd.DataFrame(not_intensities)
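# The commented-out verify_line_no() calls above and below refer to a helper
# in my_modules.py whose source is not shown. A plausible sketch (an
# assumption): it compares a file's line count against an expected value.
#
# def verify_line_no(filename, expected_count):
#     with open(filename, 'r') as fh:
#         n_lines = sum(1 for _ in fh)
#     if n_lines != expected_count:
#         print(f"WARNING: {filename} has {n_lines} lines, expected {expected_count}.")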
# ### III.3.3. FULL_TO_SHORT_COLUMN_NAMES

# In[10]:

filename = "full_to_short_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping full column names to short ones
full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]

# CD45 instead of CD45b (Slide A only)
if project_name == 'Slide_A':
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = full_to_short_names.pop('CD45b_Cytoplasm_Intensity_Average')
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = 'CD45_Cytoplasm'

# Print information
print('full_to_short_names =\n', full_to_short_names)
# ### III.3.4. SHORT_TO_FULL_COLUMN_NAMES

# In[11]:

filename = "short_to_full_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information
df = pd.read_csv(filename, header=0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping short column names to full ones
short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]

# CD45 instead of CD45b (Slide A only)
if project_name == 'Slide_A':
    short_to_full_names['CD45_Cytoplasm'] = short_to_full_names.pop('CD45b_Cytoplasm')
    short_to_full_names['CD45_Cytoplasm'] = 'CD45_Cytoplasm_Intensity_Average'

# Print information
print('short_to_full_names =\n', short_to_full_names)
# ### III.3.5. SAMPLES COLORS

# In[12]:

filename = "sample_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Our tuple of float values for rgb, (r, g, b), was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping Sample_ID to its rgb tuple
sample_color_dict = df.set_index('Sample_ID')['rgb'].to_dict()

# Print information
print('sample_color_dict =\n', sample_color_dict)
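# rgb_tuple_from_str() also comes from my_modules.py. Given the comment above,
# it presumably parses a string like '(0.1, 0.2, 0.3)' back into a tuple of
# floats; a minimal sketch under that assumption (not the actual implementation):
#
# def rgb_tuple_from_str(rgb_str):
#     return tuple(float(v) for v in rgb_str.strip('() ').split(','))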
# ### III.3.6. CHANNELS COLORS

# In[13]:

filename = "channel_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Our tuple of float values for rgb, (r, g, b), was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping Channel to its rgb tuple
channel_color_dict = df.set_index('Channel')['rgb'].to_dict()

# Print information
print('channel_color_dict =\n', channel_color_dict)
# ### III.3.7. ROUNDS COLORS

# In[14]:

# ROUND
filename = "round_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Our tuple of float values for rgb, (r, g, b), was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping Round to its rgb tuple
round_color_dict = df.set_index('Round')['rgb'].to_dict()

# Print information
print('round_color_dict =\n', round_color_dict)
# ### III.3.8. CELL TYPES COLORS

# In[15]:

data = pd.read_csv(os.path.join(metadata_dir, 'celltype_color_data.csv'))
data

# In[16]:

filename = "celltype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information
df = pd.read_csv(filename, header=0)
#df = df.drop(columns = ['hex'])

# Here the RGB values are already stored in separate 'R', 'G', 'B' columns
if all(col in df.columns for col in ['R', 'G', 'B']):
    # Create the 'rgb' column as tuples of floats
    df['rgb'] = list(zip(df['R'], df['G'], df['B']))
else:
    print("WARNING: Expected 'R', 'G', 'B' columns were not found; 'rgb' column not created.")

# The string-parsing route used for the other color files is not needed here:
#df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping cell_type to its rgb tuple
cell_type_color_dict = df.set_index('cell_type')['rgb'].to_dict()

# Print information
print('cell_type_color_dict =\n', cell_type_color_dict)
# ### III.3.9. CELL SUBTYPES COLORS

# In[17]:

# Preview the color file loaded above
df = pd.read_csv(filename)
df.head()

# In[18]:

filename = "cellsubtype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Our tuple of float values for rgb, (r, g, b), was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis=1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping cell_subtype to its rgb tuple
cell_subtype_color_dict = df.set_index('cell_subtype')['rgb'].to_dict()

# Print information
print('cell_subtype_color_dict =\n', cell_subtype_color_dict)

# In[19]:

df = pd.read_csv(filename)
df.head()
# ### III.3.10. IMMUNE CHECKPOINT COLORS

# In[20]:

filename = "immunecheckpoint_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check the file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else:
    print("The", filename, "file was imported for further analysis!")

# Open and read in the information
df = pd.read_csv(filename, header=0)
df = df.drop(columns=['hex'])

# Convert the 'rgb' column from string to tuple
df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into a dictionary mapping immune_checkpoint to its rgb tuple
immune_checkpoint_color_dict = df.set_index('immune_checkpoint')['rgb'].to_dict()

# Print information
print('immune_checkpoint_color_dict =\n', immune_checkpoint_color_dict)

immune_checkpoint_color_df = pd.DataFrame(immune_checkpoint_color_dict)
immune_checkpoint_color_df
# ### III.3.11. DATA

# In[21]:

# DATA
# Check the input directory exists, then list the background-subtracted CSV files in it
if os.path.exists(input_data_dir):
    ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_bs.csv")]
    print("The following CSV files were detected:")
    print(ls_samples)
else:
    print(f"The directory {input_data_dir} does not exist.")
# In[22]:

# Import all the other files
dfs = {}

# First, gather the expected headers from the first file in ls_samples:
# read in only the first row of that sample's CSV
df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
expected_headers = df.columns.values
#print(expected_headers)

###############################
# !! This may take a while !! #
###############################
# Iterate over a copy of the list so that samples can be removed safely mid-loop
for sample in list(ls_samples):
    file_path = os.path.join(input_data_dir, sample)
    print(file_path)
    try:
        # Read the CSV file
        df = pd.read_csv(file_path, index_col=0)
        # Only keep processing the DataFrame if it is not empty
        if not df.empty:
            # Reorder the columns to match the expected headers list
            df = df.reindex(columns=expected_headers)
            print(sample, "file is processed!\n")
            #print(df)
        # Add df to dfs (only for files that could be read)
        dfs[sample] = df
    except pd.errors.EmptyDataError:
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)

#print(dfs)
# In[23]:

# Merge the dfs into one df
df = pd.concat(dfs.values(), ignore_index=False, sort=False)
del dfs
merged_df = df

# In[24]:

merged_df

# In[25]:

merged_df_shape = df.shape

# In[26]:

merged_df_index = df.index

# In[27]:

merged_df_col_values = df.columns.values

# In[28]:

# Check for NaN entries (there should not be any unless columns do not align):
# False means no NaN entries, True means NaN entries
merged_df_null_values = df.isnull().any().any()

# In[29]:

df.isnull().any().any()
# ## III.4. MARKERS

# In[30]:

# Listing all the markers of interest for downstream analyses, each marker once
# !!TODO WITH MARILYNE!!
markers = [
    '53BP1_Nucleus_Intensity_Average',
    'AR_Nucleus_Intensity_Average',
    'CCNB1_Cell_Intensity_Average',
    'CCND1_Nucleus_Intensity_Average',
    'CCNE_Nucleus_Intensity_Average',
    'CD31_Cytoplasm_Intensity_Average',
    'CKs_Cytoplasm_Intensity_Average',
    'ERa_Nucleus_Intensity_Average',
    'Ecad_Cytoplasm_Intensity_Average',
    'GATA3_Nucleus_Intensity_Average',
    'H3K27_Nucleus_Intensity_Average',
    'H3K4me3_Nucleus_Intensity_Average',
    'HER2_Cytoplasm_Intensity_Average',
    'HSP90_Cell_Intensity_Average',
    'Ki67_Nucleus_Intensity_Average',
    'PAX8_Nucleus_Intensity_Average',
    'PCNA_Nucleus_Intensity_Average',
    'PRg_Nucleus_Intensity_Average',
    'S100b_Cytoplasm_Intensity_Average',
    'TP53_Cell_Intensity_Average',
    'Vimentin_Cytoplasm_Intensity_Average',
    'pAKT_Cytoplasm_Intensity_Average',
    'pATM_Nucleus_Intensity_Average',
    'pATR_Nucleus_Intensity_Average',
    'pERK_Cell_Intensity_Average',
    'pRB_Nucleus_Intensity_Average',
    'pS6_Cytoplasm_Intensity_Average',
    'AXL_Cytoplasm_Intensity_Average',
    'B7H4_Cell_Intensity_Average',
    'CD11c_Cytoplasm_Intensity_Average',
    'CD163_Cytoplasm_Intensity_Average',
    'CD20_Cytoplasm_Intensity_Average',
    'CD44_Cytoplasm_Intensity_Average',
    'CD45_Cytoplasm_Intensity_Average',
    'CD45b_Cytoplasm_Intensity_Average',
    'CD4_Cytoplasm_Intensity_Average',
    'CD68_Cytoplasm_Intensity_Average',
    'CD8_Cytoplasm_Intensity_Average',
    'ColVI_Cytoplasm_Intensity_Average',
    'Desmin_Cytoplasm_Intensity_Average',
    'FOXP3_Nucleus_Intensity_Average',
    'Fibronectin_Cytoplasm_Intensity_Average',
    'HLA_Cytoplasm_Intensity_Average',
    'MMP9_Cytoplasm_Intensity_Average',
    'PD1_Cytoplasm_Intensity_Average',
    'PDGFR_Cytoplasm_Intensity_Average',
    'PDL1_Cytoplasm_Intensity_Average',
    'Sting_Cytoplasm_Intensity_Average',
    'aSMA_Cytoplasm_Intensity_Average'
]
# In[31]:

# Check whether all columns in the markers list are present in the DataFrame
missing_columns = [col for col in markers if col not in df.columns]

if missing_columns:
    # Missing columns may simply belong to the other slide
    print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \n{missing_columns}\n")
    # Keep only the marker columns that actually exist in the DataFrame
    intersected_columns = list(set(markers).intersection(df.columns))
    df_markers = df[intersected_columns]
else:
    # All markers are present; keep only the columns in the markers list
    df_markers = df[markers]

initial_df_marker = df_markers
df_markers.head()
# In[32]:

# Rename CD45b to CD45 (Slide A only); assign the result rather than modifying
# in place to avoid pandas SettingWithCopy issues
if project_name == 'Slide_A':
    df_markers = df_markers.rename(columns={"CD45b_Cytoplasm_Intensity_Average": "CD45_Cytoplasm_Intensity_Average"})
df_markers.columns.values

# In[33]:

df_markers.shape

# In[34]:

min_values = df_markers.min().tolist()
min_values
# In[35]:

# Keep both the not_intensities and the marker columns
combined_columns = list(set(markers) | set(not_intensities))

# Keep only the combined columns that are actually present in df
df_markers_not_intensities = df[df.columns.intersection(combined_columns)]

# In[36]:

df_markers_not_intensities

# In[37]:

df_markers_not_intensities.shape
# ## III.5. NORMALISATION

# In[38]:

df_markers.min().tolist()

# In[39]:

'''# LOG2 TRANSFORMATION
# Values need to be higher than 0 for the log2 transformation.
print("df_marker.shape before normalisation: ", df_markers.shape)
df_marker_shape_before_norm = df_markers.shape

# Option 1
# This step might not be the best approach because it creates patterns in the data:
# set anything below 0 to 0 so that we can do the log transform, then add 1 to all columns
#for f in df_markers.columns[~df_markers.columns.isin(not_intensities)]:
    #df_markers.loc[df_markers[f] < 0,f] = 0

# Option 2
# Add the min from min_values (from above) + 1 to all columns
#df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
    #df_markers.loc[:,~df_markers.columns.isin(not_intensities)].copy() + 1
# Add the minimum value + 1 to each column
# OR'''
# In[40]:

# Option 3 (used here): shift the whole frame so every value is strictly positive
min_value = df_markers.min().min()
print("min value = ", min_value)
# Shift by |global minimum| (assumes the minimum is negative), then add 1
df_markers = df_markers + (np.abs(min_value))
df_markers = df_markers + 1
df_after_norm = df_markers
df_marker_shape_after_norm = df_markers.shape
print("df_markers.shape after normalisation: ", df_markers.shape)
df_markers.min().tolist()

# Apply log2 to the intensity columns only
df_markers.loc[:, ~df_markers.columns.isin(not_intensities)] = \
    np.log2(df_markers.loc[:, ~df_markers.columns.isin(not_intensities)])
print('log2 transform finished')
df_markers
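# Toy illustration of the shift-then-log2 transform above (live example with
# made-up values, unrelated to the pipeline data):
_toy = pd.DataFrame({'m1': [-2.0, 0.0, 6.0]})
_toy = _toy + np.abs(_toy.min().min()) + 1   # -2 -> 1, 0 -> 3, 6 -> 9
print(np.log2(_toy))                         # log2 of strictly positive values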
# In[75]:

# main
pn.extension()

# NOTE: this overrides the not_intensities list loaded from the metadata above;
# add columns to exclude from the transformation here if needed
not_intensities = []

# Define transformation functions
def modify(df):
    # Shift so all values are strictly positive, then log2-transform
    min_value = df.min().min()
    df = df + (np.abs(min_value))
    df = df + 1
    df.loc[:, ~df.columns.isin(not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(not_intensities)])
    return df

def shift(df):
    # Log2-transform without shifting first (assumes values are already positive)
    df.loc[:, ~df.columns.isin(not_intensities)] = np.log2(df.loc[:, ~df.columns.isin(not_intensities)])
    return df
# Define the panel widgets
operation = pn.widgets.RadioButtonGroup(name='Operation', options=['Modify', 'Shift'], button_type='success')

# Update the DataFrame based on the selected operation
def update_dataframe(operation):
    df = df_markers.copy()
    if operation == 'Modify':
        modified_df = modify(df)
    elif operation == 'Shift':
        modified_df = shift(df)
    return modified_df.head(30)

# Create a panel layout
layout = pn.Column(
    pn.pane.Markdown("### Data Transformation"),
    operation,
    pn.pane.Markdown("### Transformed DataFrame"),
    pn.bind(lambda op: update_dataframe(op), operation)
)

#df_after_norm
df_markers.columns.tolist()
# Check for NaN entries (there should not be any unless columns do not align):
# False means no NaN entries, True means NaN entries
df_markers.isnull().any().any()

count_nan_in_df_markers = df_markers.isnull().sum().sum()
print(count_nan_in_df_markers)
# ## III.6. Z-SCORE TRANSFORMATION

# In[49]:

# Filter the DataFrame df to keep only the columns specified in the not_intensities list
#df = df.loc[:, not_intensities]
#df

# Check whether all columns in the not_intensities list are present in the DataFrame
missing_columns = [col for col in not_intensities if col not in df.columns]

if missing_columns:
    print(f"The following columns are not present in the DataFrame ({len(missing_columns)} columns missing): \n{missing_columns}")
    # Keep only the not_intensities columns that actually exist in the DataFrame
    intersected_columns = list(set(not_intensities).intersection(df.columns))
    df = df[intersected_columns]
else:
    # All not_intensities columns are present. Note: as written, the line below
    # is a no-op (its result is not assigned), so df is left unchanged here
    df.loc[:, not_intensities]
df
# In[50]:

df

# In[51]:

df_merged = df_markers.merge(df, left_index=True, right_on='ID', how='left')
df_merged

# In[52]:

df_merged.columns.tolist()

# In[53]:

# Create an actual copy, just in case you need to restart the kernel
df_merged_copy = df_merged.copy()

# In[54]:

# Filter the rows of df_merged on the 'Sample_ID' column:
# df_subset contains only the rows whose 'Sample_ID' is in the list 'keep' ('TMA.csv' here)
keep = ['TMA.csv']
df_subset = df_merged.loc[df_merged['Sample_ID'].isin(keep), :].copy()
df_subset
# In[55]:

# Convert the DataFrame to numeric, coercing errors to NaN
df_numeric = df_subset.apply(pd.to_numeric, errors='coerce')

# Z-score the rows (apply() with axis=1, on intensity columns only).
# Note this is a median-centred variant: (value - row median) / row std
df_subset.loc[:, ~df_subset.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / row.std(ddof=0), axis=1)

# Drop columns that are entirely NaN (if any)
df_subset.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')

###############################
# !! This may take a while !! #
###############################
'''df_subset.loc[:,~df_subset.columns.isin(not_intensities)] = \
    df_subset.loc[:,~df_subset.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median())/(row.std(ddof=0)), axis = 1)
df_subset.dropna(how = 'all', inplace = True, axis = 1)
print('zscore rows finished')'''
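# Toy illustration of the row-wise, median-centred z-score used above
# (live example with made-up values):
_row = pd.Series([1.0, 2.0, 4.0])
print((_row - _row.median()) / _row.std(ddof=0))  # centre on the median, scale by population std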
# In[56]:

df_subset

df_numeric = df_merged.apply(pd.to_numeric, errors='coerce')

# Z-score the rows (apply() with axis=1, on intensity columns only)
###############################
# !! This may take a while !! #
###############################
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_numeric.loc[:, ~df_numeric.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median()) / (row.std(ddof=0)), axis=1)
df_merged.dropna(how='all', inplace=True, axis=1)
print('zscore rows finished')

'''# Z-score the rows (apply() with axis = 1, only perform on intensity data)
###############################
# !! This may take a while !! #
###############################
df_merged.loc[:,~df_merged.columns.isin(not_intensities)] = \
    df_merged.loc[:,~df_merged.columns.isin(not_intensities)].apply(
        lambda row: (row - row.median())/(row.std(ddof=0)), axis = 1)
df_merged.dropna(how = 'all', inplace = True, axis = 1)
print('zscore rows finished')'''

df_merged
# In[59]:

# Centre the intensity columns of df_merged on the TMA reference's column medians
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = \
    df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] - df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].median()
df_merged

# In[60]:

# Scale by the TMA reference's column standard deviations; keep the scaled
# intensity frame as df_merged_zscore and write it back into df_merged
df_merged_zscore = df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] / \
    df_subset.loc[:, ~df_subset.columns.isin(not_intensities)].std(ddof=0)
df_merged.loc[:, ~df_merged.columns.isin(not_intensities)] = df_merged_zscore
df_merged_zscore
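# Toy illustration of referencing to the TMA subset (live example with made-up
# values; column-wise: centre on the reference median, scale by its std):
_ref = pd.DataFrame({'m1': [1.0, 3.0, 5.0]})    # stand-in for df_subset
_full = pd.DataFrame({'m1': [2.0, 4.0, 6.0]})   # stand-in for df_merged
print((_full - _ref.median()) / _ref.std(ddof=0))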
# In[61]:

# Check for NaN entries (there should not be any unless columns do not align):
# False means no NaN entries, True means NaN entries
df.isnull().any().any()

# In[62]:

quality_control_df = df_merged_zscore
# In[63]:

def check_index_format(index_str, ls_samples):
    """
    Check if the given index string follows the expected format.

    Args:
        index_str (str): The index string to be checked.
        ls_samples (list): A list of valid sample names.

    Returns:
        bool: True if the index string follows the format, False otherwise.
    """
    # Split the index string into parts
    parts = index_str.split('_')

    # Check if there are exactly 3 parts
    if len(parts) != 3:
        print(len(parts))
        return False

    # Check if the first part is in ls_samples
    sample_name = parts[0]
    if f'{sample_name}_bs.csv' not in ls_samples:
        print(sample_name)
        return False

    # Check if the second part is one of 'Cell', 'Cytoplasm', 'Nucleus'
    location = parts[1]
    valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
    if location not in valid_locations:
        print(location)
        return False

    # Check if the third part is a number
    try:
        index = int(parts[2])
    except ValueError:
        print(parts[2])
        return False

    # If all checks pass, return True
    return True

# Let's take a look at a few features to make sure our dataframe is as expected
def check_format_ofindex(index):
    # Note: this walks the global df's index rather than the argument
    for idx in df.index:
        check_index = check_index_format(idx, ls_samples)
        if check_index is False:
            index_format = "Bad"
            return index_format
    index_format = "Good"
    return index_format
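# Example of what the index check expects (hypothetical index values, assuming
# 'TMA_bs.csv' is among the detected samples):
# check_index_format('TMA_Nucleus_42', ls_samples)   -> True
# check_index_format('TMA_membrane_42', ls_samples)  -> False (invalid location part)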
# In[64]:

def quality_check(file, not_intensities):
    # Load the output file
    df = file

    # Check index
    check_index = check_format_ofindex(df.index)

    # Check shape
    check_shape = df.shape

    # Check for NaN entries
    check_no_null = df.isnull().any().any()

    # Remove rows whose mean intensity (over intensity columns) is zero
    mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
    if (mean_intensity == 0).any():
        df = df.loc[mean_intensity > 0, :]
        print("df.shape after removing 0 mean values: ", df.shape)
        check_zero_intensities = f'Shape after removing 0 mean values: {df.shape}'
    else:
        print("No zero intensity values.")
        check_zero_intensities = "No zero intensity values."

    # Create a quality check results table
    quality_check_results_table = pd.DataFrame({
        'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
        'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
    })

    # Create a quality check results component
    quality_check_results_component = pn.Card(
        pn.pane.DataFrame(quality_check_results_table),
        title="Quality Control Results",
        header_background="#2196f3",
        header_color="white",
    )

    return quality_check_results_component
# In[76]:

# Assuming your DataFrames are already defined as:
# metadata, merged_df, initial_df_marker, df_markers_not_intensities,
# df_after_norm, df_markers, df_subset, df_merged_zscore

# Create widgets and panes
df_widget = pn.widgets.DataFrame(metadata, name="MetaData")

# Define the tab contents
metadata_tab = pn.Column(
    pn.pane.Markdown("### Sample Metadata"),
    pn.pane.DataFrame(metadata.head()),
    pn.pane.Markdown("### Initial Dataframe"),
    pn.pane.DataFrame(initial_df_marker.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(initial_df_marker.shape))),
    pn.pane.Markdown("### Merged Dataframe"),
    pn.pane.DataFrame(merged_df.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "), pn.pane.Markdown(str(merged_df.shape))),
    pn.pane.Markdown("### Markers and not_intensities Dataframe"),
    pn.pane.DataFrame(df_markers_not_intensities.head(), width=1500),
    pn.Row(pn.pane.Markdown("### Shape: "),
           pn.pane.Markdown(str(df_markers_not_intensities.shape)))
)
normalization_tab = pn.Column(
    #pn.pane.Markdown("### Normalisation performed"),
    #pn.pane.DataFrame(df_after_norm.head()),
    #pn.Row(pn.pane.Markdown("### Shape before normalization: "),
    #       pn.pane.Markdown(str(df_marker_shape_before_norm))),
    #pn.Row(pn.pane.Markdown("### Shape after normalization: "),
    #       pn.pane.Markdown(str(df_marker_shape_after_norm))),
    #pn.pane.Markdown("### Performed log 2 transformation"),
    #pn.pane.DataFrame(df_markers.head())
    layout
)

zscore_tab = pn.Column(
    #pn.pane.Markdown("### Performed Z-score transformation"),
    #pn.pane.DataFrame(df_subset.head(), width = 1500),
    pn.pane.Markdown("### Z-score transformation finished"),
    pn.pane.DataFrame(df_merged_zscore.head(30), width=1500)
)

quality_control_tab = pn.Column(
    pn.pane.Markdown("### Quality Control"),
    quality_check(quality_control_df, not_intensities)
)
# Create the GoldenTemplate
app3 = pn.template.GoldenTemplate(
    site="Cyc-IF",
    title="Z-Score Computation",
    main=[
        pn.Tabs(
            ("Metadata", metadata_tab),
            ("Normalization", normalization_tab),
            ("Z-Score", zscore_tab),
            ("Quality Control", quality_control_tab)
        )
    ]
)
app3.servable()

if __name__ == "__main__":
    pn.serve(app3, port=5007)
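# To launch the dashboard: run this script directly (it serves on port 5007
# via pn.serve), or use Panel's CLI, e.g. `panel serve <this_script>.py`
# (assuming Panel is installed in the environment).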