# V. CELLS QUANTIFICATION AND CLASSIFICATION

## V.1. PACKAGES IMPORT

In [1]:
import os
import random
import re
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
import plotly.graph_objects as go
import warnings
import plotly.express as px

from scipy.stats.stats import pearsonr

from my_modules import *

  from scipy.stats.stats import pearsonr


In [2]:
#Silence FutureWarnings & UserWarnings
warnings.filterwarnings('ignore', category= FutureWarning)
warnings.filterwarnings('ignore', category= UserWarning)

## V.2. *DIRECTORIES

In [5]:
# Set base directory

##### MAC WORKSTATION #####
#base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
###########################

##### WINDOWS WORKSTATION #####
#base_dir = r'C:\Users\LaboLabrie\gerz2701\Cyc-IF_pipeline\Set_B'
###############################

##### LOCAL WORKSTATION #####
base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
#############################

#set_name = 'Set_A'
set_name = 'test'

In [6]:
project_name = set_name                 # Project name
step_suffix = 'cqc'                     # Curent part (here part V)
previous_step_suffix_long = "_mt"       # Previous part (here MT NOTEBOOK)

# Initial input data directory
input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long) 

# BS output directories
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
# BS images subdirectory
output_images_dir = os.path.join(output_data_dir,"images")

# Data and Metadata directories
# Metadata directories
metadata_dir = os.path.join(base_dir, project_name + "_metadata")
# images subdirectory
metadata_images_dir = os.path.join(metadata_dir,"images")

# Create directories if they don't already exist
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
    if not os.path.exists(d):
        print("Creation of the" , d, "directory...")
        os.makedirs(d)
    else :
        print("The", d, "directory already exists !")

os.chdir(input_data_dir)

The /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431 directory already exists !
The /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_mt directory already exists !
Creation of the /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_cqc directory...
Creation of the /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_cqc/images directory...
The /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata directory already exists !
The /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata/images directory already exists !


In [7]:
# Verify paths
print('base_dir :', base_dir)
print('input_data_dir :', input_data_dir)
print('output_data_dir :', output_data_dir)
print('output_images_dir :', output_images_dir)
print('metadata_dir :', metadata_dir)
print('metadata_images_dir :', metadata_images_dir)

base_dir : /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431
input_data_dir : /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_mt
output_data_dir : /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_cqc
output_images_dir : /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_cqc/images
metadata_dir : /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata
metadata_images_dir : /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata/images


## V.3. FILES

### V.3.1. METADATA

In [8]:
# Import all metadata we need from the BS chapter

filename = "marker_intensity_metadata.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)
else :
    print("The",filename,"file was imported for further analysis!")

# Open, read in information
metadata = pd.read_csv(filename)

# Verify size with verify_line_no() function in my_modules.py
#verify_line_no(filename, metadata.shape[0] + 1)

# Verify headers
exp_cols = ['Round','Target','Channel','target_lower','full_column','marker','localisation']
compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")

metadata = metadata.dropna()
metadata.head()

The /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata/marker_intensity_metadata.csv file was imported for further analysis!
['Exp']


Unnamed: 0,Round,Target,Exp,Channel,target_lower,full_column,marker,localisation
0,R0,AF488,300,c2,af488,AF488_Cell_Intensity_Average,AF488,cell
1,R0,AF488,300,c2,af488,AF488_Cytoplasm_Intensity_Average,AF488,cytoplasm
2,R0,AF488,300,c2,af488,AF488_Nucleus_Intensity_Average,AF488,nucleus
3,R0,AF555,1500,c3,af555,AF555_Cell_Intensity_Average,AF555,cell
4,R0,AF555,1500,c3,af555,AF555_Cytoplasm_Intensity_Average,AF555,cytoplasm


### V.3.2. NOT_INTENSITIES

In [9]:
filename = "not_intensities.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)
else :
    print("The",filename,"file was imported for further analysis!")
    
# Open, read in information
not_intensities = []
with open(filename, 'r') as fh:
    not_intensities = fh.read().strip().split("\n")
    # take str, strip whitespace, split on new line character

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, len(not_intensities))

# Print to console
print("not_intensities =\n", not_intensities)

The /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata/not_intensities.csv file was imported for further analysis!
Verifying data read from file is the correct length...

not_intensities =
 ['Cytoplasm_Size', 'Nuc_X', 'Primary_chem(1)_vs_surg(0)', 'cluster', 'immune_checkpoint', 'Sample_ID', 'Nucleus_Roundness', 'Unique_ROI_index', 'Nuc_Y', 'Nuc_X_Inv', 'Cell_ID', 'cell_subtype', 'ID', 'Nuc_Y_Inv', 'Patient', 'replicate_ID', 'cell_type', 'ROI_index', 'Cell_Size', 'Nucleus_Size']


### V.3.3. FULL_TO_SHORT_COLUMN_NAMES

In [10]:
filename = "full_to_short_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else :
    print("The",filename,"file was imported for further analysis!")
    
# Open, read in information
df = pd.read_csv(filename, header = 0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]

# CD45 instead of CD45b
if project_name == 'Slide_A' :
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = full_to_short_names.pop('CD45b_Cytoplasm_Intensity_Average')
    full_to_short_names['CD45_Cytoplasm_Intensity_Average'] = 'CD45_Cytoplasm'

# Print information
print('full_to_short_names =\n',full_to_short_names)

The /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata/full_to_short_column_names.csv file was imported for further analysis!
Verifying data read from file is the correct length...

full_to_short_names =
 {'AF488_Cell_Intensity_Average': 'AF488_Cell', 'AF488_Cytoplasm_Intensity_Average': 'AF488_Cytoplasm', 'AF488_Nucleus_Intensity_Average': 'AF488_Nucleus', 'AF555_Cell_Intensity_Average': 'AF555_Cell', 'AF555_Cytoplasm_Intensity_Average': 'AF555_Cytoplasm', 'AF555_Nucleus_Intensity_Average': 'AF555_Nucleus', 'AF647_Cell_Intensity_Average': 'AF647_Cell', 'AF647_Cytoplasm_Intensity_Average': 'AF647_Cytoplasm', 'AF647_Nucleus_Intensity_Average': 'AF647_Nucleus', 'AF750_Cell_Intensity_Average': 'AF750_Cell', 'AF750_Cytoplasm_Intensity_Average': 'AF750_Cytoplasm', 'AF750_Nucleus_Intensity_Average': 'AF750_Nucleus', 'aSMA_Cell_Intensity_Average': 'aSMA_Cell', 'aSMA_Cytoplasm_Intensity_Average': 'aSMA_Cytoplasm', 'aSMA_Nucleus_Intensity_Average': 'aSMA_Nucleu

### V.3.4. SHORT_TO_FULL_COLUMN_NAMES

In [11]:
filename = "short_to_full_column_names.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else :
    print("The",filename,"file was imported for further analysis!")
    
# Open, read in information
df = pd.read_csv(filename, header = 0)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]

# CD45 instead of CD45b
if project_name == 'Slide_A' :
    short_to_full_names['CD45_Cytoplasm'] = short_to_full_names.pop('CD45b_Cytoplasm')
    short_to_full_names['CD45_Cytoplasm'] = 'CD45_Cytoplasm_Intensity_Average'

# Print information
print('short_to_full_names =\n',short_to_full_names)

The /Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata/short_to_full_column_names.csv file was imported for further analysis!
Verifying data read from file is the correct length...

short_to_full_names =
 {'AF488_Cell': 'AF488_Cell_Intensity_Average', 'AF488_Cytoplasm': 'AF488_Cytoplasm_Intensity_Average', 'AF488_Nucleus': 'AF488_Nucleus_Intensity_Average', 'AF555_Cell': 'AF555_Cell_Intensity_Average', 'AF555_Cytoplasm': 'AF555_Cytoplasm_Intensity_Average', 'AF555_Nucleus': 'AF555_Nucleus_Intensity_Average', 'AF647_Cell': 'AF647_Cell_Intensity_Average', 'AF647_Cytoplasm': 'AF647_Cytoplasm_Intensity_Average', 'AF647_Nucleus': 'AF647_Nucleus_Intensity_Average', 'AF750_Cell': 'AF750_Cell_Intensity_Average', 'AF750_Cytoplasm': 'AF750_Cytoplasm_Intensity_Average', 'AF750_Nucleus': 'AF750_Nucleus_Intensity_Average', 'aSMA_Cell': 'aSMA_Cell_Intensity_Average', 'aSMA_Cytoplasm': 'aSMA_Cytoplasm_Intensity_Average', 'aSMA_Nucleus': 'aSMA_Nucleus_Intensity_Averag

### V.3.5. SAMPLES COLORS

In [12]:
filename = "sample_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else :
    print("The",filename,"file was imported for further analysis!")
    
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in 
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
sample_color_dict = df.set_index('Sample_ID').T.to_dict('rgb')[0]

# Print information
print('sample_color_dict =\n',sample_color_dict)



FileNotFoundError: [Errno 2] No such file or directory: '/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/test_metadata/sample_color_data.csv'

### V.3.6. CHANNELS COLORS

In [12]:
filename = "channel_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)
else :
    print("The",filename,"file was imported for further analysis!")
    
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in 
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
channel_color_dict = df.set_index('Channel').T.to_dict('rgb')[0]

# Print information
print('channel_color_dict =\n',channel_color_dict)

The C:\Users\zoege\Documents\Temp\Cyc-IF_pipeline\Set_B\Set_B_metadata\channel_color_data.csv file was imported for further analysis!
Verifying data read from file is the correct length...

channel_color_dict =
 {'c2': (0.00784313725490196, 0.24313725490196078, 1.0), 'c3': (1.0, 0.48627450980392156, 0.0), 'c4': (0.10196078431372549, 0.788235294117647, 0.2196078431372549), 'c5': (0.9098039215686274, 0.0, 0.043137254901960784)}


### V.3.7. ROUNDS COLORS

In [13]:
filename = "round_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)
else :
    print("The",filename,"file was imported for further analysis!")
    
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in 
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
round_color_dict = df.set_index('Round').T.to_dict('rgb')[0]

# Print information
print('round_color_dict =\n',round_color_dict)

The C:\Users\zoege\Documents\Temp\Cyc-IF_pipeline\Set_B\Set_B_metadata\round_color_data.csv file was imported for further analysis!
Verifying data read from file is the correct length...

round_color_dict =
 {'R0': (0.28685356234627135, 0.13009829239513535, 0.23110332132624437), 'R1': (0.36541462435986094, 0.2025447048359916, 0.37693310021636883), 'R2': (0.40867533458903105, 0.2940761173840091, 0.5166711878800253), 'R3': (0.42890613750051265, 0.4082290173220481, 0.6335348887063806), 'R4': (0.4444462906865238, 0.5264664993764805, 0.7056321892616532), 'R5': (0.47707206309601013, 0.6427061780374552, 0.7418477948908153), 'R6': (0.5414454866716836, 0.7466759172596551, 0.7572905778378964), 'R7': (0.6414710091647722, 0.8321551072276492, 0.7746773027952071), 'R8': (0.7684256891219349, 0.8992667116749021, 0.8171383269422353)}


### V.3.8. CELL TYPES COLORS

In [14]:
filename = "celltype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)
else :
    print("The",filename,"file was imported for further analysis!")
    
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in 
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
cell_type_color_dict = df.set_index('cell_type').T.to_dict('rgb')[0]

# Print information
print('cell_type_color_dict =\n',cell_type_color_dict)

The C:\Users\zoege\Documents\Temp\Cyc-IF_pipeline\Set_B\Set_B_metadata\celltype_color_data.csv file was imported for further analysis!
Verifying data read from file is the correct length...

cell_type_color_dict =
 {'CANCER': (0.1333, 0.5451, 0.1333), 'STROMA': (0.4, 0.4, 0.4), 'IMMUNE': (1.0, 1.0, 0.0), 'ENDOTHELIAL': (0.502, 0.0, 0.502)}


### V.3.9. CELL SUBTYPES COLORS

In [15]:
filename = "cellsubtype_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)
else :
    print("The",filename,"file was imported for further analysis!")
    
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in 
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
cell_subtype_color_dict = df.set_index('cell_subtype').T.to_dict('rgb')[0]

# Print information
print('cell_subtype_color_dict =\n',cell_subtype_color_dict)  

The C:\Users\zoege\Documents\Temp\Cyc-IF_pipeline\Set_B\Set_B_metadata\cellsubtype_color_data.csv file was imported for further analysis!
Verifying data read from file is the correct length...

cell_subtype_color_dict =
 {'DC': (0.6509803921568628, 0.807843137254902, 0.8901960784313725), 'B': (0.12156862745098039, 0.47058823529411764, 0.7058823529411765), 'TCD4': (0.6980392156862745, 0.8745098039215686, 0.5411764705882353), 'TCD8': (0.2, 0.6274509803921569, 0.17254901960784313), 'M1': (0.984313725490196, 0.6039215686274509, 0.6), 'M2': (0.8901960784313725, 0.10196078431372549, 0.10980392156862745), 'Treg': (0.9921568627450981, 0.7490196078431373, 0.43529411764705883), 'IMMUNE_OTHER': (1.0, 0.4980392156862745, 0.0), 'CANCER': (0.792156862745098, 0.6980392156862745, 0.8392156862745098), 'αSMA_myCAF': (0.41568627450980394, 0.23921568627450981, 0.6039215686274509), 'STROMA_OTHER': (1.0, 1.0, 0.6), 'ENDOTHELIAL': (0.6941176470588235, 0.34901960784313724, 0.1568627450980392)}


### V.4.10. IMMUNE CHECKPOINT COLORS

In [16]:
filename = "immunecheckpoint_color_data.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)
else :
    print("The",filename,"file was imported for further analysis!")
    
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in 
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)

# Verify size
print("Verifying data read from file is the correct length...\n")
#verify_line_no(filename, df.shape[0] + 1)

# Turn into dictionary
immune_checkpoint_color_dict = df.set_index('immune_checkpoint').T.to_dict('rgb')[0]

# Print information
print('immune_checkpoint_color_dict =\n',immune_checkpoint_color_dict)

The C:\Users\zoege\Documents\Temp\Cyc-IF_pipeline\Set_B\Set_B_metadata\immunecheckpoint_color_data.csv file was imported for further analysis!
Verifying data read from file is the correct length...

immune_checkpoint_color_dict =
 {'B7H4': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701), 'PDL1': (0.3126890019504329, 0.6928754610296064, 0.1923704830330379), 'PD1': (0.23299120924703914, 0.639586552066035, 0.9260706093977744), 'B7H4_PDL1': (0.6402432806212122, 0.56707501056059, 0.36409039926945397), 'B7H4_PD1': (0.6003943842695152, 0.5404305560788043, 0.7309404624518223), 'PDL1_PD1': (0.27284010559873606, 0.6662310065478207, 0.5592205462154062), 'B7H4_PDL1_PD1': (0.5044925901631545, 0.5912455243957383, 0.5514171359788941), 'None': (0.8, 0.8, 0.8)}


### V.3.11. DATA

In [17]:
# DATA
# List files in the directory
# Check if the directory exists
if os.path.exists(input_data_dir):
    # List files in the directory
    ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_mt.csv")]
    print("The following CSV files were detected:")
    print([sample for sample in ls_samples])
else:
    print(f"The directory {input_data_dir} does not exist.")

The following CSV files were detected:
['DD3S1_mt.csv', 'DD3S2_mt.csv', 'DD3S3_mt.csv', 'DD4S1_mt.csv', 'DD4S2_mt.csv', 'DD4S3_mt.csv', 'DD5S1_mt.csv', 'DD5S2_mt.csv', 'DD5S3_mt.csv', 'TMA_mt.csv']


In [18]:
# Import all the others files
dfs = {}

# Set variable to hold default header values
# First gather information on expected headers using first file in ls_samples
# Read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]) , index_col = 0, nrows = 1)
expected_headers = df.columns.values
print(expected_headers)

###############################
# !! This may take a while !! #
###############################
for sample in ls_samples:
    file_path = os.path.join(input_data_dir,sample)
   
    try:
        # Read the CSV file
        df = pd.read_csv(file_path, index_col=0)
        # Check if the DataFrame is empty, if so, don't continue trying to process df and remove it
        
        if not df.empty:
            # Reorder the columns to match the expected headers list
            df = df.reindex(columns=expected_headers)
            print(sample, "file is processed !\n")
            #print(df) 
   
    except pd.errors.EmptyDataError:
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)      
    
    # Add df to dfs 
    dfs[sample] = df

#print(dfs)

['PDL1_Cytoplasm_Intensity_Average' 'HLA_Cytoplasm_Intensity_Average'
 'CKs_Cytoplasm_Intensity_Average' 'Ki67_Nucleus_Intensity_Average'
 'CD163_Cytoplasm_Intensity_Average' 'ColVI_Cytoplasm_Intensity_Average'
 'CD20_Cytoplasm_Intensity_Average' 'PD1_Cytoplasm_Intensity_Average'
 'AXL_Cytoplasm_Intensity_Average' 'CD31_Cytoplasm_Intensity_Average'
 'Fibronectin_Cytoplasm_Intensity_Average'
 'CD45_Cytoplasm_Intensity_Average' 'Ecad_Cytoplasm_Intensity_Average'
 'CD8_Cytoplasm_Intensity_Average' 'GATA3_Nucleus_Intensity_Average'
 'Sting_Cytoplasm_Intensity_Average' 'aSMA_Cytoplasm_Intensity_Average'
 'FOXP3_Nucleus_Intensity_Average' 'CD11c_Cytoplasm_Intensity_Average'
 'CD4_Cytoplasm_Intensity_Average' 'Vimentin_Cytoplasm_Intensity_Average'
 'CD44_Cytoplasm_Intensity_Average' 'CD68_Cytoplasm_Intensity_Average'
 'PDGFR_Cytoplasm_Intensity_Average' 'B7H4_Cell_Intensity_Average'
 'MMP9_Cytoplasm_Intensity_Average' 'Desmin_Cytoplasm_Intensity_Average'
 'cell_subtype' 'cell_type' 'Nuc_Y_Inv

In [19]:
# Merge dfs into one df
df = pd.concat(dfs.values(), ignore_index=False , sort = False)
del dfs

print(df.head())

              PDL1_Cytoplasm_Intensity_Average  \
ID                                               
DD3S1_Cell_0                         -0.677863   
DD3S1_Cell_1                         -0.677863   
DD3S1_Cell_2                         -0.677863   
DD3S1_Cell_3                         -0.741282   
DD3S1_Cell_6                         -0.621521   

              HLA_Cytoplasm_Intensity_Average  \
ID                                              
DD3S1_Cell_0                        -0.417494   
DD3S1_Cell_1                        -0.516487   
DD3S1_Cell_2                        -0.141921   
DD3S1_Cell_3                        -0.460472   
DD3S1_Cell_6                        -0.247254   

              CKs_Cytoplasm_Intensity_Average  Ki67_Nucleus_Intensity_Average  \
ID                                                                              
DD3S1_Cell_0                        -0.912537                       -0.817876   
DD3S1_Cell_1                        -0.838037                 

In [20]:
df.shape

(704629, 37)

In [21]:
df.index 

Index(['DD3S1_Cell_0', 'DD3S1_Cell_1', 'DD3S1_Cell_2', 'DD3S1_Cell_3',
       'DD3S1_Cell_6', 'DD3S1_Cell_7', 'DD3S1_Cell_8', 'DD3S1_Cell_10',
       'DD3S1_Cell_11', 'DD3S1_Cell_12',
       ...
       'TMA_Cell_115750', 'TMA_Cell_115751', 'TMA_Cell_115752',
       'TMA_Cell_115753', 'TMA_Cell_115754', 'TMA_Cell_115755',
       'TMA_Cell_115756', 'TMA_Cell_115757', 'TMA_Cell_115758',
       'TMA_Cell_115760'],
      dtype='object', name='ID', length=704629)

In [22]:
df.columns.values

array(['PDL1_Cytoplasm_Intensity_Average',
       'HLA_Cytoplasm_Intensity_Average',
       'CKs_Cytoplasm_Intensity_Average',
       'Ki67_Nucleus_Intensity_Average',
       'CD163_Cytoplasm_Intensity_Average',
       'ColVI_Cytoplasm_Intensity_Average',
       'CD20_Cytoplasm_Intensity_Average',
       'PD1_Cytoplasm_Intensity_Average',
       'AXL_Cytoplasm_Intensity_Average',
       'CD31_Cytoplasm_Intensity_Average',
       'Fibronectin_Cytoplasm_Intensity_Average',
       'CD45_Cytoplasm_Intensity_Average',
       'Ecad_Cytoplasm_Intensity_Average',
       'CD8_Cytoplasm_Intensity_Average',
       'GATA3_Nucleus_Intensity_Average',
       'Sting_Cytoplasm_Intensity_Average',
       'aSMA_Cytoplasm_Intensity_Average',
       'FOXP3_Nucleus_Intensity_Average',
       'CD11c_Cytoplasm_Intensity_Average',
       'CD4_Cytoplasm_Intensity_Average',
       'Vimentin_Cytoplasm_Intensity_Average',
       'CD44_Cytoplasm_Intensity_Average',
       'CD68_Cytoplasm_Intensity_Average',
      

In [23]:
# Check for NaN entries (should not be any unless columns do not align)
# False means no NaN entries 
# True means NaN entries 
df.isnull().any().any()

False

### V.3.12. UNIQUE ROIs

In [24]:
# UNIQUE ROIs
filename = f'{project_name}_unique_ROIs.csv'
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else :
    print("The",filename,"file was imported for further analysis!")

The C:\Users\zoege\Documents\Temp\Cyc-IF_pipeline\Set_B\Set_B_metadata\Set_B_unique_ROIs.csv file was imported for further analysis!


In [25]:
# Open, read in information
Unique_ROIs = pd.read_csv(filename,delimiter=';')

# Verify headers
exp_cols = ['Sample_ID', 'ROI_index','Patient','Unique_ROI_index']
compare_headers(exp_cols, Unique_ROIs.columns.values, "Unique_ROIs file")

Unique_ROIs = Unique_ROIs.dropna()
Unique_ROIs

Unnamed: 0,Sample_ID,ROI_index,Patient,Unique_ROI_index
0,DD3S1.csv,0,61,61a
1,DD3S1.csv,1,62,62a
2,DD3S1.csv,2,63,63a
3,DD3S1.csv,3,59,59a
4,DD3S1.csv,4,60,60a
...,...,...,...,...
462,TMA.csv,55,c55,c55a
463,TMA.csv,56,c56,c56a
464,TMA.csv,57,c57,c57a
465,TMA.csv,58,c58,c58a


In [26]:
# Jointure des deux dataframes sur la colonne 'Unique_ROI_index'
df1 = df.merge(Unique_ROIs[['Sample_ID', 'ROI_index', 'Patient', 'Unique_ROI_index']], on=['Sample_ID', 'ROI_index'], how='left')

# Set the index of df1 to be the same as the index of df
df1.set_index(df.index, inplace=True)

# Vérification du nouveau dataframe
display(df1)

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nuc_Y_Inv,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DD3S1_Cell_0,-0.677863,-0.417494,-0.912537,-0.817876,0.930099,0.232078,-0.483158,1.535604,0.807339,1.167755,...,16632.205078,0.955040,1484.771729,0,127,DD3S1.csv,,339,61,61a
DD3S1_Cell_1,-0.677863,-0.516487,-0.838037,-0.869685,1.114924,0.301333,-0.344770,1.668368,0.875455,1.643023,...,16627.384766,0.966643,1426.250000,0,112,DD3S1.csv,,344,61,61a
DD3S1_Cell_2,-0.677863,-0.141921,-1.016023,-0.755879,0.834577,0.259216,-0.438292,1.336308,0.705088,1.053636,...,16622.238281,0.721534,1531.110474,0,181,DD3S1.csv,,422,61,61a
DD3S1_Cell_3,-0.741282,-0.460472,-0.491711,-0.818084,0.648200,0.107027,-0.444889,1.249805,0.660707,1.165861,...,16623.007812,0.587196,1518.907593,0,119,DD3S1.csv,,278,61,61a
DD3S1_Cell_6,-0.621521,-0.247254,-0.867127,-0.742544,0.810579,0.272128,-0.507117,1.251434,0.947172,2.545301,...,16619.978516,0.935716,1471.914917,0,47,DD3S1.csv,,204,61,61a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TMA_Cell_115755,0.478275,0.558670,-0.962840,1.732291,0.507434,-0.912641,0.311322,0.816068,0.596520,0.090397,...,2663.253418,0.982196,15564.458008,59,142,TMA.csv,,386,c59,c59a
TMA_Cell_115756,0.297418,0.420594,-0.971632,1.966955,0.304365,-1.164112,0.866636,-0.092857,-0.241830,-0.617835,...,2661.765869,0.775977,15629.680664,59,47,TMA.csv,,270,c59,c59a
TMA_Cell_115757,0.346950,0.453951,-0.602893,1.338956,0.559435,-0.801333,0.447061,0.988156,1.567869,0.403878,...,2657.015625,0.688747,15518.421875,59,64,TMA.csv,,202,c59,c59a
TMA_Cell_115758,-0.189415,0.508840,-0.886041,0.647980,-0.227224,-1.022549,-0.099256,0.219755,0.603715,-0.219145,...,2660.258545,0.751402,15539.275391,59,58,TMA.csv,,182,c59,c59a


In [27]:
# Resetting the index temporarily for checking for duplicates
# ID + Unique_ROI_index should not return any duplicate
df_temp = df1.copy()
df_temp.reset_index(inplace=True)
df_temp.rename(columns={'index': 'old_index'}, inplace=True)

# Check for duplicates
duplicates = df_temp[df_temp.duplicated(['ID', 'Unique_ROI_index'], keep=False)]
if not duplicates.empty:
    print("Duplicates found:")
    print(duplicates)
else:
    print("No duplicates found.")

No duplicates found.


### V.3.13. NACT/ACT

In [28]:
filename = "TMA_Clinical_Data_187-OC.csv"
filename = os.path.join(metadata_dir, filename)

# Check file exists
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)
else :
    print("The",filename,"file was imported for further analysis!")

The C:\Users\zoege\Documents\Temp\Cyc-IF_pipeline\Set_B\Set_B_metadata\TMA_Clinical_Data_187-OC.csv file was imported for further analysis!


In [29]:
# Open, read in information
clinical_data = pd.read_csv(filename,delimiter=',')

# Renaming columns using the `rename` function
clinical_data = clinical_data.rename(columns={'ID': 'Patient'})
clinical_data['Patient'] = clinical_data['Patient'].astype(str)
df1['Patient'] = df1['Patient'].astype(str)

# Display the first few rows to check the DataFrame
display(clinical_data)

Unnamed: 0,Patient,%_tumor,%_necrosis,Age_Diagnosis,BMI,CA125,Race,Other_Cancers,BRCAStatus,Primary_chem(1)_vs_surg(0),...,Histo_Type,Optimal_Debulking,Residual_Disease,Platinum_sensitive,Days_surgery_to_recurrence,Avastin,Disease_Stat,Days_from_surgery_to_last_contact_or_death,Recurrence,Vital_Status
0,1,80,5,72.0,35.0,,C,breast,0,0,...,HGSOC,0.0,1.0,1.0,506.0,1.0,DOD,1776,1.0,1.0
1,2,50,0,36.0,24.0,,C,breast,1,0,...,HGSOC,1.0,0.0,1.0,669.0,1.0,DOD,2223,1.0,1.0
2,3,80,5,55.0,45.0,944.0,C,0,,0,...,HGSOC,1.0,0.0,1.0,596.0,0.0,DOD,1355,1.0,1.0
3,4,80,5,59.0,26.0,50.0,C,0,0,0,...,HGtubal,1.0,0.0,,,,,1783,,
4,5,50,0,70.0,32.0,1426.0,C,0,,1,...,CCC,,0.0,1.0,377.0,1.0,DOD,977,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,183,70,5,72.0,22.0,597.0,C,breast,0,1,...,HGSOC,1.0,0.0,1.0,,,DOD,1036,,1.0
183,184,90,0,72.0,26.0,3788.0,C,breast,,1,...,HGSOC,1.0,0.0,1.0,282.0,0.0,,468,1.0,
184,185,90,0,70.0,29.0,1330.0,C,0,,0,...,HGSOC,1.0,0.0,,,,DOD,1020,,1.0
185,186,80,0,84.0,17.0,564.0,C,squamouscellcarcinoma,,0,...,HGSOC,1.0,0.0,,,,DOC,1869,0.0,1.0


In [30]:
print(clinical_data.columns)
print(clinical_data.shape)

Index(['Patient', '%_tumor', '%_necrosis', 'Age_Diagnosis', 'BMI', 'CA125',
       'Race', 'Other_Cancers', 'BRCAStatus', 'Primary_chem(1)_vs_surg(0)',
       'Diagnosis_to_Start_Chemo', 'Diagnosis_to_surgery', 'Stage',
       'Histo_Type', 'Optimal_Debulking', 'Residual_Disease',
       'Platinum_sensitive', 'Days_surgery_to_recurrence', 'Avastin',
       'Disease_Stat', 'Days_from_surgery_to_last_contact_or_death',
       'Recurrence', 'Vital_Status'],
      dtype='object')
(187, 23)


In [31]:
print(df1.columns)
print(df1.shape)

Index(['PDL1_Cytoplasm_Intensity_Average', 'HLA_Cytoplasm_Intensity_Average',
       'CKs_Cytoplasm_Intensity_Average', 'Ki67_Nucleus_Intensity_Average',
       'CD163_Cytoplasm_Intensity_Average',
       'ColVI_Cytoplasm_Intensity_Average', 'CD20_Cytoplasm_Intensity_Average',
       'PD1_Cytoplasm_Intensity_Average', 'AXL_Cytoplasm_Intensity_Average',
       'CD31_Cytoplasm_Intensity_Average',
       'Fibronectin_Cytoplasm_Intensity_Average',
       'CD45_Cytoplasm_Intensity_Average', 'Ecad_Cytoplasm_Intensity_Average',
       'CD8_Cytoplasm_Intensity_Average', 'GATA3_Nucleus_Intensity_Average',
       'Sting_Cytoplasm_Intensity_Average', 'aSMA_Cytoplasm_Intensity_Average',
       'FOXP3_Nucleus_Intensity_Average', 'CD11c_Cytoplasm_Intensity_Average',
       'CD4_Cytoplasm_Intensity_Average',
       'Vimentin_Cytoplasm_Intensity_Average',
       'CD44_Cytoplasm_Intensity_Average', 'CD68_Cytoplasm_Intensity_Average',
       'PDGFR_Cytoplasm_Intensity_Average', 'B7H4_Cell_Intensity_Aver

In [32]:
# Display data types of columns in df1 and NACT
print(df1.dtypes)
print('')
print(clinical_data.dtypes)

PDL1_Cytoplasm_Intensity_Average           float64
HLA_Cytoplasm_Intensity_Average            float64
CKs_Cytoplasm_Intensity_Average            float64
Ki67_Nucleus_Intensity_Average             float64
CD163_Cytoplasm_Intensity_Average          float64
ColVI_Cytoplasm_Intensity_Average          float64
CD20_Cytoplasm_Intensity_Average           float64
PD1_Cytoplasm_Intensity_Average            float64
AXL_Cytoplasm_Intensity_Average            float64
CD31_Cytoplasm_Intensity_Average           float64
Fibronectin_Cytoplasm_Intensity_Average    float64
CD45_Cytoplasm_Intensity_Average           float64
Ecad_Cytoplasm_Intensity_Average           float64
CD8_Cytoplasm_Intensity_Average            float64
GATA3_Nucleus_Intensity_Average            float64
Sting_Cytoplasm_Intensity_Average          float64
aSMA_Cytoplasm_Intensity_Average           float64
FOXP3_Nucleus_Intensity_Average            float64
CD11c_Cytoplasm_Intensity_Average          float64
CD4_Cytoplasm_Intensity_Average

In [33]:
# Liste des valeurs uniques dans la colonne 'Patient' de chaque DataFrame
patients_df1 = set(df1['Patient'].unique())
patients_clinical_data = set(clinical_data['Patient'].unique())

# Patients présents dans df1 mais absents dans clinical_data
patients_unique_to_df1 = patients_df1 - patients_clinical_data

# Patients présents dans NACT mais absents dans df1
patients_unique_to_clinical_data = patients_clinical_data - patients_df1

# Affichage des patients uniques dans chaque DataFrame
print("Patients uniques dans df1 mais absents dans clinical_data:")
print(patients_unique_to_df1)

# no segmented cores?
print("\nPatients uniques dans clinical_data mais absents dans df1:")
print(patients_unique_to_clinical_data)

Patients uniques dans df1 mais absents dans clinical_data:
{'c37', 'c50', 'c38', 'c6', 'c52', 'c55', 'c13', 'c0', 'c25', 'c51', 'c19', 'c42', 'c57', 'c41', 'c56', 'c40', 'c18', 'c21', 'c48', 'c49', 'c1', 'c26', 'c23', 'c39', 'c15', 'c45', 'c34', 'c58', 'c43', 'c32', 'c8', 'c9', 'c4', 'c35', 'c14', 'c27', 'c7', 'c20', 'c2', 'c12', 'c16', 'c46', 'c3', 'c31', 'c28', 'c53', 'c24', 'c59', 'c5', 'c22', 'c11', 'c54', 'c44', 'c17', 'c30', 'c29', 'c10', 'c47', 'c33', 'c36'}

Patients uniques dans clinical_data mais absents dans df1:
{'138', '2', '93', '151', '31', '145', '163', '1', '118', '76'}


In [34]:
# DataFrames fuson based on 'Patient' col
df2 = pd.merge(df1, clinical_data[['Patient', 'Primary_chem(1)_vs_surg(0)']], on='Patient', how='left')

# Set the index of df1 to be the same as the index of df
df2.set_index(df.index, inplace=True)

print(df2.columns)
print(df2.shape)
df2

Index(['PDL1_Cytoplasm_Intensity_Average', 'HLA_Cytoplasm_Intensity_Average',
       'CKs_Cytoplasm_Intensity_Average', 'Ki67_Nucleus_Intensity_Average',
       'CD163_Cytoplasm_Intensity_Average',
       'ColVI_Cytoplasm_Intensity_Average', 'CD20_Cytoplasm_Intensity_Average',
       'PD1_Cytoplasm_Intensity_Average', 'AXL_Cytoplasm_Intensity_Average',
       'CD31_Cytoplasm_Intensity_Average',
       'Fibronectin_Cytoplasm_Intensity_Average',
       'CD45_Cytoplasm_Intensity_Average', 'Ecad_Cytoplasm_Intensity_Average',
       'CD8_Cytoplasm_Intensity_Average', 'GATA3_Nucleus_Intensity_Average',
       'Sting_Cytoplasm_Intensity_Average', 'aSMA_Cytoplasm_Intensity_Average',
       'FOXP3_Nucleus_Intensity_Average', 'CD11c_Cytoplasm_Intensity_Average',
       'CD4_Cytoplasm_Intensity_Average',
       'Vimentin_Cytoplasm_Intensity_Average',
       'CD44_Cytoplasm_Intensity_Average', 'CD68_Cytoplasm_Intensity_Average',
       'PDGFR_Cytoplasm_Intensity_Average', 'B7H4_Cell_Intensity_Aver

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index,Primary_chem(1)_vs_surg(0)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DD3S1_Cell_0,-0.677863,-0.417494,-0.912537,-0.817876,0.930099,0.232078,-0.483158,1.535604,0.807339,1.167755,...,0.955040,1484.771729,0,127,DD3S1.csv,,339,61,61a,1.0
DD3S1_Cell_1,-0.677863,-0.516487,-0.838037,-0.869685,1.114924,0.301333,-0.344770,1.668368,0.875455,1.643023,...,0.966643,1426.250000,0,112,DD3S1.csv,,344,61,61a,1.0
DD3S1_Cell_2,-0.677863,-0.141921,-1.016023,-0.755879,0.834577,0.259216,-0.438292,1.336308,0.705088,1.053636,...,0.721534,1531.110474,0,181,DD3S1.csv,,422,61,61a,1.0
DD3S1_Cell_3,-0.741282,-0.460472,-0.491711,-0.818084,0.648200,0.107027,-0.444889,1.249805,0.660707,1.165861,...,0.587196,1518.907593,0,119,DD3S1.csv,,278,61,61a,1.0
DD3S1_Cell_6,-0.621521,-0.247254,-0.867127,-0.742544,0.810579,0.272128,-0.507117,1.251434,0.947172,2.545301,...,0.935716,1471.914917,0,47,DD3S1.csv,,204,61,61a,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TMA_Cell_115755,0.478275,0.558670,-0.962840,1.732291,0.507434,-0.912641,0.311322,0.816068,0.596520,0.090397,...,0.982196,15564.458008,59,142,TMA.csv,,386,c59,c59a,
TMA_Cell_115756,0.297418,0.420594,-0.971632,1.966955,0.304365,-1.164112,0.866636,-0.092857,-0.241830,-0.617835,...,0.775977,15629.680664,59,47,TMA.csv,,270,c59,c59a,
TMA_Cell_115757,0.346950,0.453951,-0.602893,1.338956,0.559435,-0.801333,0.447061,0.988156,1.567869,0.403878,...,0.688747,15518.421875,59,64,TMA.csv,,202,c59,c59a,
TMA_Cell_115758,-0.189415,0.508840,-0.886041,0.647980,-0.227224,-1.022549,-0.099256,0.219755,0.603715,-0.219145,...,0.751402,15539.275391,59,58,TMA.csv,,182,c59,c59a,


In [35]:
#NACT rows
clinical_data[clinical_data['Primary_chem(1)_vs_surg(0)'] == 1]

Unnamed: 0,Patient,%_tumor,%_necrosis,Age_Diagnosis,BMI,CA125,Race,Other_Cancers,BRCAStatus,Primary_chem(1)_vs_surg(0),...,Histo_Type,Optimal_Debulking,Residual_Disease,Platinum_sensitive,Days_surgery_to_recurrence,Avastin,Disease_Stat,Days_from_surgery_to_last_contact_or_death,Recurrence,Vital_Status
4,5,50,0,70.0,32.0,1426.0,C,0,,1,...,CCC,,0.0,1.0,377.0,1.0,DOD,977,1.0,1.0
6,7,80,5,63.0,29.0,10965.0,C,0,,1,...,HGSOC,,0.0,0.0,516.0,0.0,DOD,814,1.0,1.0
10,11,90,0,58.0,20.0,72.0,C,0,,1,...,Signetringcell,,0.0,0.0,203.0,0.0,DOD,263,1.0,1.0
14,15,60,0,77.0,35.0,183.0,C,0,,1,...,HGSOC,1.0,0.0,1.0,460.0,0.0,NED,2585,1.0,0.0
17,18,90,0,75.0,25.0,55.0,C,0,0.0,1,...,HGSOC,1.0,0.0,1.0,359.0,1.0,DOD,439,1.0,1.0
25,26,80,0,69.0,29.0,1907.0,C,0,0.0,1,...,HGSOC,1.0,0.0,1.0,,0.0,DOD,1085,0.0,1.0
30,31,70,0,62.0,26.0,11785.0,C,0,,1,...,HGSOC,1.0,0.0,1.0,,1.0,DOD,328,0.0,1.0
31,32,90,5,75.0,41.0,4532.0,C,0,,1,...,HGSOC,1.0,0.0,1.0,1157.0,0.0,AWD,1864,1.0,0.0
39,40,70,5,64.0,,8393.0,C,,,1,...,HGSOC,1.0,0.0,1.0,,0.0,,131,0.0,
50,51,90,0,55.0,33.0,1000.0,C,renalcell,,1,...,HGSOC,1.0,0.0,1.0,2103.0,0.0,,2124,1.0,


In [36]:
# Go through 'Primary_chem(1)_vs_surg(0)' col to replace NaN values by 'CONTROL' if 'c' in 'Patient' col
df2.loc[(df2['Primary_chem(1)_vs_surg(0)'].isnull()) & (df2['Patient'].astype(str).str.contains('c|^c')), 'Primary_chem(1)_vs_surg(0)'] = 'CONTROL'

df2

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index,Primary_chem(1)_vs_surg(0)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DD3S1_Cell_0,-0.677863,-0.417494,-0.912537,-0.817876,0.930099,0.232078,-0.483158,1.535604,0.807339,1.167755,...,0.955040,1484.771729,0,127,DD3S1.csv,,339,61,61a,1.0
DD3S1_Cell_1,-0.677863,-0.516487,-0.838037,-0.869685,1.114924,0.301333,-0.344770,1.668368,0.875455,1.643023,...,0.966643,1426.250000,0,112,DD3S1.csv,,344,61,61a,1.0
DD3S1_Cell_2,-0.677863,-0.141921,-1.016023,-0.755879,0.834577,0.259216,-0.438292,1.336308,0.705088,1.053636,...,0.721534,1531.110474,0,181,DD3S1.csv,,422,61,61a,1.0
DD3S1_Cell_3,-0.741282,-0.460472,-0.491711,-0.818084,0.648200,0.107027,-0.444889,1.249805,0.660707,1.165861,...,0.587196,1518.907593,0,119,DD3S1.csv,,278,61,61a,1.0
DD3S1_Cell_6,-0.621521,-0.247254,-0.867127,-0.742544,0.810579,0.272128,-0.507117,1.251434,0.947172,2.545301,...,0.935716,1471.914917,0,47,DD3S1.csv,,204,61,61a,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TMA_Cell_115755,0.478275,0.558670,-0.962840,1.732291,0.507434,-0.912641,0.311322,0.816068,0.596520,0.090397,...,0.982196,15564.458008,59,142,TMA.csv,,386,c59,c59a,CONTROL
TMA_Cell_115756,0.297418,0.420594,-0.971632,1.966955,0.304365,-1.164112,0.866636,-0.092857,-0.241830,-0.617835,...,0.775977,15629.680664,59,47,TMA.csv,,270,c59,c59a,CONTROL
TMA_Cell_115757,0.346950,0.453951,-0.602893,1.338956,0.559435,-0.801333,0.447061,0.988156,1.567869,0.403878,...,0.688747,15518.421875,59,64,TMA.csv,,202,c59,c59a,CONTROL
TMA_Cell_115758,-0.189415,0.508840,-0.886041,0.647980,-0.227224,-1.022549,-0.099256,0.219755,0.603715,-0.219145,...,0.751402,15539.275391,59,58,TMA.csv,,182,c59,c59a,CONTROL


In [37]:
# supposed empty
df2[df2['Primary_chem(1)_vs_surg(0)'].isna()]

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index,Primary_chem(1)_vs_surg(0)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [38]:
# Création de trois DataFrames en fonction des valeurs dans 'Primary_chem(1)_vs_surg(0)'
df_NACT = df2[df2['Primary_chem(1)_vs_surg(0)'] == 1]
df_ACT = df2[df2['Primary_chem(1)_vs_surg(0)'] == 0]
df_control = df2[df2['Primary_chem(1)_vs_surg(0)'] == 'CONTROL']

filename_NACT = os.path.join(output_data_dir, 'df_NACT.csv')
filename_ACT = os.path.join(output_data_dir, 'df_ACT.csv')
filename_control = os.path.join(output_data_dir, 'df_control.csv')

# Enregistrement des DataFrames dans des fichiers CSV dans le répertoire output_data_dir
df_NACT.to_csv(filename_NACT, index=False)
df_ACT.to_csv(filename_ACT, index=False)
df_control.to_csv(filename_control, index=False)

In [39]:
df_NACT

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index,Primary_chem(1)_vs_surg(0)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DD3S1_Cell_0,-0.677863,-0.417494,-0.912537,-0.817876,0.930099,0.232078,-0.483158,1.535604,0.807339,1.167755,...,0.955040,1484.771729,0,127,DD3S1.csv,,339,61,61a,1.0
DD3S1_Cell_1,-0.677863,-0.516487,-0.838037,-0.869685,1.114924,0.301333,-0.344770,1.668368,0.875455,1.643023,...,0.966643,1426.250000,0,112,DD3S1.csv,,344,61,61a,1.0
DD3S1_Cell_2,-0.677863,-0.141921,-1.016023,-0.755879,0.834577,0.259216,-0.438292,1.336308,0.705088,1.053636,...,0.721534,1531.110474,0,181,DD3S1.csv,,422,61,61a,1.0
DD3S1_Cell_3,-0.741282,-0.460472,-0.491711,-0.818084,0.648200,0.107027,-0.444889,1.249805,0.660707,1.165861,...,0.587196,1518.907593,0,119,DD3S1.csv,,278,61,61a,1.0
DD3S1_Cell_6,-0.621521,-0.247254,-0.867127,-0.742544,0.810579,0.272128,-0.507117,1.251434,0.947172,2.545301,...,0.935716,1471.914917,0,47,DD3S1.csv,,204,61,61a,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DD5S3_Cell_62826,-0.446025,0.480212,1.195440,-0.081463,-0.634343,-0.883815,1.050928,0.164709,0.103986,0.171579,...,0.609764,13449.426758,34,103,DD5S3.csv,,237,187,187c,1.0
DD5S3_Cell_62828,-0.414440,0.416668,1.542926,-0.203248,-0.076555,-0.783759,0.423346,0.638305,0.346970,0.453372,...,0.736844,13469.353516,34,65,DD5S3.csv,,231,187,187c,1.0
DD5S3_Cell_62829,-0.720569,0.072144,-0.311962,-0.481442,1.236053,-0.468813,0.252353,2.129042,1.111809,1.340371,...,0.691315,13390.447266,34,85,DD5S3.csv,,224,187,187c,1.0
DD5S3_Cell_62830,-0.442500,0.368654,-1.491754,-0.316270,0.029502,-0.736522,0.390883,0.861894,0.461685,0.586409,...,0.892707,13430.178711,34,56,DD5S3.csv,,139,187,187c,1.0


In [40]:
df_ACT

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index,Primary_chem(1)_vs_surg(0)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DD3S1_Cell_1910,-1.137775,-0.495306,-1.178944,-0.832994,1.067765,1.432729,-0.220835,1.952302,1.021130,1.235209,...,0.873320,1143.897461,1,78,DD3S1.csv,,265,62,62a,0.0
DD3S1_Cell_1911,-1.399789,0.791106,0.567668,0.215155,0.518692,-0.627981,-0.000923,1.375652,0.725274,0.945061,...,0.566933,1396.333252,1,93,DD3S1.csv,,210,62,62a,0.0
DD3S1_Cell_1922,-1.186023,-0.520883,-1.104365,-0.894190,0.988875,1.417141,-0.328588,1.869449,0.978622,1.185912,...,0.886386,1148.870972,1,62,DD3S1.csv,,196,62,62a,0.0
DD3S1_Cell_1928,-0.329397,0.396547,1.746212,1.313160,1.139536,-0.546020,1.586974,1.763598,0.924314,1.122929,...,0.738159,1097.823486,1,51,DD3S1.csv,,159,62,62a,0.0
DD3S1_Cell_1942,-0.970875,-0.126315,1.072148,0.178276,1.103388,-0.498249,-0.026061,1.989714,1.040325,1.257470,...,0.890113,1402.000000,1,114,DD3S1.csv,,205,62,62a,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DD5S3_Cell_64060,-0.677863,0.887158,-0.528183,0.525142,5.217188,-1.300569,-0.775098,-1.455787,-0.908090,-0.862636,...,0.830379,13503.915039,35,106,DD5S3.csv,,378,186,186a,0.0
DD5S3_Cell_64061,0.479994,0.415456,0.819658,0.623611,2.082444,-1.053888,1.671235,0.310590,-0.309031,-0.235059,...,0.646098,13254.210938,35,90,DD5S3.csv,,151,186,186a,0.0
DD5S3_Cell_64062,0.660736,0.440965,1.409010,0.406828,2.082444,-0.849359,1.622354,1.128603,0.187663,0.290273,...,0.566233,13260.901367,35,61,DD5S3.csv,,146,186,186a,0.0
DD5S3_Cell_64063,0.232880,0.505008,0.798055,1.721604,0.625211,-1.022122,0.833309,-0.489938,-0.231888,-0.208742,...,0.788226,13292.144531,35,76,DD5S3.csv,,326,186,186a,0.0


In [41]:
df_control

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index,Primary_chem(1)_vs_surg(0)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TMA_Cell_4,-0.367050,3.386458,-2.000166,-0.459972,2.082444,-0.331127,0.074576,0.367998,0.208285,2.426724,...,0.639497,1603.610596,0,131,TMA.csv,,307,c0,c0a,CONTROL
TMA_Cell_300,-0.738040,2.894731,-2.027276,-0.536334,2.463664,0.615413,0.434008,0.145870,0.094320,0.796244,...,0.557669,1870.953491,0,86,TMA.csv,,271,c0,c0a,CONTROL
TMA_Cell_1231,-0.166333,3.870697,-1.953719,-0.289915,2.276577,-0.760463,0.856060,0.748574,0.403544,1.869273,...,0.909491,1076.486938,0,115,TMA.csv,,215,c0,c0a,CONTROL
TMA_Cell_1240,-0.293766,2.246166,-1.945873,-0.607328,2.082444,-0.615345,0.243531,0.812863,0.436529,2.194035,...,0.945592,1089.709717,0,93,TMA.csv,,198,c0,c0a,CONTROL
TMA_Cell_2107,-0.677863,2.130819,-2.035940,-0.746582,1.538367,1.246304,0.179943,0.074879,0.057898,0.966884,...,0.664589,1375.939697,0,116,TMA.csv,,167,c0,c0a,CONTROL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TMA_Cell_115755,0.478275,0.558670,-0.962840,1.732291,0.507434,-0.912641,0.311322,0.816068,0.596520,0.090397,...,0.982196,15564.458008,59,142,TMA.csv,,386,c59,c59a,CONTROL
TMA_Cell_115756,0.297418,0.420594,-0.971632,1.966955,0.304365,-1.164112,0.866636,-0.092857,-0.241830,-0.617835,...,0.775977,15629.680664,59,47,TMA.csv,,270,c59,c59a,CONTROL
TMA_Cell_115757,0.346950,0.453951,-0.602893,1.338956,0.559435,-0.801333,0.447061,0.988156,1.567869,0.403878,...,0.688747,15518.421875,59,64,TMA.csv,,202,c59,c59a,CONTROL
TMA_Cell_115758,-0.189415,0.508840,-0.886041,0.647980,-0.227224,-1.022549,-0.099256,0.219755,0.603715,-0.219145,...,0.751402,15539.275391,59,58,TMA.csv,,182,c59,c59a,CONTROL


## V.4. COUNTS

In [42]:
df = df2.copy()

In [43]:
# Get counts for each Sample_ID, sorted by Sample_ID
sample_counts = pd.DataFrame(df.Sample_ID.value_counts()).sort_index()
sample_counts = sample_counts.rename(columns = {'Sample_ID':'counts'})
sample_counts['Sample_ID'] = sample_counts.index
#counts['color'] = counts.apply(lambda row: color_dict[row['Sample_ID']], axis = 1)
sample_counts

Unnamed: 0,counts,Sample_ID
DD3S1.csv,68697,DD3S1.csv
DD3S2.csv,70850,DD3S2.csv
DD3S3.csv,116265,DD3S3.csv
DD4S1.csv,70748,DD4S1.csv
DD4S2.csv,51745,DD4S2.csv
DD4S3.csv,70818,DD4S3.csv
DD5S1.csv,69463,DD5S1.csv
DD5S2.csv,45403,DD5S2.csv
DD5S3.csv,45898,DD5S3.csv
TMA.csv,94742,TMA.csv


### V.4.1. CELL TYPES

In [44]:
# Count by cell type
stroma_counts = pd.DataFrame({'stroma':
    df.loc[
        df['cell_type'] == 'STROMA',:].Sample_ID.value_counts()}).sort_index()

immune_counts = pd.DataFrame({'immune':
    df.loc[
        df['cell_type'] == 'IMMUNE',:].Sample_ID.value_counts()}).sort_index()

cancer_counts = pd.DataFrame({'cancer':
    df.loc[
        df['cell_type'] == 'CANCER',:].Sample_ID.value_counts()}).sort_index()

endothelial_counts = pd.DataFrame({'endothelial':
    df.loc[
        df['cell_type'] == 'ENDOTHELIAL',:].Sample_ID.value_counts()}).sort_index()

counts = pd.concat([sample_counts, stroma_counts,cancer_counts,immune_counts,endothelial_counts], 
                   axis = 1, sort = False)

counts = counts.fillna(0)
print(counts)

filename = os.path.join(output_data_dir , project_name + "_cell_types_number.csv")
counts.to_csv(filename, index = False)

           counts  Sample_ID  stroma  cancer  immune  endothelial
DD3S1.csv   68697  DD3S1.csv   10690   48981    7360         1666
DD3S2.csv   70850  DD3S2.csv   12064   51059    5977         1750
DD3S3.csv  116265  DD3S3.csv   23412   77000   13075         2778
DD4S1.csv   70748  DD4S1.csv    7249   54013    7433         2053
DD4S2.csv   51745  DD4S2.csv    6193   39470    4850         1232
DD4S3.csv   70818  DD4S3.csv    8516   50598   10331         1373
DD5S1.csv   69463  DD5S1.csv   17067   43723    5878         2795
DD5S2.csv   45403  DD5S2.csv   11098   30839    2131         1335
DD5S3.csv   45898  DD5S3.csv    9018   30060    5656         1164
TMA.csv     94742    TMA.csv   23121   59978    9356         2287


In [45]:
# Compute %
# get_perc() function in my_modules.py

counts['stroma_perc'] = counts.apply(lambda row: get_perc(row, 'stroma'), axis = 1)
counts['immune_perc'] = counts.apply(lambda row: get_perc(row, 'immune'), axis = 1)
counts['cancer_perc'] = counts.apply(lambda row: get_perc(row, 'cancer'), axis = 1)
counts['endothelial_perc'] = counts.apply(lambda row: get_perc(row, 'endothelial'), axis = 1)

counts

Unnamed: 0,counts,Sample_ID,stroma,cancer,immune,endothelial,stroma_perc,immune_perc,cancer_perc,endothelial_perc
DD3S1.csv,68697,DD3S1.csv,10690,48981,7360,1666,15.6,10.7,71.3,2.4
DD3S2.csv,70850,DD3S2.csv,12064,51059,5977,1750,17.0,8.4,72.1,2.5
DD3S3.csv,116265,DD3S3.csv,23412,77000,13075,2778,20.1,11.2,66.2,2.4
DD4S1.csv,70748,DD4S1.csv,7249,54013,7433,2053,10.2,10.5,76.3,2.9
DD4S2.csv,51745,DD4S2.csv,6193,39470,4850,1232,12.0,9.4,76.3,2.4
DD4S3.csv,70818,DD4S3.csv,8516,50598,10331,1373,12.0,14.6,71.4,1.9
DD5S1.csv,69463,DD5S1.csv,17067,43723,5878,2795,24.6,8.5,62.9,4.0
DD5S2.csv,45403,DD5S2.csv,11098,30839,2131,1335,24.4,4.7,67.9,2.9
DD5S3.csv,45898,DD5S3.csv,9018,30060,5656,1164,19.6,12.3,65.5,2.5
TMA.csv,94742,TMA.csv,23121,59978,9356,2287,24.4,9.9,63.3,2.4


In [46]:
fig = go.Figure()
title = 'Cell proportions by Sample ID and tissue type'

fig = go.Figure(data=[
    go.Bar(name='Stroma', x=counts['Sample_ID'], y=counts['stroma_perc'], 
           text = counts['stroma_perc'], textposition='auto',
           marker_color = 'rgb' + str(cell_type_color_dict['STROMA'])),
    go.Bar(name='Immune', x=counts['Sample_ID'], y=counts['immune_perc'], 
           text = counts['immune_perc'], textposition='auto',
           marker_color = 'rgb' + str(cell_type_color_dict['IMMUNE'])),
    go.Bar(name='Cancer',x=counts['Sample_ID'], y=counts['cancer_perc'], 
           text = counts['cancer_perc'], textposition='auto', 
           marker_color = 'rgb' + str(cell_type_color_dict['CANCER'])),
    go.Bar(name='Endothelial',x=counts['Sample_ID'], y=counts['endothelial_perc'], 
           text = counts['endothelial_perc'], textposition='auto', 
           marker_color = 'rgb' + str(cell_type_color_dict['ENDOTHELIAL']))
])
    
fig.update_layout( plot_bgcolor = 'white',barmode ='stack')#title = title,
fig.update_xaxes( linecolor = 'black')#title = "Sample",
fig.update_yaxes(title = "Cell count (%)", linecolor = 'black')
plot(fig)
#fig.write_image(output_images_dir + "/" + title.replace(" ","_") + ".png")

'temp-plot.html'

### V.4.2. CELL SUBTYPES

#### V.4.2.1 BY SCENES

In [47]:
# Count by cell SUBtype
cell_subtypes = ['DC', 'B', 'TCD4', 'TCD8', 'M1', 'M2', 'Treg', \
                 'IMMUNE_OTHER', 'CANCER', 'αSMA_myCAF', 'STROMA_OTHER', 'ENDOTHELIAL']
# Initialisation d'un dictionnaire pour stocker les counts des sous-types de cellules
subtype_counts = {}

# Boucle sur les sous-types de cellules pour compter les échantillons correspondants
for subtype in cell_subtypes:
    subtype_counts[subtype.lower()] = pd.DataFrame({subtype.lower():
        df.loc[
            df['cell_subtype'] == subtype, 'Sample_ID'].value_counts()
    }).sort_index()

# Concaténation des counts des sous-types de cellules en un seul DataFrame
counts_subtypes = pd.concat([pd.DataFrame(v) for v in subtype_counts.values()], axis=1, sort=False)
counts_subtypes = counts_subtypes.fillna(0)

# Ajouter une colonne pour le compte total de cellules par ligne
counts_subtypes['total_cells'] = counts_subtypes.sum(axis=1)

# Enregistrement des counts des sous-types de cellules dans un fichier CSV
filename_subtypes = os.path.join(output_data_dir, project_name + "_cell_subtypes_number_by_scenes.csv")
counts_subtypes.to_csv(filename_subtypes, index=False)
counts_subtypes

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,stroma_other,endothelial,total_cells
DD3S1.csv,81,0.0,2991,2849,310,129,6,994,48981,3967,6723,1666,68697.0
DD3S2.csv,294,0.0,1915,2350,94,25,38,1261,51059,4789,7275,1750,70850.0
DD3S3.csv,280,138.0,6857,3550,252,96,28,1874,77000,7424,15988,2778,116265.0
DD4S1.csv,272,29.0,2667,1950,76,38,12,2389,54013,3106,4143,2053,70748.0
DD4S2.csv,130,14.0,1938,1694,112,102,19,841,39470,1664,4529,1232,51745.0
DD4S3.csv,1150,31.0,2353,2940,155,328,38,3336,50598,3686,4830,1373,70818.0
DD5S1.csv,583,139.0,2840,536,504,28,28,1220,43723,6976,10091,2795,69463.0
DD5S2.csv,160,16.0,1135,145,131,63,14,467,30839,3876,7222,1335,45403.0
DD5S3.csv,1467,35.0,2000,687,479,110,13,865,30060,4831,4187,1164,45898.0
TMA.csv,132,2309.0,5027,952,109,11,209,607,59978,4875,18246,2287,94742.0


In [48]:
# Ajout des colonnes de pourcentages pour chaque sous-type de cellules
counts_perc = counts_subtypes.copy()

# Calcul des pourcentages pour chaque sous-type de cellules, en excluant la colonne 'total_cells'
for col in counts_subtypes.columns:
    if col != 'total_cells':
        counts_perc[col + '_perc'] = (counts_perc[col] / counts_perc['total_cells']) * 100

# Affichage des pourcentages des sous-types de cellules

counts_perc['Sample_ID'] = counts_perc.index
counts_perc.columns.values
display(counts_perc)


Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,...,tcd8_perc,m1_perc,m2_perc,treg_perc,immune_other_perc,cancer_perc,αsma_mycaf_perc,stroma_other_perc,endothelial_perc,Sample_ID
DD3S1.csv,81,0.0,2991,2849,310,129,6,994,48981,3967,...,4.147197,0.451257,0.187781,0.008734,1.446934,71.300057,5.774634,9.786454,2.425142,DD3S1.csv
DD3S2.csv,294,0.0,1915,2350,94,25,38,1261,51059,4789,...,3.316867,0.132675,0.035286,0.053634,1.779817,72.066337,6.759351,10.268172,2.470007,DD3S2.csv
DD3S3.csv,280,138.0,6857,3550,252,96,28,1874,77000,7424,...,3.053369,0.216746,0.08257,0.024083,1.611835,66.228014,6.385413,13.751344,2.389369,DD3S3.csv
DD4S1.csv,272,29.0,2667,1950,76,38,12,2389,54013,3106,...,2.756262,0.107424,0.053712,0.016962,3.376774,76.345621,4.39023,5.855996,2.901849,DD4S1.csv
DD4S2.csv,130,14.0,1938,1694,112,102,19,841,39470,1664,...,3.273746,0.216446,0.19712,0.036719,1.625278,76.277901,3.21577,8.752536,2.380906,DD4S2.csv
DD4S3.csv,1150,31.0,2353,2940,155,328,38,3336,50598,3686,...,4.151487,0.218871,0.463159,0.053659,4.710667,71.447937,5.204891,6.8203,1.938773,DD4S3.csv
DD5S1.csv,583,139.0,2840,536,504,28,28,1220,43723,6976,...,0.771634,0.725566,0.040309,0.040309,1.756331,62.944301,10.042757,14.527158,4.023725,DD5S1.csv
DD5S2.csv,160,16.0,1135,145,131,63,14,467,30839,3876,...,0.319362,0.288527,0.138757,0.030835,1.028566,67.922824,8.536881,15.906438,2.940334,DD5S2.csv
DD5S3.csv,1467,35.0,2000,687,479,110,13,865,30060,4831,...,1.496797,1.043618,0.239662,0.028324,1.884614,65.49305,10.525513,9.122402,2.536058,DD5S3.csv
TMA.csv,132,2309.0,5027,952,109,11,209,607,59978,4875,...,1.004834,0.115049,0.01161,0.220599,0.640687,63.306664,5.145553,19.258618,2.413924,TMA.csv


In [49]:
fig = go.Figure()
title = 'Cell subtypes proportions by Sample ID and tissue type'

for cell_subtype in cell_subtypes:
    fig.add_trace(
        go.Bar(
            name=cell_subtype,
            x=counts_perc['Sample_ID'],
            y=counts_perc[f'{cell_subtype.lower()}_perc'],
            text=counts_perc[f'{cell_subtype.lower()}_perc'],
            textposition='auto',
            marker_color='rgb' + str(cell_subtype_color_dict[cell_subtype])))

fig.update_layout(
    plot_bgcolor='white',
    barmode='stack',
    title=title,
    xaxis=dict(linecolor='black'),
    yaxis=dict(title='Cell count (%)', linecolor='black')
)

# Enregistrer l'image
output_filename = title.replace(" ", "_") + ".png"
#fig.write_image(output_images_dir + "/" + output_filename, scale=1000)

# Afficher le graphique
fig.show()
plot(fig)

'temp-plot.html'

#### V.4.2.2 BY PATIENTS

In [50]:
df

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index,Primary_chem(1)_vs_surg(0)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DD3S1_Cell_0,-0.677863,-0.417494,-0.912537,-0.817876,0.930099,0.232078,-0.483158,1.535604,0.807339,1.167755,...,0.955040,1484.771729,0,127,DD3S1.csv,,339,61,61a,1.0
DD3S1_Cell_1,-0.677863,-0.516487,-0.838037,-0.869685,1.114924,0.301333,-0.344770,1.668368,0.875455,1.643023,...,0.966643,1426.250000,0,112,DD3S1.csv,,344,61,61a,1.0
DD3S1_Cell_2,-0.677863,-0.141921,-1.016023,-0.755879,0.834577,0.259216,-0.438292,1.336308,0.705088,1.053636,...,0.721534,1531.110474,0,181,DD3S1.csv,,422,61,61a,1.0
DD3S1_Cell_3,-0.741282,-0.460472,-0.491711,-0.818084,0.648200,0.107027,-0.444889,1.249805,0.660707,1.165861,...,0.587196,1518.907593,0,119,DD3S1.csv,,278,61,61a,1.0
DD3S1_Cell_6,-0.621521,-0.247254,-0.867127,-0.742544,0.810579,0.272128,-0.507117,1.251434,0.947172,2.545301,...,0.935716,1471.914917,0,47,DD3S1.csv,,204,61,61a,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TMA_Cell_115755,0.478275,0.558670,-0.962840,1.732291,0.507434,-0.912641,0.311322,0.816068,0.596520,0.090397,...,0.982196,15564.458008,59,142,TMA.csv,,386,c59,c59a,CONTROL
TMA_Cell_115756,0.297418,0.420594,-0.971632,1.966955,0.304365,-1.164112,0.866636,-0.092857,-0.241830,-0.617835,...,0.775977,15629.680664,59,47,TMA.csv,,270,c59,c59a,CONTROL
TMA_Cell_115757,0.346950,0.453951,-0.602893,1.338956,0.559435,-0.801333,0.447061,0.988156,1.567869,0.403878,...,0.688747,15518.421875,59,64,TMA.csv,,202,c59,c59a,CONTROL
TMA_Cell_115758,-0.189415,0.508840,-0.886041,0.647980,-0.227224,-1.022549,-0.099256,0.219755,0.603715,-0.219145,...,0.751402,15539.275391,59,58,TMA.csv,,182,c59,c59a,CONTROL


In [51]:
# Compter par numéro de patiente
patient_counts = {}

# Boucle sur les sous-types de cellules pour compter les échantillons correspondants par patient
for subtype in cell_subtypes:
    patient_counts[subtype.lower()] = pd.DataFrame({subtype.lower():
        df.loc[
            df['cell_subtype'] == subtype, 'Patient'].value_counts()
    }).sort_index()

# Concaténation des counts des sous-types de cellules en un seul DataFrame
counts_patients = pd.concat([pd.DataFrame(v) for v in patient_counts.values()], axis=1, sort=False)
counts_patients = counts_patients.fillna(0)

# Ajout de la colonne de total de cellules comptées par patientes
counts_patients['Total_cells'] = counts_patients.sum(axis=1)
counts_patients = counts_patients[~counts_patients.index.str.startswith('c')]

# Enregistrement des counts des sous-types de cellules par patient dans un fichier CSV
filename_patients = os.path.join(output_data_dir, project_name + "_cell_subtypes_number_by_patient.csv")
counts_patients.to_csv(filename_patients, index=False)
counts_patients

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,stroma_other,endothelial,Total_cells
10,2.0,0.0,900.0,24.0,167.0,100.0,5.0,28.0,3981.0,388.0,530.0,335.0,6460.0
100,21.0,0.0,92.0,46.0,0.0,3.0,1.0,30.0,1449.0,30.0,73.0,131.0,1876.0
101,2.0,0.0,20.0,15.0,3.0,1.0,0.0,24.0,1896.0,81.0,173.0,60.0,2275.0
102,1.0,2.0,45.0,24.0,5.0,1.0,0.0,38.0,3564.0,18.0,91.0,16.0,3805.0
103,57.0,0.0,72.0,555.0,7.0,0.0,1.0,78.0,2157.0,17.0,131.0,117.0,3192.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0.0,0.0,11.0,990.0,3.0,25.0,1.0,184.0,424.0,140.0,22.0,15.0,1815.0
96,0.0,0.0,76.0,2.0,2.0,1.0,0.0,4.0,1854.0,1.0,16.0,3.0,1959.0
97,0.0,0.0,174.0,38.0,0.0,0.0,1.0,56.0,2193.0,28.0,163.0,102.0,2755.0
60,0.0,0.0,0.0,107.0,9.0,16.0,1.0,57.0,1979.0,107.0,1.0,32.0,2309.0


In [52]:
# Ajout des colonnes de pourcentages pour chaque sous-type de cellules par patient
counts_perc_patients = counts_patients.copy()

# Calcul des pourcentages pour chaque sous-type de cellules, en excluant la colonne 'total_cells'
for col in counts_perc_patients.columns:
    if col != 'Total_cells':
        counts_perc_patients[col + '_perc'] = (counts_perc_patients[col] / counts_perc_patients['Total_cells']) * 100


# Affichage des pourcentages des sous-types de cellules par patient
counts_perc_patients['Patient'] = counts_perc_patients.index
counts_perc_patients.columns.values
counts_perc_patients = counts_perc_patients[~counts_perc_patients.index.str.startswith('c')]
counts_perc_patients

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,...,tcd8_perc,m1_perc,m2_perc,treg_perc,immune_other_perc,cancer_perc,αsma_mycaf_perc,stroma_other_perc,endothelial_perc,Patient
10,2.0,0.0,900.0,24.0,167.0,100.0,5.0,28.0,3981.0,388.0,...,0.371517,2.585139,1.547988,0.077399,0.433437,61.625387,6.006192,8.204334,5.185759,10
100,21.0,0.0,92.0,46.0,0.0,3.0,1.0,30.0,1449.0,30.0,...,2.452026,0.000000,0.159915,0.053305,1.599147,77.238806,1.599147,3.891258,6.982942,100
101,2.0,0.0,20.0,15.0,3.0,1.0,0.0,24.0,1896.0,81.0,...,0.659341,0.131868,0.043956,0.000000,1.054945,83.340659,3.560440,7.604396,2.637363,101
102,1.0,2.0,45.0,24.0,5.0,1.0,0.0,38.0,3564.0,18.0,...,0.630749,0.131406,0.026281,0.000000,0.998686,93.666229,0.473062,2.391590,0.420499,102
103,57.0,0.0,72.0,555.0,7.0,0.0,1.0,78.0,2157.0,17.0,...,17.387218,0.219298,0.000000,0.031328,2.443609,67.575188,0.532581,4.104010,3.665414,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0.0,0.0,11.0,990.0,3.0,25.0,1.0,184.0,424.0,140.0,...,54.545455,0.165289,1.377410,0.055096,10.137741,23.360882,7.713499,1.212121,0.826446,92
96,0.0,0.0,76.0,2.0,2.0,1.0,0.0,4.0,1854.0,1.0,...,0.102093,0.102093,0.051046,0.000000,0.204186,94.640123,0.051046,0.816743,0.153139,96
97,0.0,0.0,174.0,38.0,0.0,0.0,1.0,56.0,2193.0,28.0,...,1.379310,0.000000,0.000000,0.036298,2.032668,79.600726,1.016334,5.916515,3.702359,97
60,0.0,0.0,0.0,107.0,9.0,16.0,1.0,57.0,1979.0,107.0,...,4.634041,0.389779,0.692941,0.043309,2.468601,85.708099,4.634041,0.043309,1.385881,60


In [53]:
fig = go.Figure()
title = 'Cell subtypes proportions by Patient and tissue type'

for cell_subtype in cell_subtypes:
    fig.add_trace(
        go.Bar(
            name=cell_subtype,
            x=counts_perc_patients['Patient'],
            y=counts_perc_patients[f'{cell_subtype.lower()}_perc'],
            text=counts_perc_patients[f'{cell_subtype.lower()}_perc'],
            textposition='auto',
            marker_color='rgb' + str(cell_subtype_color_dict[cell_subtype])))

fig.update_layout(
    plot_bgcolor='white',
    barmode='stack',
    title=title,
    xaxis=dict(linecolor='black'),
    yaxis=dict(title='Cell count (%)', linecolor='black'))

output_filename = title.replace(" ", "_") + ".png"
#fig.write_image(output_images_dir + "/" + output_filename, scale=1000)

fig.show()
plot(fig)

'temp-plot.html'

#### V.4.2.3 BY SCENES AND PATIENTS

In [54]:
# Sélectionner les valeurs de 'Sample_ID' spécifiées
allowed_sample_ids = ['DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv']

df_filtered = df[df['Sample_ID'].isin(allowed_sample_ids)]
df_filtered

# Compter par numéro de patiente
patient_counts = {}

# Boucle sur les sous-types de cellules pour compter les échantillons correspondants par patient
for subtype in cell_subtypes:
    patient_counts[subtype.lower()] = pd.DataFrame({subtype.lower():
        df_filtered.loc[
            df_filtered['cell_subtype'] == subtype, 'Patient'].value_counts()
    }).sort_index()

# Concaténation des counts des sous-types de cellules en un seul DataFrame
counts_patients = pd.concat([pd.DataFrame(v) for v in patient_counts.values()], axis=1, sort=False)
counts_patients = counts_patients.fillna(0)

# Ajout de la colonne de total de cellules comptées par patientes
counts_patients['Total_cells'] = counts_patients.sum(axis=1)


counts_patients

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,stroma_other,endothelial,Total_cells
10,2.0,0.0,900.0,24.0,167.0,100.0,5.0,28.0,3981,388.0,530,335,6460.0
11,1.0,0.0,7.0,5.0,0.0,0.0,0.0,26.0,556,409.0,5103,87,6194.0
14,1.0,0.0,190.0,6.0,2.0,1.0,0.0,53.0,7142,100.0,315,57,7867.0
15,4.0,0.0,32.0,7.0,7.0,1.0,0.0,19.0,1823,747.0,232,86,2958.0
19,38.0,0.0,983.0,258.0,3.0,0.0,4.0,64.0,2322,1543.0,1075,228,6518.0
20,1.0,0.0,26.0,23.0,0.0,0.0,0.0,152.0,4317,40.0,640,94,5293.0
22,1.0,84.0,5.0,416.0,70.0,1.0,0.0,57.0,2447,88.0,163,58,3390.0
23,67.0,1.0,239.0,750.0,0.0,3.0,1.0,155.0,3370,12.0,126,62,4786.0
26,1.0,0.0,159.0,17.0,0.0,0.0,1.0,15.0,2950,49.0,472,46,3710.0
27,5.0,0.0,45.0,0.0,3.0,2.0,0.0,10.0,4522,14.0,1705,123,6429.0


In [55]:
# Filtrer counts_patients pour ne conserver que les lignes avec 'Sample_ID' égal à 'DD3S1.csv', 'DD3S2.csv' ou 'DD3S3.csv'
counts_patients_filtered = counts_patients[counts_patients.index.isin(df_filtered['Patient'])]

# Ajout des colonnes de pourcentages pour chaque sous-type de cellules par patient
counts_perc_patients = counts_patients_filtered.copy()

# Calcul des pourcentages pour chaque sous-type de cellules, en excluant la colonne 'total_cells'
for col in counts_perc_patients.columns:
    if col != 'Total_cells':
        counts_perc_patients[col + '_perc'] = (counts_perc_patients[col] / counts_perc_patients['Total_cells']) * 100


# Affichage des pourcentages des sous-types de cellules par patient
counts_perc_patients['Patient'] = counts_perc_patients.index
counts_perc_patients.columns.values
counts_perc_patients

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,...,tcd8_perc,m1_perc,m2_perc,treg_perc,immune_other_perc,cancer_perc,αsma_mycaf_perc,stroma_other_perc,endothelial_perc,Patient
10,2.0,0.0,900.0,24.0,167.0,100.0,5.0,28.0,3981,388.0,...,0.371517,2.585139,1.547988,0.077399,0.433437,61.625387,6.006192,8.204334,5.185759,10
11,1.0,0.0,7.0,5.0,0.0,0.0,0.0,26.0,556,409.0,...,0.080723,0.0,0.0,0.0,0.419761,8.976429,6.603164,82.38618,1.404585,11
14,1.0,0.0,190.0,6.0,2.0,1.0,0.0,53.0,7142,100.0,...,0.076268,0.025423,0.012711,0.0,0.6737,90.784289,1.271133,4.004068,0.724546,14
15,4.0,0.0,32.0,7.0,7.0,1.0,0.0,19.0,1823,747.0,...,0.236646,0.236646,0.033807,0.0,0.642326,61.629479,25.25355,7.843137,2.90737,15
19,38.0,0.0,983.0,258.0,3.0,0.0,4.0,64.0,2322,1543.0,...,3.958269,0.046026,0.0,0.061369,0.981896,35.624425,23.672906,16.492789,3.498006,19
20,1.0,0.0,26.0,23.0,0.0,0.0,0.0,152.0,4317,40.0,...,0.434536,0.0,0.0,0.0,2.871717,81.560552,0.755715,12.091442,1.77593,20
22,1.0,84.0,5.0,416.0,70.0,1.0,0.0,57.0,2447,88.0,...,12.271386,2.064897,0.029499,0.0,1.681416,72.182891,2.59587,4.80826,1.710914,22
23,67.0,1.0,239.0,750.0,0.0,3.0,1.0,155.0,3370,12.0,...,15.670706,0.0,0.062683,0.020894,3.238613,70.413707,0.250731,2.632679,1.295445,23
26,1.0,0.0,159.0,17.0,0.0,0.0,1.0,15.0,2950,49.0,...,0.458221,0.0,0.0,0.026954,0.404313,79.514825,1.320755,12.722372,1.239892,26
27,5.0,0.0,45.0,0.0,3.0,2.0,0.0,10.0,4522,14.0,...,0.0,0.046664,0.031109,0.0,0.155545,70.337533,0.217763,26.520454,1.913206,27


In [56]:
fig = go.Figure()
title = 'Cell subtypes proportions by Patient and tissue type DD3'

for cell_subtype in cell_subtypes:
    fig.add_trace(
        go.Bar(
            name=cell_subtype,
            x=counts_perc_patients['Patient'],
            y=counts_perc_patients[f'{cell_subtype.lower()}_perc'],
            text=counts_perc_patients[f'{cell_subtype.lower()}_perc'],
            textposition='auto',
            marker_color='rgb' + str(cell_subtype_color_dict[cell_subtype])))

fig.update_layout(
    plot_bgcolor='white',
    barmode='stack',
    title=title,
    xaxis=dict(linecolor='black'),
    yaxis=dict(title='Cell count (%)', linecolor='black'))

output_filename = title.replace(" ", "_") + ".png"
#fig.write_image(output_images_dir + "/" + output_filename, scale=1000)

fig.show()
plot(fig)

'temp-plot.html'

#### V.4.3.4 BY TREATMENT

In [57]:
df_NACT

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index,Primary_chem(1)_vs_surg(0)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DD3S1_Cell_0,-0.677863,-0.417494,-0.912537,-0.817876,0.930099,0.232078,-0.483158,1.535604,0.807339,1.167755,...,0.955040,1484.771729,0,127,DD3S1.csv,,339,61,61a,1.0
DD3S1_Cell_1,-0.677863,-0.516487,-0.838037,-0.869685,1.114924,0.301333,-0.344770,1.668368,0.875455,1.643023,...,0.966643,1426.250000,0,112,DD3S1.csv,,344,61,61a,1.0
DD3S1_Cell_2,-0.677863,-0.141921,-1.016023,-0.755879,0.834577,0.259216,-0.438292,1.336308,0.705088,1.053636,...,0.721534,1531.110474,0,181,DD3S1.csv,,422,61,61a,1.0
DD3S1_Cell_3,-0.741282,-0.460472,-0.491711,-0.818084,0.648200,0.107027,-0.444889,1.249805,0.660707,1.165861,...,0.587196,1518.907593,0,119,DD3S1.csv,,278,61,61a,1.0
DD3S1_Cell_6,-0.621521,-0.247254,-0.867127,-0.742544,0.810579,0.272128,-0.507117,1.251434,0.947172,2.545301,...,0.935716,1471.914917,0,47,DD3S1.csv,,204,61,61a,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DD5S3_Cell_62826,-0.446025,0.480212,1.195440,-0.081463,-0.634343,-0.883815,1.050928,0.164709,0.103986,0.171579,...,0.609764,13449.426758,34,103,DD5S3.csv,,237,187,187c,1.0
DD5S3_Cell_62828,-0.414440,0.416668,1.542926,-0.203248,-0.076555,-0.783759,0.423346,0.638305,0.346970,0.453372,...,0.736844,13469.353516,34,65,DD5S3.csv,,231,187,187c,1.0
DD5S3_Cell_62829,-0.720569,0.072144,-0.311962,-0.481442,1.236053,-0.468813,0.252353,2.129042,1.111809,1.340371,...,0.691315,13390.447266,34,85,DD5S3.csv,,224,187,187c,1.0
DD5S3_Cell_62830,-0.442500,0.368654,-1.491754,-0.316270,0.029502,-0.736522,0.390883,0.861894,0.461685,0.586409,...,0.892707,13430.178711,34,56,DD5S3.csv,,139,187,187c,1.0


In [58]:
# Extraire les valeurs uniques de la colonne 'Patient'
num_NACT_patients = df_NACT['Patient'].unique()
num_ACT_patients = df_ACT['Patient'].unique()

# Numéros des patientes qui ont reçu le traitement
print(num_NACT_patients)
print(num_ACT_patients)

['61' '32' '26' '15' '18' '11' '5' '51' '40' '7' '95' '121' '85' '111'
 '116' '98' '119' '104' '89' '140' '137' '153' '149' '146' '162' '160'
 '157' '171' '169' '183' '182' '179' '177' '174' '178' '187' '184' '141']
['62' '63' '59' '60' '33' '35' '36' '37' '38' '30' '25' '27' '29' '20'
 '21' '22' '23' '24' '14' '16' '17' '19' '12' '8' '9' '10' '4' '52' '53'
 '54' '55' '56' '57' '50' '42' '43' '44' '45' '47' '39' '41' '49' '46'
 '48' '58' '28' '13' '6' '34' '3' '122' '125' '124' '94' '101' '86' '84'
 '83' '91' '88' '87' '75' '74' '82' '81' '80' '79' '67' '66' '65' '64'
 '73' '71' '69' '68' '123' '115' '114' '113' '120' '117' '126' '105' '103'
 '102' '110' '109' '106' '166' '112' '108' '107' '97' '96' '100' '78' '77'
 '70' '72' '99' '92' '90' '136' '135' '134' '133' '132' '131' '130' '129'
 '144' '143' '155' '154' '150' '148' '147' '164' '161' '159' '128' '127'
 '142' '139' '152' '158' '156' '172' '170' '168' '167' '165' '181' '180'
 '176' '173' '175' '185' '186']


##### V.4.3.4.1 NACT

In [59]:
# Compter par numéro de patiente pour les patientes NACT uniquement
patient_counts = {}

# Boucle sur les sous-types de cellules pour compter les échantillons correspondants par patient
for subtype in cell_subtypes:
    patient_counts[subtype.lower()] = pd.DataFrame({subtype.lower():
        df.loc[
            (df['cell_subtype'] == subtype) & (df['Patient'].isin(num_NACT_patients)), 'Patient'].value_counts()
    }).sort_index()

# Concaténation des counts des sous-types de cellules en un seul DataFrame
counts_patients = pd.concat([pd.DataFrame(v) for v in patient_counts.values()], axis=1, sort=False)
counts_patients = counts_patients.fillna(0)

# Ajout de la colonne de total de cellules comptées par patientes
counts_patients['Total_cells'] = counts_patients.sum(axis=1)

# Enregistrement des counts des sous-types de cellules par patient dans un fichier CSV
filename_patients = os.path.join(output_data_dir, project_name + "_cell_subtypes_number_by_patient_NACT.csv")
counts_patients.to_csv(filename_patients, index=False)
counts_patients


Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,stroma_other,endothelial,Total_cells
11,1.0,0.0,7,5.0,0.0,0.0,0.0,26.0,556,409,5103,87,6194.0
111,27.0,1.0,2,30.0,10.0,3.0,1.0,39.0,899,1,3,10,1026.0
116,3.0,0.0,4,2.0,0.0,0.0,0.0,1.0,125,4,56,3,198.0
119,1.0,0.0,60,9.0,3.0,0.0,0.0,17.0,4289,16,104,34,4533.0
121,15.0,2.0,704,54.0,4.0,0.0,2.0,350.0,3652,183,236,587,5789.0
137,2.0,0.0,21,22.0,1.0,1.0,2.0,28.0,1302,409,202,101,2091.0
140,402.0,0.0,385,6.0,307.0,38.0,3.0,6.0,35,66,293,51,1592.0
141,42.0,0.0,158,21.0,10.0,1.0,0.0,25.0,71,565,154,73,1120.0
146,6.0,0.0,80,3.0,1.0,2.0,0.0,52.0,916,89,331,98,1578.0
149,20.0,0.0,270,25.0,275.0,3.0,0.0,265.0,735,1250,425,111,3379.0


In [60]:
# Ajout des colonnes de pourcentages pour chaque sous-type de cellules par patient
counts_perc_patients = counts_patients.copy()

# Calcul des pourcentages pour chaque sous-type de cellules, en excluant la colonne 'total_cells'
for col in counts_perc_patients.columns:
    if col != 'Total_cells':
        counts_perc_patients[col + '_perc'] = (counts_perc_patients[col] / counts_perc_patients['Total_cells']) * 100


# Affichage des pourcentages des sous-types de cellules par patient
counts_perc_patients['Patient'] = counts_perc_patients.index
counts_perc_patients.columns.values
counts_perc_patients



Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,...,tcd8_perc,m1_perc,m2_perc,treg_perc,immune_other_perc,cancer_perc,αsma_mycaf_perc,stroma_other_perc,endothelial_perc,Patient
11,1.0,0.0,7,5.0,0.0,0.0,0.0,26.0,556,409,...,0.080723,0.0,0.0,0.0,0.419761,8.976429,6.603164,82.38618,1.404585,11
111,27.0,1.0,2,30.0,10.0,3.0,1.0,39.0,899,1,...,2.923977,0.974659,0.292398,0.097466,3.80117,87.621832,0.097466,0.292398,0.974659,111
116,3.0,0.0,4,2.0,0.0,0.0,0.0,1.0,125,4,...,1.010101,0.0,0.0,0.0,0.505051,63.131313,2.020202,28.282828,1.515152,116
119,1.0,0.0,60,9.0,3.0,0.0,0.0,17.0,4289,16,...,0.198544,0.066181,0.0,0.0,0.375028,94.617251,0.352967,2.294286,0.750055,119
121,15.0,2.0,704,54.0,4.0,0.0,2.0,350.0,3652,183,...,0.932804,0.069097,0.0,0.034548,6.045949,63.085162,3.161168,4.076697,10.139921,121
137,2.0,0.0,21,22.0,1.0,1.0,2.0,28.0,1302,409,...,1.052128,0.047824,0.047824,0.095648,1.339072,62.266858,19.560019,9.66045,4.830225,137
140,402.0,0.0,385,6.0,307.0,38.0,3.0,6.0,35,66,...,0.376884,19.28392,2.386935,0.188442,0.376884,2.198492,4.145729,18.404523,3.203518,140
141,42.0,0.0,158,21.0,10.0,1.0,0.0,25.0,71,565,...,1.875,0.892857,0.089286,0.0,2.232143,6.339286,50.446429,13.75,6.517857,141
146,6.0,0.0,80,3.0,1.0,2.0,0.0,52.0,916,89,...,0.190114,0.063371,0.126743,0.0,3.295311,58.048162,5.640051,20.975919,6.210393,146
149,20.0,0.0,270,25.0,275.0,3.0,0.0,265.0,735,1250,...,0.739864,8.138503,0.088784,0.0,7.842557,21.751998,36.993193,12.577686,3.284996,149


In [61]:
counts_perc_patients_NACT = counts_perc_patients

In [62]:
fig = go.Figure()
title = 'Cell subtypes proportions by Patient and tissue type - NACT group'

for cell_subtype in cell_subtypes:
    fig.add_trace(
        go.Bar(
            name=cell_subtype,
            x=counts_perc_patients['Patient'],
            y=counts_perc_patients[f'{cell_subtype.lower()}_perc'],
            text=counts_perc_patients[f'{cell_subtype.lower()}_perc'],
            textposition='auto',
            marker_color='rgb' + str(cell_subtype_color_dict[cell_subtype])))

fig.update_layout(
    plot_bgcolor='white',
    barmode='stack',
    title=title,
    xaxis=dict(linecolor='black'),
    yaxis=dict(title='Cell count (%)', linecolor='black'))

output_filename = title.replace(" ", "_") + ".png"
#fig.write_image(output_images_dir + "/" + output_filename, scale=1000)

fig.show()
plot(fig)

'temp-plot.html'

##### V.4.3.4.1 ACT

In [63]:
# Compter par numéro de patiente pour les patientes ACT uniquement
patient_counts = {}

# Boucle sur les sous-types de cellules pour compter les échantillons correspondants par patient
for subtype in cell_subtypes:
    patient_counts[subtype.lower()] = pd.DataFrame({subtype.lower():
        df.loc[
            (df['cell_subtype'] == subtype) & (df['Patient'].isin(num_ACT_patients)), 'Patient'].value_counts()
    }).sort_index()

# Concaténation des counts des sous-types de cellules en un seul DataFrame
counts_patients = pd.concat([pd.DataFrame(v) for v in patient_counts.values()], axis=1, sort=False)
counts_patients = counts_patients.fillna(0)

# Ajout de la colonne de total de cellules comptées par patientes
counts_patients['Total_cells'] = counts_patients.sum(axis=1)

# Enregistrement des counts des sous-types de cellules par patient dans un fichier CSV
filename_patients = os.path.join(output_data_dir, project_name + "_cell_subtypes_number_by_patient_ACT.csv")
counts_patients.to_csv(filename_patients, index=False)
counts_patients

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,stroma_other,endothelial,Total_cells
10,2.0,0.0,900.0,24.0,167.0,100.0,5.0,28.0,3981,388.0,530,335.0,6460.0
100,21.0,0.0,92.0,46.0,0.0,3.0,1.0,30.0,1449,30.0,73,131.0,1876.0
101,2.0,0.0,20.0,15.0,3.0,1.0,0.0,24.0,1896,81.0,173,60.0,2275.0
102,1.0,2.0,45.0,24.0,5.0,1.0,0.0,38.0,3564,18.0,91,16.0,3805.0
103,57.0,0.0,72.0,555.0,7.0,0.0,1.0,78.0,2157,17.0,131,117.0,3192.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0.0,0.0,11.0,990.0,3.0,25.0,1.0,184.0,424,140.0,22,15.0,1815.0
96,0.0,0.0,76.0,2.0,2.0,1.0,0.0,4.0,1854,1.0,16,3.0,1959.0
97,0.0,0.0,174.0,38.0,0.0,0.0,1.0,56.0,2193,28.0,163,102.0,2755.0
60,0.0,0.0,0.0,107.0,9.0,16.0,1.0,57.0,1979,107.0,1,32.0,2309.0


In [64]:
# Ajout des colonnes de pourcentages pour chaque sous-type de cellules par patient
counts_perc_patients = counts_patients.copy()

# Calcul des pourcentages pour chaque sous-type de cellules, en excluant la colonne 'total_cells'
for col in counts_perc_patients.columns:
    if col != 'Total_cells':
        counts_perc_patients[col + '_perc'] = (counts_perc_patients[col] / counts_perc_patients['Total_cells']) * 100


# Affichage des pourcentages des sous-types de cellules par patient
counts_perc_patients['Patient'] = counts_perc_patients.index
counts_perc_patients.columns.values
counts_perc_patients



Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,...,tcd8_perc,m1_perc,m2_perc,treg_perc,immune_other_perc,cancer_perc,αsma_mycaf_perc,stroma_other_perc,endothelial_perc,Patient
10,2.0,0.0,900.0,24.0,167.0,100.0,5.0,28.0,3981,388.0,...,0.371517,2.585139,1.547988,0.077399,0.433437,61.625387,6.006192,8.204334,5.185759,10
100,21.0,0.0,92.0,46.0,0.0,3.0,1.0,30.0,1449,30.0,...,2.452026,0.000000,0.159915,0.053305,1.599147,77.238806,1.599147,3.891258,6.982942,100
101,2.0,0.0,20.0,15.0,3.0,1.0,0.0,24.0,1896,81.0,...,0.659341,0.131868,0.043956,0.000000,1.054945,83.340659,3.560440,7.604396,2.637363,101
102,1.0,2.0,45.0,24.0,5.0,1.0,0.0,38.0,3564,18.0,...,0.630749,0.131406,0.026281,0.000000,0.998686,93.666229,0.473062,2.391590,0.420499,102
103,57.0,0.0,72.0,555.0,7.0,0.0,1.0,78.0,2157,17.0,...,17.387218,0.219298,0.000000,0.031328,2.443609,67.575188,0.532581,4.104010,3.665414,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0.0,0.0,11.0,990.0,3.0,25.0,1.0,184.0,424,140.0,...,54.545455,0.165289,1.377410,0.055096,10.137741,23.360882,7.713499,1.212121,0.826446,92
96,0.0,0.0,76.0,2.0,2.0,1.0,0.0,4.0,1854,1.0,...,0.102093,0.102093,0.051046,0.000000,0.204186,94.640123,0.051046,0.816743,0.153139,96
97,0.0,0.0,174.0,38.0,0.0,0.0,1.0,56.0,2193,28.0,...,1.379310,0.000000,0.000000,0.036298,2.032668,79.600726,1.016334,5.916515,3.702359,97
60,0.0,0.0,0.0,107.0,9.0,16.0,1.0,57.0,1979,107.0,...,4.634041,0.389779,0.692941,0.043309,2.468601,85.708099,4.634041,0.043309,1.385881,60


In [65]:
counts_perc_patients_ACT = counts_perc_patients

In [66]:
fig = go.Figure()
title = 'Cell subtypes proportions by Patient and tissue type - ACT group'

for cell_subtype in cell_subtypes:
    fig.add_trace(
        go.Bar(
            name=cell_subtype,
            x=counts_perc_patients['Patient'],
            y=counts_perc_patients[f'{cell_subtype.lower()}_perc'],
            text=counts_perc_patients[f'{cell_subtype.lower()}_perc'],
            textposition='auto',
            marker_color='rgb' + str(cell_subtype_color_dict[cell_subtype])))

fig.update_layout(
    plot_bgcolor='white',
    barmode='stack',
    title=title,
    xaxis=dict(linecolor='black'),
    yaxis=dict(title='Cell count (%)', linecolor='black'))

output_filename = title.replace(" ", "_") + ".png"
#fig.write_image(output_images_dir + "/" + output_filename, scale=1000)

fig.show()
plot(fig)

'temp-plot.html'

In [67]:
from scipy.stats import mannwhitneyu


# Choisissez la colonne sur laquelle vous voulez effectuer la comparaison
column_to_compare = 'm2'

# Effectuez le test de Mann-Whitney
statistic, p_value = mannwhitneyu(counts_perc_patients_NACT[column_to_compare], counts_perc_patients_ACT[column_to_compare])

# Affichez les résultats
print("Statistique de test de Mann-Whitney :", statistic)
print("P-value associée :", p_value)

# Interprétez les résultats
alpha = 0.05
if p_value < alpha:
    print("La différence entre les groupes est statistiquement significative.")
else:
    print("Il n'y a pas de différence statistiquement significative entre les groupes.")

Statistique de test de Mann-Whitney : 2845.0
P-value associée : 0.44739619874878456
Il n'y a pas de différence statistiquement significative entre les groupes.


In [68]:
import plotly.graph_objs as go

# Données à comparer
data1 = counts_perc_patients_NACT[column_to_compare]
data2 = counts_perc_patients_ACT[column_to_compare]

# Créer un graphique boxplot
fig = go.Figure()

# Ajouter les données de chaque groupe au graphique
fig.add_trace(go.Violin(y=data1, name='NACT', box_visible=True, meanline_visible=True))
fig.add_trace(go.Violin(y=data2, name='ACT', box_visible=True, meanline_visible=True))

# Mettre en forme le titre et les axes
fig.update_layout(
    title="Comparaison de la distribution de '{}' entre les groupes NACT et ACT".format(column_to_compare),
    xaxis_title="Groupe",
    yaxis_title="Valeurs de '{}'".format(column_to_compare)
)

# Afficher le graphique
fig.show()

plot(fig)

'temp-plot.html'

In [69]:
# Compter le nombre d'occurrences des checkpoints immunitaires par patiente
occurrences_checkpoint_NACT = df_NACT.groupby('Patient')['immune_checkpoint'].value_counts()
occurrences_checkpoint_ACT = df_ACT.groupby('Patient')['immune_checkpoint'].value_counts()

# Afficher le résultat
print(occurrences_checkpoint_NACT)
print(occurrences_checkpoint_ACT)

Patient  immune_checkpoint
104      None                 2461
         PDL1                   20
         PD1                    10
11       None                 6138
         PDL1                   50
                              ... 
95       B7H4_PDL1               1
98       None                 2901
         B7H4                  487
         PDL1                  321
         PD1                     1
Name: immune_checkpoint, Length: 155, dtype: int64
Patient  immune_checkpoint
10       None                 6427
         B7H4                   23
         PD1                     5
         PDL1                    4
         B7H4_PD1                1
                              ... 
97       B7H4                    2
99       None                 1064
         B7H4                  182
         PD1                    13
         PDL1                    6
Name: immune_checkpoint, Length: 583, dtype: int64


In [70]:
from scipy.stats import chi2_contingency

# Créer un tableau de contingence à partir des occurrences de checkpoints immunitaires dans chaque tableau
contingency_table = pd.concat([occurrences_checkpoint_NACT, occurrences_checkpoint_ACT], axis=1, keys=['NACT', 'ACT'], sort=False).fillna(0)

# Effectuer le test de chi-deux
chi2_stat, p_val, _, _ = chi2_contingency(contingency_table)

# Afficher les résultats
print("Statistique de test de chi-deux :", chi2_stat)
print("P-valeur associée :", p_val)

# Interpréter les résultats
alpha = 0.05
if p_val < alpha:
    print("Il y a une différence statistiquement significative entre les distributions des checkpoints immunitaires dans les deux groupes.")
else:
    print("Il n'y a pas de différence statistiquement significative entre les distributions des checkpoints immunitaires dans les deux groupes.")


Statistique de test de chi-deux : 609887.0
P-valeur associée : 0.0
Il y a une différence statistiquement significative entre les distributions des checkpoints immunitaires dans les deux groupes.


In [71]:
import plotly.graph_objs as go

# Concaténer les occurrences des checkpoints immunitaires dans les deux tableaux
all_occurrences = pd.concat([occurrences_checkpoint_NACT, occurrences_checkpoint_ACT], axis=1, keys=['NACT', 'ACT'], sort=False).fillna(0)

# Calculer le nombre total de patientes dans chaque groupe
total_patients_NACT = len(df_NACT['Patient'].unique())
total_patients_ACT = len(df_ACT['Patient'].unique())

# Diviser le nombre d'occurrences de chaque checkpoint immunitaire par le nombre total de patientes dans le groupe correspondant
all_occurrences_normalized = all_occurrences.copy()
all_occurrences_normalized['NACT'] /= total_patients_NACT
all_occurrences_normalized['ACT'] /= total_patients_ACT

all_occurrences_normalized

Unnamed: 0_level_0,Unnamed: 1_level_0,NACT,ACT
Patient,immune_checkpoint,Unnamed: 2_level_1,Unnamed: 3_level_1
104,,64.763158,0.000000
104,PDL1,0.526316,0.000000
104,PD1,0.263158,0.000000
11,,161.526316,0.000000
11,PDL1,1.315789,0.000000
...,...,...,...
97,B7H4,0.000000,0.014388
99,,0.000000,7.654676
99,B7H4,0.000000,1.309353
99,PD1,0.000000,0.093525


In [72]:
df_ACT['Patient'].unique()

array(['62', '63', '59', '60', '33', '35', '36', '37', '38', '30', '25',
       '27', '29', '20', '21', '22', '23', '24', '14', '16', '17', '19',
       '12', '8', '9', '10', '4', '52', '53', '54', '55', '56', '57',
       '50', '42', '43', '44', '45', '47', '39', '41', '49', '46', '48',
       '58', '28', '13', '6', '34', '3', '122', '125', '124', '94', '101',
       '86', '84', '83', '91', '88', '87', '75', '74', '82', '81', '80',
       '79', '67', '66', '65', '64', '73', '71', '69', '68', '123', '115',
       '114', '113', '120', '117', '126', '105', '103', '102', '110',
       '109', '106', '166', '112', '108', '107', '97', '96', '100', '78',
       '77', '70', '72', '99', '92', '90', '136', '135', '134', '133',
       '132', '131', '130', '129', '144', '143', '155', '154', '150',
       '148', '147', '164', '161', '159', '128', '127', '142', '139',
       '152', '158', '156', '172', '170', '168', '167', '165', '181',
       '180', '176', '173', '175', '185', '186'], dtype=object)

In [73]:
import plotly.graph_objs as go

# Calculer le nombre total de patientes dans chaque groupe
total_patients_NACT = len(df_NACT['Patient'].unique())
total_patients_ACT = len(df_ACT['Patient'].unique())

# Diviser le nombre d'occurrences de chaque checkpoint immunitaire par le nombre total de patientes dans le groupe correspondant
all_occurrences_normalized = all_occurrences.copy()
all_occurrences_normalized['NACT'] /= total_patients_NACT
all_occurrences_normalized['ACT'] /= total_patients_ACT

# Créer un objet Figure de Plotly
fig = go.Figure()

# Ajouter les barres pour les proportions normalisées de chaque checkpoint immunitaire dans les deux groupes
for col in all_occurrences_normalized.columns:
    fig.add_trace(go.Bar(x=all_occurrences_normalized.index.get_level_values('immune_checkpoint'), y=all_occurrences_normalized[col], name=col))

# Mettre en forme le titre et les axes
fig.update_layout(
    title="Comparaison des proportions de checkpoints immunitaires entre les groupes NACT et ACT",
    xaxis_title="Checkpoint Immunitaire",
    yaxis_title="Proportion d'Occurrences",
    xaxis_tickangle=-45,
    legend_title="Groupe"
)

# Afficher le graphique
fig.show()
plot(fig)

'temp-plot.html'

### V.4.3. IMMUNE CHECKPOINT

#### V.4.3.1. NACT

In [74]:
# Count of each immune_checkpoint type by cell_subtype
counts_NACT = df_NACT.groupby(['cell_type', 'cell_subtype', 'immune_checkpoint']).size().reset_index(name='count')

# % for each cell_subtype
counts_NACT['percentage'] = counts_NACT.groupby('cell_subtype')['count'].apply(lambda x: (x / x.sum()) * 100)

display(counts_NACT)

Unnamed: 0,cell_type,cell_subtype,immune_checkpoint,count,percentage
0,CANCER,CANCER,B7H4,10827,16.317273
1,CANCER,CANCER,B7H4_PD1,140,0.210993
2,CANCER,CANCER,B7H4_PDL1,19,0.028635
3,CANCER,CANCER,,52778,79.541242
4,CANCER,CANCER,PD1,1553,2.340512
...,...,...,...,...,...
59,STROMA,αSMA_myCAF,B7H4,13,0.101634
60,STROMA,αSMA_myCAF,B7H4_PD1,1,0.007818
61,STROMA,αSMA_myCAF,,12746,99.648190
62,STROMA,αSMA_myCAF,PD1,15,0.117270


In [75]:
fig = px.bar(counts_NACT, x='immune_checkpoint', y='percentage', color='cell_type', 
             title='Percentage of Each Immune Checkpoint Type by Cell Subtype in NACT',
             labels={'immune_checkpoint': 'Immune Checkpoint', 'percentage': 'Percentage', 'cell_subtype': 'Cell Subtype'})

fig.show()
plot(fig)

'temp-plot.html'

#### V.4.3.2. ACT

In [76]:
# Count of each immune_checkpoint type by cell_subtype
counts_ACT = df_ACT.groupby(['cell_type', 'cell_subtype', 'immune_checkpoint']).size().reset_index(name='count')

# % for each cell_subtype
counts_ACT['percentage'] = counts_ACT.groupby('cell_subtype')['count'].apply(lambda x: (x / x.sum()) * 100)

print(counts_ACT)

   cell_type  cell_subtype immune_checkpoint   count  percentage
0     CANCER        CANCER              B7H4   32646    9.083725
1     CANCER        CANCER          B7H4_PD1     425    0.118256
2     CANCER        CANCER         B7H4_PDL1      84    0.023373
3     CANCER        CANCER              None  314226   87.433151
4     CANCER        CANCER               PD1    7532    2.095773
..       ...           ...               ...     ...         ...
66    STROMA  STROMA_OTHER              PDL1     160    0.332226
67    STROMA    αSMA_myCAF              B7H4      51    0.185266
68    STROMA    αSMA_myCAF              None   27200   98.808486
69    STROMA    αSMA_myCAF               PD1     256    0.929962
70    STROMA    αSMA_myCAF              PDL1      21    0.076286

[71 rows x 5 columns]


In [77]:
# Créer le graphique à barres avec Plotly Express
fig = px.bar(counts_ACT, x='immune_checkpoint', y='percentage', color='cell_type', 
             title='Percentage of Each Immune Checkpoint Type by Cell Subtype in ACT',
             labels={'immune_checkpoint': 'Immune Checkpoint', 'percentage': 'Percentage', 'cell_subtype': 'Cell Subtype'})

# Afficher le graphique
fig.show()
plot(fig)

'temp-plot.html'

In [78]:
# Concaténer les deux DataFrames
combined_counts = pd.concat([counts_NACT, counts_ACT])

# Ajouter une colonne pour distinguer les deux ensembles de données
combined_counts['dataset'] = ['NACT'] * len(counts_NACT) + ['ACT'] * len(counts_ACT)

# Créer le graphique à barres avec Plotly Express
fig = px.bar(combined_counts, x='immune_checkpoint', y='percentage', color='cell_subtype', 
             facet_col='dataset', facet_col_wrap=1,
             title='Percentage of Each Immune Checkpoint Type by Cell Subtype',
             labels={'immune_checkpoint': 'Immune Checkpoint', 'percentage': 'Percentage', 'cell_subtype': 'Cell Subtype', 'dataset': 'Dataset'})

# Afficher le graphique
fig.show()
plot(fig)

'temp-plot.html'

In [79]:
# Concaténer les deux DataFrames
combined_counts = pd.concat([counts_NACT, counts_ACT])

# Ajouter une colonne pour distinguer les deux ensembles de données
combined_counts['dataset'] = ['NACT'] * len(counts_NACT) + ['ACT'] * len(counts_ACT)

# Créer le graphique à barres groupées avec Plotly Express
fig = px.bar(combined_counts, x='immune_checkpoint', y='percentage', color='cell_subtype', 
             facet_col='dataset', facet_col_wrap=1,
             barmode='group',
             title='Percentage of Each Immune Checkpoint Type by Cell Subtype',
             labels={'immune_checkpoint': 'Immune Checkpoint', 'percentage': 'Percentage', 'cell_subtype': 'Cell Subtype', 'dataset': 'Dataset'})

# Afficher le graphique
fig.show()
plot(fig)

'temp-plot.html'

In [80]:
combined_counts

Unnamed: 0,cell_type,cell_subtype,immune_checkpoint,count,percentage,dataset
0,CANCER,CANCER,B7H4,10827,16.317273,NACT
1,CANCER,CANCER,B7H4_PD1,140,0.210993,NACT
2,CANCER,CANCER,B7H4_PDL1,19,0.028635,NACT
3,CANCER,CANCER,,52778,79.541242,NACT
4,CANCER,CANCER,PD1,1553,2.340512,NACT
...,...,...,...,...,...,...
66,STROMA,STROMA_OTHER,PDL1,160,0.332226,ACT
67,STROMA,αSMA_myCAF,B7H4,51,0.185266,ACT
68,STROMA,αSMA_myCAF,,27200,98.808486,ACT
69,STROMA,αSMA_myCAF,PD1,256,0.929962,ACT


### V.4.4. NACT VS ACT

In [81]:
#######
####### Count by cell SUBtype !! IN NACT !!
#######
cell_subtypes = ['DC', 'B', 'TCD4', 'TCD8', 'M1', 'M2', 'Treg', \
                 'IMMUNE_OTHER', 'CANCER', 'αSMA_myCAF', 'STROMA_OTHER', 'ENDOTHELIAL']
# Initialisation d'un dictionnaire pour stocker les counts des sous-types de cellules
subtype_counts = {}

# Boucle sur les sous-types de cellules pour compter les échantillons correspondants
for subtype in cell_subtypes:
    subtype_counts[subtype.lower()] = pd.DataFrame({subtype.lower():
        df_NACT.loc[
            df_NACT['cell_subtype'] == subtype, 'Sample_ID'].value_counts()
    }).sort_index()

# Concaténation des counts des sous-types de cellules en un seul DataFrame
counts_subtypes = pd.concat([pd.DataFrame(v) for v in subtype_counts.values()], axis=1, sort=False)
counts_subtypes = counts_subtypes.fillna(0)

# Enregistrement des counts des sous-types de cellules dans un fichier CSV
filename_subtypes = os.path.join(output_data_dir, project_name + "_cell_subtypes_number.csv")
counts_subtypes.to_csv(filename_subtypes, index=False)
counts_subtypes

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,stroma_other,endothelial
DD3S1.csv,11.0,0.0,551,539,73,22.0,1,110,8186,1143,2380,283
DD3S3.csv,27.0,44.0,1441,728,69,19.0,2,258,13963,2334,5066,321
DD4S1.csv,48.0,3.0,1087,211,26,5.0,5,531,8862,274,843,778
DD4S2.csv,28.0,1.0,807,47,78,74.0,4,161,6999,334,619,268
DD4S3.csv,82.0,1.0,373,144,55,64.0,5,917,5474,37,708,178
DD5S1.csv,55.0,1.0,861,69,342,17.0,7,548,6529,1776,2269,758
DD5S2.csv,71.0,0.0,413,63,27,4.0,5,247,5963,2673,3198,930
DD5S3.csv,490.0,3.0,904,230,332,69.0,7,312,8092,2657,1428,581
DD3S2.csv,0.0,0.0,15,7,2,0.0,1,17,2285,1563,317,119


In [82]:
# Ajout des colonnes de pourcentages pour chaque sous-type de cellules
counts_perc = counts_subtypes.copy()

# Calcul des pourcentages pour chaque sous-type de cellules
for col in counts_perc.columns:
    counts_perc[col + '_perc'] = (counts_perc[col] / counts_perc.sum(axis=1)) * 100

# Affichage des pourcentages des sous-types de cellules

counts_perc['Sample_ID'] = counts_perc.index
counts_perc.columns.values
display(counts_perc)

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,...,tcd8_perc,m1_perc,m2_perc,treg_perc,immune_other_perc,cancer_perc,αsma_mycaf_perc,stroma_other_perc,endothelial_perc,Sample_ID
DD3S1.csv,11.0,0.0,551,539,73,22.0,1,110,8186,1143,...,4.051649,0.548572,0.165316,0.007514,0.826571,61.508057,8.548777,17.789229,2.112465,DD3S1.csv
DD3S3.csv,27.0,44.0,1441,728,69,19.0,2,258,13963,2334,...,2.998571,0.28417,0.078249,0.008237,1.062533,57.501935,9.589089,20.80514,1.317163,DD3S3.csv
DD4S1.csv,48.0,3.0,1087,211,26,5.0,5,531,8862,274,...,1.663778,0.204988,0.03942,0.03942,4.186401,69.844899,2.147679,6.606528,6.093973,DD4S1.csv
DD4S2.csv,28.0,1.0,807,47,78,74.0,4,161,6999,334,...,0.498469,0.827202,0.784713,0.042413,1.707131,74.199057,3.513225,6.508632,2.816027,DD4S2.csv
DD4S3.csv,82.0,1.0,373,144,55,64.0,5,917,5474,37,...,1.790227,0.683615,0.795412,0.062135,11.395546,67.929123,0.45531,8.711933,2.187943,DD4S3.csv
DD5S1.csv,55.0,1.0,861,69,342,17.0,7,548,6529,1776,...,0.52119,2.583189,0.128379,0.052861,4.13828,49.289035,13.357759,17.048613,5.688107,DD5S1.csv
DD5S2.csv,71.0,0.0,413,63,27,4.0,5,247,5963,2673,...,0.463318,0.198558,0.029416,0.036769,1.816405,43.845251,19.591101,23.405353,6.794795,DD5S2.csv
DD5S3.csv,490.0,3.0,904,230,332,69.0,7,312,8092,2657,...,1.521743,2.196382,0.456411,0.046301,2.063701,53.516636,17.510159,9.399957,3.822127,DD5S3.csv
DD3S2.csv,0.0,0.0,15,7,2,0.0,1,17,2285,1563,...,0.161799,0.046227,0.0,0.023113,0.39292,52.808306,35.686731,7.17931,2.690697,DD3S2.csv


In [83]:
fig = go.Figure()
title = 'NACT_patients_Cell subtypes proportions by Sample ID and tissue type'

for cell_subtype in cell_subtypes:
    fig.add_trace(
        go.Bar(
            name=cell_subtype,
            x=counts_perc['Sample_ID'],
            y=counts_perc[f'{cell_subtype.lower()}_perc'],
            text=counts_perc[f'{cell_subtype.lower()}_perc'],
            textposition='auto',
            marker_color='rgb' + str(cell_subtype_color_dict[cell_subtype])))

fig.update_layout(
    plot_bgcolor='white',
    barmode='stack',
    xaxis=dict(linecolor='black'),
    title=title,
    yaxis=dict(title='Cell count (%)', linecolor='black')
)

# Enregistrer l'image
output_filename = title.replace(" ", "_") + ".png"
#fig.write_image(output_images_dir + "/" + output_filename, scale=1000)

# Afficher le graphique
fig.show()
plot(fig)

'temp-plot.html'

In [84]:
#######
####### Count by cell SUBtype !! IN ACT !!
#######
cell_subtypes = ['DC', 'B', 'TCD4', 'TCD8', 'M1', 'M2', 'Treg', \
                 'IMMUNE_OTHER', 'CANCER', 'αSMA_myCAF', 'STROMA_OTHER', 'ENDOTHELIAL']
# Initialisation d'un dictionnaire pour stocker les counts des sous-types de cellules
subtype_counts = {}

# Boucle sur les sous-types de cellules pour compter les échantillons correspondants
for subtype in cell_subtypes:
    subtype_counts[subtype.lower()] = pd.DataFrame({subtype.lower():
        df_ACT.loc[
            df_ACT['cell_subtype'] == subtype, 'Sample_ID'].value_counts()
    }).sort_index()

# Concaténation des counts des sous-types de cellules en un seul DataFrame
counts_subtypes = pd.concat([pd.DataFrame(v) for v in subtype_counts.values()], axis=1, sort=False)
counts_subtypes = counts_subtypes.fillna(0)

# Enregistrement des counts des sous-types de cellules dans un fichier CSV
filename_subtypes = os.path.join(output_data_dir, project_name + "_cell_subtypes_number.csv")
counts_subtypes.to_csv(filename_subtypes, index=False)
counts_subtypes

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,stroma_other,endothelial
DD3S1.csv,70,0.0,2440,2310,237,107,5,884,40795,2824,4343,1383
DD3S2.csv,294,0.0,1900,2343,92,25,37,1244,48774,3226,6958,1631
DD3S3.csv,253,94.0,5416,2822,183,77,26,1616,63037,5090,10922,2457
DD4S1.csv,224,26.0,1580,1739,50,33,7,1858,45151,2832,3300,1275
DD4S2.csv,102,13.0,1131,1647,34,28,15,680,32471,1330,3910,964
DD4S3.csv,1068,30.0,1980,2796,100,264,33,2419,45124,3649,4122,1195
DD5S1.csv,528,138.0,1979,467,162,11,21,672,37194,5200,7822,2037
DD5S2.csv,89,16.0,722,82,104,59,9,220,24876,1203,4024,405
DD5S3.csv,977,32.0,1096,457,147,41,6,553,21968,2174,2759,583


In [85]:
# Ajout des colonnes de pourcentages pour chaque sous-type de cellules
counts_perc = counts_subtypes.copy()

# Calcul des pourcentages pour chaque sous-type de cellules
for col in counts_perc.columns:
    counts_perc[col + '_perc'] = (counts_perc[col] / counts_perc.sum(axis=1)) * 100

# Affichage des pourcentages des sous-types de cellules

counts_perc['Sample_ID'] = counts_perc.index
counts_perc.columns.values
display(counts_perc)

Unnamed: 0,dc,b,tcd4,tcd8,m1,m2,treg,immune_other,cancer,αsma_mycaf,...,tcd8_perc,m1_perc,m2_perc,treg_perc,immune_other_perc,cancer_perc,αsma_mycaf_perc,stroma_other_perc,endothelial_perc,Sample_ID
DD3S1.csv,70,0.0,2440,2310,237,107,5,884,40795,2824,...,4.169485,0.427746,0.193116,0.009024,1.595457,73.625322,5.089889,7.826968,2.492095,DD3S1.csv
DD3S2.csv,294,0.0,1900,2343,92,25,37,1244,48774,3226,...,3.521863,0.138282,0.037576,0.055613,1.869804,73.308069,4.843391,10.445711,2.448158,DD3S2.csv
DD3S3.csv,253,94.0,5416,2822,183,77,26,1616,63037,5090,...,3.067416,0.198908,0.083693,0.02826,1.756471,68.515202,5.528227,11.861625,2.668033,DD3S3.csv
DD4S1.csv,224,26.0,1580,1739,50,33,7,1858,45151,2832,...,2.994241,0.086086,0.056817,0.012052,3.198964,77.733303,4.869139,5.67331,2.191747,DD4S1.csv
DD4S2.csv,102,13.0,1131,1647,34,28,15,680,32471,1330,...,3.891047,0.080318,0.066144,0.035434,1.606349,76.702624,3.136029,9.218771,2.272369,DD4S2.csv
DD4S3.csv,1068,30.0,1980,2796,100,264,33,2419,45124,3649,...,4.4533,0.159263,0.420452,0.052556,3.852525,71.860551,5.804437,6.556229,1.900504,DD4S3.csv
DD5S1.csv,528,138.0,1979,467,162,11,21,672,37194,5200,...,0.830433,0.288069,0.01956,0.037342,1.194945,66.136689,9.235544,13.890111,3.616361,DD5S1.csv
DD5S2.csv,89,16.0,722,82,104,59,9,220,24876,1203,...,0.257768,0.326922,0.185464,0.028291,0.691554,78.194228,3.772191,12.616375,1.269287,DD5S2.csv
DD5S3.csv,977,32.0,1096,457,147,41,6,553,21968,2174,...,1.483774,0.477252,0.133109,0.019479,1.795341,71.316005,7.041282,8.933977,1.887279,DD5S3.csv


In [86]:
fig = go.Figure()
title = 'ACT_patients_Cell subtypes proportions by Sample ID and tissue type'

for cell_subtype in cell_subtypes:
    fig.add_trace(
        go.Bar(
            name=cell_subtype,
            x=counts_perc['Sample_ID'],
            y=counts_perc[f'{cell_subtype.lower()}_perc'],
            text=counts_perc[f'{cell_subtype.lower()}_perc'],
            textposition='auto',
            marker_color='rgb' + str(cell_subtype_color_dict[cell_subtype])))

fig.update_layout(
    plot_bgcolor='white',
    barmode='stack',
    xaxis=dict(linecolor='black'),
    title=title,
    yaxis=dict(title='Cell count (%)', linecolor='black')
)

# Enregistrer l'image
output_filename = title.replace(" ", "_") + ".png"
#fig.write_image(output_images_dir + "/" + output_filename, scale=1000)

# Afficher le graphique
fig.show()
plot(fig)

'temp-plot.html'

In [87]:
df1

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nuc_Y_Inv,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DD3S1_Cell_0,-0.677863,-0.417494,-0.912537,-0.817876,0.930099,0.232078,-0.483158,1.535604,0.807339,1.167755,...,16632.205078,0.955040,1484.771729,0,127,DD3S1.csv,,339,61,61a
DD3S1_Cell_1,-0.677863,-0.516487,-0.838037,-0.869685,1.114924,0.301333,-0.344770,1.668368,0.875455,1.643023,...,16627.384766,0.966643,1426.250000,0,112,DD3S1.csv,,344,61,61a
DD3S1_Cell_2,-0.677863,-0.141921,-1.016023,-0.755879,0.834577,0.259216,-0.438292,1.336308,0.705088,1.053636,...,16622.238281,0.721534,1531.110474,0,181,DD3S1.csv,,422,61,61a
DD3S1_Cell_3,-0.741282,-0.460472,-0.491711,-0.818084,0.648200,0.107027,-0.444889,1.249805,0.660707,1.165861,...,16623.007812,0.587196,1518.907593,0,119,DD3S1.csv,,278,61,61a
DD3S1_Cell_6,-0.621521,-0.247254,-0.867127,-0.742544,0.810579,0.272128,-0.507117,1.251434,0.947172,2.545301,...,16619.978516,0.935716,1471.914917,0,47,DD3S1.csv,,204,61,61a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TMA_Cell_115755,0.478275,0.558670,-0.962840,1.732291,0.507434,-0.912641,0.311322,0.816068,0.596520,0.090397,...,2663.253418,0.982196,15564.458008,59,142,TMA.csv,,386,c59,c59a
TMA_Cell_115756,0.297418,0.420594,-0.971632,1.966955,0.304365,-1.164112,0.866636,-0.092857,-0.241830,-0.617835,...,2661.765869,0.775977,15629.680664,59,47,TMA.csv,,270,c59,c59a
TMA_Cell_115757,0.346950,0.453951,-0.602893,1.338956,0.559435,-0.801333,0.447061,0.988156,1.567869,0.403878,...,2657.015625,0.688747,15518.421875,59,64,TMA.csv,,202,c59,c59a
TMA_Cell_115758,-0.189415,0.508840,-0.886041,0.647980,-0.227224,-1.022549,-0.099256,0.219755,0.603715,-0.219145,...,2660.258545,0.751402,15539.275391,59,58,TMA.csv,,182,c59,c59a


## RATIO M2/M1

In [88]:
# Filtrer les lignes correspondant aux cellules M1 et M2 pour les patients NACT
df_M1_NACT = df[(df['cell_subtype'] == 'M1') & df['Patient'].isin(num_NACT_patients)]
df_M2_NACT = df[(df['cell_subtype'] == 'M2') & df['Patient'].isin(num_NACT_patients)]

# Filtrer les lignes correspondant aux cellules M1 et M2 pour les patients ACT
df_M1_ACT = df[(df['cell_subtype'] == 'M1') & df['Patient'].isin(num_ACT_patients)]
df_M2_ACT = df[(df['cell_subtype'] == 'M2') & df['Patient'].isin(num_ACT_patients)]

In [89]:
# Regrouper les données par patient et calculer le total de cellules M1 et M2 pour les patients NACT
result_M1_NACT = df_M1_NACT.groupby('Patient').size().rename('total_M1_cells')
result_M2_NACT = df_M2_NACT.groupby('Patient').size().rename('total_M2_cells')

# Regrouper les données par patient et calculer le total de cellules M1 et M2 pour les patients ACT
result_M1_ACT = df_M1_ACT.groupby('Patient').size().rename('total_M1_cells')
result_M2_ACT = df_M2_ACT.groupby('Patient').size().rename('total_M2_cells')

In [90]:
# Créer le DataFrame résultant avec Patient comme index pour les patients NACT
result_NACT = pd.concat([result_M1_NACT, result_M2_NACT], axis=1, sort=False).fillna(0)

# Créer le DataFrame résultant avec Patient comme index pour les patients ACT
result_ACT = pd.concat([result_M1_ACT, result_M2_ACT], axis=1, sort=False).fillna(0)

In [91]:
# Ajouter une colonne pour le ratio de M2/M1 pour les patients NACT
result_NACT['M2_M1_ratio'] = result_NACT['total_M2_cells'] / result_NACT['total_M1_cells']
result_NACT['Treatment_Type'] = 'NACT'  # Ajouter une colonne pour indiquer le traitement

# Ajouter une colonne pour le ratio de M2/M1 pour les patients ACT
result_ACT['M2_M1_ratio'] = result_ACT['total_M2_cells'] / result_ACT['total_M1_cells']
result_ACT['Treatment_Type'] = 'ACT'  # Ajouter une colonne pour indiquer le traitement

In [92]:
# Regrouper les données par patient et calculer le total de cellules M1 et M2 pour les patients NACT
result_M1_NACT = df_M1_NACT.groupby('Patient').size().rename('total_M1_cells')
result_M2_NACT = df_M2_NACT.groupby('Patient').size().rename('total_M2_cells')

# Regrouper les données par patient et calculer le total de cellules M1 et M2 pour les patients ACT
result_M1_ACT = df_M1_ACT.groupby('Patient').size().rename('total_M1_cells')
result_M2_ACT = df_M2_ACT.groupby('Patient').size().rename('total_M2_cells')

In [93]:
result_ACT

Unnamed: 0_level_0,total_M1_cells,total_M2_cells,M2_M1_ratio,Treatment_Type
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,167.0,100.0,0.598802,ACT
101,3.0,1.0,0.333333,ACT
102,5.0,1.0,0.200000,ACT
103,7.0,0.0,0.000000,ACT
105,3.0,3.0,1.000000,ACT
...,...,...,...,...
34,0.0,2.0,inf,ACT
45,0.0,3.0,inf,ACT
71,0.0,16.0,inf,ACT
72,0.0,1.0,inf,ACT


In [94]:
result_NACT

Unnamed: 0_level_0,total_M1_cells,total_M2_cells,M2_M1_ratio,Treatment_Type
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
104,1.0,2.0,2.0,NACT
111,10.0,3.0,0.3,NACT
119,3.0,0.0,0.0,NACT
121,4.0,0.0,0.0,NACT
137,1.0,1.0,1.0,NACT
140,307.0,38.0,0.123779,NACT
141,10.0,1.0,0.1,NACT
146,1.0,2.0,2.0,NACT
149,275.0,3.0,0.010909,NACT
15,7.0,1.0,0.142857,NACT


In [95]:
# Concaténer les résultats pour obtenir le DataFrame final
result_final = pd.concat([result_NACT, result_ACT])

# Trier le DataFrame par index
result_sorted = result_final.sort_index()

# Définir l'option pour afficher toutes les lignes du DataFrame
#pd.set_option('display.max_rows', None)

display(result_sorted)

Unnamed: 0_level_0,total_M1_cells,total_M2_cells,M2_M1_ratio,Treatment_Type
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,167.0,100.0,0.598802,ACT
100,0.0,3.0,inf,ACT
101,3.0,1.0,0.333333,ACT
102,5.0,1.0,0.200000,ACT
103,7.0,0.0,0.000000,ACT
...,...,...,...,...
92,3.0,25.0,8.333333,ACT
95,5.0,44.0,8.800000,NACT
96,2.0,1.0,0.500000,ACT
98,73.0,70.0,0.958904,NACT


In [96]:
# Remplacer les valeurs négatives du ratio M2/M1 par 0
#result_sorted['M2_M1_ratio'] = result_sorted['M2_M1_ratio'].clip(lower=0)

#result_sorted['M2_M1_ratio'] = result_sorted['M2_M1_ratio'].replace([-float('inf'), float('inf')], 0)
#result_sorted['M2_M1_ratio'] = result_sorted['M2_M1_ratio'].apply(lambda x: 0 if x < 0 else x)

# Définir l'option pour afficher toutes les lignes du DataFrame
#pd.set_option('display.max_rows', None)

display(result_sorted)

Unnamed: 0_level_0,total_M1_cells,total_M2_cells,M2_M1_ratio,Treatment_Type
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,167.0,100.0,0.598802,ACT
100,0.0,3.0,inf,ACT
101,3.0,1.0,0.333333,ACT
102,5.0,1.0,0.200000,ACT
103,7.0,0.0,0.000000,ACT
...,...,...,...,...
92,3.0,25.0,8.333333,ACT
95,5.0,44.0,8.800000,NACT
96,2.0,1.0,0.500000,ACT
98,73.0,70.0,0.958904,NACT


In [97]:
# Créer le violin plot avec des points individuels
fig = px.violin(result_sorted, x='Treatment_Type', y='M2_M1_ratio', 
                box=True, hover_data=result_sorted.columns)

# Afficher le violin plot
fig.update_layout(title=f'Violin Plot du Ratio M2/M1 en fonction du Traitement',
                  xaxis_title='Traitement', yaxis_title='Ratio M2/M1')


fig.show()
plot(fig)
import plotly.io as pio
#pio.write_image(fig, 'Violinplot_ratio_M2_M1.png')

In [98]:
# Filtrer les lignes correspondant aux cellules CANCER B7H4/PDL1 pour les patients NACT
df_B7H4_NACT = df[(df['immune_checkpoint'] == 'B7H4') & df['Patient'].isin(num_NACT_patients) & (df['cell_type'] == 'CANCER')]
df_PDL1_NACT = df[(df['immune_checkpoint'] == 'PDL1') & df['Patient'].isin(num_NACT_patients) & ((df['cell_type'] == 'CANCER') | (df['cell_type'] == 'IMMUNE'))]

# Filtrer les lignes correspondant aux cellules CANCER B7H4/PDL1 pour les patients ACT
df_B7H4_ACT = df[(df['immune_checkpoint'] == 'B7H4') & df['Patient'].isin(num_ACT_patients) & (df['cell_type'] == 'CANCER')]
df_PDL1_ACT = df[(df['immune_checkpoint'] == 'PDL1') & df['Patient'].isin(num_ACT_patients) & ((df['cell_type'] == 'CANCER') | (df['cell_type'] == 'IMMUNE'))]


In [99]:
#pd.set_option('display.max_rows', None)
print(df_B7H4_NACT.shape)
print(df_PDL1_NACT.shape)

print(df_B7H4_ACT.shape)
print(df_PDL1_ACT.shape)

(10827, 40)
(1229, 40)
(32646, 40)
(5434, 40)


In [100]:
df_PDL1_NACT.head()

Unnamed: 0_level_0,PDL1_Cytoplasm_Intensity_Average,HLA_Cytoplasm_Intensity_Average,CKs_Cytoplasm_Intensity_Average,Ki67_Nucleus_Intensity_Average,CD163_Cytoplasm_Intensity_Average,ColVI_Cytoplasm_Intensity_Average,CD20_Cytoplasm_Intensity_Average,PD1_Cytoplasm_Intensity_Average,AXL_Cytoplasm_Intensity_Average,CD31_Cytoplasm_Intensity_Average,...,Nucleus_Roundness,Nuc_X,ROI_index,Nucleus_Size,Sample_ID,immune_checkpoint,Cell_Size,Patient,Unique_ROI_index,Primary_chem(1)_vs_surg(0)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DD3S1_Cell_21828,1.609792,2.521615,0.223023,1.084166,0.338872,-1.112327,1.374637,0.236768,-0.450948,-0.132787,...,0.868,3246.450684,11,91,DD3S1.csv,PDL1,220,32,32a,1.0
DD3S1_Cell_21831,1.590608,2.493721,0.231741,0.251524,1.185084,-1.203795,1.610041,-0.700345,-0.673078,-0.641906,...,0.580756,2951.785645,11,70,DD3S1.csv,PDL1,153,32,32a,1.0
DD3S1_Cell_21937,1.61123,2.189588,0.558693,0.249337,0.313269,-1.248514,1.50192,-0.669547,-0.781676,-0.848153,...,0.829105,3137.495117,11,101,DD3S1.csv,PDL1,200,32,32a,1.0
DD3S1_Cell_21939,1.630944,2.690321,-0.185765,1.258937,0.552631,-1.223585,1.519502,-0.700822,-0.721136,-0.785331,...,0.925649,3217.131348,11,99,DD3S1.csv,PDL1,183,32,32a,1.0
DD3S1_Cell_21959,1.517887,2.719076,0.000214,-0.039537,0.431631,-1.257583,1.471392,-0.863404,-0.8037,-0.732914,...,0.777269,2996.028564,11,105,DD3S1.csv,PDL1,154,32,32a,1.0


In [101]:
# Regrouper les données par patient et calculer le total de cellules B7H4 et PDL1 pour les patients NACT
result_B7H4_NACT = df_B7H4_NACT.groupby('Patient').size().rename('B7H4_cells')
result_PDL1_NACT = df_PDL1_NACT.groupby('Patient').size().rename('PDL1_cells')

# Regrouper les données par patient et calculer le total de cellules B7H4 et PDL1 pour les patients ACT
result_B7H4_ACT = df_B7H4_ACT.groupby('Patient').size().rename('B7H4_cells')
result_PDL1_ACT = df_PDL1_ACT.groupby('Patient').size().rename('PDL1_cells')

In [102]:
result_PDL1_NACT

Patient
104     20
111      1
116      3
119     18
121      4
137      4
140      3
141      2
149      1
15       6
153     30
157      7
160    198
162      5
169     12
171      1
178      6
18     307
182     46
183      2
184      3
187      7
26      19
32     126
5       34
61      18
7        6
85       9
89       5
95      23
98     303
Name: PDL1_cells, dtype: int64

In [103]:
# Créer le DataFrame résultant avec Patient comme index pour les patients NACT
result_NACT = pd.concat([result_B7H4_NACT, result_PDL1_NACT], axis=1, sort=False).fillna(0)

# Créer le DataFrame résultant avec Patient comme index pour les patients ACT
result_ACT = pd.concat([result_B7H4_ACT, result_PDL1_ACT], axis=1, sort=False).fillna(0)

In [104]:
result_NACT.head()


Unnamed: 0_level_0,B7H4_cells,PDL1_cells
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1
111,35.0,1.0
116,24.0,3.0
119,2151.0,18.0
121,2361.0,4.0
137,25.0,4.0


In [105]:
len(df_NACT[df_NACT['cell_type'] == 'CANCER'])


66353

In [106]:
#  pour les patients NACT
result_NACT['%_B7H4'] = result_NACT['B7H4_cells'] / len(df_NACT[df_NACT['cell_type'] == 'CANCER'])
result_NACT['%_PDL1'] = result_NACT['PDL1_cells'] / len(df_NACT[(df_NACT['cell_type'] == 'CANCER') | (df_NACT['cell_type'] == 'IMMUNE')])
result_NACT['Treatment_Type'] = 'NACT'  # Ajouter une colonne pour indiquer le traitement

#  pour le ratio de M2/M1 pour les patients ACT
result_ACT['%_B7H4'] = result_ACT['B7H4_cells'] / len(df_ACT[df_ACT['cell_type'] == 'CANCER'])
result_ACT['%_PDL1'] = result_ACT['PDL1_cells'] / len(df_ACT[(df_ACT['cell_type'] == 'CANCER') | (df_ACT['cell_type'] == 'IMMUNE')])
result_ACT['Treatment_Type'] = 'ACT'  # Ajouter une colonne pour indiquer le traitement

In [107]:
len(df_NACT[(df_NACT['cell_type'] == 'CANCER') | (df_NACT['cell_type'] == 'IMMUNE')])

80124

In [108]:
result_NACT

Unnamed: 0_level_0,B7H4_cells,PDL1_cells,%_B7H4,%_PDL1,Treatment_Type
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
111,35.0,1.0,0.000527,1.2e-05,NACT
116,24.0,3.0,0.000362,3.7e-05,NACT
119,2151.0,18.0,0.032418,0.000225,NACT
121,2361.0,4.0,0.035582,5e-05,NACT
137,25.0,4.0,0.000377,5e-05,NACT
146,71.0,0.0,0.00107,0.0,NACT
149,281.0,1.0,0.004235,1.2e-05,NACT
15,1008.0,6.0,0.015191,7.5e-05,NACT
153,9.0,30.0,0.000136,0.000374,NACT
157,2.0,7.0,3e-05,8.7e-05,NACT


In [109]:
# Concaténer les résultats pour obtenir le DataFrame final
result_final = pd.concat([result_NACT, result_ACT])

# Trier le DataFrame par index
result_sorted = result_final.sort_index()

# Définir l'option pour afficher toutes les lignes du DataFrame
#pd.set_option('display.max_rows', None)

display(result_sorted)

Unnamed: 0_level_0,B7H4_cells,PDL1_cells,%_B7H4,%_PDL1,Treatment_Type
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,18.0,4.0,0.000050,0.000010,ACT
100,663.0,0.0,0.001845,0.000000,ACT
101,61.0,41.0,0.000170,0.000100,ACT
102,135.0,31.0,0.000376,0.000076,ACT
103,12.0,120.0,0.000033,0.000294,ACT
...,...,...,...,...,...
95,207.0,23.0,0.003120,0.000287,NACT
96,812.0,0.0,0.002259,0.000000,ACT
97,2.0,46.0,0.000006,0.000113,ACT
98,472.0,303.0,0.007113,0.003782,NACT


In [120]:
import plotly.express as px

fig = px.box(result_sorted, x='Treatment_Type', y='%_B7H4', 
                hover_data=result_sorted.columns,
                color='Treatment_Type',  # This can be set to a single color if preferred
                template='plotly_white',  # Starting with a clean white template
                )

fig.update_layout(
    title_text='Distribution of % B7H4 in Cancer Cells by Treatment',
    title_x=0.5,  # Center the title
    title_font=dict(size=18, family='Arial, sans-serif'),
    xaxis_title='Treatment Type',
    yaxis_title='% B7H4',
    xaxis=dict(tickangle=-45, title_font=dict(size=14, family='Arial, sans-serif')),
    yaxis=dict(title_font=dict(size=14, family='Arial, sans-serif')),
    legend_title_text='Treatment',
    legend_title_font=dict(size=12, family='Arial, sans-serif'),
)

# Adjusting gridlines to mimic R's ggplot2 style
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='white')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)

fig.show()
plot(fig)

import plotly.io as pio
#pio.write_image(fig, 'Violinplot_%B7H4.png')

AssertionError: 

In [178]:
fig = px.box(result_sorted, x='Treatment_Type', y='%_PDL1', 
                hover_data=result_sorted.columns,
                color='Treatment_Type',  # This can be set to a single color if preferred
                template='plotly_white',  # Starting with a clean white template
                )

fig.update_layout(
    title_text='Distribution of % PDL1 in Cancer Cells by Treatment',
    title_x=0.5,  # Center the title
    title_font=dict(size=18, family='Arial, sans-serif'),
    xaxis_title='Treatment Type',
    yaxis_title='% PDL1',
    xaxis=dict(tickangle=-45, title_font=dict(size=14, family='Arial, sans-serif')),
    yaxis=dict(title_font=dict(size=14, family='Arial, sans-serif')),
    legend_title_text='Treatment',
    legend_title_font=dict(size=12, family='Arial, sans-serif'),
)

# Adjusting gridlines to mimic R's ggplot2 style
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='white')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)


fig.show()
plot(fig)


import plotly.io as pio
#pio.write_image(fig, 'Violinplot_%PDL1.png')

In [194]:
# both on the same plot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# Assuming 'result_sorted' is your DataFrame

# Create subplots with a shared y-axis
fig = make_subplots(rows=1, cols=2,
                    shared_yaxes=True, 
                    horizontal_spacing=0.01,
                    subplot_titles=('Distribution of % B7H4', 'Distribution of % PDL1'))

# Define a color map for the treatment types
color_map = {'ACT': "#327EBA", 'NACT': "#E06663"}

# Add first plot for % B7H4
fig1 = px.box(result_sorted, x='Treatment_Type', y='%_B7H4', template='plotly_white',
              color='Treatment_Type', color_discrete_map=color_map)
for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)

# Add second plot for % PDL1
fig2 = px.box(result_sorted, x='Treatment_Type', y='%_PDL1', template='plotly_white',
              color='Treatment_Type', color_discrete_map=color_map)
for trace in fig2.data:
    fig.add_trace(trace, row=1, col=2)

# Update layout to match the style and adjust gridlines
fig.update_layout(
    title_text='Distribution of % B7H4 and % PDL1 in Cancer Cells by Treatment',
    title_x=0.5,
    title_font=dict(size=18, family='Arial, sans-serif'),
    xaxis_title='Treatment Type',
    yaxis_title='Percentage',
    xaxis=dict(tickangle=-45, title_font=dict(size=14, family='Arial, sans-serif')),
    yaxis=dict(title_font=dict(size=14, family='Arial, sans-serif')),
    legend_title_text='Treatment',
    legend_title_font=dict(size=12, family='Arial, sans-serif'),
    template='plotly_white',
    width=950,  # Increase the width of the figure
    height=700,  # Increase the height of the figure
)

# Set y-axis range to enhance visibility
fig.update_yaxes(range=[-0.001, 0.036])  # Adjust this range based on your data
 
# Adjust grid and lines
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='white')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)

fig.show()


plot(fig)

'temp-plot.html'

In [None]:
f=

## V.5. XY MAPS

#### V.5.1. CELL SUBTYPES

In [None]:
#Create a x, y map and visualize the tissues architecture
for sample in ls_samples:
    sample_id = sample.split('_')[0] + '.csv'
    sample_id2 = sample.split('_')[0]
    location_colors = df.loc[df['Sample_ID'] == sample_id,['Nuc_X','Nuc_Y_Inv','cell_type']]

    #print('nb c endo',len(location_colors.loc[location_colors['cell_type']=='ENDOTHELIAL']))
    #print('nb c immune',len(location_colors.loc[location_colors['cell_type']=='IMMUNE']))
    #print('nb c cancer',len(location_colors.loc[location_colors['cell_type']=='CANCER']))

    fig = go.Figure()
    title = sample_id2 + " Background Subtracted XY Map cell types"

    for celltype in df.loc[df['Sample_ID'] == sample_id,'cell_type'].unique():
        fig.add_scatter(
            mode = 'markers',
            marker=dict(size=3, opacity=0.5, color='rgb' + str(cell_type_color_dict[celltype])),
         x = location_colors.loc[location_colors['cell_type']==celltype,'Nuc_X'],
         y = location_colors.loc[location_colors['cell_type']==celltype,'Nuc_Y_Inv'],
         name = celltype)

    fig.update_layout(title = title, plot_bgcolor = 'white')
    fig.update_xaxes(title_text = 'Nuc_X', linecolor = 'black')
    fig.update_yaxes(title_text = 'Nuc_Y_Inv', linecolor = 'black')
    
    # Adjust the size of the points
    for trace in fig.data:
        trace.marker.size = 2  
        # Adjust the size of the points
    for trace in fig.data:
        trace.marker.size = 2  
    fig.update_layout(
        title=title,
        plot_bgcolor='white',
        legend=dict(
            title='Cell Types',  # Titre de la légende
            font=dict(
                family='Arial',
                size=12,
                color='black'
            ),
            bgcolor='white',
            bordercolor='black',
            borderwidth=0.4,
            itemsizing='constant'
        )
    )
    

    #fig.write_image(output_images_dir + "/" + title.replace(" ", "_") + ".png", width=1200, height=800, scale=4)
    print(sample_id,  "processed!")

    fig.show(renderer='png')  # Display the figure as png
    plot(fig)

In [None]:
#Create a  x,y  map and visualize the tissues architecture
for sample in ls_samples:
    sample_id = sample.split('_')[0] + '.csv'
    sample_id2 = sample.split('_')[0]
    location_colors = df.loc[df['Sample_ID'] == sample_id,['Nuc_X','Nuc_Y_Inv','cell_subtype']]

    fig = go.Figure()
    title = sample_id2 + " Background Subtracted XY Map cell subtypes"

    for cellsubtype in df.loc[df['Sample_ID'] == sample_id,'cell_subtype'].unique():
        fig.add_scatter(
            mode = 'markers',
            marker = dict(size=3, 
                          opacity=0.5, 
                          color='rgb' + str(cell_subtype_color_dict[cellsubtype])),
            x = location_colors.loc[location_colors['cell_subtype']==cellsubtype,'Nuc_X'],
            y = location_colors.loc[location_colors['cell_subtype']==cellsubtype,'Nuc_Y_Inv'],
            name = cellsubtype)

    fig.update_layout(title = title, plot_bgcolor = 'white')
    fig.update_xaxes(title_text = 'Nuc_X', linecolor = 'black')
    fig.update_yaxes(title_text = 'Nuc_Y_Inv', linecolor = 'black')
    
    # Adjust the size of the points
    for trace in fig.data:
        trace.marker.size = 2  
    fig.update_layout(
        title=title,
        plot_bgcolor='white',
        legend=dict(
            title='Cell Subtypes',  #Legende title
            font=dict(
                family='Arial',
                size=12,
                color='black'
            ),
            bgcolor='white',
            bordercolor='black',
            borderwidth=0.4,
            itemsizing='constant'
        )
    )
    

    fig.write_image(output_images_dir + "/" + title.replace(" ", "_") + ".png", width=1200, height=800, scale=4)
    print(sample_id,  "processed!")

    plot(fig)
    fig.show(renderer='png')  # Display the figure as png

#### V.5.1. IMMUNE CHECKPOINTS

In [None]:
immune_checkpoint_color_dict

In [None]:
# !!!! test for B7H4 !!!!


for sample in ls_samples:
    sample_id = sample.split('_')[0] + '.csv'
    sample_id2 = sample.split('_')[0]
    location_colors = df.loc[df['Sample_ID'] == sample_id,['Nuc_X','Nuc_Y_Inv','immune_checkpoint']]


    fig = go.Figure()
    title = sample_id2 + " Background Subtracted XY Map immune checkpoint"

    for immunecheckpoint in df.loc[df['Sample_ID'] == sample_id,'immune_checkpoint'].unique():
        fig.add_scatter(
            mode = 'markers',
            marker=dict(size=3, opacity=0.5, color='rgb' + str(immune_checkpoint_color_dict[immunecheckpoint])),
         x = location_colors.loc[location_colors['immune_checkpoint']==immunecheckpoint,'Nuc_X'],
         y = location_colors.loc[location_colors['immune_checkpoint']==immunecheckpoint,'Nuc_Y_Inv'],
         name = immunecheckpoint)

    fig.update_layout(title = title, plot_bgcolor = 'white')
    fig.update_xaxes(title_text = 'Nuc_X', linecolor = 'black')
    fig.update_yaxes(title_text = 'Nuc_Y_Inv', linecolor = 'black')
    
    # Adjust the size of the points
    for trace in fig.data:
        trace.marker.size = 2  
        # Adjust the size of the points
    for trace in fig.data:
        trace.marker.size = 2  
    fig.update_layout(
        title=title,
        plot_bgcolor='white',
        legend=dict(
            title='Immune checkpoint',  # Titre de la légende
            font=dict(
                family='Arial',
                size=12,
                color='black'
            ),
            bgcolor='white',
            bordercolor='black',
            borderwidth=0.4,
            itemsizing='constant'
        )
    )
    

    fig.write_image(output_images_dir + "/" + title.replace(" ", "_") + ".png", width=1200, height=800, scale=4)
    print(sample_id,  "processed!")

    fig.show(renderer='png')  # Display the figure as png
    plot(fig)

In [None]:
for sample in ls_samples:
    sample_id = sample.split('_')[0] + '.csv'
    sample_id2 = sample.split('_')[0]
    location_colors = df.loc[df['Sample_ID'] == sample_id, ['Nuc_X', 'Nuc_Y_Inv', 'immune_checkpoint']]

    fig = go.Figure()
    title = sample_id2 + " Background Subtracted XY Map immune checkpoint"

    for immunecheckpoint in df.loc[df['Sample_ID'] == sample_id, 'immune_checkpoint'].unique():
        # Vérifier si la combinaison est dans le dictionnaire avant d'ajouter la trace
        if immunecheckpoint in immune_checkpoint_color_dict:
            color = 'rgb' + str(immune_checkpoint_color_dict[immunecheckpoint])
            
            fig.add_scatter(
                mode='markers',
                marker=dict(size=3, opacity=0.5, color=color),
                x=location_colors.loc[location_colors['immune_checkpoint'] == immunecheckpoint, 'Nuc_X'],
                y=location_colors.loc[location_colors['immune_checkpoint'] == immunecheckpoint, 'Nuc_Y_Inv'],
                name=immunecheckpoint
            )
    
    fig.update_layout(
        title=title,
        plot_bgcolor='white',
        xaxis_title='Nuc_X',
        yaxis_title='Nuc_Y_Inv',
        legend_title='Immune checkpoint',
        legend=dict(
            font=dict(
                family='Arial',
                size=12,
                color='black'
            ),
            bgcolor='white',
            bordercolor='black',
            borderwidth=0.4,
            itemsizing='constant'
        )
    )
    
    fig.write_image(output_images_dir + "/" + title.replace(" ", "_") + ".png", width=1200, height=800, scale=4)
    print(sample_id,  "processed!")

    fig.show(renderer='png')  # Display the figure as png
    plot(fig)


### V.5.2. CELL SUBTYPE DENSITY

In [None]:
from shapely.geometry import MultiPoint

 

# Convert pixel to mm

df['x_mm'] = df['Nuc_X'] * 0.650 / 1000

df['y_mm'] = df['Nuc_Y_Inv'] * 0.650 / 1000

 

# Group by Sample_ID and ROI_Index and calculate the convex hull for each group

df_grouped = df.groupby(['Sample_ID', 'ROI_index']).apply(lambda group: MultiPoint(group[['x_mm', 'y_mm']].values).convex_hull)

 

# Calculate the area of each convex hull

df_area = df_grouped.apply(lambda hull: hull.area if hull.geom_type == 'Polygon' else 0)

 

# Convert to DataFrame

df_area = pd.DataFrame(df_area, columns=['Area_mm2'])

 

print(df_area)

## V.6. CORRELATION PLOTS

In [None]:
# Get Pearson correlations and P values for all marker values
# First, get we need to determine how many columns we will be evaluating. 
# And prepare empty Numpy arrays to hold our data.

In [None]:
#This is a work in progress section, need to run this on individual cell types instead of all cell types together

keep_sample_Set_A = [ 'TMA.csv', 'D3S1.csv', 'D3S2.csv','D3S3.csv','D4S1.csv','D4S2.csv','D4S3.csv','D5S1.csv','D5S2.csv','D5S3.csv']
keep_sample_Set_B = [ 'TMA.csv', 'DD3S1.csv','DD3S2.csv','DD3S3.csv','DD4S1.csv','DD4S2.csv','DD4S3.csv','DD5S1.csv','DD5S2.csv','DD5S3.csv']
keep_cell_type = [ 'CANCER', 'STROMA', 'ENDOTHELIAL', 'IMMUNE' ]

# Check project name and execute corresponding operations
if project_name == 'Set_A':
    keep_sample = keep_sample_Set_A
elif project_name == 'Set_B':
    keep_sample = keep_sample_Set_B
else:
    raise ValueError("Unknown project name.")

df_keep_sample = df.loc[(df['Sample_ID'].isin(keep_sample))
                      & (df['Sample_ID'].isin(keep_sample)), :].copy()

# df_keep_sample will change regarding which sample you decided to keep in the kee_sample_proj list
df_keep_sample

In [None]:
# n_corr_cols is the number of colums you will compute the correlations with
# (columns in df_keep_sample that are not in the list not_intensities)
n_corr_cols = len(df_keep_sample.columns[~df_keep_sample.columns.isin(not_intensities)])
print(n_corr_cols)

In [None]:
# An empty 2D NumPy array is created with dimensions n_corr_cols by n_corr_cols and assigned to the variable pvalues 
# This array is intended to store p-values.
pvalues = np.empty((n_corr_cols, n_corr_cols))

# Similarly, another empty 2D NumPy array with dimensions n_corr_cols by n_corr_cols is created and assigned to the variable corrvalues. 
# This array is intended to store correlation values.
corrvalues = np.empty((n_corr_cols,n_corr_cols))

#print(pvalues)
#print(corrvalues)

In [None]:
# Columns of the DataFrame for_corr are renamed using the dictionary full_to_short_names
for_corr = df_keep_sample.loc[:,~df_keep_sample.columns.isin(not_intensities)].copy()
for_corr = for_corr.rename(columns = full_to_short_names)
for_corr.shape

In [None]:
# Compute Pearson correlation coefficient and the corresponding p-value for each pair of columns in dataframe
for i in range(for_corr.shape[1]):
    for j in range(0,for_corr.shape[1]):
        col1 = for_corr[for_corr.columns.values[i]]
        col2 = for_corr[for_corr.columns.values[j]]
        corrvalues[i,j] = pearsonr(col1,col2)[0]
        pvalues[i,j] = pearsonr(col1,col2)[1]

In [None]:
# Correlation and p-value calculations are being organized and formatted into dataframes
corrvalues = pd.DataFrame(corrvalues).round(3)
corrvalues.columns = for_corr.columns.values
corrvalues.index = for_corr.columns.values

pvalues = pd.DataFrame(pvalues)
pvalues.columns = for_corr.columns.values
pvalues.index = for_corr.columns.values

### V.6.1. OPTION 1

In [None]:
# Visualizing and marking statistically significant p-values with an asterisk (*) 
# for ease of interpretation
# p-values that are less than or equal to 0.05

# p_add_star() in my_modules.py
p_w_star = pvalues.copy()
p_w_star = p_w_star.apply(lambda row: p_add_star(row), axis = 1)
p_w_star.columns = for_corr.columns.values
p_w_star.head()

In [None]:
# Check if there is non-significant values
for index, row in p_w_star.iterrows():
    for col in p_w_star.columns:
        value = p_w_star.loc[index, col]
        if "*" not in value:
            print(f"Value without asterisk found at index: {index}, column: {col}, value: {value}")

In [None]:
# Correlation visualization
sb.set()

x_axis_labels = for_corr.columns.values.tolist()
y_axis_labels = for_corr.columns.values.tolist()

ax = sb.heatmap(corrvalues, 
                annot = p_w_star, 
                annot_kws = {"size": 1.25},
                fmt = 's',
                xticklabels = x_axis_labels, 
                yticklabels = y_axis_labels,
                cbar_kws = {'label':'Pearson correlation'},
                linecolor = 'black', 
                linewidth = 0.5,
                cmap = 'coolwarm')

plt.yticks(rotation=0, size = 5)
ax.xaxis.tick_top() # x axis on top
ax.xaxis.set_label_position('top')
ax.tick_params(length=0)
plt.xticks(rotation=45, size = 5)
plt.setp(ax.xaxis.get_majorticklabels(), ha='left')

ax.set_title(label = "Correlations option 1", fontsize = 20)
plt.tight_layout()

filename = "correlations_option1.png"
filename = os.path.join(output_images_dir, filename)
plt.savefig(filename,dpi=500)

### V.6.2. OPTION 2

In [None]:
# If the item is less than or equal to 0.001, it assigns 3 asterisks, 
# if less than or equal to 0.01, it assigns 2 asterisks, 
# if less than or equal to 0.05, it assigns 1 asterisk, 
# and if greater than 0.05, it assigns 0 asterisks. 

# p_to_star() in my_modules.py
p_as_stars = pvalues.copy()
p_as_stars = p_as_stars.apply(lambda row: p_to_star(row), axis = 1)
p_as_stars.columns = for_corr.columns.values
p_as_stars.head()

In [None]:
corr_w_star = corrvalues.round(2).astype(str) + p_as_stars
corr_w_star.head()

In [None]:
# Check if there is non-significant values
for index, row in corr_w_star.iterrows():
    for col in corr_w_star.columns:
        value = corr_w_star.loc[index, col]
        if "*" not in value:
            print(f"Value without asterisk found at index: {index}, column: {col}, value: {value}")

In [None]:
corrvalues.shape

In [None]:
corr_w_star.shape

In [None]:
# Correlation visualization
sb.set()

x_axis_labels = for_corr.columns.values.tolist()
y_axis_labels = for_corr.columns.values.tolist()

ax = sb.heatmap(corrvalues, 
                annot = corr_w_star, 
                annot_kws = {"size": 1.25},
                fmt = 's',
                xticklabels = x_axis_labels, 
                yticklabels=y_axis_labels,
                cbar_kws = {'label':'Pearson correlation'},
                linecolor = 'black', linewidth = 0.5,
                cmap = 'coolwarm')

plt.yticks(rotation=0, size = 5)
ax.xaxis.tick_top() # x axis on top
ax.xaxis.set_label_position('top')
ax.tick_params(length=0)
plt.xticks(rotation=45, size = 5)
plt.setp(ax.xaxis.get_majorticklabels(), ha='left')

ax.set_title(label = "Correlations option 2", fontsize = 20)
plt.tight_layout()

filename = "correlations_option2.png"
filename = os.path.join(output_images_dir, filename)
plt.savefig(filename,dpi=500)

In [None]:
filename = "zscore_pearson_correlations.csv"
filename = os.path.join(output_data_dir, filename)
corrvalues.to_csv(filename, index = True)

filename = "zscore_pearson_p-values.csv"
filename = os.path.join(output_data_dir, filename)
pvalues.to_csv(filename, index = True)

## V.7. PCA

In [None]:
def demo_PCA(data, c):
    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init="k-means++", n_clusters=c, n_init=4)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = 0.02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,
        aspect="auto",
        origin="lower",
    )

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )
    plt.title(
        "K-means clustering on the dataset (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
    
    return None


def other_PCA(data, c):
    reduced_data = PCA(n_components=2).fit_transform(data)
    ###kmeans = KMeans(init="k-means++", n_clusters=c, n_init=4)
    ###kmeans.fit(reduced_data)
    
    plt.scatter(reduced_data[:,0], reduced_data[:,1], cmap = 'plasma')
    plt.xlabel('1st Principal Component')
    plt.ylabel('2nd Principal Component')
    plt.title('Scatter Plot of the 2 Principal Component')
    plt.show()
    
    return None

In [None]:
df_test=df.loc[:,~df.columns.isin(not_intensities)]
demo_PCA(df_test, 100)

In [None]:
other_PCA(df_test, 100)

## V.8. KMEANS

In [None]:
# Define number of clusters - according to set & cell type

def kmeans_analysis(df, c):
    analyzed_df = df.loc[:,~df.columns.isin(not_intensities)]
    ###kM = KMeans(n_clusters=c).fit_predict(analyzed_df)
    kM = KMeans(n_clusters=c).fit(analyzed_df)
    ###centroids = kmeans.cluster_centers_ #list of clusters. Each cluster is a matrix of means for each parameter (protein) analyzed
    labels = kM.labels_
    clusterCount = np.bincount(labels) #holds information for how many points are in each cluster
    ###plt.scatter(normed_V1, normed_V2, s=10, c=labels, cmap='coolwarm') # instead of c=bn_class
    return None

def scale_data(df):
    pca = PCA(n_components=2) #We are doing a 2D data visualisation, so we need to select 2 principal components for PCA
    sc = StandardScaler()
    df_scaled = sc.fit_transform(df.loc[:,~df.columns.isin(not_intensities)])
    PCs = pca.fit_transform(df_scaled)
    return PCs, df_scaled

def plot_kmeans(df, nb_cl):
    pc, scaled_df = scale_data(df)
    ###kmeans = KMeans(n_clusters= nb_cl).fit(scaled_df)
    ###df_predict = kmeans.predict(scaled_df)
    test = KMeans(n_clusters= nb_cl).fit_predict(scaled_df)
    sb.scatterplot(x=pc[:,0],y=pc[:,1],hue=test)
    return None

def kmeans_plotting(df, c):
    analyzed_df = df.loc[:,~df.columns.isin(not_intensities)]
    #Initialize the class object
    kmeans = KMeans(n_clusters= c)
    #Predict the labels of clusters.
    l = kmeans.fit_predict(analyzed_df) #returns the array of cluster labels each data point belongs to
    plot_results(l)
    return l

def plot_results(label): #https://www.askpython.com/python/examples/plot-k-means-clusters-python
    #Getting unique labels
    u_labels = np.unique(label)
    #Plotting the results
    for i in u_labels:
        plt.scatter(df.iloc[label == i , 0] , df.iloc[label == i , 1] , label = i)
    plt.title("K-means clustering")
    plt.legend(loc='center left', title = "Clusters", bbox_to_anchor=(1, 0.5))
    plt.show()
    return None

In [None]:
# Define number of clusters - according to set & cell type

clusterList = kmeans_analysis(df, 20)
plot_kmeans(df, 20)

In [None]:
clusterLabels_array = kmeans_plotting(df, 20)

## pca + kmean

In [None]:
def standardization(df):
    df = df.loc[:,~df.columns.isin(not_intensities)]
    scaler = StandardScaler()
    data_std = scaler.fit_transform(df)
    return data_std

def determine_features_kept(x, y):
    real_componentValue = np.interp(0.8, y, x)
    rounded_componentValue = round(real_componentValue)
    print("Number of features/components kept : {}".format(rounded_componentValue))
    return rounded_componentValue

def explained_variance(pCA):
    y = pCA.explained_variance_ratio_
    max_range = len(y) + 1
    x = range(1,max_range)
    plt.figure(figsize = (10,8))
    plt.plot(x, y.cumsum(), marker = 'o', linestyle = '--')
    plt.title("Explained Variance by Components")
    plt.xlabel("Number of components")
    plt.ylabel("Cumulative Explained Variance")
    plt.show()
    nb_features = determine_features_kept(x, y.cumsum())
    return nb_features

def pca_analysis(nb_feat, df):
    pca = PCA(n_components = nb_feat)
    pca.fit(df)
    scores_pca = pca.transform(df)
    return scores_pca

def dimensionality_reduction(df): #using PCA
    standard_df = standardization(df)
    pca = PCA()
    pca.fit(standard_df)
    nb_components = explained_variance(pca)
    pca_scores = pca_analysis(nb_components, standard_df)
    return pca_scores

In [None]:
scoresPCA = dimensionality_reduction(df)

In [None]:
def determine_slope(a, b):
    return abs((b[1] - a[1]) / (b[0] - a[0]))

def record_slopes(X, Y):
    slope_list = []
    if len(X) == len(Y):
        for i in range(len(X)-1):
            A = (X[i], Y[i])
            B = (X[i+1], Y[i+1])
            slope = determine_slope(A, B)
            slope_list.append(slope)
    else:
        print("Error! length of X and Y coordinates do not match!")
        slope_list.append(0)
    return slope_list

def elbow_coordinates(diff_ls, x_coord):
    max_diff = max(diff_ls) #the slopes with the highest difference correspond to the "sharpest" elbow
    index_max = diff_ls.index(max_diff)   
    s_coord = x_coord[index_max+1] #we determine the X position of the elbow
    return s_coord

def highest_slope_diff(ls_slopes, coord_x):
    ls_diffs = [] #list of slope differences
    for i in range(len(ls_slopes)-1):
        diff = abs(ls_slopes[i] - ls_slopes[i+1]) #compare differences between each slope
        ls_diffs.append(diff)
    elbow = elbow_coordinates(ls_diffs, coord_x) #returns the X coordinates where the elbow is
    return elbow

In [None]:
liste = [46405799.34079681, 38240577.23298195, 35026058.95518527, 32185738.815936424, 30500185.571518555, 28678764.340087377, 27418667.363556623, 26234063.97845333, 25405258.778627113, 24454635.699228957]
x = list(range(1, 11))

highest_slope_diff(liste, x)

## V.9. T-SNE

In [None]:
# Implémente notre dataset
## A partir df_tsne, ajoute colonne(s) d'intérêt
cols = list(df.columns)
df['Patient']= df['Patient']
df['cell_type']= df['cell_type']
print('Size of the dataframe: {}'.format(df.shape))

## Selectionner nb random de valeurs (temp, pr performance)
np.random.seed(50)
rndperm = np.random.permutation(df.shape[0])

In [None]:
#Remove index values (unneeded & handicapping for T-SNE analysis)
df = df.reset_index(drop=True)
df.head()

In [None]:
#Keep only the numbers in the patient ID column
df['Patient'] = df['Patient'].map(lambda x: x.lstrip('Pt'))
df.head()

In [None]:
#t-sne
N = 20000

df_subset = df.loc[rndperm[:N],:].copy() #remove later for full subset? (since it is already a subset)

data_subset = df_subset[cols].values

pca = PCA(n_components=3)
data_subset

In [None]:

pca_result = pca.fit_transform(data_subset)

df_subset['pca-one'] = pca_result[:,0]
df_subset['pca-two'] = pca_result[:,1] 
df_subset['pca-three'] = pca_result[:,2]

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

## V.10. SAVE

In [None]:
# Save the data by Sample_ID
# Check for the existence of the output file first
for sample in ls_samples:
    sample_id = sample.split('_')[0]
    filename = os.path.join(output_data_dir,  sample_id + "_" + step_suffix + ".csv")
    if os.path.exists(filename):
        print("File by name "+filename+" already exists.")
    else:
        sample_id_csv = sample_id + '.csv'
        df_save = df.loc[df['Sample_ID'] == sample_id_csv, :]
        #print(df_save)
        filename = os.path.join(output_data_dir,  sample_id + "_" + step_suffix + ".csv")
        df_save.to_csv(filename, index=True, index_label='ID')  # Set index parameter to True to retain the index column
        print("File " + filename + " was created!")

In [None]:
# Save the dataset as a single file for TSNE/UMAP notebook
filename = "all_Samples_" + project_name + ".csv"
filename = os.path.join(output_data_dir, filename)

if os.path.exists(filename):
    print("File by name "+filename+" already exists.")
else :
    df.to_csv(filename, index = False)
    print("File " + filename + " was created!")