#!/usr/bin/env python
# coding: utf-8
"""Correlate embedding-based protein similarities with reference similarities.

For every selected protein pair, a normalised Manhattan similarity is computed
from the vectors in ``representation_dataframe`` and compared (Spearman rank
correlation) against the value stored in a precomputed similarity matrix CSV.

The module-level settings below (``representation_dataframe``,
``protein_names``, ``similarity_tasks``, ...) are placeholders that the caller
is expected to assign before invoking ``calculate_all_correlations``.
"""

import os
import pandas as pd
import numpy as np
import itertools
import multiprocessing
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws progress bars; fall back to a transparent pass-through
    # so the computation itself does not depend on it.
    def tqdm(iterable=None, *args, **kwargs):
        return iterable

# Directory containing this file; the data paths below are resolved relative
# to it. May be overridden by assigning to ``script_dir`` before use.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Shared, process-safe containers used by the pool workers.
# NOTE(review): creating the manager at import time, and having workers read
# the module globals below, presumably relies on the "fork" start method --
# confirm before running on platforms that default to "spawn"/"forkserver".
manager = multiprocessing.Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

# Run-time configuration, assigned by the caller.
representation_dataframe = ""
protein_names = ""
representation_name = ""
similarity_tasks = ""
detailed_output = False


def _lookup_vector(embedding_dataframe, protein):
    """Return the 'Vector' entry of the single row whose 'Entry' is *protein*.

    Raises ValueError (from ``Series.item``) unless exactly one row matches.
    """
    matches = embedding_dataframe.loc[
        embedding_dataframe["Entry"] == protein, "Vector"
    ]
    return np.asarray(matches.item())


def _manhattan_similarity(vec1, vec2):
    """Return 1 - cityblock(vec1, vec2) / (||vec1||_1 + ||vec2||_1).

    Two all-zero vectors are defined to have similarity 1.0; checking this
    first avoids evaluating 0 / 0.
    """
    norm_sum = norm(vec1, 1) + norm(vec2, 1)
    if norm_sum == 0:
        return 1.0
    distance = cdist(vec1.reshape(1, -1), vec2.reshape(1, -1), 'cityblock')
    return 1 - (distance / norm_sum).item()


def parallelSimilarity(paramList):
    """Pool worker: score one protein pair and record it in ``similarity_list``.

    Args:
        paramList: sequence ``(i, j, real)`` where ``i`` and ``j`` index into
            ``proteinListNew`` and ``real`` is the reference similarity.

    Returns:
        The shared ``similarity_list``. A ``(real, manhattan_similarity)``
        tuple is appended to it only when ``j > i`` and both proteins are
        contained in ``protein_names``.
    """
    i = paramList[0]
    j = paramList[1]
    if j <= i:
        return similarity_list

    protein1 = proteinListNew[i]
    protein2 = proteinListNew[j]
    if protein1 not in protein_names or protein2 not in protein_names:
        return similarity_list

    prot1vec = _lookup_vector(representation_dataframe, protein1)
    prot2vec = _lookup_vector(representation_dataframe, protein2)
    manhattanSim = _manhattan_similarity(prot1vec, prot2vec)

    real = paramList[2]
    similarity_list.append((real, manhattanSim))
    return similarity_list


def calculateCorrelationforOntology(aspect, matrix_type):
    """Compute the Spearman correlation for one aspect / matrix combination.

    Args:
        aspect: string inserted into the data file names (the caller in this
            module passes "MF", "BP" and "CC").
        matrix_type: one of "All", "500", "Sparse" or "200"; any other value
            raises KeyError.

    Returns:
        dict with keys "correlation" and "p_value". Both are NaN when fewer
        than two protein pairs could be scored.
    """
    similarity_list[:] = []
    proteinListNew[:] = []

    similarityMatrixNameDict = {
        "All": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix.csv"),
        "500": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        # "Sparse" reads the same matrix as "500" but only scores the pairs
        # listed in the sparsified coordinates file loaded below.
        "Sparse": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"),
        "200": os.path.join(script_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"),
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    # Rows carry no labels in the file; label them with the column names so
    # the matrix can be addressed as [protein, protein].
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace=True)

    proteinList = list(human_proteinSimilarityMatrix.columns)
    # One call to the manager instead of one round trip per protein.
    proteinListNew.extend(proteinList)

    if matrix_type == "Sparse":
        sparsified_path = os.path.join(script_dir, "../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy")
        pair_source = np.load(sparsified_path)
        pair_total = len(pair_source)
    else:
        # combinations() yields exactly the j > i pairs, in the same order a
        # filtered full product would, without materialising N*N tuples.
        protein_count = len(proteinList)
        pair_source = itertools.combinations(range(protein_count), 2)
        pair_total = protein_count * (protein_count - 1) // 2

    protParamListNew = []
    for tup in tqdm(pair_source, total=pair_total):
        i = tup[0]
        j = tup[1]
        real = human_proteinSimilarityMatrix.loc[proteinList[i], proteinList[j]]
        protParamListNew.append((i, j, real))

    # The context manager terminates the workers even if a task raises.
    with multiprocessing.Pool() as pool:
        for _ in tqdm(pool.imap_unordered(parallelSimilarity, protParamListNew),
                      total=len(protParamListNew), position=0, leave=True):
            pass
        pool.close()
        pool.join()

    # Slicing the proxy copies the whole shared list in a single request.
    scored_pairs = similarity_list[:]
    if len(scored_pairs) < 2:
        # A rank correlation is undefined for fewer than two observations.
        return {
            "correlation": float("nan"),
            "p_value": float("nan")
        }

    real_distance_list = [value[0] for value in scored_pairs]
    manhattan_distance_list = [value[1] for value in scored_pairs]
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)

    return {
        "correlation": manhattanCorr[0],
        "p_value": manhattanCorr[1]
    }


def calculate_all_correlations():
    """Run every matrix type in ``similarity_tasks`` for MF, BP and CC.

    Returns:
        Nested dict ``{matrix_type: {aspect: {"correlation", "p_value"}}}``.
    """
    return {
        similarity_matrix_type: {
            aspect: calculateCorrelationforOntology(aspect, similarity_matrix_type)
            for aspect in ["MF", "BP", "CC"]
        }
        for similarity_matrix_type in similarity_tasks
    }