# PROBE / src /bin /semantic_similarity_infer.py
# mgyigit's picture
# Update src/bin/semantic_similarity_infer.py
# 41f6a20 verified
# raw
# history blame
# 4.65 kB
#!/usr/bin/env python
# coding: utf-8
import os
import pandas as pd
import numpy as np
import itertools
import multiprocessing
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr
from tqdm import tqdm
# Shared state for the worker pool. The two lists are created through a
# multiprocessing Manager so that parallelSimilarity, running in pool workers,
# and calculateCorrelationforOntology, running in the parent, use the same data.
manager = multiprocessing.Manager()
# (reference similarity, embedding-derived similarity) tuples appended by
# parallelSimilarity; cleared at the start of calculateCorrelationforOntology.
similarity_list = manager.list()
# Column labels of the currently loaded similarity matrix, indexed by position.
proteinListNew = manager.list()
# The names below are placeholders. Nothing in this file assigns them real
# values, so the caller presumably overwrites them on the module before
# calling calculate_all_correlations -- TODO confirm against the caller.
# Queried with "Entry == ..." and read through its 'Vector' column.
representation_dataframe = ""
# Collection used for membership tests on protein identifiers.
protein_names = ""
# Not read anywhere in the visible part of this file.
representation_name = ""
# Iterable of similarity-matrix types; each is used as a key into the matrix
# file table ("All", "500", "Sparse", "200").
similarity_tasks = ""
# Not read anywhere in the visible part of this file.
detailed_output = False
def parallelSimilarity(paramList):
    """Score one protein pair and append the result to the shared list.

    Used as the worker function for ``Pool.imap_unordered``. The score is the
    normalised Manhattan similarity of the two protein vectors::

        1 - L1(v1, v2) / (||v1||_1 + ||v2||_1)

    Two all-zero vectors are treated as identical (similarity ``1.0``).

    Args:
        paramList: Sequence ``(i, j, real)``. ``i`` and ``j`` index into the
            module-level ``proteinListNew``; ``real`` is the reference
            similarity for that pair and is passed through unchanged.

    Returns:
        The module-level ``similarity_list``. A ``(real, manhattanSim)`` tuple
        is appended to it only when ``j > i`` and both proteins are found in
        the module-level ``protein_names``; otherwise it is returned untouched.
    """
    i = paramList[0]
    j = paramList[1]
    if j > i:
        # These two locals must keep their names: DataFrame.query resolves
        # "@protein1" / "@protein2" from this function's local variables.
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in protein_names and protein2 in protein_names:
            prot1vec = np.asarray(
                representation_dataframe.query("Entry == @protein1")['Vector'].item()
            )
            prot2vec = np.asarray(
                representation_dataframe.query("Entry == @protein2")['Vector'].item()
            )
            # L1 norms are non-negative, so the sum is zero exactly when both
            # vectors are all-zero. Checking before dividing avoids the 0/0
            # RuntimeWarning and the throw-away NaN.
            l1_norm_sum = norm(prot1vec, 1) + norm(prot2vec, 1)
            if l1_norm_sum == 0:
                manhattanSim = 1.0
            else:
                manhattanDist = cdist(
                    prot1vec.reshape(1, -1), prot2vec.reshape(1, -1), 'cityblock'
                )
                manhattanSim = 1 - (manhattanDist / l1_norm_sum).item()
            real = paramList[2]
            similarity_list.append((real, manhattanSim))
    return similarity_list
def calculateCorrelationforOntology(aspect, matrix_type):
    """Correlate embedding-derived similarities with reference similarities.

    Loads the reference protein similarity matrix for ``aspect``, scores the
    selected protein pairs in a worker pool through ``parallelSimilarity``,
    and computes the Spearman rank correlation between the reference values
    and the normalised Manhattan similarities of the protein vectors.

    Args:
        aspect: Ontology aspect inserted into the data file names
            (``calculate_all_correlations`` passes "MF", "BP" and "CC").
        matrix_type: One of "All", "500", "Sparse" or "200". "Sparse" reads
            the same 500-protein matrix as "500" but scores only the index
            pairs stored in the ``SparsifiedSimilarityCoordinates`` file.

    Returns:
        dict with keys ``"correlation"`` and ``"p_value"``. Both are NaN when
        fewer than two pairs could be scored.

    Raises:
        KeyError: ``matrix_type`` is not one of the supported values.
        ValueError: the similarity matrix file is not square.
    """
    # Each run starts from empty shared lists.
    similarity_list[:] = []
    proteinListNew[:] = []

    # NOTE(review): the visible part of this file never defines ``script_dir``.
    # Use it when it exists; otherwise fall back to this file's directory,
    # which is presumably what it was meant to hold -- confirm.
    base_dir = globals().get("script_dir") or os.path.dirname(os.path.abspath(__file__))

    matrix_prefix = os.path.join(
        base_dir, "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix"
    )
    top500_matrix = matrix_prefix + "_for_highest_annotated_500_proteins.csv"
    similarityMatrixNameDict = {
        "All": matrix_prefix + ".csv",
        "500": top500_matrix,
        "Sparse": top500_matrix,
        "200": matrix_prefix + "_for_highest_annotated_200_proteins.csv",
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    proteinList = list(human_proteinSimilarityMatrix.columns)
    reference_values = human_proteinSimilarityMatrix.to_numpy()
    if reference_values.shape[0] != len(proteinList):
        raise ValueError(
            "Similarity matrix must be square: got %d rows for %d proteins"
            % (reference_values.shape[0], len(proteinList))
        )

    # Publish the labels to the workers in one call instead of one append per
    # protein (each proxy call is a round trip to the manager process).
    proteinListNew.extend(proteinList)

    if matrix_type == "Sparse":
        sparsified_path = os.path.join(
            base_dir,
            "../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy",
        )
        # All stored coordinates are forwarded; parallelSimilarity itself
        # ignores any pair with j <= i.
        index_pairs = np.load(sparsified_path)
        pair_count = len(index_pairs)
    else:
        # Upper triangle only (j > i), in the same order as filtering the
        # full product, but without materialising all n*n index pairs.
        protein_count = len(proteinList)
        index_pairs = itertools.combinations(range(protein_count), 2)
        pair_count = protein_count * (protein_count - 1) // 2

    # Row i and column i carry the same label, so positional indexing yields
    # the same reference value as a label lookup on (protein_i, protein_j).
    protParamListNew = [
        (pair[0], pair[1], reference_values[pair[0], pair[1]])
        for pair in tqdm(index_pairs, total=pair_count)
    ]

    # The context manager terminates the pool if a worker raises; on the
    # normal path the pool is closed and joined first.
    with multiprocessing.Pool() as pool:
        progress = tqdm(
            pool.imap_unordered(parallelSimilarity, protParamListNew),
            total=len(protParamListNew),
            position=0,
            leave=True,
        )
        for _ in progress:
            pass
        pool.close()
        pool.join()

    # One slice copies the shared list into this process; iterating the proxy
    # would cost one round trip per element.
    scored_pairs = similarity_list[:]
    if len(scored_pairs) < 2:
        # A rank correlation is undefined for fewer than two observations.
        return {"correlation": float("nan"), "p_value": float("nan")}

    real_distance_list = [value[0] for value in scored_pairs]
    manhattan_distance_list = [value[1] for value in scored_pairs]
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    return {
        "correlation": manhattanCorr[0], "p_value": manhattanCorr[1]
    }
def calculate_all_correlations():
    """Run the correlation benchmark for every configured matrix type.

    Iterates over the module-level ``similarity_tasks`` and, for each entry,
    evaluates the "MF", "BP" and "CC" aspects in that order with
    ``calculateCorrelationforOntology``.

    Returns:
        dict mapping each matrix type to a dict that maps the aspect name to
        the value returned by ``calculateCorrelationforOntology``.
    """
    aspects = ("MF", "BP", "CC")
    return {
        matrix_kind: {
            aspect: calculateCorrelationforOntology(aspect, matrix_kind)
            for aspect in aspects
        }
        for matrix_kind in similarity_tasks
    }