Spaces:
Running
Running
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| import os | |
| script_dir = os.path.dirname(os.path.abspath(__file__)) | |
| import pandas as pd | |
| import numpy as np | |
| import itertools | |
| import multiprocessing | |
| from scipy.spatial.distance import cdist | |
| from numpy.linalg import norm | |
| from scipy.stats import spearmanr | |
| from tqdm import tqdm | |
# Shared state for the worker pool. Both lists live in a Manager server
# process so that workers started by multiprocessing.Pool can append to and
# read from the same underlying list objects.
manager = multiprocessing.Manager()
# similarity_list collects (real, manhattanSim) tuples appended by
# parallelSimilarity; proteinListNew holds the similarity-matrix column names.
similarity_list, proteinListNew = manager.list(), manager.list()

# Module-level settings, initialised to empty placeholders.
# NOTE(review): presumably rebound by the calling code before any function
# here runs -- representation_dataframe and protein_names are read by
# parallelSimilarity, similarity_tasks by calculate_all_correlations;
# representation_name and detailed_output are not read in the visible code.
representation_dataframe = protein_names = representation_name = similarity_tasks = ""
detailed_output = False
def _normalized_manhattan_similarity(vec_a, vec_b):
    """Return 1 - cityblock(a, b) / (||a||_1 + ||b||_1) as a Python float.

    When both vectors have zero L1 norm the similarity is defined as 1.0.
    That case is decided *before* dividing, so no 0/0 is evaluated (the
    previous code divided first, producing a RuntimeWarning and a NaN that
    was then overwritten).
    """
    # cdist is evaluated first so mismatched vector lengths still raise.
    distance = cdist(vec_a.reshape(1, -1), vec_b.reshape(1, -1), 'cityblock')
    norm_a = norm(vec_a, 1)
    norm_b = norm(vec_b, 1)
    if norm_a == 0 and norm_b == 0:
        return 1.0
    return 1 - (distance / (norm_a + norm_b)).item()


def parallelSimilarity(paramList):
    """Score one protein pair and append the result to ``similarity_list``.

    Parameters
    ----------
    paramList : sequence
        ``(i, j, real)`` where ``i`` and ``j`` index ``proteinListNew`` and
        ``real`` is the reference similarity for that pair. ``real`` is only
        read when the pair is actually scored.

    Returns
    -------
    The shared ``similarity_list`` proxy. A ``(real, manhattanSim)`` tuple is
    appended to it only when ``j > i`` and both proteins are contained in
    ``protein_names``; otherwise the list is returned unchanged.

    Notes
    -----
    Reads the module globals ``representation_dataframe`` (queried by its
    ``Entry`` column, vectors taken from its ``Vector`` column),
    ``protein_names``, ``proteinListNew`` and ``similarity_list``.
    ``.item()`` raises ``ValueError`` unless exactly one row matches an entry.
    """
    protein_embedding_dataframe = representation_dataframe
    i = paramList[0]
    j = paramList[1]

    # Only pairs above the diagonal are scored.
    if not j > i:
        return similarity_list

    # The names protein1/protein2 are referenced by the query strings below
    # via "@", so they must stay locals of this function.
    protein1 = proteinListNew[i]
    protein2 = proteinListNew[j]
    if not (protein1 in protein_names and protein2 in protein_names):
        return similarity_list

    prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
    prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())

    manhattanSim = _normalized_manhattan_similarity(prot1vec, prot2vec)

    # Store the reference and computed values together so they stay paired
    # regardless of the order in which workers finish.
    similarity_list.append((paramList[2], manhattanSim))
    return similarity_list
def calculateCorrelationforOntology(aspect, matrix_type):
    """Correlate reference similarities with embedding-based similarities.

    Loads the reference protein similarity matrix for ``aspect``, builds the
    list of ``(i, j, real)`` pairs to score, scores them in a process pool via
    ``parallelSimilarity`` and returns the Spearman correlation between the
    reference values and the computed Manhattan similarities.

    Parameters
    ----------
    aspect : str
        Inserted into the data file names (the caller in this file passes
        "MF", "BP" or "CC").
    matrix_type : str
        One of "All", "500", "Sparse" or "200"; any other value raises
        ``KeyError``. "Sparse" reads the same matrix file as "500" but takes
        its pair coordinates from a pre-computed ``.npy`` file instead of
        enumerating every pair above the diagonal.

    Returns
    -------
    dict
        ``{"correlation": ..., "p_value": ...}`` taken from ``spearmanr``.

    Side effects
    ------------
    Clears and refills the shared ``similarity_list`` and ``proteinListNew``.
    """
    similarity_list[:] = []
    proteinListNew[:] = []

    matrix_prefix = os.path.join(
        script_dir,
        "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix",
    )
    top_500_suffix = "_for_highest_annotated_500_proteins.csv"
    similarityMatrixNameDict = {
        "All": matrix_prefix + ".csv",
        "500": matrix_prefix + top_500_suffix,
        "Sparse": matrix_prefix + top_500_suffix,
        "200": matrix_prefix + "_for_highest_annotated_200_proteins.csv",
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace=True)

    # Keep a plain local copy of the names for the pair-building loop and
    # publish them to the workers with a single proxy call instead of one
    # inter-process round trip per protein.
    protein_ids = list(human_proteinSimilarityMatrix.columns)
    proteinListNew.extend(protein_ids)

    if matrix_type == "Sparse":
        sparsified_path = os.path.join(script_dir, "../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy")
        candidate_pairs = np.load(sparsified_path)
        pair_total = len(candidate_pairs)
    else:
        # combinations() yields exactly the j > i pairs, in the same order as
        # filtering the full index product, without building the n*n list.
        protein_count = len(protein_ids)
        candidate_pairs = itertools.combinations(range(protein_count), 2)
        pair_total = protein_count * (protein_count - 1) // 2

    protParamListNew = []
    for pair in tqdm(candidate_pairs, total=pair_total):
        i = pair[0]
        j = pair[1]
        real = human_proteinSimilarityMatrix.loc[protein_ids[i], protein_ids[j]]
        protParamListNew.append((i, j, real))

    pool = multiprocessing.Pool()
    try:
        # Each worker appends to the shared similarity_list; the per-task
        # return value is not needed here, the loop only drives the progress
        # bar and surfaces worker exceptions.
        for _ in tqdm(pool.imap_unordered(parallelSimilarity, protParamListNew), total=len(protParamListNew), position=0, leave=True):
            pass
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

    # One slice request copies the whole shared list into this process.
    scored_pairs = similarity_list[:]
    real_distance_list = [value[0] for value in scored_pairs]
    manhattan_distance_list = [value[1] for value in scored_pairs]

    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    return {
        "correlation": manhattanCorr[0], "p_value": manhattanCorr[1]
    }
def _correlations_per_aspect(matrix_type):
    """Run the correlation for each ontology aspect of one matrix type.

    Prints every per-aspect result as it is produced and returns them in a
    dict keyed by aspect ("MF", "BP", "CC").
    """
    per_aspect = {}
    for ontology in ("MF", "BP", "CC"):
        outcome = calculateCorrelationforOntology(ontology, matrix_type)
        print(outcome)
        per_aspect[ontology] = outcome
    return per_aspect


def calculate_all_correlations():
    """Compute correlations for every entry of ``similarity_tasks``.

    Returns a nested dict: matrix type -> aspect -> the result dict returned
    by ``calculateCorrelationforOntology``.
    """
    return {
        task: _correlations_per_aspect(task)
        for task in similarity_tasks
    }