Spaces:
Running
Running
| import tqdm | |
| import multiprocessing | |
| import pandas as pd | |
| import numpy as np | |
| import scipy.stats | |
| import os | |
| import sys | |
# Absolute directory that contains this file; used further down to build the
# path of the affinity data file.
script_dir = os.path.dirname(os.path.abspath(__file__))
# Extend the module search path with the relative entries '..' and '.'.
# NOTE(review): these entries are relative, so they presumably follow the
# process's working directory rather than script_dir -- confirm this is intended.
sys.path.append('..')
sys.path.append('.')
| from sklearn import linear_model | |
| from sklearn.model_selection import KFold | |
| from sklearn.metrics import mean_squared_error, mean_absolute_error | |
| from sklearn.preprocessing import MinMaxScaler | |
# Path of the CSV file with the representation vectors. It is passed to
# load_representation() by predict_affinities_and_report_results(); it starts
# as None, so presumably a caller assigns it before running -- TODO confirm.
skempi_vectors_path = None
# Not read anywhere in the visible part of this file; presumably a label for
# the representation being evaluated -- TODO confirm against the caller.
representation_name = None
def load_representation(multi_col_representation_vector_file_path):
    """Load a representation CSV into a two-column (PDB_ID, Vector) frame.

    The CSV is read with ``pandas.read_csv``. Every column after the first one
    is treated as a vector component and converted to ``float``; the
    identifier is taken from the ``PDB_ID`` column.

    Parameters
    ----------
    multi_col_representation_vector_file_path
        Path (or anything ``pandas.read_csv`` accepts) of the CSV file.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row with columns ``PDB_ID`` and ``Vector``, where
        ``Vector`` holds a plain Python list of floats.

    Raises
    ------
    KeyError
        If the CSV has no ``PDB_ID`` column.
    ValueError
        If a vector component cannot be converted to ``float``.
    """
    print("\nLoading representation vectors...\n")
    multi_col_representation_vector = pd.read_csv(multi_col_representation_vector_file_path)

    # Convert all component columns in one step instead of growing the result
    # frame row by row, which re-allocates the frame on every insertion.
    components = multi_col_representation_vector.iloc[:, 1:].astype(float)
    vectors = components.values.tolist()
    pdb_ids = multi_col_representation_vector['PDB_ID'].tolist()

    return pd.DataFrame({'PDB_ID': pdb_ids, 'Vector': vectors})
def calc_train_error(X_train, y_train, model):
    """Score an already fitted model on its own training data.

    Returns a 3-tuple ``(mse, mae, corr)``: the mean squared error, the mean
    absolute error, and the result of ``scipy.stats.pearsonr`` between the
    true targets and the model's predictions.
    """
    fitted_values = model.predict(X_train)
    return (
        mean_squared_error(y_train, fitted_values),
        mean_absolute_error(y_train, fitted_values),
        scipy.stats.pearsonr(y_train, fitted_values),
    )
def calc_validation_error(X_test, y_test, model):
    """Score an already fitted model on held-out data.

    Returns a 3-tuple ``(mse, mae, corr)``: the mean squared error, the mean
    absolute error, and the result of ``scipy.stats.pearsonr`` between the
    held-out targets and the model's predictions.
    """
    held_out_predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, held_out_predictions)
    absolute_error = mean_absolute_error(y_test, held_out_predictions)
    pearson = scipy.stats.pearsonr(y_test, held_out_predictions)
    return squared_error, absolute_error, pearson
def calc_metrics(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training split and score it on the test split.

    Only out-of-sample metrics are produced: the return value is the
    ``(mse, mae, corr)`` tuple from ``calc_validation_error``. The model is
    fitted in place, so the caller's estimator object is modified.
    """
    model.fit(X_train, y_train)
    return calc_validation_error(X_test, y_test, model)
def report_results(
    validation_mse_error_list,
    validation_mae_error_list,
    validation_corr_list,
    validation_corr_pval_list,
):
    """Aggregate per-fold validation metrics into summary and detail dicts.

    Returns ``(result_summary, result_detail)``.

    * ``result_summary`` holds the mean and standard deviation of the MSE and
      MAE lists (both multiplied by 100) and the mean correlation and mean
      p-value (not multiplied); every value is rounded to 4 decimals.
    * ``result_detail`` holds each of the four input lists multiplied by 100,
      element-wise, as a plain list.
    """
    def as_percent(statistic, values):
        # Same arithmetic for every error statistic: reduce, scale, round.
        return round(statistic(values) * 100, 4)

    result_summary = {
        "val_mse_error": as_percent(np.mean, validation_mse_error_list),
        "val_mse_std": as_percent(np.std, validation_mse_error_list),
        "val_mae_error": as_percent(np.mean, validation_mae_error_list),
        "val_mae_std": as_percent(np.std, validation_mae_error_list),
        "validation_corr": round(np.mean(validation_corr_list), 4),
        "validation_corr_pval": round(np.mean(validation_corr_pval_list), 4),
    }

    per_fold_series = (
        ("val_mse_errors", validation_mse_error_list),
        ("val_mae_errors", validation_mae_error_list),
        ("validation_corrs", validation_corr_list),
        ("validation_corr_pvals", validation_corr_pval_list),
    )
    result_detail = {
        key: list(np.multiply(values, 100)) for key, values in per_fold_series
    }
    return result_summary, result_detail
def predictAffinityWithModel(regressor_model, multiplied_vectors_df):
    """Run 10-fold cross-validation of ``regressor_model`` on pair vectors.

    Features are the stacked ``Vector`` entries of ``multiplied_vectors_df``.
    Targets are the ``Affinity`` values of the module-level
    ``ppi_affinity_df`` rows whose ``Protein1`` and ``Protein2`` both occur in
    the corresponding columns of ``multiplied_vectors_df``; they are min-max
    scaled over all selected rows before the data is split.

    The same estimator object is re-fitted on every fold.

    Returns the ``(result_summary, result_detail)`` pair produced by
    ``report_results`` from the per-fold validation metrics.

    NOTE(review): targets are selected with two independent ``isin`` filters
    and paired with feature rows by position; this assumes the selected rows
    line up one-to-one with ``multiplied_vectors_df`` -- confirm.
    """
    n_folds = 10
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # Feature matrix: one row per protein pair.
    stacked_vectors = np.asarray(multiplied_vectors_df["Vector"].tolist())
    features = np.array(stacked_vectors, dtype=float)

    # Target vector: affinities of the pairs present in the feature frame.
    has_protein1 = ppi_affinity_df['Protein1'].isin(multiplied_vectors_df['Protein1'])
    has_protein2 = ppi_affinity_df['Protein2'].isin(multiplied_vectors_df['Protein2'])
    matching_rows = ppi_affinity_df[has_protein1 & has_protein2]
    affinity_column = np.array(matching_rows["Affinity"]).reshape(-1, 1)

    scaler = MinMaxScaler()
    scaler.fit(affinity_column)
    labels = scaler.transform(affinity_column)[:, 0]

    fold_mse, fold_mae, fold_corr, fold_corr_pval = [], [], [], []
    for train_rows, val_rows in tqdm.tqdm(splitter.split(features, labels), total=n_folds):
        mse, mae, corr = calc_metrics(
            features[train_rows],
            labels[train_rows],
            features[val_rows],
            labels[val_rows],
            regressor_model,
        )
        fold_mse.append(mse)
        fold_mae.append(mae)
        # corr is indexed as (coefficient, p-value).
        fold_corr.append(corr[0])
        fold_corr_pval.append(corr[1])

    return report_results(fold_mse, fold_mae, fold_corr, fold_corr_pval)
# Affinity table, loaded once when the module is imported. The path is given
# relative to this file's directory (script_dir).
ppi_affinity_file_path = "../data/auxilary_input/skempi_pipr/SKEMPI_all_dg_avg.txt"
ppi_affinity_file = os.path.join(script_dir, ppi_affinity_file_path)
# The file is read as tab-separated text with no header row.
ppi_affinity_df = pd.read_csv(ppi_affinity_file, sep="\t", header=None)
# Name the columns; predictAffinityWithModel and
# calculate_vector_multiplications read this frame through these names.
ppi_affinity_df.columns = ['Protein1', 'Protein2', 'Affinity']
def calculate_vector_multiplications(skempi_vectors_df, affinity_df=None):
    """Build element-wise product vectors for every known protein pair.

    For each row of the affinity table whose ``Protein1`` and ``Protein2``
    both appear in ``skempi_vectors_df['PDB_ID']``, the two proteins' vectors
    are multiplied element-wise with ``numpy.multiply``.

    Parameters
    ----------
    skempi_vectors_df
        Frame with ``PDB_ID`` and ``Vector`` columns. If a ``PDB_ID`` occurs
        more than once, the first occurrence is used.
    affinity_df
        Frame with ``Protein1`` and ``Protein2`` columns listing the pairs.
        Defaults to the module-level ``ppi_affinity_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Protein1``, ``Protein2`` and ``Vector`` (a NumPy array per
        row), in the order of the affinity table, with a fresh 0..n-1 index.
    """
    if affinity_df is None:
        affinity_df = ppi_affinity_df

    print("Element-wise vector multiplications are being calculated")

    # One dict lookup per protein instead of a list scan plus a boolean-mask
    # filter per row. setdefault keeps the first vector seen for an ID.
    vector_by_id = {}
    for pdb_id, vector in zip(skempi_vectors_df['PDB_ID'], skempi_vectors_df['Vector']):
        vector_by_id.setdefault(pdb_id, vector)

    # Collect rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {
            'Protein1': protein1,
            'Protein2': protein2,
            'Vector': np.multiply(vector_by_id[protein1], vector_by_id[protein2]),
        }
        for protein1, protein2 in zip(affinity_df['Protein1'], affinity_df['Protein2'])
        if protein1 in vector_by_id and protein2 in vector_by_id
    ]
    return pd.DataFrame(records, columns=['Protein1', 'Protein2', 'Vector'])
def predict_affinities_and_report_results():
    """Run the full affinity benchmark and return its results as a dict.

    Loads the representation vectors from the module-level
    ``skempi_vectors_path``, builds the pair vectors, cross-validates a
    ``BayesianRidge`` regressor on them, and returns
    ``{'summary': ..., 'detail': ...}`` instead of writing to a file.
    """
    representation_df = load_representation(skempi_vectors_path)
    pair_vectors_df = calculate_vector_multiplications(representation_df)
    summary, detail = predictAffinityWithModel(
        linear_model.BayesianRidge(), pair_vectors_df
    )
    return {'summary': summary, 'detail': detail}