import os

import pandas as pd

from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.visualizing import set_font
from fuson_plm.utils.splitting import split_clusters, check_split_validity
from fuson_plm.benchmarking.idr_prediction.config import SPLIT


def get_training_dfs(train, val, test, idr_db):
    """
    Remove unnecessary columns for efficient storage of the train, validation, and test
    sets used for benchmarking, and add the target values from idr_db.
    """
    log_update('\nMaking dataframes for IDR prediction benchmark...')

    # Delete cluster-related columns we don't need
    drop_cols = ['representative seq_id', 'member seq_id', 'member length', 'representative seq']
    train = train.drop(columns=drop_cols).rename(columns={'member seq': 'Sequence'})
    val = val.drop(columns=drop_cols).rename(columns={'member seq': 'Sequence'})
    test = test.drop(columns=drop_cols).rename(columns={'member seq': 'Sequence'})

    # Add the target values and make one set of dataframes per property.
    # idr_db values are in columns: asph, scaled_re, scaled_rg, scaling_exp
    value_cols = ['asph', 'scaled_re', 'scaled_rg', 'scaling_exp']
    return_dict = {}
    for col in value_cols:
        temp_train = pd.merge(train, idr_db[['Sequence', col]], on='Sequence', how='left').rename(columns={col: 'Value'}).dropna(subset=['Value'])
        temp_val = pd.merge(val, idr_db[['Sequence', col]], on='Sequence', how='left').rename(columns={col: 'Value'}).dropna(subset=['Value'])
        temp_test = pd.merge(test, idr_db[['Sequence', col]], on='Sequence', how='left').rename(columns={col: 'Value'}).dropna(subset=['Value'])
        return_dict[col] = {
            'train': temp_train,
            'val': temp_val,
            'test': temp_test
        }
    return return_dict
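
# Illustrative usage (hypothetical variable names), assuming idr_db has one row per
# sequence with columns ['Sequence', 'asph', 'scaled_re', 'scaled_rg', 'scaling_exp']:
#
#   dfs = get_training_dfs(train_clusters, val_clusters, test_clusters, idr_db)
#   asph_train = dfs['asph']['train']   # DataFrame with columns ['Sequence', 'Value']
#   asph_train['Value'].describe()      # summary of the asphericity targets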


def main():
    """
    Split the clustered IDR database into train, validation, and test sets, and save
    one (train, val, test) trio of dataframes per IDR property for benchmarking.
    """
    # Read all the splitting configurations
    LOG_PATH = "splitting_log.txt"
    IDR_DB_PATH = SPLIT.IDR_DB_PATH
    CLUSTER_OUTPUT_PATH = SPLIT.CLUSTER_OUTPUT_PATH
    RANDOM_STATE_1 = SPLIT.RANDOM_STATE_1
    TEST_SIZE_1 = SPLIT.TEST_SIZE_1
    RANDOM_STATE_2 = SPLIT.RANDOM_STATE_2
    TEST_SIZE_2 = SPLIT.TEST_SIZE_2

    # Set the plotting font
    set_font()

    # Prepare the log file
    with open_logfile(LOG_PATH):
        log_update("Loaded data-splitting configurations from config.py")
        SPLIT.print_config(indent='\t')

        # Prepare directory to save results
        os.makedirs("splits", exist_ok=True)

        # Read the clusters and get a list of the representative IDs for splitting
        clusters = pd.read_csv(CLUSTER_OUTPUT_PATH)
        reps = clusters['representative seq_id'].unique().tolist()
        log_update(f"\nPreparing clusters...\n\tCollected {len(reps)} clusters for splitting")

        # Make the splits and extract the results
        splits = split_clusters(reps,
                                random_state_1=RANDOM_STATE_1, test_size_1=TEST_SIZE_1,
                                random_state_2=RANDOM_STATE_2, test_size_2=TEST_SIZE_2)
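        # NOTE (assumption based on the parameter names): split_clusters appears to make
        # two sequential splits -- first holding out test_size_1 of the clusters, then
        # dividing that holdout with test_size_2 into val and test. Splitting at the
        # cluster level keeps similar sequences from leaking across splits.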
        X_train = splits['X_train']
        X_val = splits['X_val']
        X_test = splits['X_test']

        # Make slices of the clusters dataframe for train, val, and test
        train_clusters = clusters.loc[clusters['representative seq_id'].isin(X_train)].reset_index(drop=True)
        val_clusters = clusters.loc[clusters['representative seq_id'].isin(X_val)].reset_index(drop=True)
        test_clusters = clusters.loc[clusters['representative seq_id'].isin(X_test)].reset_index(drop=True)

        # Check validity
        check_split_validity(train_clusters, val_clusters, test_clusters)
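        # (check_split_validity is expected to confirm that the three splits are
        # disjoint, i.e. no cluster member appears in more than one split.)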
        # Log min and max sequence lengths for each split
        min_train_seqlen = train_clusters['member seq'].str.len().min()
        max_train_seqlen = train_clusters['member seq'].str.len().max()
        min_val_seqlen = val_clusters['member seq'].str.len().min()
        max_val_seqlen = val_clusters['member seq'].str.len().max()
        min_test_seqlen = test_clusters['member seq'].str.len().min()
        max_test_seqlen = test_clusters['member seq'].str.len().max()
        log_update(f"\nLength breakdown summary...\n\tTrain: min seq length = {min_train_seqlen}, max seq length = {max_train_seqlen}")
        log_update(f"\tVal: min seq length = {min_val_seqlen}, max seq length = {max_val_seqlen}")
        log_update(f"\tTest: min seq length = {min_test_seqlen}, max seq length = {max_test_seqlen}")
        # cols = representative seq_id, member seq_id, representative seq, member seq
        train_clusters.to_csv("splits/train_cluster_split.csv", index=False)
        val_clusters.to_csv("splits/val_cluster_split.csv", index=False)
        test_clusters.to_csv("splits/test_cluster_split.csv", index=False)
        log_update('\nSaved cluster splits to splits/train_cluster_split.csv, splits/val_cluster_split.csv, splits/test_cluster_split.csv')
        cols = ','.join(list(train_clusters.columns))
        log_update(f'\tColumns: {cols}')

        # Get final dataframes for training, and check their distributions
        idr_db = pd.read_csv(IDR_DB_PATH)
        train_dfs_dict = get_training_dfs(train_clusters, val_clusters, test_clusters, idr_db)
        train_test_values_dict = {}
        idr_property_name_dict = {
            'asph': 'Asphericity',
            'scaled_re': 'End-to-End Distance (Re)',
            'scaled_rg': 'Radius of Gyration (Rg)',
            'scaling_exp': 'Scaling Exponent'
        }
        for idr_property, dfs in train_dfs_dict.items():
            os.makedirs(f"splits/{idr_property}", exist_ok=True)
            train_df = dfs['train']
            val_df = dfs['val']
            test_df = dfs['test']
            total_seqs = len(train_df) + len(val_df) + len(test_df)
            train_df.to_csv(f"splits/{idr_property}/train_df.csv", index=False)
            val_df.to_csv(f"splits/{idr_property}/val_df.csv", index=False)
            test_df.to_csv(f"splits/{idr_property}/test_df.csv", index=False)
            log_update(f"\nSaved {idr_property} training dataframes to splits/{idr_property}/train_df.csv, splits/{idr_property}/val_df.csv, splits/{idr_property}/test_df.csv")
            log_update(f"\tTrain sequences: {len(train_df)} ({100*len(train_df)/total_seqs:.2f}%)")
            log_update(f"\tVal sequences: {len(val_df)} ({100*len(val_df)/total_seqs:.2f}%)")
            log_update(f"\tTest sequences: {len(test_df)} ({100*len(test_df)/total_seqs:.2f}%)")
            log_update(f"\tTotal: {total_seqs}")

            # Make sure every sequence with a value for this property landed in a split
            n_with_value = len(idr_db[idr_db[idr_property].notna()])
            log_update(f"\tSequences with {idr_property} values in idr_db: {n_with_value}")
            assert total_seqs == n_with_value

            train_test_values_dict[idr_property_name_dict[idr_property]] = {
                'train': train_df['Value'].tolist(),
                'val': val_df['Value'].tolist(),
                'test': test_df['Value'].tolist()
            }
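            # (The raw target values are collected per split, presumably so downstream
            # code can compare the train/val/test target distributions.)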


if __name__ == "__main__":
    main()