# Page residue kept as a comment: svincoff, commit e048d40 - "fixed READMEs and added IDR Prediction benchmark"
from fuson_plm.utils.logging import CustomParams
# Clustering Parameters
# Need to be stacked, because there are 4 properties
CLUSTER = CustomParams(
    # --- MMSeqs2 settings (consult the MMSeqs2 GitHub page or Wiki) ---
    MIN_SEQ_ID=0.3,  # sequence identity threshold, given as a fraction (0.3 = 30%)
    C=0.5,  # sequence length overlap threshold, given as a fraction (0.5 = 50%)
    # Coverage mode: 0 = bidirectional, 1 = target coverage,
    # 2 = query coverage, 3 = target-in-query length coverage.
    COV_MODE=1,
    # NOTE(review): presumably the MMSeqs2 cluster-mode option; confirm the
    # meaning of mode 2 against the MMSeqs2 documentation.
    CLUSTER_MODE=2,
    # --- File locations ---
    INPUT_PATH='processed_data/all_albatross_seqs_and_properties.csv',
    PATH_TO_MMSEQS='../../mmseqs',  # location of your MMSeqs2 installation
)
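# Splitting parameters.
# NOTE(review): the active settings below describe a two-stage split
# (all data -> train/other, then other -> val/test); the older train/val-only
# settings are commented out. Confirm which scheme the split script uses.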
SPLIT = CustomParams(
    IDR_DB_PATH='processed_data/all_albatross_seqs_and_properties.csv',
    CLUSTER_OUTPUT_PATH='clustering/mmseqs_full_results.csv',
    # Disabled earlier single-split settings, kept for reference:
    # RANDOM_STATE=7,
    # VAL_SIZE=0.10,
    # Stage 1: all data -> train / other.
    RANDOM_STATE_1=2,  # random state for the train/other split
    # Fraction of clusters placed in "other"
    # (e.g. 0.20 means 80% of clusters in train, 20% in other).
    TEST_SIZE_1=0.21,
    # Stage 2: "other" from stage 1 -> val / test.
    RANDOM_STATE_2=6,  # random state for the val/test split
    # Presumably the fraction of "other" clusters placed in test
    # (e.g. 0.50 means 50% of clusters in val, 50% in test) - TODO confirm.
    TEST_SIZE_2=0.50,
)
# Which models to benchmark
TRAIN = CustomParams(
    # --- Models to include in the benchmark ---
    BENCHMARK_FUSONPLM=True,
    # Either the string "FusOn-pLM", or a dictionary with
    # key = run name and values = epochs.
    FUSONPLM_CKPTS="FusOn-pLM",
    BENCHMARK_ESM=True,
    # --- GPU configuration ---
    CUDA_VISIBLE_DEVICES="0",
    # --- Overwrite guards ---
    # If False, the script halts when it believes these embeddings have
    # already been made.
    PERMISSION_TO_OVERWRITE_EMBEDDINGS=False,
    # If False, the script halts when it believes the outputs have already
    # been made (presumably the trained models, given the name - TODO confirm).
    PERMISSION_TO_OVERWRITE_MODELS=False,
)