from fuson_plm.utils.logging import CustomParams

# Clustering Parameters
# Sequences for all four ALBATROSS properties are stacked into a single input file before clustering
CLUSTER = CustomParams(
    # MMseqs2 parameters: see the MMseqs2 GitHub or wiki for guidance
    MIN_SEQ_ID = 0.3,   # minimum sequence identity (0.3 = 30%)
    C = 0.5,            # minimum coverage: fraction of sequence length that must overlap
    COV_MODE = 1,       # cov-mode: 0 = bidirectional, 1 = target coverage, 2 = query coverage, 3 = target-in-query length coverage
    CLUSTER_MODE = 2,   # cluster-mode: 0 = greedy set cover, 1 = connected component, 2 = greedy incremental (CD-HIT-like)
    # File paths
    INPUT_PATH = 'processed_data/all_albatross_seqs_and_properties.csv',
    PATH_TO_MMSEQS = '../../mmseqs'     # path to where you installed MMseqs2
)
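
# Hedged sketch (not part of the original pipeline): one way the CLUSTER params above
# could be turned into an `mmseqs easy-cluster` command. Assumes CustomParams exposes
# its keyword arguments as attributes; the FASTA path, output prefix, and helper name
# are hypothetical, for illustration only.
def _example_mmseqs_command(params=CLUSTER,
                            fasta_path='clustering/input.fasta',
                            output_prefix='clustering/mmseqs'):
    """Build (but do not run) an `mmseqs easy-cluster` command list from CLUSTER."""
    return [
        params.PATH_TO_MMSEQS, 'easy-cluster', fasta_path, output_prefix, 'tmp',
        '--min-seq-id', str(params.MIN_SEQ_ID),
        '-c', str(params.C),
        '--cov-mode', str(params.COV_MODE),
        '--cluster-mode', str(params.CLUSTER_MODE),
    ]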
# Split the clustered data into train, val, and test sets: first all clusters into train vs. other, then other into val and test (see the sketch after the SPLIT block)
SPLIT = CustomParams(
    IDR_DB_PATH = 'processed_data/all_albatross_seqs_and_properties.csv',
    CLUSTER_OUTPUT_PATH = 'clustering/mmseqs_full_results.csv',
    #RANDOM_STATE = 7,      # random state for splitting all data into train & test
    #VAL_SIZE = 0.10,       # val size for the data -> train/val split. e.g. 0.10 means 90% of clusters in train, 10% in val
    RANDOM_STATE_1 = 2,     # random state for splitting all clusters into train & other
    TEST_SIZE_1 = 0.21,     # test size for the data -> train/other split. e.g. 0.21 means 79% of clusters in train, 21% in other
    RANDOM_STATE_2 = 6,     # random state for splitting "other" (from above) into val & test
    TEST_SIZE_2 = 0.50      # test size for the other -> val/test split. e.g. 0.50 means 50% of clusters in val, 50% in test
)
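
# Hedged sketch (illustration only): how the two-stage, cluster-level split configured
# above might be carried out with scikit-learn. Assumes CustomParams exposes attributes
# and that `cluster_ids` is a list of unique cluster representative IDs; the helper name
# is hypothetical.
def _example_cluster_split(cluster_ids, params=SPLIT):
    from sklearn.model_selection import train_test_split
    # Stage 1: all clusters -> train + other
    train_ids, other_ids = train_test_split(
        cluster_ids, test_size=params.TEST_SIZE_1, random_state=params.RANDOM_STATE_1
    )
    # Stage 2: other -> val + test
    val_ids, test_ids = train_test_split(
        other_ids, test_size=params.TEST_SIZE_2, random_state=params.RANDOM_STATE_2
    )
    return train_ids, val_ids, test_ids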
# Training and benchmarking parameters: which models to benchmark, GPU settings, and overwrite permissions
TRAIN = CustomParams(
    BENCHMARK_FUSONPLM = True,
    FUSONPLM_CKPTS = "FusOn-pLM",   # Dictionary: key = run name, value = epoch(s); or the string "FusOn-pLM"
    BENCHMARK_ESM = True,
    # GPU configs
    CUDA_VISIBLE_DEVICES = "0",
    # Overwriting configs
    PERMISSION_TO_OVERWRITE_EMBEDDINGS = False,     # if False, the script will halt if it believes these embeddings have already been made
    PERMISSION_TO_OVERWRITE_MODELS = False          # if False, the script will halt if it believes these models have already been trained
)
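
# Hedged sketch (illustration only): how a training/benchmarking script might consume
# the TRAIN configs above. The embedding path and helper name are hypothetical.
def _example_apply_train_config(params=TRAIN, embedding_path='embeddings/albatross_embeddings.pkl'):
    import os
    # Restrict training to the configured GPU(s)
    os.environ['CUDA_VISIBLE_DEVICES'] = params.CUDA_VISIBLE_DEVICES
    # Halt rather than silently overwrite existing embeddings
    if os.path.exists(embedding_path) and not params.PERMISSION_TO_OVERWRITE_EMBEDDINGS:
        raise FileExistsError(
            f"{embedding_path} exists and PERMISSION_TO_OVERWRITE_EMBEDDINGS is False."
        )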