"""Configuration for clustering, data splitting, and benchmarking.

Defines three ``CustomParams`` objects: ``CLUSTER`` (MMSeqs2 clustering
settings and paths), ``SPLIT`` (cluster-based data split settings), and
``TRAIN`` (which models to benchmark, plus GPU and overwrite settings).
"""
from fuson_plm.utils.logging import CustomParams

# Clustering Parameters
# Need to be stacked, because there are 4 properties
CLUSTER = CustomParams(
    # MMSeqs2 parameters: see GitHub or MMSeqs2 Wiki for guidance
    MIN_SEQ_ID=0.3,  # minimum sequence identity, as a fraction (0.3 = 30%)
    C=0.5,  # minimum sequence length overlap (coverage), as a fraction
    # cov-mode: 0 = bidirectional, 1 = target coverage, 2 = query coverage,
    # 3 = target-in-query length coverage.
    COV_MODE=1,
    # NOTE(review): per the MMSeqs2 docs, cluster-mode 2 should be greedy
    # incremental (CD-HIT-like) clustering -- confirm against the wiki.
    CLUSTER_MODE=2,
    # File paths
    INPUT_PATH='processed_data/all_albatross_seqs_and_properties.csv',
    PATH_TO_MMSEQS='../../mmseqs',  # path to where you installed MMSeqs2
)

# Here, we'll be splitting the train set into train and val. we aren't touching test
# NOTE(review): the active parameters below describe a two-stage split
# (all data -> train/other, then other -> val/test), which does not match the
# sentence above -- confirm which description is current.
SPLIT = CustomParams(
    IDR_DB_PATH='processed_data/all_albatross_seqs_and_properties.csv',
    CLUSTER_OUTPUT_PATH='clustering/mmseqs_full_results.csv',
    # Earlier single-split settings, kept commented out for reference:
    # RANDOM_STATE = 7,  # state for splitting all data into train & test
    # VAL_SIZE = 0.10,   # val size for data -> train/val split.
    #                    # e.g. 0.20 means 80% clusters in train, 20% clusters in val
    RANDOM_STATE_1=2,  # state for splitting all data into train & other
    # Size of "other" in the data -> train/other split.
    # e.g. 0.20 means 80% clusters in train, 20% clusters in other
    TEST_SIZE_1=0.21,
    RANDOM_STATE_2=6,  # state for splitting other (from above) into val and test
    # Size of test in the other -> val/test split.
    # e.g. 0.50 means 50% of the "other" clusters in val, 50% in test
    TEST_SIZE_2=0.50,
)

# Which models to benchmark
TRAIN = CustomParams(
    BENCHMARK_FUSONPLM=True,
    # Dictionary: key = run name, values = epochs, or string "FusOn-pLM"
    FUSONPLM_CKPTS="FusOn-pLM",
    BENCHMARK_ESM=True,
    # GPU configs
    CUDA_VISIBLE_DEVICES="0",
    # Overwriting configs
    # If False, script will halt if it believes these embeddings have already
    # been made.
    PERMISSION_TO_OVERWRITE_EMBEDDINGS=False,
    # If False, script will halt if it believes these models have already
    # been made.
    PERMISSION_TO_OVERWRITE_MODELS=False,
)