svincoff committed
Commit e048d40 · 1 Parent(s): bae913a

fixed READMEs and added IDR Prediction benchmark

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. fuson_plm/benchmarking/caid/README.md +44 -42
  2. fuson_plm/benchmarking/idr_prediction/README.md +211 -0
  3. fuson_plm/benchmarking/idr_prediction/__init__.py +0 -0
  4. fuson_plm/benchmarking/idr_prediction/clean.py +289 -0
  5. fuson_plm/benchmarking/idr_prediction/cluster.py +94 -0
  6. fuson_plm/benchmarking/idr_prediction/clustering/input.fasta +3 -0
  7. fuson_plm/benchmarking/idr_prediction/clustering/mmseqs_full_results.csv +3 -0
  8. fuson_plm/benchmarking/idr_prediction/config.py +41 -0
  9. fuson_plm/benchmarking/idr_prediction/model.py +154 -0
  10. fuson_plm/benchmarking/idr_prediction/plot.py +204 -0
  11. fuson_plm/benchmarking/idr_prediction/processed_data/all_albatross_seqs_and_properties.csv +3 -0
  12. fuson_plm/benchmarking/idr_prediction/processed_data/train_test_value_histograms.png +0 -0
  13. fuson_plm/benchmarking/idr_prediction/processed_data/value_histograms.png +0 -0
  14. fuson_plm/benchmarking/idr_prediction/raw_data/asph_bio_synth_training_data_cleaned_05_09_2023.tsv +0 -0
  15. fuson_plm/benchmarking/idr_prediction/raw_data/asph_nat_meth_test.tsv +0 -0
  16. fuson_plm/benchmarking/idr_prediction/raw_data/scaled_re_bio_synth_training_data_cleaned_05_09_2023.tsv +0 -0
  17. fuson_plm/benchmarking/idr_prediction/raw_data/scaled_re_nat_meth_test.tsv +0 -0
  18. fuson_plm/benchmarking/idr_prediction/raw_data/scaled_rg_bio_synth_training_data_cleaned_05_09_2023.tsv +0 -0
  19. fuson_plm/benchmarking/idr_prediction/raw_data/scaled_rg_nat_meth_test.tsv +0 -0
  20. fuson_plm/benchmarking/idr_prediction/raw_data/scaling_exp_bio_synth_training_data_cleaned_05_09_2023.tsv +0 -0
  21. fuson_plm/benchmarking/idr_prediction/raw_data/scaling_exp_nat_meth_test.tsv +0 -0
  22. fuson_plm/benchmarking/idr_prediction/results/final/asph_best_test_r2.csv +3 -0
  23. fuson_plm/benchmarking/idr_prediction/results/final/asph_hyperparam_screen_test_r2.csv +3 -0
  24. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/asph/esm2_t33_650M_UR50D_asph_R2.png +0 -0
  25. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/asph/esm2_t33_650M_UR50D_asph_R2_source_data.csv +3 -0
  26. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/asph/fuson_plm_asph_R2.png +0 -0
  27. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/asph/fuson_plm_asph_R2_source_data.csv +3 -0
  28. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_re/esm2_t33_650M_UR50D_scaled_re_R2.png +0 -0
  29. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_re/esm2_t33_650M_UR50D_scaled_re_R2_source_data.csv +3 -0
  30. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_re/fuson_plm_scaled_re_R2.png +0 -0
  31. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_re/fuson_plm_scaled_re_R2_source_data.csv +3 -0
  32. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_rg/esm2_t33_650M_UR50D_scaled_rg_R2.png +0 -0
  33. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_rg/esm2_t33_650M_UR50D_scaled_rg_R2_source_data.csv +3 -0
  34. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_rg/fuson_plm_scaled_rg_R2.png +0 -0
  35. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_rg/fuson_plm_scaled_rg_R2_source_data.csv +3 -0
  36. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaling_exp/esm2_t33_650M_UR50D_scaling_exp_R2.png +0 -0
  37. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaling_exp/esm2_t33_650M_UR50D_scaling_exp_R2_source_data.csv +3 -0
  38. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaling_exp/fuson_plm_scaling_exp_R2.png +0 -0
  39. fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaling_exp/fuson_plm_scaling_exp_R2_source_data.csv +3 -0
  40. fuson_plm/benchmarking/idr_prediction/results/final/scaled_re_best_test_r2.csv +3 -0
  41. fuson_plm/benchmarking/idr_prediction/results/final/scaled_re_hyperparam_screen_test_r2.csv +3 -0
  42. fuson_plm/benchmarking/idr_prediction/results/final/scaled_rg_best_test_r2.csv +3 -0
  43. fuson_plm/benchmarking/idr_prediction/results/final/scaled_rg_hyperparam_screen_test_r2.csv +3 -0
  44. fuson_plm/benchmarking/idr_prediction/results/final/scaling_exp_best_test_r2.csv +3 -0
  45. fuson_plm/benchmarking/idr_prediction/results/final/scaling_exp_hyperparam_screen_test_r2.csv +3 -0
  46. fuson_plm/benchmarking/idr_prediction/split.py +135 -0
  47. fuson_plm/benchmarking/idr_prediction/splits/asph/test_df.csv +3 -0
  48. fuson_plm/benchmarking/idr_prediction/splits/asph/train_df.csv +3 -0
  49. fuson_plm/benchmarking/idr_prediction/splits/asph/val_df.csv +3 -0
  50. fuson_plm/benchmarking/idr_prediction/splits/scaled_re/test_df.csv +3 -0
fuson_plm/benchmarking/caid/README.md CHANGED
@@ -120,47 +120,46 @@ benchmarking/
120
 
121
  Here we describe what each script does and which files each script creates.
122
  1. 🐍 **`scrape_fusionpdb.py`**
123
- 1. Scrapes metadata for FusionPDB Level 2 and Level 3
124
- 1. Pulls the online tables for [Level 2](https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level2) and [Level 3](https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level3), saving results to `raw_data/FusionPDB_level2_curated_09_05_2024.csv` and `raw_data/FusionPDB_level3_curated_09_05_2024.csv` respectively.
125
- 2. Retrieves structure links
126
- 1. Using the tables collected in step (i), visits the page for each fusion oncoprotein (FO) in FusionPDB Level 2 and 3, and downloads all AlphaFold2 structure links for each FO.
127
- 2. Saves results directly to `raw_data/FusionPDB_level2_fusion_structure_links.csv` and `raw_data/FusionPDB_level3_fusion_structure_links.csv`, respectively
128
- 3. Retrieves FO head gene and tail gene info
129
- 1. Using the tables collected in step (i), visits the page for each fusion oncoprotein (FO) in FusionPDB Level 2 and 3 to download head/tail info. Collects HGID and TGID (GeneIDs for head and tail) and UniProt accessions for each.
130
- 2. Saves results directly to `raw_data/level2_head_tail_info.txt` and `raw_data/level3_head_tail_info.txt`, respectively.
131
- 4. Combines Level 2 and 3 head/tail data
132
- 1. Merges `raw_data/level2_head_tail_info.txt` and `raw_data/level3_head_tail_info.txt` into a dataframe.
133
- 2. Saves result at `processed_data/fusionpdb/fusion_heads_and_tails.csv` (columns="FusionGID","HGID","TGID","HGUniProtAcc","TGUniProtAcc")
134
- 5. Combines Level 2 and 3 structure link data
135
- 1. Joins structure link data with metadata for each of levels 2 and 3, then combines the result.
136
- 2. Saves result at `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv`
137
- 6. Combines structure link data and metadata (result of step (v)) with head and tail data (result of step (iv)), and resolves any missing head/tail UniProt IDs.
138
- 1. Merges the data
139
- 2. Checks how many rows have either missing or wrong UniProt accessions for the head or tail gene, and compiles the gene symbols for online quering in the UniProt ID Mapping tool (`processed_data/fusionpdb/intermediates/unmapped_parts.txt`)
140
- 3. Reads the UniProt ID Mapping result. Combines this data with FusionPDB-scraped data by matching FusionPDB's HGID (GeneID for head) and TGID (GeneID for tail) with the GeneID returned by UniProt.
141
- 4. For any FO where FusionPDB lacked a UniProt ID for the head/tail, this ID is filled in from the UniProt ID Mapping result.
142
- 5. Saves result to `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv`. Columns: "FusionGID","FusionGene","Hgene","Tgene","URL","HGID","TGID","HGUniProtAcc","TGUniProtAcc","HGUniProtAcc_Source","TGUniProtAcc_Source", where the "_Source" columns indicate whether the UniProt ID came from FusionPDB, or from the ID Map.
143
- 7. Downloads AlphaFold2 structures of FOs from FusionPDB.
144
- 1. Using structure links from `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv` (step (v)), directly downloads `.pdb` and `.cif` files.
145
- 2. Saves results in 📁`raw_data/fusionpdb/structures`
146
-
147
- <br>
148
 
149
  2. 🐍 **`process_fusion_structures.py`**
150
- 1. Determines pLDDT(s) for each FO structure.
151
- 1. For each structure in 📁`raw_data/fusionpdb_structures/`, determines amino acid sequence, per-residue pLDDT, and average pLDDT from the AlphaFold2 structure.
152
- 2. Saves results in `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structures_processed.csv`.
153
- 2. Downloads AlphaFold2 structures for all head and tail proteins
154
- 1. Reads `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv` and collects all unique UniProt IDs for all head/tail proteins.
155
- 2. For each UniProt ID, queries the AlphaFoldDB, downloads the AlphaFold2 structure (if available), and saves it to 📁`raw_data/fusionpdb/head_tail_af2db_structures/`. Saves files converted from PDB to CIF format in `mmcif_converted_files`. Then, extracts the sequence, per-residue pLDDT, and average pLDDT from the file.
156
- 3. Saves any UniProt IDs that did not have structures in the AlphaFoldDB to: `processed_data/fusionpdb/intermediates/uniprotids_not_in_afdb.txt`. Most of these were very long, but the shorter ones were folded and their average pLDDTs were manually inputted. These were put back into the AlphaFold ID map to look for alternative UniProt IDs, and their results are in `not_in_afdb_idmap.txt`.
157
- 4. Saves results to `processed_data/fusionpdb/heads_tails_structural_data.csv`
158
- 3. Cleans the dataase of level 2&3 structural info
159
- 1. Drops rows where no structure was successfully downloaded
160
- 2. Drops rows where the FO sequence from FusionPDB does not match the FO sequence from its own AlphaFold2 structure file
161
- 3. ⭐️Saves **two final, cleaned databases**⭐️:
162
- 1. ⭐️ **`FusionPDB_level2-3_cleaned_FusionGID_info.csv`**: includes ful IDs and structural information for the Hgene and Tgene of each FO. Columns="FusionGID","FusionGene","Hgene","Tgene","URL","HGID","TGID","HGUniProtAcc","TGUniProtAcc","HGUniProtAcc_Source","TGUniProtAcc_Source","HG_pLDDT","HG_AA_pLDDTs","HG_Seq","TG_pLDDT","TG_AA_pLDDTs","TG_Seq".
163
- 2. ⭐️ **`FusionPDB_level2-3_cleaned_structure_info.csv`**: includes full structural information for each FO. Columns= "FusionGID","FusionGene","Fusion_Seq","Fusion_Length","Hgene","Hchr","Hbp","Hstrand","Tgene","Tchr","Tbp","Tstrand","Level","Fusion_Structure_Link","Fusion_Structure_Type","Fusion_pLDDT","Fusion_AA_pLDDTs","Fusion_Seq_Source"
164
 
165
 
166
  ### Training
@@ -184,9 +183,12 @@ PERMISSION_TO_OVERWRITE_EMBEDDINGS = False # if False, scrip
184
  PERMISSION_TO_OVERWRITE_MODELS = False # if False, script will halt if it believes these embeddings have already been made.
185
  ```
186
 
187
- <br>
 
 
 
188
 
189
- `train.py` trains the models using embeddings indicated in `config.py`. It also performs a hyperparameter screen. Model raw outputs (probabilities) and performance metrics are saved in `trained_models`. For example, FusOn-pLM-Diso raw outputs (ESM-2-650M-Diso has a folder in the same format, and future trained models will as well):
190
 
191
  ```
192
  benchmarking/
@@ -209,7 +211,7 @@ benchmarking/
209
  - **`caid_train_losses.csv`**: train losses over the 2 training epochs for top-performing model
210
  - **`params.txt`**: hyperparameters of top performing model
211
 
212
- The training script also populates the `results` directory. Results from the FusOn-pLM manuscript are found in `results/final`. A few extra data files and plots are added by `analyze_fus`
213
 
214
  ```
215
  benchmarking/
 
120
 
121
  Here we describe what each script does and which files each script creates.
122
  1. 🐍 **`scrape_fusionpdb.py`**
123
+ i. Scrapes metadata for FusionPDB Level 2 and Level 3
124
+ a. Pulls the online tables for [Level 2](https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level2) and [Level 3](https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level3), saving results to `raw_data/FusionPDB_level2_curated_09_05_2024.csv` and `raw_data/FusionPDB_level3_curated_09_05_2024.csv` respectively.
125
+ ii. Retrieves structure links
126
+ a. Using the tables collected in step (i), visits the page for each fusion oncoprotein (FO) in FusionPDB Level 2 and 3, and downloads all AlphaFold2 structure links for each FO.
127
+ b. Saves results directly to `raw_data/FusionPDB_level2_fusion_structure_links.csv` and `raw_data/FusionPDB_level3_fusion_structure_links.csv`, respectively
128
+ iii. Retrieves FO head gene and tail gene info
129
+ a. Using the tables collected in step (i), visits the page for each fusion oncoprotein (FO) in FusionPDB Level 2 and 3 to download head/tail info. Collects HGID and TGID (GeneIDs for head and tail) and UniProt accessions for each.
130
+ b. Saves results directly to `raw_data/level2_head_tail_info.txt` and `raw_data/level3_head_tail_info.txt`, respectively.
131
+ iv. Combines Level 2 and 3 head/tail data
132
+ a. Merges `raw_data/level2_head_tail_info.txt` and `raw_data/level3_head_tail_info.txt` into a dataframe.
133
+ b. Saves result at `processed_data/fusionpdb/fusion_heads_and_tails.csv` (columns="FusionGID","HGID","TGID","HGUniProtAcc","TGUniProtAcc")
134
+ v. Combines Level 2 and 3 structure link data
135
+ a. Joins structure link data with metadata for each of levels 2 and 3, then combines the result.
136
+ b. Saves result at `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv`
137
+ vi. Combines structure link data and metadata (result of step (v)) with head and tail data (result of step (iv)), and resolves any missing head/tail UniProt IDs.
138
+ a. Merges the data
139
+ b. Checks how many rows have either missing or wrong UniProt accessions for the head or tail gene, and compiles the gene symbols for online querying in the UniProt ID Mapping tool (`processed_data/fusionpdb/intermediates/unmapped_parts.txt`)
140
+ c. Reads the UniProt ID Mapping result. Combines this data with FusionPDB-scraped data by matching FusionPDB's HGID (GeneID for head) and TGID (GeneID for tail) with the GeneID returned by UniProt.
141
+ d. For any FO where FusionPDB lacked a UniProt ID for the head/tail, this ID is filled in from the UniProt ID Mapping result.
142
+ e. Saves result to `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv`. Columns: "FusionGID","FusionGene","Hgene","Tgene","URL","HGID","TGID","HGUniProtAcc","TGUniProtAcc","HGUniProtAcc_Source","TGUniProtAcc_Source", where the "_Source" columns indicate whether the UniProt ID came from FusionPDB, or from the ID Map.
143
+ vii. Downloads AlphaFold2 structures of FOs from FusionPDB.
144
+ a. Using structure links from `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv` (step (v)), directly downloads `.pdb` and `.cif` files.
145
+ b. Saves results in 📁`raw_data/fusionpdb/structures`
146
+
 
147
 
148
  2. 🐍 **`process_fusion_structures.py`**
149
+ i. Determines pLDDT(s) for each FO structure.
150
+ a. For each structure in 📁`raw_data/fusionpdb_structures/`, determines amino acid sequence, per-residue pLDDT, and average pLDDT from the AlphaFold2 structure.
151
+ b. Saves results in `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structures_processed.csv`.
152
+ ii. Downloads AlphaFold2 structures for all head and tail proteins
153
+ a. Reads `processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv` and collects all unique UniProt IDs for all head/tail proteins.
154
+ b. For each UniProt ID, queries the AlphaFoldDB, downloads the AlphaFold2 structure (if available), and saves it to 📁`raw_data/fusionpdb/head_tail_af2db_structures/`. Saves files converted from PDB to CIF format in `mmcif_converted_files`. Then, extracts the sequence, per-residue pLDDT, and average pLDDT from the file.
155
+ c. Saves any UniProt IDs that did not have structures in the AlphaFoldDB to: `processed_data/fusionpdb/intermediates/uniprotids_not_in_afdb.txt`. Most of these were very long, but the shorter ones were folded and their average pLDDTs were manually inputted. These were put back into the AlphaFold ID map to look for alternative UniProt IDs, and their results are in `not_in_afdb_idmap.txt`.
156
+ d. Saves results to `processed_data/fusionpdb/heads_tails_structural_data.csv`
157
+ iii. Cleans the database of level 2&3 structural info
158
+ a. Drops rows where no structure was successfully downloaded
159
+ b. Drops rows where the FO sequence from FusionPDB does not match the FO sequence from its own AlphaFold2 structure file
160
+ c. ⭐️Saves **two final, cleaned databases**⭐️:
161
+ a. ⭐️ **`FusionPDB_level2-3_cleaned_FusionGID_info.csv`**: includes full IDs and structural information for the Hgene and Tgene of each FO. Columns="FusionGID","FusionGene","Hgene","Tgene","URL","HGID","TGID","HGUniProtAcc","TGUniProtAcc","HGUniProtAcc_Source","TGUniProtAcc_Source","HG_pLDDT","HG_AA_pLDDTs","HG_Seq","TG_pLDDT","TG_AA_pLDDTs","TG_Seq".
162
+ b. ⭐️ **`FusionPDB_level2-3_cleaned_structure_info.csv`**: includes full structural information for each FO. Columns= "FusionGID","FusionGene","Fusion_Seq","Fusion_Length","Hgene","Hchr","Hbp","Hstrand","Tgene","Tchr","Tbp","Tstrand","Level","Fusion_Structure_Link","Fusion_Structure_Type","Fusion_pLDDT","Fusion_AA_pLDDTs","Fusion_Seq_Source"
163
 
164
 
165
  ### Training
 
183
  PERMISSION_TO_OVERWRITE_MODELS = False # if False, script will halt if it believes these embeddings have already been made.
184
  ```
185
 
186
+ `train.py` trains the models using embeddings indicated in `config.py`. It also performs a hyperparameter screen.
187
+ - All **results** are stored in `caid/results/<timestamp>`, where `timestamp` is a unique string encoding the date and time when you started training.
188
+ - All **raw outputs from models** are stored in `caid/trained_models/<embedding_path>`, where `embedding_path` represents the embeddings used to build the disorder predictor.
189
+ - All **embeddings** made for training will be stored in a new folder called `caid/embeddings/` with subfolders for each model. This allows you to use the same model multiple times without regenerating embeddings.
190
 
191
+ Below is the FusOn-pLM-Diso raw outputs folder, `trained_models/fuson_plm/best/` (ESM-2-650M-Diso has a folder in the same format, and future trained models will as well):
192
 
193
  ```
194
  benchmarking/
 
211
  - **`caid_train_losses.csv`**: train losses over the 2 training epochs for top-performing model
212
  - **`params.txt`**: hyperparameters of top performing model
213
 
214
+ Results from the FusOn-pLM manuscript are found in `results/final`. A few extra data files and plots are added by `analyze_fusion_preds.py`.
215
 
216
  ```
217
  benchmarking/
fuson_plm/benchmarking/idr_prediction/README.md ADDED
@@ -0,0 +1,211 @@
1
+ ## IDR Property Prediction Benchmark
2
+
3
+ This folder contains all the data and code needed to perform the **IDR property prediction benchmark**, where FusOn-pLM-IDR (a regressor built on FusOn-pLM embeddings) is used to predict aggregate properties of intrinsically disordered regions (IDRs), specifically asphericity, end-to-end radius (R<sub>e</sub>), radius of gyration (R<sub>g</sub>), and polymer scaling exponent (Figure 4A-B).
4
+
5
+ ### TL;DR
6
+ Run the scripts in this order, after downloading the data:
7
+
8
+ ```
9
+ python clean.py # clean the data
10
+ python cluster.py # MMSeqs2 clustering
11
+ python split.py # make cluster-based train/val/test splits
12
+ python train.py # train the model
13
+ python plot.py # if you want to remake r2 plots
14
+ ```
15
+
16
+ ### Downloading raw IDR data
17
+ IDR properties from [Lotthammer et al. 2024](https://doi.org/10.1038/s41592-023-02159-5) (ALBATROSS model) were used to train FusOn-pLM-IDR. Sequences were downloaded from [this link](https://github.com/holehouse-lab/supportingdata/blob/master/2023/ALBATROSS_2023/simulations/data/all_sequences.tgz) and deposited in `raw_data`. All files in `raw_data` are from this direct download.
18
+
19
+ ```
20
+ benchmarking/
21
+ └── idr_prediction/
22
+ └── raw_data/
23
+ ├── asph_bio_synth_training_data_cleaned_05_09_2023.tsv
24
+ ├── asph_nat_meth_test.tsv
25
+ ├── scaled_re_bio_synth_training_data_cleaned_05_09_2023.tsv
26
+ ├── scaled_re_nat_meth_test.tsv
27
+ ├── scaled_rg_bio_synth_training_data_cleaned_05_09_2023.tsv
28
+ ├── scaled_rg_nat_meth_test.tsv
29
+ ├── scaling_exp_bio_synth_training_data_cleaned_05_09_2023.tsv
30
+ ├── scaling_exp_nat_meth_test.tsv
31
+ ```
32
+ - **`asph`**=asphericity, **`scaled_re`**=scaled R<sub>e</sub>, **`scaled_rg`**=scaled R<sub>g</sub>, **`scaling_exp`**=polymer scaling exponent
33
+ - **`<property>_bio_synth_training_data_cleaned_05_09_2023.tsv`** are ALBATROSS **training data** for the four properties, downloaded directly from their GitHub
34
+ - **`<property>_nat_meth_test.tsv`** are ALBATROSS **testing data** for the four properties, downloaded directly from their GitHub (a short loading sketch follows this list)
35
+
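As a quick orientation, the sketch below shows how one of these raw files can be loaded with pandas, following the separators and column order used by `clean.py` (added in this commit): the `asph` and `scaling_exp` files are space-separated, the `scaled_re` and `scaled_rg` files are tab-separated, and none of them have a header row. This is only an illustration; `clean.py` does the real loading and cleaning.

```
import pandas as pd

rename_dict = {0: 'ID', 1: 'Sequence', 2: 'Value'}
# asphericity training data: space-separated, no header, columns ID / Sequence / Value
asph_train = pd.read_csv(
    "raw_data/asph_bio_synth_training_data_cleaned_05_09_2023.tsv",
    sep=" ", header=None, dtype={0: str, 1: str, 2: float}
).rename(columns=rename_dict)
print(asph_train.head())
```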
36
+ ### Cleaning raw IDR data
37
+ `clean.py` cleans the raw training and testing data separately for each property. Any sequence that appears in both train and test is removed from train and kept in test (see the short sketch after the `python clean.py` command below). Finally, the four properties are combined into one file:
38
+
39
+ ```
40
+ benchmarking/
41
+ └── idr_prediction/
42
+ └── processed_data/
43
+ ├── all_albatross_seqs_and_properties.csv
44
+ ```
45
+
46
+ - **`all_albatross_seqs_and_properties.csv`**: Columns = "Sequence","IDs","UniProt_IDs","UniProt_Names","Split","asph","scaled_re","scaled_rg","scaling_exp". All splits are either "Train" or "Test", indicating how the ALBATROSS model used each sequence
47
+
48
+ To perform cleaning, run
49
+
50
+ ```
51
+ python clean.py
52
+ ```
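To make the duplicate-handling rule above concrete, here is a minimal pandas sketch of the idea implemented in `clean.py`: duplicate measurements of the same sequence are averaged, and any sequence that appears in both splits is kept in Test. It uses a toy dataframe, not the real files.

```
import pandas as pd

# toy frame with one sequence duplicated across Train and Test
df = pd.DataFrame({
    "Sequence": ["MKV", "MKV", "AAA"],
    "Value":    [0.40,  0.42,  0.55],
    "Split":    ["Train", "Test", "Train"],
})

deduped = df.groupby("Sequence").agg(
    Value=("Value", "mean"),                      # average duplicate measurements
    Split=("Split", lambda s: ",".join(set(s))),  # collect every split the sequence appeared in
).reset_index()

# a sequence seen in both splits stays in Test (i.e., it is dropped from Train)
deduped["Split"] = deduped["Split"].apply(lambda s: "Test" if "Test" in s else "Train")
print(deduped)
```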
53
+
54
+ ### Using config.py for clustering, splitting, training
55
+
56
+ This file holds the configurations for clustering, splitting, and training; a short usage example follows the block below.
57
+
58
+ ```
59
+ # Clustering Parameters
60
+ CLUSTER = CustomParams(
61
+ # MMSeqs2 parameters: see GitHub or MMSeqs2 Wiki for guidance
62
+ MIN_SEQ_ID = 0.3, # % identity
63
+ C = 0.5, # % sequence length overlap
64
+ COV_MODE = 1, # cov-mode: 0 = bidirectional, 1 = target coverage, 2 = query coverage, 3 = target-in-query length coverage.
65
+ CLUSTER_MODE = 2,
66
+ # File paths
67
+ INPUT_PATH = 'processed_data/all_albatross_seqs_and_properties.csv',
68
+ PATH_TO_MMSEQS = '../../mmseqs' # path to where you installed MMSeqs2
69
+ )
70
+
71
+ # Split config
72
+ SPLIT = CustomParams(
73
+ IDR_DB_PATH = 'processed_data/all_albatross_seqs_and_properties.csv',
74
+ CLUSTER_OUTPUT_PATH = 'clustering/mmseqs_full_results.csv',
75
+ RANDOM_STATE_1 = 2, # random_state_1 = state for splitting all data into train & other
76
+ TEST_SIZE_1 = 0.21, # test size for data -> train/other split. e.g. 0.20 means 80% of clusters in train, 20% in other
77
+ RANDOM_STATE_2 = 6, # random_state_2 = state for splitting other from ^ into val and test
78
+ TEST_SIZE_2 = 0.50 # test size for other -> val/test split. e.g. 0.50 means 50% of the remaining clusters in val, 50% in test
79
+
80
+ )
81
+
82
+ # Which models to benchmark
83
+ TRAIN = CustomParams(
84
+ BENCHMARK_FUSONPLM = True,
85
+ FUSONPLM_CKPTS= "FusOn-pLM", # Dictionary: key = run name, values = epochs, or string "FusOn-pLM"
86
+ BENCHMARK_ESM = True,
87
+
88
+ # GPU configs
89
+ CUDA_VISIBLE_DEVICES="0",
90
+
91
+ # Overwriting configs
92
+ PERMISSION_TO_OVERWRITE_EMBEDDINGS = False, # if False, script will halt if it believes these embeddings have already been made.
93
+ PERMISSION_TO_OVERWRITE_MODELS = False # if False, script will halt if it believes these models have already been trained.
94
+ )
95
+ ```
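Downstream scripts import these objects directly. For example, `cluster.py` (added in this commit) reads the clustering block like this:

```
from fuson_plm.benchmarking.idr_prediction.config import CLUSTER

# print the configured parameters, then pull individual values
CLUSTER.print_config(indent='\t')
min_seq_id = CLUSTER.MIN_SEQ_ID   # 0.3
coverage = CLUSTER.C              # 0.5
input_path = CLUSTER.INPUT_PATH   # 'processed_data/all_albatross_seqs_and_properties.csv'
```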
96
+
97
+ ### Clustering
98
+ Clustering of all sequences in `all_albatross_seqs_and_properties.csv` is performed by `cluster.py`.
99
+
100
+ The clustering command entered by the script is:
101
+ ```
102
+ mmseqs easy-cluster clustering/input.fasta clustering/raw_output/mmseqs clustering/raw_output --min-seq-id 0.3 -c 0.5 --cov-mode 1 --cluster-mode 2 --dbtype 1
103
+ ```
104
+ The script will generate the following files:
105
+ ```
106
+ benchmarking/
107
+ └── idr_prediction/
108
+ └── clustering/
109
+ ├── input.fasta
110
+ ├── mmseqs_full_results.csv
111
+ ```
112
+ - **`clustering/input.fasta`**: the input file used by MMSeqs2 to cluster the ALBATROSS IDR sequences. Headers are our assigned sequence IDs (found in the `IDs` column of `processed_data/all_albatross_seqs_and_properties.csv`).
113
+ - **`clustering/mmseqs_full_results.csv`**: clustering results (see the short example after this list). Columns:
114
+ - `representative seq_id`: the seq_id of the sequence representing this cluster
115
+ - `member seq_id`: the seq_id of a member of the cluster
116
+ - `representative seq`: the amino acid sequence of the cluster representative (representative seq_id)
117
+ - `member seq`: the amino acid sequence of the cluster member
118
+
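If you want to sanity-check the clustering output, here is a short pandas example using the column names listed above (purely illustrative):

```
import pandas as pd

clusters = pd.read_csv("clustering/mmseqs_full_results.csv")

# one row per cluster member; group by the representative to get cluster sizes
cluster_sizes = (clusters.groupby("representative seq_id")["member seq_id"]
                         .nunique()
                         .sort_values(ascending=False))
print(f"{clusters['representative seq_id'].nunique()} clusters; "
      f"largest cluster has {cluster_sizes.iloc[0]} members")
```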
119
+ ### Splitting
120
+ Cluster-based splitting is performed by `split.py`. Results are formatted as follows:
121
+
122
+ ```
123
+ benchmarking/
124
+ └── idr_prediction/
125
+ └── splits/
126
+ └── asph/
127
+ ├── test_df.csv
128
+ ├── val_df.csv
129
+ ├── train_df.csv
130
+ └── scaled_re/... # same format as splits/asph
131
+ └── scaled_rg/... # same format as splits/asph
132
+ └── scaling_exp/... # same format as splits/asph
133
+ ├── test_cluster_split.csv
134
+ ├── train_cluster_split.csv
135
+ ├── val_cluster_split.csv
136
+ ```
137
+
138
+ - **`<split>_cluster_split.csv`**: cluster information for the clusters in each split (train, val, test). Columns = "representative seq_id", "member seq_id", "representative seq", "member seq", "member length"
139
+ - 📁 **`asph/`**, **`scaled_re/`**, **`scaled_rg/`**, and **`scaling_exp/`** contain the train, val, and test sets for each property (`train_df.csv`, `val_df.csv`, and `test_df.csv`). The splits follow `<split>_cluster_split.csv`, but not every sequence has a measurement for every property. The train-val-test ratio still remains roughly 80-10-10 for each property despite these missing measurements (a minimal sketch of the cluster-level split idea follows below).
140
+
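`split.py` performs the actual split; the sketch below only illustrates the two-stage, cluster-level idea encoded in the `SPLIT` config (clusters are first split into train vs. other with `TEST_SIZE_1`, then other is split into val vs. test with `TEST_SIZE_2`; member sequences inherit their cluster's split). Variable names here are illustrative and may not match `split.py`.

```
import pandas as pd
from sklearn.model_selection import train_test_split

clusters = pd.read_csv("clustering/mmseqs_full_results.csv")
cluster_ids = clusters["representative seq_id"].unique()

# stage 1: ~79% of clusters to train, ~21% to "other" (TEST_SIZE_1 = 0.21, RANDOM_STATE_1 = 2)
train_ids, other_ids = train_test_split(cluster_ids, test_size=0.21, random_state=2)
# stage 2: split "other" 50/50 into val and test (TEST_SIZE_2 = 0.50, RANDOM_STATE_2 = 6)
val_ids, test_ids = train_test_split(other_ids, test_size=0.50, random_state=6)

# every member sequence inherits the split of its cluster representative
split_map = {**{c: "train" for c in train_ids},
             **{c: "val" for c in val_ids},
             **{c: "test" for c in test_ids}}
clusters["split"] = clusters["representative seq_id"].map(split_map)
print(clusters["split"].value_counts())
```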
141
+ ### Training
142
+ The model is defined in `model.py` and `utils.py`. The `train.py` script trains FusOn-pLM-IDR and ESM-2-650M-IDR models *separately for each property* (asphericity, R<sub>e</sub>, R<sub>g</sub>, scaling exponent) with a hyperparameter screen, saves all results separated by property, and makes plots. `plot.py` can be used to regenerate the R<sup>2</sup> plots.
143
+ - All **results** are stored in `idr_prediction/results/<timestamp>`, where `timestamp` is a unique string encoding the date and time when you started training.
144
+ - All **raw outputs from models** are stored in `idr_prediction/trained_models/<embedding_path>`, where `embedding_path` represents the embeddings used to build the disorder predictor.
145
+ - All **embeddings** made for training will be stored in a new folder called `idr_prediction/embeddings/` with subfolders for each model. This allows you to use the same model multiple times without regenerating embeddings.
146
+
147
+ Below are the FusOn-pLM-IDR raw outputs folder, `trained_models/fuson_plm/best/`, and the results from the paper, `results/final/`.
148
+
149
+ The outputs are structured as follows:
150
+
151
+ ```
152
+ benchmarking/
153
+ └── idr_prediction/
154
+ └── results/final/
155
+ └── r2_plots
156
+ └── asph/
157
+ ├── esm2_t33_650M_UR50D_asph_R2.png
158
+ ├── esm2_t33_650M_UR50D_asph_R2_source_data.csv
159
+ ├── fuson_plm_asph_R2.png
160
+ ├── fuson_plm_asph_R2_source_data.csv
161
+ └── scaled_re/ # same format as r2_plots/asph/...
162
+ └── scaled_rg/ # same format as r2_plots/asph/...
163
+ └── scaling_exp/ # same format as r2_plots/asph/...
164
+ ├── asph_best_test_r2.csv
165
+ ├── asph_hyperparam_screen_test_r2.csv
166
+ ├── scaled_re_best_test_r2.csv
167
+ ├── scaled_re_hyperparam_screen_test_r2.csv
168
+ ├── scaled_rg_best_test_r2.csv
169
+ ├── scaled_rg_hyperparam_screen_test_r2.csv
170
+ ├── scaling_exp_best_test_r2.csv
171
+ ├── scaling_exp_hyperparam_screen_test_r2.csv
172
+ └── trained_models/
173
+ └── asph/
174
+ └── fuson_plm/best/
175
+ └── lr0.0001_bs32/
176
+ ├── asph_r2.csv
177
+ ├── train_val_losses.csv
178
+ ├── test_loss.csv
179
+ ├── asph_test_predictions.csv
180
+ └── ... other hyperparameter folders with same format as lr0.0001_bs32/
181
+ └── esm2_t33_650M_UR50D # same format as asph/fuson_plm/best/
182
+ └── scaled_re/ # same format as asph/
183
+ └── scaled_rg/ # same format as asph/
184
+ └── scaling_exp/ # same format as asph/
185
+
186
+ ```
187
+
188
+ In both directories, results are organized by IDR property and by the type of embedding used to train FusOn-pLM-IDR.
189
+
190
+ In the 📁 `results/final` directory:
191
+ - 📁 **`r2_plots/<property>/`**: holds all R<sup>2</sup> plots and source data (the formatted data used to make the R<sup>2</sup> plots) for these properties.
192
+ - **`<property>_best_test_r2.csv`**: holds the R<sup>2</sup> values for the top-performing models of each embedding type (e.g. ESM-2-650M and a specific checkpoint of FusOn-pLM)
193
+ - **`<property>_hyperparam_screen_test_r2.csv`**: holds the R<sup>2</sup> values for all embedding types, for all screened hyperparameters
194
+
195
+ In the 📁 `trained_models` directory:
196
+ - 📁 `<property>/`: holds all results for all trained models predicting this property
197
+ - 📁 `asph/fuson_plm/best/`: holds all FusOn-pLM-IDR results on asphericity prediction for each set of hyperparameters screened when embeddings are made from "fuson_plm/best" (FusOn-pLM model). For example, 📁 `lr0.0001_bs32/` holds results for a learning rate of 0.0001 and a batch size of 32. If you were to retrain your own checkpoint of fuson_plm and run the IDR prediction benchmark, its results would be stored in a new subfolder of `trained_models/fuson_plm`.
198
+ - **`asph/fuson_plm/best/lr0.0001_bs32/asph_r2.csv`**: R<sup>2</sup> value for this set of hyperparameters with "fuson_plm/best" embeddings
199
+ - **`asph/fuson_plm/best/lr0.0001_bs32/asph_test_predictions.csv`**: true asphericity values of the test set proteins, alongside FusOn-pLM-IDR's predictions of them (see the short sketch after this list for recomputing R<sup>2</sup> from this file).
200
+ - **`asph/fuson_plm/best/lr0.0001_bs32/test_loss.csv`**: FusOn-pLM-IDR's asphericity test loss value
201
+ - **`asph/fuson_plm/best/lr0.0001_bs32/train_val_losses.csv`**: FusOn-pLM-IDR's training and validation loss over each epoch while training on asphericity data
202
+
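If you want to recompute R<sup>2</sup> from a predictions file rather than reading `asph_r2.csv`, something like the sketch below works. The column names `true` and `pred` are placeholders; check the actual headers of `asph_test_predictions.csv` before running.

```
import pandas as pd
from sklearn.metrics import r2_score

preds = pd.read_csv("trained_models/asph/fuson_plm/best/lr0.0001_bs32/asph_test_predictions.csv")
# 'true' and 'pred' stand in for the measured and predicted asphericity columns
r2 = r2_score(preds["true"], preds["pred"])
print(f"asphericity R^2 = {r2:.3f}")
```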
203
+ To run the training script, enter:
204
+ ```
205
+ nohup python train.py > train.out 2> train.err &
206
+ ```
207
+
208
+ To run the plotting script, enter:
209
+ ```
210
+ python plot.py
211
+ ```
fuson_plm/benchmarking/idr_prediction/__init__.py ADDED
File without changes
fuson_plm/benchmarking/idr_prediction/clean.py ADDED
@@ -0,0 +1,289 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ from fuson_plm.utils.logging import open_logfile, log_update
5
+ from fuson_plm.utils.constants import DELIMITERS, VALID_AAS
6
+ from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars
7
+ from fuson_plm.benchmarking.idr_prediction.plot import plot_all_values_hist_grid, plot_all_train_val_test_values_hist_grid
8
+
9
+ def process_raw_albatross(df):
10
+ # return a version of the df with the first column split, duplicates cleaned, and columns checked for weird characters and invalid values
11
+
12
+ # first, look at the splits
13
+ split_str = df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'})
14
+ tot_prots = sum(split_str['count'])
15
+ split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
16
+ split_str = split_str.to_string(index=False)
17
+ split_str = "\t\t" + split_str.replace("\n","\n\t\t")
18
+ log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")
19
+
20
+ # format: IDR_19076_tr___A0A8M9PNM5___A0A8M9PNM5_DANRE
21
+ # or: synth_test_sequence0
22
+ df['temp'] = df['ID'].str.split("_")
23
+ df['ID'] = df['temp'].apply(lambda x: f"{x[0]}" if len(x)==1 else f"{x[0]}_{x[1]}" if len(x)<3 else f"{x[0]}_{x[1]}_{x[2]}")
24
+ # Not every entry has a UniProt ID and Name, so we have to allow np.nan if this info is missing.
25
+ df['UniProt_ID'] = df['temp'].apply(lambda x: x[5].strip() if len(x)>=5 else np.nan)
26
+ df['UniProt_Name'] = df['temp'].apply(lambda x: f"{x[8].strip()}_{x[9].strip()}" if len(x)>=8 else np.nan)
27
+ df = df.drop(columns=['temp'])
28
+
29
+ cols_to_check = list(df.columns)
30
+ cols_to_check.remove('Value') # don't check this one because it shouldn't be string
31
+ # Investigate the columns we just created and make sure they don't have any invalid features.
32
+ # make sure value is float type
33
+ assert df['Value'].dtype == 'float64'
34
+ check_columns_for_listlike(df, cols_of_interest=cols_to_check, delimiters=DELIMITERS)
35
+
36
+ # Check for invalid AAs
37
+ df['invalid_chars'] = df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
38
+ df[df['invalid_chars'].str.len()>0].sort_values(by='Sequence')
39
+ all_invalid_chars = set().union(*df['invalid_chars'])
40
+ log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered within train_df: {all_invalid_chars}")
41
+
42
+ # Assert no invalid AAs
43
+ assert (df['invalid_chars'].str.len()==0).all()
44
+ df = df.drop(columns=['invalid_chars'])
45
+
46
+ # Check for duplicates - if we find any, REMOVE them from train and keep them in test
47
+ duplicates = df[df.duplicated('Sequence')]['Sequence'].unique().tolist()
48
+ n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
49
+ log_update(f"\t{len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
50
+
51
+ # Look for distribution of duplicates WITHIN train, WITHIN test, and BETWEEN train and test
52
+ # Train only
53
+ duplicates = df.loc[
54
+ (df['Split']=='Train')
55
+ ]
56
+ duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
57
+ n_rows_with_duplicates = len(df.loc[
58
+ (df['Sequence'].isin(duplicates)) &
59
+ (df['Split']=='Train')
60
+ ])
61
+ log_update(f"\t\twithin TRAIN only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Train rows")
62
+
63
+ # Test only
64
+ duplicates = df.loc[
65
+ (df['Split']=='Test')
66
+ ]
67
+ duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
68
+ n_rows_with_duplicates = len(df.loc[
69
+ (df['Sequence'].isin(duplicates)) &
70
+ (df['Split']=='Test')
71
+ ])
72
+ log_update(f"\t\twithin TEST only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Test rows")
73
+
74
+ # Between train and test
75
+ duplicates_df = df.groupby('Sequence').agg({
76
+ 'Split': lambda x: ','.join(set(x))
77
+ }).reset_index()
78
+ duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
79
+ duplicates = duplicates_df['Sequence'].unique().tolist()
80
+ n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
81
+ log_update(f"\t\tduplicates in BOTH TRAIN AND TEST: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
82
+ log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")
83
+
84
+ log_update("\tGrouping by sequence, averaging values, and keeping any Train/Test duplicates in the Test set...")
85
+ df = df.replace(np.nan, '')
86
+ df = df.groupby('Sequence').agg(
87
+ Value=('Value', 'mean'),
88
+ Value_STD=('Value', 'std'),
89
+ IDs=('ID', lambda x: ','.join(x)),
90
+ UniProt_IDs=('UniProt_ID', lambda x: ','.join(x)),
91
+ UniProt_Names=('UniProt_Name', lambda x: ','.join(x)),
92
+ Split=('Split', lambda x: ','.join(x))
93
+ ).reset_index()
94
+ for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
95
+ df[col] = df[col].apply(lambda x: [y for y in x.split(',')])
96
+ df[col] = df[col].apply(lambda x: ','.join(x))
97
+ df[col] = df[col].str.strip(',')
98
+ # make sure there are no commas left
99
+ assert len(df[df[col].str.contains(',,')])==0
100
+ # set Split to Test if test is in it
101
+ df['Split'] = df['Split'].apply(lambda x: 'Test' if 'Test' in x else 'Train')
102
+
103
+ # For anything that wasn't duplicated, Value_STD is nan
104
+ log_update("\tChecking coefficients of variation for averaged rows")
105
+ # calculate coefficient of variation, should be < 10
106
+ df['Value_CV'] = 100*df['Value_STD']/df['Value']
107
+ log_update(f"\t\tTotal rows with coefficient of variation (CV)\n\t\t\t<=10%: {len(df[df['Value_CV']<=10])}\n\t\t\t>10%: {len(df[df['Value_CV']>10])}\n\t\t\t>20%: {len(df[df['Value_CV']>20])}")
108
+
109
+ # Ensure there are no duplicates
110
+ assert len(df[df['Sequence'].duplicated()])==0
111
+ log_update(f"\tNo remaining duplicates: {len(df[df['Sequence'].duplicated()])==0}")
112
+
113
+ # Print the final distribution of train and test values
114
+ split_str = df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'})
115
+ tot_prots = sum(split_str['count'])
116
+ split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
117
+ split_str = split_str.to_string(index=False)
118
+ split_str = "\t\t" + split_str.replace("\n","\n\t\t")
119
+ log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")
120
+
121
+ return df
122
+
123
+ def combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp):
124
+ log_update("\nCombining all four dataframes into one file of ALBATROSS sequences")
125
+
126
+ asph = asph[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'asph'})
127
+ scaled_re = scaled_re[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_re'})
128
+ scaled_rg = scaled_rg[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_rg'})
129
+ scaling_exp = scaling_exp[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaling_exp'})
130
+
131
+ combined = asph.merge(scaled_re, on='Sequence',how='outer',suffixes=('_asph', '_scaledre'))\
132
+ .merge(scaled_rg, on='Sequence',how='outer',suffixes=('_scaledre', '_scaledrg'))\
133
+ .merge(scaling_exp, on='Sequence',how='outer',suffixes=('_scaledrg', '_scalingexp')).fillna('')
134
+
135
+ # Make sure something that's in train for one is in train for all, and not test
136
+ combined['IDs'] = combined['IDs_asph']+','+combined['IDs_scaledre']+','+combined['IDs_scaledrg']+','+combined['IDs_scalingexp']
137
+ combined['UniProt_IDs'] = combined['UniProt_IDs_asph']+','+combined['UniProt_IDs_scaledre']+','+combined['UniProt_IDs_scaledrg']+','+combined['UniProt_IDs_scalingexp']
138
+ combined['UniProt_Names'] = combined['UniProt_Names_asph']+','+combined['UniProt_Names_scaledre']+','+combined['UniProt_Names_scaledrg']+','+combined['UniProt_Names_scalingexp']
139
+ combined['Split'] = combined['Split_asph']+','+combined['Split_scaledre']+','+combined['Split_scaledrg']+','+combined['Split_scalingexp']
140
+
141
+ # Make the lists clean
142
+ for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
143
+ combined[col] = combined[col].apply(lambda x: [y.strip() for y in x.split(',') if len(y)>0])
144
+ combined[col] = combined[col].apply(lambda x: ','.join(set(x)))
145
+ combined[col] = combined[col].str.strip(',')
146
+ # make sure there are no commas left
147
+ assert len(combined[combined[col].str.contains(',,')])==0
148
+ combined = combined[['Sequence','IDs','UniProt_IDs','UniProt_Names','Split','asph','scaled_re','scaled_rg','scaling_exp']] # drop unneeded merge relics
149
+ combined = combined.replace('',np.nan)
150
+ # Make sure there are no sequences where split is both train and test
151
+ log_update("\tChecking for any cases where a protein is Train for one IDR prediction task and Test for another (should NOT happen!)")
152
+ duplicates_df = combined.groupby('Sequence').agg({
153
+ 'Split': lambda x: ','.join(set(x))
154
+ }).reset_index()
155
+ duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
156
+ duplicates = duplicates_df['Sequence'].unique().tolist()
157
+ n_rows_with_duplicates = len(combined[combined['Sequence'].isin(duplicates)])
158
+ log_update(f"\t\tsequences in BOTH TRAIN AND TEST: {len(duplicates)} sequences, corresponding to {n_rows_with_duplicates} rows")
159
+ if len(duplicates)>0:
160
+ log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")
161
+
162
+ # Now, get rid of duplicates
163
+ combined = combined.drop_duplicates().reset_index(drop=True)
164
+ duplicates = combined[combined.duplicated('Sequence')]['Sequence'].unique().tolist()
165
+ log_update(f"\tDropped duplicates.\n\tTotal duplicate sequences: {len(duplicates)}\n\tTotal sequences: {len(combined)}")
166
+ assert len(duplicates)==0
167
+
168
+ # See how many columns have multiple entries for each
169
+ log_update(f"\tChecking how many sequences have multiple of the following: ID, UniProt ID, UniProt Name")
170
+ for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
171
+ n_multiple = len(combined.loc[(combined[col].notna()) & (combined[col].str.contains(','))])
172
+ log_update(f"\t\t{col}: {n_multiple}")
173
+
174
+ # See how many entries there are of each property (should match length of original database)
175
+ assert len(combined[combined['asph'].notna()])==len(asph)
176
+ assert len(combined[combined['scaled_re'].notna()])==len(scaled_re)
177
+ assert len(combined[combined['scaled_rg'].notna()])==len(scaled_rg)
178
+ assert len(combined[combined['scaling_exp'].notna()])==len(scaling_exp)
179
+ log_update("\tSequences with values for each property:")
180
+ for property in ['asph','scaled_re','scaled_rg','scaling_exp']:
181
+ log_update(f"\t\t{property}: {len(combined[combined[property].notna()])}")
182
+
183
+ log_update(f"\nPreview of combined database with columns: {combined.columns}\n{combined.head(10)}")
184
+ return combined
185
+
186
+ def main():
187
+ with open_logfile("data_cleaning_log.txt"):
188
+ # Read in all of the raw data
189
+ raw_data_folder = 'raw_data'
190
+ dtype_dict = {0:str,1:str,2:float}
191
+ rename_dict = {0:'ID',1:'Sequence',2:'Value'}
192
+
193
+ # Read in the test data
194
+ asph_test = pd.read_csv(f"{raw_data_folder}/asph_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
195
+ scaled_re_test = pd.read_csv(f"{raw_data_folder}/scaled_re_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
196
+ scaled_rg_test = pd.read_csv(f"{raw_data_folder}/scaled_rg_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
197
+ scaling_exp_test = pd.read_csv(f"{raw_data_folder}/scaling_exp_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
198
+
199
+ # Read in the train data
200
+ asph_train = pd.read_csv(f"{raw_data_folder}/asph_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
201
+ scaled_re_train = pd.read_csv(f"{raw_data_folder}/scaled_re_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
202
+ scaled_rg_train = pd.read_csv(f"{raw_data_folder}/scaled_rg_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
203
+ scaling_exp_train = pd.read_csv(f"{raw_data_folder}/scaling_exp_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
204
+
205
+ # Concatenate - include columns for split
206
+ asph_test['Split'] = ['Test']*len(asph_test)
207
+ scaled_re_test['Split'] = ['Test']*len(scaled_re_test)
208
+ scaled_rg_test['Split'] = ['Test']*len(scaled_rg_test)
209
+ scaling_exp_test['Split'] = ['Test']*len(scaling_exp_test)
210
+
211
+ asph_train['Split'] = ['Train']*len(asph_train)
212
+ scaled_re_train['Split'] = ['Train']*len(scaled_re_train)
213
+ scaled_rg_train['Split'] = ['Train']*len(scaled_rg_train)
214
+ scaling_exp_train['Split'] = ['Train']*len(scaling_exp_train)
215
+
216
+ asph = pd.concat([asph_test, asph_train])
217
+ scaled_re = pd.concat([scaled_re_test, scaled_re_train])
218
+ scaled_rg = pd.concat([scaled_rg_test, scaled_rg_train])
219
+ scaling_exp = pd.concat([scaling_exp_test, scaling_exp_train])
220
+
221
+ log_update("Initial counts:")
222
+ log_update(f"\tAsphericity: total entries={len(asph)}, not nan entries={len(asph.loc[asph['Value'].notna()])}")
223
+ log_update(f"\tScaled re: total entries={len(scaled_re)}, not nan entries={len(scaled_re.loc[scaled_re['Value'].notna()])}")
224
+ log_update(f"\tScaled rg: total entries={len(scaled_rg)}, not nan entries={len(scaled_rg.loc[scaled_rg['Value'].notna()])}")
225
+ # drop any scaled_rg rows with values less than 1, as done in the paper
226
+ scaled_rg = scaled_rg.loc[
227
+ scaled_rg['Value']>=1].reset_index(drop=True)
228
+ log_update(f"\t\tAfter dropping Rg values < 1: total entries={len(scaled_rg)}")
229
+ log_update(f"\tScaling exp: total entries={len(scaling_exp)}, not nan entries={len(scaling_exp.loc[scaling_exp['Value'].notna()])}")
230
+
231
+ # Process the raw data
232
+ log_update(f"Example raw download: asphericity\n{asph.head()}")
233
+ log_update(f"\nCleaning Asphericity")
234
+ asph = process_raw_albatross(asph)
235
+ log_update(f"\nProcessed data: asphericity\n{asph.head()}")
236
+
237
+ log_update(f"\nCleaning Scaled Re")
238
+ scaled_re = process_raw_albatross(scaled_re)
239
+ log_update(f"\nProcessed data: scaled re\n{scaled_re.head()}")
240
+
241
+ log_update(f"\nCleaning Scaled Rg")
242
+ scaled_rg = process_raw_albatross(scaled_rg)
243
+ log_update(f"\nProcessed data: scaled rg\n{scaled_rg.head()}")
244
+
245
+ log_update(f"\nCleaning Scaling Exp")
246
+ scaling_exp = process_raw_albatross(scaling_exp)
247
+ log_update(f"\nProcessed data: scaling exp\n{scaling_exp.head()}")
248
+
249
+ # Give some stats about each dataset
250
+ log_update("\nStats:")
251
+ log_update(f"# Asphericity sequences: {len(asph)}\n\tRange: {min(asph['Value']):.4f}-{max(asph['Value']):.4f}")
252
+ log_update(f"# Scaled Re sequences: {len(scaled_re)}\n\tRange: {min(scaled_re['Value']):.4f}-{max(scaled_re['Value']):.4f}")
253
+ log_update(f"# Scaled Rg sequences: {len(scaled_rg)}\n\tRange: {min(scaled_rg['Value']):.4f}-{max(scaled_rg['Value']):.4f}")
254
+ log_update(f"# Scaling Exponent sequences: {len(scaling_exp)}\n\tRange: {min(scaling_exp['Value']):.4f}-{max(scaling_exp['Value']):.4f}")
255
+
256
+ # Combine
257
+ combined = combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp)
258
+
259
+ # Save processed data
260
+ proc_folder = "processed_data"
261
+ os.makedirs(proc_folder,exist_ok=True)
262
+ combined.to_csv(f"{proc_folder}/all_albatross_seqs_and_properties.csv",index=False)
263
+
264
+ # Plot the data distribution and save it
265
+ values_dict = {
266
+ 'Asphericity': asph['Value'].tolist(),
267
+ 'End-to-End Distance (Re)': scaled_re['Value'].tolist(),
268
+ 'Radius of Gyration (Rg)': scaled_rg['Value'].tolist(),
269
+ 'Scaling Exponent': scaling_exp['Value'].tolist()
270
+ }
271
+ train_test_values_dict = {
272
+ 'Asphericity': {
273
+ 'train': asph[asph['Split']=='Train']['Value'].tolist(),
274
+ 'test': asph[asph['Split']=='Test']['Value'].tolist()},
275
+ 'End-to-End Distance (Re)': {
276
+ 'train': scaled_re[scaled_re['Split']=='Train']['Value'].tolist(),
277
+ 'test': scaled_re[scaled_re['Split']=='Test']['Value'].tolist()},
278
+ 'Radius of Gyration (Rg)': {
279
+ 'train': scaled_rg[scaled_rg['Split']=='Train']['Value'].tolist(),
280
+ 'test': scaled_rg[scaled_rg['Split']=='Test']['Value'].tolist()},
281
+ 'Scaling Exponent': {
282
+ 'train': scaling_exp[scaling_exp['Split']=='Train']['Value'].tolist(),
283
+ 'test': scaling_exp[scaling_exp['Split']=='Test']['Value'].tolist()},
284
+ }
285
+ plot_all_values_hist_grid(values_dict, save_path="processed_data/value_histograms.png")
286
+ plot_all_train_val_test_values_hist_grid(train_test_values_dict, save_path="processed_data/train_test_value_histograms.png")
287
+
288
+ if __name__ == "__main__":
289
+ main()
fuson_plm/benchmarking/idr_prediction/cluster.py ADDED
@@ -0,0 +1,94 @@
1
+ from fuson_plm.utils.logging import open_logfile, log_update
2
+ from fuson_plm.benchmarking.idr_prediction.config import CLUSTER
3
+ from fuson_plm.utils.clustering import ensure_mmseqs_in_path, process_fasta, analyze_clustering_result, make_fasta, run_mmseqs_clustering, cluster_summary
4
+ import os
5
+ import pandas as pd
6
+
7
+ def main_old():
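# main_old() clusters only the ALBATROSS Train split; the active main() below clusters both Train and Test sequences.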
8
+ # Read all the input args
9
+ LOG_PATH = "clustering_log.txt"
10
+ INPUT_PATH = CLUSTER.INPUT_PATH
11
+ MIN_SEQ_ID = CLUSTER.MIN_SEQ_ID
12
+ C = CLUSTER.C
13
+ COV_MODE = CLUSTER.COV_MODE
14
+ CLUSTER_MODE = CLUSTER.CLUSTER_MODE
15
+ PATH_TO_MMSEQS = CLUSTER.PATH_TO_MMSEQS
16
+
17
+ with open_logfile(LOG_PATH):
18
+ log_update("Input params from config.py:")
19
+ CLUSTER.print_config(indent='\t')
20
+ # Make a subfolder for clustering results, and direct MMSeqs2 outputs here
21
+ os.makedirs("clustering",exist_ok=True)
22
+ output_dir = "clustering/raw_output"
23
+
24
+ # Make fasta of input file
25
+ sequences = pd.read_csv(INPUT_PATH)
26
+ # We only want to cluster the ones in the Train split from Albatross
27
+ sequences = sequences.loc[sequences['Split']=='Train'].reset_index(drop=True)
28
+ log_update(f"\nPreparing input data (albatross TRAIN only)...\n\tdataset size: {len(sequences)} sequences")
29
+
30
+ max_seqlen = max(sequences['Sequence'].str.len().tolist())
31
+ log_update(f"\tLongest sequence in dataset: {max_seqlen} AAs")
32
+
33
+ # Unfortunately, these IDs are NOT unique. Need to add tags to them
34
+ sequences['Unique_ID'] = [f"s{i+1}" for i in range(len(sequences))]
35
+ sequences['Unique_ID'] = sequences["IDs"].apply(lambda x: "_".join(x.split(','))) + "_" + sequences['Unique_ID']
36
+ log_update("Not all IDs from the database are unique. Created unique IDs by tagging on sequence #s")
37
+ log_update(f"\tExample: {sequences.iloc[0]['Unique_ID']}")
38
+ sequences = dict(zip(sequences['Unique_ID'],sequences['Sequence']))
39
+ fasta_path = make_fasta(sequences, "clustering/input.fasta")
40
+ log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")
41
+
42
+ run_mmseqs_clustering(fasta_path, output_dir, min_seq_id=MIN_SEQ_ID, c=C, cov_mode=COV_MODE, cluster_mode=CLUSTER_MODE, path_to_mmseqs=PATH_TO_MMSEQS)
43
+
44
+ # Brief read to preview results
45
+ clusters = analyze_clustering_result('clustering/input.fasta', 'clustering/raw_output/mmseqs_cluster.tsv')
46
+ # Save clusters
47
+ clusters.to_csv('clustering/mmseqs_full_results.csv',index=False)
48
+ log_update("Processed and combined mmseqs output. Wrote comprehensive results to clustering/mmseqs_full_results.csv")
49
+ cluster_summary(clusters)
50
+
51
+ def main():
52
+ # Read all the input args
53
+ LOG_PATH = "clustering_log.txt"
54
+ INPUT_PATH = CLUSTER.INPUT_PATH
55
+ MIN_SEQ_ID = CLUSTER.MIN_SEQ_ID
56
+ C = CLUSTER.C
57
+ COV_MODE = CLUSTER.COV_MODE
58
+ CLUSTER_MODE = CLUSTER.CLUSTER_MODE
59
+ PATH_TO_MMSEQS = CLUSTER.PATH_TO_MMSEQS
60
+
61
+ with open_logfile(LOG_PATH):
62
+ log_update("Input params from config.py:")
63
+ CLUSTER.print_config(indent='\t')
64
+ # Make a subfolder for clustering results, and direct MMSeqs2 outputs here
65
+ os.makedirs("clustering",exist_ok=True)
66
+ output_dir = "clustering/raw_output"
67
+
68
+ # Make fasta of input file
69
+ sequences = pd.read_csv(INPUT_PATH)
70
+ log_update(f"\nPreparing input data (albatross train AND test sequences)...\n\tdataset size: {len(sequences)} sequences")
71
+
72
+ max_seqlen = max(sequences['Sequence'].str.len().tolist())
73
+ log_update(f"\tLongest sequence in dataset: {max_seqlen} AAs")
74
+
75
+ # Unfortunately, these IDs are NOT unique. Need to add tags to them
76
+ sequences['Unique_ID'] = [f"s{i+1}" for i in range(len(sequences))]
77
+ sequences['Unique_ID'] = sequences["IDs"].apply(lambda x: "_".join(x.split(','))) + "_" + sequences['Unique_ID']
78
+ log_update("Not all IDs from the database are unique. Created unique IDs by tagging on sequence #s")
79
+ log_update(f"\tExample: {sequences.iloc[0]['Unique_ID']}")
80
+ sequences = dict(zip(sequences['Unique_ID'],sequences['Sequence']))
81
+ fasta_path = make_fasta(sequences, "clustering/input.fasta")
82
+ log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")
83
+
84
+ run_mmseqs_clustering(fasta_path, output_dir, min_seq_id=MIN_SEQ_ID, c=C, cov_mode=COV_MODE, cluster_mode=CLUSTER_MODE, path_to_mmseqs=PATH_TO_MMSEQS)
85
+
86
+ # Brief read to preview results
87
+ clusters = analyze_clustering_result('clustering/input.fasta', 'clustering/raw_output/mmseqs_cluster.tsv')
88
+ # Save clusters
89
+ clusters.to_csv('clustering/mmseqs_full_results.csv',index=False)
90
+ log_update("Processed and combined mmseqs output. Wrote comprehensive results to clustering/mmseqs_full_results.csv")
91
+ cluster_summary(clusters)
92
+
93
+ if __name__ == "__main__":
94
+ main()
fuson_plm/benchmarking/idr_prediction/clustering/input.fasta ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6445f3d52da17cd0843cf7ea8d49a38a3f122c4c852aec512e889a855e5aa373
3
+ size 7113231
fuson_plm/benchmarking/idr_prediction/clustering/mmseqs_full_results.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fab007462284f7b9a3057d95aad0ab8257824b92ff9403b10443d790baafe5cd
3
+ size 16308334
fuson_plm/benchmarking/idr_prediction/config.py ADDED
@@ -0,0 +1,41 @@
1
+ from fuson_plm.utils.logging import CustomParams
2
+
3
+ # Clustering Parameters
4
+ # Need to be stacked, because there are 4 properties
5
+ CLUSTER = CustomParams(
6
+ # MMSeqs2 parameters: see GitHub or MMSeqs2 Wiki for guidance
7
+ MIN_SEQ_ID = 0.3, # % identity
8
+ C = 0.5, # % sequence length overlap
9
+ COV_MODE = 1, # cov-mode: 0 = bidirectional, 1 = target coverage, 2 = query coverage, 3 = target-in-query length coverage.
10
+ CLUSTER_MODE = 2,
11
+ # File paths
12
+ INPUT_PATH = 'processed_data/all_albatross_seqs_and_properties.csv',
13
+ PATH_TO_MMSEQS = '../../mmseqs' # path to where you installed MMSeqs2
14
+ )
15
+
16
+ # Here, we'll be splitting the train set into train and val. we aren't touching test
17
+ SPLIT = CustomParams(
18
+ IDR_DB_PATH = 'processed_data/all_albatross_seqs_and_properties.csv',
19
+ CLUSTER_OUTPUT_PATH = 'clustering/mmseqs_full_results.csv',
20
+ #RANDOM_STATE = 7, # random_state_1 = state for splitting all data into train & test
21
+ #VAL_SIZE = 0.10, # val size for data -> train/val split. e.g. 20 means 80% clusters in train, 20% clusters in val
22
+ RANDOM_STATE_1 = 2, # random_state_1 = state for splitting all data into train & other
23
+ TEST_SIZE_1 = 0.21, # test size for data -> train/other split. e.g. 0.20 means 80% of clusters in train, 20% in other
24
+ RANDOM_STATE_2 = 6, # random_state_2 = state for splitting other from ^ into val and test
25
+ TEST_SIZE_2 = 0.50 # fraction of the held-out clusters assigned to test in the other -> val/test split. e.g. 0.50 means 50% of held-out clusters in val, 50% in test
26
+
27
+ )
28
+
29
+ # Which models to benchmark
30
+ TRAIN = CustomParams(
31
+ BENCHMARK_FUSONPLM = True,
32
+ FUSONPLM_CKPTS= "FusOn-pLM", # Dictionary: key = run name, values = epochs, or string "FusOn-pLM"
33
+ BENCHMARK_ESM = True,
34
+
35
+ # GPU configs
36
+ CUDA_VISIBLE_DEVICES="0",
37
+
38
+ # Overwriting configs
39
+ PERMISSION_TO_OVERWRITE_EMBEDDINGS = False, # if False, script will halt if it believes these embeddings have already been made.
40
+ PERMISSION_TO_OVERWRITE_MODELS = False # if False, script will halt if it believes these models have already been trained.
41
+ )
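A quick back-of-the-envelope check of the two-stage split defined by SPLIT: TEST_SIZE_1 = 0.21 holds out 21% of clusters, and TEST_SIZE_2 = 0.50 divides that hold-out evenly between val and test, so the expected cluster-level proportions are roughly 79% / 10.5% / 10.5% (sequence-level percentages will differ slightly because clusters vary in size):

```python
# Expected cluster-level proportions implied by SPLIT (approximate; clusters differ in size).
test_size_1, test_size_2 = 0.21, 0.50
train_frac = 1 - test_size_1                  # 0.79
val_frac = test_size_1 * (1 - test_size_2)    # 0.105
test_frac = test_size_1 * test_size_2         # 0.105
print(train_frac, val_frac, test_frac)        # -> 0.79 0.105 0.105
```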
fuson_plm/benchmarking/idr_prediction/model.py ADDED
@@ -0,0 +1,154 @@
1
+ import pytorch_lightning as pl
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.utils.data import DataLoader
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+ import pickle
8
+ import torch
9
+ import esm
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import random
13
+ import io
14
+
15
+ from transformers import EsmModel, EsmTokenizer, EsmConfig, AutoTokenizer
16
+ from sklearn.metrics import roc_auc_score
17
+
18
+ # one-hot MLP model (input dimension 20, one feature per amino acid)
19
+ class ProteinMLPOneHot(pl.LightningModule):
20
+ def __init__(self):
21
+ super().__init__()
22
+ self.network = nn.Sequential(
23
+ nn.Linear(20, 8),
24
+ nn.ReLU(),
25
+ nn.LayerNorm(8),
26
+ nn.Dropout(0.2),
27
+ nn.Linear(8, 4),
28
+ nn.ReLU(),
29
+ nn.LayerNorm(4),
30
+ nn.Dropout(0.2),
31
+ nn.Linear(4, 1)
32
+ )
33
+
34
+ def forward(self, x):
35
+ x = self.network(x)
36
+ return x #pass x through linear layers with activation functions, dropout, and layernorm
37
+
38
+ def training_step(self, batch, batch_idx):
39
+ x, y = batch['Protein Input'], batch['Dimension'].float() #get batch
40
+ y_hat = self(x).squeeze(-1) #get prediction from batch
41
+ loss = F.mse_loss(y_hat, y) #calc loss from prediction and dimension of each
42
+ self.log('train_loss', loss, on_epoch=True, prog_bar=True, logger=True)
43
+ return loss
44
+
45
+ def validation_step(self, batch, batch_idx):
46
+ x, y = batch['Protein Input'], batch['Dimension'].float()
47
+ y_hat = self(x).squeeze(-1)
48
+ val_loss = F.mse_loss(y_hat, y)
49
+ self.log('val_loss', val_loss, on_epoch=True, prog_bar=True, logger=True)
50
+ return val_loss
51
+
52
+ def test_step(self, batch, batch_idx):
53
+ x, y = batch['Protein Input'], batch['Dimension'].float()
54
+ y_hat = self(x).squeeze(-1)
55
+ test_loss = F.mse_loss(y_hat, y)
56
+ self.log('test_loss', test_loss, on_epoch=True, prog_bar=True, logger=True)
57
+ return test_loss
58
+
59
+ def configure_optimizers(self):
60
+ optimizer = torch.optim.Adam(self.parameters(), lr=0.0003)
61
+ return optimizer
62
+
63
+ # def on_train_epoch_end(self):
64
+ # train_loss = self.trainer.callback_metrics['train_loss']
65
+ # print(f"Epoch {self.current_epoch + 1} - Training Loss: {train_loss:.4f}")
66
+ # wandb.log({'train_loss': train_loss, 'epoch': self.current_epoch + 1})
67
+
68
+ # def on_validation_epoch_end(self):
69
+ # val_loss = self.trainer.callback_metrics['val_loss']
70
+ # print(f"Epoch {self.current_epoch + 1} - Validation Loss: {val_loss:.4f}")
71
+ # wandb.log({'val_loss': val_loss, 'epoch': self.current_epoch + 1})
72
+
73
+ # def on_test_epoch_end(self):
74
+ # test_loss = self.trainer.callback_metrics['test_loss']
75
+ # print(f"Test Loss: {test_loss:.4f}")
76
+ # wandb.log({'test_loss': test_loss})
77
+
78
+ #regular MLP model (input 1280 (esm-2))
79
+ class ProteinMLPESM(pl.LightningModule):
80
+ def __init__(self):
81
+ super().__init__()
82
+ self.network = nn.Sequential(
83
+ nn.Linear(1280, 640),
84
+ nn.ReLU(),
85
+ nn.LayerNorm(640),
86
+ nn.Dropout(0.2),
87
+ nn.Linear(640, 320),
88
+ nn.ReLU(),
89
+ nn.LayerNorm(320),
90
+ nn.Dropout(0.2),
91
+ nn.Linear(320, 1)
92
+ )
93
+
94
+ def forward(self, x):
95
+ x = self.network(x)
96
+ return x #pass x through linear layers with activation functions, dropout, and layernorm
97
+
98
+ def training_step(self, batch, batch_idx):
99
+ x, y = batch['Protein Input'], batch['Dimension'].float() #get batch
100
+ y_hat = self(x).squeeze(-1) #get prediction from batch
101
+ loss = F.mse_loss(y_hat, y) #calc loss from prediction and dimension of each
102
+ self.log('train_loss', loss, on_epoch=True, prog_bar=True, logger=True)
103
+ return loss
104
+
105
+ def validation_step(self, batch, batch_idx):
106
+ x, y = batch['Protein Input'], batch['Dimension'].float()
107
+ y_hat = self(x).squeeze(-1)
108
+ val_loss = F.mse_loss(y_hat, y)
109
+ self.log('val_loss', val_loss, on_epoch=True, prog_bar=True, logger=True)
110
+ return val_loss
111
+
112
+ def test_step(self, batch, batch_idx):
113
+ x, y = batch['Protein Input'], batch['Dimension'].float()
114
+ y_hat = self(x).squeeze(-1)
115
+ test_loss = F.mse_loss(y_hat, y)
116
+ self.log('test_loss', test_loss, on_epoch=True, prog_bar=True, logger=True)
117
+ return test_loss
118
+
119
+ def configure_optimizers(self):
120
+ optimizer = torch.optim.Adam(self.parameters(), lr=0.0003)
121
+ return optimizer
122
+
123
+ # def on_train_epoch_end(self):
124
+ # train_loss = self.trainer.callback_metrics['train_loss']
125
+ # print(f"Epoch {self.current_epoch + 1} - Training Loss: {train_loss:.4f}")
126
+ # wandb.log({'train_loss': train_loss, 'epoch': self.current_epoch + 1})
127
+
128
+ # def on_validation_epoch_end(self):
129
+ # val_loss = self.trainer.callback_metrics['val_loss']
130
+ # print(f"Epoch {self.current_epoch + 1} - Validation Loss: {val_loss:.4f}")
131
+ # wandb.log({'val_loss': val_loss, 'epoch': self.current_epoch + 1})
132
+
133
+ # def on_test_epoch_end(self):
134
+ # test_loss = self.trainer.callback_metrics['test_loss']
135
+ # print(f"Test Loss: {test_loss:.4f}")
136
+ # wandb.log({'test_loss': test_loss})
137
+
138
+
139
+ class LossTrackerCallback(pl.Callback):
140
+ def __init__(self):
141
+ self.train_losses = []
142
+ self.val_losses = []
143
+
144
+ def on_train_epoch_end(self, trainer, pl_module):
145
+ # Access the most recent training loss from the logger
146
+ train_loss = trainer.callback_metrics.get('train_loss')
147
+ if train_loss is not None: # explicit None check so a loss of exactly 0.0 is still recorded
148
+ self.train_losses.append(train_loss.item())
149
+
150
+ def on_validation_epoch_end(self, trainer, pl_module):
151
+ # Access the most recent validation loss from the logger
152
+ val_loss = trainer.callback_metrics.get('val_loss')
153
+ if val_loss is not None: # explicit None check so a loss of exactly 0.0 is still recorded
154
+ self.val_losses.append(val_loss.item())
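Neither MLP is trained inside this file; the training loop lives elsewhere in the benchmark. As a minimal, self-contained sketch (not the repository's training script), ProteinMLPESM and LossTrackerCallback could be wired together as below, assuming precomputed 1280-d embeddings and the batch format used above ({'Protein Input': embedding, 'Dimension': target}):

```python
# Illustrative sketch only: a stand-in dataset plus a basic Lightning training loop.
import torch
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader

class EmbeddingRegressionDataset(Dataset):
    """Wraps precomputed (N, 1280) embeddings and (N,) property values as dict batches."""
    def __init__(self, embeddings, values):
        self.embeddings = torch.as_tensor(embeddings, dtype=torch.float32)
        self.values = torch.as_tensor(values, dtype=torch.float32)

    def __len__(self):
        return len(self.values)

    def __getitem__(self, idx):
        return {'Protein Input': self.embeddings[idx], 'Dimension': self.values[idx]}

def fit_sketch(train_emb, train_y, val_emb, val_y, max_epochs=30):
    train_loader = DataLoader(EmbeddingRegressionDataset(train_emb, train_y), batch_size=64, shuffle=True)
    val_loader = DataLoader(EmbeddingRegressionDataset(val_emb, val_y), batch_size=64)
    model = ProteinMLPESM()
    loss_tracker = LossTrackerCallback()
    trainer = pl.Trainer(max_epochs=max_epochs, callbacks=[loss_tracker], logger=False)
    trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
    return model, loss_tracker
```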
fuson_plm/benchmarking/idr_prediction/plot.py ADDED
@@ -0,0 +1,204 @@
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+ import pandas as pd
4
+ import numpy as np
5
+ import os
6
+ from sklearn.metrics import r2_score
7
+ import matplotlib.colors as mcolors
8
+ from fuson_plm.utils.visualizing import set_font
9
+
10
+ global default_cmap_dict
11
+ default_cmap_dict = {
12
+ 'Asphericity': '#785EF0',
13
+ 'End-to-End Distance (Re)': '#DC267F',
14
+ 'Radius of Gyration (Rg)': '#FE6100',
15
+ 'Scaling Exponent': '#FFB000'
16
+ }
17
+
18
+ # Method for lengthening the model name
19
+ def lengthen_model_name(model_name, model_epoch):
20
+ if 'esm' in model_name:
21
+ return model_name
22
+
23
+ return f'{model_name}_e{model_epoch}'
24
+
25
+ def plot_train_val_test_values_hist(train_values_list, val_values_list, test_values_list, dataset_name="Data", color="black", save_path=None, ax=None):
26
+ """
27
+ Plot Histogram to show the ranges of values
28
+ """
29
+ set_font()
30
+ if ax is None:
31
+ fig, ax = plt.subplots(1, 1, figsize=(6, 4), dpi=300)
32
+
33
+ total_seqs = len(train_values_list) + (len(val_values_list) if val_values_list is not None else 0) + (len(test_values_list) if test_values_list is not None else 0) # val/test may be None
34
+ ax.hist(train_values_list, color=color, alpha=0.7,label=f"train (n={len(train_values_list)})")
35
+ if not(test_values_list is None):
36
+ ax.hist(test_values_list, color='black',alpha=0.7,label=f"test (n={len(test_values_list)})")
37
+ if not(val_values_list is None):
38
+ ax.hist(val_values_list, color='grey',alpha=0.7,label=f"val (n={len(val_values_list)})")
39
+ ax.grid(True)
40
+ ax.set_axisbelow(True)
41
+ ax.set_title(f'{dataset_name} Distribution (n={total_seqs})')
42
+ ax.set_xlabel(dataset_name)
43
+ ax.legend()
44
+ plt.tight_layout()
45
+
46
+ if save_path is not None:
47
+ plt.savefig(save_path)
48
+
49
+ def plot_values_hist(values_list, dataset_name="Data", color="black", save_path=None, ax=None):
50
+ """
51
+ Plot Histogram to show the ranges of values
52
+ """
53
+ set_font()
54
+ if ax is None:
55
+ fig, ax = plt.subplots(1, 1, figsize=(6, 4), dpi=300)
56
+
57
+ ax.hist(values_list, color=color)
58
+ ax.grid(True)
59
+ ax.set_axisbelow(True)
60
+ ax.set_title(f'{dataset_name} Distribution')
61
+ ax.set_xlabel(dataset_name)
62
+ plt.tight_layout()
63
+
64
+ if save_path is not None:
65
+ plt.savefig(save_path)
66
+
67
+ def plot_all_values_hist_grid(values_dict, cmap_dict=default_cmap_dict, save_path="processed_data/value_histograms.png"):
68
+ """
69
+ Args:
70
+ values_dict: dictionary where keys are dataset names and values are value lists
71
+ cmap_dict: dictionary where keys are dataset names (same as in values_dict) and values are plot colors
72
+ """
73
+
74
+ fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300)
75
+ axes = axes.flatten()
76
+
77
+ for i, (dataset_name, values_list) in enumerate(values_dict.items()):
78
+ ax = axes[i]
79
+ plot_values_hist(values_list, dataset_name=dataset_name, color=cmap_dict[dataset_name], ax=ax)
80
+
81
+ fig.set_tight_layout(True)
82
+ fig.savefig(save_path)
83
+
84
+
85
+ def plot_all_train_val_test_values_hist_grid(values_dict, cmap_dict=default_cmap_dict, save_path="processed_data/value_histograms.png"):
86
+ """
87
+ Args:
88
+ values_dict: dictionary where keys are dataset names and values are another dict: {'train': [...], 'val': [...], 'test': [...]} ('val' and 'test' are optional)
89
+ cmap_dict: dictionary where keys are dataset names (same as in values_dict) and values are plot colors
90
+ """
91
+
92
+ fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300)
93
+ axes = axes.flatten()
94
+
95
+ for i, (dataset_name, train_val_test_dict) in enumerate(values_dict.items()):
96
+ ax = axes[i]
97
+ train_values_list = train_val_test_dict['train']
98
+ test_values_list, val_values_list = None, None
99
+ if 'test' in train_val_test_dict:
100
+ test_values_list = train_val_test_dict['test']
101
+ if 'val' in train_val_test_dict:
102
+ val_values_list = train_val_test_dict['val']
103
+ plot_train_val_test_values_hist(train_values_list, val_values_list, test_values_list, dataset_name=dataset_name, color=cmap_dict[dataset_name], ax=ax)
104
+
105
+ fig.set_tight_layout(True)
106
+ fig.savefig(save_path)
107
+
108
+ #only need to change labels at bottom depending on what embeddings+dimension is being looked at
109
+ def plot_r2(model_type, idr_property, test_preds, save_path):
110
+ set_font()
111
+
112
+ # prepare ylabels from idr_property
113
+ ylabel_dict = {'asph': 'Asphericity',
114
+ 'scaled_re': 'End-to-End Distance, $R_e$',
115
+ 'scaled_rg': 'Radius of Gyration, $R_g$',
116
+ 'scaling_exp': 'Polymer Scaling Exponent'}
117
+ y_unitlabel_dict = {'asph': 'Asphericity',
118
+ 'scaled_re': '$R_e$ (Å)',
119
+ 'scaled_rg': '$R_g$ (Å)',
120
+ 'scaling_exp': 'Exponent'
121
+
122
+ }
123
+ y_label = ylabel_dict[idr_property]
124
+ y_unitlabel = y_unitlabel_dict[idr_property]
125
+
126
+ # get true values and predictions
127
+ true_values = test_preds['true_values'].tolist()
128
+ predictions = test_preds['predictions'].tolist()
129
+
130
+ # save this source data, including the IDs of the sequences
131
+ test_df = pd.read_csv(f"splits/{idr_property}/test_df.csv")
132
+ processed_data = pd.read_csv("processed_data/all_albatross_seqs_and_properties.csv")
133
+ seq_id_dict = dict(zip(processed_data['Sequence'],processed_data['IDs']))
134
+ test_df['IDs'] = test_df['Sequence'].map(seq_id_dict)
135
+ test_df_with_preds = test_preds[['true_values','predictions']].copy() # copy to avoid SettingWithCopyWarning when adding the IDs column
136
+ test_df_with_preds['IDs'] = test_df['IDs']
137
+ print("number of sequences with no ID: ", len(test_df_with_preds.loc[test_df_with_preds['IDs'].isna()]))
138
+ test_df_with_preds.to_csv(save_path.replace(".png","_source_data.csv"),index=False)
139
+
140
+ r2 = r2_score(true_values, predictions)
141
+
142
+ # Plotting
143
+ plt.figure(figsize=(10, 8))
144
+ plt.scatter(true_values, predictions, alpha=0.5, label='Predictions')
145
+ plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], 'r--', label='Ideal Fit')
146
+ plt.text(0.65, 0.35, f"$R^2$ = {r2:.2f}", transform=plt.gca().transAxes, fontsize=44)
147
+ # Adjusting font sizes and setting font properties
148
+ plt.xlabel(f'True {y_unitlabel}',size=44)
149
+ plt.ylabel(f'Predicted {y_unitlabel}',size=44)
150
+ plt.title(f"{y_label}",size=50) #: {model_type}\n($R^2$={r2:.2f})",size=44)
151
+
152
+ # Create legend and set font properties
153
+ legend = plt.legend(fontsize=32)
154
+ for text in legend.get_texts():
155
+ text.set_fontsize(32)
156
+
157
+ # Adjust marker size in the legend
158
+ for handle in legend.legendHandles:
159
+ handle._sizes = [100]
160
+
161
+ # Enable grid
162
+ plt.grid(True)
163
+
164
+ # Adjusting tick labels font size
165
+ plt.xticks(fontsize=36)
166
+ plt.yticks(fontsize=36)
167
+
168
+ # Setting font properties for tick labels (another way to adjust them individually)
169
+ for label in plt.gca().get_xticklabels():
170
+ label.set_fontsize(32)
171
+
172
+ for label in plt.gca().get_yticklabels():
173
+ label.set_fontsize(32)
174
+
175
+ plt.tight_layout()
176
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
177
+ plt.show()
178
+
179
+ def plot_all_r2(output_dir, idr_properties):
180
+ for idr_property in idr_properties:
181
+ # make the R^2 Plots for the BEST one
182
+ best_results = pd.read_csv(f"{output_dir}/{idr_property}_best_test_r2.csv")
183
+ model_type_to_path_dict = dict(zip(best_results['model_type'],best_results['path_to_model']))
184
+ for model_type, path_to_model in model_type_to_path_dict.items():
185
+ model_preds_folder = path_to_model.split('/best-checkpoint.ckpt')[0]
186
+ test_preds = pd.read_csv(f"{model_preds_folder}/{idr_property}_test_predictions.csv")
187
+
188
+ # make paths for R^2 plots
189
+ if not os.path.exists(f"{output_dir}/r2_plots"):
190
+ os.makedirs(f"{output_dir}/r2_plots")
191
+ os.makedirs(f"{output_dir}/r2_plots/{idr_property}", exist_ok=True)
192
+
193
+ model_type_dict = {
194
+ 'fuson_plm': 'FusOn-pLM',
195
+ 'esm2_t33_650M_UR50D': 'ESM-2'
196
+ }
197
+ r2_save_path = f"{output_dir}/r2_plots/{idr_property}/{model_type}_{idr_property}_R2.png"
198
+ plot_r2(model_type_dict[model_type], idr_property, test_preds, r2_save_path)
199
+
200
+ def main():
201
+ plot_all_r2("results/final", ["asph","scaled_re","scaled_rg","scaling_exp"])
202
+
203
+ if __name__ == '__main__':
204
+ main()
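For reference, a hedged usage sketch of the histogram grid defined above. It assumes the processed CSV keeps the asph / scaled_re / scaled_rg / scaling_exp columns referenced by split.py, and the dictionary keys must match default_cmap_dict:

```python
# Illustrative sketch only: regenerate the 2x2 value-histogram grid from the processed ALBATROSS dataset.
import pandas as pd
from fuson_plm.benchmarking.idr_prediction.plot import plot_all_values_hist_grid

df = pd.read_csv("processed_data/all_albatross_seqs_and_properties.csv")
values_dict = {
    'Asphericity': df['asph'].dropna().tolist(),
    'End-to-End Distance (Re)': df['scaled_re'].dropna().tolist(),
    'Radius of Gyration (Rg)': df['scaled_rg'].dropna().tolist(),
    'Scaling Exponent': df['scaling_exp'].dropna().tolist(),
}
plot_all_values_hist_grid(values_dict, save_path="processed_data/value_histograms.png")
```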
fuson_plm/benchmarking/idr_prediction/processed_data/all_albatross_seqs_and_properties.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cb327d8990d40b566a399f3e075c6d5ccc73ca4203bc9dd9a94c00675e4dda5
3
+ size 9552773
fuson_plm/benchmarking/idr_prediction/processed_data/train_test_value_histograms.png ADDED
fuson_plm/benchmarking/idr_prediction/processed_data/value_histograms.png ADDED
fuson_plm/benchmarking/idr_prediction/raw_data/asph_bio_synth_training_data_cleaned_05_09_2023.tsv ADDED
The diff for this file is too large to render. See raw diff
 
fuson_plm/benchmarking/idr_prediction/raw_data/asph_nat_meth_test.tsv ADDED
The diff for this file is too large to render. See raw diff
 
fuson_plm/benchmarking/idr_prediction/raw_data/scaled_re_bio_synth_training_data_cleaned_05_09_2023.tsv ADDED
The diff for this file is too large to render. See raw diff
 
fuson_plm/benchmarking/idr_prediction/raw_data/scaled_re_nat_meth_test.tsv ADDED
The diff for this file is too large to render. See raw diff
 
fuson_plm/benchmarking/idr_prediction/raw_data/scaled_rg_bio_synth_training_data_cleaned_05_09_2023.tsv ADDED
The diff for this file is too large to render. See raw diff
 
fuson_plm/benchmarking/idr_prediction/raw_data/scaled_rg_nat_meth_test.tsv ADDED
The diff for this file is too large to render. See raw diff
 
fuson_plm/benchmarking/idr_prediction/raw_data/scaling_exp_bio_synth_training_data_cleaned_05_09_2023.tsv ADDED
The diff for this file is too large to render. See raw diff
 
fuson_plm/benchmarking/idr_prediction/raw_data/scaling_exp_nat_meth_test.tsv ADDED
The diff for this file is too large to render. See raw diff
 
fuson_plm/benchmarking/idr_prediction/results/final/asph_best_test_r2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7eb01f34593fd5ae38d48ac392295ce81f5cd662671290bbbf854f9ed2d7db49
3
+ size 334
fuson_plm/benchmarking/idr_prediction/results/final/asph_hyperparam_screen_test_r2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42aaa45520244fb734bc58ea7e7176d54d0087e28fbcbcbed09f796868207b28
3
+ size 2770
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/asph/esm2_t33_650M_UR50D_asph_R2.png ADDED
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/asph/esm2_t33_650M_UR50D_asph_R2_source_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4510bc0b24e5a81ef38f7baabe2ec31741d4bace254be9b949e1d4753edb5f4b
3
+ size 155775
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/asph/fuson_plm_asph_R2.png ADDED
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/asph/fuson_plm_asph_R2_source_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36357c139aff2d7a1ef869c4fa82b7f9a6bcf87298db7df59a8198a2c5b2d793
3
+ size 155818
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_re/esm2_t33_650M_UR50D_scaled_re_R2.png ADDED
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_re/esm2_t33_650M_UR50D_scaled_re_R2_source_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64e6abb6c5b774ee9971885605c75bf6d0fec4aeea405b20c129ef0ecc21f1ef
3
+ size 176485
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_re/fuson_plm_scaled_re_R2.png ADDED
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_re/fuson_plm_scaled_re_R2_source_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:865b269369ce7903825d8225cb887ce226e2cd3b882a18c3bdad642cef769bd3
3
+ size 176364
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_rg/esm2_t33_650M_UR50D_scaled_rg_R2.png ADDED
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_rg/esm2_t33_650M_UR50D_scaled_rg_R2_source_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f362950ad91f14acffe261810bf6d7b2f484a16cff1ee31d1f2ea65f271a0725
3
+ size 93933
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_rg/fuson_plm_scaled_rg_R2.png ADDED
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaled_rg/fuson_plm_scaled_rg_R2_source_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:469d1127b315f0e6b3091a644dff3d5bf658dae6daa0155cad70bd108132ba60
3
+ size 93867
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaling_exp/esm2_t33_650M_UR50D_scaling_exp_R2.png ADDED
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaling_exp/esm2_t33_650M_UR50D_scaling_exp_R2_source_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44333a3310c7281e5e0b83bc90b2daec76ce13528c012d54f6773f4ee5fca91b
3
+ size 131908
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaling_exp/fuson_plm_scaling_exp_R2.png ADDED
fuson_plm/benchmarking/idr_prediction/results/final/r2_plots/scaling_exp/fuson_plm_scaling_exp_R2_source_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:964cbefc62e9f4671e0502dd86fda8f079d78a2a5c637fa920bc023151169bf6
3
+ size 131764
fuson_plm/benchmarking/idr_prediction/results/final/scaled_re_best_test_r2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82cfbb8a8bcf31e4bdf21e3fbe0454366677878f08fabcf141ac6b284936fe1d
3
+ size 343
fuson_plm/benchmarking/idr_prediction/results/final/scaled_re_hyperparam_screen_test_r2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54b1f8503d37796b9463227cbe70b7b3a0a27f26274e409e3a2e04fa3c2f850a
3
+ size 2871
fuson_plm/benchmarking/idr_prediction/results/final/scaled_rg_best_test_r2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:460abf92fcad200cae609c973e96b1d4e56fe64cdb53449a9a3a22d64293716c
3
+ size 343
fuson_plm/benchmarking/idr_prediction/results/final/scaled_rg_hyperparam_screen_test_r2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a728bf8f1f9b36a7d7b3ae49d24ad9eeb084aa342c01b87d72427442d730683
3
+ size 2867
fuson_plm/benchmarking/idr_prediction/results/final/scaling_exp_best_test_r2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9de21dbfd32a9e5b751a61025b22a248485cf25d9e359174262e7984ecb35c17
3
+ size 346
fuson_plm/benchmarking/idr_prediction/results/final/scaling_exp_hyperparam_screen_test_r2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:763b89d2e1fb43fe855e42ed12da5786ebcd297fe5e31b2a238f852a22700eb6
3
+ size 2912
fuson_plm/benchmarking/idr_prediction/split.py ADDED
@@ -0,0 +1,135 @@
1
+ from fuson_plm.utils.logging import open_logfile, log_update
2
+ from fuson_plm.utils.visualizing import set_font
3
+ from fuson_plm.benchmarking.idr_prediction.config import SPLIT
4
+ from fuson_plm.utils.splitting import split_clusters, check_split_validity
5
+ import os
6
+ import pandas as pd
7
+
8
+ def get_training_dfs(train, val, test, idr_db):
9
+ """
10
+ Remove unnecessary columns for efficient storing of train, validation, and test sets for benchmarking.
11
+ Also, add the values using idr_db
12
+ """
13
+ log_update('\nMaking dataframes for IDR prediction benchmark...')
14
+
15
+ # Delete cluster-related columns we don't need
16
+ train = train.drop(columns=['representative seq_id','member seq_id', 'member length', 'representative seq']).rename(columns={'member seq':'Sequence'})
17
+ val = val.drop(columns=['representative seq_id','member seq_id', 'member length', 'representative seq']).rename(columns={'member seq':'Sequence'})
18
+ test = test.drop(columns=['representative seq_id','member seq_id', 'member length', 'representative seq']).rename(columns={'member seq':'Sequence'})
19
+
20
+ # Add values and make one df for each one
21
+ # idr_db values are in columns: asph,scaled_re,scaled_rg,scaling_exp
22
+ value_cols = ['asph','scaled_re','scaled_rg','scaling_exp']
23
+ return_dict = {}
24
+ for col in value_cols:
25
+ temp_train = pd.merge(train, idr_db[['Sequence',col]], on='Sequence',how='left').rename(columns={col:'Value'}).dropna(subset='Value')
26
+ temp_val = pd.merge(val, idr_db[['Sequence',col]], on='Sequence',how='left').rename(columns={col:'Value'}).dropna(subset='Value')
27
+ temp_test = pd.merge(test, idr_db[['Sequence',col]], on='Sequence',how='left').rename(columns={col:'Value'}).dropna(subset='Value')
28
+ return_dict[col] = {
29
+ 'train': temp_train,
30
+ 'val': temp_val,
31
+ 'test': temp_test
32
+ }
33
+
34
+ return return_dict
35
+
36
+ def main():
37
+ """
38
+ """
39
+ # Read all the input files
40
+ LOG_PATH = "splitting_log.txt"
41
+ IDR_DB_PATH = SPLIT.IDR_DB_PATH
42
+ CLUSTER_OUTPUT_PATH = SPLIT.CLUSTER_OUTPUT_PATH
43
+ RANDOM_STATE_1 = SPLIT.RANDOM_STATE_1
44
+ TEST_SIZE_1 = SPLIT.TEST_SIZE_1
45
+ RANDOM_STATE_2 = SPLIT.RANDOM_STATE_2
46
+ TEST_SIZE_2 = SPLIT.TEST_SIZE_2
47
+
48
+ # set font
49
+ set_font()
50
+
51
+ # Prepare the log file
52
+ with open_logfile(LOG_PATH):
53
+ log_update("Loaded data-splitting configurations from config.py")
54
+ SPLIT.print_config(indent='\t')
55
+
56
+ # Prepare directory to save results
57
+ os.makedirs("splits",exist_ok=True)
58
+
59
+ # Read the clusters and get a list of the representative IDs for splitting
60
+ clusters = pd.read_csv(CLUSTER_OUTPUT_PATH)
61
+ reps = clusters['representative seq_id'].unique().tolist()
62
+ log_update(f"\nPreparing clusters...\n\tCollected {len(reps)} clusters for splitting")
63
+
64
+ # Make the splits and extract the results
65
+ splits = split_clusters(reps,
66
+ random_state_1 = RANDOM_STATE_1, test_size_1 = TEST_SIZE_1,
67
+ random_state_2= RANDOM_STATE_2, test_size_2 = TEST_SIZE_2)
68
+ X_train = splits['X_train']
69
+ X_val = splits['X_val']
70
+ X_test = splits['X_test']
71
+
72
+ # Make slices of clusters dataframe for train, val, and test
73
+ train_clusters = clusters.loc[clusters['representative seq_id'].isin(X_train)].reset_index(drop=True)
74
+ val_clusters = clusters.loc[clusters['representative seq_id'].isin(X_val)].reset_index(drop=True)
75
+ test_clusters = clusters.loc[clusters['representative seq_id'].isin(X_test)].reset_index(drop=True)
76
+
77
+ # Check validity
78
+ check_split_validity(train_clusters, val_clusters, test_clusters)
79
+
80
+ # Print min and max sequence lengths
81
+ min_train_seqlen = min(train_clusters['member seq'].str.len())
82
+ max_train_seqlen = max(train_clusters['member seq'].str.len())
83
+ min_val_seqlen = min(val_clusters['member seq'].str.len())
84
+ max_val_seqlen = max(val_clusters['member seq'].str.len())
85
+ min_test_seqlen = min(test_clusters['member seq'].str.len())
86
+ max_test_seqlen = max(test_clusters['member seq'].str.len())
87
+ log_update(f"\nLength breakdown summary...\n\tTrain: min seq length = {min_train_seqlen}, max seq length = {max_train_seqlen}")
88
+ log_update(f"\nVal: min seq length = {min_val_seqlen}, max seq length = {max_val_seqlen}")
89
+ log_update(f"\nTest: min seq length = {min_test_seqlen}, max seq length = {max_test_seqlen}")
90
+
91
+ # cols = representative seq_id,member seq_id,representative seq,member seq
92
+ train_clusters.to_csv("splits/train_cluster_split.csv",index=False)
93
+ val_clusters.to_csv("splits/val_cluster_split.csv",index=False)
94
+ test_clusters.to_csv("splits/test_cluster_split.csv",index=False)
95
+ log_update('\nSaved cluster splits to splits/train_cluster_split.csv, splits/val_cluster_split.csv, splits/test_cluster_split.csv')
96
+ cols=','.join(list(train_clusters.columns))
97
+ log_update(f'\tColumns: {cols}')
98
+
99
+ # Get final dataframes for training, and check their distributions
100
+ idr_db = pd.read_csv(IDR_DB_PATH)
101
+ train_dfs_dict = get_training_dfs(train_clusters, val_clusters, test_clusters, idr_db)
102
+ os.makedirs('splits',exist_ok=True)
103
+ train_test_values_dict = {}
104
+ idr_property_name_dict = {'asph':'Asphericity','scaled_re':'End-to-End Distance (Re)','scaled_rg':'Radius of Gyration (Rg)','scaling_exp':'Scaling Exponent'}
105
+
106
+ for idr_property, dfs in train_dfs_dict.items():
107
+ os.makedirs(f"splits/{idr_property}", exist_ok=True)
108
+ train_df = dfs['train']
109
+ val_df = dfs['val']
110
+ test_df = dfs['test']
111
+
112
+ total_seqs = len(train_df)+len(val_df)+len(test_df)
113
+ train_df.to_csv(f"splits/{idr_property}/train_df.csv",index=False)
114
+ val_df.to_csv(f"splits/{idr_property}/val_df.csv",index=False)
115
+ test_df.to_csv(f"splits/{idr_property}/test_df.csv",index=False)
116
+ log_update(f"\nSaved {idr_property} training dataframes to splits/{idr_property}/train_df.csv, splits/{idr_property}/val_df.csv splits/test_df.csv")
117
+ log_update(f"\tTrain sequences: {len(train_df)} ({100*len(train_df)/total_seqs:.2f}%)")
118
+ log_update(f"\tVal sequences: {len(val_df)} ({100*len(val_df)/total_seqs:.2f}%)")
119
+ log_update(f"\tTest sequences: {len(test_df)} ({100*len(test_df)/total_seqs:.2f}%)")
120
+ log_update(f"\tTotal: {total_seqs}")
121
+
122
+ # Make sure the lengths are right
123
+ log_update(f"\tSequences with a {idr_property} value in the processed data: {len(idr_db[idr_db[idr_property].notna()])}")
124
+ assert total_seqs == len(idr_db[idr_db[idr_property].notna()])
125
+
126
+ train_test_values_dict[
127
+ idr_property_name_dict[idr_property]
128
+ ] = {
129
+ 'train': train_df['Value'].tolist(),
130
+ 'val': val_df['Value'].tolist(),
131
+ 'test': test_df['Value'].tolist()
132
+ }
133
+
134
+ if __name__ == "__main__":
135
+ main()
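main() assembles train_test_values_dict but never plots it; the train/val/test histogram figure in processed_data/ is presumably produced from these values. A hedged, self-contained sketch of that step, rebuilding the distributions from the saved split CSVs and using the grid helper from plot.py, would be:

```python
# Illustrative sketch only: plot train/val/test value distributions from the splits written above.
# Paths, columns, and the save location mirror split.py and processed_data/train_test_value_histograms.png,
# but this is not necessarily how the shipped figure was generated.
import pandas as pd
from fuson_plm.benchmarking.idr_prediction.plot import plot_all_train_val_test_values_hist_grid

name_dict = {'asph': 'Asphericity', 'scaled_re': 'End-to-End Distance (Re)',
             'scaled_rg': 'Radius of Gyration (Rg)', 'scaling_exp': 'Scaling Exponent'}
values_dict = {}
for prop, pretty_name in name_dict.items():
    values_dict[pretty_name] = {
        split: pd.read_csv(f"splits/{prop}/{split}_df.csv")['Value'].tolist()
        for split in ['train', 'val', 'test']
    }
plot_all_train_val_test_values_hist_grid(
    values_dict, save_path="processed_data/train_test_value_histograms.png")
```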
fuson_plm/benchmarking/idr_prediction/splits/asph/test_df.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa7f71e8371fab3cea404394674f551a720b0880fd977811b9729f46c09ab3eb
3
+ size 660037
fuson_plm/benchmarking/idr_prediction/splits/asph/train_df.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf500418ec8fea09477a25e44c385d1e0163b4089b21e63ab3002dbe3e0679a8
3
+ size 5235827
fuson_plm/benchmarking/idr_prediction/splits/asph/val_df.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:011ca834d11d0a2bf024af0e241032562d67fe9241e5e337e6a1739a468611fa
3
+ size 653940
fuson_plm/benchmarking/idr_prediction/splits/scaled_re/test_df.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:840c2174c82f83d2f6f2e994d94eef569379f79c1e1a837563a79489e65a2685
3
+ size 582519