# Python file for making embeddings from a FusOn-pLM model for any dataset from fuson_plm.utils.embedding import get_esm_embeddings, load_esm2_type, redump_pickle_dictionary, load_prott5, get_prott5_embeddings from fuson_plm.utils.logging import log_update, open_logfile, print_configpy from fuson_plm.utils.data_cleaning import find_invalid_chars from fuson_plm.utils.constants import VALID_AAS from fuson_plm.training.model import FusOnpLM from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel import logging import torch import pickle import os import pandas as pd import numpy as np def validate_sequence_col(df, seq_col): # if column isn't there, error if seq_col not in list(df.columns): raise Exception("Error: provided sequence column does not exist in the input dataframe") # if column contains invalid characters, error df['invalid_chars'] = df[seq_col].apply(lambda x: find_invalid_chars(x, VALID_AAS)) all_invalid_chars = set().union(*df['invalid_chars']) df = df.drop(columns=['invalid_chars']) if len(all_invalid_chars)>0: raise Exception(f"Error: invalid characters {all_invalid_chars} found in the sequence column") # make sure there are no duplicates sequences = df[seq_col] if len(set(sequences))