# Python file for making embeddings from a FusOn-pLM model (or baseline pLMs) for any dataset
from fuson_plm.utils.embedding import get_esm_embeddings, load_esm2_type, redump_pickle_dictionary, load_prott5, get_prott5_embeddings
from fuson_plm.utils.logging import log_update, open_logfile, print_configpy
from fuson_plm.utils.data_cleaning import find_invalid_chars
from fuson_plm.utils.constants import VALID_AAS
from fuson_plm.training.model import FusOnpLM
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel
import logging
import torch
import pickle
import os
import pandas as pd
import numpy as np
def validate_sequence_col(df, seq_col):
# if column isn't there, error
    if seq_col not in df.columns:
raise Exception("Error: provided sequence column does not exist in the input dataframe")
# if column contains invalid characters, error
    all_invalid_chars = set().union(*df[seq_col].apply(lambda x: find_invalid_chars(x, VALID_AAS)))
if len(all_invalid_chars)>0:
raise Exception(f"Error: invalid characters {all_invalid_chars} found in the sequence column")
# make sure there are no duplicates
sequences = df[seq_col]
if len(set(sequences))<len(sequences): log_update("\tWARNING: input data has duplicate sequences")
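# Example usage (illustrative sketch; the dataframe below is a stand-in, not project data):
#   demo_df = pd.DataFrame({'aa_seq': ['MKTAYIA', 'MKTAYIA', 'GLVPRGS']})
#   validate_sequence_col(demo_df, 'aa_seq')  # logs a duplicate-sequence warning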
def load_fuson_model(ckpt_path):
# Suppress warnings about newly initialized 'esm.pooler.dense.bias', 'esm.pooler.dense.weight' layers - these are not used to extract embeddings
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load model
model = AutoModel.from_pretrained(ckpt_path) # initialize model
tokenizer = AutoTokenizer.from_pretrained(ckpt_path) # initialize tokenizer
# Model to device and in eval mode
model.to(device)
model.eval() # disables dropout for deterministic results
return model, tokenizer, device
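# Example usage (illustrative sketch; the checkpoint path is hypothetical):
#   model, tokenizer, device = load_fuson_model("../../training/checkpoints/best_model/checkpoint_epoch_1")
#   print(model.config.hidden_size)  # embedding dimension of the loaded encoder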
def get_fuson_embeddings(model, tokenizer, sequences, device, average=True, print_updates=False, savepath=None, save_at_end=False, max_length=2000):
# Correct save path to pickle if necessary
if savepath is not None:
        if not savepath.endswith('.pkl'): savepath += '.pkl'
if print_updates: log_update(f"Dataset contains {len(sequences)} sequences.")
    # If no max length was passed, default to the longest sequence in the dataset (+2 for BOS and EOS tokens)
    if max_length is None: max_length = max(len(s) for s in sequences) + 2
    # Initialize an empty dict to store the FusOn-pLM embeddings
embedding_dict = {}
    # Iterate through the sequences
    for i, sequence in enumerate(sequences):
# Get the embeddings
with torch.no_grad():
# Tokenize the input sequence
            inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
inputs = {k: v.to(device) for k, v in inputs.items()}
outputs = model(**inputs)
# The embeddings are in the last_hidden_state tensor
embedding = outputs.last_hidden_state
# remove extra dimension
embedding = embedding.squeeze(0)
# remove BOS and EOS tokens
embedding = embedding[1:-1, :]
            # Move the embedding to CPU and convert it to a numpy array
embedding = embedding.cpu().numpy()
# Average (if necessary)
if average:
embedding = embedding.mean(0)
# Add to dictionary
embedding_dict[sequence] = embedding
# Save individual embedding (if necessary)
        if savepath is not None and not save_at_end:
with open(savepath, 'ab+') as f:
d = {sequence: embedding}
pickle.dump(d, f)
# Print update (if necessary)
if print_updates: log_update(f"sequence {i+1}: {sequence[0:10]}...")
# Dump all at once at the end (if necessary)
    if savepath is not None:
        # If saving everything at the end, dump the full dictionary in one write
        if save_at_end:
with open(savepath, 'wb') as f:
pickle.dump(embedding_dict, f)
# If we've been saving all along and made it here without crashing, correct the pickle file so it can be loaded nicely
else:
redump_pickle_dictionary(savepath)
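# Example usage (illustrative sketch; the checkpoint path, sequence, and save path are hypothetical):
#   model, tokenizer, device = load_fuson_model("path/to/fuson_checkpoint")
#   get_fuson_embeddings(model, tokenizer, ["MKTAYIAKQR"], device,
#                        average=True, savepath="demo_embeddings.pkl", print_updates=True)
#   with open("demo_embeddings.pkl", "rb") as f:
#       emb_dict = pickle.load(f)  # {sequence: mean-pooled embedding vector}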
def embed_dataset(path_to_file, path_to_output, seq_col='aa_seq', model_type='fuson_plm', fuson_ckpt_path=None, average=True, overwrite=True, print_updates=False, max_length=2000):
    # Check for pre-existing embeddings; overwrite or skip based on the overwrite flag
    if os.path.exists(path_to_output):
        if overwrite:
            log_update(f"WARNING: embeddings already exist at {path_to_output} and will be overwritten")
        else:
            log_update(f"WARNING: embeddings already exist at {path_to_output}. Skipping.")
            return None
dataset = pd.read_csv(path_to_file)
# Make sure the sequence column is valid
validate_sequence_col(dataset, seq_col)
sequences = dataset[seq_col].unique().tolist() # ensure all entries are unique
### If FusOn-pLM: make fusion embeddings
if model_type=='fuson_plm':
        if not os.path.exists(fuson_ckpt_path): raise Exception("FusOn-pLM ckpt path does not exist")
# Load model
        try:
            model, tokenizer, device = load_fuson_model(fuson_ckpt_path)
        except Exception as e:
            raise Exception(f"Could not load FusOn-pLM from {fuson_ckpt_path}") from e
        # Generate embeddings
        try:
            get_fuson_embeddings(model, tokenizer, sequences, device, average=average,
                                 print_updates=print_updates, savepath=path_to_output, save_at_end=False,
                                 max_length=max_length)
        except Exception as e:
            raise Exception("Could not generate FusOn-pLM embeddings") from e
    elif model_type == 'esm2_t33_650M_UR50D':
# Load model
        try:
            model, tokenizer, device = load_esm2_type(model_type)
        except Exception as e:
            raise Exception(f"Could not load {model_type}") from e
# Generate embeddings
        try:
            get_esm_embeddings(model, tokenizer, sequences, device, average=average,
                               print_updates=print_updates, savepath=path_to_output, save_at_end=False,
                               max_length=max_length)
        except Exception as e:
            raise Exception(f"Could not generate {model_type} embeddings") from e
if model_type=="prot_t5_xl_half_uniref50_enc":
# Load model
        try:
            model, tokenizer, device = load_prott5()
        except Exception as e:
            raise Exception(f"Could not load {model_type}") from e
# Generate embeddings
        try:
            get_prott5_embeddings(model, tokenizer, sequences, device, average=average,
                                  print_updates=print_updates, savepath=path_to_output, save_at_end=False,
                                  max_length=max_length)
        except Exception as e:
            raise Exception(f"Could not generate {model_type} embeddings") from e
def embed_dataset_for_benchmark(fuson_ckpts=None, input_data_path=None, input_fname=None, average=True, seq_col='seq', benchmark_fusonplm=False, benchmark_esm=False, benchmark_fo_puncta_ml=False, benchmark_prott5=False, overwrite=False, max_length=None):
    # make a directory for embeddings inside the benchmarking dataset folder if one doesn't already exist
os.makedirs('embeddings',exist_ok=True)
    # Tag the embedding type: sequence-averaged or full 2D (per-residue)
    emb_type_tag = 'average' if average else '2D'
    all_embedding_paths = dict() # dictionary mapping each embedding path to its model type, model name, and epoch
# make the embedding files. Put them in an embedding directory
if benchmark_fusonplm:
os.makedirs('embeddings/fuson_plm',exist_ok=True)
log_update(f"\nMaking Fuson-PLM embeddings")
# make subdirs for all the
if type(fuson_ckpts)==dict:
for model_name, epoch_list in fuson_ckpts.items():
os.makedirs(f'embeddings/fuson_plm/{model_name}',exist_ok=True)
for epoch in epoch_list:
# Assemble ckpt path and throw error if it doesn't exist
fuson_ckpt_path = f'../../training/checkpoints/{model_name}/checkpoint_epoch_{epoch}'
                    if not os.path.exists(fuson_ckpt_path): raise Exception(f"Error. Cannot find ckpt path: {fuson_ckpt_path}")
# Make output directory and output embedding path
embedding_output_dir = f'embeddings/fuson_plm/{model_name}/epoch{epoch}'
embedding_output_path = f'{embedding_output_dir}/{input_fname}_{emb_type_tag}_embeddings.pkl'
os.makedirs(embedding_output_dir,exist_ok=True)
# Make dictionary item
model_type = 'fuson_plm'
all_embedding_paths[embedding_output_path] = {
'model_type': model_type,
'model': model_name,
'epoch': epoch
}
# Create embeddings (or skip if they're already made)
log_update(f"\tUsing ckpt {fuson_ckpt_path} and saving results to {embedding_output_path}...")
embed_dataset(input_data_path, embedding_output_path,
seq_col=seq_col, model_type=model_type,
fuson_ckpt_path=fuson_ckpt_path, average=average,
overwrite=overwrite,print_updates=True,
max_length=max_length)
        elif fuson_ckpts == "FusOn-pLM":
model_name = "best"
os.makedirs(f'embeddings/fuson_plm/{model_name}',exist_ok=True)
# Assemble ckpt path and throw error if it doesn't exist
fuson_ckpt_path = "../../.." # go back to the FusOn-pLM directory to find the best ckpt
            if not os.path.exists(fuson_ckpt_path): raise Exception(f"Error. Cannot find ckpt path: {fuson_ckpt_path}")
# Make output directory and output embedding path
embedding_output_dir = f'embeddings/fuson_plm/{model_name}'
embedding_output_path = f'{embedding_output_dir}/{input_fname}_{emb_type_tag}_embeddings.pkl'
os.makedirs(embedding_output_dir,exist_ok=True)
# Make dictionary item
model_type = 'fuson_plm'
all_embedding_paths[embedding_output_path] = {
'model_type': model_type,
'model': model_name,
'epoch': None
}
# Create embeddings (or skip if they're already made)
log_update(f"\tUsing ckpt {fuson_ckpt_path} and saving results to {embedding_output_path}...")
embed_dataset(input_data_path, embedding_output_path,
seq_col=seq_col, model_type=model_type,
fuson_ckpt_path=fuson_ckpt_path, average=average,
overwrite=overwrite,print_updates=True,
max_length=max_length)
        else:
            raise Exception(f"Error. fuson_ckpts should be a dict or the string 'FusOn-pLM', got {type(fuson_ckpts)}")
    # make the ESM-2 embedding files. Put them in an embedding directory
if benchmark_esm:
os.makedirs('embeddings/esm2_t33_650M_UR50D',exist_ok=True)
# make output path
embedding_output_path = f'embeddings/esm2_t33_650M_UR50D/{input_fname}_{emb_type_tag}_embeddings.pkl'
        # Make dictionary item
model_type = 'esm2_t33_650M_UR50D'
all_embedding_paths[embedding_output_path] = {
'model_type': model_type,
'model': model_type,
'epoch': np.nan
}
log_update(f"\nMaking ESM-2-650M embeddings for {input_data_path} and saving results to {embedding_output_path}...")
embed_dataset(input_data_path, embedding_output_path,
seq_col=seq_col, model_type=model_type,
fuson_ckpt_path = None, average=average,
overwrite=overwrite,print_updates=True,
max_length=max_length)
if benchmark_prott5:
os.makedirs('embeddings/prot_t5_xl_half_uniref50_enc',exist_ok=True)
# make output path
embedding_output_path = f'embeddings/prot_t5_xl_half_uniref50_enc/{input_fname}_{emb_type_tag}_embeddings.pkl'
        # Make dictionary item
model_type = 'prot_t5_xl_half_uniref50_enc'
all_embedding_paths[embedding_output_path] = {
'model_type': model_type,
'model': model_type,
'epoch': np.nan
}
log_update(f"\nMaking ProtT5-XL-UniRef50 embeddings for {input_data_path} and saving results to {embedding_output_path}...")
embed_dataset(input_data_path, embedding_output_path,
seq_col=seq_col, model_type=model_type,
fuson_ckpt_path = None, average=average,
overwrite=overwrite,print_updates=True,
max_length=max_length)
if benchmark_fo_puncta_ml:
        embedding_output_path = 'FOdb_physicochemical_embeddings.pkl'
# Make dictionary item
all_embedding_paths[embedding_output_path] = {
'model_type': 'fo_puncta_ml',
'model': 'fo_puncta_ml',
'epoch': np.nan
}
return all_embedding_paths
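# Example usage (illustrative sketch; model names, epochs, and file paths are hypothetical):
#   all_embedding_paths = embed_dataset_for_benchmark(
#       fuson_ckpts={'my_model': [5, 10]},
#       input_data_path='splits/test_df.csv', input_fname='test_df',
#       average=True, seq_col='aa_seq',
#       benchmark_fusonplm=True, benchmark_esm=True, max_length=2000)
#   # -> {embedding_path: {'model_type': ..., 'model': ..., 'epoch': ...}, ...}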