|
|
|
|
|
import argparse
import logging
import os
import shlex
import sys
from io import StringIO

import extern
import numpy as np
import pandas as pd
import polars as pl
import shap
from bird_tool_utils import in_tempdir
from joblib import load
from tqdm import tqdm
|
|
|
|
|
|
|
def flatten_columns(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Expand list-valued columns so that each list element gets its own row.

    For every column named in ``cols`` a long-format frame of
    ``(original index, element)`` pairs is built; these are then joined back
    onto the remaining columns of ``df`` by index, so the non-flattened
    columns are repeated once per element.

    Args:
        df: Frame whose ``cols`` columns hold iterables (e.g. lists).
        cols: Iterable of column names to flatten. It cannot name every
            column of ``df`` -- at least one column must remain to join onto.

    Returns:
        A new frame with the untouched columns first, followed by the
        flattened columns in the order given by ``cols``. The original index
        values are kept (and therefore repeated). A row whose list is empty
        contributes no pairs, so it ends up with a missing value in that
        column after the left join.
    """
    flattened_cols = {
        col: pd.DataFrame(
            [(index, value) for index, values in df[col].items() for value in values],
            columns=['index', col],
        ).set_index('index')
        for col in cols
    }

    flattened_df = df.drop(cols, axis=1)
    for col in cols:
        # Left join on the index: one output row per (row, element) pair.
        flattened_df = flattened_df.join(flattened_cols[col])
    return flattened_df
|
|
|
|
|
|
|
def read_annotations(annotations_path, predictive_cogs_series):
    """Read an eggnog-mapper ``.annotations`` file and keep the predictive OGs.

    The file is parsed as header-less TSV with ``#`` starting a comment. Each
    query's comma-separated ``eggNOG_OGs`` field is split so that every
    orthologous group gets its own row, and the result is inner-joined
    against ``predictive_cogs_series``.

    Args:
        annotations_path: Path to the eggnog-mapper annotations file.
        predictive_cogs_series: Series named ``eggNOG_OGs`` listing the
            orthologous-group identifiers to keep; they must be spelled the
            same way as in the file for the join to match.

    Returns:
        DataFrame with columns ``query`` and ``eggNOG_OGs``, one row per
        (query, retained orthologous group) pair.

    Raises:
        Exception: If the file does not have the expected number of columns.
    """
    expected_columns = [
        "query", "seed_ortholog", "evalue", "score", "eggNOG_OGs", "max_annot_lvl", "COG_category", "Description",
        "Preferred_name", "GOs", "EC", "KEGG_ko", "KEGG_Pathway", "KEGG_Module", "KEGG_Reaction", "KEGG_rclass",
        "BRITE", "KEGG_TC", "CAZy", "BiGG_Reaction", "PFAMs"
    ]

    annotations = pd.read_csv(annotations_path, sep="\t", comment='#', header=None)
    if len(annotations.columns) != len(expected_columns):
        raise Exception("Unexpected formation (number of columns) in eggnog-mapper .annotations file")
    annotations.columns = expected_columns

    # Build a fresh frame with assign() rather than writing through .loc on a
    # slice of `annotations`, which is the chained-assignment pattern pandas
    # warns about.
    og_lists = annotations[['query', 'eggNOG_OGs']].assign(
        eggNOG_OGs=lambda frame: frame['eggNOG_OGs'].str.split(",")
    )

    # One row per (query, OG). explode() leaves a missing value as a single
    # NaN row, which the inner merge below discards, instead of failing while
    # trying to iterate over it.
    one_og_per_row = og_lists.explode('eggNOG_OGs')

    return pd.merge(one_og_per_row, predictive_cogs_series, on='eggNOG_OGs')
|
|
|
|
|
|
|
def read_multiple_annotations(to_read, predictive_cogs_series):
    """Read several eggnog-mapper annotation files into one long table.

    The result always starts with one placeholder row per entry of
    ``predictive_cogs_series`` (``accession`` and ``query`` both set to
    ``'dummy'``), followed by the filtered annotations of each file, tagged
    with the file's path in the ``accession`` column.

    Args:
        to_read: Iterable of annotation file paths. Surrounding whitespace
            (e.g. a trailing newline from a file-of-files) is stripped.
        predictive_cogs_series: Series named ``eggNOG_OGs`` listing the
            orthologous groups to keep; passed on to ``read_annotations``.

    Returns:
        DataFrame with columns ``accession``, ``eggNOG_OGs`` and ``query``.
        Row indices are not reset, so they repeat across the input files.
    """
    frames = [
        pd.DataFrame({"accession": 'dummy', 'eggNOG_OGs': predictive_cogs_series, 'query': 'dummy'})
    ]

    for ann in tqdm(to_read, desc="reading annotation files"):
        path = ann.strip()
        cogs = read_annotations(path, predictive_cogs_series)
        # Label with the same stripped path that was read, so stray
        # whitespace/newlines never leak into the accession.
        cogs['accession'] = path
        frames.append(cogs)

    # Concatenate once; concatenating inside the loop re-copies the
    # accumulated frame on every iteration.
    return pd.concat(frames)
|
|
|
|
|
if __name__ == '__main__':
    # Command-line driver.
    #
    # 1. Obtain a per-genome table of gene-family counts, either from a
    #    pre-computed --annotation-table or by annotating --protein-fasta
    #    (eggnog-mapper + hmmsearch, or their pre-computed outputs).
    # 2. Optionally write that table (--output-annotations).
    # 3. Apply each model in --models and write predictions
    #    (--output-predictions) and, optionally, SHAP values
    #    (--output-shap-values).
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')

    # ------------------------------------------------------------------
    # Argument parsing
    # ------------------------------------------------------------------
    parent_parser = argparse.ArgumentParser()
    parent_parser.add_argument('--debug', help='output debug information', action="store_true")
    parent_parser.add_argument('--quiet', help='only output errors', action="store_true")
    parent_parser.add_argument('--working-directory', help='working directory', default=None)
    parent_parser.add_argument('--protein-fasta', help='protein fasta file', required=True)
    parent_parser.add_argument('--annotation-table', help='Pre-calculated counts of COG / KO families in one or more genomes, in TSV format')
    parent_parser.add_argument('--add-missing-annotations', help='when columns are missing, add them as 0 [default: croak]', action="store_true")
    parent_parser.add_argument('--threads', help='number of threads to use', default=1, type=int)

    eggnog_parser = parent_parser.add_mutually_exclusive_group()
    eggnog_parser.add_argument('--eggnog-data-dir',
                               help='eggnog data directory e.g. ~/m/db/eggnog-mapper/2.1.3')
    eggnog_parser.add_argument('--eggnog-annotation-file', help='eggnog .annotation file')

    kofam_parser = parent_parser.add_mutually_exclusive_group()
    kofam_parser.add_argument('--kofam-hmm-path',
                              help='path to kofam hmm',
                              default=os.path.join(data_dir, 'kofam-2022-01-30-profiles.hmm'))
    kofam_parser.add_argument('--kofam-tsv-file', help='hmmsearch output file')

    parent_parser.add_argument('--whitelist', help='whitelist of COGs to use', default=os.path.join(data_dir, 'COGs_with_more_than_50_percent_remaining.tsv'))
    parent_parser.add_argument('--modal-keggs', help='modal keggs', default=os.path.join(data_dir, 'ModalKEGGs.tsv'))
    parent_parser.add_argument('--modal-keggs-with-names', help='modal keggs with names', default=os.path.join(data_dir, 'modal_keggs_with_names.csv'))
    parent_parser.add_argument('--models', nargs='+', help='models to use', default=[os.path.join(data_dir, '..', 'XGBoost.model')])
    parent_parser.add_argument('--training-data-header', help='header of training data', default=os.path.join(data_dir, 'training_data_header.tsv'))
    parent_parser.add_argument('--output-predictions', help='output predictions')
    parent_parser.add_argument('--output-annotations', help='output annotations, and exit, do not apply models')
    parent_parser.add_argument('--output-shap-values', help='output SHAP values i.e. which genes are predictive, to the specified file')
    args = parent_parser.parse_args()

    # ------------------------------------------------------------------
    # Logging
    # ------------------------------------------------------------------
    if args.debug:
        loglevel = logging.DEBUG
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.INFO
    logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%d/%m/%Y %I:%M:%S %p')

    # ------------------------------------------------------------------
    # Feature columns: the training-data header minus bookkeeping columns
    # ------------------------------------------------------------------
    eg_data = pl.read_csv(args.training_data_header, separator="\t", has_header=True)
    header = eg_data.select(pl.exclude(['accession', 'false_negative_rate', 'false_positive_rate'])).columns
    # NOTE(review): the reason these five COGs are dropped is not visible in
    # this file -- presumably the models were trained without them; confirm.
    excluded_cogs = {'COG0411', 'COG0459', 'COG0564', 'COG1344', 'COG4177'}
    header = [h for h in header if h not in excluded_cogs]

    # ------------------------------------------------------------------
    # Argument validation
    # ------------------------------------------------------------------
    if not args.annotation_table:
        if not args.eggnog_data_dir and not args.eggnog_annotation_file:
            raise Exception("Either --annotation-table, --eggnog-data-dir or --eggnog-annotation-file must be specified")
        if not args.kofam_hmm_path and not args.kofam_tsv_file:
            raise Exception("Either --annotation-table, --kofam-hmm-path or --kofam-tsv-file must be specified")

    if not args.output_predictions and not args.output_annotations:
        raise Exception("Either --output-predictions or --output-annotations must be specified")

    # Resolve input paths up front: the annotation branch below changes the
    # working directory, after which relative paths would no longer resolve.
    working_directory = None
    if args.working_directory:
        working_directory = os.path.abspath(args.working_directory)
    protein_fasta = os.path.abspath(args.protein_fasta)
    if args.eggnog_data_dir:
        eggnog_data_dir = os.path.abspath(args.eggnog_data_dir)
    if args.eggnog_annotation_file:
        eggnog_annotation_file = os.path.abspath(args.eggnog_annotation_file)
    if args.kofam_hmm_path:
        kofam_hmm_path = os.path.abspath(args.kofam_hmm_path)
    if args.kofam_tsv_file:
        kofam_tsv_file = os.path.abspath(args.kofam_tsv_file)
    modal_keggs = os.path.abspath(args.modal_keggs)
    whitelist_path = os.path.abspath(args.whitelist)

    # ------------------------------------------------------------------
    # Build d5: one row per genome, one column per entry of `header`
    # ------------------------------------------------------------------
    if args.annotation_table:
        if args.output_annotations:
            logging.error("Cannot specify --annotation-table and --output-annotations")
            sys.exit(1)
        logging.info("Reading annotation table ..")
        raw = pl.read_csv(args.annotation_table, separator='\t')
        if args.add_missing_annotations:
            present = set(raw.columns)
            missing = [h for h in header if h not in present]
            if missing:
                raw = raw.with_columns([pl.lit(0).alias(h) for h in missing])
            # Count what was actually added so the message below is reachable.
            added_column_count = len(missing)
            if added_column_count > 0:
                logging.info("Added {} extra 0 count columns to annotation table".format(added_column_count))
        d5 = raw.to_pandas()[header]
        accessions = list(raw['accession'])
    else:
        logging.info("Reading or calculating annotations not from a table ..")
        with in_tempdir():
            logging.info("Created temp working directory {}".format(os.getcwd()))
            if args.working_directory:
                logging.info("Changing working directory to {}".format(working_directory))
                if not os.path.exists(working_directory):
                    logging.info("Creating working directory {}".format(working_directory))
                    os.makedirs(working_directory)
                os.chdir(working_directory)

            logging.info("Reading in COG whitelist")
            with open(whitelist_path) as whitelist_file:
                whitelist1 = whitelist_file.read().splitlines()
            logging.info("Found {} COGs in the whitelist".format(len(whitelist1)))

            # eggnog-mapper reports root-level groups as "<COG>@1|root".
            predictive_cogs_series = pd.Series(["{}@1|root".format(c) for c in whitelist1], name='eggNOG_OGs')
            if args.eggnog_annotation_file:
                logging.info("Reading EGNOG annotations from supplied file")
                cog_annotations = read_annotations(eggnog_annotation_file, predictive_cogs_series)
            else:
                logging.info("Running eggnog-mapper")
                eggnog_output = 'eggnog_output'
                # User-supplied paths are shell-quoted so that spaces or
                # metacharacters in them cannot break the command line.
                extern.run(
                    f'EGGNOG_DATA_DIR={shlex.quote(eggnog_data_dir)} emapper.py -m diamond -i {shlex.quote(protein_fasta)} --target_orthologs one2one --query_cover 50.0 --evalue 0.0000001 --cpu {args.threads} -o {eggnog_output}'
                )
                cog_annotations = read_multiple_annotations(
                    [eggnog_output + '.emapper.annotations'],
                    predictive_cogs_series)

            if args.kofam_tsv_file:
                hmmsearch_tblout = kofam_tsv_file
            else:
                logging.info("Running hmmsearch")
                hmmsearch_tblout = 'hmmsearch_tblout.csv'
                extern.run(
                    f'hmmsearch --tblout {hmmsearch_tblout} -o /dev/null --notextw --cpu {args.threads} {shlex.quote(kofam_hmm_path)} {shlex.quote(protein_fasta)}'
                )

            # Turn the space-aligned tblout into TSV and keep the first six
            # fields of the non-comment lines. The pattern must be "one or
            # more spaces" (two spaces before the '*'): a single space
            # followed by '*' also matches the empty string, which makes sed
            # insert a tab between every character.
            hits = extern.run("sed 's/  */\t/g' {} |cut -f1-6 |grep -v '^#'".format(shlex.quote(hmmsearch_tblout)))
            hmmsearch_best_hits_df = pd.read_csv(StringIO(hits), sep='\t', header=None)
            hmmsearch_best_hits_df = hmmsearch_best_hits_df.loc[:, [0, 2, 4, 5]]
            logging.info("Read in {} annotations".format(len(hmmsearch_best_hits_df)))

            hmmsearch_best_hits_df.columns = ['query', 'target', 'evalue', 'score']
            # NOTE(review): this per-query best hit table is only used for the
            # log line below; the merge further down uses ALL hits
            # (hmmsearch_best_hits_df). Left unchanged because switching would
            # alter the features fed to the models -- confirm the intent.
            hmmsearch_best_hits_df2 = hmmsearch_best_hits_df.groupby('query', as_index=False).apply(
                lambda x: x.nsmallest(1, 'evalue')).reset_index().loc[:, ['query', 'target', 'evalue', 'score']]
            logging.info("Found {} unique HMM annotations".format(len(hmmsearch_best_hits_df2)))

            # Back to bare COG identifiers so they can be paired with KEGG IDs.
            cog_annotations['eggNOG_OGs'] = cog_annotations['eggNOG_OGs'].str.replace('@1|root', '', regex=False)
            logging.info("Read in {} cog_annotations".format(len(cog_annotations)))

            kegg_pairings = pd.read_csv(modal_keggs, sep="\t", header=None, names=['cog', 'kegg'])

            # Keep only proteins whose (COG, KEGG) assignment is one of the
            # pairings listed in the modal KEGG table.
            kegg_cog = pd.merge(hmmsearch_best_hits_df, cog_annotations, on='query')
            kegg_cog['cog=kegg'] = kegg_cog['eggNOG_OGs'] + '=' + kegg_cog['target']
            kegg_pairings['cog=kegg'] = kegg_pairings['cog'] + '=' + kegg_pairings['kegg']

            medianed = pd.merge(kegg_cog.loc[:, ['cog=kegg']], kegg_pairings, on='cog=kegg')
            medianed.columns = ['cog=kegg', 'cog', 'kegg']
            medianed['accession'] = 'query_genome'

            # Count occurrences of each COG for the single query genome.
            wide_table = pd.pivot_table(medianed.loc[:, ['accession', 'cog']],
                                        index='accession',
                                        aggfunc=len,
                                        columns='cog',
                                        values='cog',
                                        fill_value=0)

            whitelist = ['accession'] + whitelist1
            wide_table2 = wide_table.loc[:, wide_table.columns.intersection(whitelist)]

            # Append a zero count for every expected feature so that COGs
            # absent from this genome still get a (zero) column after the
            # pivot; the group-by sum merges real and placeholder counts.
            wide_table2_melt_plus = pd.concat([wide_table2.melt(), pd.DataFrame({'cog': header, 'value': 0})])
            wide_table2_melt_plus2 = pl.DataFrame(wide_table2_melt_plus).groupby('cog').sum()
            wide_table2_melt_plus2 = wide_table2_melt_plus2.with_columns(pl.lit('query_genome').alias('accession'))
            wide_table2_melt_plus_pivot = wide_table2_melt_plus2.pivot(index='accession', columns='cog', values='value', aggregate_function='sum')

            d5 = wide_table2_melt_plus_pivot.to_pandas()[header]

    # ------------------------------------------------------------------
    # Optional: write the annotation table
    # ------------------------------------------------------------------
    if args.output_annotations:
        # assign() returns a new frame, so the feature matrix d5 itself never
        # gains the extra 'accession' column.
        d5.assign(accession=args.protein_fasta).to_csv(args.output_annotations, sep="\t", index=False)
        logging.info("Wrote annotation table to {}".format(args.output_annotations))

    # ------------------------------------------------------------------
    # Apply models
    # ------------------------------------------------------------------
    # --output-annotations is documented as "do not apply models": honour
    # that when no model-derived output was requested.
    only_annotations = bool(args.output_annotations) and not (args.output_predictions or args.output_shap_values)

    if only_annotations:
        logging.info("Skipping model application, since only --output-annotations was requested")
    elif args.models is None:
        logging.info("Skipping model application, since no models were specified")
    else:
        if args.output_shap_values:
            logging.info("Reading in modal KEGGs with names")
            modal_keggs_named = pd.read_csv(args.modal_keggs_with_names, sep="\t")
            modal_keggs_named.index = modal_keggs_named['cog']
            named_rows = modal_keggs_named.loc[d5.columns]
            columns2 = pd.Series([
                '{} {} {}'.format(cog, ko, ko_name)
                for cog, ko, ko_name in zip(named_rows['cog'], named_rows['ko'], named_rows['ko_name'])
            ])
            # Repeated labels get their position appended to stay unique.
            dup_columns = columns2.duplicated()
            columns2 = [
                "{}_{}".format(c, i) if is_dup else c
                for i, (c, is_dup) in enumerate(zip(columns2, dup_columns))
            ]
            # Strip/replace '[', ']' and '<' from the feature names.
            # NOTE(review): presumably because the tree model rejects these
            # characters in feature names -- confirm.
            kos_and_names_column_names = [c.replace('[', '').replace(']', '').replace('<', '_') for c in columns2]

        all_results = []
        for model_path in args.models:
            logging.info("Loading model {}".format(model_path))
            # joblib.load unpickles arbitrary objects: only load model files
            # from a trusted source.
            model = load(model_path)
            logging.info("Loaded model {}".format(model_path))

            doing_perceptron = 'Perceptron' in model_path

            preds = model.predict(d5)

            results = pl.DataFrame({
                'node': accessions if args.annotation_table else args.protein_fasta,
                'preds': preds})
            results = results.select(
                pl.col('node'),
                pl.col('preds').alias('prediction').cast(pl.Int64),
                pl.lit(model_path).alias('model'))
            if doing_perceptron:
                # No probability is computed for Perceptron models; -1.0 is
                # written as a placeholder.
                results = results.with_columns(pl.lit(-1.0).alias('probability').cast(pl.Float64))
            else:
                results = results.with_columns(pl.lit(model.predict_proba(d5)[:, 1]).alias('probability').cast(pl.Float64))
            all_results.append(results)

            if args.output_shap_values:
                # The model is indexed as a two-step pipeline: step 0 is used
                # as a transformer, step 1 as the tree model to explain.
                # NOTE(review): the same output file is rewritten for every
                # model, so with several --models only the last one's SHAP
                # values survive -- confirm this is intended.
                explainer = shap.TreeExplainer(model.steps[1][1])
                X_pred_scaled = pd.DataFrame(model.steps[0][1].transform(d5), columns=d5.columns)
                X_pred_scaled.columns = kos_and_names_column_names
                shap_values = explainer.shap_values(X_pred_scaled)

                # Features ordered by decreasing mean absolute SHAP value.
                all_shaps = list(reversed(np.argsort(np.abs(shap_values).mean(0))))
                all_shaps_names = X_pred_scaled.columns[all_shaps]

                pl.DataFrame({
                    'rank': range(1, len(all_shaps) + 1),
                    # Copy numbers are taken from the first row of d5 only.
                    'copy_number': list(d5.iloc[:, all_shaps].values[0]),
                    'shap_value': list(shap_values[:, all_shaps].mean(0)),
                    'abs_shap_value': list(np.abs(shap_values[:, all_shaps]).mean(0)),
                    'gene': list(all_shaps_names),
                }).write_csv(args.output_shap_values, separator="\t")
                logging.info("Wrote SHAP values and gene annotations to {}".format(args.output_shap_values))

        # Predictions may legitimately not have been requested (e.g. only
        # --output-annotations plus --output-shap-values were given).
        if args.output_predictions:
            pl.concat(all_results).write_csv(args.output_predictions, separator="\t")
            logging.info("Wrote predictions to {}".format(args.output_predictions))
|
|