import json
import math
import pickle
from collections import defaultdict

import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
import statsmodels.api as sm
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
from rdkit import Chem
from Bio import SeqIO

import model

# Encoding dictionaries produced when the model was trained
fingerprint_dict = model.load_pickle('../../Data/input/fingerprint_dict.pickle')
atom_dict = model.load_pickle('../../Data/input/atom_dict.pickle')
bond_dict = model.load_pickle('../../Data/input/bond_dict.pickle')
edge_dict = model.load_pickle('../../Data/input/edge_dict.pickle')
word_dict = model.load_pickle('../../Data/input/sequence_dict.pickle')

proteins = list()
compounds = list()
adjacencies = list()


def split_sequence(sequence, ngram):
    """Encode a protein sequence as overlapping n-gram word IDs."""
    sequence = '-' + sequence + '='

    words = [word_dict[sequence[i:i+ngram]] for i in range(len(sequence)-ngram+1)]
    return np.array(words)
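
# Illustrative note (not executed): with ngram = 3, a sequence such as 'MKV' is
# padded to '-MKV=' and split into the overlapping trigrams '-MK', 'MKV', 'KV=',
# each mapped to an integer ID through word_dict. Assuming model.load_pickle
# returns a plain dict, an n-gram unseen during training raises a KeyError,
# which deeplearning() below silently skips via its bare except.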


def create_atoms(mol):
    """Create a list of atom (e.g., hydrogen and oxygen) IDs,
    considering aromaticity."""
    atoms = [a.GetSymbol() for a in mol.GetAtoms()]

    for a in mol.GetAromaticAtoms():
        i = a.GetIdx()
        atoms[i] = (atoms[i], 'aromatic')
    atoms = [atom_dict[a] for a in atoms]
    return np.array(atoms)


def create_ijbonddict(mol):
    """Create a dictionary in which each key is a node ID
    and each value is a list of tuples of neighboring node
    and bond (e.g., single and double) IDs."""
    i_jbond_dict = defaultdict(lambda: [])
    for b in mol.GetBonds():
        i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
        bond = bond_dict[str(b.GetBondType())]
        i_jbond_dict[i].append((j, bond))
        i_jbond_dict[j].append((i, bond))
    return i_jbond_dict
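
# Data-shape note (illustrative): i_jbond_dict maps each atom index i to a list
# such as [(j, bond_id), ...] covering every neighbor j, with bond_id taken from
# bond_dict, so each bond is recorded once under both of its endpoints.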


def extract_fingerprints(atoms, i_jbond_dict, radius):
    """Extract the r-radius subgraphs (i.e., fingerprints)
    from a molecular graph using the Weisfeiler-Lehman algorithm."""
    if (len(atoms) == 1) or (radius == 0):
        fingerprints = [fingerprint_dict[a] for a in atoms]

    else:
        nodes = atoms
        i_jedge_dict = i_jbond_dict

        for _ in range(radius):

            # Update each node ID considering its neighboring nodes and edges
            # (i.e., r-radius subgraphs or fingerprints).
            fingerprints = []
            for i, j_edge in i_jedge_dict.items():
                neighbors = [(nodes[j], edge) for j, edge in j_edge]
                fingerprint = (nodes[i], tuple(sorted(neighbors)))
                fingerprints.append(fingerprint_dict[fingerprint])
            nodes = fingerprints

            # Also update each edge ID considering the two nodes
            # on both of its sides.
            _i_jedge_dict = defaultdict(lambda: [])
            for i, j_edge in i_jedge_dict.items():
                for j, edge in j_edge:
                    both_side = tuple(sorted((nodes[i], nodes[j])))
                    edge = edge_dict[(both_side, edge)]
                    _i_jedge_dict[i].append((j, edge))
            i_jedge_dict = _i_jedge_dict

    return np.array(fingerprints)
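
# Weisfeiler-Lehman note: each iteration replaces a node ID with the ID assigned
# (via fingerprint_dict) to the tuple (node ID, sorted neighbor (node, edge) IDs),
# so after `radius` rounds a fingerprint identifies an r-radius subgraph centered
# on that atom. Subgraphs absent from the pickled fingerprint_dict are assumed to
# raise a KeyError, which the caller's try/except handles.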


def create_adjacency(mol):
    adjacency = Chem.GetAdjacencyMatrix(mol)
    return np.array(adjacency)


def dump_dictionary(dictionary, filename):
    with open(filename, 'wb') as file:
        pickle.dump(dict(dictionary), file)


def load_tensor(file_name, dtype):
    # Note: relies on a global `device`, which this script only defines inside
    # deeplearning(); this helper is not called here.
    return [dtype(d).to(device) for d in np.load(file_name + '.npy', allow_pickle=True)]


class Predictor(object):
    def __init__(self, model):
        self.model = model

    def predict(self, data):
        predicted_value = self.model.forward(data)
        return predicted_value
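
# Suggestion (not in the original code): since Predictor is only used for
# inference, the forward pass could be wrapped in torch.no_grad() to avoid
# building the autograd graph, e.g.
#
#     with torch.no_grad():
#         predicted_value = self.model.forward(data)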


def deeplearning():
    with open('../../Data/database/Kcat_combination_0918_wildtype_mutant.json', 'r') as infile:
        Kcat_data = json.load(infile)

    fingerprint_dict = model.load_pickle('../../Data/input/fingerprint_dict.pickle')
    atom_dict = model.load_pickle('../../Data/input/atom_dict.pickle')
    bond_dict = model.load_pickle('../../Data/input/bond_dict.pickle')
    word_dict = model.load_pickle('../../Data/input/sequence_dict.pickle')
    n_fingerprint = len(fingerprint_dict)
    n_word = len(word_dict)
    print(n_fingerprint)
    print(n_word)

    # Featurization settings and model hyperparameters (chosen to match the trained model)
    radius = 2
    ngram = 3

    dim = 10
    layer_gnn = 3
    side = 5
    window = 11
    layer_cnn = 3
    layer_output = 3
    lr = 1e-3
    lr_decay = 0.5
    decay_interval = 10
    weight_decay = 1e-6
    iteration = 100
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    Kcat_model = model.KcatPrediction(device, n_fingerprint, n_word, 2*dim, layer_gnn, window, layer_cnn, layer_output).to(device)
    Kcat_model.load_state_dict(torch.load('../../Results/output/all--radius2--ngram3--dim20--layer_gnn3--window11--layer_cnn3--layer_output3--lr1e-3--lr_decay0.5--decay_interval10--weight_decay1e-6--iteration50', map_location=device))

    predictor = Predictor(Kcat_model)

    print('It\'s time to start the prediction!')
    print('-----------------------------------')
    i = 0
    x = list()
    y = list()
    x1 = list()
    y1 = list()
    new_data = list()
    for data in Kcat_data:
        smiles = data['Smiles']
        sequence = data['Sequence']

        Kcat = data['Value']
        enzyme_type = data['Type']
        # Keep only wildtype entries with a positive Kcat and a single-fragment SMILES
        if "." not in smiles and float(Kcat) > 0 and enzyme_type == 'wildtype':
            i += 1
            print('This is', i, '---------------------------------------')

            try:
                mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
                atoms = create_atoms(mol)
                i_jbond_dict = create_ijbonddict(mol)
                fingerprints = extract_fingerprints(atoms, i_jbond_dict, radius)
                adjacency = create_adjacency(mol)
                words = split_sequence(sequence, ngram)

                fingerprints = torch.LongTensor(fingerprints)
                adjacency = torch.FloatTensor(adjacency)
                words = torch.LongTensor(words)

                inputs = [fingerprints, adjacency, words]

                value = float(data['Value'])
                print(value)

                prediction = predictor.predict(inputs)
                Kcat_log_value = prediction.item()
                # The model output is on a log2 scale; convert back to a linear Kcat value
                Kcat_value = math.pow(2, Kcat_log_value)
                print(Kcat_value)

                data['Value'] = Kcat_value
                new_data.append(data)

            except:
                # Skip entries that fail featurization, e.g. unparsable SMILES or
                # substructures/n-grams missing from the training dictionaries
                continue

    return new_data


def EC_Kcat():
    """Group predicted Kcat values (as log10) by EC number."""
    datasets = deeplearning()

    print(len(datasets))

    EC_Kcat = dict()
    for data in datasets:
        EC_Number = data['ECNumber']
        try:
            if EC_Kcat[EC_Number] and float(data['Value']) > 0:
                value = math.log10(float(data['Value']))
                EC_Kcat[EC_Number].append(value)
        except:
            if float(data['Value']) > 0:
                Kcat = list()
                value = math.log10(float(data['Value']))
                Kcat.append(value)
                EC_Kcat[EC_Number] = Kcat

    return EC_Kcat
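
# Note: the try/except above implements a manual "append or create" accumulation
# on the EC_Kcat dict; a collections.defaultdict(list) would express the same
# pattern more directly.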


def EC_subsystem():
    with open('../../Data/subsystem/module_ec.txt', 'r') as infile:
        datasets = infile.readlines()

    print(len(datasets))

    metabolism_types = list()
    types_abbre = {
        'Primary - Carbohydrate & Energy Metabolism': 'Primary-CE',
        'Secondary_other': 'Secondary_other',
        'Intermediate': 'Intermediate',
        'Secondary': 'Secondary',
        'Primary - amino acids, fatty acids and nucleotides': 'Primary-AFN',
        'x': 'x'
    }

    types_EC = dict()
    for data in datasets:
        line = data.strip().split('\t')

        metabolism_types.append(line[2])
        abbre = types_abbre[line[2]]

        # line[1][2:] drops the two-character prefix in front of the EC number
        try:
            if types_EC[abbre]:
                types_EC[abbre].append(line[1][2:])
        except:
            EC_Number = list()
            EC_Number.append(line[1][2:])
            types_EC[abbre] = EC_Number

    print(len(types_EC))

    # Deduplicate EC numbers within each subsystem type
    i = 0
    new_types_EC = dict()
    for types, EC_Number in types_EC.items():
        i += len(set(EC_Number))
        new_types_EC[types] = list(set(EC_Number))

    print('Total number of EC numbers:', i)

    return new_types_EC


def median(lst):
    sortedLst = sorted(lst)
    lstLen = len(lst)
    index = (lstLen - 1) // 2

    if (lstLen % 2):
        return sortedLst[index]
    else:
        return (sortedLst[index] + sortedLst[index + 1])/2.0
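
# Sanity check (illustrative): median([1, 3, 2]) == 2 and median([1, 2, 3, 4]) == 2.5;
# the standard-library statistics.median gives the same results and could replace
# this helper.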


def Kcat_subsystem():
    EC_Kcat_relation = EC_Kcat()
    types_EC = EC_subsystem()

    # Pool log10(Kcat) values over all EC numbers belonging to each subsystem type
    types_Kcat = dict()
    for types, EC_Number in types_EC.items():
        Kcat_values = list()
        for EC in EC_Number:
            try:
                Kcat_values += EC_Kcat_relation[EC]
            except:
                # EC numbers without any predicted Kcat value are skipped
                continue
        types_Kcat[types] = Kcat_values

    return types_Kcat


def plot_subsystem_Kcat_counts():
    types_Kcat = Kcat_subsystem()

    for types, Kcat in types_Kcat.items():
        print('The type of %s has %s Kcat values.' % (types, len(Kcat)))

    types = ['Primary-CE', 'Primary-AFN', 'Intermediate', 'Secondary', 'Secondary_other']
    counts = [len(types_Kcat[subsystem]) for subsystem in types]

    print(types)
    print(counts)

    plt.figure(figsize=(3.4, 2.5))

    plt.bar(range(len(types)), counts, tick_label=types, width=0.5, alpha=0.8, color='pink', edgecolor='r')

    plt.ylabel("Counts", fontsize=12)
    plt.xticks(rotation=30, ha='right', fontsize=10)
    plt.yticks(fontsize=10)

    plt.savefig("../../Results/figures/subsystem_Kcat_counts_4.pdf", dpi=400, bbox_inches='tight')


def plot_subsystem_distribution():
    types_Kcat = Kcat_subsystem()

    plt.figure(figsize=(1.5, 1.5))

    rc('font', **{'family': 'serif', 'serif': ['Helvetica']})
    plt.rcParams['pdf.fonttype'] = 42

    plt.axes([0.12, 0.12, 0.83, 0.83])

    plt.tick_params(direction='in')
    plt.tick_params(which='major', length=1.5)
    plt.tick_params(which='major', width=0.4)

    plt.rcParams['font.family'] = 'Helvetica'

    types_color = {'Primary-CE': '#F781BF', 'Intermediate': '#4DAF4A', 'Primary-AFN': '#A65628', 'Secondary': '#3182BD'}

    for types, Kcat in types_Kcat.items():
        if types in ['Primary-CE', 'Intermediate', 'Primary-AFN', 'Secondary']:
            print('The median value of %s is %.2f' % (types, math.pow(10, median(Kcat))))

            # Empirical cumulative distribution of log10(Kcat) for this subsystem
            ecdf = sm.distributions.ECDF(Kcat)

            x = np.linspace(min(Kcat), max(Kcat), 50000)
            y = ecdf(x)

            plt.plot(x, y, linewidth=0.75, label=types, color=types_color[types])

            plt.axvline(x=median(Kcat), ymin=0, ymax=0.5, linewidth=0.75, linestyle='--', color=types_color[types])

    plt.text(-5, 0.9, 'Primary-CE', fontweight="normal", fontsize=6, color='#F781BF')
    plt.text(-5, 0.8, 'Primary-AFN', fontweight="normal", fontsize=6, color='#A65628')
    plt.text(-5, 0.7, 'Secondary', fontweight="normal", fontsize=6, color='#3182BD')
    plt.text(-5, 0.6, 'Intermediate', fontweight="normal", fontsize=6, color='#4DAF4A')

    plt.rcParams['font.family'] = 'Helvetica'

    plt.xlabel(r'Predicted $k$$_\mathregular{cat}$ value', fontsize=7)
    plt.ylabel('Cumulative distribution', fontsize=7)

    plt.xticks([-6, -4, -2, 0, 2, 4, 6, 8])

    plt.xticks(fontsize=6)
    plt.yticks(fontsize=6)

    ax = plt.gca()
    ax.spines['bottom'].set_linewidth(0.5)
    ax.spines['left'].set_linewidth(0.5)
    ax.spines['top'].set_linewidth(0.5)
    ax.spines['right'].set_linewidth(0.5)

    plt.savefig("../../Results/figures/SuppleFig5c.pdf", dpi=400, bbox_inches='tight')


if __name__ == "__main__":
    plot_subsystem_distribution()
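    # To generate the bar chart of Kcat counts per subsystem instead, call
    # plot_subsystem_Kcat_counts() here as well.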