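# NOTE: the next three lines are hosting-page residue, kept as comments so
# that the file parses as Python:
#   jie1's picture / Upload 3 files / a746d22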
#!/usr/bin/python
# coding: utf-8
# Author: LE YUAN
# Date: 2020-10-03
import math
import json
import pickle
import numpy as np
from collections import defaultdict
from rdkit import Chem
# Auto-incrementing vocabularies: looking up an unseen key assigns it the
# next free integer ID (the current size of the dictionary).
word_dict = defaultdict(lambda: len(word_dict))                # sequence n-grams
atom_dict = defaultdict(lambda: len(atom_dict))                # atom labels
bond_dict = defaultdict(lambda: len(bond_dict))                # bond types
fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))  # subgraph fingerprints
edge_dict = defaultdict(lambda: len(edge_dict))                # refined edge labels

# Per-sample accumulators filled by main(); index k of each list refers to
# the same data entry.
proteins, compounds, adjacencies, regression = [], [], [], []
def split_sequence(sequence, ngram):
    """Encode a sequence as the IDs of its overlapping n-grams.

    The sequence is wrapped in a leading '-' and a trailing '=' marker
    before windowing. Each window of length `ngram` is looked up in the
    module-level `word_dict`, which assigns a new ID to unseen n-grams.

    Returns:
        np.ndarray of n-gram IDs, one per window position.
    """
    padded = '-' + sequence + '='
    window_count = len(padded) - ngram + 1
    ids = []
    for start in range(window_count):
        ids.append(word_dict[padded[start:start + ngram]])
    return np.array(ids)
def create_atoms(mol):
    """Return the atom IDs of `mol` as an array, taking aromaticity
    into account.

    Each atom is labelled by its element symbol; atoms reported by
    `mol.GetAromaticAtoms()` are labelled `(symbol, 'aromatic')` instead.
    Labels are mapped to integer IDs through the module-level `atom_dict`.
    """
    labels = []
    for atom in mol.GetAtoms():
        labels.append(atom.GetSymbol())

    # Re-label aromatic atoms so they get an ID distinct from the plain symbol.
    for aromatic_atom in mol.GetAromaticAtoms():
        idx = aromatic_atom.GetIdx()
        labels[idx] = (labels[idx], 'aromatic')

    return np.array([atom_dict[label] for label in labels])
def create_ijbonddict(mol):
    """Build the neighbour table of a molecule.

    Returns a defaultdict mapping each bonded atom index to a list of
    `(neighbor_index, bond_id)` tuples. Bond IDs come from the module-level
    `bond_dict`, keyed by the string form of the bond type. Every bond is
    recorded in both directions.
    """
    neighbours = defaultdict(list)
    for bond_obj in mol.GetBonds():
        begin = bond_obj.GetBeginAtomIdx()
        end = bond_obj.GetEndAtomIdx()
        bond_id = bond_dict[str(bond_obj.GetBondType())]
        neighbours[begin].append((end, bond_id))
        neighbours[end].append((begin, bond_id))
    return neighbours
def extract_fingerprints(atoms, i_jbond_dict, radius):
    """Extract the r-radius subgraphs (i.e., fingerprints) from a
    molecular graph using the Weisfeiler-Lehman algorithm.

    Args:
        atoms: sequence of atom IDs, indexed by atom index.
        i_jbond_dict: mapping from atom index to a list of
            `(neighbor_index, bond_id)` tuples. It is only read, never
            modified.
        radius: number of refinement rounds; 0 returns the IDs of the
            raw atoms.

    Returns:
        np.ndarray with one fingerprint ID per atom, ordered by atom index.
        IDs are assigned through the module-level `fingerprint_dict`
        (and `edge_dict` for refined edge labels).
    """
    if (len(atoms) == 1) or (radius == 0):
        fingerprints = [fingerprint_dict[a] for a in atoms]
    else:
        nodes = atoms
        i_jedge_dict = i_jbond_dict
        n_atoms = len(atoms)

        for _ in range(radius):
            # Update each node ID considering its neighboring nodes and
            # edges (i.e., r-radius subgraphs or fingerprints).
            # Iterate over atom indices explicitly rather than over
            # dict.items(): the resulting list is indexed by atom index
            # below (nodes[i], nodes[j]) and by the caller, so it must not
            # depend on the key insertion order of the neighbour table,
            # and atoms without any bond must still get an entry.
            fingerprints = []
            for i in range(n_atoms):
                neighbors = [(nodes[j], edge)
                             for j, edge in i_jedge_dict.get(i, ())]
                fingerprint = (nodes[i], tuple(sorted(neighbors)))
                fingerprints.append(fingerprint_dict[fingerprint])
            nodes = fingerprints

            # Also update each edge ID considering the two (already
            # updated) nodes on both of its sides.
            _i_jedge_dict = {}
            for i in range(n_atoms):
                updated = []
                for j, edge in i_jedge_dict.get(i, ()):
                    both_side = tuple(sorted((nodes[i], nodes[j])))
                    updated.append((j, edge_dict[(both_side, edge)]))
                if updated:
                    _i_jedge_dict[i] = updated
            i_jedge_dict = _i_jedge_dict

    return np.array(fingerprints)
def create_adjacency(mol):
    """Return the adjacency matrix of `mol` (as given by
    `Chem.GetAdjacencyMatrix`) wrapped in a NumPy array."""
    return np.array(Chem.GetAdjacencyMatrix(mol))
def dump_dictionary(dictionary, filename):
    """Pickle `dictionary` to `filename` as a plain dict.

    The mapping is converted with `dict(...)` first, so the lambda-based
    defaultdicts used in this module (which cannot be pickled directly)
    are stored as ordinary dictionaries.
    """
    with open(filename, 'wb') as out:
        snapshot = dict(dictionary)
        pickle.dump(snapshot, out)
def _save_object_array(path, arrays):
    """Save a list of differently-shaped arrays as a 1-D object array."""
    # Build the object array element by element: recent NumPy versions
    # refuse to create an array from ragged nested sequences implicitly,
    # which is what np.save(path, list_of_arrays) would attempt.
    packed = np.empty(len(arrays), dtype=object)
    for index, array in enumerate(arrays):
        packed[index] = array
    # NOTE(review): object arrays are stored via pickle, so readers must
    # use np.load(..., allow_pickle=True) -- confirm against the loader.
    np.save(path, packed)


def main(data_path='../../Data/database/Kcat_combination_0918.json',
         output_dir='../../Data/input/', radius=2, ngram=3):
    """Convert the Kcat JSON database into model input files.

    For every usable entry the compound is turned into fingerprint IDs and
    an adjacency matrix, the protein sequence into n-gram IDs, and the
    Kcat value into its base-2 logarithm. Results are appended to the
    module-level lists `compounds`, `adjacencies`, `proteins` and
    `regression`, saved as .npy files in `output_dir`, and the five
    vocabularies are pickled next to them.

    Entries are skipped when the SMILES contains '.', when the value is
    not strictly positive (its logarithm is undefined), or when RDKit
    cannot parse the SMILES (a warning is emitted in that case).

    Args:
        data_path: JSON file holding a list of records with the keys
            'Smiles', 'Sequence' and 'Value'.
        output_dir: directory receiving the .npy and .pickle files.
        radius: number of Weisfeiler-Lehman rounds for the fingerprints.
        ngram: window length used to split protein sequences.
    """
    import os
    import warnings

    with open(data_path, 'r') as infile:
        Kcat_data = json.load(infile)

    for data in Kcat_data:
        smiles = data['Smiles']
        sequence = data['Sequence']
        raw_value = data['Value']

        # Exclude data containing '.' in the SMILES format.
        if '.' in smiles:
            continue
        kcat = float(raw_value)
        if not kcat > 0:
            continue

        # MolFromSmiles returns None (rather than raising) for SMILES it
        # cannot parse; skip those instead of crashing inside AddHs.
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            warnings.warn('Skipping entry with unparsable SMILES: %r' % (smiles,))
            continue
        mol = Chem.AddHs(parsed)

        # The call order below determines the IDs handed out by the shared
        # vocabularies, so it is kept as: atoms, bonds, fingerprints, words.
        atoms = create_atoms(mol)
        i_jbond_dict = create_ijbonddict(mol)
        compounds.append(extract_fingerprints(atoms, i_jbond_dict, radius))
        adjacencies.append(create_adjacency(mol))
        proteins.append(split_sequence(sequence, ngram))

        log_kcat = math.log2(kcat)
        regression.append(np.array([log_kcat]))
        print(log_kcat)

    # Compounds, adjacencies and proteins differ in size per sample, so
    # they are stored as object arrays; regression entries all have
    # shape (1,) and stack into a regular array.
    _save_object_array(os.path.join(output_dir, 'compounds'), compounds)
    _save_object_array(os.path.join(output_dir, 'adjacencies'), adjacencies)
    np.save(os.path.join(output_dir, 'regression'), regression)
    _save_object_array(os.path.join(output_dir, 'proteins'), proteins)

    dump_dictionary(fingerprint_dict, os.path.join(output_dir, 'fingerprint_dict.pickle'))
    dump_dictionary(atom_dict, os.path.join(output_dir, 'atom_dict.pickle'))
    dump_dictionary(bond_dict, os.path.join(output_dir, 'bond_dict.pickle'))
    dump_dictionary(edge_dict, os.path.join(output_dir, 'edge_dict.pickle'))
    dump_dictionary(word_dict, os.path.join(output_dir, 'sequence_dict.pickle'))
# Run the preprocessing only when this file is executed as a script.
if __name__ == "__main__":
    main()