Spaces:

jie1
/

succ1

Build error

App Files Files Community

succ1 / DLKcat /DeeplearningApproach /Code /analysis /SuppleFig5c_Pre_subsystem.py

jie1

Upload 28 files

2d12bc4 over 2 years ago

raw

history blame contribute delete

17.1 kB

	#!/usr/bin/python
	# coding: utf-8

	# Author: LE YUAN
	# Date: 2021-01-14

	# This Python script classifies subsystems based on EC number, Kcat and subsystem mapping, using the data predicted by deep learning

	import json
	import math
	import model
	import torch
	import pickle
	import seaborn as sns
	import matplotlib.pyplot as plt
	from matplotlib import rc
	import pandas as pd
	import numpy as np
	import statsmodels.api as sm
	from rdkit import Chem
	from Bio import SeqIO
	from collections import defaultdict
	from scipy import stats
	from sklearn.metrics import mean_squared_error,r2_score


	# Lookup tables loaded once at import time. The featurisation helpers in
	# this file (create_atoms, create_ijbonddict, extract_fingerprints,
	# split_sequence) read them as module-level globals.
	fingerprint_dict, atom_dict, bond_dict, edge_dict, word_dict = map(
	    model.load_pickle,
	    (
	        '../../Data/input/fingerprint_dict.pickle',
	        '../../Data/input/atom_dict.pickle',
	        '../../Data/input/bond_dict.pickle',
	        '../../Data/input/edge_dict.pickle',
	        '../../Data/input/sequence_dict.pickle',
	    ),
	)

	# Accumulators that are only referenced from commented-out code in this file.
	proteins, compounds, adjacencies = [], [], []

	def split_sequence(sequence, ngram):
	    """Encode a sequence as an array of n-gram IDs.

	    The sequence is wrapped with a leading '-' and a trailing '=', then
	    every overlapping window of ``ngram`` characters is looked up in the
	    module-level ``word_dict``.

	    Args:
	        sequence: The sequence string to encode.
	        ngram: Window length in characters.

	    Returns:
	        A NumPy array holding one ``word_dict`` value per window.
	    """
	    padded = '-' + sequence + '='
	    window_count = len(padded) - ngram + 1
	    ids = []
	    for start in range(window_count):
	        ids.append(word_dict[padded[start:start + ngram]])
	    return np.array(ids)

	def create_atoms(mol):
	    """Return the atom IDs of ``mol`` as a NumPy array.

	    Each atom starts out keyed by its element symbol; atoms reported by
	    ``mol.GetAromaticAtoms()`` are keyed by ``(symbol, 'aromatic')``
	    instead. The keys are then translated through the module-level
	    ``atom_dict``.
	    """
	    keys = [atom.GetSymbol() for atom in mol.GetAtoms()]
	    for aromatic_atom in mol.GetAromaticAtoms():
	        idx = aromatic_atom.GetIdx()
	        keys[idx] = (keys[idx], 'aromatic')
	    return np.array([atom_dict[key] for key in keys])

	def create_ijbonddict(mol):
	    """Build the neighbour table of a molecule.

	    Returns:
	        A ``defaultdict`` mapping each atom index to a list of
	        ``(neighbour_index, bond_id)`` tuples, where ``bond_id`` is the
	        module-level ``bond_dict`` entry for ``str(bond.GetBondType())``.
	        Every bond is recorded in both directions.
	    """
	    neighbours = defaultdict(list)
	    for bond in mol.GetBonds():
	        begin = bond.GetBeginAtomIdx()
	        end = bond.GetEndAtomIdx()
	        bond_id = bond_dict[str(bond.GetBondType())]
	        neighbours[begin].append((end, bond_id))
	        neighbours[end].append((begin, bond_id))
	    return neighbours

	def extract_fingerprints(atoms, i_jbond_dict, radius):
	    """Extract the r-radius subgraph IDs (fingerprints) of a molecular graph.

	    Uses the Weisfeiler-Lehman style relabelling from the original
	    implementation, with the module-level ``fingerprint_dict`` and
	    ``edge_dict`` as lookup tables.

	    Args:
	        atoms: Sequence of atom IDs, indexable by atom index.
	        i_jbond_dict: Mapping of atom index to a list of
	            ``(neighbour_index, bond_id)`` tuples.
	        radius: Number of relabelling rounds.

	    Returns:
	        A NumPy array of fingerprint IDs.

	    NOTE(review): the per-round list is built in the iteration order of the
	    neighbour mapping but is then indexed by atom index; this looks like it
	    relies on the mapping iterating in atom-index order with no atom
	    missing. Confirm before changing, since trained models depend on the
	    current behaviour.
	    """
	    # A single atom, or radius 0: the fingerprints are the atoms themselves.
	    if len(atoms) == 1 or radius == 0:
	        return np.array([fingerprint_dict[atom] for atom in atoms])

	    node_ids = atoms
	    edges_by_node = i_jbond_dict

	    for _ in range(radius):
	        # Update each node ID considering its neighbouring nodes and edges
	        # (i.e., r-radius subgraphs or fingerprints).
	        fingerprints = [
	            fingerprint_dict[
	                (node_ids[src], tuple(sorted((node_ids[dst], edge) for dst, edge in links)))
	            ]
	            for src, links in edges_by_node.items()
	        ]
	        node_ids = fingerprints

	        # Also update each edge ID considering the two nodes on both sides.
	        relabelled = defaultdict(list)
	        for src, links in edges_by_node.items():
	            for dst, edge in links:
	                ends = tuple(sorted((node_ids[src], node_ids[dst])))
	                relabelled[src].append((dst, edge_dict[(ends, edge)]))
	        edges_by_node = relabelled

	    return np.array(fingerprints)

	def create_adjacency(mol):
	    """Return ``Chem.GetAdjacencyMatrix(mol)`` wrapped in a NumPy array."""
	    return np.array(Chem.GetAdjacencyMatrix(mol))

	def dump_dictionary(dictionary, filename):
	    """Pickle a plain ``dict`` copy of ``dictionary`` into ``filename``.

	    The copy through ``dict(...)`` means mapping subclasses (for example a
	    ``defaultdict``) are stored as ordinary dictionaries.
	    """
	    with open(filename, 'wb') as handle:
	        plain_copy = dict(dictionary)
	        pickle.Pickler(handle).dump(plain_copy)

	def load_tensor(file_name, dtype, device=None):
	    """Load ``file_name + '.npy'`` and convert every entry to a tensor.

	    Args:
	        file_name: Path of the NumPy file without the ``.npy`` suffix.
	        dtype: Callable that turns one loaded entry into a tensor, e.g.
	            ``torch.FloatTensor``.
	        device: Target device for the tensors. When omitted, CUDA is used if
	            available, otherwise the CPU. The previous version read an
	            undefined module-level ``device`` and raised ``NameError``.

	    Returns:
	        A list with one tensor per entry along the first axis of the array.
	    """
	    if device is None:
	        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	    # allow_pickle=True unpickles object arrays, so only load trusted files.
	    entries = np.load(file_name + '.npy', allow_pickle=True)
	    return [dtype(entry).to(device) for entry in entries]

	class Predictor(object):
	    """Thin wrapper exposing a model's ``forward`` method as ``predict``."""

	    def __init__(self, model):
	        # Only a reference is stored; note that the parameter name hides the
	        # imported ``model`` module inside this method.
	        self.model = model

	    def predict(self, data):
	        """Return the result of ``self.model.forward(data)`` unchanged."""
	        return self.model.forward(data)

	def deeplearning() :
	    """Predict Kcat for the wildtype entries of the Kcat database.

	    Reads the Kcat JSON database, restores the trained ``KcatPrediction``
	    checkpoint and runs it on every record whose SMILES contains no '.',
	    whose measured value is positive and whose type is 'wildtype'.

	    Returns:
	        A list of the processed input records, each with ``'Value'``
	        replaced by the predicted Kcat (2 raised to the model output).
	        Records that fail featurisation or prediction are reported on
	        stdout and left out.
	    """
	    with open('../../Data/database/Kcat_combination_0918_wildtype_mutant.json', 'r') as infile:
	        Kcat_data = json.load(infile)

	    # Only the vocabulary sizes are needed here; the featurisation helpers
	    # use the dictionaries loaded at module level.
	    n_fingerprint = len(model.load_pickle('../../Data/input/fingerprint_dict.pickle'))
	    n_word = len(model.load_pickle('../../Data/input/sequence_dict.pickle'))
	    print(n_fingerprint) # 3958
	    print(n_word) # 8542

	    # Hyperparameters matching the checkpoint loaded below (see its file name).
	    radius = 2
	    ngram = 3
	    dim = 10
	    layer_gnn = 3
	    window = 11
	    layer_cnn = 3
	    layer_output = 3

	    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	    Kcat_model = model.KcatPrediction(device, n_fingerprint, n_word, 2*dim, layer_gnn, window, layer_cnn, layer_output).to(device)
	    Kcat_model.load_state_dict(torch.load('../../Results/output/all--radius2--ngram3--dim20--layer_gnn3--window11--layer_cnn3--layer_output3--lr1e-3--lr_decay0.5--decay_interval10--weight_decay1e-6--iteration50', map_location=device))
	    # NOTE(review): the model is left in its default mode, as before; confirm
	    # whether Kcat_model.eval() should be called for inference.
	    predictor = Predictor(Kcat_model)

	    print('It\'s time to start the prediction!')
	    print('-----------------------------------')

	    i = 0
	    new_data = list()
	    for data in Kcat_data:
	        smiles = data['Smiles']
	        sequence = data['Sequence']
	        Kcat = data['Value']
	        enzyme_type = data['Type']
	        if "." in smiles or not float(Kcat) > 0 or enzyme_type != 'wildtype':
	            continue

	        i += 1
	        print('This is', i, '---------------------------------------')

	        try:
	            mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
	            atoms = create_atoms(mol)
	            i_jbond_dict = create_ijbonddict(mol)
	            fingerprints = extract_fingerprints(atoms, i_jbond_dict, radius)
	            adjacency = create_adjacency(mol)
	            words = split_sequence(sequence, ngram)

	            # Move the inputs to the same device as the model; without this
	            # every record fails (and was silently dropped) on CUDA.
	            inputs = [
	                torch.LongTensor(fingerprints).to(device),
	                torch.FloatTensor(adjacency).to(device),
	                torch.LongTensor(words).to(device),
	            ]

	            value = float(data['Value'])
	            print(value)

	            # Gradients are not needed for prediction.
	            with torch.no_grad():
	                prediction = predictor.predict(inputs)
	            Kcat_log_value = prediction.item()
	            Kcat_value = math.pow(2, Kcat_log_value)
	            print(Kcat_value)
	        except Exception as error:
	            # Best-effort: skip records that cannot be featurised or
	            # predicted, but say so instead of hiding every failure.
	            print('Skipping entry', i, 'because of', repr(error))
	            continue

	        data['Value'] = Kcat_value
	        new_data.append(data)

	    return new_data

	# Data1 (Data used in deep learning workflow)
	def EC_Kcat() :
	    """Group the predicted Kcat values by EC number.

	    Calls ``deeplearning()`` and, for every returned record with a positive
	    ``'Value'``, stores ``log10(Value)`` under the record's ``'ECNumber'``.

	    Returns:
	        A dict mapping EC number to the list of log10 Kcat values.
	    """
	    datasets = deeplearning()

	    print(len(datasets)) #

	    # setdefault replaces the former bare-except grouping, which also hid
	    # unrelated errors; the local no longer shadows the function name.
	    kcat_by_ec = dict()
	    for data in datasets:
	        EC_Number = data['ECNumber']
	        kcat = float(data['Value'])
	        if kcat > 0:
	            kcat_by_ec.setdefault(EC_Number, []).append(math.log10(kcat))

	    return kcat_by_ec

	def EC_subsystem(module_ec_path='../../Data/subsystem/module_ec.txt') :
	    """Map each metabolism type to its unique EC numbers.

	    Every non-blank line of the tab-separated input must have at least three
	    columns. Column 3 is the metabolism type. Column 2 with its first two
	    characters dropped is stored as the EC number (presumably an 'EC'
	    prefix; confirm against the data file).

	    Args:
	        module_ec_path: Path of the module/EC table. Defaults to the
	            location used by the original script.

	    Returns:
	        A dict mapping the abbreviated metabolism type to a list of unique
	        EC numbers (in no particular order).

	    Raises:
	        KeyError: If a line names a metabolism type with no abbreviation.
	    """
	    with open(module_ec_path, 'r') as infile:
	        datasets = infile.readlines()

	    print(len(datasets)) # 2200

	    types_abbre = {
	        'Primary - Carbohydrate & Energy Metabolism': 'Primary-CE',
	        'Secondary_other': 'Secondary_other',
	        'Intermediate': 'Intermediate',
	        'Secondary': 'Secondary',
	        'Primary - amino acids, fatty acids and nucleotides': 'Primary-AFN',
	        'x': 'x',
	    }

	    # Collect into sets directly; this replaces the bare-except grouping
	    # followed by a separate de-duplication pass.
	    types_EC = dict()
	    for data in datasets:
	        stripped = data.strip()
	        if not stripped:
	            # Blank lines previously crashed with IndexError.
	            continue
	        line = stripped.split('\t')
	        abbre = types_abbre[line[2]]
	        types_EC.setdefault(abbre, set()).add(line[1][2:])

	    print(len(types_EC))

	    new_types_EC = {types: list(EC_Number) for types, EC_Number in types_EC.items()}
	    total = sum(len(EC_Number) for EC_Number in new_types_EC.values())

	    print('Total EC number is:', total) # 2200, the same as all entries in module.txt

	    return new_types_EC

	def median(lst):
	    """Return the median of ``lst``.

	    For an odd number of items the middle item itself is returned; for an
	    even number, the mean of the two middle items as a float. An empty
	    input raises ``IndexError``.
	    """
	    ordered = sorted(lst)
	    # For even lengths (and for 0) the remainder is 1 and two items are averaged.
	    lower, needs_average = divmod(len(lst) - 1, 2)
	    if needs_average:
	        return (ordered[lower] + ordered[lower + 1]) / 2.0
	    return ordered[lower]

	def Kcat_subsystem() :
	    """Collect the predicted log10 Kcat values of every metabolism type.

	    Combines ``EC_Kcat()`` (EC number -> values) with ``EC_subsystem()``
	    (type -> EC numbers).

	    Returns:
	        A dict mapping each metabolism type to the concatenated values of
	        all its EC numbers; EC numbers without predictions contribute
	        nothing.
	    """
	    EC_Kcat_relation = EC_Kcat()
	    types_EC = EC_subsystem()

	    types_Kcat = dict()
	    for types, EC_Number in types_EC.items():
	        Kcat_values = list()
	        for EC in EC_Number:
	            # dict.get replaces a bare except that skipped missing EC
	            # numbers but also hid every other error.
	            Kcat_values.extend(EC_Kcat_relation.get(EC, ()))
	        types_Kcat[types] = Kcat_values

	    return types_Kcat

	def plot_subsystem_Kcat_counts() :
	    """Save a bar chart of the number of Kcat values per metabolism type.

	    Prints the count for every type returned by ``Kcat_subsystem()``, then
	    plots five fixed types and writes the figure to
	    ``../../Results/figures/subsystem_Kcat_counts_4.pdf``.
	    """
	    kcat_by_type = Kcat_subsystem()

	    for name in kcat_by_type:
	        print('The type of %s has %s Kcat values.' % (name, len(kcat_by_type[name])))

	    labels = ['Primary-CE', 'Primary-AFN', 'Intermediate', 'Secondary', 'Secondary_other']
	    counts = []
	    for label in labels:
	        counts.append(len(kcat_by_type[label]))

	    print(labels)
	    print(counts)

	    plt.figure(figsize=(3.4,2.5))

	    # https://juejin.im/post/6858230839986421767
	    positions = range(len(labels))
	    plt.bar(positions, counts, tick_label=labels, width=0.5, alpha=0.8, color='pink', edgecolor='r')

	    plt.ylabel("Counts", fontsize=12)
	    plt.xticks(rotation=30, ha='right')
	    plt.xticks(fontsize=10)
	    plt.yticks(fontsize=10)

	    plt.savefig("../../Results/figures/subsystem_Kcat_counts_4.pdf", dpi=400, bbox_inches='tight')

	# https://my.oschina.net/u/4349448/blog/3448306 python code
	def plot_subsystem_distribution() :
	    """Plot the cumulative distribution of predicted Kcat per metabolism type.

	    For the four coloured types, prints the median (converted back from
	    log10), draws the empirical CDF of the log10 values and marks the
	    median with a dashed vertical line. The figure is written to
	    ``../../Results/figures/SuppleFig5c.pdf``.
	    """
	    types_Kcat = Kcat_subsystem()

	    plt.figure(figsize=(1.5,1.5))
	    # To solve the 'Helvetica' font cannot be used in PDF file
	    # https://stackoverflow.com/questions/59845568/the-pdf-backend-does-not-currently-support-the-selected-font
	    rc('font',**{'family':'serif','serif':['Helvetica']})
	    plt.rcParams['pdf.fonttype'] = 42

	    plt.axes([0.12,0.12,0.83,0.83])

	    plt.tick_params(direction='in')
	    plt.tick_params(which='major',length=1.5)
	    plt.tick_params(which='major',width=0.4)

	    plt.rcParams['font.family'] = 'Helvetica'

	    types_color = {'Primary-CE': '#F781BF', 'Intermediate': '#4DAF4A', 'Primary-AFN': '#A65628', 'Secondary': '#3182BD'}

	    for types, Kcat in types_Kcat.items():
	        # Only the types that have a colour assigned are reported and drawn.
	        if types not in types_color:
	            continue

	        # The values are log10 Kcat, so the median is converted back for printing.
	        median_log = median(Kcat)
	        print('The median value of %s is %.2f' %(types, math.pow(10, median_log)))

	        ecdf = sm.distributions.ECDF(Kcat)
	        x = np.linspace(min(Kcat),max(Kcat),50000) # 10000
	        y = ecdf(x)

	        plt.plot(x,y,linewidth=0.75,label=types,color=types_color[types])

	        # https://blog.csdn.net/weixin_38314865/article/details/115173371?utm_medium=distribute.pc_relevant.none-task-blog-baidujs_baidulandingword-5&spm=1001.2101.3001.4242
	        # https://blog.csdn.net/dta0502/article/details/83827345
	        plt.axvline(x=median_log,ymin=0,ymax=0.5,linewidth=0.75,linestyle='--',color=types_color[types])

	    # Hand-placed legend entries, coloured like the curves.
	    plt.text(-5, 0.9, 'Primary-CE', fontweight ="normal", fontsize=6, color='#F781BF')
	    plt.text(-5, 0.8, 'Primary-AFN', fontweight ="normal", fontsize=6, color='#A65628')
	    plt.text(-5, 0.7, 'Secondary', fontweight ="normal", fontsize=6, color='#3182BD')
	    plt.text(-5, 0.6, 'Intermediate', fontweight ="normal", fontsize=6, color='#4DAF4A')

	    # Raw string: '\m' is not a valid escape sequence in a normal literal.
	    plt.xlabel(r'Predicted $k$$_\mathregular{cat}$ value', fontsize=7)
	    plt.ylabel('Cumulative distribution', fontsize=7)

	    plt.xticks([-6,-4,-2,0,2,4,6,8])

	    plt.xticks(fontsize=6)
	    plt.yticks(fontsize=6)

	    ax = plt.gca()
	    for side in ('bottom', 'left', 'top', 'right'):
	        ax.spines[side].set_linewidth(0.5)

	    plt.savefig("../../Results/figures/SuppleFig5c.pdf", dpi=400, bbox_inches='tight')


	if __name__ == '__main__':
	    # Script entry point: only the cumulative-distribution figure is
	    # produced. The other steps defined in this file (deeplearning,
	    # EC_subsystem, EC_Kcat, Kcat_subsystem, plot_subsystem_Kcat_counts)
	    # can be called here instead when needed.
	    plot_subsystem_distribution()

	# Results:
	# The median value in log10 of Primary-CE is 1.51
	# The median value in log10 of Intermediate is 0.64
	# The median value in log10 of Primary-AFN is 0.86
	# The median value in log10 of Secondary is 0.86

	# The median value of Primary-CE is 32.06
	# The median value of Intermediate is 4.33
	# The median value of Primary-AFN is 7.21
	# The median value of Secondary is 7.23