#!/usr/bin/python
################################################################################
# findMaxKvalues_AllOrgs
# Reads all EC files and finds the maximum value of each enzymatic parameter
# [Kcat, SA, MW] for every substrate/organism pair.
# For each parameter, writes a table with the following columns:
# * EC number
# * substrate
# * organism name//taxonomy//KEGG code (according to KEGG)
# * Max value
# * metabolic pathways
# Benjamin Sanchez. Last edited: 2015-08-26
# Ivan Domenzain. Last edited: 2018-04-10
################################################################################
# Updated by:
# Author: LE YUAN
# This code should be run in a Python 2.7 environment (it uses urllib2 and
# Python 2 print statements)
#INPUTS:
#1) Enzymatic parameters
features_list = ['KCAT','SA', 'MW']
#2) Path in which the EC files are stored (from script createECfiles.py):
input_path = '/Users/.../brenda_parser/EC_files'
#3) Path in which you wish to store the final table:
output_path = '/Users/.../brenda_parser/max_data'
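#Illustrative usage (a sketch; the '...' placeholders above must first be
#replaced with real directories, they are intentionally left unfilled here):
#   python findMaxKvalues_AllOrgs.py
#For each entry of features_list one tab-separated file is written to
#output_path, e.g. max_KCAT.txt, with rows such as (hypothetical):
#   1.1.1.1\tethanol\tsaccharomyces cerevisiae//eukaryotes;fungi...//sce\t340.0\tglycolysis / gluconeogenesis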
################################################################################
#sub_max_std: Receives a list of 'substrate///organism_info////value' strings,
#returns 3 lists: substrates - organism strings - maximum value per organism
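#A minimal illustration (hypothetical values) of the input row format, matching
#the strings built by EC_string further below:
#   'atp///escherichia coli//prokaryotes;bacteria//eco////125.0'
#i.e. substrate, '///', organism//taxonomy//KEGG code, '////', parameter value.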
def sub_max_std(data):
    #Sorts the list and appends a terminating empty line:
    data.sort()
    data.append('')
    #For every substrate, gets the indexes of all its appearances in the rows
    #of the EC data
    substrates, org_strings_reps, values_reps, reps_indexes = substrate_repetitions(data)
    org_strings, values =\
        find_in_substrate(substrates, org_strings_reps, values_reps, reps_indexes)
    #Get maximum K values
    values = maximum_values(values)
    return (substrates,org_strings,values)
################################################################################
#maximum_values: Gets the maximum K value for each organism related
#to each substrate
def maximum_values(values):
    #enumerate avoids the index() lookups, which return the wrong position
    #whenever two substrates (or organisms) share identical value lists
    for i, subs_values in enumerate(values):
        for j, org_values in enumerate(subs_values):
            values[i][j] = max(org_values)
    return(values)
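#Worked example (hypothetical numbers):
#   maximum_values([[[1.0, 2.5], [3.0]]])  ->  [[2.5, 3.0]]
#i.e. each organism's list of values is collapsed to its maximum.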
################################################################################
#find_in_substrate: Finds the organisms and K values related to each substrate
def find_in_substrate(substrates, org_strings_reps, values_reps, reps_indexes):
    org_strings = []
    values = []
    for i in range(len(substrates)):
        subs_orgs = []
        subs_values = []
        for j in reps_indexes[i]:
            try:
                #Organism already seen for this substrate: append the value
                org_index = subs_orgs.index(org_strings_reps[j])
                subs_values[org_index].append(values_reps[j])
            except ValueError:
                #New organism for this substrate: start a new value list
                subs_orgs.append(org_strings_reps[j])
                subs_values.append([values_reps[j]])
        org_strings.append(subs_orgs)
        values.append(subs_values)
    return(org_strings, values)
################################################################################
#substrate_repetitions: Groups the rows of an EC file by substrate, recording
#the organism string, K value and row index of every appearance of a substrate
def substrate_repetitions(data):
    substrates = []
    org_strings_reps = []
    values_reps = []
    reps_indexes = []
    #enumerate is used instead of data.index(row) so that duplicated rows keep
    #their own index
    for row_index, row in enumerate(data):
        if row != '':
            #Gets the index of the substrate if it was already seen
            try:
                subs_indx = substrates.index(row[0:row.find('///')])
            #If it is a new substrate
            except ValueError:
                substrates.append(row[0:row.find('///')])
                subs_indx = substrates.index(row[0:row.find('///')])
                reps_indexes.append([])
            #List with the indexes of the rows in which each substrate appears
            reps_indexes[subs_indx].append(row_index)
            #Organism related to each appearance
            org_strings_reps.append(row[row.find('///')+3:row.find('////')])
            #Value found for each appearance
            values_reps.append(float(row[row.find('////')+4:len(row)]))
    return(substrates, org_strings_reps, values_reps, reps_indexes)
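#Example (hypothetical rows): for
#   data == ['atp///e. coli//prok//eco////1.0',
#            'atp///e. coli//prok//eco////2.0']
#substrate_repetitions returns
#   substrates == ['atp'], org_strings_reps == ['e. coli//prok//eco']*2,
#   values_reps == [1.0, 2.0], reps_indexes == [[0, 1]]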
################################################################################
#brenda_orgs_list: Finds all the organism names for which data is available in
#the EC files parsed from BRENDA. Returns a list with all unique names found.
def brenda_orgs_list(dir):
    brenda_orgs=[]
    for ec in dir:
        fid = open(ec,'r')
        csv_fid = csv.reader(fid,delimiter='\t')
        try:
            for row in csv_fid:
                #Skip short rows and the sequence/placeholder entries
                if len(row) > 1 and row[0] != 'SEQ' and row[0] != '*':
                    #Uncomment if you want to exclude any name longer than two
                    #words (mutants, for example, but not exclusively):
                    #second_blank = row[1].find(' ',row[1].find(' ')+1)
                    #if second_blank == -1:
                    org_name = row[1].lower()
                    #else:
                    #    org_name=row[1][0:second_blank]
                    if brenda_orgs.count(org_name)==0:
                        brenda_orgs.append(org_name)
        except:
            pass
        fid.close()
    return (brenda_orgs)
################################################################################
#KEGG_orgs_list: Creates a list with all the organisms available at KEGG.
#Returns three parallel lists: organism names, taxonomy and KEGG organism codes.
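#For reference, each line returned by http://rest.kegg.jp/list/organism is
#tab-separated; an illustrative (not exact) record:
#   T01001\thsa\tHomo sapiens (human)\tEukaryotes;Animals;Vertebrates;Mammals
#The code below keeps the name (3rd field), lineage (4th field) and organism
#code (2nd field), and only entries whose lineage mentions eukaryotes or
#prokaryotes.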
def KEGG_orgs_list():
    #URL that returns the full list of organisms available on KEGG
    url = 'http://rest.kegg.jp/list/organism'
    #Try/except to avoid failing on request timeouts
    try:
        query = urllib2.urlopen(url, timeout=20).read()
    except:
        query=''
    entries = query.split('\n')
    KEGG_list = []
    tax_kegg = []
    codes = []
    for row in entries:
        if row != '':
            row_list = row.split('\t')
            if len(row_list)>1:
                #Reorder the fields as [name, taxonomy, KEGG code]
                row_list=[row_list[2],row_list[3],row_list[1]]
                #Strip the parenthesised common name, e.g. '(human)'
                if row_list[0].find('(') != -1:
                    row_list[0]=row_list[0][0:row_list[0].find('(')-1]
                #Saves only organisms with specific taxonomic classifications
                taxonomy=row_list[1].lower()
                if taxonomy.find('eukaryotes')!= -1 or taxonomy.find('prokaryotes')!= -1:
                    KEGG_list.append(row_list[0].lower())
                    tax_kegg.append(taxonomy)
                    codes.append(row_list[2])
    return(KEGG_list, tax_kegg, codes)
################################################################################
#orgs_list: Merges the BRENDA and KEGG organism lists. Organisms not found in
#KEGG are kept, with '*' as placeholder for their taxonomy and KEGG code.
def orgs_list(dir):
    KEGG_orgs, info_KEGG, codes = KEGG_orgs_list()
    brenda_orgs = brenda_orgs_list(dir)
    #print brenda_orgs
    organism_list = []
    taxonomy = []
    org_codes = []
    #i=0
    counter=0
    for B_org in brenda_orgs:
        flag = False
        if B_org != '*':
            i=0
            while (i < len(KEGG_orgs) and flag == False):
                K_org = KEGG_orgs[i]
                if K_org.find(B_org)!= -1:
                    flag = True
                    counter = counter+1
                    organism_list.append(B_org)
                    taxonomy.append(info_KEGG[i])
                    org_codes.append(codes[i])
                i = i +1
            if flag == False:
                organism_list.append(B_org)
                taxonomy.append('*')
                org_codes.append('*')
    #for B_org in brenda_orgs:
    #    if KEGG_orgs.count(B_org) != 0:
    #        counter=counter+1
    #        i=KEGG_orgs.index(B_org)
    #        organism_list.append(B_org)
    #        taxonomy.append(info_KEGG[i])
    #        org_codes.append(codes[i])
    #    else:
    #        organism_list.append(B_org)
    #        taxonomy.append('*')
    #        org_codes.append('*')
    return(organism_list,taxonomy,org_codes)
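#Sketch of the expected return value (hypothetical entries): three parallel lists,
#   organism_list == ['escherichia coli', 'some unmatched organism']
#   taxonomy      == ['prokaryotes;bacteria;...', '*']
#   org_codes     == ['eco', '*']
#where '*' marks BRENDA organisms that could not be matched to a KEGG entry.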
################################################################################
#EC_string: Receives the rows of an EC file and builds a list of
#'substrate///organism////value' strings, plus the associated metabolic pathways
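#The EC files written by createECfiles.py are assumed here (inferred from the
#indexing below, not from a format specification) to be tab-separated with
#columns along the lines of:
#   row[0] parameter tag ('KCAT', 'SA', 'MW', 'PATH', 'SEQ', ...)
#   row[1] organism name
#   row[2] value (or pathway name on 'PATH' rows)
#   row[3] substrate
#   row[4] commentary (searched for 'mutant'/'mutated')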
def EC_string(csv_fid, feature_name):
    data_string = []
    ec_pathways = ''
    for row in csv_fid:
        if row[0] != '':
            row[4] = row[4].lower()
            mutant = max(row[4].find('mutant'),row[4].find('mutated'))
            #Ignore invalid values and mutant enzymes:
            if row[2] != '-999' and mutant == -1:
                #Only allow Kcats <= 1e7 [Bar-Even et al. 2011]
                if row[0] == feature_name and float(row[2]) <= 1e7:
                    #Looks for the organism in the merged organism list
                    #in order to include taxonomic info if available
                    try:
                        org_index = organism_list.index(row[1].lower())
                        org_string = organism_list[org_index]+'//'+\
                                     taxonomy[org_index]+ '//'+ organism_code[org_index]
                        data_string.append(row[3].lower() + '///' +\
                                           org_string + '////' + row[2])
                    except ValueError:
                        print 'Organism not found in KEGG or BRENDA'
                #Gets the non-engineered pathways associated with the
                #EC number, if present
                if row[0] == 'PATH' and row[2].lower() != 'metabolic pathways':
                    if row[2].find('(engineered)') == -1:
                        ec_pathways = ec_pathways + row[2].lower() + '///'
    #If no pathway was found an asterisk is added to the field
    if ec_pathways == '' or ec_pathways == ' ' or ec_pathways == '\0':
        ec_pathways = '*'
    #Remove the trailing '///' delimiter
    if len(ec_pathways) > 3 and ec_pathways[-3] == '/':
        ec_pathways = ec_pathways[:-3]
    return(data_string, ec_pathways)
################################################################################
#Main Script
#Read all EC file names:
import os
import csv
import urllib2
import numpy

prev_path = os.getcwd()
os.chdir(input_path)
dir_files = os.listdir(input_path)
dir_files.sort()
organism_list,taxonomy,organism_code = orgs_list(dir_files)
for feature_name in features_list:
    #Main loop:
    output = ''
    for ec in dir_files:
        #Drop the 4-character file extension to recover the EC number
        ec_number = ec[0:len(ec)-4]
        fid = open(ec,'r')
        csv_fid = csv.reader(fid,delimiter='\t')
        #Builds the substrate/organism/value strings for the EC file
        data_string, ec_pathways = EC_string(csv_fid, feature_name)
        fid.close()
        substrates,org_strings,max_values = sub_max_std(data_string)
        for i, sub in enumerate(substrates):
            for j, org in enumerate(org_strings[i]):
                output = output+ec_number+'\t'+sub+'\t'+org+'\t'+str(max_values[i][j])+'\t'+ ec_pathways+'\n'
        print 'Processed file ' + ec + ' ' + feature_name
    #Write output:
    os.chdir(output_path)
    fid = open('max_' + feature_name + '.txt','w')
    fid.write(output)
    fid.close()
    os.chdir(input_path)

os.chdir(prev_path)
################################################################################