|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Features to extract from each EC file: turnover number (KCAT),
# specific activity (SA) and molecular weight (MW).
features_list = ['KCAT','SA', 'MW']

# Directory holding the parsed BRENDA EC files (tab-separated, one per EC number).
# NOTE(review): user-specific placeholder path — adjust before running.
input_path = '/Users/.../brenda_parser/EC_files'

# Directory where the per-feature 'max_<feature>.txt' summaries are written.
# NOTE(review): user-specific placeholder path — adjust before running.
output_path = '/Users/.../brenda_parser/max_data'
|
|
|
|
|
|
|
|
|
|
|
def sub_max_std(data):
    """Collapse raw 'substrate///org_string////value' rows into per-substrate,
    per-organism maximum values.

    Parameters
    ----------
    data : list of str
        Rows formatted as 'substrate///organism_info////value'
        (as produced by EC_string()).

    Returns
    -------
    tuple (substrates, org_strings, values)
        substrates  : list of substrate names
        org_strings : per-substrate lists of organism strings
        values      : per-substrate, per-organism maximum observed value
    """
    # Work on a sorted copy so the caller's list is not mutated
    # (the original sorted and appended a sentinel in place).
    rows = sorted(data)
    # Trailing empty-string sentinel kept for parity with the original
    # scanning logic (substrate_repetitions skips empty rows).
    rows.append('')

    substrates, org_strings_reps, values_reps, reps_indexes = substrate_repetitions(rows)
    org_strings, values = \
        find_in_substrate(substrates, org_strings_reps, values_reps, reps_indexes)

    # Reduce each organism's list of observations to its maximum.
    values = maximum_values(values)

    return (substrates, org_strings, values)
|
|
|
|
|
|
|
|
|
|
|
def maximum_values(values):
    """Replace each innermost observation list with its maximum, in place.

    Parameters
    ----------
    values : list of list of list of float
        values[i][j] holds all observations for substrate i / organism j.

    Returns
    -------
    The same nested list (mutated in place), with each values[i][j]
    collapsed to max(values[i][j]).
    """
    # enumerate() replaces the original list.index() lookups, which cost
    # O(n) per element and returned the wrong position whenever two
    # sibling sublists compared equal.
    for i, subs_values in enumerate(values):
        for j in range(len(subs_values)):
            values[i][j] = max(values[i][j])
    return (values)
|
|
|
|
|
def find_in_substrate(substrates, org_strings_reps, values_reps, reps_indexes):
    """Group the flat organism/value arrays by substrate and organism.

    Parameters
    ----------
    substrates       : list of substrate names (one entry per substrate).
    org_strings_reps : flat list of organism strings, one per raw row.
    values_reps      : flat list of float values, parallel to org_strings_reps.
    reps_indexes     : per-substrate lists of row indexes into the flat arrays.

    Returns
    -------
    tuple (org_strings, values)
        org_strings[i]   : unique organism strings seen for substrate i
        values[i][j]     : list of all values for substrate i / organism j
    """
    org_strings = []
    values = []
    for i in range(len(substrates)):
        subs_orgs = []
        subs_values = []
        for j in reps_indexes[i]:
            org = org_strings_reps[j]
            try:
                # Known organism for this substrate: append the value.
                org_index = subs_orgs.index(org)
                subs_values[org_index].append(values_reps[j])
            except ValueError:
                # First time this organism appears for this substrate.
                subs_orgs.append(org)
                subs_values.append([values_reps[j]])
        org_strings.append(subs_orgs)
        values.append(subs_values)

    return (org_strings, values)
|
|
|
|
|
|
|
|
|
|
|
def substrate_repetitions(data):
    """Scan raw 'substrate///org////value' rows and index them by substrate.

    Parameters
    ----------
    data : list of str
        Sorted rows; empty strings (the sentinel) are skipped.

    Returns
    -------
    tuple (substrates, org_strings_reps, values_reps, reps_indexes)
        substrates       : unique substrate names, in first-seen order
        org_strings_reps : organism string per non-empty row
        values_reps      : float value per non-empty row
        reps_indexes     : per-substrate lists of row positions in data
    """
    substrates = []
    org_strings_reps = []
    values_reps = []
    reps_indexes = []

    # enumerate() gives each row its true position; the original
    # data.index(row) returned the *first* occurrence, so duplicate rows
    # all mapped to the same index and their values were silently lost.
    for row_index, row in enumerate(data):
        if row != '':
            sep = row.find('///')       # substrate | organism boundary
            sep2 = row.find('////')     # organism | value boundary
            substrate = row[0:sep]

            try:
                subs_indx = substrates.index(substrate)
            except ValueError:
                # New substrate: register it and start its index list.
                substrates.append(substrate)
                subs_indx = len(substrates) - 1
                reps_indexes.append([])

            reps_indexes[subs_indx].append(row_index)
            org_strings_reps.append(row[sep + 3:sep2])
            values_reps.append(float(row[sep2 + 4:len(row)]))

    return (substrates, org_strings_reps, values_reps, reps_indexes)
|
|
|
|
|
|
|
|
|
|
|
|
|
def brenda_orgs_list(dir):
    """Collect the unique organism names (lower-cased) from BRENDA EC files.

    Parameters
    ----------
    dir : list of str
        File names/paths of the tab-separated EC files to scan.

    Returns
    -------
    list of str : unique organism names, in first-seen order.
    """
    brenda_orgs = []
    seen = set()  # O(1) membership test instead of list.count() per row
    for ec in dir:
        try:
            # 'with' guarantees the file handle is closed; the original
            # opened the file and never closed it.
            with open(ec, 'r') as fid:
                csv_fid = csv.reader(fid, delimiter='\t')
                for row in csv_fid:
                    if row != '' and row[0] != 'SEQ' and row[0] != '*':
                        org_name = row[1].lower()
                        if org_name not in seen:
                            seen.add(org_name)
                            brenda_orgs.append(org_name)
        except Exception:
            # Best-effort, as in the original: a malformed or unreadable
            # file is skipped rather than aborting the whole scan.
            pass
    return (brenda_orgs)
|
|
|
|
|
|
|
|
|
|
|
|
|
def KEGG_orgs_list():
    """Download the KEGG organism table and keep eukaryotes/prokaryotes.

    Returns
    -------
    tuple (KEGG_list, tax_kegg, codes)
        KEGG_list : organism names, lower-cased, '(strain...)' suffix removed
        tax_kegg  : matching lower-cased taxonomy/lineage strings
        codes     : matching KEGG organism codes
    """
    url = 'http://rest.kegg.jp/list/organism'

    try:
        query = urllib2.urlopen(url, timeout=20).read()
    except Exception:
        # Network failure: fall through with an empty table so callers
        # still get three (empty) lists.
        query = ''

    entries = query.split('\n')
    KEGG_list = []
    tax_kegg = []
    codes = []
    for row in entries:
        if row != '':
            row_list = row.split('\t')
            # KEGG rows are 'T-number<TAB>code<TAB>name<TAB>lineage'.
            # Require all four fields: the original only tested len > 1
            # and then indexed [2] and [3], crashing on short rows.
            if len(row_list) > 3:
                row_list = [row_list[2], row_list[3], row_list[1]]
                # Strip a trailing '(strain ...)' qualifier from the name.
                if row_list[0].find('(') != -1:
                    row_list[0] = row_list[0][0:row_list[0].find('(') - 1]

                taxonomy = row_list[1].lower()
                if taxonomy.find('eukaryotes') != -1 or taxonomy.find('prokaryotes') != -1:
                    KEGG_list.append(row_list[0].lower())
                    tax_kegg.append(taxonomy)
                    codes.append(row_list[2])
    return (KEGG_list, tax_kegg, codes)
|
|
|
|
|
|
|
|
|
|
|
|
|
def orgs_list(dir):
    """Map every BRENDA organism onto its KEGG taxonomy and organism code.

    Parameters
    ----------
    dir : list of str
        EC file names, forwarded to brenda_orgs_list().

    Returns
    -------
    tuple (organism_list, taxonomy, org_codes)
        Three aligned lists; organisms with no KEGG match (or the literal
        name '*') get '*' placeholders for taxonomy and code.
    """
    KEGG_orgs, info_KEGG, codes = KEGG_orgs_list()
    brenda_orgs = brenda_orgs_list(dir)

    organism_list = []
    taxonomy = []
    org_codes = []

    for B_org in brenda_orgs:
        matched = False
        if B_org != '*':
            # First KEGG name containing the BRENDA name wins
            # (substring match, same rule as the original while-loop).
            for i, K_org in enumerate(KEGG_orgs):
                if K_org.find(B_org) != -1:
                    matched = True
                    organism_list.append(B_org)
                    taxonomy.append(info_KEGG[i])
                    org_codes.append(codes[i])
                    break

        if not matched:
            # No KEGG hit: keep the organism with placeholder metadata.
            organism_list.append(B_org)
            taxonomy.append('*')
            org_codes.append('*')

    return (organism_list, taxonomy, org_codes)
|
|
|
|
|
|
|
|
|
|
|
|
|
def EC_string(csv_fid, feature_name):
    """Extract wild-type feature rows and pathway names from one EC file.

    Parameters
    ----------
    csv_fid : csv.reader
        Reader over a tab-separated BRENDA EC file.
    feature_name : str
        Feature to extract, e.g. 'KCAT', 'SA' or 'MW'.

    Returns
    -------
    tuple (data_string, ec_pathways)
        data_string : list of 'substrate///org//tax//code////value' rows
        ec_pathways : '///'-joined pathway names, or '*' if none found.

    NOTE: relies on the module-level organism_list / taxonomy /
    organism_code lists built by orgs_list().
    """
    data_string = []
    ec_pathways = ''
    for row in csv_fid:
        if row[0] != '':
            row[4] = row[4].lower()
            # Rows annotated as mutant/mutated enzymes are skipped below.
            mutant = max(row[4].find('mutant'), row[4].find('mutated'))

            if row[2] != '-999' and mutant == -1:
                # The 1e7 cap discards implausible outlier values.
                if row[0] == feature_name and float(row[2]) <= 1e7:
                    try:
                        org_index = organism_list.index(row[1].lower())
                        org_string = organism_list[org_index] + '//' + \
                            taxonomy[org_index] + '//' + organism_code[org_index]

                        data_string.append(row[3].lower() + '///' +
                                           org_string + '////' + row[2])
                    except ValueError:
                        # Only the failed .index() lookup is expected here;
                        # the original bare except hid unrelated errors too.
                        print('Organism not found in KEGG or BRENDA')

                if row[0] == 'PATH' and row[2].lower() != 'metabolic pathways':
                    if row[2].find('(engineered)') == -1:
                        ec_pathways = ec_pathways + row[2].lower() + '///'

    if ec_pathways == '' or ec_pathways == ' ' or ec_pathways == '\0':
        ec_pathways = '*'
    # Drop the trailing '///' separator left by the accumulation loop.
    if len(ec_pathways) > 3 and ec_pathways[-3] == '/':
        ec_pathways = ec_pathways[:-3]

    return (data_string, ec_pathways)
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main script: walk every EC file, extract the per-substrate / per-organism
# maxima for each feature and write one tab-separated 'max_<feature>.txt'
# file per feature into output_path.
# ---------------------------------------------------------------------------
import os
import csv
import urllib2
# NOTE(review): numpy was imported inside the per-file loop in the original
# but is never used in this file; kept (hoisted) in case something else
# depends on the import side effect — confirm and remove if not.
import numpy

prev_path = os.getcwd()
os.chdir(input_path)
dir_files = os.listdir(input_path)
dir_files.sort()

# Organism metadata consumed by EC_string() through module-level globals.
organism_list, taxonomy, organism_code = orgs_list(dir_files)

for feature_name in features_list:

    # Collect output rows in a list and join once at the end; the original
    # built the output with quadratic string concatenation.
    out_lines = []
    for ec in dir_files:
        ec_number = ec[0:len(ec) - 4]  # strip the 4-char file extension
        with open(ec, 'r') as fid:
            csv_fid = csv.reader(fid, delimiter='\t')
            data_string, ec_pathways = EC_string(csv_fid, feature_name)

        substrates, org_strings, max_values = sub_max_std(data_string)

        # enumerate() instead of list.index(): the original looked each
        # substrate/organism up by value, which is O(n) per item and
        # returns the wrong position when duplicates exist.
        for i, sub in enumerate(substrates):
            for j, org in enumerate(org_strings[i]):
                out_lines.append(ec_number + '\t' + sub + '\t' + org + '\t' +
                                 str(max_values[i][j]) + '\t' + ec_pathways + '\n')

        print('Processed file ' + ec + ' ' + feature_name)

    os.chdir(output_path)
    with open('max_' + feature_name + '.txt', 'w') as fid:
        fid.write(''.join(out_lines))
    os.chdir(input_path)

os.chdir(prev_path)
|
|