|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import re |
|
import json |
|
import requests |
|
import time |
|
from urllib import request |
|
from zeep import Client |
|
import hashlib |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def uniprot_sequence(id):
    """Download the FASTA record for one UniProt accession and return its sequence.

    Args:
        id: UniProt accession, interpolated into the ``.fasta`` URL. (The
            parameter name shadows the ``id`` builtin; it is kept so existing
            keyword callers keep working.)

    Returns:
        The sequence as one string (every line after the FASTA header joined
        together), or ``None`` when the request or decoding fails.
    """
    url = "https://www.uniprot.org/uniprot/%s.fasta" % id

    try:
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(url) as response:
            fasta_text = response.read().decode("utf-8").strip()
    except Exception:
        # Best-effort lookup: any failure is reported and mapped to None.
        # ``Exception`` (rather than a bare ``except``) lets Ctrl-C and
        # SystemExit stop a long batch download.
        print(id, "can not find from uniprot!")
        sequence = None
    else:
        # Drop the header line; the remaining lines are the wrapped sequence.
        sequence = "".join(fasta_text.split("\n")[1:])

    print(sequence)
    return sequence
|
|
|
def uniprotID_entry():
    """Fetch a sequence for every UniProt ID listed in the Kcat combination TSV.

    Reads column 6 (index 5) of each data row, splits space-separated IDs,
    de-duplicates them, looks each one up with ``uniprot_sequence`` and writes
    the ID -> sequence mapping of the successful lookups to
    ``uniprotID_entry.json``. IDs without a sequence are only printed.
    """
    with open("../../Data/database/Kcat_combination_0731.tsv", "r", encoding='utf-8') as tsv_file:
        # Skip the header row.
        rows = tsv_file.readlines()[1:]

    collected_ids = []
    for row in rows:
        id_field = row.strip().split('\t')[5]
        if id_field:
            # A field may hold several IDs separated by single spaces;
            # splitting a single ID simply yields a one-element list.
            collected_ids.extend(id_field.split(' '))

    found = {}
    missing = []
    for count, accession in enumerate(set(collected_ids), start=1):
        print(count)
        fetched = uniprot_sequence(accession)
        if fetched:
            found[accession] = fetched
        else:
            missing.append(accession)

    print(len(found))
    print(len(missing))
    print(missing)

    with open('../../Data/database/uniprotID_entry.json', 'w') as outfile:
        json.dump(found, outfile, indent=4)
|
|
|
def uniprotID_noseq():
    """Add sequences for a fixed list of IDs using substitute accessions.

    Loads ``uniprotID_entry.json``, fetches the sequence of each substitute
    accession with ``uniprot_sequence`` and stores it under the original ID,
    then writes the merged mapping to ``uniprotID_entry_all.json``.
    """
    with open('../../Data/database/uniprotID_entry.json', 'r') as infile:
        id_to_sequence = json.load(infile)

    print(len(id_to_sequence))

    # original ID -> accession that is actually queried.
    # NOTE(review): presumably these are IDs whose own lookup failed and the
    # values are their current replacements - confirm against UniProt.
    substitute_accessions = {
        'P0A5R0': 'P9WIL4', 'P0C5C1': 'P9WKD2', 'P51698': 'A0A1L5BTC1',
        'P96807': 'P9WNP2', 'Q01745': 'I1S2N3', 'P00892': 'P0DP89',
        'Q02469': 'P0C278', 'P96223': 'P9WNF8', 'P0A4Z2': 'P9WPY2',
        'P0A4X4': 'P9WQ86', 'P96420': 'P9WQB2', 'Q47741': 'F2MMN9',
        'O05783': 'P9WIQ2', 'P0A4X6': 'P9WQ80', 'P56967': 'F2MMP0',
        'O60344': 'P0DPD6', 'P04804': 'P60906', 'O52310': 'P0CL72',
    }

    for original_id in substitute_accessions:
        fetched = uniprot_sequence(substitute_accessions[original_id])
        print(original_id)
        print(fetched)
        if not fetched:
            print('No sequence found!---------------------------')
            continue
        id_to_sequence[original_id] = fetched

    print(len(id_to_sequence))

    with open('../../Data/database/uniprotID_entry_all.json', 'w') as outfile:
        json.dump(id_to_sequence, outfile, indent=4)
|
|
|
|
|
|
|
|
|
|
|
def seq_by_ec_organism(ec, organism):
    """Query UniProt for reviewed sequences matching an EC number and organism.

    Args:
        ec: EC number inserted into the ``ec:`` query term.
        organism: Organism name inserted into the ``organism:`` query term.

    Returns:
        A list with one sequence string per FASTA record in the response, or
        ``None`` when the request fails or the response body does not start
        with a FASTA header (no hits, error page, ...).
    """
    entry_key = ec + '&' + organism
    params = {"query": "ec:%s AND organism:%s AND reviewed:yes" % (ec, organism), "format": "fasta"}

    try:
        # The request lives inside the try block so a network failure marks
        # this entry as not found instead of aborting the caller's batch.
        # The timeout keeps a stalled connection from hanging forever.
        response = requests.get("http://www.uniprot.org/uniprot/", params=params, timeout=60)
        respdata = response.text
    except requests.RequestException:
        respdata = ''

    if not respdata.startswith('>'):
        # Anything that is not FASTA is treated as "no sequence available".
        print(entry_key, "can not find from uniprot!")
        print(None)
        return None

    # Header line -> concatenated sequence lines. A repeated header restarts
    # its record, so only the last occurrence is kept.
    records = dict()
    header = None
    for line in respdata.split('\n'):
        if line.startswith('>'):
            header = line
            records[header] = ''
        else:
            records[header] += line.strip()

    sequences = list(records.values())
    print(sequences)
    return sequences
|
|
|
|
|
def seq_by_brenda(ec, organism):
    """Fetch sequences for an EC number and organism from the BRENDA SOAP service.

    The e-mail and password below are placeholders and must be replaced with
    real BRENDA credentials before this function can succeed. The password is
    sent as its SHA-256 hex digest.

    Args:
        ec: EC number placed in the ``ecNumber*`` query field.
        organism: Organism name placed in the ``organism*`` query field.

    Returns:
        A list with the ``'sequence'`` value of every returned entry; empty
        when the service returns nothing.
    """
    email = 'youremail'
    password = 'yourpassword'

    wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
    password_digest = hashlib.sha256(password.encode("utf-8")).hexdigest()
    client = Client(wsdl)

    # Positional arguments of getSequence: credentials first, then the
    # "field*value" filter strings (an empty value leaves a field unfiltered -
    # TODO confirm against the BRENDA SOAP documentation).
    query = (
        email,
        password_digest,
        "ecNumber*%s" % ec,
        "organism*%s" % organism,
        "sequence*",
        "noOfAminoAcids*",
        "firstAccessionCode*",
        "source*Swiss-Prot",
        "id*",
    )
    entries = client.service.getSequence(*query)

    sequences = [entry['sequence'] for entry in entries] if entries else []

    print(sequences)
    print(len(sequences))
    return sequences
|
|
|
def nouniprotID_entry_uniprot():
    """Look up UniProt sequences for TSV rows that carry no UniProt ID.

    Collects the unique (EC number, organism) pairs of all rows whose ID
    column is empty, queries each pair with ``seq_by_ec_organism`` and writes
    the ``"EC&organism" -> result`` mapping to ``nouniprotID_entry_all.json``.
    """
    with open("../../Data/database/Kcat_combination_0731.tsv", "r", encoding='utf-8') as tsv_file:
        # Skip the header row.
        rows = tsv_file.readlines()[1:]

    pairs = []
    for row in rows:
        fields = row.strip().split('\t')
        if not fields[5]:
            pairs.append((fields[0], fields[2]))

    results = {}
    for count, (ec, organism) in enumerate(set(pairs), start=1):
        print('This is', str(count)+'------------')
        results[ec+'&'+organism] = seq_by_ec_organism(ec, organism)

        # Pause after every tenth query.
        if count % 10 == 0:
            time.sleep(3)

    with open('../../Data/database/nouniprotID_entry_all.json', 'w') as outfile:
        json.dump(results, outfile, indent=4)
|
|
|
|
|
def nouniprotID_entry_brenda():
    """Look up BRENDA sequences for TSV rows that carry no UniProt ID.

    Collects the unique (EC number, organism) pairs of all rows whose ID
    column is empty, queries each pair with ``seq_by_brenda`` and writes the
    ``"EC&organism" -> sequences`` mapping to ``nouniprotID_entry_brenda.json``.
    """
    # Read the TSV as UTF-8 like every other reader of this file, so the
    # "EC&organism" keys do not depend on the platform's default encoding.
    with open("../../Data/database/Kcat_combination_0731.tsv", "r", encoding='utf-8') as file:
        # Skip the header row.
        combination_lines = file.readlines()[1:]

    entries = list()
    for line in combination_lines:
        data = line.strip().split('\t')
        ec = data[0]
        organism = data[2]
        uniprotID = data[5]

        if not uniprotID:
            entries.append((ec, organism))

    IdSeq = dict()
    for i, (ec, organism) in enumerate(set(entries), start=1):
        print('This is', str(i)+'------------')
        IdSeq[ec+'&'+organism] = seq_by_brenda(ec, organism)

    with open('../../Data/database/nouniprotID_entry_brenda.json', 'w') as outfile:
        json.dump(IdSeq, outfile, indent=4)
|
|
|
def _apply_point_mutations(sequence, mutant_sites):
    """Apply substitutions such as ``'K2A'`` to *sequence*, one after another.

    Each site is ``<expected residue><1-based position><new residue>``.
    Returns the mutated sequence, or ``''`` when any site cannot be parsed,
    lies outside the sequence, or does not find the expected residue there.
    """
    if not sequence:
        return ''

    residues = list(sequence)
    for site in mutant_sites:
        try:
            position = int(site[1:-1])
        except ValueError:
            # Not a simple substitution (e.g. free-text enzyme description).
            return ''
        # Reject 0 and negative positions explicitly: Python would otherwise
        # index from the end of the sequence and mutate the wrong residue.
        if not 1 <= position <= len(residues):
            return ''
        if residues[position - 1] != site[0]:
            return ''
        residues[position - 1] = site[-1]
    return ''.join(residues)


def combine_sequence():
    """Attach a protein sequence to each Kcat record and write the result as JSON.

    Inputs (all under ``../../Data/database/``):
        * ``uniprotID_entry_all.json``: UniProt ID -> sequence.
        * ``nouniprotID_entry_all.json``: "EC&organism" -> list of sequences
          (or null), used for rows without a UniProt ID.
        * ``Kcat_combination_0731.tsv``: columns EC number, enzyme type,
          organism, SMILES, substrate, UniProt ID, value, unit.

    A row is kept only when exactly one sequence can be assigned to it:
        * with a UniProt ID: the ID must be known; if the field holds several
          space-separated IDs, the first two must map to the same sequence;
        * without one: the EC&organism lookup must hold exactly one sequence.

    Rows whose enzyme type is ``'wildtype'`` keep the sequence unchanged. For
    all other rows the enzyme type is read as ``/``-separated point mutations
    and the row is dropped if any of them cannot be applied.

    Output: ``Kcat_combination_0918_wildtype_mutant.json``, a list of entry
    dicts. The number of wildtype entries and the total number of entries
    are printed.
    """
    with open('../../Data/database/uniprotID_entry_all.json', 'r') as file1:
        uniprot_file1 = json.load(file1)

    with open('../../Data/database/nouniprotID_entry_all.json', 'r') as file2:
        nouniprot_file2 = json.load(file2)

    with open("../../Data/database/Kcat_combination_0731.tsv", "r", encoding='utf-8') as file4:
        # Skip the header row.
        Kcat_lines = file4.readlines()[1:]

    wildtype_count = 0
    duplicate_position_count = 0
    entries = list()

    for line in Kcat_lines:
        data = line.strip().split('\t')
        ECNumber, EnzymeType, Organism, Smiles = data[0], data[1], data[2], data[3]
        Substrate, UniprotID, Value, Unit = data[4], data[5], data[6], data[7]

        if UniprotID:
            ids = UniprotID.split(' ')
            try:
                sequence = uniprot_file1[ids[0]]
                # Only the first two IDs are compared; a row whose IDs map to
                # different sequences is ambiguous and is dropped.
                if len(ids) > 1 and uniprot_file1[ids[1]] != sequence:
                    continue
            except KeyError:
                # A few IDs have no downloaded sequence.
                continue
        else:
            candidates = nouniprot_file2.get(ECNumber + '&' + Organism)
            if not candidates or len(candidates) != 1:
                continue
            sequence = candidates[0]

        if EnzymeType == 'wildtype':
            wildtype_count += 1
            entry_type = 'wildtype'
        else:
            mutantSites = EnzymeType.split('/')

            # Diagnostic only: report enzyme types that list the same
            # position more than once.
            positions = [mutantSite[1:-1] for mutantSite in mutantSites]
            if len(positions) != len(set(positions)):
                print([positions, mutantSites])
                duplicate_position_count += 1
                print(str(duplicate_position_count) + '---------------------------')

            sequence = _apply_point_mutations(sequence, mutantSites)
            if not sequence:
                continue
            entry_type = 'mutant'

        entries.append({
            'ECNumber': ECNumber,
            'Organism': Organism,
            'Smiles': Smiles,
            'Substrate': Substrate,
            'Sequence': sequence,
            'Type': entry_type,
            'Value': Value,
            'Unit': Unit,
        })

    print(wildtype_count)
    print(len(entries))

    with open('../../Data/database/Kcat_combination_0918_wildtype_mutant.json', 'w') as outfile:
        json.dump(entries, outfile, indent=4)
|
|
|
def check_substrate_seq():
    """Print how many distinct substrates, sequences, organisms and EC numbers a dataset holds.

    Reads ``Kcat_combination_0918.json`` (a list of entry dicts). Substrate
    and organism names are compared case-insensitively.
    """
    # NOTE(review): combine_sequence() writes
    # 'Kcat_combination_0918_wildtype_mutant.json', not this file - confirm
    # that this is the intended input.
    with open('../../Data/database/Kcat_combination_0918.json', 'r') as file:
        records = json.load(file)

    substrate_count = len({record['Substrate'].lower() for record in records})
    sequence_count = len({record['Sequence'] for record in records})
    organism_count = len({record['Organism'].lower() for record in records})
    ec_count = len({record['ECNumber'] for record in records})

    print('The number of unique substrate:', substrate_count)
    print('The number of unique sequence:', sequence_count)
    print('The number of unique organism:', organism_count)
    print('The number of unique EC Number:', ec_count)
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the combined dataset, then print summary
    # counts. The stages run strictly in this order.
    for stage in (combine_sequence, check_substrate_seq):
        stage()
|
|
|
|
|
|