|
|
|
|
|
|
|
|
|
|
|
import os |
|
import csv |
|
import math |
|
import xlrd |
|
import pickle |
|
import numpy as np |
|
import pandas as pd |
|
from rdkit import Chem |
|
from Bio import SeqIO |
|
from collections import defaultdict |
|
from scipy import stats |
|
from scipy.stats import ranksums |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
import matplotlib.pyplot as plt |
|
from matplotlib import rc |
|
|
|
|
|
def getIndex(index_path="../../Data/directory/to/orthomcl_SeqIDs_index.txt"):
    """Load the OrthoMCL sequence-ID index file, keyed by the first field.

    Every non-blank line is stripped and split on ``": "``. The text before
    the separator becomes the key and the text after it the value (judging
    by the variable names: index -> sequence ID).

    Args:
        index_path: Location of the index file. Defaults to the path this
            script has always read, so ``getIndex()`` behaves as before.

    Returns:
        dict mapping the first field of each line to its second field.

    Raises:
        IndexError: If a non-blank line does not contain ``": "``.
    """
    indexSeqId = dict()

    with open(index_path, "r") as indexFile:
        # Stream the file instead of materialising every line first.
        for raw_line in indexFile:
            entry = raw_line.strip()
            if not entry:
                # Blank lines (e.g. a trailing newline) carry no mapping.
                continue
            index_Seq = entry.split(": ")
            indexSeqId[index_Seq[0]] = index_Seq[1]

    return indexSeqId
|
|
|
def getIndex2(index_path="../../Data/directory/to/orthomcl_SeqIDs_index.txt"):
    """Load the OrthoMCL sequence-ID index file, keyed by the second field.

    Reads the same ``"<first>: <second>"`` lines as ``getIndex`` but builds
    the inverse mapping: the text after ``": "`` is the key and the text
    before it the value (judging by the variable names: sequence ID -> index).

    Args:
        index_path: Location of the index file. Defaults to the path this
            script has always read, so ``getIndex2()`` behaves as before.

    Returns:
        dict mapping the second field of each line to its first field.

    Raises:
        IndexError: If a non-blank line does not contain ``": "``.
    """
    indexSeqId = dict()

    with open(index_path, "r") as indexFile:
        # Stream the file instead of materialising every line first.
        for raw_line in indexFile:
            entry = raw_line.strip()
            if not entry:
                # Blank lines (e.g. a trailing newline) carry no mapping.
                continue
            index_Seq = entry.split(": ")
            indexSeqId[index_Seq[1]] = index_Seq[0]

    return indexSeqId
|
|
|
def getOrthologIndex(clusters_path="../../Data/directory/to/orthomcl_clusters.txt"):
    """Load the OrthoMCL cluster file as ``{cluster name: [member, ...]}``.

    Each non-blank line is stripped and split on single spaces. The first
    token, minus its last character (presumably a trailing ``':'``), is the
    cluster name; the remaining tokens are its members.

    Args:
        clusters_path: Location of the cluster file. Defaults to the path
            this script has always read, so existing calls behave as before.

    Returns:
        dict mapping each cluster name to the list of its member tokens.
    """
    orthologIndex = dict()

    with open(clusters_path, "r") as orthologFile:
        for raw_line in orthologFile:
            tokens = raw_line.strip().split(" ")
            if tokens == [""]:
                # A blank line used to add a meaningless '' -> [] entry.
                continue
            # Drop the final character of the first token to get the name.
            orthologIndex[tokens[0][:-1]] = tokens[1:]

    return orthologIndex
|
|
|
def getOrthologIndex2(clusters_path="../../Data/directory/to/orthomcl_clusters.txt"):
    """Load the OrthoMCL cluster file as ``{member: cluster name}``.

    Each line is stripped and split on single spaces. The first token,
    minus its last character (presumably a trailing ``':'``), is the cluster
    name; every remaining token is mapped to that name. If a member appears
    in several lines, the last line wins.

    Args:
        clusters_path: Location of the cluster file. Defaults to the path
            this script has always read, so existing calls behave as before.

    Returns:
        dict mapping each member token to the name of its cluster.
    """
    orthologIndex = dict()

    with open(clusters_path, "r") as orthologFile:
        # Stream the file; a blank line has no member tokens and adds nothing.
        for raw_line in orthologFile:
            tokens = raw_line.strip().split(" ")
            cluster_name = tokens[0][:-1]
            for member in tokens[1:]:
                orthologIndex[member] = cluster_name

    return orthologIndex
|
|
|
def get_organisms(result_dir='../../Data/MLKCATRESULT/'):
    """List the organism names that have a ``.txt`` result file.

    The organism name is the part of each ``.txt`` file name that precedes
    the first ``'ForKcat'``. The number of names found is printed.

    Args:
        result_dir: Directory to scan. Defaults to the directory this script
            has always read, so ``get_organisms()`` behaves as before.

    Returns:
        Sorted list of organism names. ``os.listdir`` returns entries in
        arbitrary order, so sorting keeps the result reproducible.
    """
    organisms = sorted(
        filename.split('ForKcat')[0]
        for filename in os.listdir(result_dir)
        if filename.endswith('.txt')
    )
    print(len(organisms))

    return organisms
|
|
|
def getDNDS_all(dnds_path='../../Data/gene_dn_ds_03_02.csv'):
    """Load per-ortholog dN/dS scores from a comma-separated file.

    The first line is treated as a header and skipped. For every other row,
    the second column up to its first ``'.'`` is the key and the third
    column, converted to ``float``, is the value. Rows whose third column is
    empty or missing are ignored.

    Args:
        dnds_path: Location of the CSV file. Defaults to the path this
            script has always read, so ``getDNDS_all()`` behaves as before.

    Returns:
        dict mapping the ortholog key to its dN/dS score as a float.

    Raises:
        ValueError: If a non-empty third column is not a valid float.
    """
    dnds_dict = dict()

    with open(dnds_path, 'r') as infile:
        next(infile, None)  # skip the header row
        for line in infile:
            data = line.strip().split(',')
            # Short rows (e.g. a trailing blank line) used to raise
            # IndexError; treat them like rows without a score.
            if len(data) < 3 or not data[2]:
                continue
            OG_line = data[1].split('.')[0]
            dnds_dict[OG_line] = float(data[2])

    return dnds_dict
|
|
|
def median(lst):
    """Return the median of the values in ``lst``.

    For an odd number of values this is the middle element of the sorted
    values. For an even number it is the mean of the two middle elements,
    returned as a float.

    Raises:
        IndexError: If ``lst`` is empty.
    """
    ordered = sorted(lst)
    count = len(lst)
    lower_mid = (count - 1) // 2

    if count % 2 == 0:
        # Even length: average the two central values.
        return (ordered[lower_mid] + ordered[lower_mid + 1]) / 2.0
    return ordered[lower_mid]
|
|
|
def species_clade(clade_path="../../../BayesianApproach/Data/343_phenotype_clade.tsv"):
    """Load the species -> clade table from a tab-separated file.

    The first line is treated as a header and skipped. In every other
    non-blank line the first column is the species and the second column the
    clade. The last three species and the last three clades are printed.

    Args:
        clade_path: Location of the TSV file. Defaults to the path this
            script has always read, so ``species_clade()`` behaves as before.

    Returns:
        dict mapping each species to its clade. If a species is listed more
        than once, the last row wins.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    species = list()
    clade = list()

    with open(clade_path, 'r') as infile:
        next(infile, None)  # skip the header row
        for line in infile:
            data = line.strip().split('\t')
            if data == ['']:
                # Blank lines (e.g. a trailing newline) used to raise IndexError.
                continue
            species.append(data[0])
            clade.append(data[1])

    print(species[-3:])
    print(clade[-3:])

    # Named so that it does not shadow this function's own name.
    clade_by_species = dict(zip(species, clade))

    return clade_by_species
|
|
|
# Category labels used both as the 'type' column values and as legend labels.
_LOW_DNDS = 'dN/dS <= 0.15'
_HIGH_DNDS = 'dN/dS > 0.15'


def _max_kcat_per_sequence(line):
    """Parse one prediction row into ``{sequence ID: largest kcat}``.

    The row is tab-separated. Column index 5 holds ';'-separated sequence
    IDs. The last column holds ';'-separated entries, each a ','-separated
    list indexed by the position of the sequence ID. Empty entries and values
    equal to ``'#'`` are skipped.
    """
    fields = line.strip('\n').split('\t')
    seq_ids = fields[5].split(';')
    # Split each non-empty entry once instead of once per sequence ID.
    kcat_lists = [entry.split(',') for entry in fields[-1].split(';') if entry]

    kcats_by_seq = dict()
    for position, seq_id in enumerate(seq_ids):
        for kcats in kcat_lists:
            if kcats[position] != '#':
                kcats_by_seq.setdefault(seq_id, []).append(float(kcats[position]))

    return {seq_id: max(kcats) for seq_id, kcats in kcats_by_seq.items()}


def _collect_kcat_records(organisms, species_clades, clade_order,
                          seq_to_index, index_to_ortholog, ortholog_dnds):
    """Build the 'type' / 'clade' / 'Kcat_value' columns for the box plot.

    For every organism whose clade is in ``clade_order``, its prediction file
    is read. Each distinct (sequence ID, max kcat) pair becomes one record
    holding the dN/dS category of the sequence's ortholog, the clade's rank,
    and log10 of the kcat.
    """
    records = {'type': [], 'clade': [], 'Kcat_value': []}

    for organism in organisms:
        clade = species_clades[organism.lower()]
        if clade not in clade_order:
            continue

        seq_kcat = set()  # a set drops duplicated (sequence, kcat) pairs
        with open('../prediction/343species_0115/%s_PredictionResults.txt' % organism, 'r') as infile:
            next(infile, None)  # skip the header row
            for line in infile:
                seq_kcat.update(_max_kcat_per_sequence(line).items())

        for seq_id, max_value in seq_kcat:
            ortholog = index_to_ortholog[seq_to_index[seq_id]]
            try:
                dnds = float(ortholog_dnds[ortholog])
                kcat_value = math.log10(max_value)
            except (KeyError, ValueError):
                # No dN/dS score for this ortholog, or a non-positive kcat
                # whose log10 is undefined: leave the record out.
                continue

            # NOTE(review): scores <= 0 fall into the '> 0.15' category, as in
            # the original condition -- confirm this is intended.
            records['type'].append(_LOW_DNDS if 0 < dnds <= 0.15 else _HIGH_DNDS)
            records['clade'].append(clade_order[clade])
            records['Kcat_value'].append(kcat_value)

    return records


def _plot_kcat_by_clade(frame, n_clades):
    """Draw the grouped box plot and save it as SuppleFig8b.pdf."""
    plt.figure(figsize=(2.5, 2.0))

    rc('font', **{'family': 'serif', 'serif': ['Helvetica']})
    plt.rcParams['pdf.fonttype'] = 42

    plt.axes([0.12, 0.12, 0.83, 0.83])

    plt.tick_params(direction='in')
    plt.tick_params(which='major', length=1.5)
    plt.tick_params(which='major', width=0.4)

    palette = {_LOW_DNDS: '#b2182b', _HIGH_DNDS: '#2166ac'}
    # Fix the hue order explicitly. It used to follow the order in which the
    # categories first appeared in the data, which depends on set iteration
    # order, so the parity-based outline colours below could end up swapped
    # relative to the fill colours.
    hue_order = [_HIGH_DNDS, _LOW_DNDS]

    ax = sns.boxplot(data=frame, x="clade", y="Kcat_value", hue="type",
                     hue_order=hue_order, palette=palette,
                     showfliers=False, linewidth=0.5)

    ax.set(xlabel=None)

    # NOTE(review): this relies on the boxes being exposed through ax.artists,
    # on boxes alternating between the two hue levels, and on 5 lines per box
    # (no fliers) -- verify against the installed seaborn/matplotlib versions.
    for i, box in enumerate(ax.artists):
        r, g, b, _alpha = box.get_facecolor()
        box.set_facecolor((r, g, b, 0.3))  # translucent fill

        col = palette[hue_order[i % 2]]
        box.set_edgecolor(col)

        for j in range(i * 5, i * 5 + 5):
            line = ax.lines[j]
            line.set_color(col)
            line.set_mfc(col)
            line.set_mec(col)
    handles = [ax.artists[0], ax.artists[1]]

    plt.rcParams['font.family'] = 'Helvetica'

    for i in range(n_clades):
        plt.text(i - 0.3, 2.6, '***', fontweight="normal", fontsize=6)

    # Raw string: '\m' is not a valid escape sequence in a normal literal.
    plt.ylabel(r"$k$$_\mathregular{cat}$ value", fontname='Helvetica', fontsize=7)

    plt.xticks(rotation=30, ha='right')
    plt.ylim(-2, 5)
    plt.yticks([-2, -1, 0, 1, 2, 3, 4, 5])
    plt.xticks(fontsize=7)
    plt.yticks(fontsize=6)

    for side in ('bottom', 'left', 'top', 'right'):
        ax.spines[side].set_linewidth(0.5)

    ax = plt.gca()

    labels = ax.get_legend_handles_labels()[1]

    plt.legend(handles[0:2], labels[0:2], loc=1, frameon=False, prop={'size': 6})

    plt.savefig("../../Results/figures/SuppleFig8b.pdf", dpi=400, bbox_inches='tight')


def main():
    """Plot log10(kcat) per clade, split by dN/dS category, as SuppleFig8b.pdf.

    Loads the sequence-ID index, the ortholog clusters, the dN/dS scores, the
    organism list and the species -> clade table. It then gathers the largest
    predicted kcat of every sequence and draws one pair of boxes per clade.
    """
    SeqIdIndex = getIndex2()
    IndexOrtholog = getOrthologIndex2()
    ortholog_DNDS = getDNDS_all()
    organisms = get_organisms()
    species_clades = species_clade()

    all_clades = ['Outgroup', 'Lipomycetaceae', 'Trigonopsidaceae', 'Dipodascaceae/Trichomonascaceae', 'Alloascoideaceae', 'Sporopachydermia clade',
                  'Pichiaceae', 'CUG-Ala', 'CUG-Ser1', 'CUG-Ser2', 'Phaffomycetaceae', 'Saccharomycodaceae', 'Saccharomycetaceae']

    # 1-based rank of every clade, in the order listed above.
    all_clades_order = {clade: rank for rank, clade in enumerate(all_clades, start=1)}

    alldata = _collect_kcat_records(organisms, species_clades, all_clades_order,
                                    SeqIdIndex, IndexOrtholog, ortholog_DNDS)

    allData = pd.DataFrame(alldata)

    _plot_kcat_by_clade(allData, len(all_clades))
|
|
|
|
|
# Script entry point: executed only when the file is run directly.
# NOTE(review): only species_clade() is invoked here, so main() -- the
# function that builds and saves the figure -- is never called; confirm
# whether that is intended.
if __name__ == "__main__":
    species_clade()
|
|
|
|