Spaces:

jie1
/

succ1

Build error

File size: 15,691 Bytes

2d12bc4

#!/usr/bin/python
# coding: utf-8

# Author: LE YUAN

import os
import csv
import math
import xlrd
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem
from Bio import SeqIO
from collections import defaultdict
from scipy import stats
from scipy.stats import ranksums
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from matplotlib import rc


def getIndex() :
    """Load orthomcl_SeqIDs_index.txt as a dict: text before ": " -> text after it.

    The file comes from the Figshare data repository (10.6084/m9.figshare.5854692;
    https://figshare.com/articles/Tempo_and_mode_of_genome_evolution_in_the_budding_yeast_subphylum/5854692).

    Every non-blank line is split on ": "; the first piece becomes the key and
    the second piece the value.  This is the reverse of getIndex2().
    NOTE(review): judging by how main() uses getIndex2(), lines presumably look
    like "0_0: Alloascoidea_hylecoeti@Seq_1", so this function would map the
    index (0_0) to the sequence id — confirm against the data file.

    Returns:
        dict: first field -> second field, one entry per non-blank line
        (later lines overwrite earlier ones with the same key).

    Raises:
        OSError: if the index file cannot be opened.
        IndexError: if a non-blank line does not contain ": ".
    """
    indexSeqId = dict()
    with open("../../Data/directory/to/orthomcl_SeqIDs_index.txt", "r") as indexFile :
        for line in indexFile :
            entry = line.strip()
            if not entry :
                # A blank line carries no mapping; skipping it avoids an
                # IndexError on the second field below.
                continue
            index_Seq = entry.split(": ")
            indexSeqId[index_Seq[0]] = index_Seq[1]

    return indexSeqId

def getIndex2() :
    """Load orthomcl_SeqIDs_index.txt as a dict: text after ": " -> text before it.

    The file comes from the Figshare data repository (10.6084/m9.figshare.5854692;
    https://figshare.com/articles/Tempo_and_mode_of_genome_evolution_in_the_budding_yeast_subphylum/5854692).

    Every non-blank line is split on ": "; the second piece becomes the key and
    the first piece the value, i.e. the protein sequence id (for example
    Alloascoidea_hylecoeti@Seq_1) is the key and its index (for example 0_0)
    is the value, as stated by the original author's note.

    Returns:
        dict: second field -> first field, one entry per non-blank line
        (later lines overwrite earlier ones with the same key).

    Raises:
        OSError: if the index file cannot be opened.
        IndexError: if a non-blank line does not contain ": ".
    """
    indexSeqId = dict()
    with open("../../Data/directory/to/orthomcl_SeqIDs_index.txt", "r") as indexFile :
        for line in indexFile :
            entry = line.strip()
            if not entry :
                # A blank line carries no mapping; skipping it avoids an
                # IndexError on the second field below.
                continue
            index_Seq = entry.split(": ")
            indexSeqId[index_Seq[1]] = index_Seq[0]

    return indexSeqId

def getOrthologIndex() :
    """Load orthomcl_clusters.txt as a dict: cluster name -> list of member indices.

    The file comes from the Figshare data repository (10.6084/m9.figshare.5854692;
    https://figshare.com/articles/Tempo_and_mode_of_genome_evolution_in_the_budding_yeast_subphylum/5854692).

    Every non-blank line is split on single spaces.  The first token, minus its
    last character (presumably a trailing ':' — confirm against the data file),
    is the cluster name; the remaining tokens are its members, e.g.
    {'OG1001': ['328_2397', '189_1696', '279_256', ...]}.

    Returns:
        dict: cluster name -> list of member index strings.

    Raises:
        OSError: if the cluster file cannot be opened.
    """
    orthologIndex = dict()
    with open("../../Data/directory/to/orthomcl_clusters.txt", "r") as orthologFile :
        for line in orthologFile :
            entry = line.strip()
            if not entry :
                # Without this guard a blank line would register a bogus
                # cluster named '' with an empty member list.
                continue
            ortholog_Index = entry.split(" ")
            orthologIndex[ortholog_Index[0][:-1]] = ortholog_Index[1:]

    return orthologIndex

def getOrthologIndex2() :
    """Load orthomcl_clusters.txt as a dict: member index -> cluster name.

    The file comes from the Figshare data repository (10.6084/m9.figshare.5854692;
    https://figshare.com/articles/Tempo_and_mode_of_genome_evolution_in_the_budding_yeast_subphylum/5854692).

    Each line is split on single spaces; the first token with its last
    character removed is the cluster name, and every remaining token is mapped
    to that name, e.g.
    {'302_3224': 'OG1000', '317_1502': 'OG1000', '318_1938': 'OG1001'}.

    Returns:
        dict: member index string -> cluster name.
    """
    with open("../../Data/directory/to/orthomcl_clusters.txt", "r") as cluster_file :
        cluster_lines = cluster_file.readlines()

    member_to_cluster = {}
    for raw_line in cluster_lines :
        # str.split always yields at least one token, so the unpacking is safe.
        label, *members = raw_line.strip().split(" ")
        cluster_name = label[:-1]
        member_to_cluster.update(dict.fromkeys(members, cluster_name))

    return member_to_cluster

def get_organisms() :
    """Return organism names derived from the .txt files in ../../Data/MLKCATRESULT/.

    For every directory entry ending in '.txt', the part of the file name
    before the first 'ForKcat' is kept (e.g. 'Saccharomyces_uvarum').  The
    number of names found is printed.  The order follows os.listdir().

    Returns:
        list: organism name strings.
    """
    organism_names = []
    for entry in os.listdir('../../Data/MLKCATRESULT/') :
        if not entry.endswith('.txt') :
            continue
        organism_names.append(entry.split('ForKcat')[0])

    print(len(organism_names)) # 343 in the original author's run
    return organism_names

def getDNDS_all() :
    """Load ../../Data/gene_dn_ds_03_02.csv as a dict: ortholog group -> dN/dS score.

    The first line is treated as a header and skipped.  For every other row
    (split on ','), the key is the second column up to its first '.', and the
    value is the third column converted to float.  Rows whose third column is
    empty are ignored, as are blank lines and rows with fewer than three
    columns (which previously raised IndexError).

    Returns:
        dict: ortholog group name -> dN/dS score (float).

    Raises:
        OSError: if the CSV file cannot be opened.
        ValueError: if a non-empty third column is not a valid float.
    """
    dnds_dict = dict()
    with open('../../Data/gene_dn_ds_03_02.csv', 'r') as infile :
        next(infile, None)  # skip the header row (no-op on an empty file)
        for line in infile :
            data = line.strip().split(',')
            if len(data) < 3 or not data[2] :
                continue
            OG_line = data[1].split('.')[0]
            dnds_dict[OG_line] = float(data[2])

    return dnds_dict

def median(lst):
    """Return the median of the values in lst without modifying lst.

    For an odd number of items the middle element of the sorted data is
    returned as-is; for an even number the two middle elements are averaged
    (division by 2.0, so the result is a float for numeric input).

    Raises:
        IndexError: if lst is empty.
    """
    ordered = sorted(lst)
    half, is_odd = divmod(len(ordered), 2)

    if is_odd:
        return ordered[half]
    # Even length: the two central items sit at half - 1 and half.
    return (ordered[half - 1] + ordered[half]) / 2.0

def species_clade() :
    """Load 343_phenotype_clade.tsv as a dict: first column -> second column.

    The header line is skipped and each remaining line is split on tabs.  The
    last three first-column values and the last three second-column values are
    printed before the mapping is returned.  main() looks species up with
    organism.lower(), so the first column presumably holds lower-case species
    names and the second the clade — confirm against the data file.

    Returns:
        dict: first-column value -> second-column value.

    Raises:
        IndexError: if a data line has fewer than two tab-separated fields.
    """
    with open("../../../BayesianApproach/Data/343_phenotype_clade.tsv", 'r') as table :
        body_lines = table.readlines()[1:]

    rows = [raw.strip().split('\t') for raw in body_lines]
    names = [row[0] for row in rows]
    clades = [row[1] for row in rows]

    print(names[-3:])
    print(clades[-3:])

    mapping = dict(zip(names, clades))
    return mapping

def main() :
    """Draw the per-clade kcat box plot split by dN/dS class and save it as a PDF.

    Steps:
      1. Load the lookup tables through getIndex2(), getOrthologIndex2(),
         getDNDS_all(), get_organisms() and species_clade().
      2. For every organism, read
         ../prediction/343species_0115/<organism>_PredictionResults.txt and
         keep, per row and per sequence id, the maximum predicted kcat.
      3. Map each sequence id to its ortholog group and dN/dS score, then
         record log10(kcat) under 'dN/dS <= 0.15' or 'dN/dS > 0.15' together
         with the 1-based position of the clade in all_clades.
      4. Draw a seaborn box plot and write it to
         ../../Results/figures/SuppleFig8b.pdf.

    Prediction file layout as used by the code: tab-separated with one header
    line; column 6 (index 5) holds ';'-separated sequence ids; the last column
    holds ';'-separated entries, each a ','-separated list with one kcat per
    sequence id (same position as in column 6), where '#' marks a missing value.

    Raises:
        OSError: if an input file cannot be opened.
        KeyError: if an organism is missing from the species/clade table, or a
            sequence id / index is missing from the OrthoMCL lookups.
        IndexError: if a prediction row has too few columns or a kcat entry has
            fewer values than there are sequence ids.
        ValueError: if a kcat value other than '#' is not a valid float.
    """
    SeqIdIndex = getIndex2()
    IndexOrtholog = getOrthologIndex2()
    ortholog_DNDS = getDNDS_all()
    organisms = get_organisms()
    species_clades = species_clade()

    all_clades = ['Outgroup', 'Lipomycetaceae', 'Trigonopsidaceae', 'Dipodascaceae/Trichomonascaceae', 'Alloascoideaceae', 'Sporopachydermia clade',
                    'Pichiaceae', 'CUG-Ala', 'CUG-Ser1', 'CUG-Ser2', 'Phaffomycetaceae', 'Saccharomycodaceae', 'Saccharomycetaceae']

    # The x axis uses the 1-based rank of each clade in all_clades instead of its name.
    all_clades_order = {clade: rank for rank, clade in enumerate(all_clades, 1)}

    alldata = dict()
    alldata['type'] = list()
    alldata['clade'] = list()
    alldata['Kcat_value'] = list()

    for clade in all_clades :
        for organism in organisms :
            if species_clades[organism.lower()] != clade :
                continue

            with open('../prediction/343species_0115/%s_PredictionResults.txt' % organism, 'r') as infile :
                lines = infile.readlines()

            # (sequence id, max kcat within one row) pairs for this organism.
            seq_kcat = list()

            for line in lines[1:] :
                # Reset per row: the maximum below is taken within a single row.
                seqIds_values = dict()
                data = line.strip('\n').split('\t')
                seqIds = data[5].split(';')
                values = data[-1].split(';')

                for i, seqId in enumerate(seqIds) :
                    for value in values :
                        if not value :
                            continue
                        Kcats = value.split(',')
                        if Kcats[i] != '#' :
                            # setdefault replaces the former try / bare-except
                            # that only existed to create the list on first use.
                            seqIds_values.setdefault(seqId, []).append(float(Kcats[i]))

                for seqId, value in seqIds_values.items() :
                    seq_kcat.append((seqId, max(value)))

            # Identical (sequence id, kcat) pairs coming from different rows are
            # counted once (author's run: 5876 pairs -> 2992 unique for one organism).
            for seqId, max_value in set(seq_kcat) :
                index = SeqIdIndex[seqId]
                ortholog = IndexOrtholog[index]
                try :
                    dnds = float(ortholog_DNDS[ortholog])
                    kcatValue = math.log10(max_value)
                except (KeyError, ValueError) :
                    # KeyError: no dN/dS score for this ortholog group.
                    # ValueError: log10 of a non-positive kcat.
                    continue

                # NOTE(review): a score <= 0 also lands in the '> 0.15' class,
                # exactly as in the original code — confirm this is intended.
                if 0 < dnds <= 0.15 :
                    dnds_class = 'dN/dS <= 0.15'
                else :
                    dnds_class = 'dN/dS > 0.15'

                alldata['type'].append(dnds_class)
                alldata['clade'].append(all_clades_order[clade])
                alldata['Kcat_value'].append(kcatValue)

    # Kept for the offline rank-sum analysis described below; the plot itself
    # is fed the plain dict.
    allData = pd.DataFrame(alldata)

    # The '***' markers drawn further down are not computed here.  The author
    # ran scipy.stats.ranksums between the two dN/dS classes of every clade
    # offline and recorded these p-values:
    #   Outgroup                          1.6243302130328922e-61
    #   Lipomycetaceae                    7.879651158117646e-67
    #   CUG-Ser1                          0.0
    #   Phaffomycetaceae                  3.6142539325434596e-75
    #   Dipodascaceae/Trichomonascaceae   8.512690063502762e-117
    #   Trigonopsidaceae                  4.157606980523744e-25
    #   Saccharomycodaceae                7.633228849794443e-58
    #   Sporopachydermia clade            1.2098972408782565e-07
    #   Pichiaceae                        1.8480664486765028e-291
    #   CUG-Ser2                          2.801972561349211e-19
    #   CUG-Ala                           3.1431166089138013e-38
    #   Saccharomycetaceae                3.553336840154913e-298
    #   Alloascoideaceae                  0.0002450681317830253

    plt.figure(figsize=(2.5, 2.0))
    # To solve the 'Helvetica' font cannot be used in PDF file
    # https://stackoverflow.com/questions/59845568/the-pdf-backend-does-not-currently-support-the-selected-font
    rc('font',**{'family':'serif','serif':['Helvetica']})
    plt.rcParams['pdf.fonttype'] = 42

    plt.axes([0.12,0.12,0.83,0.83])

    plt.tick_params(direction='in')
    plt.tick_params(which='major',length=1.5)
    plt.tick_params(which='major',width=0.4)

    palette = {"dN/dS <= 0.15": '#b2182b', "dN/dS > 0.15": '#2166ac'}

    ax = sns.boxplot(data=alldata, x="clade", y="Kcat_value", hue="type",
            palette=palette, showfliers=False, linewidth=0.5)

    # https://stackoverflow.com/questions/58476654/how-to-remove-or-hide-x-axis-label-from-seaborn-boxplot
    # This removes the axis label but keeps the ticks.
    ax.set(xlabel=None)

    # NOTE(review): everything below that touches ax.artists assumes seaborn
    # stores the boxes there; with other matplotlib/seaborn versions the list
    # may be empty, in which case ax.artists[0] raises IndexError — confirm
    # against the pinned library versions.
    for patch in ax.artists:
        r, g, b, a = patch.get_facecolor()
        patch.set_facecolor((r, g, b, 0.3))

    # https://cduvallet.github.io/posts/2018/03/boxplots-in-python
    for i, artist in enumerate(ax.artists):
        # NOTE(review): even boxes are outlined blue and odd ones red, which
        # matches the palette only if the '> 0.15' class is drawn first within
        # each clade — presumably determined by the hue order; verify.
        if i % 2 == 0:
            col = '#2166ac'
        else:
            col = '#b2182b'

        # This sets the color for the main box
        artist.set_edgecolor(col)

        # Each box has 5 associated Line2D objects (whiskers, caps, median);
        # give them the same colour as the box outline.
        for j in range(i*5,i*5+5):
            line = ax.lines[j]
            line.set_color(col)
            line.set_mfc(col)
            line.set_mec(col)
    handles = [ax.artists[0], ax.artists[1]]

    plt.rcParams['font.family'] = 'Helvetica'

    # One significance marker per clade (see the recorded p-values above).
    for i in range(len(all_clades)) :
        plt.text(i-0.3, 2.6, '***', fontweight ="normal", fontsize=6)

    # Raw string: "\m" is not a valid escape sequence in a normal literal.
    plt.ylabel(r"$k$$_\mathregular{cat}$ value", fontname='Helvetica', fontsize=7)

    plt.xticks(rotation=30,ha='right')
    plt.ylim(-2,5)
    plt.yticks([-2,-1,0,1,2,3,4,5])
    plt.xticks(fontsize=7)
    plt.yticks(fontsize=6)

    ax.spines['bottom'].set_linewidth(0.5)
    ax.spines['left'].set_linewidth(0.5)
    ax.spines['top'].set_linewidth(0.5)
    ax.spines['right'].set_linewidth(0.5)

    ax = plt.gca()
    labels = ax.get_legend_handles_labels()[1]
    # Show a single legend built from the two recoloured boxes.
    plt.legend(handles[0:2], labels[0:2], loc=1, frameon=False, prop={'size': 6})

    # https://blog.csdn.net/weixin_38314865/article/details/88633880
    plt.savefig("../../Results/figures/SuppleFig8b.pdf", dpi=400, bbox_inches = 'tight')


# Script entry point.
# NOTE(review): only species_clade() runs here (it prints the last three
# species and clades); main() is commented out, so executing this file does
# not produce the figure — confirm whether main() should be re-enabled.
if __name__ == '__main__' :
    species_clade()
    # main()