{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ed0cdaf6-71e1-4ef0-894f-0beabdc392cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "from PIL import Image\n",
    "import webbrowser\n",
    "import json\n",
    "import pickle\n",
    "import sys \n",
    "import joblib\n",
    "import sys\n",
    "\n",
    "from rdkit import Chem\n",
    "from rdkit.Chem import Draw\n",
    "from rdkit.Chem import rdChemReactions as Reactions\n",
    "\n",
    "from compound_cacher import CompoundCacher\n",
    "from compound import Compound\n",
    "from chemaxon import *\n",
    "import chemaxon"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e64deced-2a44-4d8e-ba8f-d9843f11724a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_smiles():\n",
    "    \"\"\"Read the cached compound table and return its pH 7 SMILES column as a dict.\n",
    "\n",
    "    Returns:\n",
    "        dict: values of the 'smiles_pH7' column keyed by the 'compound_id' index.\n",
    "    \"\"\"\n",
    "    compounds = pd.read_csv(\n",
    "        './../data/cache_compounds_20160818.csv',\n",
    "        index_col='compound_id',\n",
    "    )\n",
    "    return compounds['smiles_pH7'].to_dict()\n",
    "\n",
    "def load_molsig_rad1():\n",
    "    \"\"\"Load the molecular-signature table stored in decompose_vector_ac.json.\n",
    "\n",
    "    Returns:\n",
    "        The parsed JSON content of './../data/decompose_vector_ac.json'.\n",
    "    \"\"\"\n",
    "    # The context manager closes the file; a bare open() inside json.load() leaked the handle.\n",
    "    with open('./../data/decompose_vector_ac.json') as signature_file:\n",
    "        return json.load(signature_file)\n",
    "\n",
    "def load_molsig_rad2():\n",
    "    \"\"\"Load the molecular-signature table stored in the '..._r2_py3_indent_modified_manual' JSON file.\n",
    "\n",
    "    Returns:\n",
    "        The parsed JSON content of\n",
    "        './../data/decompose_vector_ac_r2_py3_indent_modified_manual.json'.\n",
    "    \"\"\"\n",
    "    # The context manager closes the file; a bare open() inside json.load() leaked the handle.\n",
    "    with open('./../data/decompose_vector_ac_r2_py3_indent_modified_manual.json') as signature_file:\n",
    "        return json.load(signature_file)\n",
    "\n",
    "def load_model():\n",
    "    \"\"\"Load the serialized model from './../model/M12_model_BR.pkl' with joblib.\n",
    "\n",
    "    Returns:\n",
    "        The object stored in the file, as returned by joblib.load().\n",
    "    \"\"\"\n",
    "    filename = './../model/M12_model_BR.pkl'\n",
    "    # NOTE: joblib/pickle files can run arbitrary code while loading; only load a trusted model file.\n",
    "    # The context manager closes the handle that the original open() call left dangling.\n",
    "    with open(filename, 'rb') as model_file:\n",
    "        return joblib.load(model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "71615c14-49c3-45e7-9495-194ef22fb1ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "db_smiles = load_smiles()\n",
    "molsig_r1 = load_molsig_rad1()\n",
    "molsig_r2 = load_molsig_rad2()\n",
    "loaded_model = load_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b86b8049-cbf2-473f-8715-5e5f908193a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_reaction_formula_side(s):\n",
    "    \"\"\"\n",
    "        Parses one side of a reaction formula, e.g. '2 C00001 + C00002 + 3 C00003'.\n",
    "\n",
    "        Each member is an optional leading amount followed by a compound id.\n",
    "        A member without an amount counts as 1; amounts of a repeated id are summed.\n",
    "\n",
    "        Args:\n",
    "            s: the text of one side of the formula, or 'null' for an empty side.\n",
    "\n",
    "        Returns:\n",
    "            A dict mapping compound id -> amount (empty for 'null' or blank input).\n",
    "\n",
    "        Raises:\n",
    "            ValueError: if a leading amount cannot be converted by float().\n",
    "    \"\"\"\n",
    "    if s.strip() == \"null\":\n",
    "        return {}\n",
    "\n",
    "    compound_bag = {}\n",
    "    # Raw string so the regex escapes are not read as (invalid) string escapes.\n",
    "    for member in re.split(r'\\s+\\+\\s+', s):\n",
    "        tokens = member.split(None, 1)\n",
    "        if len(tokens) == 0:\n",
    "            continue\n",
    "        if len(tokens) == 1:\n",
    "            amount = 1\n",
    "            # Take the stripped token, not `member`, so padding never becomes part of the key.\n",
    "            key = tokens[0]\n",
    "        else:\n",
    "            amount = float(tokens[0])\n",
    "            # split(None, 1) keeps trailing whitespace on the remainder; drop it.\n",
    "            key = tokens[1].strip()\n",
    "\n",
    "        compound_bag[key] = compound_bag.get(key, 0) + amount\n",
    "\n",
    "    return compound_bag\n",
    "\n",
    "def parse_formula(formula, arrow='<=>', rid=None):\n",
    "    \"\"\"\n",
    "        Parses a two-sided formula such as: 2 C00001 <=> C00002 + C00003\n",
    "\n",
    "        Args:\n",
    "            formula: the reaction string; must contain `arrow` exactly once.\n",
    "            arrow: the separator between the left and the right side.\n",
    "            rid: not used by this function; kept so existing callers keep working.\n",
    "\n",
    "        Return:\n",
    "            A dict mapping compound id -> net coefficient, where left-side\n",
    "            amounts are subtracted and right-side amounts are added.\n",
    "\n",
    "        Raises:\n",
    "            ValueError: if the formula does not contain exactly one arrow.\n",
    "    \"\"\"\n",
    "    tokens = formula.split(arrow)\n",
    "    # Fail loudly: printing and carrying on ended in an unrelated IndexError (no arrow)\n",
    "    # or silently dropped everything after the second arrow.\n",
    "    if len(tokens) < 2:\n",
    "        raise ValueError('Reaction does not contain the arrow sign (%s): %s'\n",
    "                         % (arrow, formula))\n",
    "    if len(tokens) > 2:\n",
    "        raise ValueError('Reaction contains more than one arrow sign (%s): %s'\n",
    "                         % (arrow, formula))\n",
    "\n",
    "    left, right = (side.strip() for side in tokens)\n",
    "\n",
    "    sparse_reaction = {}\n",
    "    for sign, side in ((-1, left), (1, right)):\n",
    "        for cid, count in parse_reaction_formula_side(side).items():\n",
    "            sparse_reaction[cid] = sparse_reaction.get(cid, 0) + sign * count\n",
    "\n",
    "    return sparse_reaction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7342b178-3472-4734-83e3-3de431abe15e",
   "metadata": {},
   "outputs": [],
   "source": [
    "rxn_string = \"C00222 + C00010 + C00006 <=> C00024 + C00011 + C00005\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7b4dfe4f-48a8-4011-b201-7fb3a3268cef",
   "metadata": {},
   "outputs": [],
   "source": [
    "rxn_dic = parse_formula(rxn_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1f523aa2-b9dc-4153-8c1c-dec58e1ab987",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ddG0(rxn_dict, pH, I, novel_mets, T=298.15):\n",
    "    \"\"\"Sum coeff * compound.transform_pH7(pH, I, T) over every compound of a reaction.\n",
    "\n",
    "    Args:\n",
    "        rxn_dict: compound id -> stoichiometric coefficient.\n",
    "        pH, I, T: forwarded unchanged to each compound's transform_pH7().\n",
    "            T defaults to 298.15, the value that used to be hard-coded here.\n",
    "        novel_mets: optional mapping compound id -> compound object; ids found\n",
    "            here are used instead of the CompoundCacher entry. May be None.\n",
    "\n",
    "    Returns:\n",
    "        The accumulated sum (0 for an empty rxn_dict).\n",
    "    \"\"\"\n",
    "    ccache = None\n",
    "    ddG0_forward = 0\n",
    "    for compound_id, coeff in rxn_dict.items():\n",
    "        if novel_mets is not None and compound_id in novel_mets:\n",
    "            comp = novel_mets[compound_id]\n",
    "        else:\n",
    "            # Create the cache only when a compound actually has to be looked up.\n",
    "            if ccache is None:\n",
    "                ccache = CompoundCacher()\n",
    "            comp = ccache.get_compound(compound_id)\n",
    "        ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
    "\n",
    "    return ddG0_forward"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "33cf30ff-8b2c-4da9-9134-75a60a5c5d66",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-3.6254822995515497"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_ddG0(rxn_dic, 7.0,  0.1, {})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9e39855d-eb9e-4ea9-aeb9-8b770cc24c8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _with_novel_signatures(molsig, novel_decomposed):\n",
    "    \"\"\"Return molsig extended by novel_decomposed without modifying either input.\"\"\"\n",
    "    if novel_decomposed is None:\n",
    "        return molsig\n",
    "    merged = dict(molsig)\n",
    "    for cid in novel_decomposed:\n",
    "        merged[cid] = novel_decomposed[cid]\n",
    "    return merged\n",
    "\n",
    "\n",
    "def _read_group_names(path):\n",
    "    \"\"\"Read one group (moiety) name per line from path and close the file.\"\"\"\n",
    "    with open(path) as names_file:\n",
    "        return names_file.read().splitlines()\n",
    "\n",
    "\n",
    "def _moiety_change(rxn_dict, molsig, group_names):\n",
    "    \"\"\"Return a one-column ('change') frame: sum of stoic * signature, per group name.\"\"\"\n",
    "    # Fill after reindexing so group names absent from every signature count as 0, not NaN.\n",
    "    signature_df = pd.DataFrame.from_dict(molsig).reindex(group_names).fillna(0)\n",
    "    rule_df = pd.DataFrame(index=signature_df.index)\n",
    "    rule_df['change'] = 0\n",
    "    for met, stoic in rxn_dict.items():\n",
    "        if met in ('C00080', 'C00282'):\n",
    "            continue  # these two ids contribute nothing to the change vector\n",
    "        rule_df['change'] += signature_df[met] * stoic\n",
    "    return rule_df\n",
    "\n",
    "\n",
    "def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2):\n",
    "    \"\"\"Build the model input row for one reaction from two signature tables.\n",
    "\n",
    "    For each table the per-group change is the sum of stoic * signature[met]\n",
    "    over the reaction, with C00080 and C00282 skipped. Each change vector is\n",
    "    followed by 44 zero columns, and the two padded vectors are concatenated.\n",
    "\n",
    "    Args:\n",
    "        rxn_dict: compound id -> stoichiometric coefficient.\n",
    "        molsig1, molsig2: compound id -> {group name: count}. Not modified.\n",
    "        novel_decomposed1, novel_decomposed2: extra signatures that override or\n",
    "            extend molsig1 / molsig2 for this call only, or None.\n",
    "\n",
    "    Returns:\n",
    "        (rule_comb, rule_df1, rule_df2): the concatenated array with a single row,\n",
    "        and the two 'change' frames indexed by the group names read from disk.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if a compound of rxn_dict (other than the two skipped ids)\n",
    "            has no signature in the corresponding table.\n",
    "    \"\"\"\n",
    "    # NOTE(review): 44 zero columns are appended after each block; presumably placeholders\n",
    "    # for features the trained model expects - confirm against the training code.\n",
    "    n_padding = 44\n",
    "\n",
    "    # NOTE(review): the load_* functions in this notebook read from './../data', these\n",
    "    # paths use './data' - confirm which directory is intended.\n",
    "    moie_r1 = _read_group_names('./data/group_names_r1.txt')\n",
    "    moie_r2 = _read_group_names('./data/group_names_r2_py3_modified_manual.txt')\n",
    "\n",
    "    # Work on merged copies: writing the novel entries into molsig1/molsig2 changed the\n",
    "    # caller's dictionaries and leaked compounds from one call into the next.\n",
    "    rule_df1 = _moiety_change(\n",
    "        rxn_dict, _with_novel_signatures(molsig1, novel_decomposed1), moie_r1)\n",
    "    rule_df2 = _moiety_change(\n",
    "        rxn_dict, _with_novel_signatures(molsig2, novel_decomposed2), moie_r2)\n",
    "\n",
    "    blocks = []\n",
    "    for rule_df in (rule_df1, rule_df2):\n",
    "        rule_vec = rule_df.to_numpy().T\n",
    "        blocks.append(rule_vec)\n",
    "        blocks.append(np.zeros((rule_vec.shape[0], n_padding)))\n",
    "    rule_comb = np.concatenate(blocks, 1)\n",
    "\n",
    "    return rule_comb, rule_df1, rule_df2\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a93ea75e-9851-45fd-aa58-d7f325b4b5a6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'C00222': -1,\n",
       " 'C00010': -1,\n",
       " 'C00006': -1,\n",
       " 'C00024': 1,\n",
       " 'C00011': 1,\n",
       " 'C00005': 1}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rxn_dic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "981948dd-db2c-4463-b983-1220353d963e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "96eb1c38-2ca7-4e38-bcc4-ade1cef73852",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([-19.96775194]), array([6.66052556]))"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the feature row here: `X` was never assigned anywhere in this notebook, so the\n",
    "# cell only worked with a value left over in the kernel.\n",
    "# NOTE(review): assumes the prediction is meant for `rxn_dic` with no novel compounds - confirm.\n",
    "X, rule_df1, rule_df2 = get_rule(rxn_dic, molsig_r1, molsig_r2, None, None)\n",
    "loaded_model.predict(X, return_std=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81128dd3-5005-40a6-b5fe-8ecacef824bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get_ddG0 is defined once, in the earlier cell that introduces it. The verbatim copy that\n",
    "# used to sit here was removed so the two definitions cannot drift apart or shadow each other.\n",
    "\n",
    "\n",
    "def get_dG0(rxn_dict,rid,pH,I,loaded_model,molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2,novel_mets):\n",
    "    \"\"\"Predict a reaction's value with the model and add the get_ddG0() term.\n",
    "\n",
    "    Args:\n",
    "        rxn_dict: compound id -> stoichiometric coefficient.\n",
    "        rid: not used by this function; kept so existing callers keep working.\n",
    "        pH, I: forwarded to get_ddG0().\n",
    "        loaded_model: object whose predict(X, return_std=True) returns (mean, std) arrays.\n",
    "        molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2: forwarded to get_rule().\n",
    "        novel_mets: forwarded to get_ddG0().\n",
    "\n",
    "    Returns:\n",
    "        (value, std, rule_df1, rule_df2): the first predicted mean plus the get_ddG0()\n",
    "        result, the first predicted std, and the two frames returned by get_rule().\n",
    "    \"\"\"\n",
    "    features, rule_df1, rule_df2 = get_rule(\n",
    "        rxn_dict, molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2)\n",
    "    ymean, ystd = loaded_model.predict(features, return_std=True)\n",
    "    corrected = ymean[0] + get_ddG0(rxn_dict, pH, I, novel_mets)\n",
    "    return corrected, ystd[0], rule_df1, rule_df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "751ec201-f062-4ac0-8d24-fe959636cbdc",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6cb1e4d-24be-42a1-b88b-793a62597c92",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7abe24be-1653-455b-9931-9446480d39bb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f13433dc-51a3-41e5-8a0b-b0f21724ef98",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "db7c764f-d216-44a9-8f88-0e3a7c51377a",
   "metadata": {},
   "outputs": [],
   "source": [
    "ccc=  CompoundCacher()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "09e6f7f2-5be7-4db3-b55d-756ecb711095",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = ccc.get_compound('C00001')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d28e44b7-d942-4739-9d7d-2f4e082ac1b9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "81.4472134155519"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a.transform_pH7(7, 0.25 , 298)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1ef3fc0d-7d63-42ea-8743-522fe010a95d",
   "metadata": {},
   "outputs": [],
   "source": [
    "inchi_k = \"InChI=1S/C14H14O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-8,11,15H,9-10H2\" ;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4e651d1c-2c96-42d1-adab-466dc7518146",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\vuu10\\AppData\\Local\\Continuum\\anaconda3\\envs\\dGPredictor_py3\\lib\\openbabel\\__init__.py:14: UserWarning: \"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"\n",
      "  warnings.warn('\"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"')\n"
     ]
    }
   ],
   "source": [
    "c = Compound.from_inchi('Test', 'sajdf', inchi_k )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "6eb5c2dc-f14c-46de-889b-0e9b7faa9f79",
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'Compound' object has no attribute 'smiles_ph7'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-18-7a0d06664090>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msmiles_ph7\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m: 'Compound' object has no attribute 'smiles_ph7'"
     ]
    }
   ],
   "source": [
    "# NOTE(review): `c.smiles_ph7()` raised AttributeError. The attribute is presumably spelled\n",
    "# `smiles_pH7` (like the 'smiles_pH7' CSV column) and read without a call - confirm in compound.py.\n",
    "c.smiles_pH7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "edd156dc-4355-4c2c-ba4e-6d98e776a96a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from chemaxon import *\n",
    "import chemaxon"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "880d2ef6-6b03-49d3-8f60-66769c22a84d",
   "metadata": {},
   "outputs": [],
   "source": [
    "pKas, major_ms_smiles = chemaxon.GetDissociationConstants(inchi_k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7a2391dc-313c-47f2-9f54-823bfdb95fcd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "major_ms_smiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "96d90c4a-14a2-45fb-8573-97db84de2dff",
   "metadata": {},
   "outputs": [],
   "source": [
    "major_ms_smiles = Compound.smiles2smiles(major_ms_smiles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "36d46620-b895-4ec8-85d0-7499759812c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "MIN_PH = 0.0\n",
    "MAX_PH = 14.0\n",
    "pKas = sorted([pka for pka in pKas if pka > MIN_PH and pka < MAX_PH], reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ffccf9d9-5a52-4be6-af4c-f39b3db2a27c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[10.1]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pKas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e83721fa-9a42-42ef-9a03-59fc2689c73b",
   "metadata": {},
   "outputs": [],
   "source": [
    "atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(major_ms_smiles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47a87ed7-968d-44b6-a237-a8469ba3fe3b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49cfefde-ee96-4ca8-89af-c50f2f2ca70b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b881c7b-a14a-4561-9c3c-157116efdfd0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10c8f915-e61a-4560-b546-fe6ea8bfdde3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "936fafa5-1bf6-495c-be79-d4cc620f4861",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "285f9370-2fba-44c4-a36b-66c95f9f2eed",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adbcd78f-869a-4cc9-b727-03c80df31edd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17fbfee9-c8b7-4644-814f-0e8aa0ad5ee9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "70f90669-ff90-4bc4-955c-63672e42bb3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use inchi_k directly: `molstring` is only assigned (as `inchi_k`) in a later cell, so a\n",
    "# top-to-bottom run failed here with a NameError.\n",
    "formula, formal_charge = GetFormulaAndCharge(inchi_k)\n",
    "\n",
    "atom_bag = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "e40e4088-c246-4afb-98ae-f92cb738e988",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count atoms per element. `formula` may hold several '.'-separated parts, each with an\n",
    "# optional leading multiplier that applies to every atom of that part.\n",
    "# Raw strings keep the regex escapes from being read as (invalid) string escapes.\n",
    "for mol_formula_times in formula.split('.'):\n",
    "    for times, mol_formula in re.findall(r'^(\\d+)?(\\w+)', mol_formula_times):\n",
    "        times = int(times) if times else 1\n",
    "        for atom, count in re.findall(r'([A-Z][a-z]*)([0-9]*)', mol_formula):\n",
    "            count = int(count) if count else 1\n",
    "            atom_bag[atom] = atom_bag.get(atom, 0) + count * times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "391cfbba-2da5-4b60-ba32-217754913b35",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'C': 14, 'H': 14, 'O': 1}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "atom_bag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "812f8297-a5cc-4d63-b132-243c278c6b76",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6\n",
      "1\n",
      "8\n"
     ]
    }
   ],
   "source": [
    "from rdkit.Chem import rdchem\n",
    "\n",
    "# Fetch the periodic table once instead of on every iteration; `ll` is reused by a later cell.\n",
    "ll = rdchem.GetPeriodicTable()\n",
    "# Iterate over the keys only: unpacking into `(elem, c)` rebound the global `c`, which\n",
    "# holds the Compound created in an earlier cell.\n",
    "for elem in atom_bag:\n",
    "    print(ll.GetAtomicNumber(elem))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "463fcb01-2cd0-4aee-990c-946c534dc766",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Atom count times atomic number, summed over every entry of the atom bag.\n",
    "n_protons = sum(\n",
    "    n_atoms * ll.GetAtomicNumber(str(symbol))\n",
    "    for symbol, n_atoms in atom_bag.items()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "ac1c69f6-54db-41ba-9fdf-e7ab6a2dfcbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "atom_bag['e-'] = n_protons - formal_charge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "61b1931e-dbaf-4e0f-afb2-6595f64d70d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'C': 14, 'H': 14, 'O': 1, 'e-': 106}"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "atom_bag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "12bdbf80-7dc5-4d47-a479-703ad5a6aa06",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "formal_charge\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b51f36c0-707a-4856-8c23-9081e2ea2cf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_pKas, smiles_list = GetDissociationConstants_val(inchi_k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6dd79761-760d-4233-b113-a34e6322a0e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "MID_PH = 7.0\n",
    "N_PKAS = 20\n",
    "\n",
    "n_acidic = N_PKAS\n",
    "n_basic = N_PKAS\n",
    "pH = MID_PH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "6167191a-b361-4ae0-a78a-927490c72f87",
   "metadata": {},
   "outputs": [],
   "source": [
    "args = []\n",
    "if n_acidic + n_basic > 0:\n",
    "    args += ['pka', '-a', str(n_acidic), '-b', str(n_basic),\n",
    "             'majorms', '-M', 'true', '--pH', str(pH)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "dd4275ec-c71e-4b5b-bb35-de8b3c7c4883",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['pka', '-a', '20', '-b', '20', 'majorms', '-M', 'true', '--pH', '7.0']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "args"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79d07dc5-963a-4373-9d72-1eb6de48ede9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "712a71fb-e3e3-4b01-828d-5a3862aa1b30",
   "metadata": {},
   "outputs": [],
   "source": [
    "logging.debug(\"INPUT: echo %s | %s\" % (inchi_k, ' '.join([CXCALC_BIN] + args)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "287bf822-23b8-42de-85ca-e52678875cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "molstring= inchi_k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "4d2ff427-237c-4d63-a718-f29f12884d96",
   "metadata": {},
   "outputs": [],
   "source": [
    "p1 = Popen([\"echo\", molstring], stdout=PIPE, shell=use_shell_for_echo)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "923a09f2-b959-4837-ab1a-a858d91de0b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,\n",
    "                   executable=CXCALC_BIN, stdout=PIPE, shell=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a6b30545-c65a-4c56-9985-71a103b9da00",
   "metadata": {},
   "outputs": [],
   "source": [
    "res = p2.communicate()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ac059602-027f-4a1a-932f-c1339c38c7d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "if p2.returncode != 0:\n",
    "    raise ChemAxonError(str(args))\n",
    "logging.debug(\"OUTPUT: %s\" % res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "671642a5-3877-44e3-b935-f987fd601444",
   "metadata": {},
   "outputs": [],
   "source": [
    "output = res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "a9f4bb4a-af86-4e97-bf1d-40c58013f90e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "b'id\\tapKa1\\tapKa2\\tapKa3\\tapKa4\\tapKa5\\tapKa6\\tapKa7\\tapKa8\\tapKa9\\tapKa10\\tapKa11\\tapKa12\\tapKa13\\tapKa14\\tapKa15\\tapKa16\\tapKa17\\tapKa18\\tapKa19\\tapKa20\\tbpKa1\\tbpKa2\\tbpKa3\\tbpKa4\\tbpKa5\\tbpKa6\\tbpKa7\\tbpKa8\\tbpKa9\\tbpKa10\\tbpKa11\\tbpKa12\\tbpKa13\\tbpKa14\\tbpKa15\\tbpKa16\\tbpKa17\\tbpKa18\\tbpKa19\\tbpKa20\\tatoms\\tmajor-ms\\r\\n1\\t10.10\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t-5.48\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t15,15\\tOC1=CC=CC(CCC2=CC=CC=C2)=C1\\r\\n'"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "215ffc9b-35a8-4f45-8f39-9c99deae6335",
   "metadata": {},
   "outputs": [],
   "source": [
    "atom2pKa, smiles_list = ParsePkaOutput(output, n_acidic, n_basic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "21c380d3-5410-4c55-b6d7-cb0588f373ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r']"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "smiles_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "1437693a-0923-4df1-837d-acb2b524fcae",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_pKas = []\n",
    "for pKa_list in list(atom2pKa.values()):\n",
    "    all_pKas += [pKa for pKa, _ in pKa_list]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "8e77324c-ed61-4615-a7c7-4f5ca781dc90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[10.1, -5.48]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_pKas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8616be46-1814-4755-b919-4b7790569890",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}