Spaces:
Runtime error
Runtime error
Upload 6 files
Browse files- CC/Untitled.ipynb +1038 -0
- CC/chemaxon.py +204 -0
- CC/compound.py +337 -0
- CC/compound_cacher.py +202 -0
- CC/molecule.py +292 -0
- CC/thermodynamic_constants.py +36 -0
CC/Untitled.ipynb
ADDED
|
@@ -0,0 +1,1038 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "ed0cdaf6-71e1-4ef0-894f-0beabdc392cf",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import pandas as pd\n",
|
| 11 |
+
"import numpy as np\n",
|
| 12 |
+
"import re\n",
|
| 13 |
+
"from PIL import Image\n",
|
| 14 |
+
"import webbrowser\n",
|
| 15 |
+
"import json\n",
|
| 16 |
+
"import pickle\n",
|
| 17 |
+
"import sys \n",
|
| 18 |
+
"import joblib\n",
|
| 19 |
+
"import sys\n",
|
| 20 |
+
"\n",
|
| 21 |
+
"from rdkit import Chem\n",
|
| 22 |
+
"from rdkit.Chem import Draw\n",
|
| 23 |
+
"from rdkit.Chem import rdChemReactions as Reactions\n",
|
| 24 |
+
"\n",
|
| 25 |
+
"from compound_cacher import CompoundCacher\n",
|
| 26 |
+
"from compound import Compound\n",
|
| 27 |
+
"from chemaxon import *\n",
|
| 28 |
+
"import chemaxon"
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"cell_type": "code",
|
| 33 |
+
"execution_count": 2,
|
| 34 |
+
"id": "e64deced-2a44-4d8e-ba8f-d9843f11724a",
|
| 35 |
+
"metadata": {},
|
| 36 |
+
"outputs": [],
|
| 37 |
+
"source": [
|
| 38 |
+
"def load_smiles():\n",
|
| 39 |
+
" db = pd.read_csv('./../data/cache_compounds_20160818.csv',index_col='compound_id')\n",
|
| 40 |
+
" db_smiles = db['smiles_pH7'].to_dict()\n",
|
| 41 |
+
" return db_smiles\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"def load_molsig_rad1():\n",
|
| 44 |
+
" molecular_signature_r1 = json.load(open('./../data/decompose_vector_ac.json'))\n",
|
| 45 |
+
" return molecular_signature_r1\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"def load_molsig_rad2():\n",
|
| 48 |
+
" molecular_signature_r2 = json.load(open('./../data/decompose_vector_ac_r2_py3_indent_modified_manual.json'))\n",
|
| 49 |
+
" return molecular_signature_r2\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"def load_model():\n",
|
| 52 |
+
" filename = './../model/M12_model_BR.pkl'\n",
|
| 53 |
+
" loaded_model = joblib.load(open(filename, 'rb'))\n",
|
| 54 |
+
" return loaded_model"
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"execution_count": 3,
|
| 60 |
+
"id": "71615c14-49c3-45e7-9495-194ef22fb1ee",
|
| 61 |
+
"metadata": {},
|
| 62 |
+
"outputs": [],
|
| 63 |
+
"source": [
|
| 64 |
+
"db_smiles = load_smiles()\n",
|
| 65 |
+
"molsig_r1 = load_molsig_rad1()\n",
|
| 66 |
+
"molsig_r2 = load_molsig_rad2()\n",
|
| 67 |
+
"loaded_model = load_model()"
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"cell_type": "code",
|
| 72 |
+
"execution_count": 4,
|
| 73 |
+
"id": "b86b8049-cbf2-473f-8715-5e5f908193a2",
|
| 74 |
+
"metadata": {},
|
| 75 |
+
"outputs": [],
|
| 76 |
+
"source": [
|
| 77 |
+
"def parse_reaction_formula_side(s):\n",
|
| 78 |
+
" \"\"\"\n",
|
| 79 |
+
" Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'\n",
|
| 80 |
+
" Ignores stoichiometry.\n",
|
| 81 |
+
"\n",
|
| 82 |
+
" Returns:\n",
|
| 83 |
+
" The set of CIDs.\n",
|
| 84 |
+
" \"\"\"\n",
|
| 85 |
+
" if s.strip() == \"null\":\n",
|
| 86 |
+
" return {}\n",
|
| 87 |
+
"\n",
|
| 88 |
+
" compound_bag = {}\n",
|
| 89 |
+
" for member in re.split('\\s+\\+\\s+', s):\n",
|
| 90 |
+
" tokens = member.split(None, 1)\n",
|
| 91 |
+
" if len(tokens) == 0:\n",
|
| 92 |
+
" continue\n",
|
| 93 |
+
" if len(tokens) == 1:\n",
|
| 94 |
+
" amount = 1\n",
|
| 95 |
+
" key = member\n",
|
| 96 |
+
" else:\n",
|
| 97 |
+
" amount = float(tokens[0])\n",
|
| 98 |
+
" key = tokens[1]\n",
|
| 99 |
+
"\n",
|
| 100 |
+
" compound_bag[key] = compound_bag.get(key, 0) + amount\n",
|
| 101 |
+
"\n",
|
| 102 |
+
" return compound_bag\n",
|
| 103 |
+
"\n",
|
| 104 |
+
"def parse_formula(formula, arrow='<=>', rid=None):\n",
|
| 105 |
+
" \"\"\"\n",
|
| 106 |
+
" Parses a two-sided formula such as: 2 C00001 => C00002 + C00003\n",
|
| 107 |
+
"\n",
|
| 108 |
+
" Return:\n",
|
| 109 |
+
" The set of substrates, products and the direction of the reaction\n",
|
| 110 |
+
" \"\"\"\n",
|
| 111 |
+
" tokens = formula.split(arrow)\n",
|
| 112 |
+
" if len(tokens) < 2:\n",
|
| 113 |
+
" print(('Reaction does not contain the arrow sign (%s): %s'\n",
|
| 114 |
+
" % (arrow, formula)))\n",
|
| 115 |
+
" if len(tokens) > 2:\n",
|
| 116 |
+
" print(('Reaction contains more than one arrow sign (%s): %s'\n",
|
| 117 |
+
" % (arrow, formula)))\n",
|
| 118 |
+
"\n",
|
| 119 |
+
" left = tokens[0].strip()\n",
|
| 120 |
+
" right = tokens[1].strip()\n",
|
| 121 |
+
"\n",
|
| 122 |
+
" sparse_reaction = {}\n",
|
| 123 |
+
" for cid, count in parse_reaction_formula_side(left).items():\n",
|
| 124 |
+
" sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count\n",
|
| 125 |
+
"\n",
|
| 126 |
+
" for cid, count in parse_reaction_formula_side(right).items():\n",
|
| 127 |
+
" sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count \n",
|
| 128 |
+
" \n",
|
| 129 |
+
" return sparse_reaction"
|
| 130 |
+
]
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"cell_type": "code",
|
| 134 |
+
"execution_count": 5,
|
| 135 |
+
"id": "7342b178-3472-4734-83e3-3de431abe15e",
|
| 136 |
+
"metadata": {},
|
| 137 |
+
"outputs": [],
|
| 138 |
+
"source": [
|
| 139 |
+
"rxn_string = \"C00222 + C00010 + C00006 <=> C00024 + C00011 + C00005\""
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"cell_type": "code",
|
| 144 |
+
"execution_count": 6,
|
| 145 |
+
"id": "7b4dfe4f-48a8-4011-b201-7fb3a3268cef",
|
| 146 |
+
"metadata": {},
|
| 147 |
+
"outputs": [],
|
| 148 |
+
"source": [
|
| 149 |
+
"rxn_dic = parse_formula(rxn_string)"
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"cell_type": "code",
|
| 154 |
+
"execution_count": 7,
|
| 155 |
+
"id": "1f523aa2-b9dc-4153-8c1c-dec58e1ab987",
|
| 156 |
+
"metadata": {},
|
| 157 |
+
"outputs": [],
|
| 158 |
+
"source": [
|
| 159 |
+
"def get_ddG0(rxn_dict,pH,I,novel_mets):\n",
|
| 160 |
+
" ccache = CompoundCacher()\n",
|
| 161 |
+
" # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
|
| 162 |
+
" T = 298.15\n",
|
| 163 |
+
" ddG0_forward = 0\n",
|
| 164 |
+
" for compound_id, coeff in rxn_dict.items():\n",
|
| 165 |
+
" if novel_mets != None and compound_id in novel_mets:\n",
|
| 166 |
+
" comp = novel_mets[compound_id]\n",
|
| 167 |
+
" else:\n",
|
| 168 |
+
" comp = ccache.get_compound(compound_id)\n",
|
| 169 |
+
" ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
|
| 170 |
+
"\n",
|
| 171 |
+
" return ddG0_forward"
|
| 172 |
+
]
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"cell_type": "code",
|
| 176 |
+
"execution_count": 8,
|
| 177 |
+
"id": "33cf30ff-8b2c-4da9-9134-75a60a5c5d66",
|
| 178 |
+
"metadata": {},
|
| 179 |
+
"outputs": [
|
| 180 |
+
{
|
| 181 |
+
"data": {
|
| 182 |
+
"text/plain": [
|
| 183 |
+
"-3.6254822995515497"
|
| 184 |
+
]
|
| 185 |
+
},
|
| 186 |
+
"execution_count": 8,
|
| 187 |
+
"metadata": {},
|
| 188 |
+
"output_type": "execute_result"
|
| 189 |
+
}
|
| 190 |
+
],
|
| 191 |
+
"source": [
|
| 192 |
+
"get_ddG0(rxn_dic, 7.0, 0.1, {})"
|
| 193 |
+
]
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"cell_type": "code",
|
| 197 |
+
"execution_count": 9,
|
| 198 |
+
"id": "9e39855d-eb9e-4ea9-aeb9-8b770cc24c8e",
|
| 199 |
+
"metadata": {},
|
| 200 |
+
"outputs": [],
|
| 201 |
+
"source": [
|
| 202 |
+
"def get_rule(rxn_dict, molsig1, molsig2, novel_decomposed1, novel_decomposed2):\n",
|
| 203 |
+
" if novel_decomposed1 != None:\n",
|
| 204 |
+
" for cid in novel_decomposed1:\n",
|
| 205 |
+
" molsig1[cid] = novel_decomposed1[cid]\n",
|
| 206 |
+
" if novel_decomposed2 != None:\n",
|
| 207 |
+
" for cid in novel_decomposed2:\n",
|
| 208 |
+
" molsig2[cid] = novel_decomposed2[cid]\n",
|
| 209 |
+
"\n",
|
| 210 |
+
" molsigna_df1 = pd.DataFrame.from_dict(molsig1).fillna(0)\n",
|
| 211 |
+
" all_mets1 = molsigna_df1.columns.tolist()\n",
|
| 212 |
+
" all_mets1.append(\"C00080\")\n",
|
| 213 |
+
" all_mets1.append(\"C00282\")\n",
|
| 214 |
+
"\n",
|
| 215 |
+
" molsigna_df2 = pd.DataFrame.from_dict(molsig2).fillna(0)\n",
|
| 216 |
+
" all_mets2 = molsigna_df2.columns.tolist()\n",
|
| 217 |
+
" all_mets2.append(\"C00080\")\n",
|
| 218 |
+
" all_mets2.append(\"C00282\")\n",
|
| 219 |
+
"\n",
|
| 220 |
+
" moieties_r1 = open('./data/group_names_r1.txt')\n",
|
| 221 |
+
" moieties_r2 = open('./data/group_names_r2_py3_modified_manual.txt')\n",
|
| 222 |
+
" moie_r1 = moieties_r1.read().splitlines()\n",
|
| 223 |
+
" moie_r2 = moieties_r2.read().splitlines()\n",
|
| 224 |
+
"\n",
|
| 225 |
+
" molsigna_df1 = molsigna_df1.reindex(moie_r1)\n",
|
| 226 |
+
" molsigna_df2 = molsigna_df2.reindex(moie_r2)\n",
|
| 227 |
+
"\n",
|
| 228 |
+
" rule_df1 = pd.DataFrame(index=molsigna_df1.index)\n",
|
| 229 |
+
" rule_df2 = pd.DataFrame(index=molsigna_df2.index)\n",
|
| 230 |
+
" # for rid, value in reaction_dict.items():\n",
|
| 231 |
+
" # # skip the reactions with missing metabolites\n",
|
| 232 |
+
" # mets = value.keys()\n",
|
| 233 |
+
" # flag = False\n",
|
| 234 |
+
" # for met in mets:\n",
|
| 235 |
+
" # if met not in all_mets:\n",
|
| 236 |
+
" # flag = True\n",
|
| 237 |
+
" # break\n",
|
| 238 |
+
" # if flag: continue\n",
|
| 239 |
+
"\n",
|
| 240 |
+
" rule_df1['change'] = 0\n",
|
| 241 |
+
" for met, stoic in rxn_dict.items():\n",
|
| 242 |
+
" if met == \"C00080\" or met == \"C00282\":\n",
|
| 243 |
+
" continue # hydogen is zero\n",
|
| 244 |
+
" rule_df1['change'] += molsigna_df1[met] * stoic\n",
|
| 245 |
+
"\n",
|
| 246 |
+
" rule_df2['change'] = 0\n",
|
| 247 |
+
" for met, stoic in rxn_dict.items():\n",
|
| 248 |
+
" if met == \"C00080\" or met == \"C00282\":\n",
|
| 249 |
+
" continue # hydogen is zero\n",
|
| 250 |
+
" rule_df2['change'] += molsigna_df2[met] * stoic\n",
|
| 251 |
+
"\n",
|
| 252 |
+
" rule_vec1 = rule_df1.to_numpy().T\n",
|
| 253 |
+
" rule_vec2 = rule_df2.to_numpy().T\n",
|
| 254 |
+
"\n",
|
| 255 |
+
" m1, n1 = rule_vec1.shape\n",
|
| 256 |
+
" m2, n2 = rule_vec2.shape\n",
|
| 257 |
+
"\n",
|
| 258 |
+
" zeros1 = np.zeros((m1, 44))\n",
|
| 259 |
+
" zeros2 = np.zeros((m2, 44))\n",
|
| 260 |
+
" X1 = np.concatenate((rule_vec1, zeros1), 1)\n",
|
| 261 |
+
" X2 = np.concatenate((rule_vec2, zeros2), 1)\n",
|
| 262 |
+
"\n",
|
| 263 |
+
" rule_comb = np.concatenate((X1, X2), 1)\n",
|
| 264 |
+
"\n",
|
| 265 |
+
" # rule_df_final = {}\n",
|
| 266 |
+
" # rule_df_final['rad1'] = rule_df1\n",
|
| 267 |
+
" # rule_df_final['rad2'] = rule_df2\n",
|
| 268 |
+
" return rule_comb, rule_df1, rule_df2\n"
|
| 269 |
+
]
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"cell_type": "code",
|
| 273 |
+
"execution_count": 14,
|
| 274 |
+
"id": "a93ea75e-9851-45fd-aa58-d7f325b4b5a6",
|
| 275 |
+
"metadata": {},
|
| 276 |
+
"outputs": [
|
| 277 |
+
{
|
| 278 |
+
"data": {
|
| 279 |
+
"text/plain": [
|
| 280 |
+
"{'C00222': -1,\n",
|
| 281 |
+
" 'C00010': -1,\n",
|
| 282 |
+
" 'C00006': -1,\n",
|
| 283 |
+
" 'C00024': 1,\n",
|
| 284 |
+
" 'C00011': 1,\n",
|
| 285 |
+
" 'C00005': 1}"
|
| 286 |
+
]
|
| 287 |
+
},
|
| 288 |
+
"execution_count": 14,
|
| 289 |
+
"metadata": {},
|
| 290 |
+
"output_type": "execute_result"
|
| 291 |
+
}
|
| 292 |
+
],
|
| 293 |
+
"source": [
|
| 294 |
+
"rxn_dic"
|
| 295 |
+
]
|
| 296 |
+
},
|
| 297 |
+
{
|
| 298 |
+
"cell_type": "code",
|
| 299 |
+
"execution_count": null,
|
| 300 |
+
"id": "981948dd-db2c-4463-b983-1220353d963e",
|
| 301 |
+
"metadata": {},
|
| 302 |
+
"outputs": [],
|
| 303 |
+
"source": []
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"cell_type": "code",
|
| 307 |
+
"execution_count": 23,
|
| 308 |
+
"id": "96eb1c38-2ca7-4e38-bcc4-ade1cef73852",
|
| 309 |
+
"metadata": {},
|
| 310 |
+
"outputs": [
|
| 311 |
+
{
|
| 312 |
+
"data": {
|
| 313 |
+
"text/plain": [
|
| 314 |
+
"(array([-19.96775194]), array([6.66052556]))"
|
| 315 |
+
]
|
| 316 |
+
},
|
| 317 |
+
"execution_count": 23,
|
| 318 |
+
"metadata": {},
|
| 319 |
+
"output_type": "execute_result"
|
| 320 |
+
}
|
| 321 |
+
],
|
| 322 |
+
"source": [
|
| 323 |
+
"loaded_model.predict(X, return_std= True)"
|
| 324 |
+
]
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"cell_type": "code",
|
| 328 |
+
"execution_count": null,
|
| 329 |
+
"id": "81128dd3-5005-40a6-b5fe-8ecacef824bc",
|
| 330 |
+
"metadata": {},
|
| 331 |
+
"outputs": [],
|
| 332 |
+
"source": [
|
| 333 |
+
"def get_ddG0(rxn_dict,pH,I,novel_mets):\n",
|
| 334 |
+
" ccache = CompoundCacher()\n",
|
| 335 |
+
" # ddG0 = get_transform_ddG0(rxn_dict, ccache, pH, I, T)\n",
|
| 336 |
+
" T = 298.15\n",
|
| 337 |
+
" ddG0_forward = 0\n",
|
| 338 |
+
" for compound_id, coeff in rxn_dict.items():\n",
|
| 339 |
+
" if novel_mets != None and compound_id in novel_mets:\n",
|
| 340 |
+
" comp = novel_mets[compound_id]\n",
|
| 341 |
+
" else:\n",
|
| 342 |
+
" comp = ccache.get_compound(compound_id)\n",
|
| 343 |
+
" ddG0_forward += coeff * comp.transform_pH7(pH, I, T)\n",
|
| 344 |
+
"\n",
|
| 345 |
+
" return ddG0_forward\n",
|
| 346 |
+
"\n",
|
| 347 |
+
"\n",
|
| 348 |
+
"def get_dG0(rxn_dict,rid,pH,I,loaded_model,molsig_r1, molsig_r2, novel_decomposed_r1, novel_decomposed_r2,novel_mets):\n",
|
| 349 |
+
" rule_comb, rule_df1, rule_df2 = get_rule(rxn_dict,molsig_r1,molsig_r2, novel_decomposed_r1, novel_decomposed_r2)\n",
|
| 350 |
+
" X = rule_comb\n",
|
| 351 |
+
" ymean, ystd = loaded_model.predict(X, return_std=True)\n",
|
| 352 |
+
" result = {}\n",
|
| 353 |
+
" return ymean[0] + get_ddG0(rxn_dict, pH, I, novel_mets),ystd[0], rule_df1, rule_df2"
|
| 354 |
+
]
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"cell_type": "code",
|
| 358 |
+
"execution_count": null,
|
| 359 |
+
"id": "751ec201-f062-4ac0-8d24-fe959636cbdc",
|
| 360 |
+
"metadata": {},
|
| 361 |
+
"outputs": [],
|
| 362 |
+
"source": []
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"cell_type": "code",
|
| 366 |
+
"execution_count": null,
|
| 367 |
+
"id": "c6cb1e4d-24be-42a1-b88b-793a62597c92",
|
| 368 |
+
"metadata": {},
|
| 369 |
+
"outputs": [],
|
| 370 |
+
"source": []
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"cell_type": "code",
|
| 374 |
+
"execution_count": null,
|
| 375 |
+
"id": "7abe24be-1653-455b-9931-9446480d39bb",
|
| 376 |
+
"metadata": {},
|
| 377 |
+
"outputs": [],
|
| 378 |
+
"source": []
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"cell_type": "code",
|
| 382 |
+
"execution_count": null,
|
| 383 |
+
"id": "f13433dc-51a3-41e5-8a0b-b0f21724ef98",
|
| 384 |
+
"metadata": {},
|
| 385 |
+
"outputs": [],
|
| 386 |
+
"source": []
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"cell_type": "code",
|
| 390 |
+
"execution_count": 2,
|
| 391 |
+
"id": "db7c764f-d216-44a9-8f88-0e3a7c51377a",
|
| 392 |
+
"metadata": {},
|
| 393 |
+
"outputs": [],
|
| 394 |
+
"source": [
|
| 395 |
+
"ccc= CompoundCacher()"
|
| 396 |
+
]
|
| 397 |
+
},
|
| 398 |
+
{
|
| 399 |
+
"cell_type": "code",
|
| 400 |
+
"execution_count": 3,
|
| 401 |
+
"id": "09e6f7f2-5be7-4db3-b55d-756ecb711095",
|
| 402 |
+
"metadata": {},
|
| 403 |
+
"outputs": [],
|
| 404 |
+
"source": [
|
| 405 |
+
"a = ccc.get_compound('C00001')"
|
| 406 |
+
]
|
| 407 |
+
},
|
| 408 |
+
{
|
| 409 |
+
"cell_type": "code",
|
| 410 |
+
"execution_count": 4,
|
| 411 |
+
"id": "d28e44b7-d942-4739-9d7d-2f4e082ac1b9",
|
| 412 |
+
"metadata": {},
|
| 413 |
+
"outputs": [
|
| 414 |
+
{
|
| 415 |
+
"data": {
|
| 416 |
+
"text/plain": [
|
| 417 |
+
"81.4472134155519"
|
| 418 |
+
]
|
| 419 |
+
},
|
| 420 |
+
"execution_count": 4,
|
| 421 |
+
"metadata": {},
|
| 422 |
+
"output_type": "execute_result"
|
| 423 |
+
}
|
| 424 |
+
],
|
| 425 |
+
"source": [
|
| 426 |
+
"a.transform_pH7(7, 0.25 , 298)"
|
| 427 |
+
]
|
| 428 |
+
},
|
| 429 |
+
{
|
| 430 |
+
"cell_type": "code",
|
| 431 |
+
"execution_count": 5,
|
| 432 |
+
"id": "1ef3fc0d-7d63-42ea-8743-522fe010a95d",
|
| 433 |
+
"metadata": {},
|
| 434 |
+
"outputs": [],
|
| 435 |
+
"source": [
|
| 436 |
+
"inchi_k = \"InChI=1S/C14H14O/c15-14-8-4-7-13(11-14)10-9-12-5-2-1-3-6-12/h1-8,11,15H,9-10H2\" ;"
|
| 437 |
+
]
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"cell_type": "code",
|
| 441 |
+
"execution_count": 6,
|
| 442 |
+
"id": "4e651d1c-2c96-42d1-adab-466dc7518146",
|
| 443 |
+
"metadata": {},
|
| 444 |
+
"outputs": [
|
| 445 |
+
{
|
| 446 |
+
"name": "stderr",
|
| 447 |
+
"output_type": "stream",
|
| 448 |
+
"text": [
|
| 449 |
+
"C:\\Users\\vuu10\\AppData\\Local\\Continuum\\anaconda3\\envs\\dGPredictor_py3\\lib\\openbabel\\__init__.py:14: UserWarning: \"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"\n",
|
| 450 |
+
" warnings.warn('\"import openbabel\" is deprecated, instead use \"from openbabel import openbabel\"')\n"
|
| 451 |
+
]
|
| 452 |
+
}
|
| 453 |
+
],
|
| 454 |
+
"source": [
|
| 455 |
+
"c = Compound.from_inchi('Test', 'sajdf', inchi_k )"
|
| 456 |
+
]
|
| 457 |
+
},
|
| 458 |
+
{
|
| 459 |
+
"cell_type": "code",
|
| 460 |
+
"execution_count": 18,
|
| 461 |
+
"id": "6eb5c2dc-f14c-46de-889b-0e9b7faa9f79",
|
| 462 |
+
"metadata": {},
|
| 463 |
+
"outputs": [
|
| 464 |
+
{
|
| 465 |
+
"ename": "AttributeError",
|
| 466 |
+
"evalue": "'Compound' object has no attribute 'smiles_ph7'",
|
| 467 |
+
"output_type": "error",
|
| 468 |
+
"traceback": [
|
| 469 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 470 |
+
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
| 471 |
+
"\u001b[1;32m<ipython-input-18-7a0d06664090>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msmiles_ph7\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
| 472 |
+
"\u001b[1;31mAttributeError\u001b[0m: 'Compound' object has no attribute 'smiles_ph7'"
|
| 473 |
+
]
|
| 474 |
+
}
|
| 475 |
+
],
|
| 476 |
+
"source": [
|
| 477 |
+
"c.smiles_ph7()"
|
| 478 |
+
]
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"cell_type": "code",
|
| 482 |
+
"execution_count": 7,
|
| 483 |
+
"id": "edd156dc-4355-4c2c-ba4e-6d98e776a96a",
|
| 484 |
+
"metadata": {},
|
| 485 |
+
"outputs": [],
|
| 486 |
+
"source": [
|
| 487 |
+
"from chemaxon import *\n",
|
| 488 |
+
"import chemaxon"
|
| 489 |
+
]
|
| 490 |
+
},
|
| 491 |
+
{
|
| 492 |
+
"cell_type": "code",
|
| 493 |
+
"execution_count": 8,
|
| 494 |
+
"id": "880d2ef6-6b03-49d3-8f60-66769c22a84d",
|
| 495 |
+
"metadata": {},
|
| 496 |
+
"outputs": [],
|
| 497 |
+
"source": [
|
| 498 |
+
"pKas, major_ms_smiles = chemaxon.GetDissociationConstants(inchi_k)"
|
| 499 |
+
]
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"cell_type": "code",
|
| 503 |
+
"execution_count": 9,
|
| 504 |
+
"id": "7a2391dc-313c-47f2-9f54-823bfdb95fcd",
|
| 505 |
+
"metadata": {},
|
| 506 |
+
"outputs": [
|
| 507 |
+
{
|
| 508 |
+
"data": {
|
| 509 |
+
"text/plain": [
|
| 510 |
+
"'OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r'"
|
| 511 |
+
]
|
| 512 |
+
},
|
| 513 |
+
"execution_count": 9,
|
| 514 |
+
"metadata": {},
|
| 515 |
+
"output_type": "execute_result"
|
| 516 |
+
}
|
| 517 |
+
],
|
| 518 |
+
"source": [
|
| 519 |
+
"major_ms_smiles"
|
| 520 |
+
]
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"cell_type": "code",
|
| 524 |
+
"execution_count": 10,
|
| 525 |
+
"id": "96d90c4a-14a2-45fb-8573-97db84de2dff",
|
| 526 |
+
"metadata": {},
|
| 527 |
+
"outputs": [],
|
| 528 |
+
"source": [
|
| 529 |
+
"major_ms_smiles = Compound.smiles2smiles(major_ms_smiles)"
|
| 530 |
+
]
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"cell_type": "code",
|
| 534 |
+
"execution_count": 11,
|
| 535 |
+
"id": "36d46620-b895-4ec8-85d0-7499759812c6",
|
| 536 |
+
"metadata": {},
|
| 537 |
+
"outputs": [],
|
| 538 |
+
"source": [
|
| 539 |
+
"MIN_PH = 0.0\n",
|
| 540 |
+
"MAX_PH = 14.0\n",
|
| 541 |
+
"pKas = sorted([pka for pka in pKas if pka > MIN_PH and pka < MAX_PH], reverse=True)"
|
| 542 |
+
]
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"cell_type": "code",
|
| 546 |
+
"execution_count": 12,
|
| 547 |
+
"id": "ffccf9d9-5a52-4be6-af4c-f39b3db2a27c",
|
| 548 |
+
"metadata": {},
|
| 549 |
+
"outputs": [
|
| 550 |
+
{
|
| 551 |
+
"data": {
|
| 552 |
+
"text/plain": [
|
| 553 |
+
"[10.1]"
|
| 554 |
+
]
|
| 555 |
+
},
|
| 556 |
+
"execution_count": 12,
|
| 557 |
+
"metadata": {},
|
| 558 |
+
"output_type": "execute_result"
|
| 559 |
+
}
|
| 560 |
+
],
|
| 561 |
+
"source": [
|
| 562 |
+
"pKas"
|
| 563 |
+
]
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"cell_type": "code",
|
| 567 |
+
"execution_count": 13,
|
| 568 |
+
"id": "e83721fa-9a42-42ef-9a03-59fc2689c73b",
|
| 569 |
+
"metadata": {},
|
| 570 |
+
"outputs": [],
|
| 571 |
+
"source": [
|
| 572 |
+
"atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(major_ms_smiles)"
|
| 573 |
+
]
|
| 574 |
+
},
|
| 575 |
+
{
|
| 576 |
+
"cell_type": "code",
|
| 577 |
+
"execution_count": null,
|
| 578 |
+
"id": "47a87ed7-968d-44b6-a237-a8469ba3fe3b",
|
| 579 |
+
"metadata": {},
|
| 580 |
+
"outputs": [],
|
| 581 |
+
"source": []
|
| 582 |
+
},
|
| 583 |
+
{
|
| 584 |
+
"cell_type": "code",
|
| 585 |
+
"execution_count": null,
|
| 586 |
+
"id": "49cfefde-ee96-4ca8-89af-c50f2f2ca70b",
|
| 587 |
+
"metadata": {},
|
| 588 |
+
"outputs": [],
|
| 589 |
+
"source": []
|
| 590 |
+
},
|
| 591 |
+
{
|
| 592 |
+
"cell_type": "code",
|
| 593 |
+
"execution_count": null,
|
| 594 |
+
"id": "9b881c7b-a14a-4561-9c3c-157116efdfd0",
|
| 595 |
+
"metadata": {},
|
| 596 |
+
"outputs": [],
|
| 597 |
+
"source": []
|
| 598 |
+
},
|
| 599 |
+
{
|
| 600 |
+
"cell_type": "code",
|
| 601 |
+
"execution_count": null,
|
| 602 |
+
"id": "10c8f915-e61a-4560-b546-fe6ea8bfdde3",
|
| 603 |
+
"metadata": {},
|
| 604 |
+
"outputs": [],
|
| 605 |
+
"source": []
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"cell_type": "code",
|
| 609 |
+
"execution_count": null,
|
| 610 |
+
"id": "936fafa5-1bf6-495c-be79-d4cc620f4861",
|
| 611 |
+
"metadata": {},
|
| 612 |
+
"outputs": [],
|
| 613 |
+
"source": []
|
| 614 |
+
},
|
| 615 |
+
{
|
| 616 |
+
"cell_type": "code",
|
| 617 |
+
"execution_count": null,
|
| 618 |
+
"id": "285f9370-2fba-44c4-a36b-66c95f9f2eed",
|
| 619 |
+
"metadata": {},
|
| 620 |
+
"outputs": [],
|
| 621 |
+
"source": []
|
| 622 |
+
},
|
| 623 |
+
{
|
| 624 |
+
"cell_type": "code",
|
| 625 |
+
"execution_count": null,
|
| 626 |
+
"id": "adbcd78f-869a-4cc9-b727-03c80df31edd",
|
| 627 |
+
"metadata": {},
|
| 628 |
+
"outputs": [],
|
| 629 |
+
"source": []
|
| 630 |
+
},
|
| 631 |
+
{
|
| 632 |
+
"cell_type": "code",
|
| 633 |
+
"execution_count": null,
|
| 634 |
+
"id": "17fbfee9-c8b7-4644-814f-0e8aa0ad5ee9",
|
| 635 |
+
"metadata": {},
|
| 636 |
+
"outputs": [],
|
| 637 |
+
"source": []
|
| 638 |
+
},
|
| 639 |
+
{
|
| 640 |
+
"cell_type": "code",
|
| 641 |
+
"execution_count": 21,
|
| 642 |
+
"id": "70f90669-ff90-4bc4-955c-63672e42bb3c",
|
| 643 |
+
"metadata": {},
|
| 644 |
+
"outputs": [],
|
| 645 |
+
"source": [
|
| 646 |
+
"formula, formal_charge = GetFormulaAndCharge(molstring)\n",
|
| 647 |
+
"\n",
|
| 648 |
+
"atom_bag = {}"
|
| 649 |
+
]
|
| 650 |
+
},
|
| 651 |
+
{
|
| 652 |
+
"cell_type": "code",
|
| 653 |
+
"execution_count": 25,
|
| 654 |
+
"id": "e40e4088-c246-4afb-98ae-f92cb738e988",
|
| 655 |
+
"metadata": {},
|
| 656 |
+
"outputs": [],
|
| 657 |
+
"source": [
|
| 658 |
+
"for mol_formula_times in formula.split('.'):\n",
|
| 659 |
+
" for times, mol_formula in re.findall('^(\\d+)?(\\w+)', mol_formula_times):\n",
|
| 660 |
+
" if not times:\n",
|
| 661 |
+
" times = 1\n",
|
| 662 |
+
" else:\n",
|
| 663 |
+
" times = int(times)\n",
|
| 664 |
+
" for atom, count in re.findall(\"([A-Z][a-z]*)([0-9]*)\", mol_formula):\n",
|
| 665 |
+
" if count == '':\n",
|
| 666 |
+
" count = 1\n",
|
| 667 |
+
" else:\n",
|
| 668 |
+
" count = int(count)\n",
|
| 669 |
+
" atom_bag[atom] = atom_bag.get(atom, 0) + count * times"
|
| 670 |
+
]
|
| 671 |
+
},
|
| 672 |
+
{
|
| 673 |
+
"cell_type": "code",
|
| 674 |
+
"execution_count": 26,
|
| 675 |
+
"id": "391cfbba-2da5-4b60-ba32-217754913b35",
|
| 676 |
+
"metadata": {},
|
| 677 |
+
"outputs": [
|
| 678 |
+
{
|
| 679 |
+
"data": {
|
| 680 |
+
"text/plain": [
|
| 681 |
+
"{'C': 14, 'H': 14, 'O': 1}"
|
| 682 |
+
]
|
| 683 |
+
},
|
| 684 |
+
"execution_count": 26,
|
| 685 |
+
"metadata": {},
|
| 686 |
+
"output_type": "execute_result"
|
| 687 |
+
}
|
| 688 |
+
],
|
| 689 |
+
"source": [
|
| 690 |
+
"atom_bag"
|
| 691 |
+
]
|
| 692 |
+
},
|
| 693 |
+
{
|
| 694 |
+
"cell_type": "code",
|
| 695 |
+
"execution_count": 52,
|
| 696 |
+
"id": "812f8297-a5cc-4d63-b132-243c278c6b76",
|
| 697 |
+
"metadata": {},
|
| 698 |
+
"outputs": [
|
| 699 |
+
{
|
| 700 |
+
"name": "stdout",
|
| 701 |
+
"output_type": "stream",
|
| 702 |
+
"text": [
|
| 703 |
+
"6\n",
|
| 704 |
+
"1\n",
|
| 705 |
+
"8\n"
|
| 706 |
+
]
|
| 707 |
+
}
|
| 708 |
+
],
|
| 709 |
+
"source": [
|
| 710 |
+
"from rdkit.Chem import rdchem\n",
|
| 711 |
+
"for (elem, c) in atom_bag.items():\n",
|
| 712 |
+
" ll = rdchem.GetPeriodicTable()\n",
|
| 713 |
+
" atomic_num = ll.GetAtomicNumber(elem)\n",
|
| 714 |
+
" print(atomic_num)"
|
| 715 |
+
]
|
| 716 |
+
},
|
| 717 |
+
{
|
| 718 |
+
"cell_type": "code",
|
| 719 |
+
"execution_count": 55,
|
| 720 |
+
"id": "463fcb01-2cd0-4aee-990c-946c534dc766",
|
| 721 |
+
"metadata": {},
|
| 722 |
+
"outputs": [],
|
| 723 |
+
"source": [
|
| 724 |
+
"\n",
|
| 725 |
+
"n_protons = sum([c * ll.GetAtomicNumber(str(elem))\n",
|
| 726 |
+
" for (elem, c) in atom_bag.items()])"
|
| 727 |
+
]
|
| 728 |
+
},
|
| 729 |
+
{
|
| 730 |
+
"cell_type": "code",
|
| 731 |
+
"execution_count": 57,
|
| 732 |
+
"id": "ac1c69f6-54db-41ba-9fdf-e7ab6a2dfcbc",
|
| 733 |
+
"metadata": {},
|
| 734 |
+
"outputs": [],
|
| 735 |
+
"source": [
|
| 736 |
+
"atom_bag['e-'] = n_protons - formal_charge"
|
| 737 |
+
]
|
| 738 |
+
},
|
| 739 |
+
{
|
| 740 |
+
"cell_type": "code",
|
| 741 |
+
"execution_count": 58,
|
| 742 |
+
"id": "61b1931e-dbaf-4e0f-afb2-6595f64d70d6",
|
| 743 |
+
"metadata": {},
|
| 744 |
+
"outputs": [
|
| 745 |
+
{
|
| 746 |
+
"data": {
|
| 747 |
+
"text/plain": [
|
| 748 |
+
"{'C': 14, 'H': 14, 'O': 1, 'e-': 106}"
|
| 749 |
+
]
|
| 750 |
+
},
|
| 751 |
+
"execution_count": 58,
|
| 752 |
+
"metadata": {},
|
| 753 |
+
"output_type": "execute_result"
|
| 754 |
+
}
|
| 755 |
+
],
|
| 756 |
+
"source": [
|
| 757 |
+
"atom_bag"
|
| 758 |
+
]
|
| 759 |
+
},
|
| 760 |
+
{
|
| 761 |
+
"cell_type": "code",
|
| 762 |
+
"execution_count": 60,
|
| 763 |
+
"id": "12bdbf80-7dc5-4d47-a479-703ad5a6aa06",
|
| 764 |
+
"metadata": {},
|
| 765 |
+
"outputs": [
|
| 766 |
+
{
|
| 767 |
+
"data": {
|
| 768 |
+
"text/plain": [
|
| 769 |
+
"0"
|
| 770 |
+
]
|
| 771 |
+
},
|
| 772 |
+
"execution_count": 60,
|
| 773 |
+
"metadata": {},
|
| 774 |
+
"output_type": "execute_result"
|
| 775 |
+
}
|
| 776 |
+
],
|
| 777 |
+
"source": [
|
| 778 |
+
"\n",
|
| 779 |
+
"formal_charge\n",
|
| 780 |
+
"\n"
|
| 781 |
+
]
|
| 782 |
+
},
|
| 783 |
+
{
|
| 784 |
+
"cell_type": "code",
|
| 785 |
+
"execution_count": null,
|
| 786 |
+
"id": "b51f36c0-707a-4856-8c23-9081e2ea2cf7",
|
| 787 |
+
"metadata": {},
|
| 788 |
+
"outputs": [],
|
| 789 |
+
"source": [
|
| 790 |
+
"all_pKas, smiles_list = GetDissociationConstants_val(inchi_k)"
|
| 791 |
+
]
|
| 792 |
+
},
|
| 793 |
+
{
|
| 794 |
+
"cell_type": "code",
|
| 795 |
+
"execution_count": 13,
|
| 796 |
+
"id": "6dd79761-760d-4233-b113-a34e6322a0e5",
|
| 797 |
+
"metadata": {},
|
| 798 |
+
"outputs": [],
|
| 799 |
+
"source": [
|
| 800 |
+
"MID_PH = 7.0\n",
|
| 801 |
+
"N_PKAS = 20\n",
|
| 802 |
+
"\n",
|
| 803 |
+
"n_acidic = N_PKAS\n",
|
| 804 |
+
"n_basic = N_PKAS\n",
|
| 805 |
+
"pH = MID_PH"
|
| 806 |
+
]
|
| 807 |
+
},
|
| 808 |
+
{
|
| 809 |
+
"cell_type": "code",
|
| 810 |
+
"execution_count": 14,
|
| 811 |
+
"id": "6167191a-b361-4ae0-a78a-927490c72f87",
|
| 812 |
+
"metadata": {},
|
| 813 |
+
"outputs": [],
|
| 814 |
+
"source": [
|
| 815 |
+
"args = []\n",
|
| 816 |
+
"if n_acidic + n_basic > 0:\n",
|
| 817 |
+
" args += ['pka', '-a', str(n_acidic), '-b', str(n_basic),\n",
|
| 818 |
+
" 'majorms', '-M', 'true', '--pH', str(pH)]\n"
|
| 819 |
+
]
|
| 820 |
+
},
|
| 821 |
+
{
|
| 822 |
+
"cell_type": "code",
|
| 823 |
+
"execution_count": 15,
|
| 824 |
+
"id": "dd4275ec-c71e-4b5b-bb35-de8b3c7c4883",
|
| 825 |
+
"metadata": {},
|
| 826 |
+
"outputs": [
|
| 827 |
+
{
|
| 828 |
+
"data": {
|
| 829 |
+
"text/plain": [
|
| 830 |
+
"['pka', '-a', '20', '-b', '20', 'majorms', '-M', 'true', '--pH', '7.0']"
|
| 831 |
+
]
|
| 832 |
+
},
|
| 833 |
+
"execution_count": 15,
|
| 834 |
+
"metadata": {},
|
| 835 |
+
"output_type": "execute_result"
|
| 836 |
+
}
|
| 837 |
+
],
|
| 838 |
+
"source": [
|
| 839 |
+
"args"
|
| 840 |
+
]
|
| 841 |
+
},
|
| 842 |
+
{
|
| 843 |
+
"cell_type": "code",
|
| 844 |
+
"execution_count": null,
|
| 845 |
+
"id": "79d07dc5-963a-4373-9d72-1eb6de48ede9",
|
| 846 |
+
"metadata": {},
|
| 847 |
+
"outputs": [],
|
| 848 |
+
"source": []
|
| 849 |
+
},
|
| 850 |
+
{
|
| 851 |
+
"cell_type": "code",
|
| 852 |
+
"execution_count": 16,
|
| 853 |
+
"id": "712a71fb-e3e3-4b01-828d-5a3862aa1b30",
|
| 854 |
+
"metadata": {},
|
| 855 |
+
"outputs": [],
|
| 856 |
+
"source": [
|
| 857 |
+
"logging.debug(\"INPUT: echo %s | %s\" % (inchi_k, ' '.join([CXCALC_BIN] + args)))"
|
| 858 |
+
]
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"cell_type": "code",
|
| 862 |
+
"execution_count": 17,
|
| 863 |
+
"id": "287bf822-23b8-42de-85ca-e52678875cfa",
|
| 864 |
+
"metadata": {},
|
| 865 |
+
"outputs": [],
|
| 866 |
+
"source": [
|
| 867 |
+
"molstring= inchi_k"
|
| 868 |
+
]
|
| 869 |
+
},
|
| 870 |
+
{
|
| 871 |
+
"cell_type": "code",
|
| 872 |
+
"execution_count": 18,
|
| 873 |
+
"id": "4d2ff427-237c-4d63-a718-f29f12884d96",
|
| 874 |
+
"metadata": {},
|
| 875 |
+
"outputs": [],
|
| 876 |
+
"source": [
|
| 877 |
+
"p1 = Popen([\"echo\", molstring], stdout=PIPE, shell=use_shell_for_echo)"
|
| 878 |
+
]
|
| 879 |
+
},
|
| 880 |
+
{
|
| 881 |
+
"cell_type": "code",
|
| 882 |
+
"execution_count": 19,
|
| 883 |
+
"id": "923a09f2-b959-4837-ab1a-a858d91de0b4",
|
| 884 |
+
"metadata": {},
|
| 885 |
+
"outputs": [],
|
| 886 |
+
"source": [
|
| 887 |
+
"p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,\n",
|
| 888 |
+
" executable=CXCALC_BIN, stdout=PIPE, shell=False)"
|
| 889 |
+
]
|
| 890 |
+
},
|
| 891 |
+
{
|
| 892 |
+
"cell_type": "code",
|
| 893 |
+
"execution_count": 20,
|
| 894 |
+
"id": "a6b30545-c65a-4c56-9985-71a103b9da00",
|
| 895 |
+
"metadata": {},
|
| 896 |
+
"outputs": [],
|
| 897 |
+
"source": [
|
| 898 |
+
"res = p2.communicate()[0]"
|
| 899 |
+
]
|
| 900 |
+
},
|
| 901 |
+
{
|
| 902 |
+
"cell_type": "code",
|
| 903 |
+
"execution_count": 21,
|
| 904 |
+
"id": "ac059602-027f-4a1a-932f-c1339c38c7d7",
|
| 905 |
+
"metadata": {},
|
| 906 |
+
"outputs": [],
|
| 907 |
+
"source": [
|
| 908 |
+
"if p2.returncode != 0:\n",
|
| 909 |
+
" raise ChemAxonError(str(args))\n",
|
| 910 |
+
"logging.debug(\"OUTPUT: %s\" % res)"
|
| 911 |
+
]
|
| 912 |
+
},
|
| 913 |
+
{
|
| 914 |
+
"cell_type": "code",
|
| 915 |
+
"execution_count": 22,
|
| 916 |
+
"id": "671642a5-3877-44e3-b935-f987fd601444",
|
| 917 |
+
"metadata": {},
|
| 918 |
+
"outputs": [],
|
| 919 |
+
"source": [
|
| 920 |
+
"output = res"
|
| 921 |
+
]
|
| 922 |
+
},
|
| 923 |
+
{
|
| 924 |
+
"cell_type": "code",
|
| 925 |
+
"execution_count": 23,
|
| 926 |
+
"id": "a9f4bb4a-af86-4e97-bf1d-40c58013f90e",
|
| 927 |
+
"metadata": {},
|
| 928 |
+
"outputs": [
|
| 929 |
+
{
|
| 930 |
+
"data": {
|
| 931 |
+
"text/plain": [
|
| 932 |
+
"b'id\\tapKa1\\tapKa2\\tapKa3\\tapKa4\\tapKa5\\tapKa6\\tapKa7\\tapKa8\\tapKa9\\tapKa10\\tapKa11\\tapKa12\\tapKa13\\tapKa14\\tapKa15\\tapKa16\\tapKa17\\tapKa18\\tapKa19\\tapKa20\\tbpKa1\\tbpKa2\\tbpKa3\\tbpKa4\\tbpKa5\\tbpKa6\\tbpKa7\\tbpKa8\\tbpKa9\\tbpKa10\\tbpKa11\\tbpKa12\\tbpKa13\\tbpKa14\\tbpKa15\\tbpKa16\\tbpKa17\\tbpKa18\\tbpKa19\\tbpKa20\\tatoms\\tmajor-ms\\r\\n1\\t10.10\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t-5.48\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t15,15\\tOC1=CC=CC(CCC2=CC=CC=C2)=C1\\r\\n'"
|
| 933 |
+
]
|
| 934 |
+
},
|
| 935 |
+
"execution_count": 23,
|
| 936 |
+
"metadata": {},
|
| 937 |
+
"output_type": "execute_result"
|
| 938 |
+
}
|
| 939 |
+
],
|
| 940 |
+
"source": [
|
| 941 |
+
"output"
|
| 942 |
+
]
|
| 943 |
+
},
|
| 944 |
+
{
|
| 945 |
+
"cell_type": "code",
|
| 946 |
+
"execution_count": 24,
|
| 947 |
+
"id": "215ffc9b-35a8-4f45-8f39-9c99deae6335",
|
| 948 |
+
"metadata": {},
|
| 949 |
+
"outputs": [],
|
| 950 |
+
"source": [
|
| 951 |
+
"atom2pKa, smiles_list = ParsePkaOutput(output, n_acidic, n_basic)"
|
| 952 |
+
]
|
| 953 |
+
},
|
| 954 |
+
{
|
| 955 |
+
"cell_type": "code",
|
| 956 |
+
"execution_count": 26,
|
| 957 |
+
"id": "21c380d3-5410-4c55-b6d7-cb0588f373ca",
|
| 958 |
+
"metadata": {},
|
| 959 |
+
"outputs": [
|
| 960 |
+
{
|
| 961 |
+
"data": {
|
| 962 |
+
"text/plain": [
|
| 963 |
+
"['OC1=CC=CC(CCC2=CC=CC=C2)=C1\\r']"
|
| 964 |
+
]
|
| 965 |
+
},
|
| 966 |
+
"execution_count": 26,
|
| 967 |
+
"metadata": {},
|
| 968 |
+
"output_type": "execute_result"
|
| 969 |
+
}
|
| 970 |
+
],
|
| 971 |
+
"source": [
|
| 972 |
+
"smiles_list"
|
| 973 |
+
]
|
| 974 |
+
},
|
| 975 |
+
{
|
| 976 |
+
"cell_type": "code",
|
| 977 |
+
"execution_count": 27,
|
| 978 |
+
"id": "1437693a-0923-4df1-837d-acb2b524fcae",
|
| 979 |
+
"metadata": {},
|
| 980 |
+
"outputs": [],
|
| 981 |
+
"source": [
|
| 982 |
+
"all_pKas = []\n",
|
| 983 |
+
"for pKa_list in list(atom2pKa.values()):\n",
|
| 984 |
+
" all_pKas += [pKa for pKa, _ in pKa_list]"
|
| 985 |
+
]
|
| 986 |
+
},
|
| 987 |
+
{
|
| 988 |
+
"cell_type": "code",
|
| 989 |
+
"execution_count": 28,
|
| 990 |
+
"id": "8e77324c-ed61-4615-a7c7-4f5ca781dc90",
|
| 991 |
+
"metadata": {},
|
| 992 |
+
"outputs": [
|
| 993 |
+
{
|
| 994 |
+
"data": {
|
| 995 |
+
"text/plain": [
|
| 996 |
+
"[10.1, -5.48]"
|
| 997 |
+
]
|
| 998 |
+
},
|
| 999 |
+
"execution_count": 28,
|
| 1000 |
+
"metadata": {},
|
| 1001 |
+
"output_type": "execute_result"
|
| 1002 |
+
}
|
| 1003 |
+
],
|
| 1004 |
+
"source": [
|
| 1005 |
+
"all_pKas"
|
| 1006 |
+
]
|
| 1007 |
+
},
|
| 1008 |
+
{
|
| 1009 |
+
"cell_type": "code",
|
| 1010 |
+
"execution_count": null,
|
| 1011 |
+
"id": "8616be46-1814-4755-b919-4b7790569890",
|
| 1012 |
+
"metadata": {},
|
| 1013 |
+
"outputs": [],
|
| 1014 |
+
"source": []
|
| 1015 |
+
}
|
| 1016 |
+
],
|
| 1017 |
+
"metadata": {
|
| 1018 |
+
"kernelspec": {
|
| 1019 |
+
"display_name": "Python 3",
|
| 1020 |
+
"language": "python",
|
| 1021 |
+
"name": "python3"
|
| 1022 |
+
},
|
| 1023 |
+
"language_info": {
|
| 1024 |
+
"codemirror_mode": {
|
| 1025 |
+
"name": "ipython",
|
| 1026 |
+
"version": 3
|
| 1027 |
+
},
|
| 1028 |
+
"file_extension": ".py",
|
| 1029 |
+
"mimetype": "text/x-python",
|
| 1030 |
+
"name": "python",
|
| 1031 |
+
"nbconvert_exporter": "python",
|
| 1032 |
+
"pygments_lexer": "ipython3",
|
| 1033 |
+
"version": "3.8.10"
|
| 1034 |
+
}
|
| 1035 |
+
},
|
| 1036 |
+
"nbformat": 4,
|
| 1037 |
+
"nbformat_minor": 5
|
| 1038 |
+
}
|
CC/chemaxon.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import csv
|
| 3 |
+
import re
|
| 4 |
+
import platform
|
| 5 |
+
import io
|
| 6 |
+
from subprocess import Popen, PIPE
|
| 7 |
+
from openbabel import openbabel
|
| 8 |
+
import pdb
|
| 9 |
+
from rdkit.Chem import rdchem
|
| 10 |
+
|
| 11 |
+
# Choose the cxcalc executable for the host operating system, and decide
# whether the `echo` helper process must be started through a shell
# (presumably because `echo` is a shell builtin on Windows -- TODO confirm).
if platform.system() != 'Windows':
    CXCALC_BIN = 'cxcalc'
    use_shell_for_echo = False
else:
    # NOTE(review): machine-specific install path; adjust per installation.
    CXCALC_BIN = 'C:\\Users\\vuu10\\AppData\\Local\\Programs\\ChemAxon\\MarvinSuite\\bin\\cxcalc.exe'
    # Alternative location kept from the original source:
    # CXCALC_BIN = 'C:\\Program Files (x86)\\ChemAxon\\MarvinBeans\\bin\\cxcalc.bat'
    use_shell_for_echo = True

# Defaults for the pKa helpers below: the pH at which the major microspecies
# is requested, and the maximal number of acidic / basic pKas asked for.
MID_PH = 7.0
N_PKAS = 20
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class ChemAxonError(Exception):
    """Raised when cxcalc fails or its output cannot be interpreted."""
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def RunCxcalc(molstring, args):
    """
    Pipe a molecule description into ChemAxon's cxcalc and return its output.

    Arguments:
        molstring - text description of the molecule; it is echoed into
                    the standard input of cxcalc
        args      - list of cxcalc command-line arguments (strings)

    Returns:
        the standard output of cxcalc, decoded as UTF-8 text

    Raises:
        ChemAxonError - if cxcalc exits with a non-zero return code
        Exception     - if one of the processes cannot be started (OSError)
    """
    logging.debug("INPUT: echo %s | %s" %
                  (molstring, ' '.join([CXCALC_BIN] + args)))
    try:
        p1 = Popen(["echo", molstring], stdout=PIPE,
                   shell=use_shell_for_echo)
        try:
            p2 = Popen([CXCALC_BIN] + args, stdin=p1.stdout,
                       executable=CXCALC_BIN, stdout=PIPE, shell=False)
        except OSError:
            # cxcalc could not be started: do not leave the echo process
            # and its pipe behind.
            p1.stdout.close()
            p1.wait()
            raise
        # Drop the parent's copy of the pipe so that echo receives SIGPIPE
        # if cxcalc exits early, and so that the handle is not leaked.
        p1.stdout.close()
        res = p2.communicate()[0]
        # Reap the echo process (it has nothing left to do at this point).
        p1.wait()
    except OSError as err:
        raise Exception(
            "Marvin (by ChemAxon) must be installed to calculate pKa data."
        ) from err

    if p2.returncode != 0:
        raise ChemAxonError(str(args))
    logging.debug("OUTPUT: %s" % res)
    return res.decode('utf-8')
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def ParsePkaOutput(s, n_acidic, n_basic):
    """
    Parse the tab-separated table printed by cxcalc for a pKa query.

    The first line of `s` is a header and the second line is the data row:
    an id column, `n_acidic` acidic pKa columns, `n_basic` basic pKa
    columns, a comma separated atom list and the remaining column(s).

    Arguments:
        s        - cxcalc output as text ('\n' or '\r\n' line endings)
        n_acidic - number of acidic pKa columns that were requested
        n_basic  - number of basic pKa columns that were requested

    Returns:
        A pair (atom2pKa, smiles_list):
        - atom2pKa maps an atom index (the number printed by cxcalc minus
          one) to a list of (pKa, 'acid' or 'base') tuples assigned to it
        - smiles_list is the list of columns that follow the atom list
          (all columns after the id when no pKas were requested)

    Raises:
        ChemAxonError - if there is no data row, or if the data row does
                        not have the expected number of columns
    """
    rows = s.split('\n')
    if len(rows) < 2:
        # Previously a bare IndexError; callers only handle ChemAxonError.
        raise ChemAxonError('ChemAxon failed to find any pKas')

    # cxcalc on Windows terminates lines with '\r\n'; without this the last
    # column (the SMILES string) would keep a trailing '\r'.
    fields = rows[1].rstrip('\r').split('\t')[1:]  # [1:] drops the id column

    atom2pKa = {}
    n_pkas = n_acidic + n_basic
    if not n_pkas > 0:
        return atom2pKa, fields

    if len(fields) != n_pkas + 2:
        raise ChemAxonError('ChemAxon failed to find any pKas')

    pKa_list = []
    acid_or_base_list = []
    for i, x in enumerate(fields[:n_pkas]):
        if x == '':
            continue  # unused pKa slot
        pKa_list.append(float(x))
        acid_or_base_list.append('acid' if i < n_acidic else 'base')

    atom_list = fields[n_pkas]
    if atom_list:  # a comma separated list of the deprotonated atoms
        atom_numbers = [int(y) - 1 for y in atom_list.split(',')]
        # The i-th listed atom belongs to the i-th non-empty pKa value.
        for i, j in enumerate(atom_numbers):
            atom2pKa.setdefault(j, []).append(
                (pKa_list[i], acid_or_base_list[i]))

    smiles_list = fields[n_pkas + 1:]
    return atom2pKa, smiles_list
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def GetDissociationConstants_val(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
                                 pH=MID_PH):
    """
    Run a cxcalc pKa query and collect every pKa value it reports.

    Arguments:
        molstring - a text description of the molecule
        n_acidic  - the max no. of acidic pKas to calculate
        n_basic   - the max no. of basic pKas to calculate
        pH        - the pH passed to cxcalc for the major microspecies

    Returns:
        A pair (pKa list, smiles_list):
        - the pKa list holds all pKa values in ascending order
        - smiles_list is the second item returned by ParsePkaOutput
    """
    cxcalc_args = []
    if n_acidic + n_basic > 0:
        cxcalc_args.extend(['pka', '-a', str(n_acidic), '-b', str(n_basic),
                            'majorms', '-M', 'true', '--pH', str(pH)])

    raw_table = RunCxcalc(molstring, cxcalc_args)
    atom2pKa, smiles_list = ParsePkaOutput(raw_table, n_acidic, n_basic)

    # Flatten the per-atom (pKa, kind) tuples into one list of pKa values.
    flat_pKas = [pKa
                 for per_atom in atom2pKa.values()
                 for pKa, _ in per_atom]
    flat_pKas.sort()
    return flat_pKas, smiles_list
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def GetDissociationConstants(molstring, n_acidic=N_PKAS, n_basic=N_PKAS,
                             pH=MID_PH):
    """
    Return the pKa values of a molecule and its major microspecies.

    Arguments:
        molstring - a text description of the molecule (SMILES or InChI)
        n_acidic  - the max no. of acidic pKas to calculate
        n_basic   - the max no. of basic pKas to calculate
        pH        - the pH for which the major pseudoisomer is calculated

    Returns a pair:
        (all_pKas, major_ms)

        - all_pKas is a list of floats (pKa values)
        - major_ms is the first entry of the SMILES list produced by
          GetDissociationConstants_val (the major pseudoisomer at `pH`)
    """
    pKa_values, candidates = GetDissociationConstants_val(
        molstring, n_acidic, n_basic, pH)
    return pKa_values, candidates[0]
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def GetFormulaAndCharge(molstring):
    """
    Ask cxcalc for the chemical formula and formal charge of a molecule.

    Arguments:
        molstring - a text description of the molecule (SMILES or InChI)

    Returns:
        A pair (formula, formal_charge); formal_charge is an int and falls
        back to 0 when cxcalc's value cannot be parsed as an integer.

    Raises:
        ChemAxonError - if the header row of the output is not the
                        expected one
    """
    table = RunCxcalc(molstring, ['formula', 'formalcharge'])

    # The output is a tab separated table whose columns are:
    #     id, Formula, Formal charge
    rows = csv.reader(io.StringIO(table), delimiter='\t')
    if next(rows) != ['id', 'Formula', 'Formal charge']:
        raise ChemAxonError(
            'cannot get the formula and charge for: ' + molstring)

    _, formula, charge_text = next(rows)
    try:
        formal_charge = int(charge_text)
    except ValueError:
        formal_charge = 0
    return formula, formal_charge
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def GetAtomBagAndCharge(molstring):
    """
    Build the atom bag (element -> count) and formal charge of a molecule.

    Arguments:
        molstring - a text description of the molecule (SMILES or InChI)

    Returns:
        A pair (atom_bag, formal_charge):
        - atom_bag maps element symbols to their counts and has an extra
          'e-' entry: the total number of protons minus the formal charge
        - formal_charge is the charge reported by GetFormulaAndCharge
    """
    formula, formal_charge = GetFormulaAndCharge(molstring)
    periodic_table = rdchem.GetPeriodicTable()

    atom_bag = {}
    # A formula may consist of several '.'-separated parts, each with an
    # optional leading multiplier.
    for mol_formula_times in formula.split('.'):
        # Raw strings: '\d' and '\w' are invalid escapes in plain literals.
        for times, mol_formula in re.findall(r'^(\d+)?(\w+)',
                                             mol_formula_times):
            times = int(times) if times else 1
            for atom, count in re.findall(r'([A-Z][a-z]*)([0-9]*)',
                                          mol_formula):
                count = int(count) if count != '' else 1
                atom_bag[atom] = atom_bag.get(atom, 0) + count * times

    n_protons = sum(c * periodic_table.GetAtomicNumber(str(elem))
                    for (elem, c) in atom_bag.items())
    atom_bag['e-'] = n_protons - formal_charge

    return atom_bag, formal_charge
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
if __name__ == "__main__":
    # Manual smoke test: print formula, charge and pKas of a sample compound.
    logging.getLogger().setLevel(logging.WARNING)
    from molecule import Molecule

    examples = [
        ('D-Erythrulose',
         'InChI=1S/C4H8O4/c5-1-3(7)4(8)2-6/h3,5-7H,1-2H2/t3-/m1/s1'),
    ]

    for label, inchi_string in examples:
        formula_and_charge = GetFormulaAndCharge(inchi_string)
        print("Formula: %s\nCharge: %d" % formula_and_charge)

        pKa_values, major_smiles = GetDissociationConstants(inchi_string)
        major_molecule = Molecule.FromSmiles(major_smiles)
        summary = (label, major_molecule.ToInChI(), str(pKa_values))
        print("Name: %s\nInChI: %s\npKas: %s" % summary)
|
CC/compound.py
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import urllib.request, urllib.parse, urllib.error, logging
|
| 2 |
+
from openbabel import openbabel
|
| 3 |
+
import chemaxon
|
| 4 |
+
import numpy as np
|
| 5 |
+
from thermodynamic_constants import R, debye_huckel
|
| 6 |
+
from scipy.special import logsumexp
|
| 7 |
+
|
| 8 |
+
# pH window: only pKa values strictly between these bounds are kept when a
# compound is built from ChemAxon's output.
MIN_PH, MAX_PH = 0.0, 14.0
|
| 10 |
+
|
| 11 |
+
class Compound(object):
|
| 12 |
+
|
| 13 |
+
def __init__(self, database, compound_id, inchi,
|
| 14 |
+
atom_bag, pKas, smiles_pH7, majorMSpH7, nHs, zs):
|
| 15 |
+
self.database = database
|
| 16 |
+
self.compound_id = compound_id
|
| 17 |
+
self.inchi = inchi
|
| 18 |
+
self.atom_bag = atom_bag
|
| 19 |
+
self.pKas = pKas
|
| 20 |
+
self.smiles_pH7 = smiles_pH7
|
| 21 |
+
self.majorMSpH7 = majorMSpH7
|
| 22 |
+
self.nHs = nHs
|
| 23 |
+
self.zs = zs
|
| 24 |
+
|
| 25 |
+
@staticmethod
|
| 26 |
+
def from_kegg(compound_id):
|
| 27 |
+
return Compound.from_inchi('KEGG', compound_id,
|
| 28 |
+
Compound.get_inchi(compound_id))
|
| 29 |
+
|
| 30 |
+
@staticmethod
|
| 31 |
+
def from_inchi(database, compound_id, inchi):
|
| 32 |
+
if compound_id == 'C00080':
|
| 33 |
+
# We add an exception for H+ (and put nH = 0) in order to eliminate
|
| 34 |
+
# its effect of the Legendre transform
|
| 35 |
+
return Compound(database, compound_id, inchi,
|
| 36 |
+
{'H' : 1}, [], None, 0, [0], [0])
|
| 37 |
+
elif compound_id == 'C00087':
|
| 38 |
+
# ChemAxon gets confused with the structure of sulfur
|
| 39 |
+
# (returns a protonated form, [SH-], at pH 7).
|
| 40 |
+
# So we implement it manually here.
|
| 41 |
+
return Compound(database, compound_id, inchi,
|
| 42 |
+
{'S' : 1, 'e-': 16}, [], 'S', 0, [0], [0])
|
| 43 |
+
elif compound_id == 'C00237':
|
| 44 |
+
# ChemAxon gets confused with the structure of carbon monoxide
|
| 45 |
+
# (returns a protonated form, [CH]#[O+], at pH 7).
|
| 46 |
+
# So we implement it manually here.
|
| 47 |
+
return Compound(database, compound_id, inchi,
|
| 48 |
+
{'C' : 1, 'O': 1, 'e-': 14}, [], '[C-]#[O+]', 0, [0], [0])
|
| 49 |
+
elif compound_id == 'C00282':
|
| 50 |
+
# ChemAxon gets confused with the structure of hydrogen
|
| 51 |
+
# So we implement it manually here.
|
| 52 |
+
return Compound(database, compound_id, inchi,
|
| 53 |
+
{'H' : 2, 'e-': 2}, [], None, 0, [2], [0])
|
| 54 |
+
elif compound_id == 'C01353':
|
| 55 |
+
# When given the structure of carbonic acid, ChemAxon returns the
|
| 56 |
+
# pKas for CO2(tot), i.e. it assumes the non-hydrated CO2 species is
|
| 57 |
+
# one of the pseudoisomers, and the lower pKa value is 6.05 instead of
|
| 58 |
+
# 3.78. Here, we introduce a new "KEGG" compound that will represent
|
| 59 |
+
# pure bicarbonate (without CO2(sp)) and therefore plug in the pKa
|
| 60 |
+
# values from Alberty's book.
|
| 61 |
+
return Compound(database, compound_id, inchi,
|
| 62 |
+
{'C': 1, 'H': 1, 'O': 3, 'e-': 32}, [10.33, 3.43],
|
| 63 |
+
'OC(=O)[O-]', 1, [0, 1, 2], [-2, -1, 0])
|
| 64 |
+
# Metal Cations get multiple pKa values from ChemAxon, which is
|
| 65 |
+
# obviously a bug. We override the important ones here:
|
| 66 |
+
elif compound_id == 'C00076': # Ca2+
|
| 67 |
+
return Compound(database, compound_id, inchi,
|
| 68 |
+
{'Ca' : 1, 'e-': 18}, [], '[Ca++]', 0, [0], [2])
|
| 69 |
+
elif compound_id == 'C00238': # K+
|
| 70 |
+
return Compound(database, compound_id, inchi,
|
| 71 |
+
{'K' : 1, 'e-': 18}, [], '[K+]', 0, [0], [1])
|
| 72 |
+
elif compound_id == 'C00305': # Mg2+
|
| 73 |
+
return Compound(database, compound_id, inchi,
|
| 74 |
+
{'Mg' : 1, 'e-': 10}, [], '[Mg++]', 0, [0], [2])
|
| 75 |
+
elif compound_id == 'C14818': # Fe2+
|
| 76 |
+
return Compound(database, compound_id, inchi,
|
| 77 |
+
{'Fe' : 1, 'e-': 24}, [], '[Fe++]', 0, [0], [2])
|
| 78 |
+
elif compound_id == 'C14819': # Fe3+
|
| 79 |
+
return Compound(database, compound_id, inchi,
|
| 80 |
+
{'Fe' : 1, 'e-': 23}, [], '[Fe+++]', 0, [0], [3])
|
| 81 |
+
elif compound_id == 'C00138': # ferredoxin(red)
|
| 82 |
+
return Compound(database, compound_id, inchi,
|
| 83 |
+
{'Fe' : 1, 'e-': 26}, [], None, 0, [0], [0])
|
| 84 |
+
elif compound_id == 'C00139': # ferredoxin(ox)
|
| 85 |
+
return Compound(database, compound_id, inchi,
|
| 86 |
+
{'Fe' : 1, 'e-': 25}, [], None, 0, [0], [1])
|
| 87 |
+
elif inchi is None:
|
| 88 |
+
# If the compound has no explicit structure, we assume that it has
|
| 89 |
+
# no proton dissociations in the relevant pH range
|
| 90 |
+
return Compound(database, compound_id, inchi,
|
| 91 |
+
{}, [], None, 0, [0], [0])
|
| 92 |
+
|
| 93 |
+
# Otherwise, we use ChemAxon's software to get the pKas and the
|
| 94 |
+
# properties of all microspecies
|
| 95 |
+
|
| 96 |
+
try:
|
| 97 |
+
pKas, major_ms_smiles = chemaxon.GetDissociationConstants(inchi)
|
| 98 |
+
major_ms_smiles = Compound.smiles2smiles(major_ms_smiles)
|
| 99 |
+
pKas = sorted([pka for pka in pKas if pka > MIN_PH and pka < MAX_PH], reverse=True)
|
| 100 |
+
except chemaxon.ChemAxonError:
|
| 101 |
+
logging.warning('chemaxon failed to find pKas for this molecule: ' + inchi)
|
| 102 |
+
# use the original InChI to get the parameters (i.e. assume it
|
| 103 |
+
# represents the major microspecies at pH 7)
|
| 104 |
+
major_ms_smiles = Compound.inchi2smiles(inchi)
|
| 105 |
+
pKas = []
|
| 106 |
+
|
| 107 |
+
if major_ms_smiles:
|
| 108 |
+
atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(major_ms_smiles)
|
| 109 |
+
major_ms_nH = atom_bag.get('H', 0)
|
| 110 |
+
else:
|
| 111 |
+
atom_bag = {}
|
| 112 |
+
major_ms_charge = 0
|
| 113 |
+
major_ms_nH = 0
|
| 114 |
+
|
| 115 |
+
n_species = len(pKas) + 1
|
| 116 |
+
if pKas == []:
|
| 117 |
+
majorMSpH7 = 0
|
| 118 |
+
else:
|
| 119 |
+
majorMSpH7 = len([1 for pka in pKas if pka > 7])
|
| 120 |
+
|
| 121 |
+
nHs = []
|
| 122 |
+
zs = []
|
| 123 |
+
|
| 124 |
+
for i in range(n_species):
|
| 125 |
+
zs.append((i - majorMSpH7) + major_ms_charge)
|
| 126 |
+
nHs.append((i - majorMSpH7) + major_ms_nH)
|
| 127 |
+
|
| 128 |
+
return Compound(database, compound_id, inchi,
|
| 129 |
+
atom_bag, pKas, major_ms_smiles, majorMSpH7, nHs, zs)
|
| 130 |
+
|
| 131 |
+
def to_json_dict(self):
|
| 132 |
+
return {'database' : self.database,
|
| 133 |
+
'compound_id' : self.compound_id,
|
| 134 |
+
'inchi' : self.inchi,
|
| 135 |
+
'atom_bag' : self.atom_bag,
|
| 136 |
+
'pKas' : self.pKas,
|
| 137 |
+
'smiles_pH7' : self.smiles_pH7,
|
| 138 |
+
'majorMSpH7' : self.majorMSpH7,
|
| 139 |
+
'nHs' : self.nHs,
|
| 140 |
+
'zs' : self.zs}
|
| 141 |
+
|
| 142 |
+
@staticmethod
|
| 143 |
+
def from_json_dict(d):
|
| 144 |
+
return Compound(d['database'], d['compound_id'], d['inchi'], d['atom_bag'],
|
| 145 |
+
d['pKas'], d['smiles_pH7'], d['majorMSpH7'],
|
| 146 |
+
d['nHs'], d['zs'])
|
| 147 |
+
|
| 148 |
+
@staticmethod
|
| 149 |
+
def get_inchi(compound_id):
|
| 150 |
+
s_mol = urllib.request.urlopen('http://rest.kegg.jp/get/cpd:%s/mol' % compound_id).read()
|
| 151 |
+
return Compound.mol2inchi(s_mol)
|
| 152 |
+
|
| 153 |
+
@staticmethod
|
| 154 |
+
def mol2inchi(s):
|
| 155 |
+
openbabel.obErrorLog.SetOutputLevel(-1)
|
| 156 |
+
|
| 157 |
+
conv = openbabel.OBConversion()
|
| 158 |
+
conv.SetInAndOutFormats('mol', 'inchi')
|
| 159 |
+
conv.AddOption("F", conv.OUTOPTIONS)
|
| 160 |
+
conv.AddOption("T", conv.OUTOPTIONS)
|
| 161 |
+
conv.AddOption("x", conv.OUTOPTIONS, "noiso")
|
| 162 |
+
conv.AddOption("w", conv.OUTOPTIONS)
|
| 163 |
+
obmol = openbabel.OBMol()
|
| 164 |
+
if not conv.ReadString(obmol, str(s)):
|
| 165 |
+
return None
|
| 166 |
+
inchi = conv.WriteString(obmol, True) # second argument is trimWhitespace
|
| 167 |
+
if inchi == '':
|
| 168 |
+
return None
|
| 169 |
+
else:
|
| 170 |
+
return inchi
|
| 171 |
+
|
| 172 |
+
@staticmethod
|
| 173 |
+
def inchi2smiles(inchi):
|
| 174 |
+
openbabel.obErrorLog.SetOutputLevel(-1)
|
| 175 |
+
|
| 176 |
+
conv = openbabel.OBConversion()
|
| 177 |
+
conv.SetInAndOutFormats('inchi', 'smiles')
|
| 178 |
+
#conv.AddOption("F", conv.OUTOPTIONS)
|
| 179 |
+
#conv.AddOption("T", conv.OUTOPTIONS)
|
| 180 |
+
#conv.AddOption("x", conv.OUTOPTIONS, "noiso")
|
| 181 |
+
#conv.AddOption("w", conv.OUTOPTIONS)
|
| 182 |
+
obmol = openbabel.OBMol()
|
| 183 |
+
conv.ReadString(obmol, str(inchi))
|
| 184 |
+
smiles = conv.WriteString(obmol, True) # second argument is trimWhitespace
|
| 185 |
+
if smiles == '':
|
| 186 |
+
return None
|
| 187 |
+
else:
|
| 188 |
+
return smiles
|
| 189 |
+
|
| 190 |
+
@staticmethod
|
| 191 |
+
def smiles2smiles(smiles_in):
|
| 192 |
+
openbabel.obErrorLog.SetOutputLevel(-1)
|
| 193 |
+
|
| 194 |
+
conv = openbabel.OBConversion()
|
| 195 |
+
conv.SetInAndOutFormats('smiles', 'smiles')
|
| 196 |
+
#conv.AddOption("F", conv.OUTOPTIONS)
|
| 197 |
+
#conv.AddOption("T", conv.OUTOPTIONS)
|
| 198 |
+
#conv.AddOption("x", conv.OUTOPTIONS, "noiso")
|
| 199 |
+
#conv.AddOption("w", conv.OUTOPTIONS)
|
| 200 |
+
obmol = openbabel.OBMol()
|
| 201 |
+
conv.ReadString(obmol, str(smiles_in))
|
| 202 |
+
smiles_out = conv.WriteString(obmol, True) # second argument is trimWhitespace
|
| 203 |
+
if smiles_out == '':
|
| 204 |
+
return None
|
| 205 |
+
else:
|
| 206 |
+
return smiles_out
|
| 207 |
+
@staticmethod
|
| 208 |
+
def smiles2inchi(smiles):
|
| 209 |
+
openbabel.obErrorLog.SetOutputLevel(-1)
|
| 210 |
+
|
| 211 |
+
conv = openbabel.OBConversion()
|
| 212 |
+
conv.SetInAndOutFormats('smiles', 'inchi')
|
| 213 |
+
conv.AddOption("F", conv.OUTOPTIONS)
|
| 214 |
+
conv.AddOption("T", conv.OUTOPTIONS)
|
| 215 |
+
conv.AddOption("x", conv.OUTOPTIONS, "noiso")
|
| 216 |
+
conv.AddOption("w", conv.OUTOPTIONS)
|
| 217 |
+
obmol = openbabel.OBMol()
|
| 218 |
+
conv.ReadString(obmol, str(smiles))
|
| 219 |
+
inchi = conv.WriteString(obmol, True) # second argument is trimWhitespace
|
| 220 |
+
if inchi == '':
|
| 221 |
+
return None
|
| 222 |
+
else:
|
| 223 |
+
return inchi
|
| 224 |
+
|
| 225 |
+
def __str__(self):
|
| 226 |
+
return "%s\nInChI: %s\npKas: %s\nmajor MS: nH = %d, charge = %d" % \
|
| 227 |
+
(self.compound_id, self.inchi, ', '.join(['%.2f' % p for p in self.pKas]),
|
| 228 |
+
self.nHs[self.majorMSpH7], self.zs[self.majorMSpH7])
|
| 229 |
+
|
| 230 |
+
def _dG0_prime_vector(self, pH, I, T):
|
| 231 |
+
"""
|
| 232 |
+
Calculates the difference in kJ/mol between dG'0 and
|
| 233 |
+
the dG0 of the MS with the least hydrogens (dG0[0])
|
| 234 |
+
|
| 235 |
+
Returns:
|
| 236 |
+
dG'0 - dG0[0]
|
| 237 |
+
"""
|
| 238 |
+
if self.inchi is None:
|
| 239 |
+
return 0
|
| 240 |
+
elif self.pKas == []:
|
| 241 |
+
dG0s = np.zeros((1, 1))
|
| 242 |
+
else:
|
| 243 |
+
dG0s = -np.cumsum([0] + self.pKas) * R * T * np.log(10)
|
| 244 |
+
dG0s = dG0s
|
| 245 |
+
DH = debye_huckel((I, T))
|
| 246 |
+
|
| 247 |
+
# dG0' = dG0 + nH * (R T ln(10) pH + DH) - charge^2 * DH
|
| 248 |
+
pseudoisomers = np.vstack([dG0s, np.array(self.nHs), np.array(self.zs)]).T
|
| 249 |
+
dG0_prime_vector = pseudoisomers[:, 0] + \
|
| 250 |
+
pseudoisomers[:, 1] * (R * T * np.log(10) * pH + DH) - \
|
| 251 |
+
pseudoisomers[:, 2]**2 * DH
|
| 252 |
+
return dG0_prime_vector
|
| 253 |
+
|
| 254 |
+
def _transform(self, pH, I, T):
|
| 255 |
+
|
| 256 |
+
return -R * T * logsumexp(self._dG0_prime_vector(pH, I, T) / (-R * T))
|
| 257 |
+
|
| 258 |
+
def _ddG(self, i_from, i_to, T):
|
| 259 |
+
"""
|
| 260 |
+
Calculates the difference in kJ/mol between two MSs.
|
| 261 |
+
|
| 262 |
+
Returns:
|
| 263 |
+
dG0[i_to] - dG0[i_from]
|
| 264 |
+
"""
|
| 265 |
+
if not (0 <= i_from <= len(self.pKas)):
|
| 266 |
+
raise ValueError('MS index is out of bounds: 0 <= %d <= %d' % (i_from, len(self.pKas)))
|
| 267 |
+
|
| 268 |
+
if not (0 <= i_to <= len(self.pKas)):
|
| 269 |
+
raise ValueError('MS index is out of bounds: 0 <= %d <= %d' % (i_to, len(self.pKas)))
|
| 270 |
+
|
| 271 |
+
if i_from == i_to:
|
| 272 |
+
return 0
|
| 273 |
+
elif i_from < i_to:
|
| 274 |
+
return sum(self.pKas[i_from:i_to]) * R * T * np.log(10)
|
| 275 |
+
else:
|
| 276 |
+
return -sum(self.pKas[i_to:i_from]) * R * T * np.log(10)
|
| 277 |
+
|
| 278 |
+
def transform(self, i, pH, I, T):
    """
        Returns the difference in kJ/mol between dG'0 and the dG0 of the
        MS with index 'i'.

        The value is the sum of two parts:
            self._transform(pH, I, T)   -> dG'0 - dG0[0]
            self._ddG(0, i, T)          -> dG0[0] - dG0[i]
        which together give dG'0 - dG0[i].
    """
    relative_to_first_ms = self._transform(pH, I, T)
    first_ms_to_ms_i = self._ddG(0, i, T)
    return relative_to_first_ms + first_ms_to_ms_i
|
| 287 |
+
|
| 288 |
+
def transform_pH7(self, pH, I, T):
    """
        Returns the transform relative to the microspecies whose index is
        stored in self.majorMSpH7 (the major MS in pH 7).
    """
    major_ms_index = self.majorMSpH7
    return self.transform(major_ms_index, pH, I, T)
|
| 293 |
+
|
| 294 |
+
def transform_neutral(self, pH, I, T):
    """
        Returns the transform for the MS with no charge.

        Args:
            pH, I, T: forwarded unchanged to transform().

        Returns:
            self.transform(i, pH, I, T), where i is the index of the first
            zero entry in self.zs.

        Raises:
            ValueError: if no entry of self.zs equals 0.
    """
    # Only the index lookup is guarded, so a ValueError raised inside
    # transform() is not mislabelled as "no neutral microspecies".
    try:
        neutral_index = self.zs.index(0)
    except ValueError:
        raise ValueError("The compound (%s) does not have a microspecies with 0 charge"
                         % self.compound_id)

    # transform() expects the microspecies index as its first argument.
    return self.transform(neutral_index, pH, I, T)
|
| 303 |
+
|
| 304 |
+
def get_species(self, major_ms_dG0_f, T):
    """
        Generator over all microspecies of this compound.

        Starting from the chemical formation energy of the major
        microspecies (index self.majorMSpH7), the pKa-based difference
        from _ddG() is added to obtain the formation energy of every
        other species.

        Yields:
            One dictionary per (nH, z) pair with the keys 'phase'
            (always 'aqueous'), 'dG0_f' (rounded to 2 decimals), 'nH',
            'z' (charge) and 'nMg' (always 0).
    """
    for ms_index, (n_hydrogens, charge) in enumerate(zip(self.nHs, self.zs)):
        formation_energy = major_ms_dG0_f + self._ddG(ms_index, self.majorMSpH7, T)
        yield {
            'phase': 'aqueous',
            'dG0_f': np.round(formation_energy, 2),
            'nH': n_hydrogens,
            'z': charge,
            'nMg': 0,
        }
|
| 316 |
+
|
| 317 |
+
if __name__ == '__main__':
    # Manual smoke run: build three KEGG compounds, report their formula,
    # electron count and pKa data on stderr, then write the cache.
    import sys
    import json

    logger = logging.getLogger('')
    logger.setLevel(logging.DEBUG)

    from compound_cacher import CompoundCacher, CompoundEncoder
    from molecule import Molecule, OpenBabelError

    ccache = CompoundCacher(cache_fname=None)

    for compound_id in ['C00087', 'C00282', 'C00237']:
        comp = Compound.from_kegg(compound_id)
        try:
            mol = Molecule.FromInChI(str(comp.inchi))
            report = (str(comp.inchi), mol.GetFormula(), mol.GetNumElectrons())
            sys.stderr.write('%s : formula = %s, nE = %s' % report)
        except OpenBabelError:
            # the formula line is best-effort; the compound is cached anyway
            pass

        ccache.add(comp)
        details = (compound_id, str(comp.nHs), str(comp.zs), str(comp.pKas), str(comp.atom_bag))
        sys.stderr.write('\ncompound id = %s, nH = %s, z = %s, pKa = %s, bag = %s\n\n\n' % details)

    ccache.dump()
|
CC/compound_cacher.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json, os, logging, csv, gzip, numpy, pdb
|
| 2 |
+
from compound import Compound
|
| 3 |
+
# directory containing this module; all data files are resolved against it
base_path = os.path.dirname(os.path.realpath(__file__))

### Input Files:
# the original version of the KEGG compound file
OLD_COMPOUND_JSON_FNAME = os.path.join(
    base_path, './data_cc/equilibrator_compounds.json.gz')

# a TSV file with additional names and InChIs (mostly compounds that are
# missing from KEGG and were added manually)
KEGG_ADDITIONS_TSV_FNAME = os.path.join(
    base_path, './data_cc/kegg_additions.tsv')

### Files created by this module:
# names and InChIs only
KEGG_COMPOUND_JSON_FNAME = os.path.join(
    base_path, './data_cc/kegg_compounds.json.gz')

# names, InChIs and pKa data
DEFAULT_CACHE_FNAME = os.path.join(
    base_path, './data_cc/compounds.json.gz')
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class CompoundEncoder(json.JSONEncoder):
    """JSON encoder that serializes Compound objects through to_json_dict()."""

    def default(self, obj):
        # anything that is not a Compound is left to the stock encoder
        if not isinstance(obj, Compound):
            return json.JSONEncoder.default(self, obj)
        return obj.to_json_dict()
|
| 26 |
+
|
| 27 |
+
class Singleton(type):
    """
    Metaclass that keeps a single instance per class in 'cls.instance'.

    The first call to the class constructs the instance; every later call
    returns that same object and ignores the arguments it was given.
    """

    def __init__(cls, name, bases, dic):
        super().__init__(name, bases, dic)
        cls.instance = None

    def __call__(cls, *args, **kw):
        if cls.instance is not None:
            return cls.instance
        cls.instance = super().__call__(*args, **kw)
        return cls.instance
|
| 35 |
+
|
| 36 |
+
class CompoundCacher(object, metaclass=Singleton):
    """
    CompoundCacher is a singleton that handles caching of Compound objects
    for the component-contribution package. The Compounds are retrieved by
    their ID (which is the KEGG ID in most cases).
    The first time a Compound is requested, it is obtained from the relevant
    database and a Compound object is created (this takes a while because
    it usually involves internet communication and then invoking the ChemAxon
    plugin for calculating the pKa values for that structure).
    Any further request for the same Compound ID will draw the object from
    the cache. When the method dump() is called, all cached data is written
    to a file that will be loaded in future python sessions.

    Because of the Singleton metaclass, constructor arguments only take
    effect on the first instantiation; later calls return the same object.
    """

    def __init__(self, cache_fname=None):
        """
        Args:
            cache_fname: path of the gzipped JSON cache file; when None,
                DEFAULT_CACHE_FNAME is used.
        """
        self.cache_fname = cache_fname
        if self.cache_fname is None:
            self.cache_fname = DEFAULT_CACHE_FNAME

        # 'with' closes the gzip handle even if parsing fails
        with gzip.open(KEGG_COMPOUND_JSON_FNAME, 'r') as fp:
            compounds = json.load(fp)
        self.compound_id2inchi = {d['compound_id']: d['inchi']
                                  for d in compounds}
        self.need_to_update_cache_file = False
        self.load()

    def get_all_compound_ids(self):
        """Returns the sorted list of all compound IDs that have an InChI entry."""
        return sorted(self.compound_id2inchi.keys())

    def load(self):
        """
        Parses the JSON cache file (if it exists) and stores the result in
        the dictionary 'compound_dict'; the IDs found in the file are also
        kept, in file order, in 'compound_ids'.
        """
        self.compound_dict = {}
        self.compound_ids = []
        if os.path.exists(self.cache_fname):
            with gzip.open(self.cache_fname, 'r') as fp:
                cached = json.load(fp)
            for d in cached:
                self.compound_ids.append(d['compound_id'])
                self.compound_dict[d['compound_id']] = Compound.from_json_dict(d)

    def dump(self):
        """
        Writes all cached compounds (sorted by compound_id) to the cache
        file, but only if something was added since the last dump.
        """
        if not self.need_to_update_cache_file:
            return

        # serialize first, so a failure here does not truncate the old file
        data = sorted(self.compound_dict.values(),
                      key=lambda d: d.compound_id)
        dict_data = [x.to_json_dict() for x in data]

        # json.dump() writes str, so the gzip stream must be in text mode
        # ('w' alone is binary and raises TypeError on Python 3)
        with gzip.open(self.cache_fname, 'wt', encoding='utf-8') as fp:
            json.dump(dict_data, fp, cls=CompoundEncoder,
                      sort_keys=True, indent=4, separators=(',', ': '))
        self.need_to_update_cache_file = False

    def get_compound(self, compound_id, kegg_additions_cids=None):
        """
        Returns the Compound for 'compound_id', creating and caching it
        from its InChI when it is not cached yet.

        Args:
            compound_id: key into compound_id2inchi / compound_dict.
            kegg_additions_cids: optional collection of IDs whose cached
                entry must be discarded and rebuilt from the current InChI.

        Raises:
            KeyError: if the compound has to be (re)built and its ID is
                not in compound_id2inchi.
        """
        if compound_id not in self.compound_dict:
            logging.debug('Cache miss: %s' % str(compound_id))
            inchi = self.compound_id2inchi[compound_id]
            comp = Compound.from_inchi('KEGG', compound_id, inchi)
            self.add(comp)

        # if a compound id is in the kegg_additions.tsv
        # remove the one in cache, and replace it with new one
        else:
            if kegg_additions_cids is not None:
                if compound_id in kegg_additions_cids:
                    self.remove(compound_id)
                    logging.debug('Cache update: %s' % str(compound_id))
                    inchi = self.compound_id2inchi[compound_id]
                    comp = Compound.from_inchi('KEGG', compound_id, inchi)
                    self.add(comp)

        # note: this message is emitted on every call, including misses
        logging.debug('Cache hit: %s' % str(compound_id))
        return self.compound_dict[compound_id]

    def remove(self, compound_id):
        """Drops 'compound_id' from the in-memory cache (logs if absent)."""
        if compound_id in self.compound_dict:
            del self.compound_dict[compound_id]
        else:
            logging.debug('%s is not cached, cannot remove it' % str(compound_id))

    def add(self, comp):
        """Stores 'comp' under its compound_id and marks the cache as dirty."""
        self.compound_dict[comp.compound_id] = comp
        self.need_to_update_cache_file = True

    def get_element_matrix(self, compound_ids):
        """
        Builds the elemental matrix of the given compounds.

        Args:
            compound_ids: a single ID string or an iterable of IDs.

        Returns:
            (elements, Ematrix): the sorted list of atom-bag keys (without
            'H') and a numpy.matrix with one row per compound and one
            column per element. Rows of compounds whose atom_bag is None
            are filled with NaN.
        """
        if isinstance(compound_ids, str):
            compound_ids = [compound_ids]

        # gather the "atom bags" of all compounds in a list 'atom_bag_list'
        elements = set()
        atom_bag_list = []
        for compound_id in compound_ids:
            comp = self.get_compound(compound_id)
            atom_bag = comp.atom_bag
            if atom_bag is not None:
                elements.update(atom_bag.keys())
            atom_bag_list.append(atom_bag)
        elements.discard('H')  # don't balance H (it's enough to balance e-)
        elements = sorted(elements)

        # create the elemental matrix, where each row is a compound and each
        # column is an element (or e-)
        Ematrix = numpy.matrix(numpy.zeros((len(atom_bag_list), len(elements))))
        for i, atom_bag in enumerate(atom_bag_list):
            if atom_bag is None:
                Ematrix[i, :] = numpy.nan
            else:
                for j, elem in enumerate(elements):
                    Ematrix[i, j] = atom_bag.get(elem, 0)
        return elements, Ematrix

    ###############################################################################

    @staticmethod
    def RebuildCompoundJSON():
        """
        Regenerates KEGG_COMPOUND_JSON_FNAME from the original compound
        file plus the manual additions TSV.

        Returns:
            The list of compound IDs read from the additions TSV.
        """
        kegg_dict = {}
        with gzip.open(OLD_COMPOUND_JSON_FNAME, 'r') as fp:
            old_compounds = json.load(fp)
        for d in old_compounds:
            cid = d['CID']
            kegg_dict[cid] = {'compound_id': cid,
                              'name': d['name'],
                              'names': d['names'],
                              'inchi': d['InChI']}

        # override some of the compounds or add new ones with 'fake' IDs,
        # i.e. C80000 or higher.
        kegg_additions_cids = []
        with open(KEGG_ADDITIONS_TSV_FNAME, 'r') as tsv:
            for d in csv.DictReader(tsv, delimiter='\t'):
                cid = 'C%05d' % int(d['cid'])
                kegg_additions_cids.append(cid)
                kegg_dict[cid] = {'compound_id': cid,
                                  'name': d['name'],
                                  'names': [d['name']],
                                  'inchi': d['inchi']}

        compound_json = [kegg_dict[compound_id] for compound_id in sorted(kegg_dict.keys())]

        # text mode is required because json.dump() writes str, not bytes
        with gzip.open(KEGG_COMPOUND_JSON_FNAME, 'wt', encoding='utf-8') as new_json:
            json.dump(compound_json, new_json, sort_keys=True, indent=4)
        return kegg_additions_cids

    ###############################################################################

    @staticmethod
    def BuildCache(start_from_scratch=False, kegg_additions_cids=None):
        """
        Requests every known compound once so that it ends up in the cache,
        dumping the cache to disk every 100 compounds and once at the end.

        Args:
            start_from_scratch: delete DEFAULT_CACHE_FNAME first if it exists.
            kegg_additions_cids: forwarded to get_compound().
        """
        if start_from_scratch and os.path.exists(DEFAULT_CACHE_FNAME):
            os.remove(DEFAULT_CACHE_FNAME)

        ccache = CompoundCacher(cache_fname=DEFAULT_CACHE_FNAME)

        for i, compound_id in enumerate(ccache.get_all_compound_ids(), start=1):
            logging.debug('Caching %s' % compound_id)
            comp = ccache.get_compound(compound_id, kegg_additions_cids=kegg_additions_cids)
            logging.debug(str(comp))
            if i % 100 == 0:
                logging.debug('Dumping Cache ...')
                ccache.dump()

        ccache.dump()
|
| 193 |
+
|
| 194 |
+
###############################################################################
|
| 195 |
+
|
| 196 |
+
if __name__ == '__main__':
    # Rebuild the compound JSON from its sources, then (re)fill the cache.
    # Switch the level to logging.WARNING for a quieter run.
    logging.getLogger('').setLevel(logging.DEBUG)

    added_cids = CompoundCacher.RebuildCompoundJSON()
    CompoundCacher.BuildCache(start_from_scratch=False,
                              kegg_additions_cids=added_cids)
|
CC/molecule.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from openbabel import openbabel
|
| 2 |
+
import types
|
| 3 |
+
import re
|
| 4 |
+
import chemaxon
|
| 5 |
+
from thermodynamic_constants import default_T, default_pH
|
| 6 |
+
import pdb
|
| 7 |
+
|
| 8 |
+
class OpenBabelError(Exception):
    """Raised when a string cannot be read into, or written from, an OBMol."""
|
| 10 |
+
|
| 11 |
+
class Molecule(object):
    """
    Thin wrapper around an openbabel OBMol that also keeps an optional
    title and lazily computed SMILES / InChI strings.
    """

    # for more rendering options visit:
    # http://www.ggasoftware.com/opensource/indigo/api/options#rendering

    # Shared, class-level openbabel helpers.
    # NOTE(review): OBElementTable may not exist in Open Babel 3.x builds;
    # confirm against the installed openbabel version.
    _obElements = openbabel.OBElementTable()
    # Re-initialized by every VerifySmarts()/FindSmarts() call.
    _obSmarts = openbabel.OBSmartsPattern()

    @staticmethod
    def GetNumberOfElements():
        """Returns the number of elements known to the element table."""
        return Molecule._obElements.GetNumberOfElements()

    @staticmethod
    def GetAllElements():
        """Returns the symbols for indices 0 .. GetNumberOfElements()-1."""
        return [Molecule._obElements.GetSymbol(i) for i in
                range(Molecule.GetNumberOfElements())]

    @staticmethod
    def GetSymbol(atomic_num):
        """Returns the element symbol for the given atomic number."""
        return Molecule._obElements.GetSymbol(atomic_num)

    @staticmethod
    def GetAtomicNum(elem):
        """Returns the atomic number for the given element symbol."""
        return Molecule._obElements.GetAtomicNum(elem)

    @staticmethod
    def VerifySmarts(smarts):
        """Returns the result of initializing the shared pattern with 'smarts'."""
        return Molecule._obSmarts.Init(smarts)

    def __init__(self):
        self.title = None
        self.obmol = openbabel.OBMol()
        self.smiles = None
        self.inchi = None

    def __str__(self):
        # first non-empty of title, SMILES, InChI; empty string otherwise
        return self.title or self.smiles or self.inchi or ""

    def __len__(self):
        return self.GetNumAtoms()

    def Clone(self):
        """Returns a new Molecule with a copied OBMol and the same strings."""
        tmp = Molecule()
        tmp.title = self.title
        tmp.obmol = openbabel.OBMol(self.obmol)
        tmp.smiles = self.smiles
        tmp.inchi = self.inchi
        return tmp

    def SetTitle(self, title):
        self.title = title

    @staticmethod
    def FromSmiles(smiles):
        """
        Creates a Molecule from a SMILES string; the title is set to it.

        Raises:
            OpenBabelError: if the string cannot be read or the SMILES
                cannot be regenerated from the parsed molecule.
        """
        m = Molecule()
        m.smiles = smiles
        obConversion = openbabel.OBConversion()
        obConversion.AddOption("w", obConversion.OUTOPTIONS)
        obConversion.SetInFormat("smiles")
        if not obConversion.ReadString(m.obmol, m.smiles):
            raise OpenBabelError("Cannot read the SMILES string: " + smiles)
        try:
            m.UpdateSmiles()
        except OpenBabelError:
            raise OpenBabelError("Failed to create Molecule from SMILES: " + smiles)
        m.SetTitle(smiles)
        return m

    @staticmethod
    def FromInChI(inchi):
        """
        Creates a Molecule from an InChI string; the title is set to it.

        The result of ReadString() is not checked here; a failure surfaces
        only if the InChI cannot be regenerated afterwards.

        Raises:
            OpenBabelError: if the InChI cannot be regenerated.
        """
        m = Molecule()
        m.inchi = inchi
        obConversion = openbabel.OBConversion()
        obConversion.AddOption("w", obConversion.OUTOPTIONS)
        obConversion.SetInFormat("inchi")
        obConversion.ReadString(m.obmol, m.inchi)
        try:
            m.UpdateInChI()
        except OpenBabelError:
            raise OpenBabelError("Failed to create Molecule from InChI: " + inchi)
        m.SetTitle(inchi)
        return m

    @staticmethod
    def FromMol(mol):
        """
        Creates a Molecule from the text of a MOL file; the title is "".

        Raises:
            OpenBabelError: if the InChI or SMILES cannot be generated.
        """
        m = Molecule()
        obConversion = openbabel.OBConversion()
        obConversion.AddOption("w", obConversion.OUTOPTIONS)
        obConversion.SetInFormat("mol")
        obConversion.ReadString(m.obmol, mol)
        try:
            m.UpdateInChI()
            m.UpdateSmiles()
        except OpenBabelError:
            raise OpenBabelError("Failed to create Molecule from MOL file:\n" + mol)
        m.SetTitle("")
        return m

    @staticmethod
    def FromOBMol(obmol):
        """
        Wraps an existing OBMol (not copied); the title is "".

        Raises:
            OpenBabelError: if the InChI or SMILES cannot be generated.
        """
        m = Molecule()
        m.obmol = obmol
        try:
            m.UpdateInChI()
            m.UpdateSmiles()
        except OpenBabelError:
            raise OpenBabelError("Failed to create Molecule from OBMol")
        m.SetTitle("")
        return m

    @staticmethod
    def _FromFormat(s, fmt='inchi'):
        """
        Dispatches to the matching From* constructor for 'fmt'
        ('smiles'/'smi', 'inchi', 'mol' or 'obmol').

        Returns None for any other format name.
        """
        if fmt == 'smiles' or fmt == 'smi':
            return Molecule.FromSmiles(s)
        if fmt == 'inchi':
            return Molecule.FromInChI(s)
        if fmt == 'mol':
            return Molecule.FromMol(s)
        if fmt == 'obmol':
            return Molecule.FromOBMol(s)
        return None

    @staticmethod
    def _ToFormat(obmol, fmt='inchi'):
        """
        Writes 'obmol' as a string in the given output format.

        Returns:
            For 'smiles'/'smi' the first whitespace-separated token of the
            output, for 'inchi' the stripped output, otherwise the raw
            output.

        Raises:
            OpenBabelError: if the conversion yields an empty result.
        """
        obConversion = openbabel.OBConversion()
        obConversion.AddOption("w", obConversion.OUTOPTIONS)
        obConversion.SetOutFormat(fmt)
        res = obConversion.WriteString(obmol)
        if not res:
            raise OpenBabelError("Cannot convert OBMol to %s" % fmt)
        if fmt == 'smiles' or fmt == 'smi':
            res = res.split()
            if res == []:
                raise OpenBabelError("Cannot convert OBMol to %s" % fmt)
            else:
                return res[0]
        elif fmt == 'inchi':
            return res.strip()
        else:
            return res

    @staticmethod
    def Smiles2InChI(smiles):
        """
        Converts a SMILES string directly to a (stripped) InChI string.

        Raises:
            OpenBabelError: if the SMILES string cannot be read.
        """
        obConversion = openbabel.OBConversion()
        obConversion.AddOption("w", obConversion.OUTOPTIONS)
        obConversion.SetInAndOutFormats("smiles", "inchi")
        obmol = openbabel.OBMol()
        if not obConversion.ReadString(obmol, smiles):
            raise OpenBabelError("Cannot read the SMILES string: " + smiles)
        return obConversion.WriteString(obmol).strip()

    @staticmethod
    def InChI2Smiles(inchi):
        """
        Converts an InChI string directly to SMILES (first output token).

        Raises:
            OpenBabelError: if the InChI string cannot be read.
        """
        obConversion = openbabel.OBConversion()
        obConversion.AddOption("w", obConversion.OUTOPTIONS)
        obConversion.SetInAndOutFormats("inchi", "smiles")
        obmol = openbabel.OBMol()
        if not obConversion.ReadString(obmol, inchi):
            raise OpenBabelError("Cannot read the InChI string: " + inchi)
        return obConversion.WriteString(obmol).split()[0]

    def RemoveHydrogens(self):
        self.obmol.DeleteHydrogens()

    def RemoveAtoms(self, indices):
        """
        Deletes the atoms at the given indices and invalidates the cached
        SMILES / InChI strings.

        Indices are shifted by one before being passed to OBMol.GetAtom(),
        and are processed in descending order so that earlier deletions do
        not affect the positions of atoms still to be removed.
        """
        self.obmol.BeginModify()
        for i in sorted(indices, reverse=True):
            self.obmol.DeleteAtom(self.obmol.GetAtom(i+1))
        self.obmol.EndModify()
        self.smiles = None
        self.inchi = None

    def SetAtomicNum(self, index, new_atomic_num):
        """
        Changes the atomic number of the atom at 'index' (shifted by one
        for OBMol.GetAtom()) and invalidates the cached strings.
        """
        self.obmol.GetAtom(index+1).SetAtomicNum(new_atomic_num)
        self.smiles = None
        self.inchi = None

    def ToOBMol(self):
        return self.obmol

    def ToFormat(self, fmt='inchi'):
        return Molecule._ToFormat(self.obmol, fmt=fmt)

    def ToMolfile(self):
        return self.ToFormat('mol')

    def UpdateInChI(self):
        self.inchi = Molecule._ToFormat(self.obmol, 'inchi')

    def ToInChI(self):
        """
            Lazy storage of the InChI identifier (calculate once only when
            asked for and store for later use).
        """
        if not self.inchi:
            self.UpdateInChI()
        return self.inchi

    def UpdateSmiles(self):
        self.smiles = Molecule._ToFormat(self.obmol, 'smiles')

    def ToSmiles(self):
        """
            Lazy storage of the SMILES identifier (calculate once only when
            asked for and store for later use).
        """
        if not self.smiles:
            self.UpdateSmiles()
        return self.smiles

    def GetFormula(self):
        """
        Extracts the formula layer from the InChI string.

        Returns:
            The formula, or '' when the pattern does not match.

        Raises:
            ValueError: if the pattern matches more than once.
        """
        # raw string: '\.' is not a valid escape in a normal string literal
        tokens = re.findall(r'InChI=1S?/([0-9A-Za-z\.]+)', self.ToInChI())
        if len(tokens) == 1:
            return tokens[0]
        elif len(tokens) > 1:
            raise ValueError('Bad InChI: ' + self.ToInChI())
        else:
            return ''

    def GetExactMass(self):
        return self.obmol.GetExactMass()

    def GetAtomBagAndCharge(self):
        """
        Returns the (atom_bag, major_ms_charge) pair produced by
        chemaxon.GetAtomBagAndCharge() for this molecule's InChI.
        """
        inchi = self.ToInChI()
        atom_bag, major_ms_charge = chemaxon.GetAtomBagAndCharge(inchi)
        return atom_bag, major_ms_charge

    def GetHydrogensAndCharge(self):
        """Returns (number of 'H' in the atom bag, charge)."""
        atom_bag, charge = self.GetAtomBagAndCharge()
        return atom_bag.get('H', 0), charge

    def GetNumElectrons(self):
        """Calculates the number of electrons in a given molecule."""
        atom_bag, _ = self.GetAtomBagAndCharge()
        return atom_bag.get('e-', 0)

    def GetNumAtoms(self):
        return self.obmol.NumAtoms()

    def GetAtoms(self):
        # OBMol.GetAtom() is called with indices 1 .. NumAtoms()
        return [self.obmol.GetAtom(i+1) for i in range(self.obmol.NumAtoms())]

    def FindSmarts(self, smarts):
        """
        Searches this molecule for a SMARTS pattern and shifts every
        reported atom index down by one.

        Args:
            smarts: the SMARTS query to search for.

        Returns:
            A list of matches, each a list of shifted atom indices, or
            an empty list when nothing matches.
        """
        Molecule._obSmarts.Init(smarts)
        if Molecule._obSmarts.Match(self.obmol):
            match_list = Molecule._obSmarts.GetMapList()
            return [[(n - 1) for n in match] for match in match_list]
        else:
            return []

    def GetAtomCharges(self):
        """
            Returns:
                A list of charges, according to the number of atoms
                in the molecule
        """
        return [atom.GetFormalCharge() for atom in self.GetAtoms()]
|
| 287 |
+
|
| 288 |
+
if __name__ == '__main__':
    # Manual check: parse one InChI and print its exact mass.
    # Alternative input for a quick test: 'InChI=1S/H2/h1H'
    example_inchi = 'InChI=1/C5H10O2/c1-3-5(6)7-4-2/h3-4H2,1-2H3'
    mol = Molecule.FromInChI(example_inchi)
    print(mol.GetExactMass())
|
CC/thermodynamic_constants.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Physical constants and default conditions
R = 8.31e-3  # kJ/(K*mol)
F = 96.485  # kC/mol
J_per_cal = 4.184
default_T = 298.15  # K
default_I = 0.25  # M
default_pH = 7.0
default_c0 = 1  # M
default_pMg = 10
default_RT = R * default_T
default_c_mid = 1e-3  # M
default_c_range = (1e-6, 1e-2)  # M
dG0_f_Mg = -455.3  # kJ/mol, formation energy of Mg2+

# Plain-text symbols
symbol_d_G = "ΔG"
symbol_d_G0 = "ΔG°"
symbol_d_G_prime = "ΔG'"
symbol_d_G0_prime = "ΔG'°"

# HTML symbols for reaction energies
symbol_dr_G = "Δ<sub>r</sub>G"
symbol_dr_G0 = "Δ<sub>r</sub>G°"
symbol_dr_G_prime = "Δ<sub>r</sub>G'"
symbol_dr_G0_prime = "Δ<sub>r</sub>G'°"
symbol_dr_Gc_prime = "Δ<sub>r</sub>G'<sup>c</sup>"

# HTML symbols for formation energies
symbol_df_G = "Δ<sub>f</sub>G"
symbol_df_G0 = "Δ<sub>f</sub>G°"
symbol_df_G_prime = "Δ<sub>f</sub>G'"
symbol_df_G0_prime = "Δ<sub>f</sub>G'°"


def DH_alpha(T):
    """Approximation of the temperature dependency of ionic strength effects."""
    return 1e-3*(9.20483*T) - 1e-5*(1.284668 * T**2) + 1e-8*(4.95199 * T**3)


DH_beta = 1.6


def debye_huckel(I_T):
    """
    Debye-Huckel term for an (ionic strength, temperature) pair.

    Args:
        I_T: indexable pair; I_T[0] is the ionic strength and I_T[1]
            the temperature.
    """
    alpha = DH_alpha(I_T[1])
    sqrt_I = I_T[0]**(0.5)
    return alpha * sqrt_I / (1.0 + DH_beta * sqrt_I)
|
| 36 |
+
|