mao jiashun committed
Commit · 295ff14
Parent(s): 1286756

Upload 58 files

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +3 -0
- iupac-gpt/.gitignore +5 -0
- iupac-gpt/LICENSE +32 -0
- iupac-gpt/README.md +62 -0
- iupac-gpt/checkpoints/iupac/config.json +33 -0
- iupac-gpt/checkpoints/iupac/pytorch_model.bin +3 -0
- iupac-gpt/class.txt +3 -0
- iupac-gpt/data/bbbp.csv +0 -0
- iupac-gpt/data/iupacs_logp.csv +0 -0
- iupac-gpt/environment.yml +19 -0
- iupac-gpt/iupac.txt +3 -0
- iupac-gpt/iupacGPT2-gen50K.csv +0 -0
- iupac-gpt/iupac_gpt/__init__.py +21 -0
- iupac-gpt/iupac_gpt/__pycache__/__init__.cpython-37.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/__init__.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/classification.cpython-37.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/classification.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/data.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/iupac_dataset.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/iupac_dataset_class.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/iupac_dataset_pro.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_class.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_iupac.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_pro.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/language_modeling.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/__pycache__/tokenization.cpython-38.pyc +0 -0
- iupac-gpt/iupac_gpt/classification.py +362 -0
- iupac-gpt/iupac_gpt/data.py +269 -0
- iupac-gpt/iupac_gpt/iupac_dataset.py +121 -0
- iupac-gpt/iupac_gpt/iupac_dataset_class.py +128 -0
- iupac-gpt/iupac_gpt/iupac_dataset_pro.py +124 -0
- iupac-gpt/iupac_gpt/iupac_spm.model +3 -0
- iupac-gpt/iupac_gpt/iupac_spm.vocab +1391 -0
- iupac-gpt/iupac_gpt/iupac_tokenization.py +131 -0
- iupac-gpt/iupac_gpt/iupac_tokenization_class.py +131 -0
- iupac-gpt/iupac_gpt/iupac_tokenization_iupac.py +134 -0
- iupac-gpt/iupac_gpt/iupac_tokenization_pro.py +131 -0
- iupac-gpt/iupac_gpt/iupacs_logp.csv +0 -0
- iupac-gpt/iupac_gpt/language_modeling.py +68 -0
- iupac-gpt/iupac_gpt/pubchem_iupac_smile_gpt.csv +3 -0
- iupac-gpt/iupac_gpt/real_iupac_tokenizer.pt +3 -0
- iupac-gpt/iupac_gpt/tokenization.py +193 -0
- iupac-gpt/nohup.out +0 -0
- iupac-gpt/notebooks/.ipynb_checkpoints/language-modeling-checkpoint.ipynb +0 -0
- iupac-gpt/notebooks/iupac_head_view.html +0 -0
- iupac-gpt/notebooks/iupac_language-modeling.py +236 -0
- iupac-gpt/notebooks/iupac_language-modeling_retrain.py +224 -0
- iupac-gpt/notebooks/iupac_language-modeling_train.ipynb +0 -0
- iupac-gpt/notebooks/iupac_language-modeling_train.py +231 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+iupac-gpt/class.txt filter=lfs diff=lfs merge=lfs -text
+iupac-gpt/iupac_gpt/pubchem_iupac_smile_gpt.csv filter=lfs diff=lfs merge=lfs -text
+iupac-gpt/iupac.txt filter=lfs diff=lfs merge=lfs -text
iupac-gpt/.gitignore
ADDED
@@ -0,0 +1,5 @@
+**/__pycache__/*
+**/.idea/*
+**/.ipynb_checkpoints/*
+**/lightning_logs/*
+*.log
iupac-gpt/LICENSE
ADDED
@@ -0,0 +1,32 @@
+The Clear BSD License
+
+Copyright (c) 2021 Sanjar Adilov
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted (subject to the limitations in the disclaimer
+below) provided that the following conditions are met:
+
+     * Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+
+     * Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+
+     * Neither the name of the copyright holder nor the names of its
+     contributors may be used to endorse or promote products derived from this
+     software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
iupac-gpt/README.md
ADDED
@@ -0,0 +1,62 @@
+# Generative Pre-Training from Molecules
+
+Autoregressive transformer language model for drug discovery. (Pre)trained on a large
+SMILES corpus. Evaluated on molecular property prediction and low-data de novo design
+tasks.
+
+
+## Installation
+
+Set up [conda](https://conda.io/en/latest/index.html) and create a new environment from
+`environment.yml` (if needed, make corresponding edits for GPU compatibility).
+```shell
+conda env create -f environment.yml
+conda activate smiles-gpt
+git clone https://github.com/sanjaradylov/smiles-gpt.git
+cd smiles-gpt
+```
+
+
+## Benchmark
+
+### Checkpoint
+[checkpoints/benchmark-5m](https://github.com/sanjaradylov/smiles-gpt/tree/master/checkpoints/benchmark-5m)
+stores serialized model, tokenizer, and configuration. Do not modify them. Use
+the `from_pretrained` method to load HuggingFace objects, e.g.,
+```python
+from transformers import GPT2Config, GPT2LMHeadModel, PreTrainedTokenizerFast
+
+checkpoint = "checkpoints/benchmark-5m"
+
+config = GPT2Config.from_pretrained(checkpoint)
+model = GPT2LMHeadModel.from_pretrained(checkpoint)
+tokenizer = PreTrainedTokenizerFast.from_pretrained(checkpoint)
+```
+
+### Data
+[data](https://github.com/sanjaradylov/smiles-gpt/tree/master/data) stores the
+[Blood-Brain Barrier Penetration](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv)
+classification dataset and a 10K subset of ChemBERTa's
+[PubChem-10M](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/pubchem_10m.txt.zip).
+See [Examples](#Examples).
+
+### Output
+
+[output](https://github.com/sanjaradylov/smiles-gpt/tree/master/output) stores generated
+SMILES strings.
+
+## Examples
+
+Adapter training for molecular property prediction
+(replace the `data/bbbp.csv` and `p_np` arguments with your dataset and task name(s),
+respectively):
+```shell
+python3 scripts/classification.py checkpoints/benchmark-5m data/bbbp.csv p_np
+```
+For language model pretraining, see
+[notebooks](https://github.com/sanjaradylov/smiles-gpt/tree/master/notebooks).
+
+## Citation
+
+If you use `smiles-gpt` in your research, please consider citing
+> https://doi.org/10.33774/chemrxiv-2021-5fwjd
iupac-gpt/checkpoints/iupac/config.json
ADDED
@@ -0,0 +1,33 @@
+{
+  "activation_function": "gelu_new",
+  "adapters": {
+    "adapters": {},
+    "config_map": {}
+  },
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 2,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 1,
+  "gradient_checkpointing": false,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1280,
+  "n_embd": 256,
+  "n_head": 8,
+  "n_inner": null,
+  "n_layer": 8,
+  "n_positions": 1280,
+  "resid_pdrop": 0.1,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "transformers_version": "2.0.1",
+  "use_cache": true,
+  "vocab_size": 1491
+}
iupac-gpt/checkpoints/iupac/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0aea87ea0c2a89b9a15bfd4682615df1d64f37c25748684d117ecc933153950f
+size 41264861
iupac-gpt/class.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83bbfd23faf47bacfdec9db23eed87b0d20488da7d2c84d838d0d77e7f2c58d5
+size 12317646
iupac-gpt/data/bbbp.csv
ADDED
The diff for this file is too large to render. See raw diff.
iupac-gpt/data/iupacs_logp.csv
ADDED
The diff for this file is too large to render. See raw diff.
iupac-gpt/environment.yml
ADDED
@@ -0,0 +1,19 @@
+name: smiles-gpt
+channels:
+  - pytorch
+  - anaconda
+  - conda-forge
+dependencies:
+  - python=3.8
+  - pip
+  - pandas
+  - rdkit
+  - pytorch
+  - torchvision
+  - torchaudio
+  - cpuonly
+  - pip:
+    - tokenizers
+    - adapter-transformers
+    - pytorch-lightning
+    - bertviz
iupac-gpt/iupac.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bead0044a324634255bf5675f623fbd3a0b6babb51da7ca63870b6bf87f800a
+size 156486208
iupac-gpt/iupacGPT2-gen50K.csv
ADDED
The diff for this file is too large to render. See raw diff.
iupac-gpt/iupac_gpt/__init__.py
ADDED
@@ -0,0 +1,21 @@
+"""`smiles_gpt` implements transformer models for molecule generation and molecular-
+property prediction.
+"""
+
+__author__ = "Sanjar Ad[iy]lov"
+__version__ = "1.0.0-pub"
+
+from . import classification, data, language_modeling, tokenization
+from .classification import (ClassifierLitModel, RegressorLitModel,
+                             GPT2ForSequenceClassification)
+from .data import CSVDataModule, CVSplitter, LMDataModule
+from .language_modeling import GPT2LitModel
+from .tokenization import SMILESBPETokenizer, SMILESAlphabet
+from .iupac_tokenization_iupac import get_data_loader, prepare_input
+from .iupac_tokenization_pro import get_data_loader_pro, prepare_input_pro
+from .iupac_tokenization_class import get_data_loader_class, prepare_input_class
+
+__all__ = ("classification", "data", "tokenization",
+           "ClassifierLitModel", "CSVDataModule", "CVSplitter",
+           "GPT2ForSequenceClassification", "GPT2LitModel", "LMDataModule",
+           "RegressorLitModel", "SMILESBPETokenizer", "SMILESAlphabet")
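
As a quick orientation, the public surface re-exported above can be exercised as follows. This is a sketch assuming `iupac-gpt` is on `PYTHONPATH`; the names come straight from the `__all__` tuple and the extra `get_data_loader*` imports.

```python
# Sketch of the API exposed by iupac_gpt/__init__.py; nothing here adds new code,
# it only names what the package re-exports.
import iupac_gpt

print(iupac_gpt.__version__)                                      # "1.0.0-pub"
ClassifierLitModel = iupac_gpt.ClassifierLitModel                 # lightning classifier wrapper
GPT2ForSequenceClassification = iupac_gpt.GPT2ForSequenceClassification
CSVDataModule = iupac_gpt.CSVDataModule                           # tabular SMILES data module
get_data_loader = iupac_gpt.get_data_loader                       # IUPAC data-loading helper
```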
iupac-gpt/iupac_gpt/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (1.07 kB)

iupac-gpt/iupac_gpt/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (1.08 kB)

iupac-gpt/iupac_gpt/__pycache__/classification.cpython-37.pyc
ADDED
Binary file (13.3 kB)

iupac-gpt/iupac_gpt/__pycache__/classification.cpython-38.pyc
ADDED
Binary file (13.3 kB)

iupac-gpt/iupac_gpt/__pycache__/data.cpython-38.pyc
ADDED
Binary file (11.1 kB)

iupac-gpt/iupac_gpt/__pycache__/iupac_dataset.cpython-38.pyc
ADDED
Binary file (3.09 kB)

iupac-gpt/iupac_gpt/__pycache__/iupac_dataset_class.cpython-38.pyc
ADDED
Binary file (3.22 kB)

iupac-gpt/iupac_gpt/__pycache__/iupac_dataset_pro.cpython-38.pyc
ADDED
Binary file (3.2 kB)

iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization.cpython-38.pyc
ADDED
Binary file (5.1 kB)

iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_class.cpython-38.pyc
ADDED
Binary file (5.08 kB)

iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_iupac.cpython-38.pyc
ADDED
Binary file (5.09 kB)

iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_pro.cpython-38.pyc
ADDED
Binary file (5.11 kB)

iupac-gpt/iupac_gpt/__pycache__/language_modeling.cpython-38.pyc
ADDED
Binary file (3.37 kB)

iupac-gpt/iupac_gpt/__pycache__/tokenization.cpython-38.pyc
ADDED
Binary file (7.44 kB)
iupac-gpt/iupac_gpt/classification.py
ADDED
@@ -0,0 +1,362 @@
+"""HuggingFace-compatible classification and regression models including
+pytorch-lightning models.
+"""
+
+__all__ = ("BypassNet", "ClassificationHead", "ClassifierLitModel",
+           "GPT2ForSequenceClassification", "RegressorLitModel",
+           "SequenceClassifierOutput")
+
+from dataclasses import dataclass
+from typing import List, Optional
+
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchmetrics import AUROC, AveragePrecision
+from transformers import AdamW, GPT2Model, GPT2PreTrainedModel
+from transformers.modeling_outputs import SequenceClassifierOutputWithPast
+from transformers.adapters.model_mixin import ModelWithHeadsAdaptersMixin
+
+
+@dataclass
+class SequenceClassifierOutput(SequenceClassifierOutputWithPast):
+    target: Optional[torch.LongTensor] = None
+
+
+class GPT2ForSequenceClassification(ModelWithHeadsAdaptersMixin, GPT2PreTrainedModel):
+    """HuggingFace-compatible single- and multi-output (-task) classification model.
+    `config` must be a `GPT2Config` instance with additional `num_tasks` and `num_labels`
+    properties. For multi-task classification, the output is a Bypass network with the
+    reduction factor = `config.n_embd // config.n_head`.
+    """
+
+    _keys_to_ignore_on_load_missing = [
+        r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight", r"output\..*"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.num_tasks = config.num_tasks
+        self.num_labels = config.num_labels
+
+        self.transformer = GPT2Model(config)
+
+        if self.num_tasks > 1:
+            self.output = BypassNet(
+                config.n_embd, config.n_embd // config.n_head,
+                config.num_tasks, config.num_labels,
+                config.embd_pdrop)
+        else:
+            self.output = ClassificationHead(
+                config.n_embd, config.n_embd // config.n_head,
+                config.num_labels, config.embd_pdrop)
+
+        self.init_weights()
+
+    def forward(self, input_ids=None, past_key_values=None, attention_mask=None,
+                token_type_ids=None, position_ids=None, head_mask=None,
+                inputs_embeds=None, labels=None, use_cache=None, output_attentions=None,
+                output_hidden_states=None, return_dict=None, adapter_names=None,
+                label_mask=None):
+        return_dict = return_dict or self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids, past_key_values=past_key_values, attention_mask=attention_mask,
+            token_type_ids=token_type_ids, position_ids=position_ids,
+            head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states, return_dict=return_dict,
+            adapter_names=adapter_names)
+
+        hidden_states = transformer_outputs[0]
+
+        if input_ids is not None:
+            batch_size, sequence_length = input_ids.shape[:2]
+        else:
+            batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        assert self.config.pad_token_id is not None or batch_size == 1, \
+            "Cannot handle batch sizes > 1 if no padding token is defined."
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                sequence_lengths = torch.ne(
+                    input_ids, self.config.pad_token_id).sum(-1) - 1
+            else:
+                sequence_lengths = -1
+
+        if self.num_tasks == 1:
+            logits = self.output(hidden_states)[range(batch_size), sequence_lengths]
+        else:
+            logits = self.output(hidden_states, batch_size, sequence_lengths)
+
+        loss = None
+        if labels is not None:
+            if self.num_labels == 2:
+                if label_mask is not None:
+                    nonempty_tasks = (label_mask == 1).view(-1)
+                    nonempty_logits = logits.view(-1, self.num_labels)[nonempty_tasks, :]
+                    nonempty_labels = labels.view(-1)[nonempty_tasks]
+                else:
+                    nonempty_logits = logits.view(-1, self.num_labels)
+                    nonempty_labels = labels.view(-1)
+
+                if len(labels.size()) == 1:
+                    labels = labels.reshape(1, -1)
+
+                loss = F.cross_entropy(nonempty_logits, nonempty_labels)
+            elif self.num_labels == 1:
+                loss = F.mse_loss(logits.view(-1), labels.view(-1))
+            else:
+                raise NotImplementedError(
+                    "Only binary classification and regression supported.")
+
+        if self.num_tasks > 1:
+            logits = logits.transpose(1, 2)
+
+        if labels is not None and self.num_labels == 2 and self.num_tasks == 1:
+            if label_mask is not None:
+                labels = labels.view(-1)
+            else:
+                labels = nonempty_labels
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss, logits=logits, target=labels,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions)
+
+
+class BypassNet(nn.Module):
+    """Bypass multi-task network from MoleculeNet project [Wu et al., 2018].
+    """
+
+    def __init__(self, hidden_size: int, intermediate_size: int,
+                 num_tasks: int, num_labels: int = 2,
+                 dropout: float = 0.2, use_bias: bool = False):
+        super().__init__()
+        self.independent = nn.ModuleList([
+            ClassificationHead(hidden_size, intermediate_size,
+                               num_labels, dropout, use_bias)
+            for _ in range(num_tasks)])
+        self.shared = ClassificationHead(hidden_size, intermediate_size,
+                                         num_labels, dropout, use_bias)
+
+    def forward(self, hidden_states, batch_size, sequence_lengths):
+        logits_list: List[torch.Tensor] = []
+        for layer in self.independent:
+            logits_list.append(layer(hidden_states))
+        shared_logits: torch.Tensor = self.shared(hidden_states)
+        for i in range(len(logits_list)):
+            logits_list[i] = (logits_list[i] + shared_logits)[range(batch_size),
+                                                              sequence_lengths]
+        return torch.stack(logits_list, dim=1)
+
+
+class ClassificationHead(nn.Module):
+    """Two-layer feed-forward network with GELU activation and intermediate dropout.
+    """
+
+    def __init__(self, hidden_size: int, intermediate_size: int,
+                 num_labels: int, dropout: float = 0.0, use_bias: bool = False):
+        super().__init__()
+        self.dense = nn.Linear(hidden_size, intermediate_size, bias=use_bias)
+        self.act = nn.GELU()
+        self.dropout = nn.Dropout(dropout)
+        self.out_proj = nn.Linear(intermediate_size, num_labels, bias=use_bias)
+
+    def forward(self, x, *args, **kwargs):
+        x = self.dense(x)
+        x = self.act(x)
+        x = self.dropout(x)
+        return self.out_proj(x)
+
+
+class ClassifierLitModel(pl.LightningModule):
+    """Pytorch-lightning module for single- or multi-task classification. Trains a GPT2
+    model using the `AdamW` optimizer with an exponential LR scheduler. Evaluates valid
+    and test data on AUC-ROC and AUC-PRC.
+
+    Args:
+        transformer (`GPT2Model`): (Pretrained) HuggingFace GPT2 model.
+        num_tasks (int): The number of classification tasks.
+        has_empty_labels (bool)
+        batch_size (int)
+        learning_rate (float)
+        scheduler_lambda (float)
+        scheduler_step (int)
+        weight_decay (float)
+    """
+
+    def __init__(self, transformer: GPT2Model, num_tasks: int, has_empty_labels: bool,
+                 batch_size: int, learning_rate: float, scheduler_lambda: float,
+                 scheduler_step: int, weight_decay: float, *args, **kwargs):
+        super().__init__()
+
+        self.save_hyperparameters(ignore=("transformer", "num_tasks", "has_empty_labels"))
+        self.transformer = transformer
+        self.num_tasks = num_tasks
+
+        def get_metrics(metric_cls):
+            return [metric_cls(num_classes=2) for _ in range(num_tasks)]
+
+        if has_empty_labels:
+            self.train_roc = get_metrics(AUROC)
+            self.val_roc = get_metrics(AUROC)
+            self.test_roc = get_metrics(AUROC)
+
+            self.train_prc = get_metrics(AveragePrecision)
+            self.val_prc = get_metrics(AveragePrecision)
+            self.test_prc = get_metrics(AveragePrecision)
+
+            self.step = self._step_empty
+            self.epoch_end = self._epoch_end_empty
+        else:
+            #self.train_roc = AUROC(num_classes=2)
+            #self.val_roc = AUROC(num_classes=2)
+            #self.test_roc = AUROC(num_classes=2)
+
+            #self.train_prc = AveragePrecision(num_classes=2)
+            #self.val_prc = AveragePrecision(num_classes=2)
+            #self.test_prc = AveragePrecision(num_classes=2)
+
+            self.train_roc = AUROC(task='multiclass', num_classes=2)
+            self.val_roc = AUROC(task='multiclass', num_classes=2)
+            self.test_roc = AUROC(task='multiclass', num_classes=2)
+
+            self.train_prc = AveragePrecision(task='multiclass', num_classes=2)
+            self.val_prc = AveragePrecision(task='multiclass', num_classes=2)
+            self.test_prc = AveragePrecision(task='multiclass', num_classes=2)
+
+            self.step = self._step_nonempty
+            self.epoch_end = self._epoch_end_nonempty
+
+    def forward(self, *args, **kwargs):
+        return self.transformer(*args, **kwargs)
+
+    def _step_empty(self, batch, batch_idx, roc, prc):
+        outputs = self(**batch)
+
+        if self.num_tasks == 1:
+            outputs["target"] = outputs["target"][:, None]
+            outputs["logits"] = outputs["logits"][:, :, None]
+
+        for task_id in range(self.num_tasks):
+            target = outputs["target"][:, task_id]
+            nonempty_entries = target != -1
+            target = target[nonempty_entries]
+
+            if target.unique().size(0) > 1:
+                logits = outputs["logits"][:, :, task_id][nonempty_entries]
+
+                roc[task_id](logits, target)
+                prc[task_id](logits, target)
+
+        return {"loss": outputs["loss"]}
+
+    def _step_nonempty(self, batch, batch_idx, roc, prc):
+        outputs = self(**batch)
+
+        logits, target = outputs["logits"], outputs["target"]
+        if target.unique().size(0) > 1:
+            roc(logits, target)
+            prc(logits, target)
+
+        return {"loss": outputs["loss"]}
+
+    def _epoch_end_empty(self, outputs_ignored, roc, prc, prefix):
+        mean_roc = sum(a.compute() for a in roc) / self.num_tasks
+        self.log(f"{prefix}_roc", mean_roc, on_step=False, on_epoch=True, prog_bar=True)
+        mean_prc = sum(p.compute() for p in prc) / self.num_tasks  #p.compute()[1]
+        self.log(f"{prefix}_prc", mean_prc, on_step=False, on_epoch=True, prog_bar=True)
+
+    def _epoch_end_nonempty(self, outputs, roc, prc, prefix):
+        self.log(f"{prefix}_roc", roc.compute(),
+                 on_step=False, on_epoch=True, prog_bar=True)
+        self.log(f"{prefix}_prc", prc.compute(),  #prc.compute()[1]
+                 on_step=False, on_epoch=True, prog_bar=True)
+
+    def training_step(self, batch, batch_idx):
+        return self.step(batch, batch_idx, self.train_roc, self.train_prc)
+
+    def training_epoch_end(self, outputs):
+        self.epoch_end(outputs, self.train_roc, self.train_prc, "train")
+
+    def validation_step(self, batch, batch_idx):
+        return self.step(batch, batch_idx, self.val_roc, self.val_prc)
+
+    def validation_epoch_end(self, outputs):
+        self.epoch_end(outputs, self.val_roc, self.val_prc, "val")
+
+    def test_step(self, batch, batch_idx):
+        self.step(batch, batch_idx, self.test_roc, self.test_prc)
+
+    def test_epoch_end(self, outputs):
+        self.epoch_end(outputs, self.test_roc, self.test_prc, "test")
+
+    def configure_optimizers(self):
+        optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate,
+                          weight_decay=self.hparams.weight_decay)
+        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
+            optimizer, self.hparams.scheduler_lambda)
+        return {"optimizer": optimizer,
+                "lr_scheduler": {"scheduler": lr_scheduler,
+                                 "interval": "step",
+                                 "frequency": self.hparams.scheduler_step}}
+
+
+class RegressorLitModel(pl.LightningModule):
+    def __init__(self, transformer: GPT2Model,
+                 batch_size: int, learning_rate: float, scheduler_lambda: float,
+                 scheduler_step: int, weight_decay: float, *args, **kwargs):
+        super().__init__()
+
+        self.save_hyperparameters(ignore="transformer")
+        self.transformer = transformer
+
+    def forward(self, *args, **kwargs):
+        return self.transformer(*args, **kwargs)
+        hidden_states = transformer_outputs[0]
+
+    def step(self, batch, batch_idx):
+        outputs = self(**batch)
+        rmse_loss = torch.sqrt(outputs["loss"])
+        return {"loss": rmse_loss}
+
+    def epoch_end(self, outputs, prefix):
+        mean_rmse = torch.mean(torch.tensor([out["loss"] for out in outputs]))
+        self.log(f"{prefix}_rmse", mean_rmse, on_step=False, on_epoch=True, prog_bar=True)
+
+    def training_step(self, batch, batch_idx):
+        return self.step(batch, batch_idx)
+
+    def training_epoch_end(self, outputs):
+        self.epoch_end(outputs, "train")
+
+    def validation_step(self, batch, batch_idx):
+        return self.step(batch, batch_idx)
+
+    def validation_epoch_end(self, outputs):
+        self.epoch_end(outputs, "val")
+
+    def test_step(self, batch, batch_idx):
+        return self.step(batch, batch_idx)
+
+    def test_epoch_end(self, outputs):
+        self.epoch_end(outputs, "test")
+
+    def configure_optimizers(self):
+        optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate,
+                          weight_decay=self.hparams.weight_decay)
+        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
+            optimizer, self.hparams.scheduler_lambda)
+        return {"optimizer": optimizer,
+                "lr_scheduler": {"scheduler": lr_scheduler,
+                                 "interval": "step",
+                                 "frequency": self.hparams.scheduler_step}}
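
To make the relationship between `GPT2ForSequenceClassification` and `ClassifierLitModel` concrete, here is a minimal wiring sketch. The hyperparameter values and the `pad_token_id` are illustrative assumptions, not values taken from this commit:

```python
# Hypothetical single-task setup using the classes defined in classification.py above.
from transformers import GPT2Config
from iupac_gpt.classification import GPT2ForSequenceClassification, ClassifierLitModel

config = GPT2Config.from_pretrained("iupac-gpt/checkpoints/iupac")
config.num_tasks = 1       # single binary-classification task -> ClassificationHead output
config.num_labels = 2
config.pad_token_id = 0    # assumed; forward() requires a pad token for batch sizes > 1

transformer = GPT2ForSequenceClassification(config)
lit_model = ClassifierLitModel(
    transformer, num_tasks=1, has_empty_labels=False,
    batch_size=16, learning_rate=5e-5, scheduler_lambda=0.99,
    scheduler_step=100, weight_decay=0.01)
# lit_model can then be handed to a pytorch_lightning.Trainer together with a datamodule.
```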
iupac-gpt/iupac_gpt/data.py
ADDED
@@ -0,0 +1,269 @@
+"""Loads torch-compatible data sets and lightning-compatible data modules.
+"""
+
+__all__ = ("CSVDataset", "CSVDataModule", "CVSplitter", "LMDataset", "LMDataModule")
+
+from collections import defaultdict
+from dataclasses import dataclass
+from functools import partial
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Union
+
+import torch
+from pytorch_lightning import LightningDataModule
+from sklearn.model_selection import ShuffleSplit
+from tokenizers.implementations import BaseTokenizer
+from transformers import PreTrainedTokenizerFast
+from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
+from torch.utils.data import Dataset, DataLoader
+
+
+@dataclass(init=True, repr=True, eq=False, frozen=False)
+class CSVDataset(Dataset):
+    """Stores `pandas.DataFrame` instance of tabular data and retrieves encoded token
+    ids and attention mask. Optionally returns labels and their masks.
+
+    Args:
+        dataframe (`pandas.DataFrame`):
+            Data frame of SMILES strings and their (multi-task) labels.
+        tokenizer (`tokenizers.BaseTokenizer` or `SMILESBPETokenizer`):
+            SMILES tokenizer.
+        smiles_column (`str`, defaults to "smiles"):
+            Column name of SMILES strings in `dataframe`.
+        target_column (`str` or `list` of `str`, defaults to `None`):
+            Target column(s). If `None`, labels are ignored.
+        has_empty_target (`bool`, defaults to `False`):
+            Whether entries have empty target values. If `True`, additionally retrieves
+            a target mask.
+        task_type ("classification" or "regression", defaults to "classification")
+        encode_kwargs (dict, defaults to {"truncation": True}):
+            Keyword arguments for `tokenizer` encoding, e.g. {"padding": True}.
+    """
+
+    dataframe: "pandas.DataFrame"
+    tokenizer: BaseTokenizer
+    smiles_column: str = 'smiles'
+    target_column: Union[None, str, List[str]] = None
+    has_empty_target: bool = False
+    task_type: Literal["classification", "regression"] = "classification"
+    encode_kwargs: Optional[Dict[str, Any]] = None
+
+    def __post_init__(self) -> None:
+        if isinstance(self.tokenizer, PreTrainedTokenizerFast):
+            self._encode = partial(self.tokenizer.__call__, add_special_tokens=False)
+            self._id_key = "input_ids"
+        else:
+            self._encode = self.tokenizer.encode
+            self._id_key = "ids"
+        self.encode_kwargs = self.encode_kwargs or {"truncation": True}
+        self._encode = partial(self._encode, **self.encode_kwargs)
+
+    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
+        """Returns dict of encoded token IDs, attention mask, and optionally labels
+        and label mask.
+        """
+        item: Dict[str, torch.Tensor] = {}
+
+        smiles = self.dataframe.iloc[index][self.smiles_column]
+        encodings = self._encode(smiles)
+        item["input_ids"] = torch.LongTensor(getattr(encodings, self._id_key))
+        item["attention_mask"] = torch.LongTensor(getattr(encodings, "attention_mask"))
+
+        if self.target_column is not None:
+            labels = self.dataframe.iloc[index][self.target_column]
+            if self.has_empty_target:
+                label_mask = ~labels.isna()
+                labels = labels.fillna(-1)
+                item["label_mask"] = torch.BoolTensor(label_mask)
+            if self.task_type == "regression":
+                tensor_type = torch.FloatTensor
+            elif self.task_type == "classification":
+                tensor_type = torch.LongTensor
+            else:
+                raise NotImplementedError("`CSVDataset` supports only classification and "
+                                          "regression tasks")
+            item["labels"] = tensor_type(labels)
+
+        return item
+
+    def __len__(self) -> int:
+        return self.dataframe.shape[0]
+
+
+@dataclass(init=True, eq=True, repr=True, frozen=False)
+class CVSplitter:
+    """Splits series of SMILES data with either random or scaffold splitting.
+    """
+
+    mode: str = "random"
+    train_size: float = 0.8
+    val_size: float = 0.1
+    test_size: float = 0.1
+
+    def __post_init__(self) -> None:
+        if self.mode == "scaffold":
+            self.train_val_test_split = self.scaffold_split
+        elif self.mode == "random":
+            self.train_val_test_split = self.random_split
+
+    @staticmethod
+    def get_sorted_scaffolds(smiles_seqs: Sequence[str]):
+        from rdkit.Chem import MolFromSmiles
+        from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
+
+        scaffolds: Dict[str, List[int]] = defaultdict(list)
+        molecules = (MolFromSmiles(s, sanitize=True) for s in smiles_seqs)
+
+        for i, molecule in enumerate(molecules):
+            try:
+                scaffold = MurckoScaffoldSmiles(mol=molecule, includeChirality=False)
+                scaffolds[scaffold].append(i)
+            except Exception:  # Really don't know what exception is raised...
+                pass
+
+        scaffolds = {scaffold: sorted(ids) for scaffold, ids in scaffolds.items()}
+        scaffold_sets = [scaffold_set
+                         for scaffold, scaffold_set in
+                         sorted(scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]),
+                                reverse=True)]
+        return scaffold_sets
+
+    def scaffold_split(self, smiles_seqs: Sequence[str]) \
+            -> Tuple[List[int], List[int], List[int]]:
+        scaffold_sets = self.get_sorted_scaffolds(smiles_seqs)
+
+        n_samples = len(smiles_seqs)
+        train_idx, val_idx, test_idx = [], [], []
+        train_cutoff = int(self.train_size * n_samples)
+        val_cutoff = int((self.train_size + self.val_size) * n_samples)
+
+        for group_indices in scaffold_sets:
+            n_group = len(group_indices)
+            n_train = len(train_idx)
+            if n_train + n_group > train_cutoff:
+                n_val = len(val_idx)
+                if n_train + n_val + n_group > val_cutoff:
+                    test_idx.extend(group_indices)
+                else:
+                    val_idx.extend(group_indices)
+            else:
+                train_idx.extend(group_indices)
+
+        return train_idx, val_idx, test_idx
+
+    def random_split(self, smiles_seqs: "pandas.Series") \
+            -> Tuple["numpy.array", "numpy.array", "numpy.array"]:
+        cv = ShuffleSplit(train_size=self.train_size + self.val_size)
+        train_idx, val_idx = next(cv.split(smiles_seqs))
+        cv.train_size = 1 - self.test_size / (self.train_size + self.val_size)
+        train_idx, test_idx = next(cv.split(smiles_seqs.iloc[train_idx]))
+
+        return train_idx, val_idx, test_idx
+
+
+@dataclass(init=True, repr=True, eq=False, frozen=False)
+class CSVDataModule(LightningDataModule):
+    """Lightning data module for tabular data. Accepts pandas `dataframe`, splits the
+    data into train/valid/test with `splitter`, creates `CSVDataset`s and Pytorch
+    `DataLoader`s with `DataCollatorWithPadding` collate function.
+    """
+
+    dataframe: "pandas.DataFrame"
+    tokenizer: BaseTokenizer
+    smiles_column: str = "smiles"
+    target_column: Union[None, str, List[str]] = None
+    has_empty_target: bool = False
+    task_type: Literal["classification", "regression"] = "classification"
+    splitter: CVSplitter = CVSplitter()
+    batch_size: int = 16
+    num_workers: int = 0
+
+    def __post_init__(self) -> None:
+        super().__init__()
+        self.train_dataset: Optional[CSVDataset] = None
+        self.val_dataset: Optional[CSVDataset] = None
+        self.test_dataset: Optional[CSVDataset] = None
+        self.collate_fn: Callable = DataCollatorWithPadding(self.tokenizer)
+
+    def setup(self, stage: Optional[str] = None) -> None:
+        train_idx, val_idx, test_idx = self.splitter.train_val_test_split(
+            self.dataframe[self.smiles_column])
+
+        train_dataframe = self.dataframe.iloc[train_idx].reset_index(drop=True)
+        self.train_dataset = CSVDataset(train_dataframe, self.tokenizer,
+                                        self.smiles_column, self.target_column,
+                                        self.has_empty_target, self.task_type)
+        valid_dataframe = self.dataframe.iloc[val_idx].reset_index(drop=True)
+        self.val_dataset = CSVDataset(valid_dataframe, self.tokenizer,
+                                      self.smiles_column, self.target_column,
+                                      self.has_empty_target, self.task_type)
+        test_dataframe = self.dataframe.iloc[test_idx].reset_index(drop=True)
+        self.test_dataset = CSVDataset(test_dataframe, self.tokenizer,
+                                       self.smiles_column, self.target_column,
+                                       self.has_empty_target, self.task_type)
+
+    def train_dataloader(self) -> Union[DataLoader, List[DataLoader],
+                                        Dict[str, DataLoader]]:
+        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True,
+                          collate_fn=self.collate_fn, num_workers=self.num_workers)
+
+    def val_dataloader(self) -> Union[DataLoader, List[DataLoader],
+                                      Dict[str, DataLoader]]:
+        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False,
+                          collate_fn=self.collate_fn, num_workers=self.num_workers)
+
+    def test_dataloader(self) -> Union[DataLoader, List[DataLoader],
+                                       Dict[str, DataLoader]]:
+        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False,
+                          collate_fn=self.collate_fn, num_workers=self.num_workers)
+
+
+@dataclass(init=True, eq=False, repr=True, frozen=False)
+class LMDataset(Dataset):
+    """Simple sequential dataset for autoregressive language modeling.
+    """
+
+    filename: str
+    tokenizer: BaseTokenizer
+
+    def __post_init__(self) -> None:
+        self.smiles_strings = Path(self.filename).read_text(encoding='ascii').splitlines()
+
+        if isinstance(self.tokenizer, PreTrainedTokenizerFast):
+            self._encode = partial(self.tokenizer.__call__, truncation=True)
+            self._id_key = "input_ids"
+        else:
+            self._encode = self.tokenizer.encode
+            self._id_key = "ids"
+
+    def __len__(self) -> int:
+        return len(self.smiles_strings)
+
+    def __getitem__(self, i: int) -> torch.Tensor:
+        encodings = self._encode(self.smiles_strings[i])
+        return torch.LongTensor(getattr(encodings, self._id_key))
+
+
+@dataclass(init=True, repr=True, eq=False, frozen=False)
+class LMDataModule(LightningDataModule):
+    """Lightning data module for autoregressive language modeling.
+    """
+
+    filename: str
+    tokenizer: BaseTokenizer
+    batch_size: int = 128
+    num_workers: int = 0
+    collate_fn: Union[None, Literal["default"], Callable] = "default"
+
+    def __post_init__(self) -> None:
+        super().__init__()
+        if self.collate_fn == "default":
+            self.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
+
+    def setup(self, stage: Optional[str] = None) -> None:
+        self.dataset = LMDataset(self.filename, self.tokenizer)
+
+    def train_dataloader(self) -> Union[DataLoader, List[DataLoader],
+                                        Dict[str, DataLoader]]:
+        return DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True,
+                          collate_fn=self.collate_fn, num_workers=self.num_workers)
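
A short usage sketch for `CSVDataModule` on the BBBP file shipped in this commit. The tokenizer line is an assumption (the README's `checkpoints/benchmark-5m` directory from the upstream smiles-gpt repository stores a compatible `PreTrainedTokenizerFast`), and `p_np` is the BBBP task column used in the README example:

```python
# Illustrative only: wiring CSVDataModule / CVSplitter from data.py to data/bbbp.csv.
import pandas as pd
from transformers import PreTrainedTokenizerFast
from iupac_gpt.data import CSVDataModule, CVSplitter

dataframe = pd.read_csv("iupac-gpt/data/bbbp.csv")
tokenizer = PreTrainedTokenizerFast.from_pretrained("checkpoints/benchmark-5m")  # assumed path

datamodule = CSVDataModule(
    dataframe, tokenizer,
    smiles_column="smiles", target_column="p_np",
    task_type="classification",
    splitter=CVSplitter(mode="scaffold"),   # scaffold split as in MoleculeNet-style benchmarks
    batch_size=16)
datamodule.setup()
train_loader = datamodule.train_dataloader()
```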
iupac-gpt/iupac_gpt/iupac_dataset.py
ADDED
@@ -0,0 +1,121 @@
+import os
+import sys
+import time
+import random
+from itertools import chain
+from collections import Counter
+import numpy as np
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from transformers.data.data_collator import DataCollator
+from multiprocessing import Pool
+import mmap
+from torch.utils.data import Dataset
+
+class IUPACDataset(Dataset):
+    def __init__(self, dataset_dir='./', dataset_filename="iupacs_logp.txt", tokenizer=None, max_length=None, target_col=None,
+                 dataset_size=None, iupac_name_col="iupac"):
+        self.dataset_dir = dataset_dir
+        self.tokenizer = tokenizer
+        self.target_col = target_col
+        self.max_length = max_length
+        self.dataset_size = dataset_size
+        self.dataset_filename = dataset_filename
+
+        # where the data is
+        self.dataset_fn = os.path.join(self.dataset_dir, self.dataset_filename)
+
+        # a bit of an odd way to read in a data file, but it lets
+        # us keep the data in csv format, and it's pretty fast
+        # (30s for 17G on my machine).
+        # we need to use mmap for data-parallel training with
+        # multiple processes so that the processes don't each keep
+        # a local copy of the dataset in host memory
+        line_offsets = []
+        # each element of data_mm is a character in the dataset file
+        self.data_mm = np.memmap(self.dataset_fn, dtype=np.uint8, mode="r")
+
+        # process chunksize bytes at a time
+        chunksize = int(1e9)
+        for i in range(0, len(self.data_mm), chunksize):
+            chunk = self.data_mm[i:i + chunksize]
+            # the index of each newline is the character before
+            # the beginning of the next line
+            newlines = np.nonzero(chunk == 0x0a)[0]
+            line_offsets.append(i + newlines + 1)
+            if self.dataset_size is not None and i > self.dataset_size:
+                # don't need to keep loading data
+                break
+        # line_offsets indicates the beginning of each line in self.dataset_fn
+        self.line_offsets = np.hstack(line_offsets)
+
+        if (self.dataset_size is not None
+                and self.dataset_size > self.line_offsets.shape[0]):
+            msg = "specified dataset_size {}, but the dataset only has {} items"
+            raise ValueError(msg.format(self.dataset_size,
+                                        self.line_offsets.shape[0]))
+
+        # extract headers
+        header_line = bytes(self.data_mm[0:self.line_offsets[0]])
+        headers = header_line.decode("utf8").strip().split("|")
+
+        # figure out which column IDs are of interest
+        try:
+            self.name_col_id = headers.index(iupac_name_col)
+        except ValueError as e:
+            raise RuntimeError("Expecting a column called '{}' "
+                               "that contains IUPAC names".format(iupac_name_col))
+        self.target_col_id = None
+        if self.target_col is not None:
+            try:
+                self.target_col_id = headers.index(self.target_col)
+            except ValueError as e:
+                raise RuntimeError("User supplied target col " + target_col + \
+                                   "but column is not present in data file")
+
+    def __getitem__(self, idx):
+        # model_inputs is a dict with keys
+        # input_ids, target
+
+        if self.dataset_size is not None and idx > self.dataset_size:
+            msg = "provided index {} is larger than dataset size {}"
+            raise IndexError(msg.format(idx, self.dataset_size))
+
+        start = self.line_offsets[idx]
+        end = self.line_offsets[idx + 1]
+        line = bytes(self.data_mm[start:end])
+        line = line.decode("utf8").strip().split("|")
+        name = line[self.name_col_id]
+
+        # get the target value, if needed
+        target = None
+        if self.target_col_id is not None:
+            target = line[self.target_col_id]
+            if self.target_col == "Log P" and len(target) == 0:
+                target = 3.16  # average of training data
+            else:
+                target = float(target)
+
+        tokenized = self.tokenizer(name)  # after this the tokenizer.eos_token_id has been added automatically
+        input_ids = torch.tensor(tokenized["input_ids"])
+
+        iupac_unk = torch.tensor([self.tokenizer._convert_token_to_id(self.tokenizer.unk_token)])
+        input_ids = torch.tensor(input_ids)
+        input_ids = torch.cat([iupac_unk, input_ids])
+
+        return_dict = {}
+        return_dict["input_ids"] = input_ids  # np.array(tokenized["input_ids"])
+        return_dict["labels"] = input_ids
+        #return_dict["property"] = torch.tensor(np.array(target))
+
+        if self.max_length is not None:
+            return_dict["input_ids"] = return_dict["input_ids"][:self.max_length]
+            return_dict["labels"] = return_dict["labels"][:self.max_length]
+
+        return return_dict
+
+    def __len__(self):
+        if self.dataset_size is None:
+            return len(self.line_offsets) - 1
+        else:
+            return self.dataset_size
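
For context, `IUPACDataset` expects a pipe-separated file whose header contains an `iupac` column; the committed `iupac_tokenization*.py` helpers (`get_data_loader` and friends) wrap it for training. A rough sketch of direct use, assuming the shipped `iupacs_logp.csv` follows that pipe-separated layout and with the tokenizer left as a placeholder since it is loaded elsewhere in the package (e.g. from `real_iupac_tokenizer.pt` / `iupac_spm.model`):

```python
# Hypothetical direct use of IUPACDataset; `iupac_tokenizer` is a placeholder for the
# IUPAC tokenizer object the package loads elsewhere.
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from iupac_gpt.iupac_dataset import IUPACDataset

def collate(batch, pad_id=0):
    # pad variable-length token sequences to the longest item in the batch
    input_ids = pad_sequence([b["input_ids"] for b in batch],
                             batch_first=True, padding_value=pad_id)
    labels = pad_sequence([b["labels"] for b in batch],
                          batch_first=True, padding_value=-100)
    return {"input_ids": input_ids, "labels": labels}

dataset = IUPACDataset(dataset_dir="iupac-gpt/data", dataset_filename="iupacs_logp.csv",
                       tokenizer=iupac_tokenizer, max_length=1280,
                       iupac_name_col="iupac")
loader = DataLoader(dataset, batch_size=8, collate_fn=collate, shuffle=True)
```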
iupac-gpt/iupac_gpt/iupac_dataset_class.py
ADDED
@@ -0,0 +1,128 @@
+import os
+import sys
+import time
+import random
+from itertools import chain
+from collections import Counter
+import numpy as np
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from transformers.data.data_collator import DataCollator
+from multiprocessing import Pool
+import mmap
+from torch.utils.data import Dataset
+
+class IUPACDataset(Dataset):
+    def __init__(self, dataset_dir='./', dataset_filename="iupacs_logp.txt", tokenizer=None, max_length=None, target_col=None,
+                 dataset_size=None, iupac_name_col="iupac"):
+        self.dataset_dir = dataset_dir
+        self.tokenizer = tokenizer
+        self.target_col = target_col
+        self.max_length = max_length
+        self.dataset_size = dataset_size
+        self.dataset_filename = dataset_filename
+
+        # where the data is
+        self.dataset_fn = os.path.join(self.dataset_dir, self.dataset_filename)
+
+        # a bit of an odd way to read in a data file, but it lets
+        # us keep the data in csv format, and it's pretty fast
+        # (30s for 17G on my machine).
+        # we need to use mmap for data-parallel training with
+        # multiple processes so that the processes don't each keep
+        # a local copy of the dataset in host memory
+        line_offsets = []
+        # each element of data_mm is a character in the dataset file
+        self.data_mm = np.memmap(self.dataset_fn, dtype=np.uint8, mode="r")
+
+        # process chunksize bytes at a time
+        chunksize = int(1e9)
+        for i in range(0, len(self.data_mm), chunksize):
+            chunk = self.data_mm[i:i + chunksize]
+            # the index of each newline is the character before
+            # the beginning of the next line
+            newlines = np.nonzero(chunk == 0x0a)[0]
+            line_offsets.append(i + newlines + 1)
+            if self.dataset_size is not None and i > self.dataset_size:
+                # don't need to keep loading data
+                break
+        # line_offsets indicates the beginning of each line in self.dataset_fn
+        self.line_offsets = np.hstack(line_offsets)
+
+        if (self.dataset_size is not None
+                and self.dataset_size > self.line_offsets.shape[0]):
+            msg = "specified dataset_size {}, but the dataset only has {} items"
+            raise ValueError(msg.format(self.dataset_size,
+                                        self.line_offsets.shape[0]))
+
+        # extract headers
+        header_line = bytes(self.data_mm[0:self.line_offsets[0]])
+        headers = header_line.decode("utf8").strip().split("|")
+
+        # figure out which column IDs are of interest
+        try:
+            self.name_col_id = headers.index(iupac_name_col)
+        except ValueError as e:
+            raise RuntimeError("Expecting a column called '{}' "
+                               "that contains IUPAC names".format(iupac_name_col))
+        self.target_col_id = None
+        if self.target_col is not None:
+            try:
+                self.target_col_id = headers.index(self.target_col)
+            except ValueError as e:
+                raise RuntimeError("User supplied target col " + target_col + \
+                                   "but column is not present in data file")
+
+    def __getitem__(self, idx):
+        # model_inputs is a dict with keys
+        # input_ids, target
+
+        if self.dataset_size is not None and idx > self.dataset_size:
+            msg = "provided index {} is larger than dataset size {}"
+            raise IndexError(msg.format(idx, self.dataset_size))
+
+        start = self.line_offsets[idx]
+        end = self.line_offsets[idx + 1]
+        line = bytes(self.data_mm[start:end])
+        line = line.decode("utf8").strip().split("|")
+        name = line[self.name_col_id]
+
+        # get the target value, if needed
+        target = None
+        if self.target_col_id is not None:
+            target = line[self.target_col_id]
+            if self.target_col == "Log P" and len(target) == 0:
+                target = 3.16  # average of training data
+            else:
+                target = float(target)
+
+            if target > 3.16:
+                target = 1
+            else:
+                target = 0
+
+        tokenized = self.tokenizer(name)  # after this the tokenizer.eos_token_id has been added automatically
+        input_ids = torch.tensor(tokenized["input_ids"])
+
+        iupac_unk = torch.tensor([self.tokenizer._convert_token_to_id(self.tokenizer.unk_token)])
+        input_ids = torch.tensor(input_ids)
+        input_ids = torch.cat([iupac_unk, input_ids])
+
+        attention_mask = torch.ones(input_ids.numel(), dtype=int)
+
+        return_dict = {}
+        return_dict["input_ids"] = input_ids
+        return_dict["labels"] = torch.tensor(np.array(target))
+        return_dict["attention_mask"] = attention_mask
+
+        if self.max_length is not None:
+            return_dict["input_ids"] = return_dict["input_ids"][:self.max_length]
+            return_dict["attention_mask"] = return_dict["attention_mask"][:self.max_length]
+
+        return return_dict
+
+    def __len__(self):
+        if self.dataset_size is None:
+            return len(self.line_offsets) - 1
+        else:
+            return self.dataset_size
iupac-gpt/iupac_gpt/iupac_dataset_pro.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import time
|
4 |
+
import random
|
5 |
+
from itertools import chain
|
6 |
+
from collections import Counter
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
from torch.nn.utils.rnn import pad_sequence
|
10 |
+
from transformers.data.data_collator import DataCollator
|
11 |
+
from multiprocessing import Pool
|
12 |
+
import mmap
|
13 |
+
from torch.utils.data import Dataset
|
14 |
+
|
15 |
+
class IUPACDataset(Dataset):
|
16 |
+
def __init__(self, dataset_dir='./',dataset_filename="iupacs_logp.txt", tokenizer=None,max_length=None,target_col=None,
|
17 |
+
dataset_size=None,iupac_name_col="iupac"):
|
18 |
+
self.dataset_dir = dataset_dir
|
19 |
+
self.tokenizer = tokenizer
|
20 |
+
self.target_col = target_col
|
21 |
+
self.max_length = max_length
|
22 |
+
self.dataset_size = dataset_size
|
23 |
+
self.dataset_filename = dataset_filename
|
24 |
+
|
25 |
+
# where the data is
|
26 |
+
self.dataset_fn = os.path.join(self.dataset_dir,self.dataset_filename)
|
27 |
+
|
28 |
+
# a bit of an odd way to read in a data file, but it lets
|
29 |
+
# us keep the data in csv format, and it's pretty fast
|
30 |
+
# (30s for 17G on my machine).
|
31 |
+
# we need to use mmap for data-parallel training with
|
32 |
+
# multiple processes so that the processes don't each keep
|
33 |
+
# a local copy of the dataset in host memory
|
34 |
+
line_offsets = []
|
35 |
+
# each element of data_mm is a character in the dataset file
|
36 |
+
self.data_mm = np.memmap(self.dataset_fn, dtype=np.uint8, mode="r")
|
37 |
+
|
38 |
+
# process chunksize bytes at a time
|
39 |
+
chunksize = int(1e9)
|
40 |
+
for i in range(0, len(self.data_mm), chunksize):
|
41 |
+
chunk = self.data_mm[i:i + chunksize]
|
42 |
+
# the index of each newline is the character before
|
43 |
+
# the beginning of the next line
|
44 |
+
newlines = np.nonzero(chunk == 0x0a)[0]
|
45 |
+
line_offsets.append(i + newlines + 1)
|
46 |
+
if self.dataset_size is not None and i > self.dataset_size:
|
47 |
+
# don't need to keep loading data
|
48 |
+
break
|
49 |
+
# line_offsets indicates the beginning of each line in self.dataset_fn
|
50 |
+
self.line_offsets = np.hstack(line_offsets)
|
51 |
+
|
52 |
+
if (self.dataset_size is not None
|
53 |
+
and self.dataset_size > self.line_offsets.shape[0]):
|
54 |
+
msg = "specified dataset_size {}, but the dataset only has {} items"
|
55 |
+
raise ValueError(msg.format(self.dataset_size,
|
56 |
+
self.line_offsets.shape[0]))
|
57 |
+
|
58 |
+
# extract headers
|
59 |
+
header_line = bytes(self.data_mm[0:self.line_offsets[0]])
|
60 |
+
headers = header_line.decode("utf8").strip().split("|")
|
61 |
+
|
62 |
+
# figure out which column IDs are of interest
|
63 |
+
try:
|
64 |
+
self.name_col_id = headers.index(iupac_name_col)
|
65 |
+
except ValueError as e:
|
66 |
+
raise RuntimeError("Expecting a column called '{}' "
|
67 |
+
"that contains IUPAC names".format(iupac_name_col))
|
68 |
+
self.target_col_id = None
|
69 |
+
if self.target_col is not None:
|
70 |
+
try:
|
71 |
+
self.target_col_id = headers.index(self.target_col)
|
72 |
+
except ValueError as e:
|
73 |
+
raise RuntimeError("User supplied target col " + target_col + \
|
74 |
+
"but column is not present in data file")
|
75 |
+
|
76 |
+
def __getitem__(self, idx):
|
77 |
+
# model_inputs is a dict with keys
|
78 |
+
# input_ids, target
|
79 |
+
|
80 |
+
if self.dataset_size is not None and idx > self.dataset_size:
|
81 |
+
msg = "provided index {} is larger than dataset size {}"
|
82 |
+
raise IndexError(msg.format(idx, self.dataset_size))
|
83 |
+
|
84 |
+
start = self.line_offsets[idx]
|
85 |
+
end = self.line_offsets[idx + 1]
|
86 |
+
line = bytes(self.data_mm[start:end])
|
87 |
+
line = line.decode("utf8").strip().split("|")
|
88 |
+
name = line[self.name_col_id]
|
89 |
+
|
90 |
+
# get the target value, if needed
|
91 |
+
target = None
|
92 |
+
if self.target_col_id is not None:
|
93 |
+
target = line[self.target_col_id]
|
94 |
+
if self.target_col == "Log P" and len(target) == 0:
|
95 |
+
target = 3.16 # average of training data
|
96 |
+
else:
|
97 |
+
target = float(target)
|
98 |
+
|
99 |
+
|
100 |
+
tokenized = self.tokenizer(name)  # the tokenizer appends tokenizer.eos_token_id automatically here
|
101 |
+
input_ids = torch.tensor(tokenized["input_ids"])
|
102 |
+
|
103 |
+
iupac_unk = torch.tensor([self.tokenizer._convert_token_to_id(self.tokenizer.unk_token)])
|
104 |
+
input_ids = input_ids.clone().detach()  # already a tensor; avoids the torch.tensor(tensor) copy warning
|
105 |
+
input_ids = torch.cat([iupac_unk,input_ids])
|
106 |
+
|
107 |
+
attention_mask = torch.ones(input_ids.numel(), dtype=int)
|
108 |
+
|
109 |
+
return_dict = {}
|
110 |
+
return_dict["input_ids"] = input_ids
|
111 |
+
return_dict["labels"] = torch.tensor(np.array(target))
|
112 |
+
return_dict["attention_mask"] = attention_mask
|
113 |
+
|
114 |
+
if self.max_length is not None:
|
115 |
+
return_dict["input_ids"] = return_dict["input_ids"][:self.max_length]
|
116 |
+
return_dict["attention_mask"] = return_dict["attention_mask"][:self.max_length]
|
117 |
+
|
118 |
+
return return_dict
|
119 |
+
|
120 |
+
def __len__(self):
|
121 |
+
if self.dataset_size is None:
|
122 |
+
return len(self.line_offsets) - 1
|
123 |
+
else:
|
124 |
+
return self.dataset_size
|
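The constructor above memory-maps the '|'-separated data file and stores the byte offset of every line, so __getitem__ can slice a single record by index without each data-parallel worker holding its own in-memory copy of the dataset. A minimal sketch of that offset-based lookup on a throwaway file (the toy rows and column values here are illustrative; the real loader points at iupacs_logp.txt or the PubChem export):

import os
import tempfile
import numpy as np

rows = ["iupac|Log P", "2-methylpropan-2-ol|0.35", "octan-1-ol|3.00"]
path = os.path.join(tempfile.mkdtemp(), "toy.txt")
with open(path, "w") as f:
    f.write("\n".join(rows) + "\n")

data_mm = np.memmap(path, dtype=np.uint8, mode="r")      # one uint8 per character
newlines = np.nonzero(data_mm == 0x0A)[0]                # byte index of every '\n'
line_offsets = newlines + 1                              # start offset of each following line

header = bytes(data_mm[:line_offsets[0]]).decode("utf8").strip().split("|")
start, end = line_offsets[0], line_offsets[1]            # record at idx == 0
record = bytes(data_mm[start:end]).decode("utf8").strip().split("|")
print(header, record)   # ['iupac', 'Log P'] ['2-methylpropan-2-ol', '0.35']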
iupac-gpt/iupac_gpt/iupac_spm.model
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb18836fd01a60e6cf61ad64e7e6556ac1f676d3ca39a16f375d54e8a8fb4e60
|
3 |
+
size 275487
|
iupac-gpt/iupac_gpt/iupac_spm.vocab
ADDED
@@ -0,0 +1,1391 @@
1 |
+
<pad> 0
|
2 |
+
</s> 0
|
3 |
+
<unk> 0
|
4 |
+
0 0
|
5 |
+
1 0
|
6 |
+
2 0
|
7 |
+
3 0
|
8 |
+
4 0
|
9 |
+
5 0
|
10 |
+
6 0
|
11 |
+
7 0
|
12 |
+
8 0
|
13 |
+
9 0
|
14 |
+
10 0
|
15 |
+
11 0
|
16 |
+
12 0
|
17 |
+
13 0
|
18 |
+
14 0
|
19 |
+
15 0
|
20 |
+
16 0
|
21 |
+
17 0
|
22 |
+
18 0
|
23 |
+
19 0
|
24 |
+
20 0
|
25 |
+
21 0
|
26 |
+
22 0
|
27 |
+
23 0
|
28 |
+
24 0
|
29 |
+
25 0
|
30 |
+
26 0
|
31 |
+
27 0
|
32 |
+
28 0
|
33 |
+
29 0
|
34 |
+
30 0
|
35 |
+
31 0
|
36 |
+
32 0
|
37 |
+
33 0
|
38 |
+
34 0
|
39 |
+
35 0
|
40 |
+
36 0
|
41 |
+
37 0
|
42 |
+
38 0
|
43 |
+
39 0
|
44 |
+
40 0
|
45 |
+
41 0
|
46 |
+
42 0
|
47 |
+
43 0
|
48 |
+
44 0
|
49 |
+
45 0
|
50 |
+
46 0
|
51 |
+
47 0
|
52 |
+
48 0
|
53 |
+
49 0
|
54 |
+
50 0
|
55 |
+
51 0
|
56 |
+
52 0
|
57 |
+
53 0
|
58 |
+
54 0
|
59 |
+
55 0
|
60 |
+
56 0
|
61 |
+
57 0
|
62 |
+
58 0
|
63 |
+
59 0
|
64 |
+
60 0
|
65 |
+
61 0
|
66 |
+
62 0
|
67 |
+
63 0
|
68 |
+
64 0
|
69 |
+
65 0
|
70 |
+
66 0
|
71 |
+
67 0
|
72 |
+
68 0
|
73 |
+
69 0
|
74 |
+
70 0
|
75 |
+
71 0
|
76 |
+
72 0
|
77 |
+
73 0
|
78 |
+
74 0
|
79 |
+
75 0
|
80 |
+
76 0
|
81 |
+
77 0
|
82 |
+
78 0
|
83 |
+
79 0
|
84 |
+
80 0
|
85 |
+
81 0
|
86 |
+
82 0
|
87 |
+
83 0
|
88 |
+
84 0
|
89 |
+
85 0
|
90 |
+
86 0
|
91 |
+
87 0
|
92 |
+
88 0
|
93 |
+
89 0
|
94 |
+
90 0
|
95 |
+
91 0
|
96 |
+
92 0
|
97 |
+
93 0
|
98 |
+
94 0
|
99 |
+
95 0
|
100 |
+
96 0
|
101 |
+
97 0
|
102 |
+
98 0
|
103 |
+
99 0
|
104 |
+
; 0
|
105 |
+
. 0
|
106 |
+
.0 0
|
107 |
+
' 0
|
108 |
+
R 0
|
109 |
+
S 0
|
110 |
+
H 0
|
111 |
+
N 0
|
112 |
+
E 0
|
113 |
+
Z 0
|
114 |
+
aR 0
|
115 |
+
aS 0
|
116 |
+
bR 0
|
117 |
+
bS 0
|
118 |
+
cR 0
|
119 |
+
cS 0
|
120 |
+
dR 0
|
121 |
+
dS 0
|
122 |
+
aH 0
|
123 |
+
bH 0
|
124 |
+
cH 0
|
125 |
+
aE 0
|
126 |
+
aZ 0
|
127 |
+
a, 0
|
128 |
+
a- 0
|
129 |
+
b, 0
|
130 |
+
b- 0
|
131 |
+
c, 0
|
132 |
+
c- 0
|
133 |
+
d, 0
|
134 |
+
d- 0
|
135 |
+
a] 0
|
136 |
+
b] 0
|
137 |
+
c] 0
|
138 |
+
d] 0
|
139 |
+
e] 0
|
140 |
+
f] 0
|
141 |
+
g] 0
|
142 |
+
h] 0
|
143 |
+
i] 0
|
144 |
+
j] 0
|
145 |
+
k] 0
|
146 |
+
l] 0
|
147 |
+
m] 0
|
148 |
+
<high> 0
|
149 |
+
<med> 0
|
150 |
+
<low> 0
|
151 |
+
- 0
|
152 |
+
yl 0
|
153 |
+
, 0
|
154 |
+
) 0
|
155 |
+
( 0
|
156 |
+
] 0
|
157 |
+
[ 0
|
158 |
+
meth 0
|
159 |
+
phenyl 0
|
160 |
+
di 0
|
161 |
+
an 0
|
162 |
+
eth 0
|
163 |
+
oxy 0
|
164 |
+
prop 0
|
165 |
+
e 0
|
166 |
+
amino 0
|
167 |
+
oxo 0
|
168 |
+
fluoro 0
|
169 |
+
cyclo 0
|
170 |
+
o 0
|
171 |
+
amide 0
|
172 |
+
tri 0
|
173 |
+
chloro 0
|
174 |
+
but 0
|
175 |
+
hydroxy 0
|
176 |
+
a 0
|
177 |
+
one 0
|
178 |
+
pyridin 0
|
179 |
+
hydro 0
|
180 |
+
benzo 0
|
181 |
+
acet 0
|
182 |
+
l 0
|
183 |
+
en 0
|
184 |
+
ol 0
|
185 |
+
amine 0
|
186 |
+
ylamin 0
|
187 |
+
oxa 0
|
188 |
+
oyl 0
|
189 |
+
carboxamide 0
|
190 |
+
benz 0
|
191 |
+
piperidin 0
|
192 |
+
thia 0
|
193 |
+
ate 0
|
194 |
+
sulf 0
|
195 |
+
bromo 0
|
196 |
+
ylidene 0
|
197 |
+
pyrimidin 0
|
198 |
+
tetra 0
|
199 |
+
ic_acid 0
|
200 |
+
penta 0
|
201 |
+
pyrrolidin 0
|
202 |
+
sulfonyl 0
|
203 |
+
hexa 0
|
204 |
+
hex 0
|
205 |
+
ane 0
|
206 |
+
pyrazol 0
|
207 |
+
phenoxy 0
|
208 |
+
carbonyl 0
|
209 |
+
thiophen 0
|
210 |
+
aza 0
|
211 |
+
piperazin 0
|
212 |
+
azo 0
|
213 |
+
carboxylate 0
|
214 |
+
imidazol 0
|
215 |
+
furan 0
|
216 |
+
nitro 0
|
217 |
+
carbam 0
|
218 |
+
anilino 0
|
219 |
+
pent 0
|
220 |
+
d 0
|
221 |
+
tert- 0
|
222 |
+
benzen 0
|
223 |
+
indol 0
|
224 |
+
sulfon 0
|
225 |
+
carboxylic_acid 0
|
226 |
+
diazo 0
|
227 |
+
az 0
|
228 |
+
ene 0
|
229 |
+
quinolin 0
|
230 |
+
naphthalen 0
|
231 |
+
morpholin 0
|
232 |
+
ium 0
|
233 |
+
cyano 0
|
234 |
+
bi 0
|
235 |
+
bis 0
|
236 |
+
hepta 0
|
237 |
+
pyrrol 0
|
238 |
+
spiro 0
|
239 |
+
r 0
|
240 |
+
ole 0
|
241 |
+
azin 0
|
242 |
+
hydrochloride 0
|
243 |
+
urea 0
|
244 |
+
yn 0
|
245 |
+
azido 0
|
246 |
+
carbamate 0
|
247 |
+
pyrrolo 0
|
248 |
+
it 0
|
249 |
+
imidazo 0
|
250 |
+
pyrazin 0
|
251 |
+
guanidin 0
|
252 |
+
thio 0
|
253 |
+
pyrazolo 0
|
254 |
+
iodo 0
|
255 |
+
imino 0
|
256 |
+
sulfam 0
|
257 |
+
carbon 0
|
258 |
+
olidin 0
|
259 |
+
epin 0
|
260 |
+
isoquinolin 0
|
261 |
+
deca 0
|
262 |
+
anilin 0
|
263 |
+
quinazolin 0
|
264 |
+
nitrile 0
|
265 |
+
hydrazin 0
|
266 |
+
epan 0
|
267 |
+
pyridazin 0
|
268 |
+
chromen 0
|
269 |
+
octa 0
|
270 |
+
octan 0
|
271 |
+
thieno 0
|
272 |
+
in 0
|
273 |
+
amido 0
|
274 |
+
hept 0
|
275 |
+
thiol 0
|
276 |
+
hydroiodide 0
|
277 |
+
imid 0
|
278 |
+
isoindol 0
|
279 |
+
nona 0
|
280 |
+
pyrido 0
|
281 |
+
inden 0
|
282 |
+
carbazol 0
|
283 |
+
ox 0
|
284 |
+
dodeca 0
|
285 |
+
etidin 0
|
286 |
+
oct 0
|
287 |
+
phenol 0
|
288 |
+
imidazolidin 0
|
289 |
+
sil 0
|
290 |
+
carboxy 0
|
291 |
+
imido 0
|
292 |
+
phosphor 0
|
293 |
+
purin 0
|
294 |
+
phospha 0
|
295 |
+
fluoren 0
|
296 |
+
carbox 0
|
297 |
+
indazol 0
|
298 |
+
undeca 0
|
299 |
+
furo 0
|
300 |
+
tetradeca 0
|
301 |
+
cyclopenta[a]phenanthren 0
|
302 |
+
form 0
|
303 |
+
quinoxalin 0
|
304 |
+
trideca 0
|
305 |
+
hexadeca 0
|
306 |
+
imine 0
|
307 |
+
sulfinyl 0
|
308 |
+
octadeca 0
|
309 |
+
carba 0
|
310 |
+
dec 0
|
311 |
+
adamant 0
|
312 |
+
chloride 0
|
313 |
+
sila 0
|
314 |
+
icos 0
|
315 |
+
ine 0
|
316 |
+
ide 0
|
317 |
+
naphthyridin 0
|
318 |
+
heptadeca 0
|
319 |
+
thione 0
|
320 |
+
anthracen 0
|
321 |
+
dodec 0
|
322 |
+
oxir 0
|
323 |
+
pyran 0
|
324 |
+
hydrogen 0
|
325 |
+
pentadeca 0
|
326 |
+
oxido 0
|
327 |
+
carbo 0
|
328 |
+
henicos 0
|
329 |
+
deuterio 0
|
330 |
+
docos 0
|
331 |
+
non 0
|
332 |
+
id 0
|
333 |
+
tert-butyl(dimethyl)silyl 0
|
334 |
+
carbamic_acid 0
|
335 |
+
pyrano 0
|
336 |
+
nonadeca 0
|
337 |
+
tris 0
|
338 |
+
but-2-eno 0
|
339 |
+
ic 0
|
340 |
+
at 0
|
341 |
+
phosphate 0
|
342 |
+
hydrazide 0
|
343 |
+
aceton 0
|
344 |
+
octadec 0
|
345 |
+
sulfo 0
|
346 |
+
thiomorpholin 0
|
347 |
+
pyrimido 0
|
348 |
+
oxamide 0
|
349 |
+
carbonimidoyl 0
|
350 |
+
oxet 0
|
351 |
+
inan 0
|
352 |
+
sodium 0
|
353 |
+
al 0
|
354 |
+
(2+) 0
|
355 |
+
oxide 0
|
356 |
+
phthalazin 0
|
357 |
+
benzal 0
|
358 |
+
carbohydrazide 0
|
359 |
+
bora 0
|
360 |
+
benzhydr 0
|
361 |
+
tetracos 0
|
362 |
+
bor 0
|
363 |
+
hexadec 0
|
364 |
+
ioda 0
|
365 |
+
azonia 0
|
366 |
+
isocyano 0
|
367 |
+
acridin 0
|
368 |
+
hydroxylamin 0
|
369 |
+
formamide 0
|
370 |
+
phenanthren 0
|
371 |
+
ul 0
|
372 |
+
indeno 0
|
373 |
+
xanthen 0
|
374 |
+
nitroso 0
|
375 |
+
tetradec 0
|
376 |
+
phosphin 0
|
377 |
+
olan 0
|
378 |
+
peroxy 0
|
379 |
+
phosphono 0
|
380 |
+
tetr 0
|
381 |
+
pyrazolidin 0
|
382 |
+
dicarbon 0
|
383 |
+
olate 0
|
384 |
+
tricos 0
|
385 |
+
hexacos 0
|
386 |
+
indolo 0
|
387 |
+
indolizin 0
|
388 |
+
phosphon 0
|
389 |
+
undec 0
|
390 |
+
chromeno 0
|
391 |
+
pentacos 0
|
392 |
+
pyrazino 0
|
393 |
+
thi 0
|
394 |
+
hydrate 0
|
395 |
+
bromide 0
|
396 |
+
uid 0
|
397 |
+
boronic_acid 0
|
398 |
+
trityl 0
|
399 |
+
cen 0
|
400 |
+
sulfate 0
|
401 |
+
isochromen 0
|
402 |
+
octacos 0
|
403 |
+
isocyanato 0
|
404 |
+
acetal 0
|
405 |
+
azide 0
|
406 |
+
dimethylacetamide 0
|
407 |
+
tetrakis 0
|
408 |
+
iridin 0
|
409 |
+
nonadec 0
|
410 |
+
naphtho 0
|
411 |
+
heptadec 0
|
412 |
+
pyren 0
|
413 |
+
heptacos 0
|
414 |
+
carbamimidamido 0
|
415 |
+
sulfinam 0
|
416 |
+
oxid 0
|
417 |
+
iodide 0
|
418 |
+
etheno 0
|
419 |
+
disulfon 0
|
420 |
+
potassium 0
|
421 |
+
chrysen 0
|
422 |
+
yne 0
|
423 |
+
phosphino 0
|
424 |
+
carboximidoyl 0
|
425 |
+
quinolizin 0
|
426 |
+
tert-butyl(diphenyl)silyl 0
|
427 |
+
formamid 0
|
428 |
+
thiochromen 0
|
429 |
+
porphyrin 0
|
430 |
+
dicyan 0
|
431 |
+
triacont 0
|
432 |
+
pteridin 0
|
433 |
+
(3+) 0
|
434 |
+
sulfin 0
|
435 |
+
ar 0
|
436 |
+
pentadec 0
|
437 |
+
io 0
|
438 |
+
phenothiazin 0
|
439 |
+
undecyl 0
|
440 |
+
oxal 0
|
441 |
+
phospho 0
|
442 |
+
borin 0
|
443 |
+
uide 0
|
444 |
+
uranium 0
|
445 |
+
picen 0
|
446 |
+
hydrobromide 0
|
447 |
+
cinnolin 0
|
448 |
+
isoindolo 0
|
449 |
+
phthal 0
|
450 |
+
phenac 0
|
451 |
+
phenanthridin 0
|
452 |
+
azino 0
|
453 |
+
tridec 0
|
454 |
+
zirconium 0
|
455 |
+
len 0
|
456 |
+
phenanthrolin 0
|
457 |
+
platinum 0
|
458 |
+
phenolate 0
|
459 |
+
sulfonato 0
|
460 |
+
oxybenzon 0
|
461 |
+
zinc 0
|
462 |
+
chlora 0
|
463 |
+
hydroperoxy 0
|
464 |
+
yttrium 0
|
465 |
+
pyrrolizin 0
|
466 |
+
carbothioyl 0
|
467 |
+
sel 0
|
468 |
+
iron 0
|
469 |
+
spirobi 0
|
470 |
+
copper 0
|
471 |
+
triphenylen 0
|
472 |
+
titanium 0
|
473 |
+
perox 0
|
474 |
+
nonacos 0
|
475 |
+
(1+) 0
|
476 |
+
tridecyl 0
|
477 |
+
lithium 0
|
478 |
+
tetrol 0
|
479 |
+
(4+) 0
|
480 |
+
carboxylato 0
|
481 |
+
thiopyran 0
|
482 |
+
pentacont 0
|
483 |
+
etan 0
|
484 |
+
iridium 0
|
485 |
+
thioxanthen 0
|
486 |
+
nickel 0
|
487 |
+
phenoxazin 0
|
488 |
+
hexatriacont 0
|
489 |
+
azulen 0
|
490 |
+
tetracont 0
|
491 |
+
tritriacont 0
|
492 |
+
azon 0
|
493 |
+
carbono 0
|
494 |
+
sulfino 0
|
495 |
+
dotriacont 0
|
496 |
+
stann 0
|
497 |
+
nitrate 0
|
498 |
+
broma 0
|
499 |
+
on 0
|
500 |
+
et 0
|
501 |
+
acetylen 0
|
502 |
+
fluoride 0
|
503 |
+
isothiocyanato 0
|
504 |
+
magnesium 0
|
505 |
+
cobalt 0
|
506 |
+
acenaphthylen 0
|
507 |
+
sulfamate 0
|
508 |
+
ruthenium 0
|
509 |
+
aldehyde 0
|
510 |
+
phosphite 0
|
511 |
+
nonafl 0
|
512 |
+
palladium 0
|
513 |
+
pentadecyl 0
|
514 |
+
purino 0
|
515 |
+
tetratriacont 0
|
516 |
+
epoxy 0
|
517 |
+
aluma 0
|
518 |
+
phenanthro 0
|
519 |
+
phenazin 0
|
520 |
+
fluoranthen 0
|
521 |
+
sulfinato 0
|
522 |
+
ocin 0
|
523 |
+
hentriacont 0
|
524 |
+
azanida 0
|
525 |
+
stanna 0
|
526 |
+
toluen 0
|
527 |
+
ylidyne 0
|
528 |
+
thiopyrano 0
|
529 |
+
perchlorate 0
|
530 |
+
calcium 0
|
531 |
+
mono 0
|
532 |
+
tungsten 0
|
533 |
+
sulfur 0
|
534 |
+
cyanamide 0
|
535 |
+
tricarbon 0
|
536 |
+
chlorid 0
|
537 |
+
dehydro 0
|
538 |
+
pyridazino 0
|
539 |
+
sulfido 0
|
540 |
+
irin 0
|
541 |
+
phosph 0
|
542 |
+
iran 0
|
543 |
+
thiocyanate 0
|
544 |
+
hypoiodite 0
|
545 |
+
ylium 0
|
546 |
+
imidazolo 0
|
547 |
+
octatriacont 0
|
548 |
+
dimethylurea 0
|
549 |
+
heptadecyl 0
|
550 |
+
tritio 0
|
551 |
+
hydrazono 0
|
552 |
+
selena 0
|
553 |
+
cyanide 0
|
554 |
+
dotetracont 0
|
555 |
+
isoquinolino 0
|
556 |
+
diazonium 0
|
557 |
+
pentatriacont 0
|
558 |
+
hydroxide 0
|
559 |
+
manganese 0
|
560 |
+
chromium 0
|
561 |
+
pentakis 0
|
562 |
+
hypofluorite 0
|
563 |
+
tin 0
|
564 |
+
sulfono 0
|
565 |
+
phosphoroso 0
|
566 |
+
vanadium 0
|
567 |
+
boranuida 0
|
568 |
+
ecin 0
|
569 |
+
hexakis 0
|
570 |
+
s-indacen 0
|
571 |
+
os 0
|
572 |
+
fluoreno 0
|
573 |
+
mercury 0
|
574 |
+
sulfamic_acid 0
|
575 |
+
thiochromeno 0
|
576 |
+
phenalen 0
|
577 |
+
rhodium 0
|
578 |
+
amid 0
|
579 |
+
sulfite 0
|
580 |
+
ocan 0
|
581 |
+
phosphonato 0
|
582 |
+
heptatriacont 0
|
583 |
+
nonatriacont 0
|
584 |
+
borono 0
|
585 |
+
silver 0
|
586 |
+
gold 0
|
587 |
+
isothiochromen 0
|
588 |
+
nitron 0
|
589 |
+
hafnium 0
|
590 |
+
hexacont 0
|
591 |
+
(2-) 0
|
592 |
+
hypochlorite 0
|
593 |
+
arsa 0
|
594 |
+
diphosphat 0
|
595 |
+
molybdenum 0
|
596 |
+
thallium 0
|
597 |
+
nonadecyl 0
|
598 |
+
fluora 0
|
599 |
+
nonatetracont 0
|
600 |
+
rhenium 0
|
601 |
+
tetracarbon 0
|
602 |
+
perylen 0
|
603 |
+
diphosphon 0
|
604 |
+
cyanate 0
|
605 |
+
oxygen 0
|
606 |
+
germ 0
|
607 |
+
nitramide 0
|
608 |
+
tell 0
|
609 |
+
aluminum 0
|
610 |
+
azuleno 0
|
611 |
+
quinolino 0
|
612 |
+
iod 0
|
613 |
+
actinium 0
|
614 |
+
terephthal 0
|
615 |
+
ecan 0
|
616 |
+
trithion 0
|
617 |
+
barium 0
|
618 |
+
hentetracont 0
|
619 |
+
dithion 0
|
620 |
+
phosphat 0
|
621 |
+
selenophen 0
|
622 |
+
xylen 0
|
623 |
+
germa 0
|
624 |
+
hen 0
|
625 |
+
perimidin 0
|
626 |
+
nitric_acid 0
|
627 |
+
rubidium 0
|
628 |
+
octatetracont 0
|
629 |
+
but-1-eno 0
|
630 |
+
nitramido 0
|
631 |
+
heptakis 0
|
632 |
+
thiocyanat 0
|
633 |
+
dibor 0
|
634 |
+
nitrous 0
|
635 |
+
hydrazon 0
|
636 |
+
thianthren 0
|
637 |
+
dili 0
|
638 |
+
hydride 0
|
639 |
+
oxonio 0
|
640 |
+
tetratetracont 0
|
641 |
+
isochromeno 0
|
642 |
+
dihydropter 0
|
643 |
+
indolizino 0
|
644 |
+
osmium 0
|
645 |
+
phosphonia 0
|
646 |
+
oxanthren 0
|
647 |
+
diazano 0
|
648 |
+
do 0
|
649 |
+
cyanato 0
|
650 |
+
diacetamid 0
|
651 |
+
oxam 0
|
652 |
+
silicate 0
|
653 |
+
cadmium 0
|
654 |
+
hydrofluoride 0
|
655 |
+
hexatetracont 0
|
656 |
+
boron 0
|
657 |
+
phosphindol 0
|
658 |
+
phenoxathiin 0
|
659 |
+
phosphonous_acid 0
|
660 |
+
octakis 0
|
661 |
+
bismuth 0
|
662 |
+
chromenylium 0
|
663 |
+
corrin 0
|
664 |
+
pyrylium 0
|
665 |
+
thion 0
|
666 |
+
cinnam 0
|
667 |
+
tritetracont 0
|
668 |
+
nitrite 0
|
669 |
+
gadolinium 0
|
670 |
+
diazonio 0
|
671 |
+
antimony 0
|
672 |
+
oxalo 0
|
673 |
+
onic_acid 0
|
674 |
+
biphenylen 0
|
675 |
+
sulfonio 0
|
676 |
+
cesium 0
|
677 |
+
oxonium 0
|
678 |
+
stiba 0
|
679 |
+
styren 0
|
680 |
+
heptacont 0
|
681 |
+
selenol 0
|
682 |
+
chloroform 0
|
683 |
+
diselen 0
|
684 |
+
onin 0
|
685 |
+
oxaldehyd 0
|
686 |
+
cerium 0
|
687 |
+
technetium 0
|
688 |
+
(1-) 0
|
689 |
+
lead 0
|
690 |
+
ite 0
|
691 |
+
acenaphthyleno 0
|
692 |
+
dicarboximid 0
|
693 |
+
oxonia 0
|
694 |
+
strontium 0
|
695 |
+
(5+) 0
|
696 |
+
iodid 0
|
697 |
+
lanthanum 0
|
698 |
+
rutherfordium 0
|
699 |
+
perchloric_acid 0
|
700 |
+
iren 0
|
701 |
+
tricosyl 0
|
702 |
+
hypobromite 0
|
703 |
+
europium 0
|
704 |
+
isocyanate 0
|
705 |
+
ido 0
|
706 |
+
iodosyl 0
|
707 |
+
nitrilium 0
|
708 |
+
neodymium 0
|
709 |
+
peroxide 0
|
710 |
+
pentatetracont 0
|
711 |
+
phenylen 0
|
712 |
+
tantalum 0
|
713 |
+
hect 0
|
714 |
+
buta-1,3-dieno 0
|
715 |
+
samarium 0
|
716 |
+
galla 0
|
717 |
+
methylal 0
|
718 |
+
fluorid 0
|
719 |
+
praseodymium 0
|
720 |
+
ytterbium 0
|
721 |
+
dimethoxyethane 0
|
722 |
+
scandium 0
|
723 |
+
seleno 0
|
724 |
+
dimethoxyethan 0
|
725 |
+
octacont 0
|
726 |
+
cub 0
|
727 |
+
gallium 0
|
728 |
+
diphosphate 0
|
729 |
+
pentacosyl 0
|
730 |
+
thalla 0
|
731 |
+
ous_acid 0
|
732 |
+
selenoate 0
|
733 |
+
arson 0
|
734 |
+
niobium 0
|
735 |
+
alumina 0
|
736 |
+
anisol 0
|
737 |
+
beryllium 0
|
738 |
+
thioph 0
|
739 |
+
heptatetracont 0
|
740 |
+
onan 0
|
741 |
+
tellura 0
|
742 |
+
quinoxalino 0
|
743 |
+
indiga 0
|
744 |
+
heptacosyl 0
|
745 |
+
isothiocyanate 0
|
746 |
+
inin 0
|
747 |
+
diphospho 0
|
748 |
+
thionia 0
|
749 |
+
selenido 0
|
750 |
+
nonacosyl 0
|
751 |
+
terbium 0
|
752 |
+
(6+) 0
|
753 |
+
indig 0
|
754 |
+
dysprosium 0
|
755 |
+
quinazolino 0
|
756 |
+
iodyl 0
|
757 |
+
indium 0
|
758 |
+
hexatriacontyl 0
|
759 |
+
thiopyr 0
|
760 |
+
triphosphon 0
|
761 |
+
thorium 0
|
762 |
+
carbohydrazonoyl 0
|
763 |
+
as-indacen 0
|
764 |
+
fluoroform 0
|
765 |
+
erbium 0
|
766 |
+
phosphindolo 0
|
767 |
+
lutetium 0
|
768 |
+
selenopheno 0
|
769 |
+
arsin 0
|
770 |
+
arsor 0
|
771 |
+
iodat 0
|
772 |
+
silanuida 0
|
773 |
+
plumba 0
|
774 |
+
plumb 0
|
775 |
+
borano 0
|
776 |
+
sulfonium 0
|
777 |
+
tellurophen 0
|
778 |
+
indazolo 0
|
779 |
+
nitroxyl 0
|
780 |
+
nitrogen 0
|
781 |
+
anthra 0
|
782 |
+
isophosphindol 0
|
783 |
+
disulfid 0
|
784 |
+
nonacont 0
|
785 |
+
selone 0
|
786 |
+
iodonio 0
|
787 |
+
onate 0
|
788 |
+
trili 0
|
789 |
+
iodine 0
|
790 |
+
seleninyl 0
|
791 |
+
phenoxaphosphinin 0
|
792 |
+
phen 0
|
793 |
+
thulium 0
|
794 |
+
chloryl 0
|
795 |
+
phosphinimyl 0
|
796 |
+
cyanic_acid 0
|
797 |
+
acridophosphin 0
|
798 |
+
tetrali 0
|
799 |
+
cumen 0
|
800 |
+
holmium 0
|
801 |
+
selenopyran 0
|
802 |
+
dibenzamid 0
|
803 |
+
nitrous_acid 0
|
804 |
+
phthalal 0
|
805 |
+
selenocyanate 0
|
806 |
+
argon 0
|
807 |
+
iodate 0
|
808 |
+
isothiochromeno 0
|
809 |
+
mercurio 0
|
810 |
+
sulfide 0
|
811 |
+
bromid 0
|
812 |
+
iodonia 0
|
813 |
+
disulfate 0
|
814 |
+
fluorine 0
|
815 |
+
aceanthrylen 0
|
816 |
+
coronen 0
|
817 |
+
phenoxid 0
|
818 |
+
hydrazonic 0
|
819 |
+
telluro 0
|
820 |
+
silicon 0
|
821 |
+
chloronio 0
|
822 |
+
hypochlorous_acid 0
|
823 |
+
dodecakis 0
|
824 |
+
hydroseleno 0
|
825 |
+
phosphinolin 0
|
826 |
+
inda 0
|
827 |
+
phenaleno 0
|
828 |
+
phenylene 0
|
829 |
+
arsenic 0
|
830 |
+
chlorosyl 0
|
831 |
+
perchloryl 0
|
832 |
+
chlorate 0
|
833 |
+
bism 0
|
834 |
+
onat 0
|
835 |
+
terephthalal 0
|
836 |
+
7,8-dihydropter 0
|
837 |
+
silano 0
|
838 |
+
boranthren 0
|
839 |
+
fermium 0
|
840 |
+
phosphano 0
|
841 |
+
arsoroso 0
|
842 |
+
hydrido 0
|
843 |
+
alum 0
|
844 |
+
selenium 0
|
845 |
+
pol 0
|
846 |
+
nonakis 0
|
847 |
+
stibo 0
|
848 |
+
phospheno 0
|
849 |
+
astatine 0
|
850 |
+
phosphanida 0
|
851 |
+
phenophosphazinin 0
|
852 |
+
stibor 0
|
853 |
+
sulfenat 0
|
854 |
+
silanida 0
|
855 |
+
pyranthren 0
|
856 |
+
arsono 0
|
857 |
+
decakis 0
|
858 |
+
oxaldehyde 0
|
859 |
+
cyanid 0
|
860 |
+
neptunium 0
|
861 |
+
diphosphor 0
|
862 |
+
bromate 0
|
863 |
+
selenate 0
|
864 |
+
selenin 0
|
865 |
+
selenonyl 0
|
866 |
+
phenoselenazin 0
|
867 |
+
hypoiodous_acid 0
|
868 |
+
silanylia 0
|
869 |
+
ditellur 0
|
870 |
+
arso 0
|
871 |
+
helicen 0
|
872 |
+
americium 0
|
873 |
+
pyreno 0
|
874 |
+
selenoxanthen 0
|
875 |
+
amoyl 0
|
876 |
+
telluroate 0
|
877 |
+
selen 0
|
878 |
+
selenochromen 0
|
879 |
+
diyl 0
|
880 |
+
dithianon 0
|
881 |
+
ose 0
|
882 |
+
plutonium 0
|
883 |
+
silicic_acid 0
|
884 |
+
5,6,7,8-tetrahydropter 0
|
885 |
+
xenon 0
|
886 |
+
sulfamide 0
|
887 |
+
bisma 0
|
888 |
+
germanium 0
|
889 |
+
triphosphate 0
|
890 |
+
triphospho 0
|
891 |
+
triselen 0
|
892 |
+
isocyanide 0
|
893 |
+
isophosphinolin 0
|
894 |
+
tetrasulfide 0
|
895 |
+
dict 0
|
896 |
+
bromine 0
|
897 |
+
curium 0
|
898 |
+
acephenanthrylen 0
|
899 |
+
promethium 0
|
900 |
+
phosphanthridin 0
|
901 |
+
gall 0
|
902 |
+
selenocyanat 0
|
903 |
+
stilben 0
|
904 |
+
disulfide 0
|
905 |
+
isochromenylium 0
|
906 |
+
tetrathion 0
|
907 |
+
thall 0
|
908 |
+
selenat 0
|
909 |
+
chlor 0
|
910 |
+
silanthren 0
|
911 |
+
(3-) 0
|
912 |
+
tetradecakis 0
|
913 |
+
xantheno 0
|
914 |
+
chromio 0
|
915 |
+
chlorite 0
|
916 |
+
californium 0
|
917 |
+
tetraphosphat 0
|
918 |
+
chlorine 0
|
919 |
+
iodoform 0
|
920 |
+
telluropyran 0
|
921 |
+
polona 0
|
922 |
+
lawrencium 0
|
923 |
+
naphthyridino 0
|
924 |
+
selenon 0
|
925 |
+
phenoxarsinin 0
|
926 |
+
as-indaceno 0
|
927 |
+
mercura 0
|
928 |
+
periodate 0
|
929 |
+
selenite 0
|
930 |
+
hypofluorous_acid 0
|
931 |
+
adip 0
|
932 |
+
bromyl 0
|
933 |
+
arsino 0
|
934 |
+
tungstenio 0
|
935 |
+
tellurochromen 0
|
936 |
+
stibin 0
|
937 |
+
trisulfide 0
|
938 |
+
isoselenochromen 0
|
939 |
+
zircona 0
|
940 |
+
hexali 0
|
941 |
+
tetraphosphate 0
|
942 |
+
onamide 0
|
943 |
+
chloronia 0
|
944 |
+
thiochromenylium 0
|
945 |
+
phosphorus 0
|
946 |
+
titana 0
|
947 |
+
dicyclohexylurea 0
|
948 |
+
phenarsazinin 0
|
949 |
+
(8+) 0
|
950 |
+
nitroform 0
|
951 |
+
molybdenio 0
|
952 |
+
undecakis 0
|
953 |
+
rubicen 0
|
954 |
+
diselenid 0
|
955 |
+
triphosphat 0
|
956 |
+
diboron 0
|
957 |
+
trisulfid 0
|
958 |
+
hexadecakis 0
|
959 |
+
pleiaden 0
|
960 |
+
ter 0
|
961 |
+
arsonous_acid 0
|
962 |
+
ars 0
|
963 |
+
permangan 0
|
964 |
+
methoxychlor 0
|
965 |
+
tellurinyl 0
|
966 |
+
triacetamid 0
|
967 |
+
isocyanatid 0
|
968 |
+
(7+) 0
|
969 |
+
phthalazino 0
|
970 |
+
chloric_acid 0
|
971 |
+
stibon 0
|
972 |
+
tellone 0
|
973 |
+
stib 0
|
974 |
+
protactinium 0
|
975 |
+
fluor 0
|
976 |
+
arsonato 0
|
977 |
+
einsteinium 0
|
978 |
+
tellur 0
|
979 |
+
molybda 0
|
980 |
+
telluroxanthen 0
|
981 |
+
water 0
|
982 |
+
pentali 0
|
983 |
+
vanadio 0
|
984 |
+
formazan 0
|
985 |
+
ovalen 0
|
986 |
+
brom 0
|
987 |
+
thioxantheno 0
|
988 |
+
selenomorpholin 0
|
989 |
+
arsonium 0
|
990 |
+
nobelium 0
|
991 |
+
cinnolino 0
|
992 |
+
nitrid 0
|
993 |
+
telluropyrano 0
|
994 |
+
neo 0
|
995 |
+
tellurate 0
|
996 |
+
bromic_acid 0
|
997 |
+
phosphinolino 0
|
998 |
+
iodite 0
|
999 |
+
arsindol 0
|
1000 |
+
phosphen 0
|
1001 |
+
tribenzamid 0
|
1002 |
+
tellurium 0
|
1003 |
+
oxyl 0
|
1004 |
+
icosakis 0
|
1005 |
+
tellurat 0
|
1006 |
+
krypton 0
|
1007 |
+
bromite 0
|
1008 |
+
tridecakis 0
|
1009 |
+
all 0
|
1010 |
+
isotellurochromen 0
|
1011 |
+
diarsor 0
|
1012 |
+
bromosyl 0
|
1013 |
+
helium 0
|
1014 |
+
disulfite 0
|
1015 |
+
deuteride 0
|
1016 |
+
carboselenoyl 0
|
1017 |
+
bromoform 0
|
1018 |
+
trinaphthylen 0
|
1019 |
+
octali 0
|
1020 |
+
furano 0
|
1021 |
+
selenino 0
|
1022 |
+
iodic_acid 0
|
1023 |
+
hydrotelluro 0
|
1024 |
+
boronia 0
|
1025 |
+
phosphinolizin 0
|
1026 |
+
prism 0
|
1027 |
+
periodic_acid 0
|
1028 |
+
orot 0
|
1029 |
+
pentadecakis 0
|
1030 |
+
polonium 0
|
1031 |
+
hexasulfide 0
|
1032 |
+
stibono 0
|
1033 |
+
selenanthren 0
|
1034 |
+
ozone 0
|
1035 |
+
phosphindolizin 0
|
1036 |
+
urana 0
|
1037 |
+
pyridino 0
|
1038 |
+
phenotellurazin 0
|
1039 |
+
meitnerium 0
|
1040 |
+
tetrasulfid 0
|
1041 |
+
selenonia 0
|
1042 |
+
hypobromous_acid 0
|
1043 |
+
selenopyrano 0
|
1044 |
+
chlorat 0
|
1045 |
+
trifluoromethanesulfonimid 0
|
1046 |
+
seaborgium 0
|
1047 |
+
azor 0
|
1048 |
+
azonous_acid 0
|
1049 |
+
selenoph 0
|
1050 |
+
periodyl 0
|
1051 |
+
perbromate 0
|
1052 |
+
oson 0
|
1053 |
+
berkelium 0
|
1054 |
+
tungsta 0
|
1055 |
+
ribo 0
|
1056 |
+
pentaphosphate 0
|
1057 |
+
hafna 0
|
1058 |
+
telluropheno 0
|
1059 |
+
tellurite 0
|
1060 |
+
nitronium 0
|
1061 |
+
mon 0
|
1062 |
+
astata 0
|
1063 |
+
isothiocyanatid 0
|
1064 |
+
dubnium 0
|
1065 |
+
isothiochromenylium 0
|
1066 |
+
tellurin 0
|
1067 |
+
sodio 0
|
1068 |
+
selenono 0
|
1069 |
+
selenochromeno 0
|
1070 |
+
nitrosyl 0
|
1071 |
+
mendelevium 0
|
1072 |
+
ous 0
|
1073 |
+
neon 0
|
1074 |
+
fluoronio 0
|
1075 |
+
azid 0
|
1076 |
+
then 0
|
1077 |
+
stannanylia 0
|
1078 |
+
potassio 0
|
1079 |
+
phosphanthren 0
|
1080 |
+
disilic 0
|
1081 |
+
chlorazin 0
|
1082 |
+
titanio 0
|
1083 |
+
bromat 0
|
1084 |
+
triacontakis 0
|
1085 |
+
pentasulfide 0
|
1086 |
+
nonadecakis 0
|
1087 |
+
rhenio 0
|
1088 |
+
platina 0
|
1089 |
+
phenoxatellurin 0
|
1090 |
+
pentazocine 0
|
1091 |
+
ferrio 0
|
1092 |
+
cos 0
|
1093 |
+
vanada 0
|
1094 |
+
triselenid 0
|
1095 |
+
telluronyl 0
|
1096 |
+
tellurocyanate 0
|
1097 |
+
pentazocin 0
|
1098 |
+
fulven 0
|
1099 |
+
distibor 0
|
1100 |
+
diphosphite 0
|
1101 |
+
radon 0
|
1102 |
+
pentathion 0
|
1103 |
+
nitrous_oxide 0
|
1104 |
+
ferra 0
|
1105 |
+
ditelluron 0
|
1106 |
+
bis(trifluoromethylsulfonyl)imid 0
|
1107 |
+
acridino 0
|
1108 |
+
telluron 0
|
1109 |
+
isophosphinolino 0
|
1110 |
+
diselenon 0
|
1111 |
+
diarson 0
|
1112 |
+
stibanuida 0
|
1113 |
+
germano 0
|
1114 |
+
xanthylium 0
|
1115 |
+
tert-butyl(dimethyl)silanyl 0
|
1116 |
+
radium 0
|
1117 |
+
osma 0
|
1118 |
+
chlorous_acid 0
|
1119 |
+
bromonio 0
|
1120 |
+
arsonia 0
|
1121 |
+
arsinolin 0
|
1122 |
+
amate 0
|
1123 |
+
urazol 0
|
1124 |
+
triphosphor 0
|
1125 |
+
nonali 0
|
1126 |
+
deutero 0
|
1127 |
+
nioba 0
|
1128 |
+
acridarsin 0
|
1129 |
+
yttrio 0
|
1130 |
+
tert-butyl-dimethylsilyl 0
|
1131 |
+
pyrimidino 0
|
1132 |
+
pteridino 0
|
1133 |
+
phenoxaselenin 0
|
1134 |
+
isocyanid 0
|
1135 |
+
irida 0
|
1136 |
+
heptadecakis 0
|
1137 |
+
bohrium 0
|
1138 |
+
pentacosakis 0
|
1139 |
+
octadecakis 0
|
1140 |
+
thianthreno 0
|
1141 |
+
telluroph 0
|
1142 |
+
t- 0
|
1143 |
+
isophosphindolo 0
|
1144 |
+
isoarsindol 0
|
1145 |
+
henicosakis 0
|
1146 |
+
(4-) 0
|
1147 |
+
ruthena 0
|
1148 |
+
heptali 0
|
1149 |
+
arsen 0
|
1150 |
+
telluranthren 0
|
1151 |
+
chryseno 0
|
1152 |
+
carbotelluroyl 0
|
1153 |
+
quinolizino 0
|
1154 |
+
nonacosakis 0
|
1155 |
+
francium 0
|
1156 |
+
ethion 0
|
1157 |
+
chroma 0
|
1158 |
+
arsanthridin 0
|
1159 |
+
arsanthren 0
|
1160 |
+
tricosakis 0
|
1161 |
+
tetraphosphor 0
|
1162 |
+
tetracosakis 0
|
1163 |
+
tellurocyanat 0
|
1164 |
+
stibonia 0
|
1165 |
+
stibonato 0
|
1166 |
+
phosphanuida 0
|
1167 |
+
phenoxathiino 0
|
1168 |
+
manganio 0
|
1169 |
+
eicosa 0
|
1170 |
+
cobaltio 0
|
1171 |
+
cera 0
|
1172 |
+
amic_acid 0
|
1173 |
+
stibino 0
|
1174 |
+
stannanuida 0
|
1175 |
+
samario 0
|
1176 |
+
s-indaceno 0
|
1177 |
+
praseodymio 0
|
1178 |
+
phenoxastibinin 0
|
1179 |
+
pallada 0
|
1180 |
+
neodymio 0
|
1181 |
+
isoselenocyanate 0
|
1182 |
+
germanuida 0
|
1183 |
+
diazoamino 0
|
1184 |
+
telluronia 0
|
1185 |
+
tantalio 0
|
1186 |
+
phenoxyl 0
|
1187 |
+
phenothiarsinin 0
|
1188 |
+
oxanthreno 0
|
1189 |
+
octacosakis 0
|
1190 |
+
mangana 0
|
1191 |
+
lanthanio 0
|
1192 |
+
isoarsinolin 0
|
1193 |
+
indan 0
|
1194 |
+
hexacosakis 0
|
1195 |
+
hassium 0
|
1196 |
+
arsinolizin 0
|
1197 |
+
alli 0
|
1198 |
+
thioxanth 0
|
1199 |
+
tert-butyl(diphenyl)silanyl 0
|
1200 |
+
stronta 0
|
1201 |
+
stannano 0
|
1202 |
+
rhodio 0
|
1203 |
+
rhoda 0
|
1204 |
+
praseodyma 0
|
1205 |
+
phenazino 0
|
1206 |
+
pentaphosphat 0
|
1207 |
+
nitric 0
|
1208 |
+
methoxyl 0
|
1209 |
+
magnesio 0
|
1210 |
+
dichrom 0
|
1211 |
+
chlorazine 0
|
1212 |
+
californa 0
|
1213 |
+
butoxyl 0
|
1214 |
+
bromous_acid 0
|
1215 |
+
azonic_acid 0
|
1216 |
+
arsinolino 0
|
1217 |
+
arsindolo 0
|
1218 |
+
arsindolizin 0
|
1219 |
+
allo 0
|
1220 |
+
actina 0
|
1221 |
+
uronic_acid 0
|
1222 |
+
thora 0
|
1223 |
+
telluromorpholin 0
|
1224 |
+
stibonium 0
|
1225 |
+
stibano 0
|
1226 |
+
rhena 0
|
1227 |
+
phosphinolizino 0
|
1228 |
+
phenothiazino 0
|
1229 |
+
perbromyl 0
|
1230 |
+
niobio 0
|
1231 |
+
nickelio 0
|
1232 |
+
isotellurochromeno 0
|
1233 |
+
isoselenocyanato 0
|
1234 |
+
iodous_acid 0
|
1235 |
+
iodous 0
|
1236 |
+
hydroselenonyl 0
|
1237 |
+
dysprosio 0
|
1238 |
+
cyclopenta[a]phenanthr 0
|
1239 |
+
cerio 0
|
1240 |
+
bara 0
|
1241 |
+
aurio 0
|
1242 |
+
arsanuida 0
|
1243 |
+
ytterbio 0
|
1244 |
+
uronate 0
|
1245 |
+
tol 0
|
1246 |
+
thulio 0
|
1247 |
+
tert-butyl-diphenylsilyl 0
|
1248 |
+
tellurono 0
|
1249 |
+
stannanida 0
|
1250 |
+
scandio 0
|
1251 |
+
propoxyl 0
|
1252 |
+
periodic 0
|
1253 |
+
perbromic_acid 0
|
1254 |
+
nitror 0
|
1255 |
+
lutetio 0
|
1256 |
+
isothiocyanic_acid 0
|
1257 |
+
iridio 0
|
1258 |
+
iodic 0
|
1259 |
+
hypobor 0
|
1260 |
+
hydroxyl 0
|
1261 |
+
hydroseleninyl 0
|
1262 |
+
holmio 0
|
1263 |
+
hexasulfid 0
|
1264 |
+
heptacosakis 0
|
1265 |
+
gadolinio 0
|
1266 |
+
europio 0
|
1267 |
+
ethoxyl 0
|
1268 |
+
erbio 0
|
1269 |
+
docosakis 0
|
1270 |
+
chlorous 0
|
1271 |
+
chloric 0
|
1272 |
+
arsinimyl 0
|
1273 |
+
argentio 0
|
1274 |
+
▁ -0.24368
|
1275 |
+
c -3.77761
|
1276 |
+
m -3.81933
|
1277 |
+
t -4.1484
|
1278 |
+
p -4.28552
|
1279 |
+
n -4.34236
|
1280 |
+
u -4.43826
|
1281 |
+
s -4.52053
|
1282 |
+
i -4.6648
|
1283 |
+
is -4.8052
|
1284 |
+
g -4.85455
|
1285 |
+
x -5.02503
|
1286 |
+
y -5.19016
|
1287 |
+
h -5.25276
|
1288 |
+
b -5.25733
|
1289 |
+
v -5.50657
|
1290 |
+
th -5.56431
|
1291 |
+
f -5.60089
|
1292 |
+
ph -5.65809
|
1293 |
+
hy -5.71657
|
1294 |
+
▁p -6.08895
|
1295 |
+
cy -6.12699
|
1296 |
+
yc -6.28409
|
1297 |
+
im -6.3188
|
1298 |
+
ti -6.4861
|
1299 |
+
ch -6.53742
|
1300 |
+
ut -6.55604
|
1301 |
+
cys -6.59438
|
1302 |
+
st -6.61931
|
1303 |
+
▁h -6.69232
|
1304 |
+
pi -6.72852
|
1305 |
+
uc -6.85542
|
1306 |
+
us -6.89267
|
1307 |
+
▁b -6.96641
|
1308 |
+
▁g -6.99289
|
1309 |
+
▁c -7.03458
|
1310 |
+
ys -7.04986
|
1311 |
+
ct -7.06609
|
1312 |
+
▁hy -7.10659
|
1313 |
+
gu -7.12486
|
1314 |
+
sp -7.1249
|
1315 |
+
xy -7.2108
|
1316 |
+
▁s -7.3108
|
1317 |
+
yp -7.394
|
1318 |
+
um -7.39798
|
1319 |
+
xim -7.47115
|
1320 |
+
thy -7.52489
|
1321 |
+
ps -7.53214
|
1322 |
+
fu -7.86517
|
1323 |
+
▁cy -7.98841
|
1324 |
+
mph -7.99202
|
1325 |
+
▁n -8.03554
|
1326 |
+
ni -8.04807
|
1327 |
+
▁m -8.12601
|
1328 |
+
nth -8.18462
|
1329 |
+
cu -8.19705
|
1330 |
+
phth -8.20839
|
1331 |
+
ip -8.32472
|
1332 |
+
▁f -8.36171
|
1333 |
+
ty -8.47003
|
1334 |
+
▁cu -8.49492
|
1335 |
+
ym -8.59996
|
1336 |
+
ff -8.60659
|
1337 |
+
uf -8.65435
|
1338 |
+
fi -8.70783
|
1339 |
+
pt -8.74056
|
1340 |
+
tun -8.78867
|
1341 |
+
yt -8.80236
|
1342 |
+
▁ch -8.81859
|
1343 |
+
▁ps -9.02055
|
1344 |
+
▁sty -9.02375
|
1345 |
+
▁phyt -9.0593
|
1346 |
+
ub -9.1473
|
1347 |
+
mb -9.15357
|
1348 |
+
▁fu -9.19661
|
1349 |
+
if -9.24761
|
1350 |
+
ci -9.28944
|
1351 |
+
▁sym -9.29607
|
1352 |
+
ss -9.31017
|
1353 |
+
up -9.34393
|
1354 |
+
sty -9.34753
|
1355 |
+
▁t -9.40241
|
1356 |
+
pp -9.46886
|
1357 |
+
mi -9.51896
|
1358 |
+
gn -9.58869
|
1359 |
+
ms -9.85318
|
1360 |
+
▁pi -9.85785
|
1361 |
+
ist -9.89882
|
1362 |
+
tig -9.95137
|
1363 |
+
▁thy -10.0245
|
1364 |
+
vii -10.0685
|
1365 |
+
hi -10.077
|
1366 |
+
sym -10.0864
|
1367 |
+
▁sub -10.1129
|
1368 |
+
ptu -10.1771
|
1369 |
+
cti -10.2664
|
1370 |
+
ig -10.5468
|
1371 |
+
tu -10.5569
|
1372 |
+
▁fuc -10.6338
|
1373 |
+
▁sy -10.726
|
1374 |
+
▁th -10.8515
|
1375 |
+
uv -10.9123
|
1376 |
+
si -10.9398
|
1377 |
+
▁cys -11.0937
|
1378 |
+
bu -11.3456
|
1379 |
+
mu -11.3477
|
1380 |
+
vi -11.4565
|
1381 |
+
mp -11.4617
|
1382 |
+
ib -11.5026
|
1383 |
+
pu -11.5547
|
1384 |
+
▁i -11.5794
|
1385 |
+
▁bu -11.6761
|
1386 |
+
▁gu -11.6864
|
1387 |
+
▁mu -11.7005
|
1388 |
+
▁st -11.7307
|
1389 |
+
un -11.844
|
1390 |
+
uct -11.8441
|
1391 |
+
▁u -12.046
|
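The listing above is the SentencePiece unigram vocabulary behind iupac_spm.model: the first block of pieces are hand-specified IUPAC morphemes (multipliers, ring systems, suffixes, element names) with score 0, and the remaining pieces are learned fallback fragments with negative log-probabilities. A small inspection sketch, assuming the real iupac_spm.model has been fetched via git-lfs (the entry above is only an LFS pointer) and that sentencepiece is installed:

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="iupac-gpt/iupac_gpt/iupac_spm.model")
print(sp.get_piece_size())                                # should report 1391, matching this listing
print(sp.encode("2-methylpropan-2-ol", out_type=str))     # expect morpheme-level pieces such as 'meth', 'prop', 'ol'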
iupac-gpt/iupac_gpt/iupac_tokenization.py
ADDED
@@ -0,0 +1,131 @@
1 |
+
from transformers import (
|
2 |
+
AdamW,
|
3 |
+
DataCollatorWithPadding,
|
4 |
+
HfArgumentParser,
|
5 |
+
T5Config,
|
6 |
+
T5ForConditionalGeneration,
|
7 |
+
T5Tokenizer,
|
8 |
+
Trainer,
|
9 |
+
TrainingArguments,
|
10 |
+
)
|
11 |
+
from torch.utils.data import DataLoader
|
12 |
+
import os
|
13 |
+
import tempfile
|
14 |
+
import re
|
15 |
+
import pandas as pd
|
16 |
+
import numpy as np
|
17 |
+
from typing import Dict, Optional
|
18 |
+
from dataclasses import dataclass, field
|
19 |
+
import logging
|
20 |
+
|
21 |
+
import torch
|
22 |
+
from torch.nn.utils.rnn import pad_sequence
|
23 |
+
from torch.optim.lr_scheduler import LambdaLR
|
24 |
+
import os.path as pt
|
25 |
+
import torch.optim as optim
|
26 |
+
import torch.nn as nn
|
27 |
+
from tqdm import tqdm
|
28 |
+
from torch.autograd import Variable
|
29 |
+
from .iupac_dataset import IUPACDataset
|
30 |
+
import os
|
31 |
+
#os.environ["CUDA_VISIBLE_DEVICES"]="0"
|
32 |
+
|
33 |
+
|
34 |
+
class T5Collator:
|
35 |
+
def __init__(self, pad_token_id):
|
36 |
+
super().__init__()
|
37 |
+
self.pad_token_id = pad_token_id
|
38 |
+
def __call__(self, records):
|
39 |
+
# records is a list of dicts
|
40 |
+
batch = {}
|
41 |
+
padvals = {"input_ids": self.pad_token_id,'labels':-100}
|
42 |
+
for k in records[0]:
|
43 |
+
if k in padvals:
|
44 |
+
batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],
|
45 |
+
batch_first=True,
|
46 |
+
padding_value=padvals[k])
|
47 |
+
else:
|
48 |
+
batch[k] = torch.FloatTensor([r[k] for r in records]) #torch.Tensor
|
49 |
+
return batch
|
50 |
+
|
51 |
+
class T5IUPACTokenizer(T5Tokenizer):
|
52 |
+
def prepare_for_tokenization(self, text, is_split_into_words=False,
|
53 |
+
**kwargs):
|
54 |
+
return re.sub(" ", "_", text), kwargs
|
55 |
+
|
56 |
+
def _decode(self, *args, **kwargs):
|
57 |
+
# replace "_" with " ", except for the _ in extra_id_#
|
58 |
+
text = super()._decode(*args, **kwargs)
|
59 |
+
text = re.sub("extra_id_", "extraAidA", text)
|
60 |
+
text = re.sub("_", " ", text)
|
61 |
+
text = re.sub("extraAidA", "extra_id_", text)
|
62 |
+
return text
|
63 |
+
|
64 |
+
def sentinels(self, sentinel_ids):
|
65 |
+
return self.vocab_size - sentinel_ids - 1
|
66 |
+
|
67 |
+
def sentinel_mask(self, ids):
|
68 |
+
return ((self.vocab_size - self._extra_ids <= ids) &
|
69 |
+
(ids < self.vocab_size))
|
70 |
+
|
71 |
+
def _tokenize(self, text, sample=False):
|
72 |
+
#pieces = super()._tokenize(text, sample=sample)
|
73 |
+
pieces = super()._tokenize(text)
|
74 |
+
# sentencepiece adds a non-printing token at the start. Remove it
|
75 |
+
return pieces[1:]
|
76 |
+
|
77 |
+
def prepare_input(data,device):
|
78 |
+
from collections.abc import Mapping
|
79 |
+
if isinstance(data, Mapping):
|
80 |
+
return type(data)({k: prepare_input(v,device) for k, v in data.items()})
|
81 |
+
elif isinstance(data, (tuple, list)):
|
82 |
+
return type(data)(prepare_input(v,device) for v in data)
|
83 |
+
elif isinstance(data, torch.Tensor):
|
84 |
+
kwargs = dict(device=device)
|
85 |
+
if data.dtype != torch.int64:
|
86 |
+
# NLP models inputs are int64 and those get adjusted to the right dtype of the
|
87 |
+
# embedding. Other models such as wav2vec2's inputs are already float and thus
|
88 |
+
# may need special handling to match the dtypes of the model
|
89 |
+
kwargs.update(dict(dtype=torch.int64))
|
90 |
+
|
91 |
+
return data.to(**kwargs)
|
92 |
+
return data
|
93 |
+
|
94 |
+
def get_data_loader(is_train=1):
|
95 |
+
|
96 |
+
full_path = '/home/jmwang/drugai/iupac-gpt/iupac_gpt/'
|
97 |
+
|
98 |
+
iupac_tokenizer = T5IUPACTokenizer(vocab_file=full_path+'iupac_spm.model')
|
99 |
+
iupac_vocab_size = iupac_tokenizer.vocab_size
|
100 |
+
print('iupac_vocab_size:',iupac_vocab_size)
|
101 |
+
if is_train:
|
102 |
+
torch.save(iupac_tokenizer, pt.join(full_path,"real_iupac_tokenizer.pt"))
|
103 |
+
print("training...",len(iupac_tokenizer))
|
104 |
+
else:
|
105 |
+
iupac_tokenizer = torch.load(pt.join(full_path,"real_iupac_tokenizer.pt"), map_location="cpu")
|
106 |
+
print('fine_tune...', len(iupac_tokenizer))
|
107 |
+
|
108 |
+
dataset_filename = 'data/pubchem_iupac_smile_gpt.csv'
|
109 |
+
target_col = "aLogP"
|
110 |
+
iupac_name_col = 'PUBCHEM_IUPAC_NAME' #canon_smiles
|
111 |
+
MAXLEN=1024
|
112 |
+
dataset_kwargs = {"dataset_dir":'/home/jmwang/drugai/iupac-gpt',"dataset_filename": dataset_filename,"tokenizer": iupac_tokenizer,"max_length": MAXLEN,"target_col": target_col,'dataset_size':None,"iupac_name_col":iupac_name_col}
|
113 |
+
train_dataset = IUPACDataset(**dataset_kwargs)
|
114 |
+
collator = T5Collator(iupac_tokenizer.pad_token_id)
|
115 |
+
train_dataloader = DataLoader(train_dataset,batch_size=64,collate_fn=collator,shuffle=True)
|
116 |
+
|
117 |
+
return train_dataloader,iupac_tokenizer
|
118 |
+
|
119 |
+
if __name__ == "__main__":
|
120 |
+
|
121 |
+
train_dataloader,iupac_tokenizer = get_data_loader(is_train=1)
|
122 |
+
pbar = tqdm(train_dataloader)
|
123 |
+
device = 'cpu'
|
124 |
+
for inputs in pbar:
|
125 |
+
|
126 |
+
src_label = Variable(inputs["labels"].to(device))
|
127 |
+
inputs = prepare_input(inputs,device)
|
128 |
+
src = Variable(inputs["input_ids"].to(device))
|
129 |
+
#self.tokenizer._convert_token_to_id
|
130 |
+
|
131 |
+
print(src[:,:].shape,src_label)
|
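The T5Collator above right-pads input_ids with the tokenizer's pad id and pads labels with -100, the index that PyTorch's cross-entropy loss ignores by default, while any key without a pad value is simply stacked into a float tensor. A toy batch makes the padding behaviour concrete (the pad id of 0 is just an illustrative stand-in for iupac_tokenizer.pad_token_id):

import torch
from torch.nn.utils.rnn import pad_sequence

records = [
    {"input_ids": [5, 8, 9], "labels": [5, 8, 9]},
    {"input_ids": [7, 4],    "labels": [7, 4]},
]
padvals = {"input_ids": 0, "labels": -100}   # pad id 0 is an assumption for this sketch

batch = {
    k: pad_sequence([torch.tensor(r[k]) for r in records],
                    batch_first=True, padding_value=padvals[k])
    for k in records[0]
}
print(batch["input_ids"])   # tensor([[5, 8, 9], [7, 4, 0]])
print(batch["labels"])      # tensor([[   5,    8,    9], [   7,    4, -100]])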
iupac-gpt/iupac_gpt/iupac_tokenization_class.py
ADDED
@@ -0,0 +1,131 @@
1 |
+
from transformers import (
|
2 |
+
AdamW,
|
3 |
+
DataCollatorWithPadding,
|
4 |
+
HfArgumentParser,
|
5 |
+
T5Config,
|
6 |
+
T5ForConditionalGeneration,
|
7 |
+
T5Tokenizer,
|
8 |
+
Trainer,
|
9 |
+
TrainingArguments,
|
10 |
+
)
|
11 |
+
from torch.utils.data import DataLoader
|
12 |
+
import os
|
13 |
+
import tempfile
|
14 |
+
import re
|
15 |
+
import pandas as pd
|
16 |
+
import numpy as np
|
17 |
+
from typing import Dict, Optional
|
18 |
+
from dataclasses import dataclass, field
|
19 |
+
import logging
|
20 |
+
|
21 |
+
import torch
|
22 |
+
from torch.nn.utils.rnn import pad_sequence
|
23 |
+
from torch.optim.lr_scheduler import LambdaLR
|
24 |
+
import os.path as pt
|
25 |
+
import torch.optim as optim
|
26 |
+
import torch.nn as nn
|
27 |
+
from tqdm import tqdm
|
28 |
+
from torch.autograd import Variable
|
29 |
+
from .iupac_dataset_class import IUPACDataset
|
30 |
+
import os
|
31 |
+
#os.environ["CUDA_VISIBLE_DEVICES"]="0"
|
32 |
+
|
33 |
+
|
34 |
+
class T5Collator:
|
35 |
+
def __init__(self, pad_token_id):
|
36 |
+
super().__init__()
|
37 |
+
self.pad_token_id = pad_token_id
|
38 |
+
def __call__(self, records):
|
39 |
+
# records is a list of dicts
|
40 |
+
batch = {}
|
41 |
+
padvals = {"input_ids": self.pad_token_id,'attention_mask':0}
|
42 |
+
for k in records[0]:
|
43 |
+
if k in padvals:
|
44 |
+
batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],
|
45 |
+
batch_first=True,
|
46 |
+
padding_value=padvals[k])
|
47 |
+
else:
|
48 |
+
batch[k] = torch.LongTensor([r[k] for r in records]) #torch.Tensor LongTensor FloatTensor
|
49 |
+
return batch
|
50 |
+
|
51 |
+
class T5IUPACTokenizer(T5Tokenizer):
|
52 |
+
def prepare_for_tokenization(self, text, is_split_into_words=False,
|
53 |
+
**kwargs):
|
54 |
+
return re.sub(" ", "_", text), kwargs
|
55 |
+
|
56 |
+
def _decode(self, *args, **kwargs):
|
57 |
+
# replace "_" with " ", except for the _ in extra_id_#
|
58 |
+
text = super()._decode(*args, **kwargs)
|
59 |
+
text = re.sub("extra_id_", "extraAidA", text)
|
60 |
+
text = re.sub("_", " ", text)
|
61 |
+
text = re.sub("extraAidA", "extra_id_", text)
|
62 |
+
return text
|
63 |
+
|
64 |
+
def sentinels(self, sentinel_ids):
|
65 |
+
return self.vocab_size - sentinel_ids - 1
|
66 |
+
|
67 |
+
def sentinel_mask(self, ids):
|
68 |
+
return ((self.vocab_size - self._extra_ids <= ids) &
|
69 |
+
(ids < self.vocab_size))
|
70 |
+
|
71 |
+
def _tokenize(self, text, sample=False):
|
72 |
+
#pieces = super()._tokenize(text, sample=sample)
|
73 |
+
pieces = super()._tokenize(text)
|
74 |
+
# sentencepiece adds a non-printing token at the start. Remove it
|
75 |
+
return pieces[1:]
|
76 |
+
|
77 |
+
def prepare_input_class(data,device):
|
78 |
+
from collections.abc import Mapping
|
79 |
+
if isinstance(data, Mapping):
|
80 |
+
return type(data)({k: prepare_input_class(v,device) for k, v in data.items()})
|
81 |
+
elif isinstance(data, (tuple, list)):
|
82 |
+
return type(data)(prepare_input_class(v,device) for v in data)
|
83 |
+
elif isinstance(data, torch.Tensor):
|
84 |
+
kwargs = dict(device=device)
|
85 |
+
if data.dtype != torch.int64:
|
86 |
+
# NLP models inputs are int64 and those get adjusted to the right dtype of the
|
87 |
+
# embedding. Other models such as wav2vec2's inputs are already float and thus
|
88 |
+
# may need special handling to match the dtypes of the model
|
89 |
+
kwargs.update(dict(dtype=torch.int64))
|
90 |
+
|
91 |
+
return data.to(**kwargs)
|
92 |
+
return data
|
93 |
+
|
94 |
+
def get_data_loader_class(is_train=1):
|
95 |
+
|
96 |
+
full_path = '/root/autodl-tmp/wjm/iupac-gpt/iupac_gpt/'
|
97 |
+
|
98 |
+
iupac_tokenizer = T5IUPACTokenizer(vocab_file=full_path+'iupac_spm.model')
|
99 |
+
iupac_vocab_size = iupac_tokenizer.vocab_size
|
100 |
+
print('iupac_vocab_size:',iupac_vocab_size)
|
101 |
+
if is_train:
|
102 |
+
torch.save(iupac_tokenizer, pt.join(full_path,"real_iupac_tokenizer.pt"))
|
103 |
+
print("training...",len(iupac_tokenizer))
|
104 |
+
else:
|
105 |
+
iupac_tokenizer = torch.load(pt.join(full_path,"real_iupac_tokenizer.pt"), map_location="cpu")
|
106 |
+
print('fine_tune...', len(iupac_tokenizer))
|
107 |
+
|
108 |
+
dataset_filename = 'iupacs_logp.csv' #'./pubchem_iupac_smile_gpt.csv'
|
109 |
+
target_col = "LogP" #"aLogP"
|
110 |
+
iupac_name_col = 'iupac' #'PUBCHEM_IUPAC_NAME'
|
111 |
+
MAXLEN=1024
|
112 |
+
dataset_kwargs = {"dataset_dir":full_path,"dataset_filename": dataset_filename,"tokenizer": iupac_tokenizer,"max_length": MAXLEN,"target_col": target_col,'dataset_size':None,"iupac_name_col":iupac_name_col}
|
113 |
+
train_dataset = IUPACDataset(**dataset_kwargs)
|
114 |
+
collator = T5Collator(iupac_tokenizer.pad_token_id)
|
115 |
+
train_dataloader = DataLoader(train_dataset,batch_size=64,collate_fn=collator,shuffle=True)
|
116 |
+
|
117 |
+
return train_dataloader,iupac_tokenizer
|
118 |
+
|
119 |
+
if __name__ == "__main__":
|
120 |
+
|
121 |
+
train_dataloader,iupac_tokenizer = get_data_loader_class(is_train=1)
|
122 |
+
pbar = tqdm(train_dataloader)
|
123 |
+
device = 'cpu'
|
124 |
+
for inputs in pbar:
|
125 |
+
|
126 |
+
src_label = Variable(inputs["labels"].to(device))
|
127 |
+
inputs = prepare_input_class(inputs,device)
|
128 |
+
src = Variable(inputs["input_ids"].to(device))
|
129 |
+
#self.tokenizer._convert_token_to_id
|
130 |
+
|
131 |
+
print(src[:,:].shape,src_label)
|
iupac-gpt/iupac_gpt/iupac_tokenization_iupac.py
ADDED
@@ -0,0 +1,134 @@
1 |
+
from transformers import (
|
2 |
+
AdamW,
|
3 |
+
DataCollatorWithPadding,
|
4 |
+
HfArgumentParser,
|
5 |
+
T5Config,
|
6 |
+
T5ForConditionalGeneration,
|
7 |
+
T5Tokenizer,
|
8 |
+
Trainer,
|
9 |
+
TrainingArguments,
|
10 |
+
)
|
11 |
+
from torch.utils.data import DataLoader
|
12 |
+
import os
|
13 |
+
import tempfile
|
14 |
+
import re
|
15 |
+
import pandas as pd
|
16 |
+
import numpy as np
|
17 |
+
from typing import Dict, Optional
|
18 |
+
from dataclasses import dataclass, field
|
19 |
+
import logging
|
20 |
+
|
21 |
+
import torch
|
22 |
+
from torch.nn.utils.rnn import pad_sequence
|
23 |
+
from torch.optim.lr_scheduler import LambdaLR
|
24 |
+
import os.path as pt
|
25 |
+
import torch.optim as optim
|
26 |
+
import torch.nn as nn
|
27 |
+
from tqdm import tqdm
|
28 |
+
from torch.autograd import Variable
|
29 |
+
from .iupac_dataset import IUPACDataset
|
30 |
+
import os
|
31 |
+
#os.environ["CUDA_VISIBLE_DEVICES"]="0"
|
32 |
+
|
33 |
+
|
34 |
+
class T5Collator:
|
35 |
+
def __init__(self, pad_token_id):
|
36 |
+
super().__init__()
|
37 |
+
self.pad_token_id = pad_token_id
|
38 |
+
def __call__(self, records):
|
39 |
+
# records is a list of dicts
|
40 |
+
batch = {}
|
41 |
+
padvals = {"input_ids": self.pad_token_id,'attention_mask':0,'labels':-100}
|
42 |
+
for k in records[0]:
|
43 |
+
if k in padvals:
|
44 |
+
batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],
|
45 |
+
batch_first=True,
|
46 |
+
padding_value=padvals[k])
|
47 |
+
else:
|
48 |
+
batch[k] = torch.FloatTensor([r[k] for r in records]) #torch.Tensor
|
49 |
+
return batch
|
50 |
+
|
51 |
+
class T5IUPACTokenizer(T5Tokenizer):
|
52 |
+
def prepare_for_tokenization(self, text, is_split_into_words=False,
|
53 |
+
**kwargs):
|
54 |
+
return re.sub(" ", "_", text), kwargs
|
55 |
+
|
56 |
+
def _decode(self, *args, **kwargs):
|
57 |
+
# replace "_" with " ", except for the _ in extra_id_#
|
58 |
+
text = super()._decode(*args, **kwargs)
|
59 |
+
text = re.sub("extra_id_", "extraAidA", text)
|
60 |
+
text = re.sub("_", " ", text)
|
61 |
+
text = re.sub("extraAidA", "extra_id_", text)
|
62 |
+
return text
|
63 |
+
|
64 |
+
def sentinels(self, sentinel_ids):
|
65 |
+
return self.vocab_size - sentinel_ids - 1
|
66 |
+
|
67 |
+
def sentinel_mask(self, ids):
|
68 |
+
return ((self.vocab_size - self._extra_ids <= ids) &
|
69 |
+
(ids < self.vocab_size))
|
70 |
+
|
71 |
+
def _tokenize(self, text, sample=False):
|
72 |
+
#pieces = super()._tokenize(text, sample=sample)
|
73 |
+
pieces = super()._tokenize(text)
|
74 |
+
# sentencepiece adds a non-printing token at the start. Remove it
|
75 |
+
return pieces[1:]
|
76 |
+
|
77 |
+
def prepare_input(data,device):
|
78 |
+
from collections.abc import Mapping
|
79 |
+
if isinstance(data, Mapping):
|
80 |
+
return type(data)({k: prepare_input(v,device) for k, v in data.items()})
|
81 |
+
elif isinstance(data, (tuple, list)):
|
82 |
+
return type(data)(prepare_input(v,device) for v in data)
|
83 |
+
elif isinstance(data, torch.Tensor):
|
84 |
+
kwargs = dict(device=device)
|
85 |
+
if data.dtype != torch.int64:
|
86 |
+
# NLP models inputs are int64 and those get adjusted to the right dtype of the
|
87 |
+
# embedding. Other models such as wav2vec2's inputs are already float and thus
|
88 |
+
# may need special handling to match the dtypes of the model
|
89 |
+
kwargs.update(dict(dtype=torch.int64))
|
90 |
+
|
91 |
+
return data.to(**kwargs)
|
92 |
+
return data
|
93 |
+
|
94 |
+
def get_data_loader(is_train=1,dataset_filename = './pubchem_iupac_smile_gpt.csv'):
|
95 |
+
|
96 |
+
full_path = '/home/jmwang/drugai/iupac-gpt/iupac_gpt/'
|
97 |
+
|
98 |
+
iupac_tokenizer = T5IUPACTokenizer(vocab_file=full_path+'iupac_spm.model')
|
99 |
+
iupac_vocab_size = iupac_tokenizer.vocab_size
|
100 |
+
print('iupac_vocab_size:',iupac_vocab_size)
|
101 |
+
if is_train:
|
102 |
+
torch.save(iupac_tokenizer, pt.join(full_path,"real_iupac_tokenizer.pt"))
|
103 |
+
print("training...",len(iupac_tokenizer))
|
104 |
+
else:
|
105 |
+
iupac_tokenizer = torch.load(pt.join(full_path,"real_iupac_tokenizer.pt"), map_location="cpu")
|
106 |
+
print('fine_tune...', len(iupac_tokenizer))
|
107 |
+
|
108 |
+
target_col = "aLogP"
|
109 |
+
iupac_name_col = 'PUBCHEM_IUPAC_NAME'
|
110 |
+
MAXLEN=1024
|
111 |
+
dataset_kwargs = {"dataset_dir":full_path,"dataset_filename": dataset_filename,"tokenizer": iupac_tokenizer,"max_length": MAXLEN,"target_col": target_col,'dataset_size':None,"iupac_name_col":iupac_name_col}
|
112 |
+
train_dataset = IUPACDataset(**dataset_kwargs)
|
113 |
+
|
114 |
+
#for i in train_dataset:
|
115 |
+
# train_dataset[i]=train_dataset[i].to(device)
|
116 |
+
|
117 |
+
collator = T5Collator(iupac_tokenizer.pad_token_id)
|
118 |
+
train_dataloader = DataLoader(train_dataset,batch_size=64,collate_fn=collator,shuffle=True)
|
119 |
+
|
120 |
+
return train_dataloader,iupac_tokenizer
|
121 |
+
|
122 |
+
if __name__ == "__main__":
|
123 |
+
|
124 |
+
train_dataloader,iupac_tokenizer = get_data_loader(is_train=1)
|
125 |
+
pbar = tqdm(train_dataloader)
|
126 |
+
device = 'cpu'
|
127 |
+
for inputs in pbar:
|
128 |
+
|
129 |
+
src_label = Variable(inputs["labels"].to(device))
|
130 |
+
inputs = prepare_input(inputs,device)
|
131 |
+
src = Variable(inputs["input_ids"].to(device))
|
132 |
+
#self.tokenizer._convert_token_to_id
|
133 |
+
|
134 |
+
print(src[:,:].shape,src_label)
|
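A minimal usage sketch for the T5IUPACTokenizer defined above (not one of the committed files; the import path and vocab location are assumptions based on this commit's layout):

# Hedged sketch: the space<->underscore round-trip implemented by T5IUPACTokenizer.
from iupac_gpt.iupac_tokenization_class import T5IUPACTokenizer

tokenizer = T5IUPACTokenizer(vocab_file="iupac_gpt/iupac_spm.model")

name = "2-methylpropanoic acid"            # the space is mapped to "_" by prepare_for_tokenization
pieces = tokenizer.tokenize(name)          # SentencePiece pieces; _tokenize drops the leading marker piece
ids = tokenizer(name)["input_ids"]
print(pieces)
print(tokenizer.decode(ids, skip_special_tokens=True))   # _decode maps "_" back to " "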
iupac-gpt/iupac_gpt/iupac_tokenization_pro.py
ADDED
@@ -0,0 +1,131 @@
1 |
+
from transformers import (
|
2 |
+
AdamW,
|
3 |
+
DataCollatorWithPadding,
|
4 |
+
HfArgumentParser,
|
5 |
+
T5Config,
|
6 |
+
T5ForConditionalGeneration,
|
7 |
+
T5Tokenizer,
|
8 |
+
Trainer,
|
9 |
+
TrainingArguments,
|
10 |
+
)
|
11 |
+
from torch.utils.data import DataLoader
|
12 |
+
import os
|
13 |
+
import tempfile
|
14 |
+
import re
|
15 |
+
import pandas as pd
|
16 |
+
import numpy as np
|
17 |
+
from typing import Dict, Optional
|
18 |
+
from dataclasses import dataclass, field
|
19 |
+
import logging
|
20 |
+
|
21 |
+
import torch
|
22 |
+
from torch.nn.utils.rnn import pad_sequence
|
23 |
+
from torch.optim.lr_scheduler import LambdaLR
|
24 |
+
import os.path as pt
|
25 |
+
import torch.optim as optim
|
26 |
+
import torch.nn as nn
|
27 |
+
from tqdm import tqdm
|
28 |
+
from torch.autograd import Variable
|
29 |
+
from .iupac_dataset_pro import IUPACDataset
|
30 |
+
import os
|
31 |
+
#os.environ["CUDA_VISIBLE_DEVICES"]="0"
|
32 |
+
|
33 |
+
|
34 |
+
class T5Collator:
|
35 |
+
def __init__(self, pad_token_id):
|
36 |
+
super().__init__()
|
37 |
+
self.pad_token_id = pad_token_id
|
38 |
+
def __call__(self, records):
|
39 |
+
# records is a list of dicts
|
40 |
+
batch = {}
|
41 |
+
padvals = {"input_ids": self.pad_token_id,'attention_mask':0}
|
42 |
+
for k in records[0]:
|
43 |
+
if k in padvals:
|
44 |
+
batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],
|
45 |
+
batch_first=True,
|
46 |
+
padding_value=padvals[k])
|
47 |
+
else:
|
48 |
+
batch[k] = torch.FloatTensor([r[k] for r in records]) #torch.Tensor LongTensor FloatTensor
|
49 |
+
return batch
|
50 |
+
|
51 |
+
class T5IUPACTokenizer(T5Tokenizer):
|
52 |
+
def prepare_for_tokenization(self, text, is_split_into_words=False,
|
53 |
+
**kwargs):
|
54 |
+
return re.sub(" ", "_", text), kwargs
|
55 |
+
|
56 |
+
def _decode(self, *args, **kwargs):
|
57 |
+
# replace "_" with " ", except for the _ in extra_id_#
|
58 |
+
text = super()._decode(*args, **kwargs)
|
59 |
+
text = re.sub("extra_id_", "extraAidA", text)
|
60 |
+
text = re.sub("_", " ", text)
|
61 |
+
text = re.sub("extraAidA", "extra_id_", text)
|
62 |
+
return text
|
63 |
+
|
64 |
+
def sentinels(self, sentinel_ids):
|
65 |
+
return self.vocab_size - sentinel_ids - 1
|
66 |
+
|
67 |
+
def sentinel_mask(self, ids):
|
68 |
+
return ((self.vocab_size - self._extra_ids <= ids) &
|
69 |
+
(ids < self.vocab_size))
|
70 |
+
|
71 |
+
def _tokenize(self, text, sample=False):
|
72 |
+
#pieces = super()._tokenize(text, sample=sample)
|
73 |
+
pieces = super()._tokenize(text)
|
74 |
+
# sentencepiece adds a non-printing token at the start. Remove it
|
75 |
+
return pieces[1:]
|
76 |
+
|
77 |
+
def prepare_input_pro(data,device):
|
78 |
+
from collections.abc import Mapping
|
79 |
+
if isinstance(data, Mapping):
|
80 |
+
return type(data)({k: prepare_input_pro(v,device) for k, v in data.items()})
|
81 |
+
elif isinstance(data, (tuple, list)):
|
82 |
+
return type(data)(prepare_input_pro(v,device) for v in data)
|
83 |
+
elif isinstance(data, torch.Tensor):
|
84 |
+
kwargs = dict(device=device)
|
85 |
+
if data.dtype != torch.int64:
|
86 |
+
# NLP models inputs are int64 and those get adjusted to the right dtype of the
|
87 |
+
# embedding. Other models such as wav2vec2's inputs are already float and thus
|
88 |
+
# may need special handling to match the dtypes of the model
|
89 |
+
kwargs.update(dict(dtype=torch.int64))
|
90 |
+
|
91 |
+
return data.to(**kwargs)
|
92 |
+
return data
|
93 |
+
|
94 |
+
def get_data_loader_pro(is_train=1):
|
95 |
+
|
96 |
+
full_path = '/root/autodl-tmp/wjm/iupac-gpt/iupac_gpt/'
|
97 |
+
|
98 |
+
iupac_tokenizer = T5IUPACTokenizer(vocab_file=full_path+'iupac_spm.model')
|
99 |
+
iupac_vocab_size = iupac_tokenizer.vocab_size
|
100 |
+
print('iupac_vocab_size:',iupac_vocab_size)
|
101 |
+
if is_train:
|
102 |
+
torch.save(iupac_tokenizer, pt.join(full_path,"real_iupac_tokenizer.pt"))
|
103 |
+
print("training...",len(iupac_tokenizer))
|
104 |
+
else:
|
105 |
+
iupac_tokenizer = torch.load(pt.join(full_path,"real_iupac_tokenizer.pt"), map_location="cpu")
|
106 |
+
print('fine_tune...',len(iupac_tokenizer))
|
107 |
+
|
108 |
+
dataset_filename = 'iupacs_logp.csv'
|
109 |
+
target_col = "LogP"
|
110 |
+
iupac_name_col = 'iupac'
|
111 |
+
MAXLEN=1024
|
112 |
+
dataset_kwargs = {"dataset_dir":full_path,"dataset_filename": dataset_filename,"tokenizer": iupac_tokenizer,"max_length": MAXLEN,"target_col": target_col,'dataset_size':None,"iupac_name_col":iupac_name_col}
|
113 |
+
train_dataset = IUPACDataset(**dataset_kwargs)
|
114 |
+
collator = T5Collator(iupac_tokenizer.pad_token_id)
|
115 |
+
train_dataloader = DataLoader(train_dataset,batch_size=64,collate_fn=collator,shuffle=True)
|
116 |
+
|
117 |
+
return train_dataloader,iupac_tokenizer
|
118 |
+
|
119 |
+
if __name__ == "__main__":
|
120 |
+
|
121 |
+
train_dataloader,iupac_tokenizer = get_data_loader_pro(is_train=1)
|
122 |
+
pbar = tqdm(train_dataloader)
|
123 |
+
device = 'cpu'
|
124 |
+
for inputs in pbar:
|
125 |
+
|
126 |
+
src_label = Variable(inputs["labels"].to(device))
|
127 |
+
inputs = prepare_input_pro(inputs,device)
|
128 |
+
src = Variable(inputs["input_ids"].to(device))
|
129 |
+
#self.tokenizer._convert_token_to_id
|
130 |
+
|
131 |
+
print(src[:,:].shape,src_label)
|
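A toy example of what the T5Collator above produces for a small batch (a sketch, not part of the commit; it assumes the iupac_gpt package from this commit is importable):

import torch
from iupac_gpt.iupac_tokenization_pro import T5Collator

records = [
    {"input_ids": [5, 6, 7], "attention_mask": [1, 1, 1], "labels": 1.20},
    {"input_ids": [5, 9],    "attention_mask": [1, 1],    "labels": -0.30},
]
batch = T5Collator(pad_token_id=0)(records)
print(batch["input_ids"])        # second row right-padded with pad_token_id
print(batch["attention_mask"])   # padded with 0
print(batch["labels"])           # scalar targets gathered into a FloatTensor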
iupac-gpt/iupac_gpt/iupacs_logp.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
iupac-gpt/iupac_gpt/language_modeling.py
ADDED
@@ -0,0 +1,68 @@
1 |
+
"""Pytorch-lightning module for causal language modeling.
|
2 |
+
"""
|
3 |
+
|
4 |
+
__all__ = ("GPT2LitModel",)
|
5 |
+
|
6 |
+
import pytorch_lightning as pl
|
7 |
+
import torch
|
8 |
+
|
9 |
+
|
10 |
+
class GPT2LitModel(pl.LightningModule):
|
11 |
+
"""Lightning module for autoregressive (causal) transformer language modeling.
|
12 |
+
Successfully tested on HuggingFace `GPT2LMHeadModel`.
|
13 |
+
"""
|
14 |
+
|
15 |
+
def __init__(self, transformer, batch_size: int, learning_rate: float,
|
16 |
+
final_learning_rate: float, weight_decay: float, adam_eps: float,
|
17 |
+
adam_betas: tuple, scheduler_T_max: int,
|
18 |
+
save_model_every: int = 10_000, checkpoint: str = ""):
|
19 |
+
super().__init__()
|
20 |
+
self.save_hyperparameters(ignore=("transformer", "save_model_every",
|
21 |
+
"checkpoints"))
|
22 |
+
self.transformer = transformer
|
23 |
+
self.save_model_every = save_model_every
|
24 |
+
self.checkpoint = checkpoint or "./gpt2litmodel-logs"
|
25 |
+
|
26 |
+
def forward(self, *args, **kwargs):
|
27 |
+
return self.transformer(*args, **kwargs)
|
28 |
+
|
29 |
+
def training_step(self, batch, batch_idx):
|
30 |
+
outputs = self(**batch)
|
31 |
+
|
32 |
+
if self.save_model_every > 0 and batch_idx % self.save_model_every == 0:
|
33 |
+
self.transformer.save_pretrained(self.checkpoint)
|
34 |
+
|
35 |
+
return {'loss': outputs['loss']}
|
36 |
+
|
37 |
+
def training_epoch_end(self, outputs):
|
38 |
+
if self.save_model_every > 0:
|
39 |
+
self.transformer.save_pretrained(self.checkpoint)
|
40 |
+
|
41 |
+
losses = [step_output["loss"] for step_output in outputs]
|
42 |
+
mean_loss = torch.tensor(losses).mean()
|
43 |
+
ppl = torch.exp(mean_loss)
|
44 |
+
|
45 |
+
self.log("ppl", ppl, on_step=False, on_epoch=True, prog_bar=True)
|
46 |
+
|
47 |
+
def configure_optimizers(self):
|
48 |
+
parameters = list(self.named_parameters())  # materialize: the name/param pairs are iterated twice below
|
49 |
+
no_decay = ["bias", "LayerNorm.weight"]
|
50 |
+
grouped_parameters = [
|
51 |
+
{"params": [p for n, p in parameters
|
52 |
+
if not any(nd in n for nd in no_decay)],
|
53 |
+
"weight_decay": self.hparams.weight_decay},
|
54 |
+
{"params": [p for n, p in parameters
|
55 |
+
if any(nd in n for nd in no_decay)],
|
56 |
+
"weight_decay": 0.0}]
|
57 |
+
optimizer = torch.optim.Adam(
|
58 |
+
grouped_parameters, lr=self.hparams.learning_rate,
|
59 |
+
weight_decay=self.hparams.weight_decay,
|
60 |
+
eps=self.hparams.adam_eps, betas=self.hparams.adam_betas)
|
61 |
+
|
62 |
+
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
|
63 |
+
optimizer, self.hparams.scheduler_T_max,
|
64 |
+
eta_min=self.hparams.final_learning_rate)
|
65 |
+
|
66 |
+
return {'optimizer': optimizer,
|
67 |
+
'lr_scheduler': {'scheduler': lr_scheduler,
|
68 |
+
'interval': 'step', 'frequency': 1}}
|
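A quick numeric sketch of the epoch-level metric logged by GPT2LitModel.training_epoch_end above: perplexity is the exponential of the mean step loss (the losses below are made up for illustration):

import torch

step_losses = [2.31, 2.05, 1.98]             # hypothetical per-step cross-entropy losses
mean_loss = torch.tensor(step_losses).mean()
print(torch.exp(mean_loss).item())            # ~8.28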
iupac-gpt/iupac_gpt/pubchem_iupac_smile_gpt.csv
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b052dd26a26107e9c86a2b155a693669fb1f4fbf498762abe2d19fbaa6867567
|
3 |
+
size 2825708735
|
iupac-gpt/iupac_gpt/real_iupac_tokenizer.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1696e3f3060bcce33275387e4eb4e175f4c64a015962ac4f3c5f49f25ed6f335
|
3 |
+
size 3529
|
iupac-gpt/iupac_gpt/tokenization.py
ADDED
@@ -0,0 +1,193 @@
1 |
+
"""SMILES-based tokenization utilities.
|
2 |
+
"""
|
3 |
+
|
4 |
+
__all__ = ("PAD_TOKEN", "BOS_TOKEN", "EOS_TOKEN", "UNK_TOKEN", "SUFFIX",
|
5 |
+
"SPECIAL_TOKENS", "PAD_TOKEN_ID", "BOS_TOKEN_ID", "EOS_TOKEN_ID",
|
6 |
+
"UNK_TOKEN_ID", "SMILESBPETokenizer", "SMILESAlphabet")
|
7 |
+
|
8 |
+
from collections.abc import Collection, Iterator
|
9 |
+
from dataclasses import dataclass
|
10 |
+
from itertools import chain
|
11 |
+
from typing import Any, Dict, FrozenSet, List, Optional, Set, Tuple, Union
|
12 |
+
from tokenizers import AddedToken, Tokenizer
|
13 |
+
from tokenizers import decoders, models, normalizers, processors, trainers
|
14 |
+
from tokenizers.implementations import BaseTokenizer
|
15 |
+
from transformers import PreTrainedTokenizerFast
|
16 |
+
|
17 |
+
|
18 |
+
SUFFIX, PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN = "", "<pad>", "<s>", "</s>", "<unk>"
|
19 |
+
SPECIAL_TOKENS = [PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
|
20 |
+
PAD_TOKEN_ID, BOS_TOKEN_ID, EOS_TOKEN_ID, UNK_TOKEN_ID = range(4)
|
21 |
+
|
22 |
+
|
23 |
+
class SMILESBPETokenizer(BaseTokenizer):
|
24 |
+
"""Tokenizes SMILES strings and applies BPE.
|
25 |
+
|
26 |
+
Args:
|
27 |
+
vocab (`str` or `dict`, optional, defaults to `None`):
|
28 |
+
Token vocabulary.
|
29 |
+
merges (`str` or `dict` or `tuple`, optional, defaults to `None`):
|
30 |
+
BPE merges.
|
31 |
+
unk_token (`str` or `tokenizers.AddedToken`, optional, defaults to "<unk>")
|
32 |
+
suffix (`str`, defaults to "")
|
33 |
+
dropout (`float`, defaults to `None`)
|
34 |
+
|
35 |
+
Examples:
|
36 |
+
>>> tokenizer = SMILESBPETokenizer()
|
37 |
+
>>> tokenizer.train("path-to-smiles-strings-file")
|
38 |
+
Tokenization logs...
|
39 |
+
>>> tokenizer.save_model("checkpoints-path")
|
40 |
+
>>> same_tokenizer = SMILESBPETokenizer.from_file("checkpoints-path/vocab.json",
|
41 |
+
... "checkpoints-path/merges.txt")
|
42 |
+
"""
|
43 |
+
|
44 |
+
def __init__(
|
45 |
+
self,
|
46 |
+
vocab: Optional[Union[str, Dict[str, int]]] = None,
|
47 |
+
merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
|
48 |
+
unk_token: Union[str, AddedToken] = "<unk>",
|
49 |
+
suffix: str = SUFFIX,
|
50 |
+
dropout: Optional[float] = None,
|
51 |
+
) -> None:
|
52 |
+
unk_token_str = str(unk_token)
|
53 |
+
|
54 |
+
tokenizer = Tokenizer(models.BPE(vocab, merges, dropout=dropout,
|
55 |
+
unk_token=unk_token_str,
|
56 |
+
end_of_word_suffix=suffix))
|
57 |
+
|
58 |
+
if tokenizer.token_to_id(unk_token_str) is not None:
|
59 |
+
tokenizer.add_special_tokens([unk_token_str])
|
60 |
+
|
61 |
+
tokenizer.normalizer = normalizers.Strip(left=False, right=True)
|
62 |
+
tokenizer.decoder = decoders.Metaspace(add_prefix_space=True)
|
63 |
+
tokenizer.post_processor = processors.TemplateProcessing(
|
64 |
+
single=f"{BOS_TOKEN} $A {EOS_TOKEN}",
|
65 |
+
special_tokens=[(BOS_TOKEN, BOS_TOKEN_ID), (EOS_TOKEN, EOS_TOKEN_ID)])
|
66 |
+
|
67 |
+
parameters = {"model": "BPE", "unk_token": unk_token, "suffix": suffix,
|
68 |
+
"dropout": dropout}
|
69 |
+
|
70 |
+
super().__init__(tokenizer, parameters)
|
71 |
+
|
72 |
+
@classmethod
|
73 |
+
def from_file(cls, vocab_filename: str, merges_filename: str, **kwargs) \
|
74 |
+
-> "SMILESBPETokenizer":
|
75 |
+
vocab, merges = models.BPE.read_file(vocab_filename, merges_filename)
|
76 |
+
return cls(vocab, merges, **kwargs)
|
77 |
+
|
78 |
+
def train(
|
79 |
+
self,
|
80 |
+
files: Union[str, List[str]],
|
81 |
+
vocab_size: int = 1_000,
|
82 |
+
min_frequency: int = 2,
|
83 |
+
special_tokens: List[Union[str, AddedToken]] = None,
|
84 |
+
limit_alphabet: int = 200,
|
85 |
+
initial_alphabet: List[str] = None,
|
86 |
+
suffix: Optional[str] = SUFFIX,
|
87 |
+
show_progress: bool = True,
|
88 |
+
) -> None:
|
89 |
+
special_tokens = special_tokens or SPECIAL_TOKENS
|
90 |
+
initial_alphabet = initial_alphabet or []
|
91 |
+
|
92 |
+
trainer = trainers.BpeTrainer(vocab_size=vocab_size,
|
93 |
+
min_frequency=min_frequency,
|
94 |
+
special_tokens=special_tokens,
|
95 |
+
limit_alphabet=limit_alphabet,
|
96 |
+
initial_alphabet=initial_alphabet,
|
97 |
+
end_of_word_suffix=suffix,
|
98 |
+
show_progress=show_progress)
|
99 |
+
if isinstance(files, str):
|
100 |
+
files = [files]
|
101 |
+
self._tokenizer.train(files, trainer=trainer)
|
102 |
+
|
103 |
+
def train_from_iterator(
|
104 |
+
self,
|
105 |
+
iterator: Iterator,
|
106 |
+
vocab_size: int = 1_000,
|
107 |
+
min_frequency: int = 2,
|
108 |
+
special_tokens: List[Union[str, AddedToken]] = None,
|
109 |
+
limit_alphabet: int = 200,
|
110 |
+
initial_alphabet: List[str] = None,
|
111 |
+
suffix: Optional[str] = SUFFIX,
|
112 |
+
show_progress: bool = True,
|
113 |
+
) -> None:
|
114 |
+
special_tokens = special_tokens or SPECIAL_TOKENS
|
115 |
+
initial_alphabet = initial_alphabet or []
|
116 |
+
|
117 |
+
trainer = trainers.BpeTrainer(vocab_size=vocab_size,
|
118 |
+
min_frequency=min_frequency,
|
119 |
+
special_tokens=special_tokens,
|
120 |
+
limit_alphabet=limit_alphabet,
|
121 |
+
initial_alphabet=initial_alphabet,
|
122 |
+
end_of_word_suffix=suffix,
|
123 |
+
show_progress=show_progress)
|
124 |
+
self._tokenizer.train_from_iterator(iterator, trainer=trainer)
|
125 |
+
|
126 |
+
@staticmethod
|
127 |
+
def get_hf_tokenizer(
|
128 |
+
tokenizer_file: str,
|
129 |
+
special_tokens: Optional[Dict[str, str]] = None,
|
130 |
+
model_max_length: int = 512,
|
131 |
+
*init_inputs, **kwargs
|
132 |
+
) -> PreTrainedTokenizerFast:
|
133 |
+
"""Gets HuggingFace tokenizer from the pretrained `tokenizer_file`. Optionally,
|
134 |
+
appends `special_tokens` to vocabulary and sets `model_max_length`.
|
135 |
+
"""
|
136 |
+
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file,
|
137 |
+
*init_inputs, **kwargs)
|
138 |
+
special_tokens = special_tokens or dict(zip(
|
139 |
+
["pad_token", "bos_token", "eos_token", "unk_token"],
|
140 |
+
SPECIAL_TOKENS))
|
141 |
+
tokenizer.add_special_tokens(special_tokens)
|
142 |
+
tokenizer.model_max_length = model_max_length
|
143 |
+
return tokenizer
|
144 |
+
|
145 |
+
|
146 |
+
@dataclass(init=True, eq=False, repr=True, frozen=True)
|
147 |
+
class SMILESAlphabet(Collection):
|
148 |
+
atoms: FrozenSet[str] = frozenset([
|
149 |
+
'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
|
150 |
+
'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
|
151 |
+
'Cs', 'Cu', 'Db', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr', 'Ga',
|
152 |
+
'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir', 'K',
|
153 |
+
'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N', 'Na',
|
154 |
+
'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb', 'Pd',
|
155 |
+
'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rh', 'Rn',
|
156 |
+
'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb',
|
157 |
+
'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'V', 'W', 'Xe', 'Y', 'Yb',
|
158 |
+
'Zn', 'Zr'
|
159 |
+
])
|
160 |
+
|
161 |
+
# Bonds, charges, etc.
|
162 |
+
non_atoms: FrozenSet[str] = frozenset([
|
163 |
+
'-', '=', '#', ':', '(', ')', '.', '[', ']', '+', '-', '\\', '/', '*',
|
164 |
+
'1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
|
165 |
+
'@', 'AL', 'TH', 'SP', 'TB', 'OH',
|
166 |
+
])
|
167 |
+
|
168 |
+
additional: FrozenSet[str] = frozenset()
|
169 |
+
|
170 |
+
def __contains__(self, item: Any) -> bool:
|
171 |
+
return item in self.atoms or item in self.non_atoms
|
172 |
+
|
173 |
+
def __iter__(self):
|
174 |
+
return (token for token in chain(self.atoms, self.non_atoms))
|
175 |
+
|
176 |
+
def __len__(self) -> int:
|
177 |
+
return len(self.atoms) + len(self.non_atoms) + len(self.additional)
|
178 |
+
|
179 |
+
def get_alphabet(self) -> Set[str]:
|
180 |
+
alphabet = set()
|
181 |
+
for token in self.atoms:
|
182 |
+
if len(token) > 1:
|
183 |
+
alphabet.update(list(token))
|
184 |
+
alphabet.add(token[0].lower())
|
185 |
+
else:
|
186 |
+
alphabet.add(token)
|
187 |
+
alphabet.add(token.lower())
|
188 |
+
for token in chain(self.non_atoms, self.additional):
|
189 |
+
if len(token) > 1:
|
190 |
+
alphabet.update(list(token))
|
191 |
+
else:
|
192 |
+
alphabet.add(token)
|
193 |
+
return alphabet
|
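A short sketch of training the SMILESBPETokenizer defined above on a few in-memory SMILES strings and wrapping it as a HuggingFace fast tokenizer (the output file name "smiles_tokenizer.json" is illustrative, not part of the commit):

from iupac_gpt.tokenization import SMILESBPETokenizer

smiles = ["CCO", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O", "CCN(CC)CC"]
tokenizer = SMILESBPETokenizer()
tokenizer.train_from_iterator(smiles, vocab_size=100, min_frequency=1)

tokenizer.save("smiles_tokenizer.json")       # serialize the underlying tokenizers.Tokenizer
hf_tokenizer = SMILESBPETokenizer.get_hf_tokenizer(
    "smiles_tokenizer.json", model_max_length=512)
print(hf_tokenizer("CCO")["input_ids"])       # wrapped with <s> ... </s> by the post-processor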
iupac-gpt/nohup.out
ADDED
The diff for this file is too large to render.
See raw diff
|
|
iupac-gpt/notebooks/.ipynb_checkpoints/language-modeling-checkpoint.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
iupac-gpt/notebooks/iupac_head_view.html
ADDED
The diff for this file is too large to render.
See raw diff
|
|
iupac-gpt/notebooks/iupac_language-modeling.py
ADDED
@@ -0,0 +1,236 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# # Generative Pre-Training from Molecules
|
5 |
+
|
6 |
+
import os
|
7 |
+
#os.environ["CUDA_VISIBLE_DEVICES"] = ['1',"2"]
|
8 |
+
from pprint import pprint
|
9 |
+
import sys
|
10 |
+
sys.path.append('/root/autodl-tmp/wjm/iupac-gpt')
|
11 |
+
from tqdm import tqdm
|
12 |
+
try:
|
13 |
+
import iupac_gpt as gpt
|
14 |
+
except ImportError:
|
15 |
+
import sys
|
16 |
+
sys.path.extend([".."]) # Parent directory stores `smiles_gpt` package.
|
17 |
+
import iupac_gpt as gpt
|
18 |
+
import torch
|
19 |
+
|
20 |
+
# For demonstration purposes, we use only a 10K subset of the PubChem data made available by
|
21 |
+
# [ChemBERTa](https://arxiv.org/abs/2010.09885) developers. The original model was pretrained
|
22 |
+
# on the first 5M compounds with the following hyperparameters:
|
23 |
+
# ```python
|
24 |
+
# hyperparams = {"batch_size": 128, "max_epochs": 2, "max_length": 512,
|
25 |
+
# "learning_rate": 5e-4, "weight_decay": 0.0,
|
26 |
+
# "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
|
27 |
+
# "scheduler_T_max": 150_000, "final_learning_rate": 5e-8,
|
28 |
+
# "vocab_size": 1_000, "min_frequency": 2, "top_p": 0.96,
|
29 |
+
# "n_layer": 4, "n_head": 8, "n_embd": 512}
|
30 |
+
# ```
|
31 |
+
# Tokenizer, model, optimizer, scheduler, and trainer hyperparameters.
|
32 |
+
hyperparams = {"batch_size": 64, "max_epochs": 10, "max_length": 1280,
|
33 |
+
"learning_rate": 5e-4, "weight_decay": 0.0,
|
34 |
+
"adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
|
35 |
+
"scheduler_T_max": 1_000, "final_learning_rate": 5e-8,
|
36 |
+
"vocab_size": 1491, "min_frequency": 2, "top_p": 0.96,
|
37 |
+
"n_layer": 8, "n_head": 8, "n_embd": 256}
|
38 |
+
|
39 |
+
gpus = [0] # Specify either a list of GPU devices or an integer (0 for no GPU).
|
40 |
+
num_workers = 24 # Number of dataloader worker processes.
|
41 |
+
# ## Tokenization
|
42 |
+
#
|
43 |
+
# `smiles_gpt.SMILESBPETokenizer` first splits SMILES strings into characters, runs
|
44 |
+
# byte-pair encoding, and augments the resulting list with `"<s>"` (beginning-of-SMILES) and
|
45 |
+
# `"</s>"` (end-of-SMILES) special tokens. `smiles_gpt.SMILESAlphabet` stores 72 possible
|
46 |
+
# characters as an initial vocabulary.
|
47 |
+
device = 'gpu'
|
48 |
+
train_dataloader,iupac_tokenizer = gpt.get_data_loader(is_train=1,dataset_filename = './pubchem_iupac_smile_gpt.csv')
|
49 |
+
pbar = tqdm(train_dataloader) #train_dataloader.cuda()
|
50 |
+
|
51 |
+
|
52 |
+
'''
|
53 |
+
for inputs in pbar:
|
54 |
+
src_label = Variable(inputs["labels"].to(device))
|
55 |
+
inputs = prepare_input(inputs,device)
|
56 |
+
src = Variable(inputs["input_ids"].to(device))
|
57 |
+
#self.tokenizer._convert_token_to_id
|
58 |
+
|
59 |
+
print(src[:,:].shape,src_label)
|
60 |
+
'''
|
61 |
+
tokenizer = iupac_tokenizer
|
62 |
+
#start mark <unk> 2, end mark </s> 1, pad <pad> 0
|
63 |
+
|
64 |
+
iupac_string = "2-amino-9-[4-hydroxy-3-(hydroxymethyl)-2-methylidenecyclopentyl]-1H-purin-6-one"
|
65 |
+
iupac_encoded = tokenizer(iupac_string)
|
66 |
+
iupac_encoded['input_ids'] = [2]+iupac_encoded['input_ids']
|
67 |
+
|
68 |
+
iupac_merges = [tokenizer.decode(i) for i in iupac_encoded['input_ids']]
|
69 |
+
#iupac_encoded['attention_mask']
|
70 |
+
|
71 |
+
print(iupac_encoded['input_ids'])
|
72 |
+
print(iupac_merges)
|
73 |
+
|
74 |
+
print(tokenizer.unk_token_id,tokenizer.eos_token_id,tokenizer.unk_token,tokenizer.eos_token,tokenizer.vocab_size) #2 1 1491
|
75 |
+
# ## Data Module
|
76 |
+
batch = next(iter(pbar))
|
77 |
+
|
78 |
+
|
79 |
+
# ## GPT-2 Model
|
80 |
+
#
|
81 |
+
# Now we load HuggingFace
|
82 |
+
# [`GPT2LMHeadModel`](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel)
|
83 |
+
# with the configuration composed of previously
|
84 |
+
# defined model hyperparameters. The model processes a mini-batch of input IDs and labels, then
|
85 |
+
# returns predictions and cross-entropy loss between labels and predictions.
|
86 |
+
|
87 |
+
from transformers import GPT2Config, GPT2LMHeadModel
|
88 |
+
|
89 |
+
config = GPT2Config(vocab_size=tokenizer.vocab_size,
|
90 |
+
bos_token_id=tokenizer.unk_token_id,
|
91 |
+
eos_token_id=tokenizer.eos_token_id,
|
92 |
+
n_layer=hyperparams["n_layer"],
|
93 |
+
n_head=hyperparams["n_head"],
|
94 |
+
n_embd=hyperparams["n_embd"],
|
95 |
+
n_positions=hyperparams["max_length"],
|
96 |
+
n_ctx=hyperparams["max_length"])
|
97 |
+
model = GPT2LMHeadModel(config)
|
98 |
+
|
99 |
+
#model= torch.nn.DataParallel(model.cuda(),device_ids=gpus,output_device=gpus[0])
|
100 |
+
|
101 |
+
outputs = model(**batch)
|
102 |
+
print(outputs.keys())
|
103 |
+
|
104 |
+
#['loss', 'logits', 'past_key_values']
|
105 |
+
# ## Trainer
|
106 |
+
#
|
107 |
+
# GPT-2 is trained with autoregressive language modeling objective:
|
108 |
+
# $$
|
109 |
+
# P(\boldsymbol{s}) = P(s_1) \cdot P(s_2 | s_1) \cdots P(s_T | s_1, \ldots, s_{T-1}) =
|
110 |
+
# \prod_{t=1}^{T} P(s_t | s_{j < t}),
|
111 |
+
# $$
|
112 |
+
# where $\boldsymbol{s}$ is a tokenized (encoded) SMILES string, $s_t$ is a token from pretrained
|
113 |
+
# vocabulary $\mathcal{V}$.
|
114 |
+
#
|
115 |
+
# We use `pytorch_lightning.Trainer` to train GPT-2. Since `Trainer` requires lightning modules,
|
116 |
+
# we import our
|
117 |
+
# [`smiles_gpt.GPT2LitModel`](https://github.com/sanjaradylov/smiles-gpt/blob/master/smiles_gpt/language_modeling.py#L10)
|
118 |
+
# wrapper that implements training phases for
|
119 |
+
# `GPT2LMHeadModel`, configures an `Adam` optimizer with `CosineAnnealingLR` scheduler, and
|
120 |
+
# logs average perplexity every epoch.
|
121 |
+
|
122 |
+
# In[8]:
|
123 |
+
|
124 |
+
|
125 |
+
from pytorch_lightning import Trainer
|
126 |
+
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
|
127 |
+
|
128 |
+
checkpoint = "./checkpoints/iupac"
|
129 |
+
|
130 |
+
|
131 |
+
'''
|
132 |
+
trainer = Trainer(
|
133 |
+
gpus=gpus,
|
134 |
+
max_epochs=hyperparams["max_epochs"],
|
135 |
+
callbacks=[EarlyStopping("ppl", 0.1, 3)], #[EarlyStopping("ppl", 0.2, 2)]
|
136 |
+
auto_lr_find=False, # Set to True to search for optimal learning rate.
|
137 |
+
auto_scale_batch_size=False, # Set to True to scale batch size
|
138 |
+
# accelerator="dp" # Uncomment for GPU training.
|
139 |
+
accelerator="gpu", #devices=4,
|
140 |
+
strategy="ddp"
|
141 |
+
)
|
142 |
+
lit_model = gpt.GPT2LitModel(
|
143 |
+
model,
|
144 |
+
batch_size=hyperparams["batch_size"],
|
145 |
+
learning_rate=hyperparams["learning_rate"],
|
146 |
+
final_learning_rate=hyperparams["final_learning_rate"],
|
147 |
+
weight_decay=hyperparams["weight_decay"],
|
148 |
+
adam_eps=hyperparams["adam_eps"],
|
149 |
+
adam_betas=hyperparams["adam_betas"],
|
150 |
+
scheduler_T_max=hyperparams["scheduler_T_max"],
|
151 |
+
save_model_every=1, checkpoint=checkpoint)
|
152 |
+
trainer.fit(lit_model, train_dataloader)
|
153 |
+
|
154 |
+
|
155 |
+
#model.module.save_pretrained('./pretrained')
|
156 |
+
model.save_pretrained('./pretrained')
|
157 |
+
|
158 |
+
'''
|
159 |
+
|
160 |
+
|
161 |
+
# ## Interpretability
|
162 |
+
#
|
163 |
+
# [BertViz](https://github.com/jessevig/bertviz) inspects attention heads of transformers
|
164 |
+
# capturing specific patterns in data. Each head can be representative of some syntactic
|
165 |
+
# or short-/long-term relationships between tokens.
|
166 |
+
|
167 |
+
# In[9]:
|
168 |
+
|
169 |
+
|
170 |
+
import torch
|
171 |
+
from bertviz import head_view
|
172 |
+
|
173 |
+
input_ids_list = iupac_encoded['input_ids']
|
174 |
+
model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)
|
175 |
+
attention = model(torch.LongTensor(input_ids_list))[-1]
|
176 |
+
tokens = [tokenizer.decode(i) for i in input_ids_list]
|
177 |
+
print(input_ids_list,attention,tokens)
|
178 |
+
# Don't worry if a snippet is not displayed---just rerun this cell.
|
179 |
+
head_view(attention, tokens)
|
180 |
+
|
181 |
+
|
182 |
+
|
183 |
+
from bertviz import model_view
|
184 |
+
|
185 |
+
# Don't worry if a snippet is not displayed---just rerun this cell.
|
186 |
+
model_view(attention, tokens)
|
187 |
+
|
188 |
+
|
189 |
+
# ## Sampling
|
190 |
+
#
|
191 |
+
# Finally, we generate novel SMILES strings with top-$p$ sampling$-$i.e., sampling from the
|
192 |
+
# smallest vocabulary subset $\mathcal{V}^{(p)} \subset \mathcal{V}$ s.t. it takes up the most
|
193 |
+
# probable tokens whose cumulative probability mass exceeds $p$, $0 < p < 1$. Model
|
194 |
+
# terminates the procedure upon encountering `"</s>"` or reaching maximum number
|
195 |
+
# `hyperparams["max_length"]`. Special tokens are eventually removed.
|
196 |
+
|
197 |
+
|
198 |
+
|
199 |
+
import tqdm
|
200 |
+
|
201 |
+
model.eval() # Set the base model to evaluation mode.
|
202 |
+
|
203 |
+
generated_smiles_list = []
|
204 |
+
n_generated = 50000
|
205 |
+
|
206 |
+
for _ in tqdm.tqdm(range(n_generated)):
|
207 |
+
# Generate from "<unk>" so that the next token is arbitrary.
|
208 |
+
smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])
|
209 |
+
# Get generated token IDs.
|
210 |
+
generated_ids = model.generate(smiles_start,
|
211 |
+
max_length=hyperparams["max_length"],
|
212 |
+
do_sample=True,top_p=hyperparams["top_p"],
|
213 |
+
repetition_penalty=1.2,
|
214 |
+
pad_token_id=tokenizer.eos_token_id)
|
215 |
+
# Decode the IDs into tokens and remove "<s>" and "</s>".
|
216 |
+
generated_smiles = tokenizer.decode(generated_ids[0],
|
217 |
+
skip_special_tokens=True)
|
218 |
+
generated_smiles_list.append(generated_smiles)
|
219 |
+
|
220 |
+
print(generated_smiles_list[:10])
|
221 |
+
|
222 |
+
|
223 |
+
import numpy as np
|
224 |
+
import pandas as pd
|
225 |
+
|
226 |
+
df2 = pd.DataFrame(generated_smiles_list, columns=['iupac'])
|
227 |
+
|
228 |
+
df2.to_csv("iupacGPT2-gen50K.csv",index=None,sep="|")
|
229 |
+
|
230 |
+
|
231 |
+
|
232 |
+
|
233 |
+
|
234 |
+
|
235 |
+
|
236 |
+
|
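A hedged post-processing sketch (not part of the commit): reading the generated names written above (sep="|") back into pandas and dropping exact duplicates before any analysis.

import pandas as pd

gen = pd.read_csv("iupacGPT2-gen50K.csv", sep="|")
unique_names = gen["iupac"].dropna().drop_duplicates()
print(len(gen), "generated,", len(unique_names), "unique")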
iupac-gpt/notebooks/iupac_language-modeling_retrain.py
ADDED
@@ -0,0 +1,224 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# # Generative Pre-Training from Molecules
|
5 |
+
|
6 |
+
import os
|
7 |
+
#os.environ["CUDA_VISIBLE_DEVICES"] = ['1',"2"]
|
8 |
+
from pprint import pprint
|
9 |
+
import sys
|
10 |
+
sys.path.append('/root/autodl-tmp/wjm/iupac-gpt')
|
11 |
+
from tqdm import tqdm
|
12 |
+
try:
|
13 |
+
import iupac_gpt as gpt
|
14 |
+
except ImportError:
|
15 |
+
import sys
|
16 |
+
sys.path.extend([".."]) # Parent directory stores `smiles_gpt` package.
|
17 |
+
import iupac_gpt as gpt
|
18 |
+
import torch
|
19 |
+
|
20 |
+
# For demonstration purposes, we use only a 10K subset of the PubChem data made available by
|
21 |
+
# [ChemBERTa](https://arxiv.org/abs/2010.09885) developers. The original model was pretrained
|
22 |
+
# on the first 5M compounds with the following hyperparameters:
|
23 |
+
# ```python
|
24 |
+
# hyperparams = {"batch_size": 128, "max_epochs": 2, "max_length": 512,
|
25 |
+
# "learning_rate": 5e-4, "weight_decay": 0.0,
|
26 |
+
# "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
|
27 |
+
# "scheduler_T_max": 150_000, "final_learning_rate": 5e-8,
|
28 |
+
# "vocab_size": 1_000, "min_frequency": 2, "top_p": 0.96,
|
29 |
+
# "n_layer": 4, "n_head": 8, "n_embd": 512}
|
30 |
+
# ```
|
31 |
+
# Tokenizer, model, optimizer, scheduler, and trainer hyperparameters.
|
32 |
+
hyperparams = {"batch_size": 128, "max_epochs": 10, "max_length": 1280,
|
33 |
+
"learning_rate": 5e-4, "weight_decay": 0.0,
|
34 |
+
"adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
|
35 |
+
"scheduler_T_max": 1_000, "final_learning_rate": 5e-8,
|
36 |
+
"vocab_size": 1491, "min_frequency": 2, "top_p": 0.96,
|
37 |
+
"n_layer": 8, "n_head": 8, "n_embd": 256}
|
38 |
+
|
39 |
+
gpus = [0] # Specify either a list of GPU devices or an integer (0 for no GPU).
|
40 |
+
num_workers = 16 # Number of dataloader worker processes.
|
41 |
+
# ## Tokenization
|
42 |
+
#
|
43 |
+
# `smiles_gpt.SMILESBPETokenizer` first splits SMILES strings into characters, runs
|
44 |
+
# byte-pair encoding, and augments the resulting list with `"<s>"` (beginning-of-SMILES) and
|
45 |
+
# `"</s>"` (end-of-SMILES) special tokens. `smiles_gpt.SMILESAlphabet` stores 72 possible
|
46 |
+
# characters as an initial vocabulary.
|
47 |
+
device = 'gpu'
|
48 |
+
train_dataloader,iupac_tokenizer = gpt.get_data_loader(is_train=1,dataset_filename = './pubchem_iupac_smile_gpt.csv')
|
49 |
+
pbar = tqdm(train_dataloader) #train_dataloader.cuda()
|
50 |
+
|
51 |
+
|
52 |
+
'''
|
53 |
+
for inputs in pbar:
|
54 |
+
src_label = Variable(inputs["labels"].to(device))
|
55 |
+
inputs = prepare_input(inputs,device)
|
56 |
+
src = Variable(inputs["input_ids"].to(device))
|
57 |
+
#self.tokenizer._convert_token_to_id
|
58 |
+
|
59 |
+
print(src[:,:].shape,src_label)
|
60 |
+
'''
|
61 |
+
tokenizer = iupac_tokenizer
|
62 |
+
#start mark <unk> 2, end mark </s> 1, pad <pad> 0
|
63 |
+
|
64 |
+
iupac_string = "2-amino-9-[4-hydroxy-3-(hydroxymethyl)-2-methylidenecyclopentyl]-1H-purin-6-one"
|
65 |
+
iupac_encoded = tokenizer(iupac_string)
|
66 |
+
iupac_encoded['input_ids'] = [2]+iupac_encoded['input_ids']
|
67 |
+
|
68 |
+
iupac_merges = [tokenizer.decode(i) for i in iupac_encoded['input_ids']]
|
69 |
+
#iupac_encoded['attention_mask']
|
70 |
+
|
71 |
+
print(iupac_encoded['input_ids'])
|
72 |
+
print(iupac_merges)
|
73 |
+
|
74 |
+
print(tokenizer.unk_token_id,tokenizer.eos_token_id,tokenizer.unk_token,tokenizer.eos_token,tokenizer.vocab_size) #2 1 1491
|
75 |
+
# ## Data Module
|
76 |
+
#batch = next(iter(pbar))
|
77 |
+
|
78 |
+
|
79 |
+
# ## GPT-2 Model
|
80 |
+
#
|
81 |
+
# Now we load HuggingFace
|
82 |
+
# [`GPT2LMHeadModel`](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel)
|
83 |
+
# with the configuration composed of previously
|
84 |
+
# defined model hyperparameters. The model processes a mini-batch of input IDs and labels, then
|
85 |
+
# returns predictions and cross-entropy loss between labels and predictions.
|
86 |
+
|
87 |
+
from transformers import GPT2Config, GPT2LMHeadModel
|
88 |
+
|
89 |
+
config = GPT2Config(vocab_size=tokenizer.vocab_size,
|
90 |
+
bos_token_id=tokenizer.unk_token_id,
|
91 |
+
eos_token_id=tokenizer.eos_token_id,
|
92 |
+
n_layer=hyperparams["n_layer"],
|
93 |
+
n_head=hyperparams["n_head"],
|
94 |
+
n_embd=hyperparams["n_embd"],
|
95 |
+
n_positions=hyperparams["max_length"],
|
96 |
+
n_ctx=hyperparams["max_length"])
|
97 |
+
#model = GPT2LMHeadModel(config)
|
98 |
+
|
99 |
+
#model= torch.nn.DataParallel(model.cuda(),device_ids=gpus,output_device=gpus[0])
|
100 |
+
|
101 |
+
#outputs = model(**batch)
|
102 |
+
#print(outputs.keys())
|
103 |
+
|
104 |
+
#['loss', 'logits', 'past_key_values']
|
105 |
+
# ## Trainer
|
106 |
+
#
|
107 |
+
# GPT-2 is trained with autoregressive language modeling objective:
|
108 |
+
# $$
|
109 |
+
# P(\boldsymbol{s}) = P(s_1) \cdot P(s_2 | s_1) \cdots P(s_T | s_1, \ldots, s_{T-1}) =
|
110 |
+
# \prod_{t=1}^{T} P(s_t | s_{j < t}),
|
111 |
+
# $$
|
112 |
+
# where $\boldsymbol{s}$ is a tokenized (encoded) SMILES string, $s_t$ is a token from pretrained
|
113 |
+
# vocabulary $\mathcal{V}$.
|
114 |
+
#
|
115 |
+
# We use `pytorch_lightning.Trainer` to train GPT-2. Since `Trainer` requires lightning modules,
|
116 |
+
# we import our
|
117 |
+
# [`smiles_gpt.GPT2LitModel`](https://github.com/sanjaradylov/smiles-gpt/blob/master/smiles_gpt/language_modeling.py#L10)
|
118 |
+
# wrapper that implements training phases for
|
119 |
+
# `GPT2LMHeadModel`, configures an `Adam` optimizer with `CosineAnnealingLR` scheduler, and
|
120 |
+
# logs average perplexity every epoch.
|
121 |
+
checkpoint = "../checkpoints/iupac"
|
122 |
+
|
123 |
+
model = GPT2LMHeadModel.from_pretrained('./pretrained',local_files_only=True)
|
124 |
+
|
125 |
+
|
126 |
+
from pytorch_lightning import Trainer
|
127 |
+
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
|
128 |
+
|
129 |
+
|
130 |
+
|
131 |
+
trainer = Trainer(
|
132 |
+
gpus=gpus,
|
133 |
+
max_epochs=hyperparams["max_epochs"],
|
134 |
+
callbacks=[EarlyStopping("ppl", 0.1, 3)], #[EarlyStopping("ppl", 0.2, 2)]
|
135 |
+
auto_lr_find=False, # Set to True to search for optimal learning rate.
|
136 |
+
auto_scale_batch_size=False, # Set to True to scale batch size
|
137 |
+
# accelerator="dp" # Uncomment for GPU training.
|
138 |
+
accelerator="gpu", #devices=4,
|
139 |
+
strategy="ddp"
|
140 |
+
)
|
141 |
+
lit_model = gpt.GPT2LitModel(
|
142 |
+
model,
|
143 |
+
batch_size=hyperparams["batch_size"],
|
144 |
+
learning_rate=hyperparams["learning_rate"],
|
145 |
+
final_learning_rate=hyperparams["final_learning_rate"],
|
146 |
+
weight_decay=hyperparams["weight_decay"],
|
147 |
+
adam_eps=hyperparams["adam_eps"],
|
148 |
+
adam_betas=hyperparams["adam_betas"],
|
149 |
+
scheduler_T_max=hyperparams["scheduler_T_max"],
|
150 |
+
save_model_every=1, checkpoint=checkpoint)
|
151 |
+
trainer.fit(lit_model, train_dataloader)
|
152 |
+
|
153 |
+
|
154 |
+
#model.module.save_pretrained('./pretrained')
|
155 |
+
model.save_pretrained('./pretrained')
|
156 |
+
|
157 |
+
# ## Interpretability
|
158 |
+
#
|
159 |
+
# [BertViz](https://github.com/jessevig/bertviz) inspects attention heads of transformers
|
160 |
+
# capturing specific patterns in data. Each head can be representative of some syntactic
|
161 |
+
# or short-/long-term relationships between tokens.
|
162 |
+
|
163 |
+
# In[9]:
|
164 |
+
|
165 |
+
|
166 |
+
import torch
|
167 |
+
from bertviz import head_view
|
168 |
+
|
169 |
+
input_ids_list = iupac_encoded['input_ids']
|
170 |
+
model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)
|
171 |
+
attention = model(torch.LongTensor(input_ids_list))[-1]
|
172 |
+
tokens = [tokenizer.decode(i) for i in input_ids_list]
|
173 |
+
print(input_ids_list,attention,tokens)
|
174 |
+
# Don't worry if a snippet is not displayed---just rerun this cell.
|
175 |
+
head_view(attention, tokens)
|
176 |
+
|
177 |
+
|
178 |
+
|
179 |
+
from bertviz import model_view
|
180 |
+
|
181 |
+
# Don't worry if a snippet is not displayed---just rerun this cell.
|
182 |
+
model_view(attention, tokens)
|
183 |
+
|
184 |
+
|
185 |
+
# ## Sampling
|
186 |
+
#
|
187 |
+
# Finally, we generate novel SMILES strings with top-$p$ sampling$-$i.e., sampling from the
|
188 |
+
# smallest vocabulary subset $\mathcal{V}^{(p)} \subset \mathcal{V}$ s.t. it takes up the most
|
189 |
+
# probable tokens whose cumulative probability mass exceeds $p$, $0 < p < 1$. Model
|
190 |
+
# terminates the procedure upon encountering `"</s>"` or reaching maximum number
|
191 |
+
# `hyperparams["max_length"]`. Special tokens are eventually removed.
|
192 |
+
|
193 |
+
|
194 |
+
|
195 |
+
import tqdm
|
196 |
+
|
197 |
+
model.eval() # Set the base model to evaluation mode.
|
198 |
+
|
199 |
+
generated_smiles_list = []
|
200 |
+
n_generated = 50000
|
201 |
+
|
202 |
+
for _ in tqdm.tqdm(range(n_generated)):
|
203 |
+
# Generate from "<unk>" so that the next token is arbitrary.
|
204 |
+
smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])
|
205 |
+
# Get generated token IDs.
|
206 |
+
generated_ids = model.generate(smiles_start,
|
207 |
+
max_length=hyperparams["max_length"],
|
208 |
+
do_sample=True,top_p=hyperparams["top_p"],
|
209 |
+
repetition_penalty=1.2,
|
210 |
+
pad_token_id=tokenizer.eos_token_id)
|
211 |
+
# Decode the IDs into tokens and remove "<s>" and "</s>".
|
212 |
+
generated_smiles = tokenizer.decode(generated_ids[0],
|
213 |
+
skip_special_tokens=True)
|
214 |
+
generated_smiles_list.append(generated_smiles)
|
215 |
+
|
216 |
+
print(generated_smiles_list[:10])
|
217 |
+
|
218 |
+
|
219 |
+
import numpy as np
|
220 |
+
import pandas as pd
|
221 |
+
|
222 |
+
df2 = pd.DataFrame(generated_smiles_list, columns=['iupac'])
|
223 |
+
|
224 |
+
df2.to_csv("iupacGPT2-gen50K.csv",index=None,mode='a')
|
iupac-gpt/notebooks/iupac_language-modeling_train.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
iupac-gpt/notebooks/iupac_language-modeling_train.py
ADDED
@@ -0,0 +1,231 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# # Generative Pre-Training from Molecules
|
5 |
+
|
6 |
+
import os
|
7 |
+
#os.environ["CUDA_VISIBLE_DEVICES"] = ['1',"2"]
|
8 |
+
from pprint import pprint
|
9 |
+
import sys
|
10 |
+
sys.path.append('/root/autodl-tmp/wjm/iupac-gpt')
|
11 |
+
from tqdm import tqdm
|
12 |
+
try:
|
13 |
+
import iupac_gpt as gpt
|
14 |
+
except ImportError:
|
15 |
+
import sys
|
16 |
+
sys.path.extend([".."]) # Parent directory stores `smiles_gpt` package.
|
17 |
+
import iupac_gpt as gpt
|
18 |
+
import torch
|
19 |
+
|
20 |
+
# For demonstration purposes, we use only a 10K subset of the PubChem data made available by
|
21 |
+
# [ChemBERTa](https://arxiv.org/abs/2010.09885) developers. The original model was pretrained
|
22 |
+
# on the first 5M compounds with the following hyperparameters:
|
23 |
+
# ```python
|
24 |
+
# hyperparams = {"batch_size": 128, "max_epochs": 2, "max_length": 512,
|
25 |
+
# "learning_rate": 5e-4, "weight_decay": 0.0,
|
26 |
+
# "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
|
27 |
+
# "scheduler_T_max": 150_000, "final_learning_rate": 5e-8,
|
28 |
+
# "vocab_size": 1_000, "min_frequency": 2, "top_p": 0.96,
|
29 |
+
# "n_layer": 4, "n_head": 8, "n_embd": 512}
|
30 |
+
# ```
|
31 |
+
# Tokenizer, model, optimizer, scheduler, and trainer hyperparameters.
|
32 |
+
hyperparams = {"batch_size": 128, "max_epochs": 10, "max_length": 1280,
|
33 |
+
"learning_rate": 5e-4, "weight_decay": 0.0,
|
34 |
+
"adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
|
35 |
+
"scheduler_T_max": 1_000, "final_learning_rate": 5e-8,
|
36 |
+
"vocab_size": 1491, "min_frequency": 2, "top_p": 0.96,
|
37 |
+
"n_layer": 8, "n_head": 8, "n_embd": 256}
|
38 |
+
|
39 |
+
gpus = [0,1,2] # Specify either a list of GPU devices or an integer (0 for no GPU).
|
40 |
+
num_workers = 32 # Number of dataloader worker processes.
|
41 |
+
# ## Tokenization
|
42 |
+
#
|
43 |
+
# `smiles_gpt.SMILESBPETokenizer` first splits SMILES strings into characters, runs
|
44 |
+
# byte-pair encoding, and augments the resulting list with `"<s>"` (beginning-of-SMILES) and
|
45 |
+
# `"</s>"` (end-of-SMILES) special tokens. `smiles_gpt.SMILESAlphabet` stores 72 possible
|
46 |
+
# characters as an initial vocabulary.
|
47 |
+
device = 'gpu'
|
48 |
+
train_dataloader,iupac_tokenizer = gpt.get_data_loader(is_train=1,dataset_filename = './pubchem_iupac_smile_gpt_1bw.csv')
|
49 |
+
pbar = tqdm(train_dataloader) #train_dataloader.cuda()
|
50 |
+
|
51 |
+
|
52 |
+
'''
|
53 |
+
for inputs in pbar:
|
54 |
+
src_label = Variable(inputs["labels"].to(device))
|
55 |
+
inputs = prepare_input(inputs,device)
|
56 |
+
src = Variable(inputs["input_ids"].to(device))
|
57 |
+
#self.tokenizer._convert_token_to_id
|
58 |
+
|
59 |
+
print(src[:,:].shape,src_label)
|
60 |
+
'''
|
61 |
+
tokenizer = iupac_tokenizer
|
62 |
+
#start mark <unk> 2, end mark </s> 1, pad <pad> 0
|
63 |
+
|
64 |
+
iupac_string = "2-amino-9-[4-hydroxy-3-(hydroxymethyl)-2-methylidenecyclopentyl]-1H-purin-6-one"
|
65 |
+
iupac_encoded = tokenizer(iupac_string)
|
66 |
+
iupac_encoded['input_ids'] = [2]+iupac_encoded['input_ids']
|
67 |
+
|
68 |
+
iupac_merges = [tokenizer.decode(i) for i in iupac_encoded['input_ids']]
|
69 |
+
#iupac_encoded['attention_mask']
|
70 |
+
|
71 |
+
print(iupac_encoded['input_ids'])
|
72 |
+
print(iupac_merges)
|
73 |
+
|
74 |
+
print(tokenizer.unk_token_id,tokenizer.eos_token_id,tokenizer.unk_token,tokenizer.eos_token,tokenizer.vocab_size) #2 1 1491
|
75 |
+
# ## Data Module
|
76 |
+
batch = next(iter(pbar))
|
77 |
+
|
78 |
+
|
79 |
+
# ## GPT-2 Model
|
80 |
+
#
|
81 |
+
# Now we load HuggingFace
|
82 |
+
# [`GPT2LMHeadModel`](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel)
|
83 |
+
# with the configuration composed of previously
|
84 |
+
# defined model hyperparameters. The model processes a mini-batch of input IDs and labels, then
|
85 |
+
# returns predictions and cross-entropy loss between labels and predictions.
|
86 |
+
|
87 |
+
from transformers import GPT2Config, GPT2LMHeadModel
|
88 |
+
|
89 |
+
config = GPT2Config(vocab_size=tokenizer.vocab_size,
|
90 |
+
bos_token_id=tokenizer.unk_token_id,
|
91 |
+
eos_token_id=tokenizer.eos_token_id,
|
92 |
+
n_layer=hyperparams["n_layer"],
|
93 |
+
n_head=hyperparams["n_head"],
|
94 |
+
n_embd=hyperparams["n_embd"],
|
95 |
+
n_positions=hyperparams["max_length"],
|
96 |
+
n_ctx=hyperparams["max_length"])
|
97 |
+
model = GPT2LMHeadModel(config)
|
98 |
+
|
99 |
+
#model= torch.nn.DataParallel(model.cuda(),device_ids=gpus,output_device=gpus[0])
|
100 |
+
|
101 |
+
outputs = model(**batch)
|
102 |
+
print(outputs.keys())
|
103 |
+
|
104 |
+
#['loss', 'logits', 'past_key_values']
|
105 |
+
# ## Trainer
|
106 |
+
#
|
107 |
+
# GPT-2 is trained with autoregressive language modeling objective:
|
108 |
+
# $$
|
109 |
+
# P(\boldsymbol{s}) = P(s_1) \cdot P(s_2 | s_1) \cdots P(s_T | s_1, \ldots, s_{T-1}) =
|
110 |
+
# \prod_{t=1}^{T} P(s_t | s_{j < t}),
|
111 |
+
# $$
|
112 |
+
# where $\boldsymbol{s}$ is a tokenized (encoded) SMILES string, $s_t$ is a token from pretrained
|
113 |
+
# vocabulary $\mathcal{V}$.
|
114 |
+
#
|
115 |
+
# We use `pytorch_lightning.Trainer` to train GPT-2. Since `Trainer` requires lightning modules,
|
116 |
+
# we import our
|
117 |
+
# [`smiles_gpt.GPT2LitModel`](https://github.com/sanjaradylov/smiles-gpt/blob/master/smiles_gpt/language_modeling.py#L10)
|
118 |
+
# wrapper that implements training phases for
|
119 |
+
# `GPT2LMHeadModel`, configures an `Adam` optimizer with `CosineAnnealingLR` scheduler, and
|
120 |
+
# logs average perplexity every epoch.
|
121 |
+
|
122 |
+
# In[8]:
|
123 |
+
|
124 |
+
|
125 |
+
from pytorch_lightning import Trainer
|
126 |
+
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
|
127 |
+
|
128 |
+
checkpoint = "../checkpoints/iupac"
|
129 |
+
|
130 |
+
trainer = Trainer(
|
131 |
+
gpus=gpus,
|
132 |
+
max_epochs=hyperparams["max_epochs"],
|
133 |
+
callbacks=[EarlyStopping("ppl", 0.1, 3)], #[EarlyStopping("ppl", 0.2, 2)]
|
134 |
+
auto_lr_find=False, # Set to True to search for optimal learning rate.
|
135 |
+
auto_scale_batch_size=False, # Set to True to scale batch size
|
136 |
+
# accelerator="dp" # Uncomment for GPU training.
|
137 |
+
accelerator="gpu", #devices=4,
|
138 |
+
strategy="ddp"
|
139 |
+
)
|
140 |
+
lit_model = gpt.GPT2LitModel(
|
141 |
+
model,
|
142 |
+
batch_size=hyperparams["batch_size"],
|
143 |
+
learning_rate=hyperparams["learning_rate"],
|
144 |
+
final_learning_rate=hyperparams["final_learning_rate"],
|
145 |
+
weight_decay=hyperparams["weight_decay"],
|
146 |
+
adam_eps=hyperparams["adam_eps"],
|
147 |
+
adam_betas=hyperparams["adam_betas"],
|
148 |
+
scheduler_T_max=hyperparams["scheduler_T_max"],
|
149 |
+
save_model_every=1, checkpoint=checkpoint)
|
150 |
+
trainer.fit(lit_model, train_dataloader)
|
151 |
+
|
152 |
+
|
153 |
+
#model.module.save_pretrained('./pretrained')
|
154 |
+
model.save_pretrained('./pretrained')
|
155 |
+
|
156 |
+
# ## Interpretability
|
157 |
+
#
|
158 |
+
# [BertViz](https://github.com/jessevig/bertviz) inspects attention heads of transformers
|
159 |
+
# capturing specific patterns in data. Each head can be representative of some syntactic
|
160 |
+
# or short-/long-term relationships between tokens.
|
161 |
+
|
162 |
+
# In[9]:
|
163 |
+
|
164 |
+
|
165 |
+
import torch
|
166 |
+
from bertviz import head_view
|
167 |
+
|
168 |
+
input_ids_list = iupac_encoded['input_ids']
|
169 |
+
model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)
|
170 |
+
attention = model(torch.LongTensor(input_ids_list))[-1]
|
171 |
+
tokens = [tokenizer.decode(i) for i in input_ids_list]
|
172 |
+
print(input_ids_list,attention,tokens)
|
173 |
+
# Don't worry if a snippet is not displayed---just rerun this cell.
|
174 |
+
head_view(attention, tokens)
|
175 |
+
|
176 |
+
|
177 |
+
|
178 |
+
from bertviz import model_view
|
179 |
+
|
180 |
+
# Don't worry if a snippet is not displayed---just rerun this cell.
|
181 |
+
model_view(attention, tokens)
|
182 |
+
|
183 |
+
|
184 |
+
# ## Sampling
|
185 |
+
#
|
186 |
+
# Finally, we generate novel SMILES strings with top-$p$ sampling$-$i.e., sampling from the
|
187 |
+
# smallest vocabulary subset $\mathcal{V}^{(p)} \subset \mathcal{V}$ s.t. it takes up the most
|
188 |
+
# probable tokens whose cumulative probability mass exceeds $p$, $0 < p < 1$. Model
|
189 |
+
# terminates the procedure upon encountering `"</s>"` or reaching maximum number
|
190 |
+
# `hyperparams["max_length"]`. Special tokens are eventually removed.
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
+
import tqdm
|
195 |
+
|
196 |
+
model.eval() # Set the base model to evaluation mode.
|
197 |
+
|
198 |
+
generated_smiles_list = []
|
199 |
+
n_generated = 30000
|
200 |
+
|
201 |
+
for _ in tqdm.tqdm(range(n_generated)):
|
202 |
+
# Generate from "<unk>" so that the next token is arbitrary.
|
203 |
+
smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])
|
204 |
+
# Get generated token IDs.
|
205 |
+
generated_ids = model.generate(smiles_start,
|
206 |
+
max_length=hyperparams["max_length"],
|
207 |
+
do_sample=True,top_p=hyperparams["top_p"],
|
208 |
+
repetition_penalty=1.2,
|
209 |
+
pad_token_id=tokenizer.eos_token_id)
|
210 |
+
# Decode the IDs into tokens and remove "<s>" and "</s>".
|
211 |
+
generated_smiles = tokenizer.decode(generated_ids[0],
|
212 |
+
skip_special_tokens=True)
|
213 |
+
generated_smiles_list.append(generated_smiles)
|
214 |
+
|
215 |
+
print(generated_smiles_list[:10])
|
216 |
+
|
217 |
+
|
218 |
+
import numpy as np
|
219 |
+
import pandas as pd
|
220 |
+
|
221 |
+
df2 = pd.DataFrame(generated_smiles_list, columns=['iupac'])
|
222 |
+
|
223 |
+
df2.to_csv("iupacGPT2-gen30K.csv",index=None,mode='a')
|
224 |
+
|
225 |
+
|
226 |
+
|
227 |
+
|
228 |
+
|
229 |
+
|
230 |
+
|
231 |
+
|