mao jiashun committed on
Commit 295ff14
1 Parent(s): 1286756

Upload 58 files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +3 -0
  2. iupac-gpt/.gitignore +5 -0
  3. iupac-gpt/LICENSE +32 -0
  4. iupac-gpt/README.md +62 -0
  5. iupac-gpt/checkpoints/iupac/config.json +33 -0
  6. iupac-gpt/checkpoints/iupac/pytorch_model.bin +3 -0
  7. iupac-gpt/class.txt +3 -0
  8. iupac-gpt/data/bbbp.csv +0 -0
  9. iupac-gpt/data/iupacs_logp.csv +0 -0
  10. iupac-gpt/environment.yml +19 -0
  11. iupac-gpt/iupac.txt +3 -0
  12. iupac-gpt/iupacGPT2-gen50K.csv +0 -0
  13. iupac-gpt/iupac_gpt/__init__.py +21 -0
  14. iupac-gpt/iupac_gpt/__pycache__/__init__.cpython-37.pyc +0 -0
  15. iupac-gpt/iupac_gpt/__pycache__/__init__.cpython-38.pyc +0 -0
  16. iupac-gpt/iupac_gpt/__pycache__/classification.cpython-37.pyc +0 -0
  17. iupac-gpt/iupac_gpt/__pycache__/classification.cpython-38.pyc +0 -0
  18. iupac-gpt/iupac_gpt/__pycache__/data.cpython-38.pyc +0 -0
  19. iupac-gpt/iupac_gpt/__pycache__/iupac_dataset.cpython-38.pyc +0 -0
  20. iupac-gpt/iupac_gpt/__pycache__/iupac_dataset_class.cpython-38.pyc +0 -0
  21. iupac-gpt/iupac_gpt/__pycache__/iupac_dataset_pro.cpython-38.pyc +0 -0
  22. iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization.cpython-38.pyc +0 -0
  23. iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_class.cpython-38.pyc +0 -0
  24. iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_iupac.cpython-38.pyc +0 -0
  25. iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_pro.cpython-38.pyc +0 -0
  26. iupac-gpt/iupac_gpt/__pycache__/language_modeling.cpython-38.pyc +0 -0
  27. iupac-gpt/iupac_gpt/__pycache__/tokenization.cpython-38.pyc +0 -0
  28. iupac-gpt/iupac_gpt/classification.py +362 -0
  29. iupac-gpt/iupac_gpt/data.py +269 -0
  30. iupac-gpt/iupac_gpt/iupac_dataset.py +121 -0
  31. iupac-gpt/iupac_gpt/iupac_dataset_class.py +128 -0
  32. iupac-gpt/iupac_gpt/iupac_dataset_pro.py +124 -0
  33. iupac-gpt/iupac_gpt/iupac_spm.model +3 -0
  34. iupac-gpt/iupac_gpt/iupac_spm.vocab +1391 -0
  35. iupac-gpt/iupac_gpt/iupac_tokenization.py +131 -0
  36. iupac-gpt/iupac_gpt/iupac_tokenization_class.py +131 -0
  37. iupac-gpt/iupac_gpt/iupac_tokenization_iupac.py +134 -0
  38. iupac-gpt/iupac_gpt/iupac_tokenization_pro.py +131 -0
  39. iupac-gpt/iupac_gpt/iupacs_logp.csv +0 -0
  40. iupac-gpt/iupac_gpt/language_modeling.py +68 -0
  41. iupac-gpt/iupac_gpt/pubchem_iupac_smile_gpt.csv +3 -0
  42. iupac-gpt/iupac_gpt/real_iupac_tokenizer.pt +3 -0
  43. iupac-gpt/iupac_gpt/tokenization.py +193 -0
  44. iupac-gpt/nohup.out +0 -0
  45. iupac-gpt/notebooks/.ipynb_checkpoints/language-modeling-checkpoint.ipynb +0 -0
  46. iupac-gpt/notebooks/iupac_head_view.html +0 -0
  47. iupac-gpt/notebooks/iupac_language-modeling.py +236 -0
  48. iupac-gpt/notebooks/iupac_language-modeling_retrain.py +224 -0
  49. iupac-gpt/notebooks/iupac_language-modeling_train.ipynb +0 -0
  50. iupac-gpt/notebooks/iupac_language-modeling_train.py +231 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ iupac-gpt/class.txt filter=lfs diff=lfs merge=lfs -text
+ iupac-gpt/iupac_gpt/pubchem_iupac_smile_gpt.csv filter=lfs diff=lfs merge=lfs -text
+ iupac-gpt/iupac.txt filter=lfs diff=lfs merge=lfs -text
iupac-gpt/.gitignore ADDED
@@ -0,0 +1,5 @@
+ **/__pycache__/*
+ **/.idea/*
+ **/.ipynb_checkpoints/*
+ **/lightning_logs/*
+ *.log
iupac-gpt/LICENSE ADDED
@@ -0,0 +1,32 @@
+ The Clear BSD License
+
+ Copyright (c) 2021 Sanjar Adilov
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted (subject to the limitations in the disclaimer
+ below) provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+ THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
iupac-gpt/README.md ADDED
@@ -0,0 +1,62 @@
+ # Generative Pre-Training from Molecules
+
+ Autoregressive transformer language model for drug discovery. (Pre)trained on a large
+ SMILES corpus. Evaluated on molecular property prediction and low-data de novo design
+ tasks.
+
+
+ ## Installation
+
+ Set up [conda](https://conda.io/en/latest/index.html) and create a new environment from
+ `environment.yml` (if needed, make corresponding edits for GPU-compatibility).
+ ```shell
+ conda env create -f environment.yml
+ conda activate smiles-gpt
+ git clone https://github.com/sanjaradylov/smiles-gpt.git
+ cd smiles-gpt
+ ```
+
+
+ ## Benchmark
+
+ ### Checkpoint
+ [checkpoints/benchmark-5m](https://github.com/sanjaradylov/smiles-gpt/tree/master/checkpoints/benchmark-5m)
+ stores the serialized model, tokenizer, and configuration. Do not modify them. Use
+ the `from_pretrained` method to load HuggingFace objects, e.g.,
+ ```python
+ from transformers import GPT2Config, GPT2LMHeadModel, PreTrainedTokenizerFast
+
+ checkpoint = "checkpoints/benchmark-5m"
+
+ config = GPT2Config.from_pretrained(checkpoint)
+ model = GPT2LMHeadModel.from_pretrained(checkpoint)
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(checkpoint)
+ ```
+
+ ### Data
+ [data](https://github.com/sanjaradylov/smiles-gpt/tree/master/data) stores the
+ [Blood-Brain Barrier Penetration](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv)
+ classification dataset and a 10K subset of ChemBERTa's
+ [PubChem-10M](https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/pubchem_10m.txt.zip).
+ See [Examples](#Examples).
+
+ ### Output
+
+ [output](https://github.com/sanjaradylov/smiles-gpt/tree/master/output) stores generated
+ SMILES strings.
+
+ ## Examples
+
+ Adapter training for molecular property prediction
+ (replace the `data/bbbp.csv` and `p_np` arguments with your dataset and task name(s),
+ respectively):
+ ```shell
+ python3 scripts/classification.py checkpoints/benchmark-5m data/bbbp.csv p_np
+ ```
+ For language model pretraining, see
+ [notebooks](https://github.com/sanjaradylov/smiles-gpt/tree/master/notebooks).
+
+ ## Citation
+
+ If you use `smiles-gpt` in your research, please consider citing
+ > https://doi.org/10.33774/chemrxiv-2021-5fwjd
iupac-gpt/checkpoints/iupac/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "activation_function": "gelu_new",
+   "adapters": {
+     "adapters": {},
+     "config_map": {}
+   },
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 2,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 1,
+   "gradient_checkpointing": false,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1280,
+   "n_embd": 256,
+   "n_head": 8,
+   "n_inner": null,
+   "n_layer": 8,
+   "n_positions": 1280,
+   "resid_pdrop": 0.1,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "transformers_version": "2.0.1",
+   "use_cache": true,
+   "vocab_size": 1491
+ }
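The configuration above describes a small GPT-2: 8 layers, 8 attention heads, 256-dimensional embeddings, a 1280-token context, and a 1491-token vocabulary, serialized by adapter-transformers 2.0.1. A minimal loading sketch with HuggingFace `transformers` (the relative path and the plain `GPT2LMHeadModel` route are assumptions; extra keys such as `adapters` are simply carried along on the config object):

```python
# Minimal sketch -- the checkpoint path is an assumption based on the files in this commit.
from transformers import GPT2Config, GPT2LMHeadModel

checkpoint = "iupac-gpt/checkpoints/iupac"  # directory with config.json + pytorch_model.bin (LFS)

config = GPT2Config.from_pretrained(checkpoint)      # n_layer=8, n_embd=256, vocab_size=1491
model = GPT2LMHeadModel.from_pretrained(checkpoint)  # weights from the ~41 MB pytorch_model.bin
print(config.n_positions, model.num_parameters())
```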
iupac-gpt/checkpoints/iupac/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0aea87ea0c2a89b9a15bfd4682615df1d64f37c25748684d117ecc933153950f
+ size 41264861
iupac-gpt/class.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83bbfd23faf47bacfdec9db23eed87b0d20488da7d2c84d838d0d77e7f2c58d5
+ size 12317646
iupac-gpt/data/bbbp.csv ADDED
The diff for this file is too large to render. See raw diff
 
iupac-gpt/data/iupacs_logp.csv ADDED
The diff for this file is too large to render. See raw diff
 
iupac-gpt/environment.yml ADDED
@@ -0,0 +1,19 @@
+ name: smiles-gpt
+ channels:
+   - pytorch
+   - anaconda
+   - conda-forge
+ dependencies:
+   - python=3.8
+   - pip
+   - pandas
+   - rdkit
+   - pytorch
+   - torchvision
+   - torchaudio
+   - cpuonly
+   - pip:
+     - tokenizers
+     - adapter-transformers
+     - pytorch-lightning
+     - bertviz
iupac-gpt/iupac.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9bead0044a324634255bf5675f623fbd3a0b6babb51da7ca63870b6bf87f800a
+ size 156486208
iupac-gpt/iupacGPT2-gen50K.csv ADDED
The diff for this file is too large to render. See raw diff
 
iupac-gpt/iupac_gpt/__init__.py ADDED
@@ -0,0 +1,21 @@
+ """`smiles_gpt` implements transformer models for molecule generation and molecular-
+ property prediction.
+ """
+
+ __author__ = "Sanjar Ad[iy]lov"
+ __version__ = "1.0.0-pub"
+
+ from . import classification, data, language_modeling, tokenization
+ from .classification import (ClassifierLitModel, RegressorLitModel,
+                              GPT2ForSequenceClassification)
+ from .data import CSVDataModule, CVSplitter, LMDataModule
+ from .language_modeling import GPT2LitModel
+ from .tokenization import SMILESBPETokenizer, SMILESAlphabet
+ from .iupac_tokenization_iupac import get_data_loader, prepare_input
+ from .iupac_tokenization_pro import get_data_loader_pro, prepare_input_pro
+ from .iupac_tokenization_class import get_data_loader_class, prepare_input_class
+
+ __all__ = ("classification", "data", "tokenization",
+            "ClassifierLitModel", "CSVDataModule", "CVSplitter",
+            "GPT2ForSequenceClassification", "GPT2LitModel", "LMDataModule",
+            "RegressorLitModel", "SMILESBPETokenizer", "SMILESAlphabet")
iupac-gpt/iupac_gpt/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (1.07 kB).

iupac-gpt/iupac_gpt/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (1.08 kB).

iupac-gpt/iupac_gpt/__pycache__/classification.cpython-37.pyc ADDED
Binary file (13.3 kB).

iupac-gpt/iupac_gpt/__pycache__/classification.cpython-38.pyc ADDED
Binary file (13.3 kB).

iupac-gpt/iupac_gpt/__pycache__/data.cpython-38.pyc ADDED
Binary file (11.1 kB).

iupac-gpt/iupac_gpt/__pycache__/iupac_dataset.cpython-38.pyc ADDED
Binary file (3.09 kB).

iupac-gpt/iupac_gpt/__pycache__/iupac_dataset_class.cpython-38.pyc ADDED
Binary file (3.22 kB).

iupac-gpt/iupac_gpt/__pycache__/iupac_dataset_pro.cpython-38.pyc ADDED
Binary file (3.2 kB).

iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization.cpython-38.pyc ADDED
Binary file (5.1 kB).

iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_class.cpython-38.pyc ADDED
Binary file (5.08 kB).

iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_iupac.cpython-38.pyc ADDED
Binary file (5.09 kB).

iupac-gpt/iupac_gpt/__pycache__/iupac_tokenization_pro.cpython-38.pyc ADDED
Binary file (5.11 kB).

iupac-gpt/iupac_gpt/__pycache__/language_modeling.cpython-38.pyc ADDED
Binary file (3.37 kB).

iupac-gpt/iupac_gpt/__pycache__/tokenization.cpython-38.pyc ADDED
Binary file (7.44 kB).
iupac-gpt/iupac_gpt/classification.py ADDED
@@ -0,0 +1,362 @@
1
+ """HuggingFace-compatible classification and regression models including
2
+ pytorch-lightning models.
3
+ """
4
+
5
+ __all__ = ("BypassNet", "ClassificationHead", "ClassifierLitModel",
6
+ "GPT2ForSequenceClassification", "RegressorLitModel",
7
+ "SequenceClassifierOutput")
8
+
9
+ from dataclasses import dataclass
10
+ from typing import List, Optional
11
+
12
+ import pytorch_lightning as pl
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ from torchmetrics import AUROC, AveragePrecision
17
+ from transformers import AdamW, GPT2Model, GPT2PreTrainedModel
18
+ from transformers.modeling_outputs import SequenceClassifierOutputWithPast
19
+ from transformers.adapters.model_mixin import ModelWithHeadsAdaptersMixin
20
+
21
+
22
+ @dataclass
23
+ class SequenceClassifierOutput(SequenceClassifierOutputWithPast):
24
+ target: Optional[torch.LongTensor] = None
25
+
26
+
27
+ class GPT2ForSequenceClassification(ModelWithHeadsAdaptersMixin, GPT2PreTrainedModel):
28
+ """HuggingFace-compatible single- and multi-output (-task) classification model.
29
+ `config` must be a `GPT2Config` instance with additional `num_tasks` and `num_labels`
30
+ properties. For multi-task classification, the output is Bypass network with the
31
+ reduction factor = `config.n_embd // config.n_head`.
32
+ """
33
+
34
+ _keys_to_ignore_on_load_missing = [
35
+ r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight", r"output\..*"]
36
+
37
+ def __init__(self, config):
38
+ super().__init__(config)
39
+
40
+ self.num_tasks = config.num_tasks
41
+ self.num_labels = config.num_labels
42
+
43
+ self.transformer = GPT2Model(config)
44
+
45
+ if self.num_tasks > 1:
46
+ self.output = BypassNet(
47
+ config.n_embd, config.n_embd // config.n_head,
48
+ config.num_tasks, config.num_labels,
49
+ config.embd_pdrop)
50
+ else:
51
+ self.output = ClassificationHead(
52
+ config.n_embd, config.n_embd // config.n_head,
53
+ config.num_labels, config.embd_pdrop)
54
+
55
+ self.init_weights()
56
+
57
+ def forward(self, input_ids=None, past_key_values=None, attention_mask=None,
58
+ token_type_ids=None, position_ids=None, head_mask=None,
59
+ inputs_embeds=None, labels=None, use_cache=None, output_attentions=None,
60
+ output_hidden_states=None, return_dict=None, adapter_names=None,
61
+ label_mask=None):
62
+ return_dict = return_dict or self.config.use_return_dict
63
+
64
+ transformer_outputs = self.transformer(
65
+ input_ids, past_key_values=past_key_values, attention_mask=attention_mask,
66
+ token_type_ids=token_type_ids, position_ids=position_ids,
67
+ head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache,
68
+ output_attentions=output_attentions,
69
+ output_hidden_states=output_hidden_states, return_dict=return_dict,
70
+ adapter_names=adapter_names)
71
+
72
+ hidden_states = transformer_outputs[0]
73
+
74
+ if input_ids is not None:
75
+ batch_size, sequence_length = input_ids.shape[:2]
76
+ else:
77
+ batch_size, sequence_length = inputs_embeds.shape[:2]
78
+
79
+ assert self.config.pad_token_id is not None or batch_size == 1, \
80
+ "Cannot handle batch sizes > 1 if no padding token is defined."
81
+ if self.config.pad_token_id is None:
82
+ sequence_lengths = -1
83
+ else:
84
+ if input_ids is not None:
85
+ sequence_lengths = torch.ne(
86
+ input_ids, self.config.pad_token_id).sum(-1) - 1
87
+ else:
88
+ sequence_lengths = -1
89
+
90
+ if self.num_tasks == 1:
91
+ logits = self.output(hidden_states)[range(batch_size), sequence_lengths]
92
+ else:
93
+ logits = self.output(hidden_states, batch_size, sequence_lengths)
94
+
95
+ loss = None
96
+ if labels is not None:
97
+ if self.num_labels == 2:
98
+ if label_mask is not None:
99
+ nonempty_tasks = (label_mask == 1).view(-1)
100
+ nonempty_logits = logits.view(-1, self.num_labels)[nonempty_tasks, :]
101
+ nonempty_labels = labels.view(-1)[nonempty_tasks]
102
+ else:
103
+ nonempty_logits = logits.view(-1, self.num_labels)
104
+ nonempty_labels = labels.view(-1)
105
+
106
+ if len(labels.size()) == 1:
107
+ labels = labels.reshape(1, -1)
108
+
109
+ loss = F.cross_entropy(nonempty_logits, nonempty_labels)
110
+ elif self.num_labels == 1:
111
+ loss = F.mse_loss(logits.view(-1), labels.view(-1))
112
+ else:
113
+ raise NotImplementedError(
114
+ "Only binary classification and regression supported.")
115
+
116
+ if self.num_tasks > 1:
117
+ logits = logits.transpose(1, 2)
118
+
119
+ if labels is not None and self.num_labels == 2 and self.num_tasks == 1:
120
+ if label_mask is not None:
121
+ labels = labels.view(-1)
122
+ else:
123
+ labels = nonempty_labels
124
+
125
+ if not return_dict:
126
+ output = (logits,) + transformer_outputs[1:]
127
+ return ((loss,) + output) if loss is not None else output
128
+
129
+ return SequenceClassifierOutput(
130
+ loss=loss, logits=logits, target=labels,
131
+ past_key_values=transformer_outputs.past_key_values,
132
+ hidden_states=transformer_outputs.hidden_states,
133
+ attentions=transformer_outputs.attentions)
134
+
135
+
136
+ class BypassNet(nn.Module):
137
+ """Bypass multi-task network from MoleculeNet project [Wu et al., 2018].
138
+ """
139
+
140
+ def __init__(self, hidden_size: int, intermediate_size: int,
141
+ num_tasks: int, num_labels: int = 2,
142
+ dropout: float = 0.2, use_bias: bool = False):
143
+ super().__init__()
144
+ self.independent = nn.ModuleList([
145
+ ClassificationHead(hidden_size, intermediate_size,
146
+ num_labels, dropout, use_bias)
147
+ for _ in range(num_tasks)])
148
+ self.shared = ClassificationHead(hidden_size, intermediate_size,
149
+ num_labels, dropout, use_bias)
150
+
151
+ def forward(self, hidden_states, batch_size, sequence_lengths):
152
+ logits_list: List[torch.Tensor] = []
153
+ for layer in self.independent:
154
+ logits_list.append(layer(hidden_states))
155
+ shared_logits: torch.Tensor = self.shared(hidden_states)
156
+ for i in range(len(logits_list)):
157
+ logits_list[i] = (logits_list[i] + shared_logits)[range(batch_size),
158
+ sequence_lengths]
159
+ return torch.stack(logits_list, dim=1)
160
+
161
+
162
+ class ClassificationHead(nn.Module):
163
+ """Two-layer feed-forward network with GELU activation and intermediate dropout.
164
+ """
165
+
166
+ def __init__(self, hidden_size: int, intermediate_size: int,
167
+ num_labels: int, dropout: float = 0.0, use_bias: bool = False):
168
+ super().__init__()
169
+ self.dense = nn.Linear(hidden_size, intermediate_size, bias=use_bias)
170
+ self.act = nn.GELU()
171
+ self.dropout = nn.Dropout(dropout)
172
+ self.out_proj = nn.Linear(intermediate_size, num_labels, bias=use_bias)
173
+
174
+ def forward(self, x, *args, **kwargs):
175
+ x = self.dense(x)
176
+ x = self.act(x)
177
+ x = self.dropout(x)
178
+ return self.out_proj(x)
179
+
180
+
181
+ class ClassifierLitModel(pl.LightningModule):
182
+ """Pytorch-lightning module for single- or multi-task classification. Trains GPT2
183
+ model using `AdamW` optimizer with exponential LR scheduler. Evaluates valid and
184
+ test data on AUC-ROC and AUC-PRC.
185
+
186
+ Args:
187
+ transformer (`GPT2Model`): (Pretrained) HuggingFace GPT2 model.
188
+ num_tasks (int): The number of classification tasks.
189
+ has_empty_labels (bool)
190
+ batch_size (int)
191
+ learning_rate (float)
192
+ scheduler_lambda (float)
193
+ scheduler_step (int)
194
+ weight_decay (float)
195
+ """
196
+
197
+ def __init__(self, transformer: GPT2Model, num_tasks: int, has_empty_labels: bool,
198
+ batch_size: int, learning_rate: float, scheduler_lambda: float,
199
+ scheduler_step: int, weight_decay: float, *args, **kwargs):
200
+ super().__init__()
201
+
202
+ self.save_hyperparameters(ignore=("transformer", "num_tasks", "has_empty_labels"))
203
+ self.transformer = transformer
204
+ self.num_tasks = num_tasks
205
+
206
+ def get_metrics(metric_cls):
207
+ return [metric_cls(num_classes=2) for _ in range(num_tasks)]
208
+
209
+ if has_empty_labels:
210
+ self.train_roc = get_metrics(AUROC)
211
+ self.val_roc = get_metrics(AUROC)
212
+ self.test_roc = get_metrics(AUROC)
213
+
214
+ self.train_prc = get_metrics(AveragePrecision)
215
+ self.val_prc = get_metrics(AveragePrecision)
216
+ self.test_prc = get_metrics(AveragePrecision)
217
+
218
+ self.step = self._step_empty
219
+ self.epoch_end = self._epoch_end_empty
220
+ else:
221
+ #self.train_roc = AUROC(num_classes=2)
222
+ #self.val_roc = AUROC(num_classes=2)
223
+ #self.test_roc = AUROC(num_classes=2)
224
+
225
+ #self.train_prc = AveragePrecision(num_classes=2)
226
+ #self.val_prc = AveragePrecision(num_classes=2)
227
+ #self.test_prc = AveragePrecision(num_classes=2)
228
+
229
+ self.train_roc = AUROC(task='multiclass',num_classes=2)
230
+ self.val_roc = AUROC(task='multiclass',num_classes=2)
231
+ self.test_roc = AUROC(task='multiclass',num_classes=2)
232
+
233
+ self.train_prc = AveragePrecision(task='multiclass',num_classes=2)
234
+ self.val_prc = AveragePrecision(task='multiclass',num_classes=2)
235
+ self.test_prc = AveragePrecision(task='multiclass',num_classes=2)
236
+
237
+ self.step = self._step_nonempty
238
+ self.epoch_end = self._epoch_end_nonempty
239
+
240
+ def forward(self, *args, **kwargs):
241
+ return self.transformer(*args, **kwargs)
242
+
243
+ def _step_empty(self, batch, batch_idx, roc, prc):
244
+ outputs = self(**batch)
245
+
246
+ if self.num_tasks == 1:
247
+ outputs["target"] = outputs["target"][:, None]
248
+ outputs["logits"] = outputs["logits"][:, :, None]
249
+
250
+ for task_id in range(self.num_tasks):
251
+ target = outputs["target"][:, task_id]
252
+ nonempty_entries = target != -1
253
+ target = target[nonempty_entries]
254
+
255
+ if target.unique().size(0) > 1:
256
+ logits = outputs["logits"][:, :, task_id][nonempty_entries]
257
+
258
+ roc[task_id](logits, target)
259
+ prc[task_id](logits, target)
260
+
261
+ return {"loss": outputs["loss"]}
262
+
263
+ def _step_nonempty(self, batch, batch_idx, roc, prc):
264
+ outputs = self(**batch)
265
+
266
+ logits, target = outputs["logits"], outputs["target"]
267
+ if target.unique().size(0) > 1:
268
+ roc(logits, target)
269
+ prc(logits, target)
270
+
271
+ return {"loss": outputs["loss"]}
272
+
273
+ def _epoch_end_empty(self, outputs_ignored, roc, prc, prefix):
274
+ mean_roc = sum(a.compute() for a in roc) / self.num_tasks
275
+ self.log(f"{prefix}_roc", mean_roc, on_step=False, on_epoch=True, prog_bar=True)
276
+ mean_prc = sum(p.compute() for p in prc) / self.num_tasks #p.compute()[1]
277
+ self.log(f"{prefix}_prc", mean_prc, on_step=False, on_epoch=True, prog_bar=True)
278
+
279
+ def _epoch_end_nonempty(self, outputs, roc, prc, prefix):
280
+ self.log(f"{prefix}_roc", roc.compute(),
281
+ on_step=False, on_epoch=True, prog_bar=True)
282
+ self.log(f"{prefix}_prc", prc.compute(), #prc.compute()[1]
283
+ on_step=False, on_epoch=True, prog_bar=True)
284
+
285
+ def training_step(self, batch, batch_idx):
286
+ return self.step(batch, batch_idx, self.train_roc, self.train_prc)
287
+
288
+ def training_epoch_end(self, outputs):
289
+ self.epoch_end(outputs, self.train_roc, self.train_prc, "train")
290
+
291
+ def validation_step(self, batch, batch_idx):
292
+ return self.step(batch, batch_idx, self.val_roc, self.val_prc)
293
+
294
+ def validation_epoch_end(self, outputs):
295
+ self.epoch_end(outputs, self.val_roc, self.val_prc, "val")
296
+
297
+ def test_step(self, batch, batch_idx):
298
+ self.step(batch, batch_idx, self.test_roc, self.test_prc)
299
+
300
+ def test_epoch_end(self, outputs):
301
+ self.epoch_end(outputs, self.test_roc, self.test_prc, "test")
302
+
303
+ def configure_optimizers(self):
304
+ optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate,
305
+ weight_decay=self.hparams.weight_decay)
306
+ lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
307
+ optimizer, self.hparams.scheduler_lambda)
308
+ return {"optimizer": optimizer,
309
+ "lr_scheduler": {"scheduler": lr_scheduler,
310
+ "interval": "step",
311
+ "frequency": self.hparams.scheduler_step}}
312
+
313
+
314
+ class RegressorLitModel(pl.LightningModule):
315
+ def __init__(self, transformer: GPT2Model,
316
+ batch_size: int, learning_rate: float, scheduler_lambda: float,
317
+ scheduler_step: int, weight_decay: float, *args, **kwargs):
318
+ super().__init__()
319
+
320
+ self.save_hyperparameters(ignore="transformer")
321
+ self.transformer = transformer
322
+
323
+ def forward(self, *args, **kwargs):
324
+ return self.transformer(*args, **kwargs)
326
+
327
+ def step(self, batch, batch_idx):
328
+ outputs = self(**batch)
329
+ rmse_loss = torch.sqrt(outputs["loss"])
330
+ return {"loss": rmse_loss}
331
+
332
+ def epoch_end(self, outputs, prefix):
333
+ mean_rmse = torch.mean(torch.tensor([out["loss"] for out in outputs]))
334
+ self.log(f"{prefix}_rmse", mean_rmse, on_step=False, on_epoch=True, prog_bar=True)
335
+
336
+ def training_step(self, batch, batch_idx):
337
+ return self.step(batch, batch_idx)
338
+
339
+ def training_epoch_end(self, outputs):
340
+ self.epoch_end(outputs, "train")
341
+
342
+ def validation_step(self, batch, batch_idx):
343
+ return self.step(batch, batch_idx)
344
+
345
+ def validation_epoch_end(self, outputs):
346
+ self.epoch_end(outputs, "val")
347
+
348
+ def test_step(self, batch, batch_idx):
349
+ return self.step(batch, batch_idx)
350
+
351
+ def test_epoch_end(self, outputs):
352
+ self.epoch_end(outputs, "test")
353
+
354
+ def configure_optimizers(self):
355
+ optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate,
356
+ weight_decay=self.hparams.weight_decay)
357
+ lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
358
+ optimizer, self.hparams.scheduler_lambda)
359
+ return {"optimizer": optimizer,
360
+ "lr_scheduler": {"scheduler": lr_scheduler,
361
+ "interval": "step",
362
+ "frequency": self.hparams.scheduler_step}}
iupac-gpt/iupac_gpt/data.py ADDED
@@ -0,0 +1,269 @@
1
+ """Loads torch-compatible data sets and lightning-compatible data modules.
2
+ """
3
+
4
+ __all__ = ("CSVDataset", "CSVDataModule", "CVSplitter", "LMDataset", "LMDataModule")
5
+
6
+ from collections import defaultdict
7
+ from dataclasses import dataclass
8
+ from functools import partial
9
+ from pathlib import Path
10
+ from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Union
11
+
12
+ import torch
13
+ from pytorch_lightning import LightningDataModule
14
+ from sklearn.model_selection import ShuffleSplit
15
+ from tokenizers.implementations import BaseTokenizer
16
+ from transformers import PreTrainedTokenizerFast
17
+ from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
18
+ from torch.utils.data import Dataset, DataLoader
19
+
20
+
21
+ @dataclass(init=True, repr=True, eq=False, frozen=False)
22
+ class CSVDataset(Dataset):
23
+ """Stores `pandas.DataFrame` instance of tabular data and retrieves encoded token
24
+ ids and attention mask. Optionally returns labels and their masks.
25
+
26
+ Args:
27
+ dataframe (`pandas.DataFrame`):
28
+ Data frame of SMILES strings and their (multi-task) labels.
29
+ tokenizer (`tokenizers.BaseTokenizer` or `SMILESBPETokenizer`)
30
+ SMILES tokenizer.
31
+ smiles_column (`str`, defaults to "smiles"):
32
+ Column name of SMILES strings in `dataframe`.
33
+ target_column (`str` or `list` of `str`, defaults to `None`):
34
+ Target column(s). If `None`, labels are ignored.
35
+ has_empty_target (`bool`, defaults to `False`):
36
+ Whether entries have empty target values. If `True`, additionally retrieves
37
+ a target mask.
38
+ task_type ("classification" or "regression", defaults to "classification")
39
+ encode_kwargs (dict, defaults to {"truncation": True})
40
+ Positional arguments for `tokenizer` encoding, e.g. {"padding": True}.
41
+ """
42
+
43
+ dataframe: "pandas.DataFrame"
44
+ tokenizer: BaseTokenizer
45
+ smiles_column: str = 'smiles'
46
+ target_column: Union[None, str, List[str]] = None
47
+ has_empty_target: bool = False
48
+ task_type: Literal["classification", "regression"] = "classification"
49
+ encode_kwargs: Optional[Dict[str, Any]] = None
50
+
51
+ def __post_init__(self) -> None:
52
+ if isinstance(self.tokenizer, PreTrainedTokenizerFast):
53
+ self._encode = partial(self.tokenizer.__call__, add_special_tokens=False)
54
+ self._id_key = "input_ids"
55
+ else:
56
+ self._encode = self.tokenizer.encode
57
+ self._id_key = "ids"
58
+ self.encode_kwargs = self.encode_kwargs or {"truncation": True}
59
+ self._encode = partial(self._encode, **self.encode_kwargs)
60
+
61
+ def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
62
+ """Returns dict of encoded token IDs, attention mask, and optionally labels
63
+ and label mask.
64
+ """
65
+ item: Dict[str, torch.Tensor] = {}
66
+
67
+ smiles = self.dataframe.iloc[index][self.smiles_column]
68
+ encodings = self._encode(smiles)
69
+ item["input_ids"] = torch.LongTensor(getattr(encodings, self._id_key))
70
+ item["attention_mask"] = torch.LongTensor(getattr(encodings, "attention_mask"))
71
+
72
+ if self.target_column is not None:
73
+ labels = self.dataframe.iloc[index][self.target_column]
74
+ if self.has_empty_target:
75
+ label_mask = ~labels.isna()
76
+ labels = labels.fillna(-1)
77
+ item["label_mask"] = torch.BoolTensor(label_mask)
78
+ if self.task_type == "regression":
79
+ tensor_type = torch.FloatTensor
80
+ elif self.task_type == "classification":
81
+ tensor_type = torch.LongTensor
82
+ else:
83
+ raise NotImplementedError("`CSVDataset` supports only classification and "
84
+ "regression tasks")
85
+ item["labels"] = tensor_type(labels)
86
+
87
+ return item
88
+
89
+ def __len__(self) -> int:
90
+ return self.dataframe.shape[0]
91
+
92
+
93
+ @dataclass(init=True, eq=True, repr=True, frozen=False)
94
+ class CVSplitter:
95
+ """Splits series of SMILES data with either random or scaffold splitting.
96
+ """
97
+
98
+ mode: str = "random"
99
+ train_size: float = 0.8
100
+ val_size: float = 0.1
101
+ test_size: float = 0.1
102
+
103
+ def __post_init__(self) -> None:
104
+ if self.mode == "scaffold":
105
+ self.train_val_test_split = self.scaffold_split
106
+ elif self.mode == "random":
107
+ self.train_val_test_split = self.random_split
108
+
109
+ @staticmethod
110
+ def get_sorted_scaffolds(smiles_seqs: Sequence[str]):
111
+ from rdkit.Chem import MolFromSmiles
112
+ from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
113
+
114
+ scaffolds: Dict[str, List[int]] = defaultdict(list)
115
+ molecules = (MolFromSmiles(s, sanitize=True) for s in smiles_seqs)
116
+
117
+ for i, molecule in enumerate(molecules):
118
+ try:
119
+ scaffold = MurckoScaffoldSmiles(mol=molecule, includeChirality=False)
120
+ scaffolds[scaffold].append(i)
121
+ except Exception: # Really don't know what exception is raised...
122
+ pass
123
+
124
+ scaffolds = {scaffold: sorted(ids) for scaffold, ids in scaffolds.items()}
125
+ scaffold_sets = [scaffold_set
126
+ for scaffold, scaffold_set in
127
+ sorted(scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]),
128
+ reverse=True)]
129
+ return scaffold_sets
130
+
131
+ def scaffold_split(self, smiles_seqs: Sequence[str]) \
132
+ -> Tuple[List[int], List[int], List[int]]:
133
+ scaffold_sets = self.get_sorted_scaffolds(smiles_seqs)
134
+
135
+ n_samples = len(smiles_seqs)
136
+ train_idx, val_idx, test_idx = [], [], []
137
+ train_cutoff = int(self.train_size * n_samples)
138
+ val_cutoff = int((self.train_size + self.val_size) * n_samples)
139
+
140
+ for group_indices in scaffold_sets:
141
+ n_group = len(group_indices)
142
+ n_train = len(train_idx)
143
+ if n_train + n_group > train_cutoff:
144
+ n_val = len(val_idx)
145
+ if n_train + n_val + n_group > val_cutoff:
146
+ test_idx.extend(group_indices)
147
+ else:
148
+ val_idx.extend(group_indices)
149
+ else:
150
+ train_idx.extend(group_indices)
151
+
152
+ return train_idx, val_idx, test_idx
153
+
154
+ def random_split(self, smiles_seqs: "pandas.Series") \
155
+ -> Tuple["numpy.array", "numpy.array", "numpy.array"]:
156
+ cv = ShuffleSplit(train_size=self.train_size + self.val_size)
157
+ train_idx, val_idx = next(cv.split(smiles_seqs))
158
+ cv.train_size = 1 - self.test_size / (self.train_size + self.val_size)
159
+ train_idx, test_idx = next(cv.split(smiles_seqs.iloc[train_idx]))
160
+
161
+ return train_idx, val_idx, test_idx
162
+
163
+
164
+ @dataclass(init=True, repr=True, eq=False, frozen=False)
165
+ class CSVDataModule(LightningDataModule):
166
+ """Lightning data module for tabular data. Accepts pandas `dataframe`, splits the
167
+ data into train/valid/test with `splitter`, creates `CSVDataset`s and Pytorch
168
+ `DataLoader`s with `DataCollatorWithPadding` collate function.
169
+ """
170
+
171
+ dataframe: "pandas.DataFrame"
172
+ tokenizer: BaseTokenizer
173
+ smiles_column: str = "smiles"
174
+ target_column: Union[None, str, List[str]] = None
175
+ has_empty_target: bool = False
176
+ task_type: Literal["classification", "regression"] = "classification"
177
+ splitter: CVSplitter = CVSplitter()
178
+ batch_size: int = 16
179
+ num_workers: int = 0
180
+
181
+ def __post_init__(self) -> None:
182
+ super().__init__()
183
+ self.train_dataset: Optional[CSVDataset] = None
184
+ self.val_dataset: Optional[CSVDataset] = None
185
+ self.test_dataset: Optional[CSVDataset] = None
186
+ self.collate_fn: Callable = DataCollatorWithPadding(self.tokenizer)
187
+
188
+ def setup(self, stage: Optional[str] = None) -> None:
189
+ train_idx, val_idx, test_idx = self.splitter.train_val_test_split(
190
+ self.dataframe[self.smiles_column])
191
+
192
+ train_dataframe = self.dataframe.iloc[train_idx].reset_index(drop=True)
193
+ self.train_dataset = CSVDataset(train_dataframe, self.tokenizer,
194
+ self.smiles_column, self.target_column,
195
+ self.has_empty_target, self.task_type)
196
+ valid_dataframe = self.dataframe.iloc[val_idx].reset_index(drop=True)
197
+ self.val_dataset = CSVDataset(valid_dataframe, self.tokenizer,
198
+ self.smiles_column, self.target_column,
199
+ self.has_empty_target, self.task_type)
200
+ test_dataframe = self.dataframe.iloc[test_idx].reset_index(drop=True)
201
+ self.test_dataset = CSVDataset(test_dataframe, self.tokenizer,
202
+ self.smiles_column, self.target_column,
203
+ self.has_empty_target, self.task_type)
204
+
205
+ def train_dataloader(self) -> Union[DataLoader, List[DataLoader],
206
+ Dict[str, DataLoader]]:
207
+ return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True,
208
+ collate_fn=self.collate_fn, num_workers=self.num_workers)
209
+
210
+ def val_dataloader(self) -> Union[DataLoader, List[DataLoader],
211
+ Dict[str, DataLoader]]:
212
+ return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False,
213
+ collate_fn=self.collate_fn, num_workers=self.num_workers)
214
+
215
+ def test_dataloader(self) -> Union[DataLoader, List[DataLoader],
216
+ Dict[str, DataLoader]]:
217
+ return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False,
218
+ collate_fn=self.collate_fn, num_workers=self.num_workers)
219
+
220
+
221
+ @dataclass(init=True, eq=False, repr=True, frozen=False)
222
+ class LMDataset(Dataset):
223
+ """Simple sequential dataset for autoregressive language modeling.
224
+ """
225
+
226
+ filename: str
227
+ tokenizer: BaseTokenizer
228
+
229
+ def __post_init__(self) -> None:
230
+ self.smiles_strings = Path(self.filename).read_text(encoding='ascii').splitlines()
231
+
232
+ if isinstance(self.tokenizer, PreTrainedTokenizerFast):
233
+ self._encode = partial(self.tokenizer.__call__, truncation=True)
234
+ self._id_key = "input_ids"
235
+ else:
236
+ self._encode = self.tokenizer.encode
237
+ self._id_key = "ids"
238
+
239
+ def __len__(self) -> int:
240
+ return len(self.smiles_strings)
241
+
242
+ def __getitem__(self, i: int) -> torch.Tensor:
243
+ encodings = self._encode(self.smiles_strings[i])
244
+ return torch.LongTensor(getattr(encodings, self._id_key))
245
+
246
+
247
+ @dataclass(init=True, repr=True, eq=False, frozen=False)
248
+ class LMDataModule(LightningDataModule):
249
+ """Lightning data module for autoregressive language modeling.
250
+ """
251
+
252
+ filename: str
253
+ tokenizer: BaseTokenizer
254
+ batch_size: int = 128
255
+ num_workers: int = 0
256
+ collate_fn: Union[None, Literal["default"], Callable] = "default"
257
+
258
+ def __post_init__(self) -> None:
259
+ super().__init__()
260
+ if self.collate_fn == "default":
261
+ self.collate_fn = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
262
+
263
+ def setup(self, stage: Optional[str] = None) -> None:
264
+ self.dataset = LMDataset(self.filename, self.tokenizer)
265
+
266
+ def train_dataloader(self) -> Union[DataLoader, List[DataLoader],
267
+ Dict[str, DataLoader]]:
268
+ return DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True,
269
+ collate_fn=self.collate_fn, num_workers=self.num_workers)
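`CSVDataModule` above glues a pandas frame, a tokenizer, and a `CVSplitter` into padded train/val/test `DataLoader`s. A usage sketch under stated assumptions: the BBBP file exposes `smiles` and `p_np` columns (as in the README example), and a fast tokenizer with a padding token is available; the tokenizer path below is hypothetical, since the iupac checkpoint in this commit ships no tokenizer files:

```python
# Sketch: the tokenizer path and column names are assumptions.
import pandas as pd
from transformers import PreTrainedTokenizerFast
from iupac_gpt import CSVDataModule, CVSplitter

tokenizer = PreTrainedTokenizerFast.from_pretrained("checkpoints/benchmark-5m")  # hypothetical
dataframe = pd.read_csv("iupac-gpt/data/bbbp.csv")

datamodule = CSVDataModule(dataframe=dataframe, tokenizer=tokenizer,
                           smiles_column="smiles", target_column=["p_np"],
                           splitter=CVSplitter(mode="scaffold"),  # Murcko-scaffold split
                           batch_size=16)
datamodule.setup()
batch = next(iter(datamodule.train_dataloader()))
print(batch["input_ids"].shape, batch["labels"].shape)
```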
iupac-gpt/iupac_gpt/iupac_dataset.py ADDED
@@ -0,0 +1,121 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import random
5
+ from itertools import chain
6
+ from collections import Counter
7
+ import numpy as np
8
+ import torch
9
+ from torch.nn.utils.rnn import pad_sequence
10
+ from transformers.data.data_collator import DataCollator
11
+ from multiprocessing import Pool
12
+ import mmap
13
+ from torch.utils.data import Dataset
14
+
15
+ class IUPACDataset(Dataset):
16
+ def __init__(self, dataset_dir='./',dataset_filename="iupacs_logp.txt", tokenizer=None,max_length=None,target_col=None,
17
+ dataset_size=None,iupac_name_col="iupac"):
18
+ self.dataset_dir = dataset_dir
19
+ self.tokenizer = tokenizer
20
+ self.target_col = target_col
21
+ self.max_length = max_length
22
+ self.dataset_size = dataset_size
23
+ self.dataset_filename = dataset_filename
24
+
25
+ # where the data is
26
+ self.dataset_fn = os.path.join(self.dataset_dir,self.dataset_filename)
27
+
28
+ # a bit of an odd way to read in a data file, but it lets
29
+ # us keep the data in csv format, and it's pretty fast
30
+ # (30s for 17G on my machine).
31
+ # we need to use mmap for data-parallel training with
32
+ # multiple processes so that the processes don't each keep
33
+ # a local copy of the dataset in host memory
34
+ line_offsets = []
35
+ # each element of data_mm is a character in the dataset file
36
+ self.data_mm = np.memmap(self.dataset_fn, dtype=np.uint8, mode="r")
37
+
38
+ # process chunksize bytes at a time
39
+ chunksize = int(1e9)
40
+ for i in range(0, len(self.data_mm), chunksize):
41
+ chunk = self.data_mm[i:i + chunksize]
42
+ # the index of each newline is the character before
43
+ # the beginning of the next line
44
+ newlines = np.nonzero(chunk == 0x0a)[0]
45
+ line_offsets.append(i + newlines + 1)
46
+ if self.dataset_size is not None and i > self.dataset_size:
47
+ # don't need to keep loading data
48
+ break
49
+ # line_offsets indicates the beginning of each line in self.dataset_fn
50
+ self.line_offsets = np.hstack(line_offsets)
51
+
52
+ if (self.dataset_size is not None
53
+ and self.dataset_size > self.line_offsets.shape[0]):
54
+ msg = "specified dataset_size {}, but the dataset only has {} items"
55
+ raise ValueError(msg.format(self.dataset_size,
56
+ self.line_offsets.shape[0]))
57
+
58
+ # extract headers
59
+ header_line = bytes(self.data_mm[0:self.line_offsets[0]])
60
+ headers = header_line.decode("utf8").strip().split("|")
61
+
62
+ # figure out which column IDs are of interest
63
+ try:
64
+ self.name_col_id = headers.index(iupac_name_col)
65
+ except ValueError as e:
66
+ raise RuntimeError("Expecting a column called '{}' "
67
+ "that contains IUPAC names".format(iupac_name_col))
68
+ self.target_col_id = None
69
+ if self.target_col is not None:
70
+ try:
71
+ self.target_col_id = headers.index(self.target_col)
72
+ except ValueError as e:
73
+ raise RuntimeError("User supplied target col " + target_col + \
74
+ "but column is not present in data file")
75
+
76
+ def __getitem__(self, idx):
77
+ # model_inputs is a dict with keys
78
+ # input_ids, target
79
+
80
+ if self.dataset_size is not None and idx > self.dataset_size:
81
+ msg = "provided index {} is larger than dataset size {}"
82
+ raise IndexError(msg.format(idx, self.dataset_size))
83
+
84
+ start = self.line_offsets[idx]
85
+ end = self.line_offsets[idx + 1]
86
+ line = bytes(self.data_mm[start:end])
87
+ line = line.decode("utf8").strip().split("|")
88
+ name = line[self.name_col_id]
89
+
90
+ # get the target value, if needed
91
+ target = None
92
+ if self.target_col_id is not None:
93
+ target = line[self.target_col_id]
94
+ if self.target_col == "Log P" and len(target) == 0:
95
+ target = 3.16 # average of training data
96
+ else:
97
+ target = float(target)
98
+
99
+ tokenized = self.tokenizer(name)  # after this, the tokenizer.eos_token_id has been added automatically
100
+ input_ids = torch.tensor(tokenized["input_ids"])
101
+
102
+ iupac_unk = torch.tensor([self.tokenizer._convert_token_to_id(self.tokenizer.unk_token)])
103
+ input_ids = torch.tensor(input_ids)
104
+ input_ids = torch.cat([iupac_unk,input_ids])
105
+
106
+ return_dict = {}
107
+ return_dict["input_ids"] = input_ids #np.array(tokenized["input_ids"])
108
+ return_dict["labels"] = input_ids
109
+ #return_dict["property"] = torch.tensor(np.array(target))
110
+
111
+ if self.max_length is not None:
112
+ return_dict["input_ids"] = return_dict["input_ids"][:self.max_length]
113
+ return_dict["labels"] = return_dict["labels"][:self.max_length]
114
+
115
+ return return_dict
116
+
117
+ def __len__(self):
118
+ if self.dataset_size is None:
119
+ return len(self.line_offsets) - 1
120
+ else:
121
+ return self.dataset_size
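`IUPACDataset` memory-maps a `|`-separated file whose header contains an `iupac` column, prepends the tokenizer's `unk_token` as a start-of-sequence marker, and copies `input_ids` into `labels` for causal language modeling. In this package the wiring is normally done through the `get_data_loader` helpers re-exported in `__init__.py`; the sketch below uses the dataset directly, with a hypothetical tokenizer object and an ad-hoc padding collate:

```python
# Sketch: `iupac_tokenizer` is a hypothetical HuggingFace-style tokenizer, and iupac.txt is
# assumed to be "|"-separated with an "iupac" header column, as IUPACDataset expects.
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from iupac_gpt.iupac_dataset import IUPACDataset

dataset = IUPACDataset(dataset_dir="iupac-gpt", dataset_filename="iupac.txt",
                       tokenizer=iupac_tokenizer, max_length=1280)

def collate(batch):
    # pad to the longest sequence in the batch; -100 labels are ignored by the GPT-2 LM loss
    input_ids = pad_sequence([x["input_ids"] for x in batch], batch_first=True, padding_value=0)
    labels = pad_sequence([x["labels"] for x in batch], batch_first=True, padding_value=-100)
    return {"input_ids": input_ids, "labels": labels}

loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate)
print(next(iter(loader))["input_ids"].shape)
```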
iupac-gpt/iupac_gpt/iupac_dataset_class.py ADDED
@@ -0,0 +1,128 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import random
5
+ from itertools import chain
6
+ from collections import Counter
7
+ import numpy as np
8
+ import torch
9
+ from torch.nn.utils.rnn import pad_sequence
10
+ from transformers.data.data_collator import DataCollator
11
+ from multiprocessing import Pool
12
+ import mmap
13
+ from torch.utils.data import Dataset
14
+
15
+ class IUPACDataset(Dataset):
16
+ def __init__(self, dataset_dir='./',dataset_filename="iupacs_logp.txt", tokenizer=None,max_length=None,target_col=None,
17
+ dataset_size=None,iupac_name_col="iupac"):
18
+ self.dataset_dir = dataset_dir
19
+ self.tokenizer = tokenizer
20
+ self.target_col = target_col
21
+ self.max_length = max_length
22
+ self.dataset_size = dataset_size
23
+ self.dataset_filename = dataset_filename
24
+
25
+ # where the data is
26
+ self.dataset_fn = os.path.join(self.dataset_dir,self.dataset_filename)
27
+
28
+ # a bit of an odd way to read in a data file, but it lets
29
+ # us keep the data in csv format, and it's pretty fast
30
+ # (30s for 17G on my machine).
31
+ # we need to use mmap for data-parallel training with
32
+ # multiple processes so that the processes don't each keep
33
+ # a local copy of the dataset in host memory
34
+ line_offsets = []
35
+ # each element of data_mm is a character in the dataset file
36
+ self.data_mm = np.memmap(self.dataset_fn, dtype=np.uint8, mode="r")
37
+
38
+ # process chunksize bytes at a time
39
+ chunksize = int(1e9)
40
+ for i in range(0, len(self.data_mm), chunksize):
41
+ chunk = self.data_mm[i:i + chunksize]
42
+ # the index of each newline is the character before
43
+ # the beginning of the next line
44
+ newlines = np.nonzero(chunk == 0x0a)[0]
45
+ line_offsets.append(i + newlines + 1)
46
+ if self.dataset_size is not None and i > self.dataset_size:
47
+ # don't need to keep loading data
48
+ break
49
+ # line_offsets indicates the beginning of each line in self.dataset_fn
50
+ self.line_offsets = np.hstack(line_offsets)
51
+
52
+ if (self.dataset_size is not None
53
+ and self.dataset_size > self.line_offsets.shape[0]):
54
+ msg = "specified dataset_size {}, but the dataset only has {} items"
55
+ raise ValueError(msg.format(self.dataset_size,
56
+ self.line_offsets.shape[0]))
57
+
58
+ # extract headers
59
+ header_line = bytes(self.data_mm[0:self.line_offsets[0]])
60
+ headers = header_line.decode("utf8").strip().split("|")
61
+
62
+ # figure out which column IDs are of interest
63
+ try:
64
+ self.name_col_id = headers.index(iupac_name_col)
65
+ except ValueError as e:
66
+ raise RuntimeError("Expecting a column called '{}' "
67
+ "that contains IUPAC names".format(iupac_name_col))
68
+ self.target_col_id = None
69
+ if self.target_col is not None:
70
+ try:
71
+ self.target_col_id = headers.index(self.target_col)
72
+ except ValueError as e:
73
+ raise RuntimeError("User supplied target col " + target_col + \
74
+ "but column is not present in data file")
75
+
76
+ def __getitem__(self, idx):
77
+ # model_inputs is a dict with keys
78
+ # input_ids, target
79
+
80
+ if self.dataset_size is not None and idx > self.dataset_size:
81
+ msg = "provided index {} is larger than dataset size {}"
82
+ raise IndexError(msg.format(idx, self.dataset_size))
83
+
84
+ start = self.line_offsets[idx]
85
+ end = self.line_offsets[idx + 1]
86
+ line = bytes(self.data_mm[start:end])
87
+ line = line.decode("utf8").strip().split("|")
88
+ name = line[self.name_col_id]
89
+
90
+ # get the target value, if needed
91
+ target = None
92
+ if self.target_col_id is not None:
93
+ target = line[self.target_col_id]
94
+ if self.target_col == "Log P" and len(target) == 0:
95
+ target = 3.16 # average of training data
96
+ else:
97
+ target = float(target)
98
+
99
+ if target>3.16:
100
+ target = 1
101
+ else:
102
+ target=0
103
+
104
+ tokenized = self.tokenizer(name)  # after this, the tokenizer.eos_token_id has been added automatically
105
+ input_ids = torch.tensor(tokenized["input_ids"])
106
+
107
+ iupac_unk = torch.tensor([self.tokenizer._convert_token_to_id(self.tokenizer.unk_token)])
108
+ input_ids = torch.tensor(input_ids)
109
+ input_ids = torch.cat([iupac_unk,input_ids])
110
+
111
+ attention_mask = torch.ones(input_ids.numel(), dtype=int)
112
+
113
+ return_dict = {}
114
+ return_dict["input_ids"] = input_ids
115
+ return_dict["labels"] = torch.tensor(np.array(target))
116
+ return_dict["attention_mask"] = attention_mask
117
+
118
+ if self.max_length is not None:
119
+ return_dict["input_ids"] = return_dict["input_ids"][:self.max_length]
120
+ return_dict["attention_mask"] = return_dict["attention_mask"][:self.max_length]
121
+
122
+ return return_dict
123
+
124
+ def __len__(self):
125
+ if self.dataset_size is None:
126
+ return len(self.line_offsets) - 1
127
+ else:
128
+ return self.dataset_size
iupac-gpt/iupac_gpt/iupac_dataset_pro.py ADDED
@@ -0,0 +1,124 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import random
5
+ from itertools import chain
6
+ from collections import Counter
7
+ import numpy as np
8
+ import torch
9
+ from torch.nn.utils.rnn import pad_sequence
10
+ from transformers.data.data_collator import DataCollator
11
+ from multiprocessing import Pool
12
+ import mmap
13
+ from torch.utils.data import Dataset
14
+
15
+ class IUPACDataset(Dataset):
16
+ def __init__(self, dataset_dir='./',dataset_filename="iupacs_logp.txt", tokenizer=None,max_length=None,target_col=None,
17
+ dataset_size=None,iupac_name_col="iupac"):
18
+ self.dataset_dir = dataset_dir
19
+ self.tokenizer = tokenizer
20
+ self.target_col = target_col
21
+ self.max_length = max_length
22
+ self.dataset_size = dataset_size
23
+ self.dataset_filename = dataset_filename
24
+
25
+ # where the data is
26
+ self.dataset_fn = os.path.join(self.dataset_dir,self.dataset_filename)
27
+
28
+ # a bit of an odd way to read in a data file, but it lets
29
+ # us keep the data in csv format, and it's pretty fast
30
+ # (30s for 17G on my machine).
31
+ # we need to use mmap for data-parallel training with
32
+ # multiple processes so that the processes don't each keep
33
+ # a local copy of the dataset in host memory
34
+ line_offsets = []
35
+ # each element of data_mm is a character in the dataset file
36
+ self.data_mm = np.memmap(self.dataset_fn, dtype=np.uint8, mode="r")
37
+
38
+ # process chunksize bytes at a time
39
+ chunksize = int(1e9)
40
+ for i in range(0, len(self.data_mm), chunksize):
41
+ chunk = self.data_mm[i:i + chunksize]
42
+ # the index of each newline is the character before
43
+ # the beginning of the next line
44
+ newlines = np.nonzero(chunk == 0x0a)[0]
45
+ line_offsets.append(i + newlines + 1)
46
+ if self.dataset_size is not None and i > self.dataset_size:
47
+ # don't need to keep loading data
48
+ break
49
+ # line_offsets indicates the beginning of each line in self.dataset_fn
50
+ self.line_offsets = np.hstack(line_offsets)
51
+
52
+ if (self.dataset_size is not None
53
+ and self.dataset_size > self.line_offsets.shape[0]):
54
+ msg = "specified dataset_size {}, but the dataset only has {} items"
55
+ raise ValueError(msg.format(self.dataset_size,
56
+ self.line_offsets.shape[0]))
57
+
58
+ # extract headers
59
+ header_line = bytes(self.data_mm[0:self.line_offsets[0]])
60
+ headers = header_line.decode("utf8").strip().split("|")
61
+
62
+ # figure out which column IDs are of interest
63
+ try:
64
+ self.name_col_id = headers.index(iupac_name_col)
65
+ except ValueError as e:
66
+ raise RuntimeError("Expecting a column called '{}' "
67
+ "that contains IUPAC names".format(iupac_name_col))
68
+ self.target_col_id = None
69
+ if self.target_col is not None:
70
+ try:
71
+ self.target_col_id = headers.index(self.target_col)
72
+ except ValueError as e:
73
+ raise RuntimeError("User supplied target col " + target_col + \
74
+ "but column is not present in data file")
75
+
76
+ def __getitem__(self, idx):
77
+ # model_inputs is a dict with keys
78
+ # input_ids, target
79
+
80
+ if self.dataset_size is not None and idx > self.dataset_size:
81
+ msg = "provided index {} is larger than dataset size {}"
82
+ raise IndexError(msg.format(idx, self.dataset_size))
83
+
84
+ start = self.line_offsets[idx]
85
+ end = self.line_offsets[idx + 1]
86
+ line = bytes(self.data_mm[start:end])
87
+ line = line.decode("utf8").strip().split("|")
88
+ name = line[self.name_col_id]
89
+
90
+ # get the target value, if needed
91
+ target = None
92
+ if self.target_col_id is not None:
93
+ target = line[self.target_col_id]
94
+ if self.target_col == "Log P" and len(target) == 0:
95
+ target = 3.16 # average of training data
96
+ else:
97
+ target = float(target)
98
+
99
+
100
+ tokenized = self.tokenizer(name)  # after this, the tokenizer.eos_token_id has been added automatically
101
+ input_ids = torch.tensor(tokenized["input_ids"])
102
+
103
+ iupac_unk = torch.tensor([self.tokenizer._convert_token_to_id(self.tokenizer.unk_token)])
104
+ input_ids = torch.tensor(input_ids)
105
+ input_ids = torch.cat([iupac_unk,input_ids])
106
+
107
+ attention_mask = torch.ones(input_ids.numel(), dtype=int)
108
+
109
+ return_dict = {}
110
+ return_dict["input_ids"] = input_ids
111
+ return_dict["labels"] = torch.tensor(np.array(target))
112
+ return_dict["attention_mask"] = attention_mask
113
+
114
+ if self.max_length is not None:
115
+ return_dict["input_ids"] = return_dict["input_ids"][:self.max_length]
116
+ return_dict["attention_mask"] = return_dict["attention_mask"][:self.max_length]
117
+
118
+ return return_dict
119
+
120
+ def __len__(self):
121
+ if self.dataset_size is None:
122
+ return len(self.line_offsets) - 1
123
+ else:
124
+ return self.dataset_size
iupac-gpt/iupac_gpt/iupac_spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb18836fd01a60e6cf61ad64e7e6556ac1f676d3ca39a16f375d54e8a8fb4e60
+ size 275487
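The `.model`/`.vocab` pair added here looks like a standard SentencePiece model over IUPAC-name fragments (the vocabulary below lists pieces such as `phenyl`, `carboxamide`, and `pyridin`). A small inspection sketch, assuming the `sentencepiece` package (not pinned in `environment.yml`) and ordinary SentencePiece file semantics:

```python
# Sketch: assumes iupac_spm.model is a regular SentencePiece model file.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="iupac-gpt/iupac_gpt/iupac_spm.model")
print(sp.get_piece_size())                                 # expected: 1391, matching the .vocab
print(sp.encode("2-acetyloxybenzoic acid", out_type=str))  # IUPAC name of aspirin
print(sp.encode("2-acetyloxybenzoic acid", out_type=int))
```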
iupac-gpt/iupac_gpt/iupac_spm.vocab ADDED
@@ -0,0 +1,1391 @@
1
+ <pad> 0
2
+ </s> 0
3
+ <unk> 0
4
+ 0 0
5
+ 1 0
6
+ 2 0
7
+ 3 0
8
+ 4 0
9
+ 5 0
10
+ 6 0
11
+ 7 0
12
+ 8 0
13
+ 9 0
14
+ 10 0
15
+ 11 0
16
+ 12 0
17
+ 13 0
18
+ 14 0
19
+ 15 0
20
+ 16 0
21
+ 17 0
22
+ 18 0
23
+ 19 0
24
+ 20 0
25
+ 21 0
26
+ 22 0
27
+ 23 0
28
+ 24 0
29
+ 25 0
30
+ 26 0
31
+ 27 0
32
+ 28 0
33
+ 29 0
34
+ 30 0
35
+ 31 0
36
+ 32 0
37
+ 33 0
38
+ 34 0
39
+ 35 0
40
+ 36 0
41
+ 37 0
42
+ 38 0
43
+ 39 0
44
+ 40 0
45
+ 41 0
46
+ 42 0
47
+ 43 0
48
+ 44 0
49
+ 45 0
50
+ 46 0
51
+ 47 0
52
+ 48 0
53
+ 49 0
54
+ 50 0
55
+ 51 0
56
+ 52 0
57
+ 53 0
58
+ 54 0
59
+ 55 0
60
+ 56 0
61
+ 57 0
62
+ 58 0
63
+ 59 0
64
+ 60 0
65
+ 61 0
66
+ 62 0
67
+ 63 0
68
+ 64 0
69
+ 65 0
70
+ 66 0
71
+ 67 0
72
+ 68 0
73
+ 69 0
74
+ 70 0
75
+ 71 0
76
+ 72 0
77
+ 73 0
78
+ 74 0
79
+ 75 0
80
+ 76 0
81
+ 77 0
82
+ 78 0
83
+ 79 0
84
+ 80 0
85
+ 81 0
86
+ 82 0
87
+ 83 0
88
+ 84 0
89
+ 85 0
90
+ 86 0
91
+ 87 0
92
+ 88 0
93
+ 89 0
94
+ 90 0
95
+ 91 0
96
+ 92 0
97
+ 93 0
98
+ 94 0
99
+ 95 0
100
+ 96 0
101
+ 97 0
102
+ 98 0
103
+ 99 0
104
+ ; 0
105
+ . 0
106
+ .0 0
107
+ ' 0
108
+ R 0
109
+ S 0
110
+ H 0
111
+ N 0
112
+ E 0
113
+ Z 0
114
+ aR 0
115
+ aS 0
116
+ bR 0
117
+ bS 0
118
+ cR 0
119
+ cS 0
120
+ dR 0
121
+ dS 0
122
+ aH 0
123
+ bH 0
124
+ cH 0
125
+ aE 0
126
+ aZ 0
127
+ a, 0
128
+ a- 0
129
+ b, 0
130
+ b- 0
131
+ c, 0
132
+ c- 0
133
+ d, 0
134
+ d- 0
135
+ a] 0
136
+ b] 0
137
+ c] 0
138
+ d] 0
139
+ e] 0
140
+ f] 0
141
+ g] 0
142
+ h] 0
143
+ i] 0
144
+ j] 0
145
+ k] 0
146
+ l] 0
147
+ m] 0
148
+ <high> 0
149
+ <med> 0
150
+ <low> 0
151
+ - 0
152
+ yl 0
153
+ , 0
154
+ ) 0
155
+ ( 0
156
+ ] 0
157
+ [ 0
158
+ meth 0
159
+ phenyl 0
160
+ di 0
161
+ an 0
162
+ eth 0
163
+ oxy 0
164
+ prop 0
165
+ e 0
166
+ amino 0
167
+ oxo 0
168
+ fluoro 0
169
+ cyclo 0
170
+ o 0
171
+ amide 0
172
+ tri 0
173
+ chloro 0
174
+ but 0
175
+ hydroxy 0
176
+ a 0
177
+ one 0
178
+ pyridin 0
179
+ hydro 0
180
+ benzo 0
181
+ acet 0
182
+ l 0
183
+ en 0
184
+ ol 0
185
+ amine 0
186
+ ylamin 0
187
+ oxa 0
188
+ oyl 0
189
+ carboxamide 0
190
+ benz 0
191
+ piperidin 0
192
+ thia 0
193
+ ate 0
194
+ sulf 0
195
+ bromo 0
196
+ ylidene 0
197
+ pyrimidin 0
198
+ tetra 0
199
+ ic_acid 0
200
+ penta 0
201
+ pyrrolidin 0
202
+ sulfonyl 0
203
+ hexa 0
204
+ hex 0
205
+ ane 0
206
+ pyrazol 0
207
+ phenoxy 0
208
+ carbonyl 0
209
+ thiophen 0
210
+ aza 0
211
+ piperazin 0
212
+ azo 0
213
+ carboxylate 0
214
+ imidazol 0
215
+ furan 0
216
+ nitro 0
217
+ carbam 0
218
+ anilino 0
219
+ pent 0
220
+ d 0
221
+ tert- 0
222
+ benzen 0
223
+ indol 0
224
+ sulfon 0
225
+ carboxylic_acid 0
226
+ diazo 0
227
+ az 0
228
+ ene 0
229
+ quinolin 0
230
+ naphthalen 0
231
+ morpholin 0
232
+ ium 0
233
+ cyano 0
234
+ bi 0
235
+ bis 0
236
+ hepta 0
237
+ pyrrol 0
238
+ spiro 0
239
+ r 0
240
+ ole 0
241
+ azin 0
242
+ hydrochloride 0
243
+ urea 0
244
+ yn 0
245
+ azido 0
246
+ carbamate 0
247
+ pyrrolo 0
248
+ it 0
249
+ imidazo 0
250
+ pyrazin 0
251
+ guanidin 0
252
+ thio 0
253
+ pyrazolo 0
254
+ iodo 0
255
+ imino 0
256
+ sulfam 0
257
+ carbon 0
258
+ olidin 0
259
+ epin 0
260
+ isoquinolin 0
261
+ deca 0
262
+ anilin 0
263
+ quinazolin 0
264
+ nitrile 0
265
+ hydrazin 0
266
+ epan 0
267
+ pyridazin 0
268
+ chromen 0
269
+ octa 0
270
+ octan 0
271
+ thieno 0
272
+ in 0
273
+ amido 0
274
+ hept 0
275
+ thiol 0
276
+ hydroiodide 0
277
+ imid 0
278
+ isoindol 0
279
+ nona 0
280
+ pyrido 0
281
+ inden 0
282
+ carbazol 0
283
+ ox 0
284
+ dodeca 0
285
+ etidin 0
286
+ oct 0
287
+ phenol 0
288
+ imidazolidin 0
289
+ sil 0
290
+ carboxy 0
291
+ imido 0
292
+ phosphor 0
293
+ purin 0
294
+ phospha 0
295
+ fluoren 0
296
+ carbox 0
297
+ indazol 0
298
+ undeca 0
299
+ furo 0
300
+ tetradeca 0
301
+ cyclopenta[a]phenanthren 0
302
+ form 0
303
+ quinoxalin 0
304
+ trideca 0
305
+ hexadeca 0
306
+ imine 0
307
+ sulfinyl 0
308
+ octadeca 0
309
+ carba 0
310
+ dec 0
311
+ adamant 0
312
+ chloride 0
313
+ sila 0
314
+ icos 0
315
+ ine 0
316
+ ide 0
317
+ naphthyridin 0
318
+ heptadeca 0
319
+ thione 0
320
+ anthracen 0
321
+ dodec 0
322
+ oxir 0
323
+ pyran 0
324
+ hydrogen 0
325
+ pentadeca 0
326
+ oxido 0
327
+ carbo 0
328
+ henicos 0
329
+ deuterio 0
330
+ docos 0
331
+ non 0
332
+ id 0
333
+ tert-butyl(dimethyl)silyl 0
334
+ carbamic_acid 0
335
+ pyrano 0
336
+ nonadeca 0
337
+ tris 0
338
+ but-2-eno 0
339
+ ic 0
340
+ at 0
341
+ phosphate 0
342
+ hydrazide 0
343
+ aceton 0
344
+ octadec 0
345
+ sulfo 0
346
+ thiomorpholin 0
347
+ pyrimido 0
348
+ oxamide 0
349
+ carbonimidoyl 0
350
+ oxet 0
351
+ inan 0
352
+ sodium 0
353
+ al 0
354
+ (2+) 0
355
+ oxide 0
356
+ phthalazin 0
357
+ benzal 0
358
+ carbohydrazide 0
359
+ bora 0
360
+ benzhydr 0
361
+ tetracos 0
362
+ bor 0
363
+ hexadec 0
364
+ ioda 0
365
+ azonia 0
366
+ isocyano 0
367
+ acridin 0
368
+ hydroxylamin 0
369
+ formamide 0
370
+ phenanthren 0
371
+ ul 0
372
+ indeno 0
373
+ xanthen 0
374
+ nitroso 0
375
+ tetradec 0
376
+ phosphin 0
377
+ olan 0
378
+ peroxy 0
379
+ phosphono 0
380
+ tetr 0
381
+ pyrazolidin 0
382
+ dicarbon 0
383
+ olate 0
384
+ tricos 0
385
+ hexacos 0
386
+ indolo 0
387
+ indolizin 0
388
+ phosphon 0
389
+ undec 0
390
+ chromeno 0
391
+ pentacos 0
392
+ pyrazino 0
393
+ thi 0
394
+ hydrate 0
395
+ bromide 0
396
+ uid 0
397
+ boronic_acid 0
398
+ trityl 0
399
+ cen 0
400
+ sulfate 0
401
+ isochromen 0
402
+ octacos 0
403
+ isocyanato 0
404
+ acetal 0
405
+ azide 0
406
+ dimethylacetamide 0
407
+ tetrakis 0
408
+ iridin 0
409
+ nonadec 0
410
+ naphtho 0
411
+ heptadec 0
412
+ pyren 0
413
+ heptacos 0
414
+ carbamimidamido 0
415
+ sulfinam 0
416
+ oxid 0
417
+ iodide 0
418
+ etheno 0
419
+ disulfon 0
420
+ potassium 0
421
+ chrysen 0
422
+ yne 0
423
+ phosphino 0
424
+ carboximidoyl 0
425
+ quinolizin 0
426
+ tert-butyl(diphenyl)silyl 0
427
+ formamid 0
428
+ thiochromen 0
429
+ porphyrin 0
430
+ dicyan 0
431
+ triacont 0
432
+ pteridin 0
433
+ (3+) 0
434
+ sulfin 0
435
+ ar 0
436
+ pentadec 0
437
+ io 0
438
+ phenothiazin 0
439
+ undecyl 0
440
+ oxal 0
441
+ phospho 0
442
+ borin 0
443
+ uide 0
444
+ uranium 0
445
+ picen 0
446
+ hydrobromide 0
447
+ cinnolin 0
448
+ isoindolo 0
449
+ phthal 0
450
+ phenac 0
451
+ phenanthridin 0
452
+ azino 0
453
+ tridec 0
454
+ zirconium 0
455
+ len 0
456
+ phenanthrolin 0
457
+ platinum 0
458
+ phenolate 0
459
+ sulfonato 0
460
+ oxybenzon 0
461
+ zinc 0
462
+ chlora 0
463
+ hydroperoxy 0
464
+ yttrium 0
465
+ pyrrolizin 0
466
+ carbothioyl 0
467
+ sel 0
468
+ iron 0
469
+ spirobi 0
470
+ copper 0
471
+ triphenylen 0
472
+ titanium 0
473
+ perox 0
474
+ nonacos 0
475
+ (1+) 0
476
+ tridecyl 0
477
+ lithium 0
478
+ tetrol 0
479
+ (4+) 0
480
+ carboxylato 0
481
+ thiopyran 0
482
+ pentacont 0
483
+ etan 0
484
+ iridium 0
485
+ thioxanthen 0
486
+ nickel 0
487
+ phenoxazin 0
488
+ hexatriacont 0
489
+ azulen 0
490
+ tetracont 0
491
+ tritriacont 0
492
+ azon 0
493
+ carbono 0
494
+ sulfino 0
495
+ dotriacont 0
496
+ stann 0
497
+ nitrate 0
498
+ broma 0
499
+ on 0
500
+ et 0
501
+ acetylen 0
502
+ fluoride 0
503
+ isothiocyanato 0
504
+ magnesium 0
505
+ cobalt 0
506
+ acenaphthylen 0
507
+ sulfamate 0
508
+ ruthenium 0
509
+ aldehyde 0
510
+ phosphite 0
511
+ nonafl 0
512
+ palladium 0
513
+ pentadecyl 0
514
+ purino 0
515
+ tetratriacont 0
516
+ epoxy 0
517
+ aluma 0
518
+ phenanthro 0
519
+ phenazin 0
520
+ fluoranthen 0
521
+ sulfinato 0
522
+ ocin 0
523
+ hentriacont 0
524
+ azanida 0
525
+ stanna 0
526
+ toluen 0
527
+ ylidyne 0
528
+ thiopyrano 0
529
+ perchlorate 0
530
+ calcium 0
531
+ mono 0
532
+ tungsten 0
533
+ sulfur 0
534
+ cyanamide 0
535
+ tricarbon 0
536
+ chlorid 0
537
+ dehydro 0
538
+ pyridazino 0
539
+ sulfido 0
540
+ irin 0
541
+ phosph 0
542
+ iran 0
543
+ thiocyanate 0
544
+ hypoiodite 0
545
+ ylium 0
546
+ imidazolo 0
547
+ octatriacont 0
548
+ dimethylurea 0
549
+ heptadecyl 0
550
+ tritio 0
551
+ hydrazono 0
552
+ selena 0
553
+ cyanide 0
554
+ dotetracont 0
555
+ isoquinolino 0
556
+ diazonium 0
557
+ pentatriacont 0
558
+ hydroxide 0
559
+ manganese 0
560
+ chromium 0
561
+ pentakis 0
562
+ hypofluorite 0
563
+ tin 0
564
+ sulfono 0
565
+ phosphoroso 0
566
+ vanadium 0
567
+ boranuida 0
568
+ ecin 0
569
+ hexakis 0
570
+ s-indacen 0
571
+ os 0
572
+ fluoreno 0
573
+ mercury 0
574
+ sulfamic_acid 0
575
+ thiochromeno 0
576
+ phenalen 0
577
+ rhodium 0
578
+ amid 0
579
+ sulfite 0
580
+ ocan 0
581
+ phosphonato 0
582
+ heptatriacont 0
583
+ nonatriacont 0
584
+ borono 0
585
+ silver 0
586
+ gold 0
587
+ isothiochromen 0
588
+ nitron 0
589
+ hafnium 0
590
+ hexacont 0
591
+ (2-) 0
592
+ hypochlorite 0
593
+ arsa 0
594
+ diphosphat 0
595
+ molybdenum 0
596
+ thallium 0
597
+ nonadecyl 0
598
+ fluora 0
599
+ nonatetracont 0
600
+ rhenium 0
601
+ tetracarbon 0
602
+ perylen 0
603
+ diphosphon 0
604
+ cyanate 0
605
+ oxygen 0
606
+ germ 0
607
+ nitramide 0
608
+ tell 0
609
+ aluminum 0
610
+ azuleno 0
611
+ quinolino 0
612
+ iod 0
613
+ actinium 0
614
+ terephthal 0
615
+ ecan 0
616
+ trithion 0
617
+ barium 0
618
+ hentetracont 0
619
+ dithion 0
620
+ phosphat 0
621
+ selenophen 0
622
+ xylen 0
623
+ germa 0
624
+ hen 0
625
+ perimidin 0
626
+ nitric_acid 0
627
+ rubidium 0
628
+ octatetracont 0
629
+ but-1-eno 0
630
+ nitramido 0
631
+ heptakis 0
632
+ thiocyanat 0
633
+ dibor 0
634
+ nitrous 0
635
+ hydrazon 0
636
+ thianthren 0
637
+ dili 0
638
+ hydride 0
639
+ oxonio 0
640
+ tetratetracont 0
641
+ isochromeno 0
642
+ dihydropter 0
643
+ indolizino 0
644
+ osmium 0
645
+ phosphonia 0
646
+ oxanthren 0
647
+ diazano 0
648
+ do 0
649
+ cyanato 0
650
+ diacetamid 0
651
+ oxam 0
652
+ silicate 0
653
+ cadmium 0
654
+ hydrofluoride 0
655
+ hexatetracont 0
656
+ boron 0
657
+ phosphindol 0
658
+ phenoxathiin 0
659
+ phosphonous_acid 0
660
+ octakis 0
661
+ bismuth 0
662
+ chromenylium 0
663
+ corrin 0
664
+ pyrylium 0
665
+ thion 0
666
+ cinnam 0
667
+ tritetracont 0
668
+ nitrite 0
669
+ gadolinium 0
670
+ diazonio 0
671
+ antimony 0
672
+ oxalo 0
673
+ onic_acid 0
674
+ biphenylen 0
675
+ sulfonio 0
676
+ cesium 0
677
+ oxonium 0
678
+ stiba 0
679
+ styren 0
680
+ heptacont 0
681
+ selenol 0
682
+ chloroform 0
683
+ diselen 0
684
+ onin 0
685
+ oxaldehyd 0
686
+ cerium 0
687
+ technetium 0
688
+ (1-) 0
689
+ lead 0
690
+ ite 0
691
+ acenaphthyleno 0
692
+ dicarboximid 0
693
+ oxonia 0
694
+ strontium 0
695
+ (5+) 0
696
+ iodid 0
697
+ lanthanum 0
698
+ rutherfordium 0
699
+ perchloric_acid 0
700
+ iren 0
701
+ tricosyl 0
702
+ hypobromite 0
703
+ europium 0
704
+ isocyanate 0
705
+ ido 0
706
+ iodosyl 0
707
+ nitrilium 0
708
+ neodymium 0
709
+ peroxide 0
710
+ pentatetracont 0
711
+ phenylen 0
712
+ tantalum 0
713
+ hect 0
714
+ buta-1,3-dieno 0
715
+ samarium 0
716
+ galla 0
717
+ methylal 0
718
+ fluorid 0
719
+ praseodymium 0
720
+ ytterbium 0
721
+ dimethoxyethane 0
722
+ scandium 0
723
+ seleno 0
724
+ dimethoxyethan 0
725
+ octacont 0
726
+ cub 0
727
+ gallium 0
728
+ diphosphate 0
729
+ pentacosyl 0
730
+ thalla 0
731
+ ous_acid 0
732
+ selenoate 0
733
+ arson 0
734
+ niobium 0
735
+ alumina 0
736
+ anisol 0
737
+ beryllium 0
738
+ thioph 0
739
+ heptatetracont 0
740
+ onan 0
741
+ tellura 0
742
+ quinoxalino 0
743
+ indiga 0
744
+ heptacosyl 0
745
+ isothiocyanate 0
746
+ inin 0
747
+ diphospho 0
748
+ thionia 0
749
+ selenido 0
750
+ nonacosyl 0
751
+ terbium 0
752
+ (6+) 0
753
+ indig 0
754
+ dysprosium 0
755
+ quinazolino 0
756
+ iodyl 0
757
+ indium 0
758
+ hexatriacontyl 0
759
+ thiopyr 0
760
+ triphosphon 0
761
+ thorium 0
762
+ carbohydrazonoyl 0
763
+ as-indacen 0
764
+ fluoroform 0
765
+ erbium 0
766
+ phosphindolo 0
767
+ lutetium 0
768
+ selenopheno 0
769
+ arsin 0
770
+ arsor 0
771
+ iodat 0
772
+ silanuida 0
773
+ plumba 0
774
+ plumb 0
775
+ borano 0
776
+ sulfonium 0
777
+ tellurophen 0
778
+ indazolo 0
779
+ nitroxyl 0
780
+ nitrogen 0
781
+ anthra 0
782
+ isophosphindol 0
783
+ disulfid 0
784
+ nonacont 0
785
+ selone 0
786
+ iodonio 0
787
+ onate 0
788
+ trili 0
789
+ iodine 0
790
+ seleninyl 0
791
+ phenoxaphosphinin 0
792
+ phen 0
793
+ thulium 0
794
+ chloryl 0
795
+ phosphinimyl 0
796
+ cyanic_acid 0
797
+ acridophosphin 0
798
+ tetrali 0
799
+ cumen 0
800
+ holmium 0
801
+ selenopyran 0
802
+ dibenzamid 0
803
+ nitrous_acid 0
804
+ phthalal 0
805
+ selenocyanate 0
806
+ argon 0
807
+ iodate 0
808
+ isothiochromeno 0
809
+ mercurio 0
810
+ sulfide 0
811
+ bromid 0
812
+ iodonia 0
813
+ disulfate 0
814
+ fluorine 0
815
+ aceanthrylen 0
816
+ coronen 0
817
+ phenoxid 0
818
+ hydrazonic 0
819
+ telluro 0
820
+ silicon 0
821
+ chloronio 0
822
+ hypochlorous_acid 0
823
+ dodecakis 0
824
+ hydroseleno 0
825
+ phosphinolin 0
826
+ inda 0
827
+ phenaleno 0
828
+ phenylene 0
829
+ arsenic 0
830
+ chlorosyl 0
831
+ perchloryl 0
832
+ chlorate 0
833
+ bism 0
834
+ onat 0
835
+ terephthalal 0
836
+ 7,8-dihydropter 0
837
+ silano 0
838
+ boranthren 0
839
+ fermium 0
840
+ phosphano 0
841
+ arsoroso 0
842
+ hydrido 0
843
+ alum 0
844
+ selenium 0
845
+ pol 0
846
+ nonakis 0
847
+ stibo 0
848
+ phospheno 0
849
+ astatine 0
850
+ phosphanida 0
851
+ phenophosphazinin 0
852
+ stibor 0
853
+ sulfenat 0
854
+ silanida 0
855
+ pyranthren 0
856
+ arsono 0
857
+ decakis 0
858
+ oxaldehyde 0
859
+ cyanid 0
860
+ neptunium 0
861
+ diphosphor 0
862
+ bromate 0
863
+ selenate 0
864
+ selenin 0
865
+ selenonyl 0
866
+ phenoselenazin 0
867
+ hypoiodous_acid 0
868
+ silanylia 0
869
+ ditellur 0
870
+ arso 0
871
+ helicen 0
872
+ americium 0
873
+ pyreno 0
874
+ selenoxanthen 0
875
+ amoyl 0
876
+ telluroate 0
877
+ selen 0
878
+ selenochromen 0
879
+ diyl 0
880
+ dithianon 0
881
+ ose 0
882
+ plutonium 0
883
+ silicic_acid 0
884
+ 5,6,7,8-tetrahydropter 0
885
+ xenon 0
886
+ sulfamide 0
887
+ bisma 0
888
+ germanium 0
889
+ triphosphate 0
890
+ triphospho 0
891
+ triselen 0
892
+ isocyanide 0
893
+ isophosphinolin 0
894
+ tetrasulfide 0
895
+ dict 0
896
+ bromine 0
897
+ curium 0
898
+ acephenanthrylen 0
899
+ promethium 0
900
+ phosphanthridin 0
901
+ gall 0
902
+ selenocyanat 0
903
+ stilben 0
904
+ disulfide 0
905
+ isochromenylium 0
906
+ tetrathion 0
907
+ thall 0
908
+ selenat 0
909
+ chlor 0
910
+ silanthren 0
911
+ (3-) 0
912
+ tetradecakis 0
913
+ xantheno 0
914
+ chromio 0
915
+ chlorite 0
916
+ californium 0
917
+ tetraphosphat 0
918
+ chlorine 0
919
+ iodoform 0
920
+ telluropyran 0
921
+ polona 0
922
+ lawrencium 0
923
+ naphthyridino 0
924
+ selenon 0
925
+ phenoxarsinin 0
926
+ as-indaceno 0
927
+ mercura 0
928
+ periodate 0
929
+ selenite 0
930
+ hypofluorous_acid 0
931
+ adip 0
932
+ bromyl 0
933
+ arsino 0
934
+ tungstenio 0
935
+ tellurochromen 0
936
+ stibin 0
937
+ trisulfide 0
938
+ isoselenochromen 0
939
+ zircona 0
940
+ hexali 0
941
+ tetraphosphate 0
942
+ onamide 0
943
+ chloronia 0
944
+ thiochromenylium 0
945
+ phosphorus 0
946
+ titana 0
947
+ dicyclohexylurea 0
948
+ phenarsazinin 0
949
+ (8+) 0
950
+ nitroform 0
951
+ molybdenio 0
952
+ undecakis 0
953
+ rubicen 0
954
+ diselenid 0
955
+ triphosphat 0
956
+ diboron 0
957
+ trisulfid 0
958
+ hexadecakis 0
959
+ pleiaden 0
960
+ ter 0
961
+ arsonous_acid 0
962
+ ars 0
963
+ permangan 0
964
+ methoxychlor 0
965
+ tellurinyl 0
966
+ triacetamid 0
967
+ isocyanatid 0
968
+ (7+) 0
969
+ phthalazino 0
970
+ chloric_acid 0
971
+ stibon 0
972
+ tellone 0
973
+ stib 0
974
+ protactinium 0
975
+ fluor 0
976
+ arsonato 0
977
+ einsteinium 0
978
+ tellur 0
979
+ molybda 0
980
+ telluroxanthen 0
981
+ water 0
982
+ pentali 0
983
+ vanadio 0
984
+ formazan 0
985
+ ovalen 0
986
+ brom 0
987
+ thioxantheno 0
988
+ selenomorpholin 0
989
+ arsonium 0
990
+ nobelium 0
991
+ cinnolino 0
992
+ nitrid 0
993
+ telluropyrano 0
994
+ neo 0
995
+ tellurate 0
996
+ bromic_acid 0
997
+ phosphinolino 0
998
+ iodite 0
999
+ arsindol 0
1000
+ phosphen 0
1001
+ tribenzamid 0
1002
+ tellurium 0
1003
+ oxyl 0
1004
+ icosakis 0
1005
+ tellurat 0
1006
+ krypton 0
1007
+ bromite 0
1008
+ tridecakis 0
1009
+ all 0
1010
+ isotellurochromen 0
1011
+ diarsor 0
1012
+ bromosyl 0
1013
+ helium 0
1014
+ disulfite 0
1015
+ deuteride 0
1016
+ carboselenoyl 0
1017
+ bromoform 0
1018
+ trinaphthylen 0
1019
+ octali 0
1020
+ furano 0
1021
+ selenino 0
1022
+ iodic_acid 0
1023
+ hydrotelluro 0
1024
+ boronia 0
1025
+ phosphinolizin 0
1026
+ prism 0
1027
+ periodic_acid 0
1028
+ orot 0
1029
+ pentadecakis 0
1030
+ polonium 0
1031
+ hexasulfide 0
1032
+ stibono 0
1033
+ selenanthren 0
1034
+ ozone 0
1035
+ phosphindolizin 0
1036
+ urana 0
1037
+ pyridino 0
1038
+ phenotellurazin 0
1039
+ meitnerium 0
1040
+ tetrasulfid 0
1041
+ selenonia 0
1042
+ hypobromous_acid 0
1043
+ selenopyrano 0
1044
+ chlorat 0
1045
+ trifluoromethanesulfonimid 0
1046
+ seaborgium 0
1047
+ azor 0
1048
+ azonous_acid 0
1049
+ selenoph 0
1050
+ periodyl 0
1051
+ perbromate 0
1052
+ oson 0
1053
+ berkelium 0
1054
+ tungsta 0
1055
+ ribo 0
1056
+ pentaphosphate 0
1057
+ hafna 0
1058
+ telluropheno 0
1059
+ tellurite 0
1060
+ nitronium 0
1061
+ mon 0
1062
+ astata 0
1063
+ isothiocyanatid 0
1064
+ dubnium 0
1065
+ isothiochromenylium 0
1066
+ tellurin 0
1067
+ sodio 0
1068
+ selenono 0
1069
+ selenochromeno 0
1070
+ nitrosyl 0
1071
+ mendelevium 0
1072
+ ous 0
1073
+ neon 0
1074
+ fluoronio 0
1075
+ azid 0
1076
+ then 0
1077
+ stannanylia 0
1078
+ potassio 0
1079
+ phosphanthren 0
1080
+ disilic 0
1081
+ chlorazin 0
1082
+ titanio 0
1083
+ bromat 0
1084
+ triacontakis 0
1085
+ pentasulfide 0
1086
+ nonadecakis 0
1087
+ rhenio 0
1088
+ platina 0
1089
+ phenoxatellurin 0
1090
+ pentazocine 0
1091
+ ferrio 0
1092
+ cos 0
1093
+ vanada 0
1094
+ triselenid 0
1095
+ telluronyl 0
1096
+ tellurocyanate 0
1097
+ pentazocin 0
1098
+ fulven 0
1099
+ distibor 0
1100
+ diphosphite 0
1101
+ radon 0
1102
+ pentathion 0
1103
+ nitrous_oxide 0
1104
+ ferra 0
1105
+ ditelluron 0
1106
+ bis(trifluoromethylsulfonyl)imid 0
1107
+ acridino 0
1108
+ telluron 0
1109
+ isophosphinolino 0
1110
+ diselenon 0
1111
+ diarson 0
1112
+ stibanuida 0
1113
+ germano 0
1114
+ xanthylium 0
1115
+ tert-butyl(dimethyl)silanyl 0
1116
+ radium 0
1117
+ osma 0
1118
+ chlorous_acid 0
1119
+ bromonio 0
1120
+ arsonia 0
1121
+ arsinolin 0
1122
+ amate 0
1123
+ urazol 0
1124
+ triphosphor 0
1125
+ nonali 0
1126
+ deutero 0
1127
+ nioba 0
1128
+ acridarsin 0
1129
+ yttrio 0
1130
+ tert-butyl-dimethylsilyl 0
1131
+ pyrimidino 0
1132
+ pteridino 0
1133
+ phenoxaselenin 0
1134
+ isocyanid 0
1135
+ irida 0
1136
+ heptadecakis 0
1137
+ bohrium 0
1138
+ pentacosakis 0
1139
+ octadecakis 0
1140
+ thianthreno 0
1141
+ telluroph 0
1142
+ t- 0
1143
+ isophosphindolo 0
1144
+ isoarsindol 0
1145
+ henicosakis 0
1146
+ (4-) 0
1147
+ ruthena 0
1148
+ heptali 0
1149
+ arsen 0
1150
+ telluranthren 0
1151
+ chryseno 0
1152
+ carbotelluroyl 0
1153
+ quinolizino 0
1154
+ nonacosakis 0
1155
+ francium 0
1156
+ ethion 0
1157
+ chroma 0
1158
+ arsanthridin 0
1159
+ arsanthren 0
1160
+ tricosakis 0
1161
+ tetraphosphor 0
1162
+ tetracosakis 0
1163
+ tellurocyanat 0
1164
+ stibonia 0
1165
+ stibonato 0
1166
+ phosphanuida 0
1167
+ phenoxathiino 0
1168
+ manganio 0
1169
+ eicosa 0
1170
+ cobaltio 0
1171
+ cera 0
1172
+ amic_acid 0
1173
+ stibino 0
1174
+ stannanuida 0
1175
+ samario 0
1176
+ s-indaceno 0
1177
+ praseodymio 0
1178
+ phenoxastibinin 0
1179
+ pallada 0
1180
+ neodymio 0
1181
+ isoselenocyanate 0
1182
+ germanuida 0
1183
+ diazoamino 0
1184
+ telluronia 0
1185
+ tantalio 0
1186
+ phenoxyl 0
1187
+ phenothiarsinin 0
1188
+ oxanthreno 0
1189
+ octacosakis 0
1190
+ mangana 0
1191
+ lanthanio 0
1192
+ isoarsinolin 0
1193
+ indan 0
1194
+ hexacosakis 0
1195
+ hassium 0
1196
+ arsinolizin 0
1197
+ alli 0
1198
+ thioxanth 0
1199
+ tert-butyl(diphenyl)silanyl 0
1200
+ stronta 0
1201
+ stannano 0
1202
+ rhodio 0
1203
+ rhoda 0
1204
+ praseodyma 0
1205
+ phenazino 0
1206
+ pentaphosphat 0
1207
+ nitric 0
1208
+ methoxyl 0
1209
+ magnesio 0
1210
+ dichrom 0
1211
+ chlorazine 0
1212
+ californa 0
1213
+ butoxyl 0
1214
+ bromous_acid 0
1215
+ azonic_acid 0
1216
+ arsinolino 0
1217
+ arsindolo 0
1218
+ arsindolizin 0
1219
+ allo 0
1220
+ actina 0
1221
+ uronic_acid 0
1222
+ thora 0
1223
+ telluromorpholin 0
1224
+ stibonium 0
1225
+ stibano 0
1226
+ rhena 0
1227
+ phosphinolizino 0
1228
+ phenothiazino 0
1229
+ perbromyl 0
1230
+ niobio 0
1231
+ nickelio 0
1232
+ isotellurochromeno 0
1233
+ isoselenocyanato 0
1234
+ iodous_acid 0
1235
+ iodous 0
1236
+ hydroselenonyl 0
1237
+ dysprosio 0
1238
+ cyclopenta[a]phenanthr 0
1239
+ cerio 0
1240
+ bara 0
1241
+ aurio 0
1242
+ arsanuida 0
1243
+ ytterbio 0
1244
+ uronate 0
1245
+ tol 0
1246
+ thulio 0
1247
+ tert-butyl-diphenylsilyl 0
1248
+ tellurono 0
1249
+ stannanida 0
1250
+ scandio 0
1251
+ propoxyl 0
1252
+ periodic 0
1253
+ perbromic_acid 0
1254
+ nitror 0
1255
+ lutetio 0
1256
+ isothiocyanic_acid 0
1257
+ iridio 0
1258
+ iodic 0
1259
+ hypobor 0
1260
+ hydroxyl 0
1261
+ hydroseleninyl 0
1262
+ holmio 0
1263
+ hexasulfid 0
1264
+ heptacosakis 0
1265
+ gadolinio 0
1266
+ europio 0
1267
+ ethoxyl 0
1268
+ erbio 0
1269
+ docosakis 0
1270
+ chlorous 0
1271
+ chloric 0
1272
+ arsinimyl 0
1273
+ argentio 0
1274
+ ▁ -0.24368
1275
+ c -3.77761
1276
+ m -3.81933
1277
+ t -4.1484
1278
+ p -4.28552
1279
+ n -4.34236
1280
+ u -4.43826
1281
+ s -4.52053
1282
+ i -4.6648
1283
+ is -4.8052
1284
+ g -4.85455
1285
+ x -5.02503
1286
+ y -5.19016
1287
+ h -5.25276
1288
+ b -5.25733
1289
+ v -5.50657
1290
+ th -5.56431
1291
+ f -5.60089
1292
+ ph -5.65809
1293
+ hy -5.71657
1294
+ ▁p -6.08895
1295
+ cy -6.12699
1296
+ yc -6.28409
1297
+ im -6.3188
1298
+ ti -6.4861
1299
+ ch -6.53742
1300
+ ut -6.55604
1301
+ cys -6.59438
1302
+ st -6.61931
1303
+ ▁h -6.69232
1304
+ pi -6.72852
1305
+ uc -6.85542
1306
+ us -6.89267
1307
+ ▁b -6.96641
1308
+ ▁g -6.99289
1309
+ ▁c -7.03458
1310
+ ys -7.04986
1311
+ ct -7.06609
1312
+ ▁hy -7.10659
1313
+ gu -7.12486
1314
+ sp -7.1249
1315
+ xy -7.2108
1316
+ ▁s -7.3108
1317
+ yp -7.394
1318
+ um -7.39798
1319
+ xim -7.47115
1320
+ thy -7.52489
1321
+ ps -7.53214
1322
+ fu -7.86517
1323
+ ▁cy -7.98841
1324
+ mph -7.99202
1325
+ ▁n -8.03554
1326
+ ni -8.04807
1327
+ ▁m -8.12601
1328
+ nth -8.18462
1329
+ cu -8.19705
1330
+ phth -8.20839
1331
+ ip -8.32472
1332
+ ▁f -8.36171
1333
+ ty -8.47003
1334
+ ▁cu -8.49492
1335
+ ym -8.59996
1336
+ ff -8.60659
1337
+ uf -8.65435
1338
+ fi -8.70783
1339
+ pt -8.74056
1340
+ tun -8.78867
1341
+ yt -8.80236
1342
+ ▁ch -8.81859
1343
+ ▁ps -9.02055
1344
+ ▁sty -9.02375
1345
+ ▁phyt -9.0593
1346
+ ub -9.1473
1347
+ mb -9.15357
1348
+ ▁fu -9.19661
1349
+ if -9.24761
1350
+ ci -9.28944
1351
+ ▁sym -9.29607
1352
+ ss -9.31017
1353
+ up -9.34393
1354
+ sty -9.34753
1355
+ ▁t -9.40241
1356
+ pp -9.46886
1357
+ mi -9.51896
1358
+ gn -9.58869
1359
+ ms -9.85318
1360
+ ▁pi -9.85785
1361
+ ist -9.89882
1362
+ tig -9.95137
1363
+ ▁thy -10.0245
1364
+ vii -10.0685
1365
+ hi -10.077
1366
+ sym -10.0864
1367
+ ▁sub -10.1129
1368
+ ptu -10.1771
1369
+ cti -10.2664
1370
+ ig -10.5468
1371
+ tu -10.5569
1372
+ ▁fuc -10.6338
1373
+ ▁sy -10.726
1374
+ ▁th -10.8515
1375
+ uv -10.9123
1376
+ si -10.9398
1377
+ ▁cys -11.0937
1378
+ bu -11.3456
1379
+ mu -11.3477
1380
+ vi -11.4565
1381
+ mp -11.4617
1382
+ ib -11.5026
1383
+ pu -11.5547
1384
+ ▁i -11.5794
1385
+ ▁bu -11.6761
1386
+ ▁gu -11.6864
1387
+ ▁mu -11.7005
1388
+ ▁st -11.7307
1389
+ un -11.844
1390
+ uct -11.8441
1391
+ ▁u -12.046
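Each line of the table above is a SentencePiece piece followed by its score: the entries with score 0 (the numerals and IUPAC fragments) carry the score SentencePiece assigns to control and user-defined symbols, while the trailing entries with negative scores are learned unigram pieces with log-probability scores that serve as a character-level fallback. A minimal sketch for inspecting the binary model directly (assuming the `sentencepiece` package and a local checkout; the tokenizer replaces spaces with underscores before encoding):

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor(model_file="iupac_gpt/iupac_spm.model")
    print(sp.get_piece_size())                                  # 1391, matching the table above
    print(sp.encode("2-acetyloxybenzoic_acid", out_type=str))   # IUPAC name split into fragments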
iupac-gpt/iupac_gpt/iupac_tokenization.py ADDED
@@ -0,0 +1,131 @@
1
+ from transformers import (
2
+ AdamW,
3
+ DataCollatorWithPadding,
4
+ HfArgumentParser,
5
+ T5Config,
6
+ T5ForConditionalGeneration,
7
+ T5Tokenizer,
8
+ Trainer,
9
+ TrainingArguments,
10
+ )
11
+ from torch.utils.data import DataLoader
12
+ import os
13
+ import tempfile
14
+ import re
15
+ import pandas as pd
16
+ import numpy as np
17
+ from typing import Dict, Optional
18
+ from dataclasses import dataclass, field
19
+ import logging
20
+
21
+ import torch
22
+ from torch.nn.utils.rnn import pad_sequence
23
+ from torch.optim.lr_scheduler import LambdaLR
24
+ import os.path as pt
25
+ import torch.optim as optim
26
+ import torch.nn as nn
27
+ from tqdm import tqdm
28
+ from torch.autograd import Variable
29
+ from .iupac_dataset import IUPACDataset
30
+ import os
31
+ #os.environ["CUDA_VISIBLE_DEVICES"]="0"
32
+
33
+
34
+ class T5Collator:
35
+ def __init__(self, pad_token_id):
36
+ super().__init__()
37
+ self.pad_token_id = pad_token_id
38
+ def __call__(self, records):
39
+ # records is a list of dicts
40
+ batch = {}
41
+ padvals = {"input_ids": self.pad_token_id,'labels':-100}
42
+ for k in records[0]:
43
+ if k in padvals:
44
+ batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],
45
+ batch_first=True,
46
+ padding_value=padvals[k])
47
+ else:
48
+ batch[k] = torch.FloatTensor([r[k] for r in records]) #torch.Tensor
49
+ return batch
50
+
51
+ class T5IUPACTokenizer(T5Tokenizer):
52
+ def prepare_for_tokenization(self, text, is_split_into_words=False,
53
+ **kwargs):
54
+ return re.sub(" ", "_", text), kwargs
55
+
56
+ def _decode(self, *args, **kwargs):
57
+ # replace "_" with " ", except for the _ in extra_id_#
58
+ text = super()._decode(*args, **kwargs)
59
+ text = re.sub("extra_id_", "extraAidA", text)
60
+ text = re.sub("_", " ", text)
61
+ text = re.sub("extraAidA", "extra_id_", text)
62
+ return text
63
+
64
+ def sentinels(self, sentinel_ids):
65
+ return self.vocab_size - sentinel_ids - 1
66
+
67
+ def sentinel_mask(self, ids):
68
+ return ((self.vocab_size - self._extra_ids <= ids) &
69
+ (ids < self.vocab_size))
70
+
71
+ def _tokenize(self, text, sample=False):
72
+ #pieces = super()._tokenize(text, sample=sample)
73
+ pieces = super()._tokenize(text)
74
+ # sentencepiece adds a non-printing token at the start. Remove it
75
+ return pieces[1:]
76
+
77
+ def prepare_input(data,device):
78
+ from collections.abc import Mapping
79
+ if isinstance(data, Mapping):
80
+ return type(data)({k: prepare_input(v,device) for k, v in data.items()})
81
+ elif isinstance(data, (tuple, list)):
82
+ return type(data)(prepare_input(v,device) for v in data)
83
+ elif isinstance(data, torch.Tensor):
84
+ kwargs = dict(device=device)
85
+ if data.dtype != torch.int64:
86
+ # NLP models inputs are int64 and those get adjusted to the right dtype of the
87
+ # embedding. Other models such as wav2vec2's inputs are already float and thus
88
+ # may need special handling to match the dtypes of the model
89
+ kwargs.update(dict(dtype=torch.int64))
90
+
91
+ return data.to(**kwargs)
92
+ return data
93
+
94
+ def get_data_loader(is_train=1):
95
+
96
+ full_path = '/home/jmwang/drugai/iupac-gpt/iupac_gpt/'
97
+
98
+ iupac_tokenizer = T5IUPACTokenizer(vocab_file=full_path+'iupac_spm.model')
99
+ iupac_vocab_size = iupac_tokenizer.vocab_size
100
+ print('iupac_vocab_size:',iupac_vocab_size)
101
+ if is_train:
102
+ torch.save(iupac_tokenizer, pt.join(full_path,"real_iupac_tokenizer.pt"))
103
+ print("training...",len(iupac_tokenizer))
104
+ else:
105
+ iupac_tokenizer = torch.load(pt.join(full_path,"real_iupac_tokenizer.pt"), map_location="cpu")
106
+ print('fine_tune...',len(iupac_tokenizer))
107
+
108
+ dataset_filename = 'data/pubchem_iupac_smile_gpt.csv'
109
+ target_col = "aLogP"
110
+ iupac_name_col = 'PUBCHEM_IUPAC_NAME' #canon_smiles
111
+ MAXLEN=1024
112
+ dataset_kwargs = {"dataset_dir":'/home/jmwang/drugai/iupac-gpt',"dataset_filename": dataset_filename,"tokenizer": iupac_tokenizer,"max_length": MAXLEN,"target_col": target_col,'dataset_size':None,"iupac_name_col":iupac_name_col}
113
+ train_dataset = IUPACDataset(**dataset_kwargs)
114
+ collator = T5Collator(iupac_tokenizer.pad_token_id)
115
+ train_dataloader = DataLoader(train_dataset,batch_size=64,collate_fn=collator,shuffle=True)
116
+
117
+ return train_dataloader,iupac_tokenizer
118
+
119
+ if __name__ == "__main__":
120
+
121
+ train_dataloader,iupac_tokenizer = get_data_loader(is_train=1)
122
+ pbar = tqdm(train_dataloader)
123
+ device = 'cpu'
124
+ for inputs in pbar:
125
+
126
+ src_label = Variable(inputs["labels"].to(device))
127
+ inputs = prepare_input(inputs,device)
128
+ src = Variable(inputs["input_ids"].to(device))
129
+ #self.tokenizer._convert_token_to_id
130
+
131
+ print(src[:,:].shape,src_label)
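The tokenizer's space handling is easy to check in isolation: `prepare_for_tokenization` maps spaces to underscores before SentencePiece runs, and `_decode` maps them back. A minimal round-trip sketch (the vocab path is a placeholder; the example string is the PubChem IUPAC name of aspirin):

    tok = T5IUPACTokenizer(vocab_file="iupac_gpt/iupac_spm.model")
    ids = tok("2-acetyloxybenzoic acid")["input_ids"]
    print(ids[-1] == tok.eos_token_id)                # True: </s> is appended automatically
    print(tok.decode(ids, skip_special_tokens=True))  # underscores restored to spaces on decode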
iupac-gpt/iupac_gpt/iupac_tokenization_class.py ADDED
@@ -0,0 +1,131 @@
1
+ from transformers import (
2
+ AdamW,
3
+ DataCollatorWithPadding,
4
+ HfArgumentParser,
5
+ T5Config,
6
+ T5ForConditionalGeneration,
7
+ T5Tokenizer,
8
+ Trainer,
9
+ TrainingArguments,
10
+ )
11
+ from torch.utils.data import DataLoader
12
+ import os
13
+ import tempfile
14
+ import re
15
+ import pandas as pd
16
+ import numpy as np
17
+ from typing import Dict, Optional
18
+ from dataclasses import dataclass, field
19
+ import logging
20
+
21
+ import torch
22
+ from torch.nn.utils.rnn import pad_sequence
23
+ from torch.optim.lr_scheduler import LambdaLR
24
+ import os.path as pt
25
+ import torch.optim as optim
26
+ import torch.nn as nn
27
+ from tqdm import tqdm
28
+ from torch.autograd import Variable
29
+ from .iupac_dataset_class import IUPACDataset
30
+ import os
31
+ #os.environ["CUDA_VISIBLE_DEVICES"]="0"
32
+
33
+
34
+ class T5Collator:
35
+ def __init__(self, pad_token_id):
36
+ super().__init__()
37
+ self.pad_token_id = pad_token_id
38
+ def __call__(self, records):
39
+ # records is a list of dicts
40
+ batch = {}
41
+ padvals = {"input_ids": self.pad_token_id,'attention_mask':0}
42
+ for k in records[0]:
43
+ if k in padvals:
44
+ batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],
45
+ batch_first=True,
46
+ padding_value=padvals[k])
47
+ else:
48
+ batch[k] = torch.LongTensor([r[k] for r in records]) #torch.Tensor LongTensor FloatTensor
49
+ return batch
50
+
51
+ class T5IUPACTokenizer(T5Tokenizer):
52
+ def prepare_for_tokenization(self, text, is_split_into_words=False,
53
+ **kwargs):
54
+ return re.sub(" ", "_", text), kwargs
55
+
56
+ def _decode(self, *args, **kwargs):
57
+ # replace "_" with " ", except for the _ in extra_id_#
58
+ text = super()._decode(*args, **kwargs)
59
+ text = re.sub("extra_id_", "extraAidA", text)
60
+ text = re.sub("_", " ", text)
61
+ text = re.sub("extraAidA", "extra_id_", text)
62
+ return text
63
+
64
+ def sentinels(self, sentinel_ids):
65
+ return self.vocab_size - sentinel_ids - 1
66
+
67
+ def sentinel_mask(self, ids):
68
+ return ((self.vocab_size - self._extra_ids <= ids) &
69
+ (ids < self.vocab_size))
70
+
71
+ def _tokenize(self, text, sample=False):
72
+ #pieces = super()._tokenize(text, sample=sample)
73
+ pieces = super()._tokenize(text)
74
+ # sentencepiece adds a non-printing token at the start. Remove it
75
+ return pieces[1:]
76
+
77
+ def prepare_input_class(data,device):
78
+ from collections.abc import Mapping
79
+ if isinstance(data, Mapping):
80
+ return type(data)({k: prepare_input_class(v,device) for k, v in data.items()})
81
+ elif isinstance(data, (tuple, list)):
82
+ return type(data)(prepare_input_class(v,device) for v in data)
83
+ elif isinstance(data, torch.Tensor):
84
+ kwargs = dict(device=device)
85
+ if data.dtype != torch.int64:
86
+ # NLP models inputs are int64 and those get adjusted to the right dtype of the
87
+ # embedding. Other models such as wav2vec2's inputs are already float and thus
88
+ # may need special handling to match the dtypes of the model
89
+ kwargs.update(dict(dtype=torch.int64))
90
+
91
+ return data.to(**kwargs)
92
+ return data
93
+
94
+ def get_data_loader_class(is_train=1):
95
+
96
+ full_path = '/root/autodl-tmp/wjm/iupac-gpt/iupac_gpt/'
97
+
98
+ iupac_tokenizer = T5IUPACTokenizer(vocab_file=full_path+'iupac_spm.model')
99
+ iupac_vocab_size = iupac_tokenizer.vocab_size
100
+ print('iupac_vocab_size:',iupac_vocab_size)
101
+ if is_train:
102
+ torch.save(iupac_tokenizer, pt.join(full_path,"real_iupac_tokenizer.pt"))
103
+ print("training...",len(iupac_tokenizer))
104
+ else:
105
+ iupac_tokenizer = torch.load(pt.join(full_path,"real_iupac_tokenizer.pt"), map_location="cpu")
106
+ print('fine_tune...',len(iupac_tokenizer))
107
+
108
+ dataset_filename = 'iupacs_logp.csv' #'./pubchem_iupac_smile_gpt.csv'
109
+ target_col = "LogP" #"aLogP"
110
+ iupac_name_col = 'iupac' #'PUBCHEM_IUPAC_NAME'
111
+ MAXLEN=1024
112
+ dataset_kwargs = {"dataset_dir":full_path,"dataset_filename": dataset_filename,"tokenizer": iupac_tokenizer,"max_length": MAXLEN,"target_col": target_col,'dataset_size':None,"iupac_name_col":iupac_name_col}
113
+ train_dataset = IUPACDataset(**dataset_kwargs)
114
+ collator = T5Collator(iupac_tokenizer.pad_token_id)
115
+ train_dataloader = DataLoader(train_dataset,batch_size=64,collate_fn=collator,shuffle=True)
116
+
117
+ return train_dataloader,iupac_tokenizer
118
+
119
+ if __name__ == "__main__":
120
+
121
+ train_dataloader,iupac_tokenizer = get_data_loader_class(is_train=1)
122
+ pbar = tqdm(train_dataloader)
123
+ device = 'cpu'
124
+ for inputs in pbar:
125
+
126
+ src_label = Variable(inputs["labels"].to(device))
127
+ inputs = prepare_input_class(inputs,device)
128
+ src = Variable(inputs["input_ids"].to(device))
129
+ #self.tokenizer._convert_token_to_id
130
+
131
+ print(src[:,:].shape,src_label)
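Relative to iupac_tokenization.py above, only the dataset module and the collator change: attention_mask is padded with 0 and the labels are collated into a LongTensor, i.e. integer class targets rather than float regression targets. A tiny padding sketch with made-up token ids:

    collator = T5Collator(pad_token_id=0)
    batch = collator([
        {"input_ids": [2, 158, 163, 1], "attention_mask": [1, 1, 1, 1], "labels": 1},
        {"input_ids": [2, 159, 1],      "attention_mask": [1, 1, 1],    "labels": 0},
    ])
    # batch["input_ids"]      -> shape (2, 4); the shorter row is right-padded with 0 (<pad>)
    # batch["attention_mask"] -> padded with 0 in the same positions
    # batch["labels"]         -> tensor([1, 0]) of dtype int64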
iupac-gpt/iupac_gpt/iupac_tokenization_iupac.py ADDED
@@ -0,0 +1,134 @@
1
+ from transformers import (
2
+ AdamW,
3
+ DataCollatorWithPadding,
4
+ HfArgumentParser,
5
+ T5Config,
6
+ T5ForConditionalGeneration,
7
+ T5Tokenizer,
8
+ Trainer,
9
+ TrainingArguments,
10
+ )
11
+ from torch.utils.data import DataLoader
12
+ import os
13
+ import tempfile
14
+ import re
15
+ import pandas as pd
16
+ import numpy as np
17
+ from typing import Dict, Optional
18
+ from dataclasses import dataclass, field
19
+ import logging
20
+
21
+ import torch
22
+ from torch.nn.utils.rnn import pad_sequence
23
+ from torch.optim.lr_scheduler import LambdaLR
24
+ import os.path as pt
25
+ import torch.optim as optim
26
+ import torch.nn as nn
27
+ from tqdm import tqdm
28
+ from torch.autograd import Variable
29
+ from .iupac_dataset import IUPACDataset
30
+ import os
31
+ #os.environ["CUDA_VISIBLE_DEVICES"]="0"
32
+
33
+
34
+ class T5Collator:
35
+ def __init__(self, pad_token_id):
36
+ super().__init__()
37
+ self.pad_token_id = pad_token_id
38
+ def __call__(self, records):
39
+ # records is a list of dicts
40
+ batch = {}
41
+ padvals = {"input_ids": self.pad_token_id,'attention_mask':0,'labels':-100}
42
+ for k in records[0]:
43
+ if k in padvals:
44
+ batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],
45
+ batch_first=True,
46
+ padding_value=padvals[k])
47
+ else:
48
+ batch[k] = torch.FloatTensor([r[k] for r in records]) #torch.Tensor
49
+ return batch
50
+
51
+ class T5IUPACTokenizer(T5Tokenizer):
52
+ def prepare_for_tokenization(self, text, is_split_into_words=False,
53
+ **kwargs):
54
+ return re.sub(" ", "_", text), kwargs
55
+
56
+ def _decode(self, *args, **kwargs):
57
+ # replace "_" with " ", except for the _ in extra_id_#
58
+ text = super()._decode(*args, **kwargs)
59
+ text = re.sub("extra_id_", "extraAidA", text)
60
+ text = re.sub("_", " ", text)
61
+ text = re.sub("extraAidA", "extra_id_", text)
62
+ return text
63
+
64
+ def sentinels(self, sentinel_ids):
65
+ return self.vocab_size - sentinel_ids - 1
66
+
67
+ def sentinel_mask(self, ids):
68
+ return ((self.vocab_size - self._extra_ids <= ids) &
69
+ (ids < self.vocab_size))
70
+
71
+ def _tokenize(self, text, sample=False):
72
+ #pieces = super()._tokenize(text, sample=sample)
73
+ pieces = super()._tokenize(text)
74
+ # sentencepiece adds a non-printing token at the start. Remove it
75
+ return pieces[1:]
76
+
77
+ def prepare_input(data,device):
78
+ from collections.abc import Mapping
79
+ if isinstance(data, Mapping):
80
+ return type(data)({k: prepare_input(v,device) for k, v in data.items()})
81
+ elif isinstance(data, (tuple, list)):
82
+ return type(data)(prepare_input(v,device) for v in data)
83
+ elif isinstance(data, torch.Tensor):
84
+ kwargs = dict(device=device)
85
+ if data.dtype != torch.int64:
86
+ # NLP models inputs are int64 and those get adjusted to the right dtype of the
87
+ # embedding. Other models such as wav2vec2's inputs are already float and thus
88
+ # may need special handling to match the dtypes of the model
89
+ kwargs.update(dict(dtype=torch.int64))
90
+
91
+ return data.to(**kwargs)
92
+ return data
93
+
94
+ def get_data_loader(is_train=1,dataset_filename = './pubchem_iupac_smile_gpt.csv'):
95
+
96
+ full_path = '/home/jmwang/drugai/iupac-gpt/iupac_gpt/'
97
+
98
+ iupac_tokenizer = T5IUPACTokenizer(vocab_file=full_path+'iupac_spm.model')
99
+ iupac_vocab_size = iupac_tokenizer.vocab_size
100
+ print('iupac_vocab_size:',iupac_vocab_size)
101
+ if is_train:
102
+ torch.save(iupac_tokenizer, pt.join(full_path,"real_iupac_tokenizer.pt"))
103
+ print("training...",len(iupac_tokenizer))
104
+ else:
105
+ iupac_tokenizer = torch.load(pt.join(full_path,"real_iupac_tokenizer.pt"), map_location="cpu")
106
+ print('fine_tune...',len(iupac_tokenizer))
107
+
108
+ target_col = "aLogP"
109
+ iupac_name_col = 'PUBCHEM_IUPAC_NAME'
110
+ MAXLEN=1024
111
+ dataset_kwargs = {"dataset_dir":full_path,"dataset_filename": dataset_filename,"tokenizer": iupac_tokenizer,"max_length": MAXLEN,"target_col": target_col,'dataset_size':None,"iupac_name_col":iupac_name_col}
112
+ train_dataset = IUPACDataset(**dataset_kwargs)
113
+
114
+ #for i in train_dataset:
115
+ # train_dataset[i]=train_dataset[i].to(device)
116
+
117
+ collator = T5Collator(iupac_tokenizer.pad_token_id)
118
+ train_dataloader = DataLoader(train_dataset,batch_size=64,collate_fn=collator,shuffle=True)
119
+
120
+ return train_dataloader,iupac_tokenizer
121
+
122
+ if __name__ == "__main__":
123
+
124
+ train_dataloader,iupac_tokenizer = get_data_loader(is_train=1)
125
+ pbar = tqdm(train_dataloader)
126
+ device = 'cpu'
127
+ for inputs in pbar:
128
+
129
+ src_label = Variable(inputs["labels"].to(device))
130
+ inputs = prepare_input(inputs,device)
131
+ src = Variable(inputs["input_ids"].to(device))
132
+ #self.tokenizer._convert_token_to_id
133
+
134
+ print(src[:,:].shape,src_label)
iupac-gpt/iupac_gpt/iupac_tokenization_pro.py ADDED
@@ -0,0 +1,131 @@
1
+ from transformers import (
2
+ AdamW,
3
+ DataCollatorWithPadding,
4
+ HfArgumentParser,
5
+ T5Config,
6
+ T5ForConditionalGeneration,
7
+ T5Tokenizer,
8
+ Trainer,
9
+ TrainingArguments,
10
+ )
11
+ from torch.utils.data import DataLoader
12
+ import os
13
+ import tempfile
14
+ import re
15
+ import pandas as pd
16
+ import numpy as np
17
+ from typing import Dict, Optional
18
+ from dataclasses import dataclass, field
19
+ import logging
20
+
21
+ import torch
22
+ from torch.nn.utils.rnn import pad_sequence
23
+ from torch.optim.lr_scheduler import LambdaLR
24
+ import os.path as pt
25
+ import torch.optim as optim
26
+ import torch.nn as nn
27
+ from tqdm import tqdm
28
+ from torch.autograd import Variable
29
+ from .iupac_dataset_pro import IUPACDataset
30
+ import os
31
+ #os.environ["CUDA_VISIBLE_DEVICES"]="0"
32
+
33
+
34
+ class T5Collator:
35
+ def __init__(self, pad_token_id):
36
+ super().__init__()
37
+ self.pad_token_id = pad_token_id
38
+ def __call__(self, records):
39
+ # records is a list of dicts
40
+ batch = {}
41
+ padvals = {"input_ids": self.pad_token_id,'attention_mask':0}
42
+ for k in records[0]:
43
+ if k in padvals:
44
+ batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],
45
+ batch_first=True,
46
+ padding_value=padvals[k])
47
+ else:
48
+ batch[k] = torch.FloatTensor([r[k] for r in records]) #torch.Tensor LongTensor FloatTensor
49
+ return batch
50
+
51
+ class T5IUPACTokenizer(T5Tokenizer):
52
+ def prepare_for_tokenization(self, text, is_split_into_words=False,
53
+ **kwargs):
54
+ return re.sub(" ", "_", text), kwargs
55
+
56
+ def _decode(self, *args, **kwargs):
57
+ # replace "_" with " ", except for the _ in extra_id_#
58
+ text = super()._decode(*args, **kwargs)
59
+ text = re.sub("extra_id_", "extraAidA", text)
60
+ text = re.sub("_", " ", text)
61
+ text = re.sub("extraAidA", "extra_id_", text)
62
+ return text
63
+
64
+ def sentinels(self, sentinel_ids):
65
+ return self.vocab_size - sentinel_ids - 1
66
+
67
+ def sentinel_mask(self, ids):
68
+ return ((self.vocab_size - self._extra_ids <= ids) &
69
+ (ids < self.vocab_size))
70
+
71
+ def _tokenize(self, text, sample=False):
72
+ #pieces = super()._tokenize(text, sample=sample)
73
+ pieces = super()._tokenize(text)
74
+ # sentencepiece adds a non-printing token at the start. Remove it
75
+ return pieces[1:]
76
+
77
+ def prepare_input_pro(data,device):
78
+ from collections.abc import Mapping
79
+ if isinstance(data, Mapping):
80
+ return type(data)({k: prepare_input_pro(v,device) for k, v in data.items()})
81
+ elif isinstance(data, (tuple, list)):
82
+ return type(data)(prepare_input_pro(v,device) for v in data)
83
+ elif isinstance(data, torch.Tensor):
84
+ kwargs = dict(device=device)
85
+ if data.dtype != torch.int64:
86
+ # NLP models inputs are int64 and those get adjusted to the right dtype of the
87
+ # embedding. Other models such as wav2vec2's inputs are already float and thus
88
+ # may need special handling to match the dtypes of the model
89
+ kwargs.update(dict(dtype=torch.int64))
90
+
91
+ return data.to(**kwargs)
92
+ return data
93
+
94
+ def get_data_loader_pro(is_train=1):
95
+
96
+ full_path = '/root/autodl-tmp/wjm/iupac-gpt/iupac_gpt/'
97
+
98
+ iupac_tokenizer = T5IUPACTokenizer(vocab_file=full_path+'iupac_spm.model')
99
+ iupac_vocab_size = iupac_tokenizer.vocab_size
100
+ print('iupac_vocab_size:',iupac_vocab_size)
101
+ if is_train:
102
+ torch.save(iupac_tokenizer, pt.join(full_path,"real_iupac_tokenizer.pt"))
103
+ print("training...",len(iupac_tokenizer))
104
+ else:
105
+ iupac_tokenizer = torch.load(pt.join(full_path,"real_iupac_tokenizer.pt"), map_location="cpu")
106
+ print('fine_tune...',len(iupac_tokenizer))
107
+
108
+ dataset_filename = 'iupacs_logp.csv'
109
+ target_col = "LogP"
110
+ iupac_name_col = 'iupac'
111
+ MAXLEN=1024
112
+ dataset_kwargs = {"dataset_dir":full_path,"dataset_filename": dataset_filename,"tokenizer": iupac_tokenizer,"max_length": MAXLEN,"target_col": target_col,'dataset_size':None,"iupac_name_col":iupac_name_col}
113
+ train_dataset = IUPACDataset(**dataset_kwargs)
114
+ collator = T5Collator(iupac_tokenizer.pad_token_id)
115
+ train_dataloader = DataLoader(train_dataset,batch_size=64,collate_fn=collator,shuffle=True)
116
+
117
+ return train_dataloader,iupac_tokenizer
118
+
119
+ if __name__ == "__main__":
120
+
121
+ train_dataloader,iupac_tokenizer = get_data_loader_pro(is_train=1)
122
+ pbar = tqdm(train_dataloader)
123
+ device = 'cpu'
124
+ for inputs in pbar:
125
+
126
+ src_label = Variable(inputs["labels"].to(device))
127
+ inputs = prepare_input_pro(inputs,device)
128
+ src = Variable(inputs["input_ids"].to(device))
129
+ #self.tokenizer._convert_token_to_id
130
+
131
+ print(src[:,:].shape,src_label)
iupac-gpt/iupac_gpt/iupacs_logp.csv ADDED
The diff for this file is too large to render. See raw diff
 
iupac-gpt/iupac_gpt/language_modeling.py ADDED
@@ -0,0 +1,68 @@
1
+ """Pytorch-lightning module for causal language modeling.
2
+ """
3
+
4
+ __all__ = ("GPT2LitModel",)
5
+
6
+ import pytorch_lightning as pl
7
+ import torch
8
+
9
+
10
+ class GPT2LitModel(pl.LightningModule):
11
+ """Lightning module for autoregressive (causal) transformer language modeling.
12
+ Successfully tested on HuggingFace `GPT2LMHeadModel`.
13
+ """
14
+
15
+ def __init__(self, transformer, batch_size: int, learning_rate: float,
16
+ final_learning_rate: float, weight_decay: float, adam_eps: float,
17
+ adam_betas: tuple, scheduler_T_max: int,
18
+ save_model_every: int = 10_000, checkpoint: str = ""):
19
+ super().__init__()
20
+ self.save_hyperparameters(ignore=("transformer", "save_model_every",
21
+ "checkpoint"))
22
+ self.transformer = transformer
23
+ self.save_model_every = save_model_every
24
+ self.checkpoint = checkpoint or "./gpt2litmodel-logs"
25
+
26
+ def forward(self, *args, **kwargs):
27
+ return self.transformer(*args, **kwargs)
28
+
29
+ def training_step(self, batch, batch_idx):
30
+ outputs = self(**batch)
31
+
32
+ if self.save_model_every > 0 and batch_idx % self.save_model_every == 0:
33
+ self.transformer.save_pretrained(self.checkpoint)
34
+
35
+ return {'loss': outputs['loss']}
36
+
37
+ def training_epoch_end(self, outputs):
38
+ if self.save_model_every > 0:
39
+ self.transformer.save_pretrained(self.checkpoint)
40
+
41
+ losses = [step_output["loss"] for step_output in outputs]
42
+ mean_loss = torch.tensor(losses).mean()
43
+ ppl = torch.exp(mean_loss)
44
+
45
+ self.log("ppl", ppl, on_step=False, on_epoch=True, prog_bar=True)
46
+
47
+ def configure_optimizers(self):
48
+ parameters = list(self.named_parameters())  # materialize: the generator is iterated twice below
49
+ no_decay = ["bias", "LayerNorm.weight"]
50
+ grouped_parameters = [
51
+ {"params": [p for n, p in parameters
52
+ if not any(nd in n for nd in no_decay)],
53
+ "weight_decay": self.hparams.weight_decay},
54
+ {"params": [p for n, p in parameters
55
+ if any(nd in n for nd in no_decay)],
56
+ "weight_decay": 0.0}]
57
+ optimizer = torch.optim.Adam(
58
+ grouped_parameters, lr=self.hparams.learning_rate,
59
+ weight_decay=self.hparams.weight_decay,
60
+ eps=self.hparams.adam_eps, betas=self.hparams.adam_betas)
61
+
62
+ lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
63
+ optimizer, self.hparams.scheduler_T_max,
64
+ eta_min=self.hparams.final_learning_rate)
65
+
66
+ return {'optimizer': optimizer,
67
+ 'lr_scheduler': {'scheduler': lr_scheduler,
68
+ 'interval': 'step', 'frequency': 1}}
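A minimal sketch of wiring this module to a HuggingFace GPT-2 backbone; the hyperparameter values are the ones used in notebooks/iupac_language-modeling.py, and `train_dataloader` is assumed to be a DataLoader yielding input_ids/attention_mask/labels batches such as the ones built by the collators above:

    import pytorch_lightning as pl
    from transformers import GPT2Config, GPT2LMHeadModel

    config = GPT2Config(vocab_size=1491, n_layer=8, n_head=8, n_embd=256,
                        bos_token_id=2, eos_token_id=1)
    lit_model = GPT2LitModel(GPT2LMHeadModel(config),
                             batch_size=64, learning_rate=5e-4, final_learning_rate=5e-8,
                             weight_decay=0.0, adam_eps=1e-8, adam_betas=(0.9, 0.999),
                             scheduler_T_max=1_000, checkpoint="./gpt2litmodel-logs")
    trainer = pl.Trainer(gpus=[0], max_epochs=10)
    # trainer.fit(lit_model, train_dataloader)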
iupac-gpt/iupac_gpt/pubchem_iupac_smile_gpt.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b052dd26a26107e9c86a2b155a693669fb1f4fbf498762abe2d19fbaa6867567
3
+ size 2825708735
iupac-gpt/iupac_gpt/real_iupac_tokenizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1696e3f3060bcce33275387e4eb4e175f4c64a015962ac4f3c5f49f25ed6f335
3
+ size 3529
iupac-gpt/iupac_gpt/tokenization.py ADDED
@@ -0,0 +1,193 @@
1
+ """SMILES-based tokenization utilities.
2
+ """
3
+
4
+ __all__ = ("PAD_TOKEN", "BOS_TOKEN", "EOS_TOKEN", "UNK_TOKEN", "SUFFIX",
5
+ "SPECIAL_TOKENS", "PAD_TOKEN_ID", "BOS_TOKEN_ID", "EOS_TOKEN_ID",
6
+ "UNK_TOKEN_ID", "SMILESBPETokenizer", "SMILESAlphabet")
7
+
8
+ from collections.abc import Collection, Iterator
9
+ from dataclasses import dataclass
10
+ from itertools import chain
11
+ from typing import Any, Dict, FrozenSet, List, Optional, Set, Tuple, Union
12
+ from tokenizers import AddedToken, Tokenizer
13
+ from tokenizers import decoders, models, normalizers, processors, trainers
14
+ from tokenizers.implementations import BaseTokenizer
15
+ from transformers import PreTrainedTokenizerFast
16
+
17
+
18
+ SUFFIX, PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN = "", "<pad>", "<s>", "</s>", "<unk>"
19
+ SPECIAL_TOKENS = [PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
20
+ PAD_TOKEN_ID, BOS_TOKEN_ID, EOS_TOKEN_ID, UNK_TOKEN_ID = range(4)
21
+
22
+
23
+ class SMILESBPETokenizer(BaseTokenizer):
24
+ """Tokenizes SMILES strings and applies BPE.
25
+
26
+ Args:
27
+ vocab (`str` or `dict`, optional, defaults to `None`):
28
+ Token vocabulary.
29
+ merges (`str` or `dict` or `tuple`, optional, defaults to `None`):
30
+ BPE merges.
31
+ unk_token (`str` or `tokenizers.AddedToken`, optional, defaults to "<unk>")
32
+ suffix (`str`, defaults to "")
33
+ dropout (`float`, defaults to `None`)
34
+
35
+ Examples:
36
+ >>> tokenizer = SMILESBPETokenizer()
37
+ >>> tokenizer.train("path-to-smiles-strings-file")
38
+ Tokenization logs...
39
+ >>> tokenizer.save_model("checkpoints-path")
40
+ >>> same_tokenizer = SMILESBPETokenizer.from_file("checkpoints-path/vocab.json",
41
+ ... "checkpoints-path/merges.txt")
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ vocab: Optional[Union[str, Dict[str, int]]] = None,
47
+ merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
48
+ unk_token: Union[str, AddedToken] = "<unk>",
49
+ suffix: str = SUFFIX,
50
+ dropout: Optional[float] = None,
51
+ ) -> None:
52
+ unk_token_str = str(unk_token)
53
+
54
+ tokenizer = Tokenizer(models.BPE(vocab, merges, dropout=dropout,
55
+ unk_token=unk_token_str,
56
+ end_of_word_suffix=suffix))
57
+
58
+ if tokenizer.token_to_id(unk_token_str) is not None:
59
+ tokenizer.add_special_tokens([unk_token_str])
60
+
61
+ tokenizer.normalizer = normalizers.Strip(left=False, right=True)
62
+ tokenizer.decoder = decoders.Metaspace(add_prefix_space=True)
63
+ tokenizer.post_processor = processors.TemplateProcessing(
64
+ single=f"{BOS_TOKEN} $A {EOS_TOKEN}",
65
+ special_tokens=[(BOS_TOKEN, BOS_TOKEN_ID), (EOS_TOKEN, EOS_TOKEN_ID)])
66
+
67
+ parameters = {"model": "BPE", "unk_token": unk_token, "suffix": suffix,
68
+ "dropout": dropout}
69
+
70
+ super().__init__(tokenizer, parameters)
71
+
72
+ @classmethod
73
+ def from_file(cls, vocab_filename: str, merges_filename: str, **kwargs) \
74
+ -> "SMILESBPETokenizer":
75
+ vocab, merges = models.BPE.read_file(vocab_filename, merges_filename)
76
+ return cls(vocab, merges, **kwargs)
77
+
78
+ def train(
79
+ self,
80
+ files: Union[str, List[str]],
81
+ vocab_size: int = 1_000,
82
+ min_frequency: int = 2,
83
+ special_tokens: List[Union[str, AddedToken]] = None,
84
+ limit_alphabet: int = 200,
85
+ initial_alphabet: List[str] = None,
86
+ suffix: Optional[str] = SUFFIX,
87
+ show_progress: bool = True,
88
+ ) -> None:
89
+ special_tokens = special_tokens or SPECIAL_TOKENS
90
+ initial_alphabet = initial_alphabet or []
91
+
92
+ trainer = trainers.BpeTrainer(vocab_size=vocab_size,
93
+ min_frequency=min_frequency,
94
+ special_tokens=special_tokens,
95
+ limit_alphabet=limit_alphabet,
96
+ initial_alphabet=initial_alphabet,
97
+ end_of_word_suffix=suffix,
98
+ show_progress=show_progress)
99
+ if isinstance(files, str):
100
+ files = [files]
101
+ self._tokenizer.train(files, trainer=trainer)
102
+
103
+ def train_from_iterator(
104
+ self,
105
+ iterator: Iterator,
106
+ vocab_size: int = 1_000,
107
+ min_frequency: int = 2,
108
+ special_tokens: List[Union[str, AddedToken]] = None,
109
+ limit_alphabet: int = 200,
110
+ initial_alphabet: List[str] = None,
111
+ suffix: Optional[str] = SUFFIX,
112
+ show_progress: bool = True,
113
+ ) -> None:
114
+ special_tokens = special_tokens or SPECIAL_TOKENS
115
+ initial_alphabet = initial_alphabet or []
116
+
117
+ trainer = trainers.BpeTrainer(vocab_size=vocab_size,
118
+ min_frequency=min_frequency,
119
+ special_tokens=special_tokens,
120
+ limit_alphabet=limit_alphabet,
121
+ initial_alphabet=initial_alphabet,
122
+ end_of_word_suffix=suffix,
123
+ show_progress=show_progress)
124
+ self._tokenizer.train_from_iterator(iterator, trainer=trainer)
125
+
126
+ @staticmethod
127
+ def get_hf_tokenizer(
128
+ tokenizer_file: str,
129
+ special_tokens: Optional[Dict[str, str]] = None,
130
+ model_max_length: int = 512,
131
+ *init_inputs, **kwargs
132
+ ) -> PreTrainedTokenizerFast:
133
+ """Gets HuggingFace tokenizer from the pretrained `tokenizer_file`. Optionally,
134
+ appends `special_tokens` to vocabulary and sets `model_max_length`.
135
+ """
136
+ tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file,
137
+ *init_inputs, **kwargs)
138
+ special_tokens = special_tokens or dict(zip(
139
+ ["pad_token", "bos_token", "eos_token", "unk_token"],
140
+ SPECIAL_TOKENS))
141
+ tokenizer.add_special_tokens(special_tokens)
142
+ tokenizer.model_max_length = model_max_length
143
+ return tokenizer
144
+
145
+
146
+ @dataclass(init=True, eq=False, repr=True, frozen=True)
147
+ class SMILESAlphabet(Collection):
148
+ atoms: FrozenSet[str] = frozenset([
149
+ 'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh',
150
+ 'Bi', 'Bk', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Co', 'Cr',
151
+ 'Cs', 'Cu', 'Db', 'Dy', 'Er', 'Es', 'Eu', 'F', 'Fe', 'Fm', 'Fr', 'Ga',
152
+ 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg', 'Ho', 'Hs', 'I', 'In', 'Ir', 'K',
153
+ 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'Mt', 'N', 'Na',
154
+ 'Nb', 'Nd', 'Ne', 'Ni', 'No', 'Np', 'O', 'Os', 'P', 'Pa', 'Pb', 'Pd',
155
+ 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rh', 'Rn',
156
+ 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb',
157
+ 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'V', 'W', 'Xe', 'Y', 'Yb',
158
+ 'Zn', 'Zr'
159
+ ])
160
+
161
+ # Bonds, charges, etc.
162
+ non_atoms: FrozenSet[str] = frozenset([
163
+ '-', '=', '#', ':', '(', ')', '.', '[', ']', '+', '-', '\\', '/', '*',
164
+ '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
165
+ '@', 'AL', 'TH', 'SP', 'TB', 'OH',
166
+ ])
167
+
168
+ additional: FrozenSet[str] = frozenset()
169
+
170
+ def __contains__(self, item: Any) -> bool:
171
+ return item in self.atoms or item in self.non_atoms
172
+
173
+ def __iter__(self):
174
+ return (token for token in chain(self.atoms, self.non_atoms))
175
+
176
+ def __len__(self) -> int:
177
+ return len(self.atoms) + len(self.non_atoms) + len(self.additional)
178
+
179
+ def get_alphabet(self) -> Set[str]:
180
+ alphabet = set()
181
+ for token in self.atoms:
182
+ if len(token) > 1:
183
+ alphabet.update(list(token))
184
+ alphabet.add(token[0].lower())
185
+ else:
186
+ alphabet.add(token)
187
+ alphabet.add(token.lower())
188
+ for token in chain(self.non_atoms, self.additional):
189
+ if len(token) > 1:
190
+ alphabet.update(list(token))
191
+ else:
192
+ alphabet.add(token)
193
+ return alphabet
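A minimal end-to-end sketch of this module (file paths are placeholders; vocab_size, min_frequency and model_max_length mirror the ChemBERTa-style hyperparameters quoted in the notebooks):

    alphabet = SMILESAlphabet()
    tokenizer = SMILESBPETokenizer(dropout=None)
    tokenizer.train("pubchem_smiles.txt", vocab_size=1_000, min_frequency=2,
                    initial_alphabet=list(alphabet.get_alphabet()))
    tokenizer.save("checkpoints/tokenizer.json")

    # Wrap for HuggingFace models: registers <pad>/<s>/</s>/<unk> and sets model_max_length.
    hf_tokenizer = SMILESBPETokenizer.get_hf_tokenizer("checkpoints/tokenizer.json",
                                                       model_max_length=512)
    print(hf_tokenizer("CC(=O)Oc1ccccc1C(=O)O")["input_ids"])  # aspirin SMILES, wrapped in <s> ... </s>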
iupac-gpt/nohup.out ADDED
The diff for this file is too large to render. See raw diff
 
iupac-gpt/notebooks/.ipynb_checkpoints/language-modeling-checkpoint.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
iupac-gpt/notebooks/iupac_head_view.html ADDED
The diff for this file is too large to render. See raw diff
 
iupac-gpt/notebooks/iupac_language-modeling.py ADDED
@@ -0,0 +1,236 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # # Generative Pre-Training from Molecules
5
+
6
+ import os
7
+ #os.environ["CUDA_VISIBLE_DEVICES"] = ['1',"2"]
8
+ from pprint import pprint
9
+ import sys
10
+ sys.path.append('/root/autodl-tmp/wjm/iupac-gpt')
11
+ from tqdm import tqdm
12
+ try:
13
+ import iupac_gpt as gpt
14
+ except ImportError:
15
+ import sys
16
+ sys.path.extend([".."]) # Parent directory stores `smiles_gpt` package.
17
+ import iupac_gpt as gpt
18
+ import torch
19
+
20
+ # For demonstration purposes, we use only 10K subset of PubChem data made available by
21
+ # [ChemBERTa](https://arxiv.org/abs/2010.09885) developers. The original model was pretrained
22
+ # on the first 5M compounds with the following hyperparameters:
23
+ # ```python
24
+ # hyperparams = {"batch_size": 128, "max_epochs": 2, "max_length": 512,
25
+ # "learning_rate": 5e-4, "weight_decay": 0.0,
26
+ # "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
27
+ # "scheduler_T_max": 150_000, "final_learning_rate": 5e-8,
28
+ # "vocab_size": 1_000, "min_frequency": 2, "top_p": 0.96,
29
+ # "n_layer": 4, "n_head": 8, "n_embd": 512}
30
+ # ```
31
+ # Tokenizer, model, optimizer, scheduler, and trainer hyperparameters.
32
+ hyperparams = {"batch_size": 64, "max_epochs": 10, "max_length": 1280,
33
+ "learning_rate": 5e-4, "weight_decay": 0.0,
34
+ "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
35
+ "scheduler_T_max": 1_000, "final_learning_rate": 5e-8,
36
+ "vocab_size": 1491, "min_frequency": 2, "top_p": 0.96,
37
+ "n_layer": 8, "n_head": 8, "n_embd": 256}
38
+
39
+ gpus = [0] # Specify either a list of GPU devices or an integer (0 for no GPU).
40
+ num_workers = 24 # Number of dataloader worker processes.
41
+ # ## Tokenization
42
+ #
43
+ # `smiles_gpt.SMILESBPETokenizer` first splits SMILES strings into characters, runs
44
+ # byte-pair encoding, and augments the resulting list with `"<s>"` (beginning-of-SMILES) and
45
+ # `"</s>"` (end-of-SMILES) special tokens. `smiles_gpt.SMILESAlphabet` stores 72 possible
46
+ # characters as an initial vocabulary.
47
+ device = 'gpu'
48
+ train_dataloader,iupac_tokenizer = gpt.get_data_loader(is_train=1,dataset_filename = './pubchem_iupac_smile_gpt.csv')
49
+ pbar = tqdm(train_dataloader) #train_dataloader.cuda()
50
+
51
+
52
+ '''
53
+ for inputs in pbar:
54
+ src_label = Variable(inputs["labels"].to(device))
55
+ inputs = prepare_input(inputs,device)
56
+ src = Variable(inputs["input_ids"].to(device))
57
+ #self.tokenizer._convert_token_to_id
58
+
59
+ print(src[:,:].shape,src_label)
60
+ '''
61
+ tokenizer = iupac_tokenizer
62
+ #start mark <unk> 2, end mark </s> 1, pad <pad> 0
63
+
64
+ iupac_string = "2-amino-9-[4-hydroxy-3-(hydroxymethyl)-2-methylidenecyclopentyl]-1H-purin-6-one"
65
+ iupac_encoded = tokenizer(iupac_string)
66
+ iupac_encoded['input_ids'] = [2]+iupac_encoded['input_ids']
67
+
68
+ iupac_merges = [tokenizer.decode(i) for i in iupac_encoded['input_ids']]
69
+ #iupac_encoded['attention_mask']
70
+
71
+ print(iupac_encoded['input_ids'])
72
+ print(iupac_merges)
73
+
74
+ print(tokenizer.unk_token_id,tokenizer.eos_token_id,tokenizer.unk_token,tokenizer.eos_token,tokenizer.vocab_size) #2 1 1491
75
+ # ## Data Module
76
+ batch = next(iter(pbar))
77
+
78
+
79
+ # ## GPT-2 Model
80
+ #
81
+ # Now we load HuggingFace
82
+ # [`GPT2LMHeadModel`](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel)
83
+ # with the configuration composed of previously
84
+ # defined model hyperparameters. The model processes mini-batch of input ids and labels, then
85
+ # returns predictions and cross-entropy loss between labels and predictions.
86
+
87
+ from transformers import GPT2Config, GPT2LMHeadModel
88
+
89
+ config = GPT2Config(vocab_size=tokenizer.vocab_size,
90
+ bos_token_id=tokenizer.unk_token_id,
91
+ eos_token_id=tokenizer.eos_token_id,
92
+ n_layer=hyperparams["n_layer"],
93
+ n_head=hyperparams["n_head"],
94
+ n_embd=hyperparams["n_embd"],
95
+ n_positions=hyperparams["max_length"],
96
+ n_ctx=hyperparams["max_length"])
97
+ model = GPT2LMHeadModel(config)
98
+
99
+ #model= torch.nn.DataParallel(model.cuda(),device_ids=gpus,output_device=gpus[0])
100
+
101
+ outputs = model(**batch)
102
+ print(outputs.keys())
103
+
104
+ #['loss', 'logits', 'past_key_values']
105
+ # ## Trainer
106
+ #
107
+ # GPT-2 is trained with autoregressive language modeling objective:
108
+ # $$
109
+ # P(\boldsymbol{s}) = P(s_1) \cdot P(s_2 | s_1) \cdots P(s_T | s_1, \ldots, s_{T-1}) =
110
+ # \prod_{t=1}^{T} P(s_t | s_{j < t}),
111
+ # $$
112
+ # where $\boldsymbol{s}$ is a tokenized (encoded) SMILES string, $s_t$ is a token from pretrained
113
+ # vocabulary $\mathcal{V}$.
114
+ #
115
+ # We use `pytorch_lightning.Trainer` to train GPT-2. Since `Trainer` requires lightning modules,
116
+ # we import our
117
+ # [`smiles_gpt.GPT2LitModel`](https://github.com/sanjaradylov/smiles-gpt/blob/master/smiles_gpt/language_modeling.py#L10)
118
+ # wrapper that implements training phases for
119
+ # `GPT2LMHeadModel`, configures an `Adam` optimizer with `CosineAnnealingLR` scheduler, and
120
+ # logs average perplexity every epoch.
121
+
122
+ # In[8]:
123
+
124
+
125
+ from pytorch_lightning import Trainer
126
+ from pytorch_lightning.callbacks.early_stopping import EarlyStopping
127
+
128
+ checkpoint = "./checkpoints/iupac"
129
+
130
+
131
+ '''
132
+ trainer = Trainer(
133
+ gpus=gpus,
134
+ max_epochs=hyperparams["max_epochs"],
135
+ callbacks=[EarlyStopping("ppl", 0.1, 3)], #[EarlyStopping("ppl", 0.2, 2)]
136
+ auto_lr_find=False, # Set to True to search for optimal learning rate.
137
+ auto_scale_batch_size=False, # Set to True to scale batch size
138
+ # accelerator="dp" # Uncomment for GPU training.
139
+ accelerator="gpu", #devices=4,
140
+ strategy="ddp"
141
+ )
142
+ lit_model = gpt.GPT2LitModel(
143
+ model,
144
+ batch_size=hyperparams["batch_size"],
145
+ learning_rate=hyperparams["learning_rate"],
146
+ final_learning_rate=hyperparams["final_learning_rate"],
147
+ weight_decay=hyperparams["weight_decay"],
148
+ adam_eps=hyperparams["adam_eps"],
149
+ adam_betas=hyperparams["adam_betas"],
150
+ scheduler_T_max=hyperparams["scheduler_T_max"],
151
+ save_model_every=1, checkpoint=checkpoint)
152
+ trainer.fit(lit_model, train_dataloader)
153
+
154
+
155
+ #model.module.save_pretrained('./pretrained')
156
+ model.save_pretrained('./pretrained')
157
+
158
+ '''
159
+
160
+
161
+ # ## Interpretability
162
+ #
163
+ # [BertViz](https://github.com/jessevig/bertviz) inspects attention heads of transformers
164
+ # capturing specific patterns in data. Each head can be representative of some syntactic
165
+ # or short-/long-term relationships between tokens.
166
+
167
+ # In[9]:
168
+
169
+
170
+ import torch
171
+ from bertviz import head_view
172
+
173
+ input_ids_list = iupac_encoded['input_ids']
174
+ model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)
175
+ attention = model(torch.LongTensor(input_ids_list))[-1]
176
+ tokens = [tokenizer.decode(i) for i in input_ids_list]
177
+ print(input_ids_list,attention,tokens)
178
+ # Don't worry if a snippet is not displayed---just rerun this cell.
179
+ head_view(attention, tokens)
180
+
181
+
182
+
183
+ from bertviz import model_view
184
+
185
+ # Don't worry if a snippet is not displayed---just rerun this cell.
186
+ model_view(attention, tokens)
187
+
188
+
189
+ # ## Sampling
190
+ #
191
+ # Finally, we generate novel SMILES strings with top-$p$ sampling$-$i.e., sampling from the
192
+ # smallest vocabulary subset $\mathcal{V}^{(p)} \subset \mathcal{V}$ s.t. it takes up the most
193
+ # probable tokens whose cumulative probability mass exceeds $p$, $0 < p < 1$. Model
194
+ # terminates the procedure upon encountering `"</s>"` or reaching maximum number
195
+ # `hyperparams["max_length"]`. Special tokens are eventually removed.
196
+
197
+
198
+
199
+ import tqdm
200
+
201
+ model.eval() # Set the base model to evaluation mode.
202
+
203
+ generated_smiles_list = []
204
+ n_generated = 50000
205
+
206
+ for _ in tqdm.tqdm(range(n_generated)):
207
+ # Generate from "<unk>" so that the next token is arbitrary.
208
+ smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])
209
+ # Get generated token IDs.
210
+ generated_ids = model.generate(smiles_start,
211
+ max_length=hyperparams["max_length"],
212
+ do_sample=True,top_p=hyperparams["top_p"],
213
+ repetition_penalty=1.2,
214
+ pad_token_id=tokenizer.eos_token_id)
215
+ # Decode the IDs into tokens and remove "<s>" and "</s>".
216
+ generated_smiles = tokenizer.decode(generated_ids[0],
217
+ skip_special_tokens=True)
218
+ generated_smiles_list.append(generated_smiles)
219
+
220
+ print(generated_smiles_list[:10])
221
+
222
+
223
+ import numpy as np
224
+ import pandas as pd
225
+
226
+ df2 = pd.DataFrame(generated_smiles_list, columns=['iupac'])
227
+
228
+ df2.to_csv("iupacGPT2-gen50K.csv",index=None,sep="|")
229
+
230
+
231
+
232
+
233
+
234
+
235
+
236
+
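
The Sampling comments in `iupac_language-modeling.py` describe top-p (nucleus) sampling only in prose, so the following self-contained sketch spells out the subset-selection rule on a made-up next-token distribution. It illustrates the rule itself, not the `model.generate(..., do_sample=True, top_p=...)` implementation the script actually calls; the probabilities and the `top_p` value here are arbitrary.

```python
# Toy illustration of the top-p (nucleus) selection rule described above.
import torch

# Hypothetical next-token distribution P(s_t | s_<t) over a 6-token vocabulary.
probs = torch.tensor([0.40, 0.25, 0.15, 0.10, 0.05, 0.05])
top_p = 0.90  # the scripts use hyperparams["top_p"] = 0.96

sorted_probs, sorted_idx = torch.sort(probs, descending=True)
cumulative = torch.cumsum(sorted_probs, dim=0)
# Keep a token if the probability mass accumulated *before* it is still below top_p;
# this yields the smallest prefix whose total mass reaches or exceeds top_p.
keep = (cumulative - sorted_probs) < top_p
nucleus_idx = sorted_idx[keep]
nucleus_probs = sorted_probs[keep] / sorted_probs[keep].sum()  # renormalize inside the nucleus

next_token = nucleus_idx[torch.multinomial(nucleus_probs, 1)]
print(nucleus_idx.tolist(), next_token.item())
```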
iupac-gpt/notebooks/iupac_language-modeling_retrain.py ADDED
@@ -0,0 +1,224 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # # Generative Pre-Training from Molecules
+
+ import os
+ # os.environ["CUDA_VISIBLE_DEVICES"] = ['1', "2"]
+ from pprint import pprint
+ import sys
+ sys.path.append('/root/autodl-tmp/wjm/iupac-gpt')
+ from tqdm import tqdm
+ try:
+     import iupac_gpt as gpt
+ except ImportError:
+     import sys
+     sys.path.extend([".."])  # Parent directory stores the `iupac_gpt` package.
+     import iupac_gpt as gpt
+ import torch
+
+ # For demonstration purposes, we use only a 10K subset of the PubChem data made available by
+ # the [ChemBERTa](https://arxiv.org/abs/2010.09885) developers. The original model was pretrained
+ # on the first 5M compounds with the following hyperparameters:
+ # ```python
+ # hyperparams = {"batch_size": 128, "max_epochs": 2, "max_length": 512,
+ #                "learning_rate": 5e-4, "weight_decay": 0.0,
+ #                "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
+ #                "scheduler_T_max": 150_000, "final_learning_rate": 5e-8,
+ #                "vocab_size": 1_000, "min_frequency": 2, "top_p": 0.96,
+ #                "n_layer": 4, "n_head": 8, "n_embd": 512}
+ # ```
+ # Tokenizer, model, optimizer, scheduler, and trainer hyperparameters.
+ hyperparams = {"batch_size": 128, "max_epochs": 10, "max_length": 1280,
+                "learning_rate": 5e-4, "weight_decay": 0.0,
+                "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
+                "scheduler_T_max": 1_000, "final_learning_rate": 5e-8,
+                "vocab_size": 1491, "min_frequency": 2, "top_p": 0.96,
+                "n_layer": 8, "n_head": 8, "n_embd": 256}
+
+ gpus = [0]        # Specify either a list of GPU devices or an integer (0 for no GPU).
+ num_workers = 16  # Number of dataloader worker processes.
+ # ## Tokenization
+ #
+ # `smiles_gpt.SMILESBPETokenizer` first splits SMILES strings into characters, runs
+ # byte-pair encoding, and augments the resulting list with `"<s>"` (beginning-of-SMILES) and
+ # `"</s>"` (end-of-SMILES) special tokens. `smiles_gpt.SMILESAlphabet` stores 72 possible
+ # characters as an initial vocabulary.
+ device = 'gpu'
+ train_dataloader, iupac_tokenizer = gpt.get_data_loader(is_train=1, dataset_filename='./pubchem_iupac_smile_gpt.csv')
+ pbar = tqdm(train_dataloader)  # train_dataloader.cuda()
+
+
+ '''
+ for inputs in pbar:
+     src_label = Variable(inputs["labels"].to(device))
+     inputs = prepare_input(inputs, device)
+     src = Variable(inputs["input_ids"].to(device))
+     # self.tokenizer._convert_token_to_id
+
+     print(src[:, :].shape, src_label)
+ '''
+ tokenizer = iupac_tokenizer
+ # start mark <unk> = 2, end mark </s> = 1, pad <pad> = 0
+
+ iupac_string = "2-amino-9-[4-hydroxy-3-(hydroxymethyl)-2-methylidenecyclopentyl]-1H-purin-6-one"
+ iupac_encoded = tokenizer(iupac_string)
+ iupac_encoded['input_ids'] = [2] + iupac_encoded['input_ids']
+
+ iupac_merges = [tokenizer.decode(i) for i in iupac_encoded['input_ids']]
+ # iupac_encoded['attention_mask']
+
+ print(iupac_encoded['input_ids'])
+ print(iupac_merges)
+
+ print(tokenizer.unk_token_id, tokenizer.eos_token_id, tokenizer.unk_token, tokenizer.eos_token, tokenizer.vocab_size)  # 2 1 1491
+ # ## Data Module
+ # batch = next(iter(pbar))
+
+
+ # ## GPT-2 Model
+ #
+ # Now we load the HuggingFace
+ # [`GPT2LMHeadModel`](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel)
+ # with a configuration composed of the previously
+ # defined model hyperparameters. The model processes a mini-batch of input IDs and labels, then
+ # returns predictions and the cross-entropy loss between labels and predictions.
+
+ from transformers import GPT2Config, GPT2LMHeadModel
+
+ config = GPT2Config(vocab_size=tokenizer.vocab_size,
+                     bos_token_id=tokenizer.unk_token_id,
+                     eos_token_id=tokenizer.eos_token_id,
+                     n_layer=hyperparams["n_layer"],
+                     n_head=hyperparams["n_head"],
+                     n_embd=hyperparams["n_embd"],
+                     n_positions=hyperparams["max_length"],
+                     n_ctx=hyperparams["max_length"])
+ # model = GPT2LMHeadModel(config)
+
+ # model = torch.nn.DataParallel(model.cuda(), device_ids=gpus, output_device=gpus[0])
+
+ # outputs = model(**batch)
+ # print(outputs.keys())
+
+ # ['loss', 'logits', 'past_key_values']
+ # ## Trainer
+ #
+ # GPT-2 is trained with the autoregressive language modeling objective:
+ # $$
+ # P(\boldsymbol{s}) = P(s_1) \cdot P(s_2 | s_1) \cdots P(s_T | s_1, \ldots, s_{T-1}) =
+ # \prod_{t=1}^{T} P(s_t | s_{j < t}),
+ # $$
+ # where $\boldsymbol{s}$ is a tokenized (encoded) string and $s_t$ is a token from the pretrained
+ # vocabulary $\mathcal{V}$.
+ #
+ # We use `pytorch_lightning.Trainer` to train GPT-2. Since `Trainer` requires lightning modules,
+ # we import our
+ # [`smiles_gpt.GPT2LitModel`](https://github.com/sanjaradylov/smiles-gpt/blob/master/smiles_gpt/language_modeling.py#L10)
+ # wrapper that implements training phases for
+ # `GPT2LMHeadModel`, configures an `Adam` optimizer with a `CosineAnnealingLR` scheduler, and
+ # logs average perplexity every epoch.
+ checkpoint = "../checkpoints/iupac"
+
+ model = GPT2LMHeadModel.from_pretrained('./pretrained', local_files_only=True)
+
+
+ from pytorch_lightning import Trainer
+ from pytorch_lightning.callbacks.early_stopping import EarlyStopping
+
+
+
+ trainer = Trainer(
+     gpus=gpus,
+     max_epochs=hyperparams["max_epochs"],
+     callbacks=[EarlyStopping("ppl", 0.1, 3)],  # [EarlyStopping("ppl", 0.2, 2)]
+     auto_lr_find=False,  # Set to True to search for the optimal learning rate.
+     auto_scale_batch_size=False,  # Set to True to scale the batch size.
+     # accelerator="dp"  # Uncomment for GPU training.
+     accelerator="gpu",  # devices=4,
+     strategy="ddp"
+ )
+ lit_model = gpt.GPT2LitModel(
+     model,
+     batch_size=hyperparams["batch_size"],
+     learning_rate=hyperparams["learning_rate"],
+     final_learning_rate=hyperparams["final_learning_rate"],
+     weight_decay=hyperparams["weight_decay"],
+     adam_eps=hyperparams["adam_eps"],
+     adam_betas=hyperparams["adam_betas"],
+     scheduler_T_max=hyperparams["scheduler_T_max"],
+     save_model_every=1, checkpoint=checkpoint)
+ trainer.fit(lit_model, train_dataloader)
+
+
+ # model.module.save_pretrained('./pretrained')
+ model.save_pretrained('./pretrained')
+
+ # ## Interpretability
+ #
+ # [BertViz](https://github.com/jessevig/bertviz) inspects attention heads of transformers,
+ # capturing specific patterns in the data. Each head can be representative of some syntactic
+ # or short-/long-term relationships between tokens.
+
+ # In[9]:
+
+
+ import torch
+ from bertviz import head_view
+
+ input_ids_list = iupac_encoded['input_ids']
+ model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)
+ attention = model(torch.LongTensor(input_ids_list))[-1]
+ tokens = [tokenizer.decode(i) for i in input_ids_list]
+ print(input_ids_list, attention, tokens)
+ # Don't worry if a snippet is not displayed---just rerun this cell.
+ head_view(attention, tokens)
+
+
+
+ from bertviz import model_view
+
+ # Don't worry if a snippet is not displayed---just rerun this cell.
+ model_view(attention, tokens)
+
+
+ # ## Sampling
+ #
+ # Finally, we generate novel IUPAC names with top-$p$ sampling, i.e., sampling from the
+ # smallest vocabulary subset $\mathcal{V}^{(p)} \subset \mathcal{V}$ s.t. it takes up the most
+ # probable tokens whose cumulative probability mass exceeds $p$, $0 < p < 1$. The model
+ # terminates the procedure upon encountering `"</s>"` or reaching the maximum length
+ # `hyperparams["max_length"]`. Special tokens are eventually removed.
+
+
+
+ import tqdm
+
+ model.eval()  # Set the base model to evaluation mode.
+
+ generated_smiles_list = []
+ n_generated = 50000
+
+ for _ in tqdm.tqdm(range(n_generated)):
+     # Generate from "<unk>" so that the next token is arbitrary.
+     smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])
+     # Get generated token IDs.
+     generated_ids = model.generate(smiles_start,
+                                    max_length=hyperparams["max_length"],
+                                    do_sample=True, top_p=hyperparams["top_p"],
+                                    repetition_penalty=1.2,
+                                    pad_token_id=tokenizer.eos_token_id)
+     # Decode the IDs into tokens and remove "<s>" and "</s>".
+     generated_smiles = tokenizer.decode(generated_ids[0],
+                                         skip_special_tokens=True)
+     generated_smiles_list.append(generated_smiles)
+
+ print(generated_smiles_list[:10])
+
+
+ import numpy as np
+ import pandas as pd
+
+ df2 = pd.DataFrame(generated_smiles_list, columns=['iupac'])
+
+ df2.to_csv("iupacGPT2-gen50K.csv", index=None, mode='a')
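
The Trainer comments in the retraining script state the autoregressive objective as a product of conditional probabilities and note that the `GPT2LitModel` wrapper logs average perplexity. The self-contained sketch below, using an arbitrary toy configuration rather than the repository's checkpoints, shows how the loss returned by `GPT2LMHeadModel` corresponds to that factorization (cross-entropy over shifted labels) and how perplexity follows as its exponential.

```python
# Generic illustration of the causal LM loss and perplexity; not code from this repo.
import torch
import torch.nn.functional as F
from transformers import GPT2Config, GPT2LMHeadModel

# Tiny, arbitrary configuration purely for demonstration.
config = GPT2Config(vocab_size=100, n_layer=2, n_head=2, n_embd=64, n_positions=32)
model = GPT2LMHeadModel(config).eval()

input_ids = torch.randint(0, 100, (1, 16))
with torch.no_grad():
    out = model(input_ids, labels=input_ids)  # the model shifts the labels internally

# Reproduce the same loss manually: predict token t from tokens < t.
shift_logits = out.logits[:, :-1, :]
shift_labels = input_ids[:, 1:]
manual_loss = F.cross_entropy(shift_logits.reshape(-1, config.vocab_size),
                              shift_labels.reshape(-1))

print(out.loss.item(), manual_loss.item())  # agree up to floating-point error
print(torch.exp(out.loss).item())           # perplexity = exp(mean negative log-likelihood)
```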
iupac-gpt/notebooks/iupac_language-modeling_train.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
iupac-gpt/notebooks/iupac_language-modeling_train.py ADDED
@@ -0,0 +1,231 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # # Generative Pre-Training from Molecules
+
+ import os
+ # os.environ["CUDA_VISIBLE_DEVICES"] = ['1', "2"]
+ from pprint import pprint
+ import sys
+ sys.path.append('/root/autodl-tmp/wjm/iupac-gpt')
+ from tqdm import tqdm
+ try:
+     import iupac_gpt as gpt
+ except ImportError:
+     import sys
+     sys.path.extend([".."])  # Parent directory stores the `iupac_gpt` package.
+     import iupac_gpt as gpt
+ import torch
+
+ # For demonstration purposes, we use only a 10K subset of the PubChem data made available by
+ # the [ChemBERTa](https://arxiv.org/abs/2010.09885) developers. The original model was pretrained
+ # on the first 5M compounds with the following hyperparameters:
+ # ```python
+ # hyperparams = {"batch_size": 128, "max_epochs": 2, "max_length": 512,
+ #                "learning_rate": 5e-4, "weight_decay": 0.0,
+ #                "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
+ #                "scheduler_T_max": 150_000, "final_learning_rate": 5e-8,
+ #                "vocab_size": 1_000, "min_frequency": 2, "top_p": 0.96,
+ #                "n_layer": 4, "n_head": 8, "n_embd": 512}
+ # ```
+ # Tokenizer, model, optimizer, scheduler, and trainer hyperparameters.
+ hyperparams = {"batch_size": 128, "max_epochs": 10, "max_length": 1280,
+                "learning_rate": 5e-4, "weight_decay": 0.0,
+                "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
+                "scheduler_T_max": 1_000, "final_learning_rate": 5e-8,
+                "vocab_size": 1491, "min_frequency": 2, "top_p": 0.96,
+                "n_layer": 8, "n_head": 8, "n_embd": 256}
+
+ gpus = [0, 1, 2]  # Specify either a list of GPU devices or an integer (0 for no GPU).
+ num_workers = 32  # Number of dataloader worker processes.
+ # ## Tokenization
+ #
+ # `smiles_gpt.SMILESBPETokenizer` first splits SMILES strings into characters, runs
+ # byte-pair encoding, and augments the resulting list with `"<s>"` (beginning-of-SMILES) and
+ # `"</s>"` (end-of-SMILES) special tokens. `smiles_gpt.SMILESAlphabet` stores 72 possible
+ # characters as an initial vocabulary.
+ device = 'gpu'
+ train_dataloader, iupac_tokenizer = gpt.get_data_loader(is_train=1, dataset_filename='./pubchem_iupac_smile_gpt_1bw.csv')
+ pbar = tqdm(train_dataloader)  # train_dataloader.cuda()
+
+
+ '''
+ for inputs in pbar:
+     src_label = Variable(inputs["labels"].to(device))
+     inputs = prepare_input(inputs, device)
+     src = Variable(inputs["input_ids"].to(device))
+     # self.tokenizer._convert_token_to_id
+
+     print(src[:, :].shape, src_label)
+ '''
+ tokenizer = iupac_tokenizer
+ # start mark <unk> = 2, end mark </s> = 1, pad <pad> = 0
+
+ iupac_string = "2-amino-9-[4-hydroxy-3-(hydroxymethyl)-2-methylidenecyclopentyl]-1H-purin-6-one"
+ iupac_encoded = tokenizer(iupac_string)
+ iupac_encoded['input_ids'] = [2] + iupac_encoded['input_ids']
+
+ iupac_merges = [tokenizer.decode(i) for i in iupac_encoded['input_ids']]
+ # iupac_encoded['attention_mask']
+
+ print(iupac_encoded['input_ids'])
+ print(iupac_merges)
+
+ print(tokenizer.unk_token_id, tokenizer.eos_token_id, tokenizer.unk_token, tokenizer.eos_token, tokenizer.vocab_size)  # 2 1 1491
+ # ## Data Module
+ batch = next(iter(pbar))
+
+
+ # ## GPT-2 Model
+ #
+ # Now we load the HuggingFace
+ # [`GPT2LMHeadModel`](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel)
+ # with a configuration composed of the previously
+ # defined model hyperparameters. The model processes a mini-batch of input IDs and labels, then
+ # returns predictions and the cross-entropy loss between labels and predictions.
+
+ from transformers import GPT2Config, GPT2LMHeadModel
+
+ config = GPT2Config(vocab_size=tokenizer.vocab_size,
+                     bos_token_id=tokenizer.unk_token_id,
+                     eos_token_id=tokenizer.eos_token_id,
+                     n_layer=hyperparams["n_layer"],
+                     n_head=hyperparams["n_head"],
+                     n_embd=hyperparams["n_embd"],
+                     n_positions=hyperparams["max_length"],
+                     n_ctx=hyperparams["max_length"])
+ model = GPT2LMHeadModel(config)
+
+ # model = torch.nn.DataParallel(model.cuda(), device_ids=gpus, output_device=gpus[0])
+
+ outputs = model(**batch)
+ print(outputs.keys())
+
+ # ['loss', 'logits', 'past_key_values']
+ # ## Trainer
+ #
+ # GPT-2 is trained with the autoregressive language modeling objective:
+ # $$
+ # P(\boldsymbol{s}) = P(s_1) \cdot P(s_2 | s_1) \cdots P(s_T | s_1, \ldots, s_{T-1}) =
+ # \prod_{t=1}^{T} P(s_t | s_{j < t}),
+ # $$
+ # where $\boldsymbol{s}$ is a tokenized (encoded) string and $s_t$ is a token from the pretrained
+ # vocabulary $\mathcal{V}$.
+ #
+ # We use `pytorch_lightning.Trainer` to train GPT-2. Since `Trainer` requires lightning modules,
+ # we import our
+ # [`smiles_gpt.GPT2LitModel`](https://github.com/sanjaradylov/smiles-gpt/blob/master/smiles_gpt/language_modeling.py#L10)
+ # wrapper that implements training phases for
+ # `GPT2LMHeadModel`, configures an `Adam` optimizer with a `CosineAnnealingLR` scheduler, and
+ # logs average perplexity every epoch.
+
+ # In[8]:
+
+
+ from pytorch_lightning import Trainer
+ from pytorch_lightning.callbacks.early_stopping import EarlyStopping
+
+ checkpoint = "../checkpoints/iupac"
+
+ trainer = Trainer(
+     gpus=gpus,
+     max_epochs=hyperparams["max_epochs"],
+     callbacks=[EarlyStopping("ppl", 0.1, 3)],  # [EarlyStopping("ppl", 0.2, 2)]
+     auto_lr_find=False,  # Set to True to search for the optimal learning rate.
+     auto_scale_batch_size=False,  # Set to True to scale the batch size.
+     # accelerator="dp"  # Uncomment for GPU training.
+     accelerator="gpu",  # devices=4,
+     strategy="ddp"
+ )
+ lit_model = gpt.GPT2LitModel(
+     model,
+     batch_size=hyperparams["batch_size"],
+     learning_rate=hyperparams["learning_rate"],
+     final_learning_rate=hyperparams["final_learning_rate"],
+     weight_decay=hyperparams["weight_decay"],
+     adam_eps=hyperparams["adam_eps"],
+     adam_betas=hyperparams["adam_betas"],
+     scheduler_T_max=hyperparams["scheduler_T_max"],
+     save_model_every=1, checkpoint=checkpoint)
+ trainer.fit(lit_model, train_dataloader)
+
+
+ # model.module.save_pretrained('./pretrained')
+ model.save_pretrained('./pretrained')
+
+ # ## Interpretability
+ #
+ # [BertViz](https://github.com/jessevig/bertviz) inspects attention heads of transformers,
+ # capturing specific patterns in the data. Each head can be representative of some syntactic
+ # or short-/long-term relationships between tokens.
+
+ # In[9]:
+
+
+ import torch
+ from bertviz import head_view
+
+ input_ids_list = iupac_encoded['input_ids']
+ model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)
+ attention = model(torch.LongTensor(input_ids_list))[-1]
+ tokens = [tokenizer.decode(i) for i in input_ids_list]
+ print(input_ids_list, attention, tokens)
+ # Don't worry if a snippet is not displayed---just rerun this cell.
+ head_view(attention, tokens)
+
+
+
+ from bertviz import model_view
+
+ # Don't worry if a snippet is not displayed---just rerun this cell.
+ model_view(attention, tokens)
+
+
+ # ## Sampling
+ #
+ # Finally, we generate novel IUPAC names with top-$p$ sampling, i.e., sampling from the
+ # smallest vocabulary subset $\mathcal{V}^{(p)} \subset \mathcal{V}$ s.t. it takes up the most
+ # probable tokens whose cumulative probability mass exceeds $p$, $0 < p < 1$. The model
+ # terminates the procedure upon encountering `"</s>"` or reaching the maximum length
+ # `hyperparams["max_length"]`. Special tokens are eventually removed.
+
+
+
+ import tqdm
+
+ model.eval()  # Set the base model to evaluation mode.
+
+ generated_smiles_list = []
+ n_generated = 30000
+
+ for _ in tqdm.tqdm(range(n_generated)):
+     # Generate from "<unk>" so that the next token is arbitrary.
+     smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])
+     # Get generated token IDs.
+     generated_ids = model.generate(smiles_start,
+                                    max_length=hyperparams["max_length"],
+                                    do_sample=True, top_p=hyperparams["top_p"],
+                                    repetition_penalty=1.2,
+                                    pad_token_id=tokenizer.eos_token_id)
+     # Decode the IDs into tokens and remove "<s>" and "</s>".
+     generated_smiles = tokenizer.decode(generated_ids[0],
+                                         skip_special_tokens=True)
+     generated_smiles_list.append(generated_smiles)
+
+ print(generated_smiles_list[:10])
+
+
+ import numpy as np
+ import pandas as pd
+
+ df2 = pd.DataFrame(generated_smiles_list, columns=['iupac'])
+
+ df2.to_csv("iupacGPT2-gen30K.csv", index=None, mode='a')
+
+
+
+
+
+
+
+
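
A note on the generated CSVs: `iupac_language-modeling.py` writes `iupacGPT2-gen50K.csv` with `sep="|"`, while the retrain and train scripts append with pandas' default comma separator, and because `mode='a'` re-writes the header on every run, appended files may contain duplicate header rows. A minimal reading sketch, assuming the files exist in the working directory and match the writer's separator:

```python
# Minimal sketch for loading the generated IUPAC names written by the scripts above.
import pandas as pd

names_pipe = pd.read_csv("iupacGPT2-gen50K.csv", sep="|")  # written with sep="|"
names_comma = pd.read_csv("iupacGPT2-gen30K.csv")          # written with the default separator

print(len(names_pipe), len(names_comma))
print(names_comma["iupac"].head().tolist())
```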