clint-holt committed
Commit 6e6e48e · 1 Parent(s): 648eed0

feat: Add AbLangPDB1 model files and code
.gitattributes CHANGED
@@ -1,35 +1 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
  *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ *.pyc
README.md CHANGED
@@ -1,3 +1,141 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ language: en
+ tags:
+ - pytorch
+ - feature-extraction
+ - biology
+ - protein-sequences
+ - antibodies
+ - ablang
+ - PDB
+ - contrastive-learning
+ library_name: transformers
+ ---
+
+ # AbLangPDB1: Contrastive-Learned Antibody Embeddings for General Epitope-Overlap Predictions
+
+ This repository contains the model, code, and tokenizers for **AbLangPDB1**.
+
+ ## Model Description
+ **AbLangPDB1** is a fine-tuned antibody language model that generates antibody embeddings for finding epitope/antigen-specificity matches to reference antibodies.
+
+ The model was developed using contrastive learning on paired heavy and light chain sequences, as described in our paper:
+
+ > [Contrastive Learning Enables Epitope Overlap Predictions for Targeted Antibody Discovery](https://doi.org/10.1101/2025.02.25.640114). Clinton M. Holt, Alexis K. Janke, Parastoo Amlashi, Parker J. Jamieson, Toma M. Marinov, Ivelin S. Georgiev. *bioRxiv*, 2025. https://doi.org/10.1101/2025.02.25.640114
+
+ ### Model Architecture
+
+ ```
+ Heavy Chain Seq -> [AbLang Heavy] -> 768-dim -> |
+                                                 | -> [Concatenate] -> [Mixer Network] -> 1536-dim Paired Embedding
+ Light Chain Seq -> [AbLang Light] -> 768-dim -> |
+ ```
+
+ The `AbLangPDB1` model uses the AbLangPaired architecture, a custom class that processes the heavy and light chains of an antibody independently with the pre-trained **AbLang** models before fusing their embeddings. The resulting embeddings from the two AbLang models are concatenated and passed through a custom **Mixer** network (6 fully connected feed-forward layers) to produce a final, unified 1536-dimensional embedding for the paired antibody.
+
+ The pretrained heavy model is [AbLang_heavy](https://huggingface.co/qilowoq/AbLang_heavy) and the pretrained light model is [AbLang_light](https://huggingface.co/qilowoq/AbLang_light). In brief, these use the RoBERTa architecture pretrained with the masked language modeling objective. Each model has 12 transformer blocks with 12 attention heads, an inner hidden size of 3072, and a hidden size of 768, and uses a learned positional embedding specific to antibodies with a maximum length of 160. The 768-dimensional embedding from each model is generated by mean pooling over all residue-level embeddings.
+
+ During training, these pretrained models were frozen and a QLoRA adapter was added.
+
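+ Below is a minimal sketch of this fusion step, for illustration only: mean-pool each chain's residue embeddings, concatenate, and pass the result through a 6-layer mixer. The pooling here is deliberately simplified (unlike `get_sequence_embeddings` in `ablangpaired_model.py`, it does not exclude the [CLS]/[SEP] positions), and all tensors are random stand-ins for AbLang outputs.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+     """Average residue-level embeddings over unmasked positions -> (batch, 768)."""
+     mask = attention_mask.unsqueeze(-1).float()      # (batch, seq_len, 1)
+     summed = (last_hidden_state * mask).sum(dim=1)   # (batch, 768)
+     counts = mask.sum(dim=1).clamp(min=1e-9)         # (batch, 1)
+     return summed / counts
+
+ # Random stand-ins for AbLang last_hidden_state of one antibody (max length 160, hidden size 768)
+ heavy_hidden, light_hidden = torch.randn(1, 160, 768), torch.randn(1, 160, 768)
+ heavy_mask, light_mask = torch.ones(1, 160), torch.ones(1, 160)
+
+ h_emb = mean_pool(heavy_hidden, heavy_mask)    # (1, 768)
+ l_emb = mean_pool(light_hidden, light_mask)    # (1, 768)
+ paired = torch.cat([h_emb, l_emb], dim=1)      # (1, 1536)
+
+ # Mixer: 6 fully connected layers (ReLU between them) over the concatenated embedding
+ mixer_layers = []
+ for _ in range(5):
+     mixer_layers += [nn.Linear(1536, 1536), nn.ReLU()]
+ mixer_layers.append(nn.Linear(1536, 1536))
+ mixer = nn.Sequential(*mixer_layers)
+
+ embedding = F.normalize(mixer(paired), p=2, dim=1)  # final 1536-dim paired embedding
+ print(embedding.shape)  # torch.Size([1, 1536])
+ ```
+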
+ ## Intended uses & limitations
+ The model is intended to generate epitope-information-rich embeddings of antibodies, though a prediction head could be added to make other predictions, such as neutralization capacity. Expect accuracy to be significantly better when comparing antibodies to those within the PDB.
+
+ 1. **Epitope Classification**: Antibodies with unknown epitopes can be embedded and compared against a reference database of antibodies with known epitopes. The reference antibody with the highest cosine similarity is predicted to have the epitope most similar to that of the query antibody (see the sketch after this list).
+ **Limitation**: Mouse BCRs are unlikely to perform well here, and BCRs which do not bind a Pfam domain used in training are likely to have reduced classification accuracy.
+
+ 2. **Antibody Search**: A reference antibody sequence can be embedded along with a large search database. Antibodies with high cosine similarities in the search database can be assumed to have similar epitope targets; representative candidates can then be chosen from each cluster of hits for downstream characterization.
+
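+ Both use cases reduce to a cosine-similarity lookup against a reference set. A minimal sketch with random stand-in tensors (in practice, `query_embedding` and `reference_embeddings` would come from the model as shown in the How to Use section below, and the labels would be your known epitopes):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ # Illustrative tensors: one query antibody and 1,000 reference antibodies with known epitopes
+ query_embedding = F.normalize(torch.randn(1, 1536), dim=1)
+ reference_embeddings = F.normalize(torch.randn(1000, 1536), dim=1)
+ reference_epitopes = [f"epitope_{i % 25}" for i in range(1000)]  # placeholder labels
+
+ # AbLangPDB1 embeddings are L2-normalized, so cosine similarity is just a dot product
+ similarities = query_embedding @ reference_embeddings.T  # (1, 1000)
+ top = torch.topk(similarities.squeeze(0), k=5)
+
+ for score, idx in zip(top.values.tolist(), top.indices.tolist()):
+     print(f"{reference_epitopes[idx]}\tcosine similarity = {score:.3f}")
+ ```
+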
+ ## Training data
+ For AbLang-PDB, we curated 1,909 non-redundant human antibodies from the [Structural Antibody Database (SAbDab)](https://doi.org/10.1093/nar/gkt1043) with a February 19, 2024 cutoff date. These were assigned antigen domains using the [pfam_scan software](https://github.com/aziele/pfam_scan), such that two antibodies containing at least one shared Pfam domain were considered to be in the same category. For partitioning antibodies between training (80%), validation (10%), and test (10%) splits, antibodies sharing both heavy and light V-genes and >70% CDRH3 amino acid identity were assigned to the same clone group, and clone groups were distributed such that no group was present in both the training and test sets. Additionally, pairs with >92.5% sequence identity in either chain were excluded to maintain diversity.
+
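+ As an illustration of the clone-grouping rule only (not the exact code used for the paper), the check could look like the following; the dictionary keys are hypothetical, the sequences are made up, and `SequenceMatcher` is merely a stand-in for a proper pairwise identity calculation:
+
+ ```python
+ from difflib import SequenceMatcher
+
+ def same_clone_group(ab1: dict, ab2: dict, cdrh3_cutoff: float = 0.70) -> bool:
+     """Clone-grouping rule: shared heavy and light V-genes plus CDRH3 identity above the cutoff."""
+     if ab1["HV_GENE"] != ab2["HV_GENE"] or ab1["LV_GENE"] != ab2["LV_GENE"]:
+         return False
+     cdrh3_identity = SequenceMatcher(None, ab1["CDRH3_AA"], ab2["CDRH3_AA"]).ratio()
+     return cdrh3_identity > cdrh3_cutoff
+
+ ab_a = {"HV_GENE": "IGHV3-23", "LV_GENE": "IGKV1-39", "CDRH3_AA": "ARGRWYRRALDY"}
+ ab_b = {"HV_GENE": "IGHV3-23", "LV_GENE": "IGKV1-39", "CDRH3_AA": "ARGRWYRKALDY"}
+ print(same_clone_group(ab_a, ab_b))  # True: same V-genes and >70% CDRH3 identity
+ ```
+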
+ ## Training Procedure
+ The AbLang-PDB model was trained with a mean squared error loss between the cosine similarity of a pair of antibody embeddings and the ground-truth amount of epitope overlap for that pair. The epitope-overlap labels place antibodies that bind the same antigen protein family in the same general vicinity of embedding space, while pushing antibodies that bind overlapping epitopes progressively closer together.
+
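+ A minimal sketch of this objective (tensor names and label values are illustrative; during training, gradients would flow into the QLoRA adapters and the Mixer rather than into random tensors):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ # Illustrative batch: 8 antibody pairs with 1536-dim embeddings and a ground-truth
+ # epitope-overlap label in [0, 1] for each pair (higher = more overlapping epitopes)
+ emb_a = F.normalize(torch.randn(8, 1536, requires_grad=True), dim=1)
+ emb_b = F.normalize(torch.randn(8, 1536, requires_grad=True), dim=1)
+ target_overlap = torch.rand(8)
+
+ cosine = F.cosine_similarity(emb_a, emb_b, dim=1)  # (8,)
+ loss = F.mse_loss(cosine, target_overlap)          # MSE between cosine similarity and overlap label
+ loss.backward()
+ print(float(loss))
+ ```
+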
+ ## How to Use
+ To use this model, first set up the repository and its dependencies, then generate embeddings as shown below.
+
+ ### 1. Setup
+ First, clone this repository and install the required libraries.
+
+ ```bash
+ # Clone the repository to get the model script, weights, and tokenizers
+ git clone https://huggingface.co/clint-holt/AbLangPDB1
+ cd AbLangPDB1
+
+ # Install dependencies
+ pip install torch pandas "transformers>=4.30.0" safetensors
+ ```
+
+ ### 2. Generate Embeddings
+ Then run the following code:
+
+ ```python
+ import torch
+ import pandas as pd
+ from transformers import AutoTokenizer
+
+ # Import the custom model class and config from the cloned repository
+ from ablangpaired_model import AbLangPaired, AbLangPairedConfig
+
+ # 1. Load Model and Tokenizers
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model_dir = "."  # Assumes you are running this script from the cloned directory
+
+ # Configure the model to load the local weights
+ # The AbLangPairedConfig specifies the base AbLang models and the local checkpoint file
+ model_config = AbLangPairedConfig(checkpoint_filename=f"{model_dir}/ablangpdb_model.safetensors")
+ model = AbLangPaired(model_config, device).to(device)
+ model.eval()
+
+ # Tokenizers are stored in subdirectories
+ heavy_tokenizer = AutoTokenizer.from_pretrained(f"{model_dir}/heavy_tokenizer")
+ light_tokenizer = AutoTokenizer.from_pretrained(f"{model_dir}/light_tokenizer")
+
+ # 2. Prepare Antibody Sequences
+ data = {
+     'HC_AA': ["EVQLVESGGGLVQPGGSLRLSCAASGFNLYYYSIHWVRQAPGKGLEWVASISPYSSSTSYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARGRWYRRALDYWGQGTLVTVSS"],
+     'LC_AA': ["DIQMTQSPSSLSASVGDRVTITCRASQSVSSAVAWYQQKPGKAPKLLIYSASSLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQYPYYSSLITFGQGTKVEIK"]
+ }
+ df = pd.DataFrame(data)
+
+ # Pre-process sequences by adding spaces between amino acids
+ df["PREPARED_HC_SEQ"] = df["HC_AA"].apply(lambda x: " ".join(list(x)))
+ df["PREPARED_LC_SEQ"] = df["LC_AA"].apply(lambda x: " ".join(list(x)))
+
+ # 3. Tokenize and Embed
+ h_tokens = heavy_tokenizer(df["PREPARED_HC_SEQ"].tolist(), padding='longest', return_tensors="pt")
+ l_tokens = light_tokenizer(df["PREPARED_LC_SEQ"].tolist(), padding='longest', return_tensors="pt")
+
+ with torch.no_grad():
+     embeddings = model(
+         h_input_ids=h_tokens['input_ids'].to(device),
+         h_attention_mask=h_tokens['attention_mask'].to(device),
+         l_input_ids=l_tokens['input_ids'].to(device),
+         l_attention_mask=l_tokens['attention_mask'].to(device)
+     )
+
+ print("Embedding generation complete! ✅")
+ print("Shape of embeddings tensor:", embeddings.shape)
+ # Expected output shape: (1, 1536)
+ ```
+
+ ## Citation
+ If you use this model or code in your research, please cite our paper:
+
+ ```bibtex
+ @article{Holt2025.02.25.640114,
+     author = {Holt, Clinton M. and Janke, Alexis K. and Amlashi, Parastoo and Jamieson, Parker J. and Marinov, Toma M. and Georgiev, Ivelin S.},
+     title = {Contrastive Learning Enables Epitope Overlap Predictions for Targeted Antibody Discovery},
+     elocation-id = {2025.02.25.640114},
+     year = {2025},
+     doi = {10.1101/2025.02.25.640114},
+     publisher = {Cold Spring Harbor Laboratory},
+     URL = {https://www.biorxiv.org/content/early/2025/04/01/2025.02.25.640114},
+     eprint = {https://www.biorxiv.org/content/early/2025/04/01/2025.02.25.640114.full.pdf},
+     journal = {bioRxiv}
+ }
+ ```
ablangpaired_model.py ADDED
@@ -0,0 +1,115 @@
+ # ablangpaired_model.py
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import PreTrainedModel, PretrainedConfig, AutoModel, AutoConfig
+ from safetensors.torch import load_file
+
+ import typing as T
+
+
+ class Mixer(nn.Module):
+     def __init__(self, in_d: int = 1536):
+         super(Mixer, self).__init__()
+         self.layers = nn.Sequential(
+             nn.Linear(in_d, in_d),  # First layer
+             nn.ReLU(),              # First activation function
+             nn.Linear(in_d, in_d),  # Second layer
+             nn.ReLU(),              # Second activation function
+             nn.Linear(in_d, in_d),  # Third layer
+             nn.ReLU(),              # Third activation function
+             nn.Linear(in_d, in_d),  # Fourth layer
+             nn.ReLU(),              # Fourth activation function
+             nn.Linear(in_d, in_d),  # Fifth layer
+             nn.ReLU(),              # Fifth activation function
+             nn.Linear(in_d, in_d)   # Output layer
+             # No activation here; apply softmax or sigmoid externally if needed, depending on your loss function
+         )
+
+     def forward(self, x):
+         return self.layers(x)
+
+
+ def get_sequence_embeddings(mask, model_output):
+     mask = mask.float()
+     # dict of sep tokens: k = antibody index, v = index of the final position where mask == 1
+     d = {k: v for k, v in torch.nonzero(mask).cpu().numpy()}
+     # make sep token invisible
+     for i in d:
+         mask[i, d[i]] = 0
+     mask[:, 0] = 0.0  # make cls token invisible
+     mask = mask.unsqueeze(-1).expand(model_output.last_hidden_state.size())
+     sum_embeddings = torch.sum(model_output.last_hidden_state * mask, 1)
+     sum_mask = torch.clamp(mask.sum(1), min=1e-9)
+     return sum_embeddings / sum_mask  # sum_mask is the number of unmasked positions
+
+
+ class AbLangPairedConfig(PretrainedConfig):
+     model_type = "ablang_paired"
+
+     def __init__(
+         self,
+         checkpoint_filename: str,
+         heavy_model_id='qilowoq/AbLang_heavy',
+         heavy_revision='ecac793b0493f76590ce26d48f7aac4912de8717',
+         light_model_id='qilowoq/AbLang_light',
+         light_revision='ce0637166f5e6e271e906d29a8415d9fdc30e377',
+         mixer_hidden_dim: int = 1536,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.checkpoint_filename = checkpoint_filename
+         self.heavy_model_id = heavy_model_id
+         self.heavy_revision = heavy_revision
+         self.light_model_id = light_model_id
+         self.light_revision = light_revision
+         self.mixer_hidden_dim = mixer_hidden_dim
+
+
+ class AbLangPaired(PreTrainedModel):
+
+     def __init__(self, personal_config: AbLangPairedConfig, device: T.Union[str, torch.device] = "cpu"):
+         # During training I used the AbLang_heavy config as AbLangPaired's config
+         # This may be why it is very hard to integrate this into the Hugging Face AutoModel system
+         self.config = AutoConfig.from_pretrained(personal_config.heavy_model_id, revision=personal_config.heavy_revision)
+         super().__init__(self.config)
+
+         self.roberta_heavy = AutoModel.from_pretrained(
+             personal_config.heavy_model_id,
+             revision=personal_config.heavy_revision,  # Specific commit hash
+             trust_remote_code=True
+         )
+
+         self.roberta_light = AutoModel.from_pretrained(
+             personal_config.light_model_id,
+             revision=personal_config.light_revision,  # Specific commit hash
+             trust_remote_code=True
+         )
+
+         self.mixer = Mixer(in_d=1536)
+
+         # Load either a torch checkpoint or a safetensors file
+         if personal_config.checkpoint_filename.endswith('.safetensors'):
+             state_dict = load_file(personal_config.checkpoint_filename)
+         else:
+             state_dict = torch.load(personal_config.checkpoint_filename, map_location=device)
+
+         load_result = self.load_state_dict(state_dict, strict=False)
+         self.to(device)
+         self.eval()
+
+     def forward(self, h_input_ids, h_attention_mask, l_input_ids, l_attention_mask, **kwargs):
+         # Run chains through separate streams
+         outputs_h = self.roberta_heavy(input_ids=h_input_ids.to(torch.int64), attention_mask=h_attention_mask)
+         outputs_l = self.roberta_light(input_ids=l_input_ids.to(torch.int64), attention_mask=l_attention_mask)
+
+         # Mean pool
+         pooled_output_h = get_sequence_embeddings(h_attention_mask, outputs_h)
+         pooled_output_l = get_sequence_embeddings(l_attention_mask, outputs_l)
+
+         # Concatenate and then do 6 fully connected layers to pick up on cross-chain features
+         pooled_output = torch.cat([pooled_output_h, pooled_output_l], dim=1)
+         pooled_output = self.mixer(pooled_output)
+         embedding = F.normalize(pooled_output, p=2, dim=1)
+         return embedding
ablangpdb_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:049b60d04449bd39da23b59205e48b9b59425aa7ae14a91414b8dfe7483856e4
+ size 738301704
config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "architectures": [
+     "AbLangPaired"
+   ],
+   "heavy_model_id": "qilowoq/AbLang_heavy",
+   "heavy_revision": "ecac793b0493f76590ce26d48f7aac4912de8717",
+   "light_model_id": "qilowoq/AbLang_light",
+   "light_revision": "ce0637166f5e6e271e906d29a8415d9fdc30e377",
+   "mixer_hidden_dim": 1536,
+   "model_type": "ablang_paired",
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.2",
+   "auto_map": {
+     "AutoConfig": "ablangpaired_model.AbLangPairedConfig",
+     "AutoModel": "ablangpaired_model.AbLangPaired"
+   }
+ }
heavy_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
heavy_tokenizer/tokenizer.json ADDED
@@ -0,0 +1,175 @@
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[CLS]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 21,
17
+ "content": "[PAD]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 22,
26
+ "content": "[SEP]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 23,
35
+ "content": "[MASK]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 24,
44
+ "content": "[UNK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "BertNormalizer",
54
+ "clean_text": true,
55
+ "handle_chinese_chars": true,
56
+ "strip_accents": null,
57
+ "lowercase": false
58
+ },
59
+ "pre_tokenizer": {
60
+ "type": "BertPreTokenizer"
61
+ },
62
+ "post_processor": {
63
+ "type": "TemplateProcessing",
64
+ "single": [
65
+ {
66
+ "SpecialToken": {
67
+ "id": "[CLS]",
68
+ "type_id": 0
69
+ }
70
+ },
71
+ {
72
+ "Sequence": {
73
+ "id": "A",
74
+ "type_id": 0
75
+ }
76
+ },
77
+ {
78
+ "SpecialToken": {
79
+ "id": "[SEP]",
80
+ "type_id": 0
81
+ }
82
+ }
83
+ ],
84
+ "pair": [
85
+ {
86
+ "SpecialToken": {
87
+ "id": "[CLS]",
88
+ "type_id": 0
89
+ }
90
+ },
91
+ {
92
+ "Sequence": {
93
+ "id": "A",
94
+ "type_id": 0
95
+ }
96
+ },
97
+ {
98
+ "SpecialToken": {
99
+ "id": "[SEP]",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "Sequence": {
105
+ "id": "B",
106
+ "type_id": 1
107
+ }
108
+ },
109
+ {
110
+ "SpecialToken": {
111
+ "id": "[SEP]",
112
+ "type_id": 1
113
+ }
114
+ }
115
+ ],
116
+ "special_tokens": {
117
+ "[CLS]": {
118
+ "id": "[CLS]",
119
+ "ids": [
120
+ 0
121
+ ],
122
+ "tokens": [
123
+ "[CLS]"
124
+ ]
125
+ },
126
+ "[SEP]": {
127
+ "id": "[SEP]",
128
+ "ids": [
129
+ 22
130
+ ],
131
+ "tokens": [
132
+ "[SEP]"
133
+ ]
134
+ }
135
+ }
136
+ },
137
+ "decoder": {
138
+ "type": "WordPiece",
139
+ "prefix": "##",
140
+ "cleanup": true
141
+ },
142
+ "model": {
143
+ "type": "WordPiece",
144
+ "unk_token": "[UNK]",
145
+ "continuing_subword_prefix": "##",
146
+ "max_input_chars_per_word": 100,
147
+ "vocab": {
148
+ "[CLS]": 0,
149
+ "M": 1,
150
+ "R": 2,
151
+ "H": 3,
152
+ "K": 4,
153
+ "D": 5,
154
+ "E": 6,
155
+ "S": 7,
156
+ "T": 8,
157
+ "N": 9,
158
+ "Q": 10,
159
+ "C": 11,
160
+ "G": 12,
161
+ "P": 13,
162
+ "A": 14,
163
+ "V": 15,
164
+ "I": 16,
165
+ "F": 17,
166
+ "Y": 18,
167
+ "W": 19,
168
+ "L": 20,
169
+ "[PAD]": 21,
170
+ "[SEP]": 22,
171
+ "[MASK]": 23,
172
+ "[UNK]": 24
173
+ }
174
+ }
175
+ }
heavy_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[CLS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "21": {
12
+ "content": "[PAD]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "22": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "23": {
28
+ "content": "[MASK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "24": {
36
+ "content": "[UNK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 160,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "strip_accents": null,
54
+ "tokenize_chinese_chars": true,
55
+ "tokenizer_class": "BertTokenizer",
56
+ "unk_token": "[UNK]"
57
+ }
heavy_tokenizer/vocab.txt ADDED
@@ -0,0 +1,25 @@
1
+ [CLS]
2
+ M
3
+ R
4
+ H
5
+ K
6
+ D
7
+ E
8
+ S
9
+ T
10
+ N
11
+ Q
12
+ C
13
+ G
14
+ P
15
+ A
16
+ V
17
+ I
18
+ F
19
+ Y
20
+ W
21
+ L
22
+ [PAD]
23
+ [SEP]
24
+ [MASK]
25
+ [UNK]
light_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
light_tokenizer/tokenizer.json ADDED
@@ -0,0 +1,175 @@
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[CLS]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 21,
17
+ "content": "[PAD]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 22,
26
+ "content": "[SEP]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 23,
35
+ "content": "[MASK]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 24,
44
+ "content": "[UNK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "BertNormalizer",
54
+ "clean_text": true,
55
+ "handle_chinese_chars": true,
56
+ "strip_accents": null,
57
+ "lowercase": false
58
+ },
59
+ "pre_tokenizer": {
60
+ "type": "BertPreTokenizer"
61
+ },
62
+ "post_processor": {
63
+ "type": "TemplateProcessing",
64
+ "single": [
65
+ {
66
+ "SpecialToken": {
67
+ "id": "[CLS]",
68
+ "type_id": 0
69
+ }
70
+ },
71
+ {
72
+ "Sequence": {
73
+ "id": "A",
74
+ "type_id": 0
75
+ }
76
+ },
77
+ {
78
+ "SpecialToken": {
79
+ "id": "[SEP]",
80
+ "type_id": 0
81
+ }
82
+ }
83
+ ],
84
+ "pair": [
85
+ {
86
+ "SpecialToken": {
87
+ "id": "[CLS]",
88
+ "type_id": 0
89
+ }
90
+ },
91
+ {
92
+ "Sequence": {
93
+ "id": "A",
94
+ "type_id": 0
95
+ }
96
+ },
97
+ {
98
+ "SpecialToken": {
99
+ "id": "[SEP]",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "Sequence": {
105
+ "id": "B",
106
+ "type_id": 1
107
+ }
108
+ },
109
+ {
110
+ "SpecialToken": {
111
+ "id": "[SEP]",
112
+ "type_id": 1
113
+ }
114
+ }
115
+ ],
116
+ "special_tokens": {
117
+ "[CLS]": {
118
+ "id": "[CLS]",
119
+ "ids": [
120
+ 0
121
+ ],
122
+ "tokens": [
123
+ "[CLS]"
124
+ ]
125
+ },
126
+ "[SEP]": {
127
+ "id": "[SEP]",
128
+ "ids": [
129
+ 22
130
+ ],
131
+ "tokens": [
132
+ "[SEP]"
133
+ ]
134
+ }
135
+ }
136
+ },
137
+ "decoder": {
138
+ "type": "WordPiece",
139
+ "prefix": "##",
140
+ "cleanup": true
141
+ },
142
+ "model": {
143
+ "type": "WordPiece",
144
+ "unk_token": "[UNK]",
145
+ "continuing_subword_prefix": "##",
146
+ "max_input_chars_per_word": 100,
147
+ "vocab": {
148
+ "[CLS]": 0,
149
+ "M": 1,
150
+ "R": 2,
151
+ "H": 3,
152
+ "K": 4,
153
+ "D": 5,
154
+ "E": 6,
155
+ "S": 7,
156
+ "T": 8,
157
+ "N": 9,
158
+ "Q": 10,
159
+ "C": 11,
160
+ "G": 12,
161
+ "P": 13,
162
+ "A": 14,
163
+ "V": 15,
164
+ "I": 16,
165
+ "F": 17,
166
+ "Y": 18,
167
+ "W": 19,
168
+ "L": 20,
169
+ "[PAD]": 21,
170
+ "[SEP]": 22,
171
+ "[MASK]": 23,
172
+ "[UNK]": 24
173
+ }
174
+ }
175
+ }
light_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[CLS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "21": {
12
+ "content": "[PAD]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "22": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "23": {
28
+ "content": "[MASK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "24": {
36
+ "content": "[UNK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 160,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "strip_accents": null,
54
+ "tokenize_chinese_chars": true,
55
+ "tokenizer_class": "BertTokenizer",
56
+ "unk_token": "[UNK]"
57
+ }
light_tokenizer/vocab.txt ADDED
@@ -0,0 +1,25 @@
1
+ [CLS]
2
+ M
3
+ R
4
+ H
5
+ K
6
+ D
7
+ E
8
+ S
9
+ T
10
+ N
11
+ Q
12
+ C
13
+ G
14
+ P
15
+ A
16
+ V
17
+ I
18
+ F
19
+ Y
20
+ W
21
+ L
22
+ [PAD]
23
+ [SEP]
24
+ [MASK]
25
+ [UNK]