Upload tokenizer

Browse files

Files changed (4) hide show

special_tokens_map.json +16 -0
tokenization_vits.py +195 -0
tokenizer_config.json +37 -0
vocab.json +1240 -0

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "pad_token": {
+    "content": "_",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenization_vits.py ADDED Viewed

	@@ -0,0 +1,195 @@

+"""Tokenization class for VITS."""
+import json
+import os
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.utils import logging
+# Module-level logger obtained from the `transformers` logging utilities and named after this module.
+logger = logging.get_logger(__name__)
+# File name of the JSON vocabulary; exposed as `vocab_files_names` on the tokenizer class and reused by
+# `save_vocabulary` to build the output path.
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
+def has_non_roman_characters(input_string):
+    # Find any character outside the ASCII range
+    non_roman_pattern = re.compile(r"[^\x00-\x7F]")
+    # Search the input string for non-Roman characters
+    match = non_roman_pattern.search(input_string)
+    has_non_roman = match is not None
+    return has_non_roman
+class IndicVitsTokenizer(PreTrainedTokenizer):
+    """
+    Construct a VITS tokenizer.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        language (`str`, *optional*):
+            Language identifier.
+        add_blank (`bool`, *optional*, defaults to `True`):
+            Whether to insert token id 0 in between the other tokens.
+        normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the input text by removing all casing and punctuation.
+        phonemize (`bool`, *optional*, defaults to `True`):
+            Whether to convert the input text into phonemes.
+        is_uroman (`bool`, *optional*, defaults to `False`):
+            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        vocab_file,
+        pad_token="<pad>",
+        unk_token="<unk>",
+        language=None,
+        add_blank=True,
+        normalize=True,
+        phonemize=True,
+        is_uroman=False,
+        **kwargs,
+    ) -> None:
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.language = language
+        self.add_blank = add_blank
+        self.normalize = normalize
+        self.phonemize = phonemize
+        self.is_uroman = is_uroman
+        super().__init__(
+            pad_token=pad_token,
+            unk_token=unk_token,
+            language=language,
+            add_blank=add_blank,
+            normalize=normalize,
+            phonemize=phonemize,
+            is_uroman=is_uroman,
+            **kwargs,
+        )
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def normalize_text(self, input_string):
+        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
+        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
+        filtered_text = ""
+        i = 0
+        while i < len(input_string):
+            found_match = False
+            for word in all_vocabulary:
+                if input_string[i : i + len(word)] == word:
+                    filtered_text += word
+                    i += len(word)
+                    found_match = True
+                    break
+            if not found_match:
+                filtered_text += input_string[i].lower()
+                i += 1
+        return filtered_text
+    def prepare_for_tokenization(
+        self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
+    ) -> Tuple[str, Dict[str, Any]]:
+        """
+        Performs any necessary transformations before tokenization.
+        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
+        `kwargs` at the end of the encoding process to be sure all the arguments have been used.
+        Args:
+            text (`str`):
+                The text to prepare.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
+                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
+                which it will tokenize.
+            normalize (`bool`, *optional*, defaults to `None`):
+                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
+                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
+                text consists only of lower-case characters.
+            kwargs (`Dict[str, Any]`, *optional*):
+                Keyword arguments to use for the tokenization.
+        Returns:
+            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
+        """
+        normalize = normalize if normalize is not None else self.normalize
+        if normalize:
+            # normalise for casing
+            text = self.normalize_text(text)
+        # strip any chars outside of the vocab (punctuation)
+        text = "".join(list(filter(lambda char: char in self.encoder, text))).strip()
+        return text, kwargs
+    def _tokenize(self, text: str) -> List[str]:
+        """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
+        tokens = list(text)
+        return tokens
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        if self.add_blank and len(tokens) > 1:
+            tokens = tokens[1::2]
+        return "".join(tokens)
+    def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
+        ids = []
+        for token in tokens:
+            ids.append(self._convert_token_to_id(token))
+        if self.add_blank:
+            interspersed = [0] * (len(ids) * 2 + 1)
+            interspersed[1::2] = ids
+            ids = interspersed
+        return ids
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Union[Tuple[str], None]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+        return (vocab_file,)

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "add_blank": true,
+  "added_tokens_decoder": {
+    "95": {
+      "content": "_",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1218": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_vits.IndicVitsTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": false,
+  "extra_special_tokens": {},
+  "is_uroman": false,
+  "language": null,
+  "model_max_length": 1000000000000000019884624838656,
+  "normalize": true,
+  "pad_token": "_",
+  "phonemize": false,
+  "tokenizer_class": "IndicVitsTokenizer",
+  "unk_token": "<unk>"
+}

vocab.json ADDED Viewed

	@@ -0,0 +1,1240 @@

+{
+  " ": 16,
+  "!": 69,
+  "\"": 70,
+  "#": 71,
+  "$": 72,
+  "%": 73,
+  "&": 74,
+  "'": 1247,
+  "(": 76,
+  ")": 77,
+  "*": 78,
+  "+": 79,
+  ",": 80,
+  "-": 81,
+  ".": 82,
+  "/": 83,
+  "0": 1250,
+  "1": 1251,
+  "2": 1252,
+  "3": 1253,
+  "4": 1254,
+  "5": 1255,
+  "6": 1256,
+  "7": 1257,
+  "8": 1258,
+  "9": 1259,
+  ":": 84,
+  ";": 85,
+  "<": 86,
+  "=": 87,
+  ">": 88,
+  "?": 89,
+  "@": 90,
+  "A": 17,
+  "B": 18,
+  "C": 19,
+  "D": 20,
+  "E": 21,
+  "F": 22,
+  "G": 23,
+  "H": 24,
+  "I": 25,
+  "J": 26,
+  "K": 27,
+  "L": 28,
+  "M": 29,
+  "N": 30,
+  "O": 31,
+  "P": 32,
+  "Q": 33,
+  "R": 34,
+  "S": 35,
+  "T": 36,
+  "U": 37,
+  "V": 38,
+  "W": 39,
+  "X": 40,
+  "Y": 41,
+  "Z": 42,
+  "[": 91,
+  "\\": 92,
+  "]": 93,
+  "^": 94,
+  "_": 95,
+  "`": 96,
+  "a": 43,
+  "b": 44,
+  "c": 45,
+  "d": 46,
+  "e": 47,
+  "f": 48,
+  "g": 49,
+  "h": 50,
+  "i": 51,
+  "j": 52,
+  "k": 53,
+  "l": 54,
+  "m": 55,
+  "n": 56,
+  "o": 57,
+  "p": 58,
+  "q": 59,
+  "r": 60,
+  "s": 61,
+  "t": 62,
+  "t̺": 97,
+  "u": 63,
+  "v": 64,
+  "w": 65,
+  "x": 66,
+  "y": 67,
+  "z": 68,
+  "{": 98,
+  "|": 99,
+  "}": 100,
+  "~": 101,
+  "¡": 102,
+  "¢": 103,
+  "£": 104,
+  "¤": 105,
+  "¥": 106,
+  "¦": 107,
+  "§": 108,
+  "¨": 109,
+  "©": 110,
+  "ª": 111,
+  "«": 112,
+  "¬": 113,
+  "®": 114,
+  "¯": 115,
+  "°": 116,
+  "±": 117,
+  "²": 118,
+  "³": 119,
+  "´": 120,
+  "µ": 121,
+  "¶": 122,
+  "·": 123,
+  "¸": 124,
+  "¹": 125,
+  "º": 126,
+  "»": 127,
+  "¼": 128,
+  "½": 129,
+  "¾": 130,
+  "¿": 131,
+  "Â": 132,
+  "Ã": 133,
+  "×": 134,
+  "â": 135,
+  "æ": 1143,
+  "ç": 1149,
+  "ð": 1152,
+  "ø": 1187,
+  "ā": 136,
+  "ħ": 1169,
+  "ŋ": 1183,
+  "œ": 1191,
+  "ǀ": 1223,
+  "ǁ": 1224,
+  "ǂ": 1225,
+  "ǃ": 1226,
+  "ɐ": 1141,
+  "ɑ": 1140,
+  "ɒ": 1142,
+  "ɓ": 1144,
+  "ɔ": 1147,
+  "ɕ": 1148,
+  "ɖ": 1151,
+  "ɗ": 1150,
+  "ɘ": 1155,
+  "ə": 1154,
+  "ɚ": 1156,
+  "ɛ": 1157,
+  "ɜ": 1158,
+  "ɝ": 1159,
+  "ɞ": 1160,
+  "ɟ": 1161,
+  "ɠ": 1164,
+  "ɡ": 1163,
+  "ɢ": 1165,
+  "ɣ": 1210,
+  "ɤ": 1211,
+  "ɥ": 1170,
+  "ɦ": 1167,
+  "ɧ": 1168,
+  "ɨ": 1172,
+  "ɪ": 1173,
+  "ɫ": 1177,
+  "ɬ": 1176,
+  "ɭ": 1175,
+  "ɮ": 1178,
+  "ɯ": 1181,
+  "ɰ": 1182,
+  "ɱ": 1180,
+  "ɲ": 1185,
+  "ɳ": 1184,
+  "ɴ": 1186,
+  "ɵ": 1188,
+  "ɶ": 1192,
+  "ɸ": 1189,
+  "ɹ": 1194,
+  "ɺ": 1195,
+  "ɻ": 1197,
+  "ɽ": 1200,
+  "ɾ": 1196,
+  "ʀ": 1198,
+  "ʁ": 1199,
+  "ʂ": 1201,
+  "ʃ": 1202,
+  "ʄ": 1162,
+  "ʈ": 1203,
+  "ʉ": 1205,
+  "ʊ": 1206,
+  "ʋ": 1207,
+  "ʌ": 1209,
+  "ʍ": 1212,
+  "ʎ": 1214,
+  "ʏ": 1215,
+  "ʐ": 1217,
+  "ʑ": 1216,
+  "ʒ": 1218,
+  "ʔ": 1219,
+  "ʕ": 1221,
+  "ʘ": 1193,
+  "ʙ": 1145,
+  "ʛ": 1166,
+  "ʜ": 1171,
+  "ʝ": 1174,
+  "ʟ": 1179,
+  "ʡ": 1220,
+  "ʢ": 1222,
+  "ʤ": 1153,
+  "ʧ": 1204,
+  "ʰ": 1233,
+  "ʱ": 1234,
+  "ʲ": 1235,
+  "ʴ": 1232,
+  "ʷ": 1236,
+  "ʼ": 1231,
+  "ˆ": 138,
+  "ˈ": 1227,
+  "ˌ": 1228,
+  "ː": 1229,
+  "ˑ": 1230,
+  "˞": 1239,
+  "ˠ": 1237,
+  "ˤ": 1238,
+  "̃": 1249,
+  "̩": 1246,
+  "β": 1146,
+  "θ": 1190,
+  "χ": 1213,
+  "،": 139,
+  "؛": 140,
+  "؟": 141,
+  "٪": 142,
+  "٫": 143,
+  "٬": 144,
+  "٭": 145,
+  "۔": 146,
+  "۔۔۔": 147,
+  "ऀ": 148,
+  "ँ": 149,
+  "ं": 150,
+  "ः": 151,
+  "ऄ": 152,
+  "अ": 153,
+  "आ": 154,
+  "इ": 155,
+  "ई": 156,
+  "उ": 157,
+  "ऊ": 158,
+  "ऋ": 159,
+  "ऌ": 160,
+  "ऍ": 161,
+  "ऎ": 162,
+  "ए": 163,
+  "ऐ": 164,
+  "ऑ": 165,
+  "ऒ": 166,
+  "ओ": 167,
+  "औ": 168,
+  "क": 169,
+  "ख": 170,
+  "ग": 171,
+  "घ": 172,
+  "ङ": 173,
+  "च": 174,
+  "छ": 175,
+  "ज": 176,
+  "झ": 177,
+  "ञ": 178,
+  "ट": 179,
+  "ठ": 180,
+  "ड": 181,
+  "ढ": 182,
+  "ण": 183,
+  "त": 184,
+  "थ": 185,
+  "द": 186,
+  "ध": 187,
+  "न": 188,
+  "ऩ": 189,
+  "प": 190,
+  "फ": 191,
+  "ब": 192,
+  "भ": 193,
+  "म": 194,
+  "य": 195,
+  "र": 196,
+  "ऱ": 197,
+  "ल": 198,
+  "ळ": 199,
+  "ऴ": 200,
+  "व": 201,
+  "श": 202,
+  "ष": 203,
+  "स": 204,
+  "ह": 205,
+  "ऺ": 206,
+  "ऻ": 207,
+  "़": 208,
+  "ऽ": 209,
+  "ा": 210,
+  "ि": 211,
+  "ी": 212,
+  "ु": 213,
+  "ू": 214,
+  "ृ": 215,
+  "ॄ": 216,
+  "ॅ": 217,
+  "ॆ": 218,
+  "े": 219,
+  "ै": 220,
+  "ॉ": 221,
+  "ॊ": 222,
+  "ो": 223,
+  "ौ": 224,
+  "्": 225,
+  "ॎ": 226,
+  "ॏ": 227,
+  "ॐ": 228,
+  "॑": 229,
+  "॒": 230,
+  "॓": 231,
+  "॔": 232,
+  "ॕ": 233,
+  "ॖ": 234,
+  "ॗ": 235,
+  "क़": 236,
+  "ख़": 237,
+  "ग़": 238,
+  "ज़": 239,
+  "ड़": 240,
+  "ढ़": 241,
+  "फ़": 242,
+  "य़": 243,
+  "ॠ": 244,
+  "ॡ": 245,
+  "ॢ": 246,
+  "ॣ": 247,
+  "।": 248,
+  "॥": 249,
+  "०": 250,
+  "१": 251,
+  "२": 252,
+  "३": 253,
+  "४": 254,
+  "५": 255,
+  "६": 256,
+  "७": 257,
+  "८": 258,
+  "९": 259,
+  "॰": 260,
+  "ॱ": 261,
+  "ॲ": 262,
+  "ॳ": 263,
+  "ॴ": 264,
+  "ॵ": 265,
+  "ॶ": 266,
+  "ॷ": 267,
+  "ॸ": 268,
+  "ॹ": 269,
+  "ॺ": 270,
+  "ॻ": 271,
+  "ॼ": 272,
+  "ॽ": 273,
+  "ॾ": 274,
+  "ॿ": 275,
+  "ঀ": 276,
+  "ঁ": 277,
+  "ং": 278,
+  "ঃ": 279,
+  "অ": 280,
+  "আ": 281,
+  "ই": 282,
+  "ঈ": 283,
+  "উ": 284,
+  "ঊ": 285,
+  "ঋ": 286,
+  "ঌ": 287,
+  "এ": 288,
+  "ঐ": 289,
+  "ও": 290,
+  "ঔ": 291,
+  "ক": 292,
+  "খ": 293,
+  "গ": 294,
+  "ঘ": 295,
+  "ঙ": 296,
+  "চ": 297,
+  "ছ": 298,
+  "জ": 299,
+  "ঝ": 300,
+  "ঞ": 301,
+  "ট": 302,
+  "ঠ": 303,
+  "ড": 304,
+  "ঢ": 305,
+  "ণ": 306,
+  "ত": 307,
+  "থ": 308,
+  "দ": 309,
+  "ধ": 310,
+  "ন": 311,
+  "প": 312,
+  "ফ": 313,
+  "ব": 314,
+  "ভ": 315,
+  "ম": 316,
+  "য": 317,
+  "র": 318,
+  "ল": 319,
+  "শ": 320,
+  "ষ": 321,
+  "স": 322,
+  "হ": 323,
+  "়": 324,
+  "ঽ": 325,
+  "া": 326,
+  "ি": 327,
+  "ী": 328,
+  "ু": 329,
+  "ূ": 330,
+  "ৃ": 331,
+  "ৄ": 332,
+  "ে": 333,
+  "ৈ": 334,
+  "ো": 335,
+  "ৌ": 336,
+  "্": 337,
+  "ৎ": 338,
+  "ৗ": 339,
+  "ড়": 340,
+  "ঢ়": 341,
+  "য়": 342,
+  "ৠ": 343,
+  "ৡ": 344,
+  "ৢ": 345,
+  "ৣ": 346,
+  "০": 347,
+  "১": 348,
+  "২": 349,
+  "৩": 350,
+  "৪": 351,
+  "৫": 352,
+  "৬": 353,
+  "৭": 354,
+  "৮": 355,
+  "৯": 356,
+  "ৰ": 357,
+  "ৱ": 358,
+  "৲": 359,
+  "৳": 360,
+  "৴": 361,
+  "৵": 362,
+  "৶": 363,
+  "৷": 364,
+  "৸": 365,
+  "৹": 366,
+  "৺": 367,
+  "৻": 368,
+  "ৼ": 369,
+  "৽": 370,
+  "৾": 371,
+  "ਁ": 372,
+  "ਂ": 373,
+  "ਃ": 374,
+  "ਅ": 375,
+  "ਆ": 376,
+  "ਇ": 377,
+  "ਈ": 378,
+  "ਉ": 379,
+  "ਊ": 380,
+  "ਏ": 381,
+  "ਐ": 382,
+  "ਓ": 383,
+  "ਔ": 384,
+  "ਕ": 385,
+  "ਖ": 386,
+  "ਗ": 387,
+  "ਘ": 388,
+  "ਙ": 389,
+  "ਚ": 390,
+  "ਛ": 391,
+  "ਜ": 392,
+  "ਝ": 393,
+  "ਞ": 394,
+  "ਟ": 395,
+  "ਠ": 396,
+  "ਡ": 397,
+  "ਢ": 398,
+  "ਣ": 399,
+  "ਤ": 400,
+  "ਥ": 401,
+  "ਦ": 402,
+  "ਧ": 403,
+  "ਨ": 404,
+  "ਪ": 405,
+  "ਫ": 406,
+  "ਬ": 407,
+  "ਭ": 408,
+  "ਮ": 409,
+  "ਯ": 410,
+  "ਰ": 411,
+  "ਲ": 412,
+  "ਲ਼": 413,
+  "ਵ": 414,
+  "ਸ਼": 415,
+  "ਸ": 416,
+  "ਹ": 417,
+  "਼": 418,
+  "ਾ": 419,
+  "ਿ": 420,
+  "ੀ": 421,
+  "ੁ": 422,
+  "ੂ": 423,
+  "ੇ": 424,
+  "ੈ": 425,
+  "ੋ": 426,
+  "ੌ": 427,
+  "੍": 428,
+  "ੑ": 429,
+  "ਖ਼": 430,
+  "ਗ਼": 431,
+  "ਜ਼": 432,
+  "ੜ": 433,
+  "ਫ਼": 434,
+  "੦": 435,
+  "੧": 436,
+  "੨": 437,
+  "੩": 438,
+  "੪": 439,
+  "੫": 440,
+  "੬": 441,
+  "੭": 442,
+  "੮": 443,
+  "੯": 444,
+  "ੰ": 445,
+  "ੱ": 446,
+  "ੲ": 447,
+  "ੳ": 448,
+  "ੴ": 449,
+  "ੵ": 450,
+  "੶": 451,
+  "ઁ": 452,
+  "ં": 453,
+  "ઃ": 454,
+  "અ": 455,
+  "આ": 456,
+  "ઇ": 457,
+  "ઈ": 458,
+  "ઉ": 459,
+  "ઊ": 460,
+  "ઋ": 461,
+  "ઌ": 462,
+  "ઍ": 463,
+  "એ": 464,
+  "ઐ": 465,
+  "ઑ": 466,
+  "ઓ": 467,
+  "ઔ": 468,
+  "ક": 469,
+  "ખ": 470,
+  "ગ": 471,
+  "ઘ": 472,
+  "ઙ": 473,
+  "ચ": 474,
+  "છ": 475,
+  "જ": 476,
+  "ઝ": 477,
+  "ઞ": 478,
+  "ટ": 479,
+  "ઠ": 480,
+  "ડ": 481,
+  "ઢ": 482,
+  "ણ": 483,
+  "ત": 484,
+  "થ": 485,
+  "દ": 486,
+  "ધ": 487,
+  "ન": 488,
+  "પ": 489,
+  "ફ": 490,
+  "બ": 491,
+  "ભ": 492,
+  "મ": 493,
+  "ય": 494,
+  "ર": 495,
+  "લ": 496,
+  "ળ": 497,
+  "વ": 498,
+  "શ": 499,
+  "ષ": 500,
+  "સ": 501,
+  "હ": 502,
+  "઼": 503,
+  "ઽ": 504,
+  "ા": 505,
+  "િ": 506,
+  "ી": 507,
+  "ુ": 508,
+  "ૂ": 509,
+  "ૃ": 510,
+  "ૄ": 511,
+  "ૅ": 512,
+  "ે": 513,
+  "ૈ": 514,
+  "ૉ": 515,
+  "ો": 516,
+  "ૌ": 517,
+  "્": 518,
+  "ૐ": 519,
+  "ૠ": 520,
+  "ૡ": 521,
+  "ૢ": 522,
+  "ૣ": 523,
+  "૦": 524,
+  "૧": 525,
+  "૨": 526,
+  "૩": 527,
+  "૪": 528,
+  "૫": 529,
+  "૬": 530,
+  "૭": 531,
+  "૮": 532,
+  "૯": 533,
+  "૰": 534,
+  "૱": 535,
+  "ૹ": 536,
+  "ૺ": 537,
+  "ૻ": 538,
+  "ૼ": 539,
+  "૽": 540,
+  "૾": 541,
+  "૿": 542,
+  "ଁ": 543,
+  "ଂ": 544,
+  "ଃ": 545,
+  "ଅ": 546,
+  "ଆ": 547,
+  "ଇ": 548,
+  "ଈ": 549,
+  "ଉ": 550,
+  "ଊ": 551,
+  "ଋ": 552,
+  "ଌ": 553,
+  "ଏ": 554,
+  "ଐ": 555,
+  "ଓ": 556,
+  "ଔ": 557,
+  "କ": 558,
+  "ଖ": 559,
+  "ଗ": 560,
+  "ଘ": 561,
+  "ଙ": 562,
+  "ଚ": 563,
+  "ଛ": 564,
+  "ଜ": 565,
+  "ଝ": 566,
+  "ଞ": 567,
+  "ଟ": 568,
+  "ଠ": 569,
+  "ଡ": 570,
+  "ଢ": 571,
+  "ଣ": 572,
+  "ତ": 573,
+  "ଥ": 574,
+  "ଦ": 575,
+  "ଧ": 576,
+  "ନ": 577,
+  "ପ": 578,
+  "ଫ": 579,
+  "ବ": 580,
+  "ଭ": 581,
+  "ମ": 582,
+  "ଯ": 583,
+  "ର": 584,
+  "ଲ": 585,
+  "ଳ": 586,
+  "ଵ": 587,
+  "ଶ": 588,
+  "ଷ": 589,
+  "ସ": 590,
+  "ହ": 591,
+  "଼": 592,
+  "ଽ": 593,
+  "ା": 594,
+  "ି": 595,
+  "ୀ": 596,
+  "ୁ": 597,
+  "ୂ": 598,
+  "ୃ": 599,
+  "ୄ": 600,
+  "େ": 601,
+  "ୈ": 602,
+  "ୋ": 603,
+  "ୌ": 604,
+  "୍": 605,
+  "୕": 606,
+  "ୖ": 607,
+  "ୗ": 608,
+  "ଡ଼": 609,
+  "ଢ଼": 610,
+  "ୟ": 611,
+  "ୠ": 612,
+  "ୡ": 613,
+  "ୢ": 614,
+  "ୣ": 615,
+  "୦": 616,
+  "୧": 617,
+  "୨": 618,
+  "୩": 619,
+  "୪": 620,
+  "୫": 621,
+  "୬": 622,
+  "୭": 623,
+  "୮": 624,
+  "୯": 625,
+  "୰": 626,
+  "ୱ": 627,
+  "୲": 628,
+  "୳": 629,
+  "୴": 630,
+  "୵": 631,
+  "୶": 632,
+  "୷": 633,
+  "ஂ": 634,
+  "ஃ": 635,
+  "அ": 636,
+  "ஆ": 637,
+  "இ": 638,
+  "ஈ": 639,
+  "உ": 640,
+  "ஊ": 641,
+  "எ": 642,
+  "ஏ": 643,
+  "ஐ": 644,
+  "ஒ": 645,
+  "ஓ": 646,
+  "ஔ": 647,
+  "க": 648,
+  "ங": 649,
+  "ச": 650,
+  "ஜ": 651,
+  "ஞ": 652,
+  "ட": 653,
+  "ண": 654,
+  "த": 655,
+  "ந": 656,
+  "ன": 657,
+  "ப": 658,
+  "ம": 659,
+  "ய": 660,
+  "ர": 661,
+  "ற": 662,
+  "ல": 663,
+  "ள": 664,
+  "ழ": 665,
+  "வ": 666,
+  "ஶ": 667,
+  "ஷ": 668,
+  "ஸ": 669,
+  "ஹ": 670,
+  "ா": 671,
+  "ி": 672,
+  "ீ": 673,
+  "ு": 674,
+  "ூ": 675,
+  "ெ": 676,
+  "ே": 677,
+  "ை": 678,
+  "ொ": 679,
+  "ோ": 680,
+  "ௌ": 681,
+  "்": 682,
+  "ௐ": 683,
+  "ௗ": 684,
+  "௦": 685,
+  "௧": 686,
+  "௨": 687,
+  "௩": 688,
+  "௪": 689,
+  "௫": 690,
+  "௬": 691,
+  "௭": 692,
+  "௮": 693,
+  "௯": 694,
+  "௰": 695,
+  "௱": 696,
+  "௲": 697,
+  "௳": 698,
+  "௴": 699,
+  "௵": 700,
+  "௶": 701,
+  "௷": 702,
+  "௸": 703,
+  "௹": 704,
+  "௺": 705,
+  "ఀ": 706,
+  "ఁ": 707,
+  "ం": 708,
+  "ః": 709,
+  "ఄ": 710,
+  "అ": 711,
+  "ఆ": 712,
+  "ఇ": 713,
+  "ఈ": 714,
+  "ఉ": 715,
+  "ఊ": 716,
+  "ఋ": 717,
+  "ఌ": 718,
+  "ఎ": 719,
+  "ఏ": 720,
+  "ఐ": 721,
+  "ఒ": 722,
+  "ఓ": 723,
+  "ఔ": 724,
+  "క": 725,
+  "ఖ": 726,
+  "గ": 727,
+  "ఘ": 728,
+  "ఙ": 729,
+  "చ": 730,
+  "ఛ": 731,
+  "జ": 732,
+  "ఝ": 733,
+  "ఞ": 734,
+  "ట": 735,
+  "ఠ": 736,
+  "డ": 737,
+  "ఢ": 738,
+  "ణ": 739,
+  "త": 740,
+  "థ": 741,
+  "ద": 742,
+  "ధ": 743,
+  "న": 744,
+  "ప": 745,
+  "ఫ": 746,
+  "బ": 747,
+  "భ": 748,
+  "మ": 749,
+  "య": 750,
+  "ర": 751,
+  "ఱ": 752,
+  "ల": 753,
+  "ళ": 754,
+  "ఴ": 755,
+  "వ": 756,
+  "శ": 757,
+  "ష": 758,
+  "స": 759,
+  "హ": 760,
+  "ఽ": 761,
+  "ా": 762,
+  "ి": 763,
+  "ీ": 764,
+  "ు": 765,
+  "ూ": 766,
+  "ృ": 767,
+  "ౄ": 768,
+  "ె": 769,
+  "ే": 770,
+  "ై": 771,
+  "ొ": 772,
+  "ో": 773,
+  "ౌ": 774,
+  "్": 775,
+  "ౕ": 776,
+  "ౖ": 777,
+  "ౘ": 778,
+  "ౙ": 779,
+  "ౚ": 780,
+  "ౠ": 781,
+  "ౡ": 782,
+  "ౢ": 783,
+  "ౣ": 784,
+  "౦": 785,
+  "౧": 786,
+  "౨": 787,
+  "౩": 788,
+  "౪": 789,
+  "౫": 790,
+  "౬": 791,
+  "౭": 792,
+  "౮": 793,
+  "౯": 794,
+  "౷": 795,
+  "౸": 796,
+  "౹": 797,
+  "౺": 798,
+  "౻": 799,
+  "౼": 800,
+  "౽": 801,
+  "౾": 802,
+  "౿": 803,
+  "ಀ": 804,
+  "ಁ": 805,
+  "ಂ": 806,
+  "ಃ": 807,
+  "಄": 808,
+  "ಅ": 809,
+  "ಆ": 810,
+  "ಇ": 811,
+  "ಈ": 812,
+  "ಉ": 813,
+  "ಊ": 814,
+  "ಋ": 815,
+  "ಌ": 816,
+  "ಎ": 817,
+  "ಏ": 818,
+  "ಐ": 819,
+  "ಒ": 820,
+  "ಓ": 821,
+  "ಔ": 822,
+  "ಕ": 823,
+  "ಖ": 824,
+  "ಗ": 825,
+  "ಘ": 826,
+  "ಙ": 827,
+  "ಚ": 828,
+  "ಛ": 829,
+  "ಜ": 830,
+  "ಝ": 831,
+  "ಞ": 832,
+  "ಟ": 833,
+  "ಠ": 834,
+  "ಡ": 835,
+  "ಢ": 836,
+  "ಣ": 837,
+  "ತ": 838,
+  "ಥ": 839,
+  "ದ": 840,
+  "ಧ": 841,
+  "ನ": 842,
+  "ಪ": 843,
+  "ಫ": 844,
+  "ಬ": 845,
+  "ಭ": 846,
+  "ಮ": 847,
+  "ಯ": 848,
+  "ರ": 849,
+  "ಱ": 850,
+  "ಲ": 851,
+  "ಳ": 852,
+  "ವ": 853,
+  "ಶ": 854,
+  "ಷ": 855,
+  "ಸ": 856,
+  "ಹ": 857,
+  "಼": 858,
+  "ಽ": 859,
+  "ಾ": 860,
+  "ಿ": 861,
+  "ೀ": 862,
+  "ು": 863,
+  "ೂ": 864,
+  "ೃ": 865,
+  "ೄ": 866,
+  "ೆ": 867,
+  "ೇ": 868,
+  "ೈ": 869,
+  "ೊ": 870,
+  "ೋ": 871,
+  "ೌ": 872,
+  "್": 873,
+  "ೕ": 874,
+  "ೖ": 875,
+  "ೞ": 876,
+  "ೠ": 877,
+  "ೡ": 878,
+  "ೢ": 879,
+  "ೣ": 880,
+  "೦": 881,
+  "೧": 882,
+  "೨": 883,
+  "೩": 884,
+  "೪": 885,
+  "೫": 886,
+  "೬": 887,
+  "೭": 888,
+  "೮": 889,
+  "೯": 890,
+  "ೱ": 891,
+  "ೲ": 892,
+  "ഀ": 893,
+  "ഁ": 894,
+  "ം": 895,
+  "ഃ": 896,
+  "ഄ": 897,
+  "അ": 898,
+  "ആ": 899,
+  "ഇ": 900,
+  "ഈ": 901,
+  "ഉ": 902,
+  "ഊ": 903,
+  "ഋ": 904,
+  "ഌ": 905,
+  "എ": 906,
+  "ഏ": 907,
+  "ഐ": 908,
+  "ഒ": 909,
+  "ഓ": 910,
+  "ഔ": 911,
+  "ക": 912,
+  "ഖ": 913,
+  "ഗ": 914,
+  "ഘ": 915,
+  "ങ": 916,
+  "ച": 917,
+  "ഛ": 918,
+  "ജ": 919,
+  "ഝ": 920,
+  "ഞ": 921,
+  "ട": 922,
+  "ഠ": 923,
+  "ഡ": 924,
+  "ഢ": 925,
+  "ണ": 926,
+  "ത": 927,
+  "ഥ": 928,
+  "ദ": 929,
+  "ധ": 930,
+  "ന": 931,
+  "ഩ": 932,
+  "പ": 933,
+  "ഫ": 934,
+  "ബ": 935,
+  "ഭ": 936,
+  "മ": 937,
+  "യ": 938,
+  "ര": 939,
+  "റ": 940,
+  "ല": 941,
+  "ള": 942,
+  "ഴ": 943,
+  "വ": 944,
+  "ശ": 945,
+  "ഷ": 946,
+  "സ": 947,
+  "ഹ": 948,
+  "ഺ": 949,
+  "഻": 950,
+  "഼": 951,
+  "ഽ": 952,
+  "ാ": 953,
+  "ി": 954,
+  "ീ": 955,
+  "ു": 956,
+  "ൂ": 957,
+  "ൃ": 958,
+  "ൄ": 959,
+  "െ": 960,
+  "േ": 961,
+  "ൈ": 962,
+  "ൊ": 963,
+  "ോ": 964,
+  "ൌ": 965,
+  "്": 966,
+  "ൎ": 967,
+  "൏": 968,
+  "ൔ": 969,
+  "ൕ": 970,
+  "ൖ": 971,
+  "ൗ": 972,
+  "൘": 973,
+  "൙": 974,
+  "൚": 975,
+  "൛": 976,
+  "൜": 977,
+  "൝": 978,
+  "൞": 979,
+  "ൟ": 980,
+  "ൠ": 981,
+  "ൡ": 982,
+  "ൢ": 983,
+  "ൣ": 984,
+  "൦": 985,
+  "൧": 986,
+  "൨": 987,
+  "൩": 988,
+  "൪": 989,
+  "൫": 990,
+  "൬": 991,
+  "൭": 992,
+  "൮": 993,
+  "൯": 994,
+  "൰": 995,
+  "൱": 996,
+  "൲": 997,
+  "൳": 998,
+  "൴": 999,
+  "൵": 1000,
+  "൶": 1001,
+  "൷": 1002,
+  "൸": 1003,
+  "൹": 1004,
+  "ൺ": 1005,
+  "ൻ": 1006,
+  "ർ": 1007,
+  "ൽ": 1008,
+  "ൾ": 1009,
+  "ൿ": 1010,
+  "ᰀ": 1011,
+  "ᰁ": 1012,
+  "ᰂ": 1013,
+  "ᰃ": 1014,
+  "ᰄ": 1015,
+  "ᰅ": 1016,
+  "ᰆ": 1017,
+  "ᰇ": 1018,
+  "ᰈ": 1019,
+  "ᰉ": 1020,
+  "ᰊ": 1021,
+  "ᰋ": 1022,
+  "ᰌ": 1023,
+  "ᰍ": 1024,
+  "ᰎ": 1025,
+  "ᰏ": 1026,
+  "ᰐ": 1027,
+  "ᰑ": 1028,
+  "ᰒ": 1029,
+  "ᰓ": 1030,
+  "ᰔ": 1031,
+  "ᰕ": 1032,
+  "ᰖ": 1033,
+  "ᰗ": 1034,
+  "ᰘ": 1035,
+  "ᰙ": 1036,
+  "ᰚ": 1037,
+  "ᰛ": 1038,
+  "ᰜ": 1039,
+  "ᰝ": 1040,
+  "ᰞ": 1041,
+  "ᰟ": 1042,
+  "ᰠ": 1043,
+  "ᰡ": 1044,
+  "ᰢ": 1045,
+  "ᰣ": 1046,
+  "ᰤ": 1047,
+  "ᰥ": 1048,
+  "ᰦ": 1049,
+  "ᰧ": 1050,
+  "ᰨ": 1051,
+  "ᰩ": 1052,
+  "ᰪ": 1053,
+  "ᰫ": 1054,
+  "ᰬ": 1055,
+  "ᰭ": 1056,
+  "ᰮ": 1057,
+  "ᰯ": 1058,
+  "ᰰ": 1059,
+  "ᰱ": 1060,
+  "ᰲ": 1061,
+  "ᰳ": 1062,
+  "ᰴ": 1063,
+  "ᰵ": 1064,
+  "ᰶ": 1065,
+  "᰷": 1066,
+  "᰻": 1067,
+  "᰼": 1068,
+  "᰽": 1069,
+  "᰾": 1070,
+  "᰿": 1071,
+  "᱀": 1072,
+  "᱁": 1073,
+  "᱂": 1074,
+  "᱃": 1075,
+  "᱄": 1076,
+  "᱅": 1077,
+  "᱆": 1078,
+  "᱇": 1079,
+  "᱈": 1080,
+  "᱉": 1081,
+  "ᱍ": 1082,
+  "ᱎ": 1083,
+  "ᱏ": 1084,
+  "ᵻ": 1248,
+  "–": 1085,
+  "—": 1086,
+  "‘": 1087,
+  "“": 1088,
+  "”": 1089,
+  "†": 1090,
+  "‡": 1091,
+  "•": 1092,
+  "…": 1093,
+  "‰": 1094,
+  "′": 1095,
+  "″": 1096,
+  "‽": 1097,
+  "₠": 1098,
+  "₡": 1099,
+  "₢": 1100,
+  "₣": 1101,
+  "₤": 1102,
+  "₥": 1103,
+  "₦": 1104,
+  "₧": 1105,
+  "₨": 1106,
+  "₩": 1107,
+  "₪": 1108,
+  "₫": 1109,
+  "€": 1110,
+  "₵": 1111,
+  "₹": 1112,
+  "₺": 1113,
+  "₽": 1114,
+  "₿": 1115,
+  "℅": 1116,
+  "№": 1117,
+  "™": 1118,
+  "⅛": 1119,
+  "⅜": 1120,
+  "⅝": 1121,
+  "⅞": 1122,
+  "←": 1123,
+  "↑": 1241,
+  "→": 1242,
+  "↓": 1240,
+  "↗": 1243,
+  "↘": 1244,
+  "↵": 1127,
+  "⇒": 1128,
+  "−": 1129,
+  "∩": 1130,
+  "≡": 1131,
+  "≤": 1132,
+  "Ⓡ": 1133,
+  "█": 1134,
+  "●": 1135,
+  "☞": 1136,
+  "❀": 1137,
+  "ⱱ": 1208,
+  "？": 1138,
+  "": 1139
+}