Saiteja committed
Commit c952f8b · verified · Parent: b864e89

Upload folder using huggingface_hub
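For reference, an upload like this is typically done with `huggingface_hub`'s `upload_folder`; a minimal sketch, with a hypothetical folder path and repo id:

```python
from huggingface_hub import HfApi

api = HfApi()
# Upload the local folder containing README.md, examples.json and tokenizer.json.
# Both folder_path and repo_id below are hypothetical; substitute the real ones.
api.upload_folder(
    folder_path="./telugu-bpe-tokenizer",
    repo_id="Saiteja/telugu-bpe-tokenizer",
    repo_type="model",
)
```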

Files changed (3)
  1. README.md +38 -0
  2. examples.json +99 -0
  3. tokenizer.json +0 -0
README.md ADDED
---
language: te
tags:
- telugu
- tokenizer
- bpe
license: mit
---

# Telugu BPE Tokenizer

A Byte-Pair Encoding (BPE) tokenizer trained on Telugu text data from Wikipedia.

## Model Description

This tokenizer was trained on Telugu text data collected from Wikipedia articles. It uses Byte-Pair Encoding (BPE) to create subword tokens.

## Stats

- Vocabulary Size: 5000 tokens
- Compression Ratio: 1.26 (see the sketch below)
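The README does not define how the compression ratio is computed; a common convention is characters per token, and the sketch below computes it under that assumption:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")

def compression_ratio(texts):
    # Assumed definition: total characters divided by total tokens.
    total_chars = sum(len(t) for t in texts)
    total_tokens = sum(len(tokenizer.encode(t).ids) for t in texts)
    return total_chars / total_tokens

sample = ["నమస్కారం", "తెలుగు భాష చాలా అందమైనది"]
print(round(compression_ratio(sample), 2))
```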
## Usage

```python
from tokenizers import Tokenizer

# Load the tokenizer
tokenizer = Tokenizer.from_file("tokenizer.json")

# Tokenize text
text = "నమస్కారం"
encoding = tokenizer.encode(text)
print(encoding.tokens)
```
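If the tokenizer is fetched from the Hub rather than a local file, `hf_hub_download` can resolve the path first; a minimal sketch, with a hypothetical repo id:

```python
from huggingface_hub import hf_hub_download
from tokenizers import Tokenizer

# Download tokenizer.json from the Hub (the repo_id is hypothetical).
path = hf_hub_download(repo_id="Saiteja/telugu-bpe-tokenizer", filename="tokenizer.json")
tokenizer = Tokenizer.from_file(path)
```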
## Training Data

The tokenizer was trained on Telugu text data collected from Wikipedia articles. The data includes a diverse range of topics and writing styles.
examples.json ADDED
[
  {
    "text": "నమస్కారం",
    "tokens": ["Ġనమ", "à°¸", "à±į", "à°ķ", "à°¾", "à°°", "à°Ĥ"],
    "ids": [438, 196, 177, 185, 179, 180, 181]
  },
  {
    "text": "తెలుగు భాష చాలా అందమైనది",
    "tokens": ["Ġà°¤", "à±Ĩ", "à°²", "à±ģ", "à°Ĺ", "à±ģ", "Ġà°Ń", "à°¾", "à°·", "Ġà°ļ", "à°¾", "à°²", "à°¾", "Ġà°ħ", "à°Ĥ", "దమ", "à±Ī", "నద", "à°¿"],
    "ids": [230, 204, 183, 182, 199, 182, 254, 179, 223, 225, 179, 183, 179, 211, 181, 946, 213, 447, 178]
  },
  {
    "text": "భారతదేశం నా దేశం",
    "tokens": ["Ġà°Ń", "à°¾", "రతద", "à±ĩ", "à°¶", "à°Ĥ", "Ġà°¨", "à°¾", "Ġà°¦", "à±ĩ", "à°¶", "à°Ĥ"],
    "ids": [254, 179, 524, 195, 217, 181, 206, 179, 215, 195, 217, 181]
  }
]
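The token strings above are not mojibake: they appear to be byte-level BPE units, where a GPT-2-style byte-to-unicode mapping renders Ġ as a leading space and surfaces Telugu characters as their mapped UTF-8 bytes (e.g. "à°¸"). A round-trip sketch, assuming that setup:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")

# Encode the first example and decode it back to text.
encoding = tokenizer.encode("నమస్కారం")
print(encoding.tokens)                  # byte-level strings such as "Ġనమ", "à°¸", ...
print(tokenizer.decode(encoding.ids))   # should recover "నమస్కారం"
```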
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
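Although the diff is not rendered, the file can be inspected locally; a sketch assuming the standard `tokenizers` JSON layout:

```python
import json

with open("tokenizer.json", encoding="utf-8") as f:
    data = json.load(f)

# In the standard layout, the model type and vocabulary sit under "model".
print(data["model"]["type"])        # expected: "BPE"
print(len(data["model"]["vocab"]))  # expected: 5000, per the README
```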