# HindiBPE/backend/app/tokenizer.py
from typing import Dict, List
import unicodedata

import regex as re


class HindiTokenizer:
    def __init__(self):
        # Devanagari Unicode range (U+0900-U+097F)
        self.DEVANAGARI_RANGE = "\u0900-\u097f"
        # Basic character sets
        self.CONSONANTS = set("कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह")
        self.VOWELS = set("अआइईउऊऋएऐओऔ")
        self.MATRAS = set("ािीुूृेैोौ")
        self.SPECIAL_CHARS = set("्ंःँ़")
        self.NUMERALS = set("०१२३४५६७८९")
        # Special tokens
        self.EOW_TOKEN = "</w>"  # End of word
        self.UNK_TOKEN = "<unk>"  # Unknown token
        self.PAD_TOKEN = "<pad>"  # Padding token

    def clean_text(self, text: str) -> str:
        """Clean and normalize Hindi text."""
        # Normalize Unicode to NFC
        text = self._normalize_unicode(text)
        # Collapse runs of whitespace into a single space
        text = re.sub(r"\s+", " ", text)
        # Keep only Devanagari, English letters, digits and basic punctuation
        pattern = fr"[^{self.DEVANAGARI_RANGE}a-zA-Z0-9\s.,?!]"
        text = re.sub(pattern, "", text)
        return text.strip()

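    # Example of the cleaning rules above (sample input is illustrative):
    #   clean_text("नमस्ते!  🙂") -> "नमस्ते!"
    # (whitespace is collapsed and the emoji falls outside the kept ranges)
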
    def _normalize_unicode(self, text: str) -> str:
        """Normalize Unicode characters to NFC form."""
        return unicodedata.normalize("NFC", text)

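    # NFC gives each sequence one canonical spelling, e.g. the precomposed
    # nukta letter "क़" (U+0958) and "क" + "़" (U+0915 U+093C) normalize to
    # the same code point sequence, so they tokenize identically.
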
    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into subword units."""
        text = self.clean_text(text)
        chars: List[str] = []
        i = 0
        while i < len(text):
            if text[i].isspace():
                # Skip spaces in tokenization
                i += 1
                continue
            if i + 1 < len(text) and text[i + 1] in self.MATRAS:
                # Handle consonant + matra (like सु, की)
                token = text[i : i + 2]
                i += 2
                # Absorb a trailing anusvara into the cluster (like नीं)
                if i < len(text) and text[i] == "ं":
                    token += text[i]
                    i += 1
                chars.append(token)
            elif i + 1 < len(text) and text[i + 1] == "ं":
                # Handle consonant or vowel + anusvara (like हं, अं)
                chars.append(text[i : i + 2])
                i += 2
            else:
                chars.append(text[i])
                i += 1
        return chars

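    # Example of the clustering rules above (sample input is illustrative):
    #   tokenize("मुझे नींद") -> ['मु', 'झे', 'नीं', 'द']
    # (the anusvara in नीं stays attached to its consonant + matra cluster)
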
    def get_stats(self, text: str) -> Dict:
        """Get tokenization statistics."""
        tokens = self.tokenize(text)
        # Count original characters excluding spaces
        orig_chars = len([c for c in text if not c.isspace()])
        return {
            "original_chars": orig_chars,
            "token_count": len(tokens),
            "unique_tokens": len(set(tokens)),
            # Guard against empty input to avoid division by zero
            "compression_ratio": round(orig_chars / len(tokens), 2) if tokens else 0.0,
        }

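    # Example, continuing the sample above:
    #   get_stats("मुझे नींद") ->
    #   {"original_chars": 8, "token_count": 4,
    #    "unique_tokens": 4, "compression_ratio": 2.0}
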
    def _get_token_type(self, token: str) -> str:
        """Determine token type."""
        if len(token) == 1:
            if token in self.CONSONANTS:
                return "consonant"
            elif token in self.VOWELS:
                return "vowel"
            elif token in self.MATRAS:
                return "matra"
            elif token in self.SPECIAL_CHARS:
                return "special"
            elif token in self.NUMERALS:
                return "numeral"
            # Single characters outside the known sets (e.g. Latin letters)
            return "other"
        return "compound"