class HindiTokenizer:
    """Character-level tokenizer built around a base Devanagari (Hindi) vocabulary.

    Attributes created by ``__init__``:
        VYANJAN: set of consonant letters.
        SWAR: set of independent vowel letters.
        MATRAS: set of dependent vowel signs.
        SPECIAL_CHARS: set holding halant, anusvara, visarga, chandrabindu
            and nukta.
        CONSONANTS: alias of ``VYANJAN`` (the same set object).
        VOWELS: alias of ``SWAR`` (the same set object).
        BASE_VOCAB: union of the four character sets above.
        base_vocab_stats: dict with the size of each set and of the union,
            keyed by ``"vyanjan"``, ``"swar"``, ``"matras"``, ``"special"``
            and ``"total"``.
    """

    def __init__(self):
        """Build the base character sets and record their sizes."""
        # Base varnmala (alphabet).
        # Vyanjan: consonant letters.
        self.VYANJAN = {
            "क", "ख", "ग", "घ", "ङ",
            "च", "छ", "ज", "झ", "ञ",
            "ट", "ठ", "ड", "ढ", "ण",
            "त", "थ", "द", "ध", "न",
            "प", "फ", "ब", "भ", "म",
            "य", "र", "ल", "व",
            "श", "ष", "स", "ह",
        }

        # Swar: independent vowel letters.
        self.SWAR = {
            "अ", "आ", "इ", "ई", "उ", "ऊ", "ऋ", "ए", "ऐ", "ओ", "औ",
        }

        # Matras: dependent vowel signs.
        self.MATRAS = {
            "ा", "ि", "ी", "ु", "ू", "ृ", "े", "ै", "ो", "ौ",
        }

        self.SPECIAL_CHARS = {
            "्",  # Halant (virama)
            "ं",  # Anusvara
            "ः",  # Visarga
            "ँ",  # Chandrabindu
            "़",  # Nukta
        }

        # English-named aliases kept for backward compatibility; they refer
        # to the very same set objects, not copies.
        self.CONSONANTS = self.VYANJAN
        self.VOWELS = self.SWAR

        # Everything the tokenizer treats as a base Hindi character.
        self.BASE_VOCAB = (
            self.VYANJAN | self.SWAR | self.MATRAS | self.SPECIAL_CHARS
        )

        self.base_vocab_stats = {
            "vyanjan": len(self.VYANJAN),
            "swar": len(self.SWAR),
            "matras": len(self.MATRAS),
            "special": len(self.SPECIAL_CHARS),
            "total": len(self.BASE_VOCAB),
        }

    def _get_token_type(self, token: str) -> str:
        """Classify a token by the base character set it belongs to.

        Args:
            token: The token to classify.

        Returns:
            ``"consonant"``, ``"vowel"``, ``"matra"`` or ``"special"`` when
            ``token`` is a single character found in the matching set.
            ``"compound"`` for every other token; note that this includes
            multi-character tokens, the empty string, and single characters
            that are not part of the base vocabulary.
        """
        if len(token) == 1:
            # Checked in this order; the first set containing the token wins.
            categories = (
                ("consonant", self.VYANJAN),
                ("vowel", self.SWAR),
                ("matra", self.MATRAS),
                ("special", self.SPECIAL_CHARS),
            )
            for label, members in categories:
                if token in members:
                    return label
        return "compound"

    def tokenize(self, text: str) -> list:
        """Split ``text`` into a list of its individual characters.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one element per character of ``text``, in order;
            an empty list for an empty string.
        """
        return list(text)

    def is_hindi_char(self, char: str) -> bool:
        """Return whether ``char`` is a member of ``BASE_VOCAB``.

        Args:
            char: The string to test.

        Returns:
            ``True`` if ``char`` is one of the base vocabulary characters,
            ``False`` otherwise.
        """
        return char in self.BASE_VOCAB