HindiBPE / backend /app /hindi_tokenizer.py
Vibi007's picture
first commit
413be94
class HindiTokenizer:
    """Character-level tokenizer built around a fixed Devanagari base vocabulary.

    Each instance carries four character sets (consonants, independent
    vowels, vowel signs and special marks), their union, and a small
    dictionary with the size of every set.
    """

    def __init__(self):
        """Build the base character sets and the derived size statistics."""
        # Consonants (vyanjan), one traditional group per row.
        self.VYANJAN = {
            "क", "ख", "ग", "घ", "ङ",  # ka-varga
            "च", "छ", "ज", "झ", "ञ",  # cha-varga
            "ट", "ठ", "ड", "ढ", "ण",  # retroflex ta-varga
            "त", "थ", "द", "ध", "न",  # dental ta-varga
            "प", "फ", "ब", "भ", "म",  # pa-varga
            "य", "र", "ल", "व",  # semivowels
            "श", "ष", "स", "ह",  # sibilants and ha
        }

        # Independent vowels (swar).
        self.SWAR = {
            "अ", "आ", "इ", "ई", "उ", "ऊ",
            "ऋ", "ए", "ऐ", "ओ", "औ",
        }

        # Dependent vowel signs (matras).
        self.MATRAS = {
            "ा", "ि", "ी", "ु", "ू",
            "ृ", "े", "ै", "ो", "ौ",
        }

        # Marks that modify a neighbouring character.
        self.SPECIAL_CHARS = {
            "्",  # Halant (virama)
            "ं",  # Anusvara
            "ः",  # Visarga
            "ँ",  # Chandrabindu
            "़",  # Nukta
        }

        # English-named aliases kept for backward compatibility; they are
        # the very same set objects, not copies.
        self.CONSONANTS = self.VYANJAN
        self.VOWELS = self.SWAR

        # Every character the tokenizer recognises as part of its base vocabulary.
        self.BASE_VOCAB = set().union(
            self.VYANJAN,
            self.SWAR,
            self.MATRAS,
            self.SPECIAL_CHARS,
        )

        # Size of each group, plus the size of the combined vocabulary.
        sized_groups = {
            "vyanjan": self.VYANJAN,
            "swar": self.SWAR,
            "matras": self.MATRAS,
            "special": self.SPECIAL_CHARS,
            "total": self.BASE_VOCAB,
        }
        self.base_vocab_stats = {
            label: len(members) for label, members in sized_groups.items()
        }

    def _get_token_type(self, token: str) -> str:
        """Classify a token by the base set it belongs to.

        Returns "consonant", "vowel", "matra" or "special" for a
        single-character token found in the matching set. Everything else
        (the empty string, multi-character tokens, and single characters
        outside the base vocabulary) is reported as "compound".
        """
        if len(token) != 1:
            return "compound"

        # Checked in order; the first set containing the character wins.
        lookup_order = (
            (self.VYANJAN, "consonant"),
            (self.SWAR, "vowel"),
            (self.MATRAS, "matra"),
            (self.SPECIAL_CHARS, "special"),
        )
        for members, label in lookup_order:
            if token in members:
                return label
        return "compound"

    def tokenize(self, text: str) -> list:
        """Split ``text`` into a list holding one element per character."""
        return [*text]

    def is_hindi_char(self, char: str) -> bool:
        """Return True when ``char`` is a member of the base vocabulary."""
        in_vocab = char in self.BASE_VOCAB
        return in_vocab