from typing import Dict, List

import regex as re
import unicodedata


class HindiTokenizer:
    def __init__(self):
        # Devanagari Unicode range (used inside regex character classes)
        self.DEVANAGARI_RANGE = "\u0900-\u097f"

        # Basic character sets
        self.CONSONANTS = set("कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह")
        self.VOWELS = set("अआइईउऊऋएऐओऔ")
        self.MATRAS = set("ािीुूृेैोौ")
        self.SPECIAL_CHARS = set("्ंःँ़")
        self.NUMERALS = set("०१२३४५६७८९")

        # Special tokens
        self.EOW_TOKEN = "</w>"  # End of word
        self.UNK_TOKEN = "<unk>"  # Unknown token
        self.PAD_TOKEN = "<pad>"  # Padding token

    def clean_text(self, text: str) -> str:
        """Clean and normalize Hindi text."""
        # Normalize Unicode
        text = self._normalize_unicode(text)

        # Collapse runs of whitespace into single spaces
        text = re.sub(r"\s+", " ", text)

        # Keep only Devanagari, English letters, digits and basic punctuation
        pattern = rf"[^{self.DEVANAGARI_RANGE}a-zA-Z0-9\s.,?!]"
        text = re.sub(pattern, "", text)

        return text.strip()

    def _normalize_unicode(self, text: str) -> str:
        """Normalize Unicode characters to NFC form."""
        return unicodedata.normalize("NFC", text)

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into subword units."""
        chars = []
        i = 0
        text = self.clean_text(text)

        while i < len(text):
            if text[i].isspace():
                # Skip spaces in tokenization
                i += 1
                continue

            if text[i] == "ं":
                # Handle anusvara (like नीं): combine with the previous
                # token if that token ends with a matra
                if chars and chars[-1][-1] in self.MATRAS:
                    chars[-1] = chars[-1] + text[i]
                else:
                    chars.append(text[i])
                i += 1
            elif i + 1 < len(text) and text[i + 1] in self.MATRAS:
                # Handle consonant + matra (like सु, की)
                chars.append(text[i : i + 2])
                i += 2
            elif i + 1 < len(text) and text[i + 1] == "ं":
                # Handle consonant + anusvara (like हं in हंस)
                chars.append(text[i : i + 2])
                i += 2
            else:
                chars.append(text[i])
                i += 1

        return chars

    def get_stats(self, text: str) -> Dict:
        """Get tokenization statistics."""
        tokens = self.tokenize(text)

        # Count original characters excluding spaces
        orig_chars = len([c for c in text if not c.isspace()])

        return {
            "original_chars": orig_chars,
            "token_count": len(tokens),
            "unique_tokens": len(set(tokens)),
            "compression_ratio": round(orig_chars / len(tokens), 2) if tokens else 0.0,
        }

    def _get_token_type(self, token: str) -> str:
        """Determine token type."""
        if len(token) == 1:
            if token in self.CONSONANTS:
                return "consonant"
            elif token in self.VOWELS:
                return "vowel"
            elif token in self.MATRAS:
                return "matra"
            elif token in self.SPECIAL_CHARS:
                return "special"
            elif token in self.NUMERALS:
                return "numeral"
            else:
                return "other"
        return "compound"
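

# Example usage: a minimal sketch, not part of the original module; the sample
# sentence below is illustrative only.
if __name__ == "__main__":
    tokenizer = HindiTokenizer()
    sample = "मैं हिंदी सीख रहा हूँ।"

    tokens = tokenizer.tokenize(sample)
    print("Tokens:", tokens)

    # Classify each token with the internal helper
    print("Types:", [tokenizer._get_token_type(t) for t in tokens])

    # Character/token counts and compression ratio
    print("Stats:", tokenizer.get_stats(sample))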