Upload Hindi embeddings model and all associated files
Browse files
- .gitattributes +1 -0
- corpus_stats.json +0 -0
- evaluation_plots/.amlignore +6 -0
- evaluation_plots/.amlignore.amltmp +6 -0
- evaluation_plots/token_length_distribution.png +0 -0
- evaluation_plots/top_token_frequencies.png +0 -0
- hindi-rag-system.py +11 -182
- hindi-rag-system.py.amltmp +881 -0
- hindi_corpus.txt +2 -2
- sample_corpus.txt +0 -0
- tokenizer.model +2 -2
- tokenizer.vocab +0 -0
- tokenizer_config.json +1 -1
- tokenizer_evaluation.json +129 -0
- training_config.json +0 -2
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 hindi_corpus.txt filter=lfs diff=lfs merge=lfs -text
+corpus_stats.json filter=lfs diff=lfs merge=lfs -text
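Note: the added rule is exactly what `git lfs track "corpus_stats.json"` would append, putting the new stats file under Git LFS alongside hindi_corpus.txt and the existing *.zst / *tfevents* patterns.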
corpus_stats.json
CHANGED
The diff for this file is too large to render. See raw diff.
evaluation_plots/.amlignore
ADDED
@@ -0,0 +1,6 @@
+## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+.ipynb_aml_checkpoints/
+*.amltmp
+*.amltemp
evaluation_plots/.amlignore.amltmp
ADDED
@@ -0,0 +1,6 @@
+## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+.ipynb_aml_checkpoints/
+*.amltmp
+*.amltemp
evaluation_plots/token_length_distribution.png
ADDED
[image: token length distribution plot]
evaluation_plots/top_token_frequencies.png
ADDED
[image: top token frequencies plot]
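Both plots can be regenerated from tokenizer_evaluation.json (added below). A minimal matplotlib sketch, assuming the JSON layout shown in that file's diff; plot styling and the top-25 cutoff are illustrative:

import json
import matplotlib.pyplot as plt

with open("tokenizer_evaluation.json", "r", encoding="utf-8") as f:
    stats = json.load(f)

# Token length distribution: bar chart of token length (chars) vs. count
lengths = sorted(int(k) for k in stats["token_length_distribution"])
counts = [stats["token_length_distribution"][str(n)] for n in lengths]
plt.figure()
plt.bar(lengths, counts)
plt.xlabel("Token length (characters)")
plt.ylabel("Count")
plt.title("Token length distribution")
plt.savefig("evaluation_plots/token_length_distribution.png")

# Top token frequencies (note: Devanagari tick labels need a font that supports them)
top = sorted(stats["token_frequency"].items(), key=lambda kv: kv[1], reverse=True)[:25]
plt.figure(figsize=(10, 6))
plt.bar([t for t, _ in top], [c for _, c in top])
plt.xticks(rotation=90)
plt.ylabel("Frequency")
plt.title("Top token frequencies")
plt.tight_layout()
plt.savefig("evaluation_plots/top_token_frequencies.png")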
hindi-rag-system.py
CHANGED
@@ -16,12 +16,6 @@ from langchain.vectorstores import FAISS as LangchainFAISS
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
 from typing import List, Dict, Any, Optional, Callable
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-import gc
-import warnings
-
-# Ignore specific HuggingFace warnings
-warnings.filterwarnings("ignore", category=UserWarning, message=".*The model doesn't have tied token embeddings.*")
 
 # Tokenizer wrapper class - same as in original code
 class SentencePieceTokenizerWrapper:
@@ -708,107 +702,6 @@ def perform_similarity_search(vector_store, query, k=6):
     print(f"Searching for: {query}")
     return vector_store.similarity_search_with_score(query, k=k)
 
-# Llama model loading function
-def load_llama_model(model_name="unsloth/Llama-3.2-1B-Instruct", device="cuda"):
-    """
-    Load and prepare Llama model for text generation
-    """
-    print(f"Loading LLM: {model_name}")
-
-    # Check if CUDA is available
-    if device == "cuda" and not torch.cuda.is_available():
-        print("CUDA not available, falling back to CPU")
-        device = "cpu"
-
-    # Quantization config for 4-bit precision to save memory
-    quantization = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.float16,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=True,
-    ) if device == "cuda" else None
-
-    # Standard HuggingFace loading
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    if device == "cuda":
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            device_map="auto",
-            quantization_config=quantization
-        )
-    else:
-        model = AutoModelForCausalLM.from_pretrained(model_name)
-        model = model.to(device)
-
-    print("Successfully loaded model")
-
-    return model, tokenizer
-
-def setup_qa_system(model, tokenizer, vector_store):
-    """
-    Set up a direct QA system using the model and retriever
-    """
-    # Create retriever
-    retriever = vector_store.as_retriever(
-        search_type="similarity",
-        search_kwargs={"k": 3}
-    )
-
-    # Create a function to generate answers
-    def generate_answer(query):
-        # Retrieve documents
-        try:
-            docs = retriever.invoke(query)
-        except:
-            # Fallback to older method if invoke isn't available
-            docs = retriever.get_relevant_documents(query)
-
-        # Extract the content
-        context = "\n\n".join([doc.page_content for doc in docs])
-
-        # Create prompt
-        prompt = f"""
-आपको निम्नलिखित संदर्भ से जानकारी के आधार पर एक प्रश्न का उत्तर देना है।
-यदि आप उत्तर नहीं जानते हैं, तो बस "मुझे नहीं पता" कहें।
-
-संदर्भ:
-{context}
-
-प्रश्न: {query}
-
-उत्तर:
-"""
-
-        # Generate text
-        inputs = tokenizer(prompt, return_tensors="pt")
-
-        # Move to the same device as the model
-        for k, v in inputs.items():
-            if hasattr(v, "to") and callable(v.to):
-                inputs[k] = v.to(model.device)
-
-        with torch.no_grad():
-            try:
-                outputs = model.generate(
-                    inputs.input_ids,
-                    max_new_tokens=512,
-                    temperature=0.7,
-                    top_p=0.9,
-                    do_sample=True
-                )
-            except Exception as e:
-                return f"Error generating response: {str(e)}"
-
-        # Decode the generated text
-        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        # Extract just the answer part (after the prompt)
-        answer = full_response.split("उत्तर:")[-1].strip()
-
-        return answer
-
-    return generate_answer
-
 # Main RAG functions
 def index_text_files(model, tokenizer, data_dir, output_dir, device="cuda", chunk_size=500):
     """
@@ -902,7 +795,7 @@ def query_text_corpus(model, tokenizer, vector_store_path, query, k=6, device="cuda"):
 
         processed_results.append((combined_doc, score))
 
-        return processed_results
+    return processed_results
 
 def main():
     parser = argparse.ArgumentParser(description="Hindi RAG System with LangChain and FAISS")
@@ -928,54 +821,27 @@ def main():
                         help="Run in interactive mode for querying")
     parser.add_argument("--reindex", action="store_true",
                         help="Force reindexing even if index exists")
-    parser.add_argument("--qa", action="store_true",
-                        help="Use LLM for question answering instead of just retrieval")
-    parser.add_argument("--llm_name", type=str, default="unsloth/Llama-3.2-1B-Instruct",
-                        help="HuggingFace model name for the LLM")
     args = parser.parse_args()
 
-    # Load
-
+    # Load model and tokenizer
+    model, tokenizer, config = load_model_and_tokenizer(args.model_dir, args.tokenizer_dir)
 
-    # Move
-
+    # Move model to device
+    model = model.to(args.device)
 
     # Create vector store path
    vector_store_path = os.path.join(args.output_dir, "faiss_index")
 
-    # Load LLM if QA is enabled
-    llm_model = None
-    llm_tokenizer = None
-    qa_generator = None
-
-    if args.qa:
-        try:
-            # Load LLM
-            llm_model, llm_tokenizer = load_llama_model(args.llm_name, args.device)
-            print("LLM loaded successfully for QA")
-        except Exception as e:
-            print(f"Error loading LLM: {e}")
-            print("Falling back to retrieval-only mode")
-            args.qa = False
-
     if args.index or args.reindex:
         # Index text files
-
-            embed_model, embed_tokenizer, args.data_dir, args.output_dir, args.device, args.chunk_size
-        )
+        index_text_files(model, tokenizer, args.data_dir, args.output_dir, args.device, args.chunk_size)
         print(f"Indexing complete. Vector store saved to {vector_store_path}")
-
-        # Set up QA chain if enabled
-        if args.qa and llm_model is not None and llm_tokenizer is not None:
-            qa_generator = setup_qa_system(llm_model, llm_tokenizer, vector_store)
 
     if args.query:
         # Query the corpus
-        results
-            embed_model, embed_tokenizer, vector_store_path, args.query, args.top_k, args.device
-        )
+        results = query_text_corpus(model, tokenizer, vector_store_path, args.query, args.top_k, args.device)
 
-        # Print
+        # Print results
         print("\nSearch Results:")
        for i, (doc, score) in enumerate(results):
             print(f"\nResult {i+1} (Score: {score:.4f}):")
@@ -984,27 +850,10 @@ def main():
             # Extract and print only relevant sentences
             relevant_text = extract_relevant_sentences(doc.page_content, args.query)
             print(f"Content: {relevant_text}")
-
-        # If QA is enabled, also answer the question using the LLM
-        if args.qa and llm_model is not None and llm_tokenizer is not None:
-            if qa_generator is None:
-                qa_generator = setup_qa_system(llm_model, llm_tokenizer, vector_store)
-
-            # Get answer from QA chain
-            print("\nGenerating answer using LLM...")
-            try:
-                answer = qa_generator(args.query)
-                print("\nLLM Answer:")
-                print(answer)
-            except Exception as e:
-                print(f"Error generating answer: {e}")
 
     if args.interactive:
         print("\nInteractive mode. Enter queries (or type 'quit' to exit).")
 
-        # For the first query, load vector store
-        vector_store = None
-
         while True:
             print("\nEnter query:")
             query = input()
@@ -1016,11 +865,9 @@ def main():
                 break
 
             # Query the corpus
-            results
-                embed_model, embed_tokenizer, vector_store_path, query, args.top_k, args.device
-            )
+            results = query_text_corpus(model, tokenizer, vector_store_path, query, args.top_k, args.device)
 
-        # Print
+            # Print results
             print("\nSearch Results:")
             for i, (doc, score) in enumerate(results):
                 print(f"\nResult {i+1} (Score: {score:.4f}):")
@@ -1029,24 +876,6 @@ def main():
                 # Extract and print only relevant sentences
                 relevant_text = extract_relevant_sentences(doc.page_content, query)
                 print(f"Content: {relevant_text}")
-
-                # If QA is enabled, also answer the question using the LLM
-                if args.qa and llm_model is not None and llm_tokenizer is not None:
-                    if qa_generator is None:
-                        qa_generator = setup_qa_system(llm_model, llm_tokenizer, vector_store)
-
-                    print("\nGenerating answer using LLM...")
-                    try:
-                        answer = qa_generator(query)
-                        print("\nLLM Answer:")
-                        print(answer)
-                    except Exception as e:
-                        print(f"Error generating answer: {e}")
-
-    # Clean up GPU memory
-    if args.device == "cuda":
-        gc.collect()
-        torch.cuda.empty_cache()
 
 if __name__ == "__main__":
-main()
+    main()
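With the QA branch removed, the script's flow is load_model_and_tokenizer → index_text_files → query_text_corpus. A minimal sketch of that flow called as a library, using the function signatures visible in the full file below; paths and the query string are illustrative, and the functions are assumed to be in scope (e.g. pasted into the same session, since the hyphenated filename is not importable):

# Sketch of the retrieval-only flow left after this commit.
model, tokenizer, config = load_model_and_tokenizer(
    "./hindi-embeddings/final",   # model_dir with config.json + weights (illustrative)
    "./hindi_tokenizer",          # tokenizer_dir with tokenizer.model (illustrative)
)
model = model.to("cpu")

# Build the FAISS index once...
index_text_files(model, tokenizer, "./data", "./output", device="cpu", chunk_size=500)

# ...then query it.
results = query_text_corpus(model, tokenizer, "./output/faiss_index",
                            "भारत की राजधानी क्या है?", k=6, device="cpu")
for doc, score in results:
    print(f"{score:.4f}", doc.metadata["source"], doc.page_content[:80])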
hindi-rag-system.py.amltmp
ADDED
@@ -0,0 +1,881 @@
+import os
+import torch
+import json
+import argparse
+import numpy as np
+import re
+from torch import nn
+from torch.nn import functional as F
+import sentencepiece as spm
+import math
+from safetensors.torch import save_file, load_file
+from tqdm import tqdm
+import faiss
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import FAISS as LangchainFAISS
+from langchain.docstore.document import Document
+from langchain.embeddings.base import Embeddings
+from typing import List, Dict, Any, Optional, Callable
+
+# Tokenizer wrapper class - same as in original code
+class SentencePieceTokenizerWrapper:
+    def __init__(self, sp_model_path):
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(sp_model_path)
+        self.vocab_size = self.sp_model.GetPieceSize()
+
+        # Special token IDs from tokenizer training
+        self.pad_token_id = 0
+        self.bos_token_id = 1
+        self.eos_token_id = 2
+        self.unk_token_id = 3
+
+        # Set special tokens
+        self.pad_token = "<pad>"
+        self.bos_token = "<s>"
+        self.eos_token = "</s>"
+        self.unk_token = "<unk>"
+        self.mask_token = "<mask>"
+
+    def __call__(self, text, padding=False, truncation=False, max_length=None, return_tensors=None):
+        # Handle both string and list inputs
+        if isinstance(text, str):
+            # Encode a single string
+            ids = self.sp_model.EncodeAsIds(text)
+
+            # Handle truncation
+            if truncation and max_length and len(ids) > max_length:
+                ids = ids[:max_length]
+
+            attention_mask = [1] * len(ids)
+
+            # Handle padding
+            if padding and max_length:
+                padding_length = max(0, max_length - len(ids))
+                ids = ids + [self.pad_token_id] * padding_length
+                attention_mask = attention_mask + [0] * padding_length
+
+            result = {
+                'input_ids': ids,
+                'attention_mask': attention_mask
+            }
+
+            # Convert to tensors if requested
+            if return_tensors == 'pt':
+                import torch
+                result = {k: torch.tensor([v]) for k, v in result.items()}
+
+            return result
+
+        # Process a batch of texts
+        batch_encoded = [self.sp_model.EncodeAsIds(t) for t in text]
+
+        # Apply truncation if needed
+        if truncation and max_length:
+            batch_encoded = [ids[:max_length] for ids in batch_encoded]
+
+        # Create attention masks
+        batch_attention_mask = [[1] * len(ids) for ids in batch_encoded]
+
+        # Apply padding if needed
+        if padding:
+            if max_length:
+                max_len = max_length
+            else:
+                max_len = max(len(ids) for ids in batch_encoded)
+
+            # Pad sequences to max_len
+            batch_encoded = [ids + [self.pad_token_id] * (max_len - len(ids)) for ids in batch_encoded]
+            batch_attention_mask = [mask + [0] * (max_len - len(mask)) for mask in batch_attention_mask]
+
+        result = {
+            'input_ids': batch_encoded,
+            'attention_mask': batch_attention_mask
+        }
+
+        # Convert to tensors if requested
+        if return_tensors == 'pt':
+            import torch
+            result = {k: torch.tensor(v) for k, v in result.items()}
+
+        return result
+
+# Model architecture definitions for inference
+
+class MultiHeadAttention(nn.Module):
+    """Advanced multi-headed attention with relative positional encoding"""
+    def __init__(self, config):
+        super().__init__()
+        self.num_attention_heads = config["num_attention_heads"]
+        self.attention_head_size = config["hidden_size"] // config["num_attention_heads"]
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        # Query, Key, Value projections
+        self.query = nn.Linear(config["hidden_size"], self.all_head_size)
+        self.key = nn.Linear(config["hidden_size"], self.all_head_size)
+        self.value = nn.Linear(config["hidden_size"], self.all_head_size)
+
+        # Output projection
+        self.output = nn.Sequential(
+            nn.Linear(self.all_head_size, config["hidden_size"]),
+            nn.Dropout(config["attention_probs_dropout_prob"])
+        )
+
+        # Simplified relative position bias approach
+        self.max_position_embeddings = config["max_position_embeddings"]
+        self.relative_attention_bias = nn.Embedding(
+            2 * config["max_position_embeddings"] - 1,
+            config["num_attention_heads"]
+        )
+
+    def transpose_for_scores(self, x):
+        new_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask=None):
+        batch_size, seq_length = hidden_states.size()[:2]
+
+        # Project inputs to queries, keys, and values
+        query_layer = self.transpose_for_scores(self.query(hidden_states))
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        # Take the dot product between query and key to get the raw attention scores
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        # Generate relative position matrix
+        position_ids = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device)
+        relative_position = position_ids.unsqueeze(1) - position_ids.unsqueeze(0)  # [seq_len, seq_len]
+        # Shift values to be >= 0
+        relative_position = relative_position + self.max_position_embeddings - 1
+        # Ensure indices are within bounds
+        relative_position = torch.clamp(relative_position, 0, 2 * self.max_position_embeddings - 2)
+
+        # Get relative position embeddings [seq_len, seq_len, num_heads]
+        rel_attn_bias = self.relative_attention_bias(relative_position)  # [seq_len, seq_len, num_heads]
+
+        # Reshape to add to attention heads [1, num_heads, seq_len, seq_len]
+        rel_attn_bias = rel_attn_bias.permute(2, 0, 1).unsqueeze(0)
+
+        # Add to attention scores - now dimensions will match
+        attention_scores = attention_scores + rel_attn_bias
+
+        # Scale attention scores
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        # Apply attention mask
+        if attention_mask is not None:
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities
+        attention_probs = F.softmax(attention_scores, dim=-1)
+
+        # Apply dropout
+        attention_probs = F.dropout(attention_probs, p=0.1, training=self.training)
+
+        # Apply attention to values
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        # Reshape back to [batch_size, seq_length, hidden_size]
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_shape)
+
+        # Final output projection
+        output = self.output(context_layer)
+
+        return output
+
+class EnhancedTransformerLayer(nn.Module):
+    """Advanced transformer layer with pre-layer norm and enhanced attention"""
+    def __init__(self, config):
+        super().__init__()
+        self.attention_pre_norm = nn.LayerNorm(config["hidden_size"], eps=config["layer_norm_eps"])
+        self.attention = MultiHeadAttention(config)
+
+        self.ffn_pre_norm = nn.LayerNorm(config["hidden_size"], eps=config["layer_norm_eps"])
+
+        # Feed-forward network
+        self.ffn = nn.Sequential(
+            nn.Linear(config["hidden_size"], config["intermediate_size"]),
+            nn.GELU(),
+            nn.Dropout(config["hidden_dropout_prob"]),
+            nn.Linear(config["intermediate_size"], config["hidden_size"]),
+            nn.Dropout(config["hidden_dropout_prob"])
+        )
+
+    def forward(self, hidden_states, attention_mask=None):
+        # Pre-layer norm for attention
+        attn_norm_hidden = self.attention_pre_norm(hidden_states)
+
+        # Self-attention
+        attention_output = self.attention(attn_norm_hidden, attention_mask)
+
+        # Residual connection
+        hidden_states = hidden_states + attention_output
+
+        # Pre-layer norm for feed-forward
+        ffn_norm_hidden = self.ffn_pre_norm(hidden_states)
+
+        # Feed-forward
+        ffn_output = self.ffn(ffn_norm_hidden)
+
+        # Residual connection
+        hidden_states = hidden_states + ffn_output
+
+        return hidden_states
+
+class AdvancedTransformerModel(nn.Module):
+    """Advanced Transformer model for inference"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        # Embeddings
+        self.word_embeddings = nn.Embedding(
+            config["vocab_size"],
+            config["hidden_size"],
+            padding_idx=config["pad_token_id"]
+        )
+
+        # Position embeddings
+        self.position_embeddings = nn.Embedding(config["max_position_embeddings"], config["hidden_size"])
+
+        # Embedding dropout
+        self.embedding_dropout = nn.Dropout(config["hidden_dropout_prob"])
+
+        # Transformer layers
+        self.layers = nn.ModuleList([
+            EnhancedTransformerLayer(config) for _ in range(config["num_hidden_layers"])
+        ])
+
+        # Final layer norm
+        self.final_layer_norm = nn.LayerNorm(config["hidden_size"], eps=config["layer_norm_eps"])
+
+    def forward(self, input_ids, attention_mask=None):
+        input_shape = input_ids.size()
+        batch_size, seq_length = input_shape
+
+        # Get position ids
+        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
+        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
+
+        # Get embeddings
+        word_embeds = self.word_embeddings(input_ids)
+        position_embeds = self.position_embeddings(position_ids)
+
+        # Sum embeddings
+        embeddings = word_embeds + position_embeds
+
+        # Apply dropout
+        embeddings = self.embedding_dropout(embeddings)
+
+        # Default attention mask
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=input_ids.device)
+
+        # Extended attention mask for transformer layers (1 for tokens to attend to, 0 for masked tokens)
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # Apply transformer layers
+        hidden_states = embeddings
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, extended_attention_mask)
+
+        # Final layer norm
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        return hidden_states
+
+class AdvancedPooling(nn.Module):
+    """Advanced pooling module supporting multiple pooling strategies"""
+    def __init__(self, config):
+        super().__init__()
+        self.pooling_mode = config["pooling_mode"]  # 'mean', 'max', 'cls', 'attention'
+        self.hidden_size = config["hidden_size"]
+
+        # For attention pooling
+        if self.pooling_mode == 'attention':
+            self.attention_weights = nn.Linear(config["hidden_size"], 1)
+
+        # For weighted pooling
+        elif self.pooling_mode == 'weighted':
+            self.weight_layer = nn.Linear(config["hidden_size"], 1)
+
+    def forward(self, token_embeddings, attention_mask=None):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(token_embeddings[:, :, 0])
+
+        mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+
+        if self.pooling_mode == 'cls':
+            # Use [CLS] token (first token)
+            pooled = token_embeddings[:, 0]
+
+        elif self.pooling_mode == 'max':
+            # Max pooling
+            token_embeddings = token_embeddings.clone()
+            # Set padding tokens to large negative value to exclude them from max
+            token_embeddings[mask_expanded == 0] = -1e9
+            pooled = torch.max(token_embeddings, dim=1)[0]
+
+        elif self.pooling_mode == 'attention':
+            # Attention pooling
+            weights = self.attention_weights(token_embeddings).squeeze(-1)
+            # Mask out padding tokens
+            weights = weights.masked_fill(attention_mask == 0, -1e9)
+            weights = F.softmax(weights, dim=1).unsqueeze(-1)
+            pooled = torch.sum(token_embeddings * weights, dim=1)
+
+        elif self.pooling_mode == 'weighted':
+            # Weighted average pooling
+            weights = torch.sigmoid(self.weight_layer(token_embeddings)).squeeze(-1)
+            # Apply mask
+            weights = weights * attention_mask
+            # Normalize weights
+            sum_weights = torch.sum(weights, dim=1, keepdim=True)
+            sum_weights = torch.clamp(sum_weights, min=1e-9)
+            weights = weights / sum_weights
+            # Apply weights
+            pooled = torch.sum(token_embeddings * weights.unsqueeze(-1), dim=1)
+
+        else:  # Default to mean pooling
+            # Mean pooling
+            sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
+            sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
+            pooled = sum_embeddings / sum_mask
+
+        # L2 normalize
+        pooled = F.normalize(pooled, p=2, dim=1)
+
+        return pooled
+
+class SentenceEmbeddingModel(nn.Module):
+    """Complete sentence embedding model for inference"""
+    def __init__(self, config):
+        super(SentenceEmbeddingModel, self).__init__()
+        self.config = config
+
+        # Create transformer model
+        self.transformer = AdvancedTransformerModel(config)
+
+        # Create pooling module
+        self.pooling = AdvancedPooling(config)
+
+        # Build projection module if needed
+        if "projection_dim" in config and config["projection_dim"] > 0:
+            self.use_projection = True
+            self.projection = nn.Sequential(
+                nn.Linear(config["hidden_size"], config["hidden_size"]),
+                nn.GELU(),
+                nn.Linear(config["hidden_size"], config["projection_dim"]),
+                nn.LayerNorm(config["projection_dim"], eps=config["layer_norm_eps"])
+            )
+        else:
+            self.use_projection = False
+
+    def forward(self, input_ids, attention_mask=None):
+        # Get token embeddings from transformer
+        token_embeddings = self.transformer(input_ids, attention_mask)
+
+        # Pool token embeddings
+        pooled_output = self.pooling(token_embeddings, attention_mask)
+
+        # Apply projection if enabled
+        if self.use_projection:
+            pooled_output = self.projection(pooled_output)
+            pooled_output = F.normalize(pooled_output, p=2, dim=1)
+
+        return pooled_output
+
+def convert_to_safetensors(model_path, output_path):
+    """Convert PyTorch model to safetensors format"""
+    print(f"Converting model from {model_path} to safetensors format...")
+
+    try:
+        # First try with weights_only=False to handle PyTorch 2.6+ checkpoints
+        checkpoint = torch.load(model_path, map_location="cpu", weights_only=False)
+        print("Successfully loaded checkpoint with weights_only=False")
+    except TypeError:
+        # For older PyTorch versions that don't have weights_only parameter
+        print("Falling back to default torch.load behavior for older PyTorch versions")
+        checkpoint = torch.load(model_path, map_location="cpu")
+
+    # Get model state dict
+    if "model_state_dict" in checkpoint:
+        state_dict = checkpoint["model_state_dict"]
+        print("Extracted model_state_dict from checkpoint")
+    else:
+        state_dict = checkpoint
+        print("Using entire checkpoint as state_dict")
+
+    # Save as safetensors
+    save_file(state_dict, output_path)
+    print(f"Model converted and saved to {output_path}")
+
+def load_model_and_tokenizer(model_dir, tokenizer_dir="/home/ubuntu/hindi_tokenizer"):
+    """Load the model and tokenizer for inference"""
+
+    # Load the config
+    config_path = os.path.join(model_dir, "config.json")
+    with open(config_path, "r") as f:
+        config = json.load(f)
+
+    # Load the tokenizer - use specified tokenizer directory
+    tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.model")
+    if not os.path.exists(tokenizer_path):
+        # Try other locations
+        tokenizer_path = os.path.join(model_dir, "tokenizer.model")
+        if not os.path.exists(tokenizer_path):
+            raise FileNotFoundError(f"Could not find tokenizer model at {tokenizer_path}")
+
+    tokenizer = SentencePieceTokenizerWrapper(tokenizer_path)
+    print(f"Loaded tokenizer from {tokenizer_path} with vocabulary size: {tokenizer.vocab_size}")
+
+    # Load the model
+    safetensors_path = os.path.join(model_dir, "embedding_model.safetensors")
+
+    if not os.path.exists(safetensors_path):
+        print(f"Safetensors model not found at {safetensors_path}, converting from PyTorch checkpoint...")
+
+        # Convert from PyTorch checkpoint
+        pytorch_path = os.path.join(model_dir, "embedding_model.pt")
+        if not os.path.exists(pytorch_path):
+            raise FileNotFoundError(f"Could not find PyTorch model at {pytorch_path}")
+
+        convert_to_safetensors(pytorch_path, safetensors_path)
+
+    # Load state dict from safetensors
+    state_dict = load_file(safetensors_path)
+
+    # Create model
+    model = SentenceEmbeddingModel(config)
+
+    # Load state dict
+    try:
+        # Try direct loading
+        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
+        print(f"Loaded model with missing keys: {missing_keys[:10]}{'...' if len(missing_keys) > 10 else ''}")
+        print(f"Unexpected keys: {unexpected_keys[:10]}{'...' if len(unexpected_keys) > 10 else ''}")
+    except Exception as e:
+        print(f"Error loading state dict: {e}")
+        print("Model will be initialized with random weights")
+
+    model.eval()
+
+    return model, tokenizer, config
+
+# LangChain Custom Embeddings Class
+class HindiSentenceEmbeddings(Embeddings):
+    """
+    Custom Langchain Embeddings class for Hindi sentence embeddings model
+    """
+    def __init__(self, model, tokenizer, device="cuda", batch_size=32, max_length=128):
+        """Initialize with model, tokenizer, and inference parameters"""
+        self.model = model
+        self.tokenizer = tokenizer
+        self.device = device
+        self.batch_size = batch_size
+        self.max_length = max_length
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed a list of documents/texts"""
+        embeddings = []
+
+        with torch.no_grad():
+            for i in range(0, len(texts), self.batch_size):
+                batch = texts[i:i+self.batch_size]
+
+                # Tokenize
+                inputs = self.tokenizer(
+                    batch,
+                    padding="max_length",
+                    truncation=True,
+                    max_length=self.max_length,
+                    return_tensors="pt"
+                )
+
+                # Move to device
+                input_ids = inputs["input_ids"].to(self.device)
+                attention_mask = inputs["attention_mask"].to(self.device)
+
+                # Get embeddings
+                batch_embeddings = self.model(input_ids, attention_mask)
+
+                # Move to CPU and convert to numpy
+                batch_embeddings = batch_embeddings.cpu().numpy()
+                embeddings.append(batch_embeddings)
+
+        return np.vstack(embeddings).tolist()
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a single query/text"""
+        return self.embed_documents([text])[0]
+
+def extract_relevant_sentences(text, query, window_size=2):
+    """
+    Extract the most relevant sentences from text based on query keywords
+
+    Args:
+        text: The full text content
+        query: The user's query
+        window_size: Number of sentences to include before and after matched sentence
+
+    Returns:
+        String containing the most relevant portion of the text
+    """
+    # Clean and normalize query and text for matching
+    query = query.strip().lower()
+
+    # Remove question marks and other punctuation from query for matching
+    query = re.sub(r'[?।॥!,.:]', '', query)
+
+    # Extract keywords from the query (remove common Hindi stop words)
+    stop_words = ['और', 'का', 'के', 'को', 'में', 'से', 'है', 'हैं', 'था', 'थे', 'की', 'कि', 'पर', 'एक', 'यह', 'वह', 'जो', 'ने', 'हो', 'कर']
+    query_terms = [word for word in query.split() if word not in stop_words]
+
+    if not query_terms:
+        return text  # If no meaningful terms left, return the full text
+
+    # Split text into sentences (using Hindi sentence terminators)
+    sentences = re.split(r'([।॥!?.])', text)
+
+    # Rejoin sentences with their terminators
+    complete_sentences = []
+    for i in range(0, len(sentences)-1, 2):
+        if i+1 < len(sentences):
+            complete_sentences.append(sentences[i] + sentences[i+1])
+        else:
+            complete_sentences.append(sentences[i])
+
+    # If the above didn't work properly, try simpler approach
+    if len(complete_sentences) <= 1:
+        complete_sentences = re.split(r'[।॥!?.]', text)
+        complete_sentences = [s.strip() for s in complete_sentences if s.strip()]
+
+    # Score each sentence based on how many query terms it contains
+    sentence_scores = []
+    for i, sentence in enumerate(complete_sentences):
+        sentence_lower = sentence.lower()
+        # Calculate score based on number of query terms found
+        score = sum(1 for term in query_terms if term in sentence_lower)
+        sentence_scores.append((i, score))
+
+    # Find the best matching sentence
+    if not sentence_scores:
+        return text[:500] + "..."  # Fallback
+
+    # Get the index of sentence with highest score
+    best_match_idx, best_score = max(sentence_scores, key=lambda x: x[1])
+
+    # If no good match found, return the whole text (up to a limit)
+    if best_score == 0:
+        # Try partial word matching as a fallback
+        for i, sentence in enumerate(complete_sentences):
+            sentence_lower = sentence.lower()
+            partial_score = sum(1 for term in query_terms if any(term in word.lower() for word in sentence_lower.split()))
+            if partial_score > 0:
+                best_match_idx = i
+                break
+        else:
+            # If still no match, just return the first part of the text
+            if len(text) > 1000:
+                return text[:1000] + "..."
+            return text
+
+    # Get window of sentences around the best match
+    start_idx = max(0, best_match_idx - window_size)
+    end_idx = min(len(complete_sentences), best_match_idx + window_size + 1)
+
+    # Create excerpt
+    relevant_text = ' '.join(complete_sentences[start_idx:end_idx])
+
+    # If the excerpt is short, return more context
+    if len(relevant_text) < 100 and len(text) > len(relevant_text):
+        # Add more context
+        if end_idx < len(complete_sentences):
+            relevant_text += ' ' + ' '.join(complete_sentences[end_idx:end_idx+2])
+        if start_idx > 0:
+            relevant_text = ' '.join(complete_sentences[max(0, start_idx-2):start_idx]) + ' ' + relevant_text
+
+    # If the excerpt is too short or the whole text is small anyway, return whole text
+    if len(relevant_text) < 50 or len(text) < 1000:
+        return text
+
+    return relevant_text
+
+# Text processing and indexing functions
+def load_and_process_text_file(file_path, chunk_size=500, chunk_overlap=100):
+    """
+    Load a text file and split it into semantically meaningful chunks
+    """
+    print(f"Loading and processing text file: {file_path}")
+
+    # Read the file content
+    with open(file_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    # For small files, just keep the whole content as a single chunk
+    if len(content) <= chunk_size * 2:
+        print(f"File content is small, keeping as a single chunk")
+        return [Document(
+            page_content=content,
+            metadata={
+                "source": file_path,
+                "chunk_id": 0
+            }
+        )]
+
+    # Split by paragraphs first
+    paragraphs = re.split(r'\n\s*\n', content)
+    chunks = []
+
+    current_chunk = ""
+    current_size = 0
+
+    for para in paragraphs:
+        if not para.strip():
+            continue
+
+        # If adding this paragraph would exceed the chunk size, save current chunk and start new one
+        if current_size + len(para) > chunk_size and current_size > 0:
+            chunks.append(current_chunk)
+            current_chunk = para
+            current_size = len(para)
+        else:
+            # Add paragraph to current chunk with a newline if not empty
+            if current_size > 0:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+            current_size = len(current_chunk)
+
+    # Add the last chunk if not empty
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    print(f"Split text into {len(chunks)} chunks")
+
+    # Convert to LangChain documents with metadata
+    documents = [
+        Document(
+            page_content=chunk,
+            metadata={
+                "source": file_path,
+                "chunk_id": i
+            }
+        ) for i, chunk in enumerate(chunks)
+    ]
+
+    return documents
+
+def create_vector_store(documents, embeddings, store_path=None):
+    """
+    Create a FAISS vector store from documents using the given embeddings
+    """
+    print("Creating FAISS vector store...")
+
+    # Create vector store
+    vector_store = LangchainFAISS.from_documents(documents, embeddings)
+
+    # Save if path is provided
+    if store_path:
+        print(f"Saving vector store to {store_path}")
+        vector_store.save_local(store_path)
+
+    return vector_store
+
+def load_vector_store(store_path, embeddings):
+    """
+    Load a FAISS vector store from disk
+    """
+    print(f"Loading vector store from {store_path}")
+    return LangchainFAISS.load_local(store_path, embeddings, allow_dangerous_deserialization=True)
+
+def perform_similarity_search(vector_store, query, k=6):
+    """
+    Perform basic similarity search on the vector store
+    """
+    print(f"Searching for: {query}")
+    return vector_store.similarity_search_with_score(query, k=k)
+
+# Main RAG functions
+def index_text_files(model, tokenizer, data_dir, output_dir, device="cuda", chunk_size=500):
+    """
+    Index text files from a directory and create a FAISS vector store
+    """
+    print(f"Indexing text files from {data_dir} with chunk size ({chunk_size}) for fine-grained retrieval")
+
+    # Create embedding model
+    embeddings = HindiSentenceEmbeddings(model, tokenizer, device=device)
+
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Get all text files
+    text_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.txt')]
+    print(f"Found {len(text_files)} text files")
+
+    # Process all text files
+    all_documents = []
+    for file_path in text_files:
+        documents = load_and_process_text_file(file_path, chunk_size=chunk_size)
+        all_documents.extend(documents)
+
+    print(f"Total documents: {len(all_documents)}")
+
+    # If we don't have enough chunks, reduce chunk size and try again
+    if len(all_documents) < 10 and chunk_size > 50:
+        print(f"Not enough chunks created. Reducing chunk size and trying again...")
+        return index_text_files(model, tokenizer, data_dir, output_dir, device, chunk_size=chunk_size//2)
+
+    # Create and save vector store
+    vector_store_path = os.path.join(output_dir, "faiss_index")
+    vector_store = create_vector_store(all_documents, embeddings, vector_store_path)
+
+    return vector_store, embeddings
+
+def query_text_corpus(model, tokenizer, vector_store_path, query, k=6, device="cuda"):
+    """
+    Query the text corpus using the indexed vector store
+    """
+    # Create embedding model
+    embeddings = HindiSentenceEmbeddings(model, tokenizer, device=device)
+
+    # Load vector store
+    vector_store = load_vector_store(vector_store_path, embeddings)
+
+    # Perform similarity search
+    results = perform_similarity_search(vector_store, query, k=k)
+
+    # Post-process results to combine adjacent chunks if they're from the same source
+    processed_results = []
+    seen_chunks = set()
+
+    for doc, score in results:
+        chunk_id = doc.metadata["chunk_id"]
+        source = doc.metadata["source"]
+
+        # Skip if we've already included this chunk
+        if (source, chunk_id) in seen_chunks:
+            continue
+
+        seen_chunks.add((source, chunk_id))
+
+        # Try to find adjacent chunks and combine them
+        combined_content = doc.page_content
+
+        # Look for adjacent chunks in results (both previous and next)
+        for adj_id in [chunk_id-1, chunk_id+1]:
+            for other_doc, _ in results:
+                if (other_doc.metadata["source"] == source and
+                    other_doc.metadata["chunk_id"] == adj_id and
+                    (source, adj_id) not in seen_chunks):
+
+                    # Add the adjacent chunk content
+                    if adj_id < chunk_id:  # Previous chunk
+                        combined_content = other_doc.page_content + " " + combined_content
+                    else:  # Next chunk
+                        combined_content = combined_content + " " + other_doc.page_content
+
+                    seen_chunks.add((source, adj_id))
+
+        # Create a new document with combined content
+        combined_doc = Document(
+            page_content=combined_content,
+            metadata={
+                "source": source,
+                "chunk_id": chunk_id,
+                "is_combined": True if combined_content != doc.page_content else False
+            }
+        )
+
+        processed_results.append((combined_doc, score))
+
+    return processed_results
+
+def main():
+    parser = argparse.ArgumentParser(description="Hindi RAG System with LangChain and FAISS")
+    parser.add_argument("--model_dir", type=str, default="/home/ubuntu/output/hindi-embeddings-custom-tokenizer/final",
+                        help="Directory containing the model and tokenizer")
+    parser.add_argument("--tokenizer_dir", type=str, default="/home/ubuntu/hindi_tokenizer",
+                        help="Directory containing the tokenizer")
+    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
+                        help="Device to run inference on ('cuda' or 'cpu')")
+    parser.add_argument("--index", action="store_true",
+                        help="Index text files from data directory")
+    parser.add_argument("--query", type=str, default=None,
+                        help="Query to search in the indexed corpus")
+    parser.add_argument("--data_dir", type=str, default="./data",
+                        help="Directory containing text files for indexing")
+    parser.add_argument("--output_dir", type=str, default="./output",
+                        help="Directory to save the indexed vector store")
+    parser.add_argument("--top_k", type=int, default=6,
+                        help="Number of top results to return")
+    parser.add_argument("--chunk_size", type=int, default=500,
+                        help="Size of text chunks for indexing")
+    parser.add_argument("--interactive", action="store_true",
+                        help="Run in interactive mode for querying")
+    parser.add_argument("--reindex", action="store_true",
+                        help="Force reindexing even if index exists")
+    args = parser.parse_args()
+
+    # Load model and tokenizer
+    model, tokenizer, config = load_model_and_tokenizer(args.model_dir, args.tokenizer_dir)
+
+    # Move model to device
+    model = model.to(args.device)
+
+    # Create vector store path
+    vector_store_path = os.path.join(args.output_dir, "faiss_index")
+
+    if args.index or args.reindex:
+        # Index text files
+        index_text_files(model, tokenizer, args.data_dir, args.output_dir, args.device, args.chunk_size)
+        print(f"Indexing complete. Vector store saved to {vector_store_path}")
+
+    if args.query:
+        # Query the corpus
+        results = query_text_corpus(model, tokenizer, vector_store_path, args.query, args.top_k, args.device)
+
+        # Print results
+        print("\nSearch Results:")
+        for i, (doc, score) in enumerate(results):
+            print(f"\nResult {i+1} (Score: {score:.4f}):")
+            print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")
+
+            # Extract and print only relevant sentences
+            relevant_text = extract_relevant_sentences(doc.page_content, args.query)
+            print(f"Content: {relevant_text}")
+
+    if args.interactive:
+        print("\nInteractive mode. Enter queries (or type 'quit' to exit).")
+
+        while True:
+            print("\nEnter query:")
+            query = input()
+
+            if not query.strip():
+                continue
+
+            if query.lower() == 'quit':
+                break
+
+            # Query the corpus
+            results = query_text_corpus(model, tokenizer, vector_store_path, query, args.top_k, args.device)
+
+            # Print results
+            print("\nSearch Results:")
+            for i, (doc, score) in enumerate(results):
+                print(f"\nResult {i+1} (Score: {score:.4f}):")
+                print(f"Source: {doc.metadata['source']}, Chunk: {doc.metadata['chunk_id']}")
+
+                # Extract and print only relevant sentences
+                relevant_text = extract_relevant_sentences(doc.page_content, query)
+                print(f"Content: {relevant_text}")
+
+if __name__ == "__main__":
+    main()
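One detail in the file above worth calling out: MultiHeadAttention indexes a single nn.Embedding of size 2 * max_position_embeddings - 1 with shifted pairwise offsets. A small self-contained check of that index arithmetic, mirroring the forward() code (a sketch, not part of the upload; seq_len and max_pos values are illustrative):

import torch

# For seq_len = 4 and max_position_embeddings = 512, the pairwise offset i - j
# ranges over [-(seq_len-1), seq_len-1]; shifting by max_pos - 1 maps it into
# [0, 2*max_pos - 2], exactly the valid index range of the bias table.
seq_len, max_pos = 4, 512
position_ids = torch.arange(seq_len)
relative_position = position_ids.unsqueeze(1) - position_ids.unsqueeze(0)
shifted = relative_position + max_pos - 1
assert shifted.min().item() == max_pos - seq_len        # 508
assert shifted.max().item() == max_pos + seq_len - 2    # 514, < 2*max_pos - 1 = 1023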
hindi_corpus.txt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:47477d8bbf21670e7720750755510260aca045de933697edea7620e65ac03ec6
+size 2202127229
sample_corpus.txt
CHANGED
The diff for this file is too large to render. See raw diff.
tokenizer.model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f1658727f4ea5c571f69a60f6defcd180014a1d69be6e9c1ec360d9510aa6b5e
+size 642200
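tokenizer.model is the binary SentencePiece model (642,200 bytes per the LFS pointer above). A quick load-and-encode sanity check using the same sentencepiece calls the script relies on; the sample sentence and the expected pieces are illustrative:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("tokenizer.model")
print(sp.GetPieceSize())                     # expected: 16000, matching tokenizer_config.json
print(sp.EncodeAsPieces("भारत एक विशाल देश है।"))  # e.g. ['▁भारत', '▁एक', ...]
print(sp.EncodeAsIds("भारत एक विशाल देश है।"))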
tokenizer.vocab
CHANGED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
CHANGED
@@ -12,7 +12,7 @@
   "cls_token": "<cls>",
   "sep_token": "<sep>",
   "model_max_length": 512,
-  "vocab_size":
+  "vocab_size": 16000,
   "do_lower_case": false,
   "special_tokens_map": {
     "pad_token": "<pad>",
tokenizer_evaluation.json
ADDED
@@ -0,0 +1,129 @@
+{
+  "vocab_size": 16000,
+  "num_samples": 1000,
+  "avg_tokens_per_text": 688.021,
+  "avg_chars_per_token": 4.176231539444291,
+  "token_frequency": {
+    "▁के": 22686,
+    "।": 19273,
+    "▁में": 16210,
+    "▁है": 16067,
+    "▁0": 13182,
+    "▁की": 12858,
+    "▁को": 10132,
+    "▁से": 10072,
+    "▁और": 9746,
+    "▁का": 7612,
+    "▁हैं": 6469,
+    "▁पर": 6117,
+    "▁ने": 5582,
+    "▁कि": 5028,
+    "ों": 4854,
+    "▁लिए": 4562,
+    "▁एक": 4558,
+    "▁भी": 4149,
+    "▁नहीं": 3721,
+    "ी": 3498,
+    "▁इस": 3040,
+    "▁कर": 2959,
+    "▁किया": 2772,
+    "▁यह": 2701,
+    "▁करने": 2487,
+    "▁था": 2474,
+    "▁ही": 2432,
+    "▁हो": 2428,
+    "▁साथ": 2289,
+    "ा": 2263,
+    "▁तो": 2235,
+    "▁": 2163,
+    "▁गया": 1949,
+    "े": 1904,
+    "▁कहा": 1665,
+    "▁अपने": 1596,
+    "र": 1504,
+    "▁बाद": 1476,
+    "▁रहे": 1446,
+    "▁।": 1419,
+    "▁जो": 1413,
+    "न": 1394,
+    "▁तक": 1316,
+    "▁दिया": 1297,
+    "▁आप": 1239,
+    "▁या": 1235,
+    "▁लेकिन": 1229,
+    "s": 1216,
+    "▁रहा": 1166,
+    "▁थे": 1154,
+    "▁थी": 1135,
+    "▁कुछ": 1122,
+    "▁हुए": 1074,
+    "▁वह": 1063,
+    "▁जा": 1059,
+    "▁रूप": 1059,
+    "▁पहले": 1056,
+    "ता": 1010,
+    "▁उन्होंने": 1000,
+    "▁करते": 995,
+    "▁तरह": 993,
+    "▁रही": 984,
+    "▁गई": 968,
+    "क": 946,
+    "ल": 922,
+    "▁किसी": 910,
+    "▁जाता": 895,
+    "ः": 889,
+    "▁इसके": 878,
+    "▁होने": 874,
+    "▁लोगों": 872,
+    "▁समय": 869,
+    "▁न": 860,
+    "म": 859,
+    "्स": 858,
+    "▁ये": 857,
+    "▁वाले": 856,
+    "▁भारत": 851,
+    "▁करना": 842,
+    "▁कोई": 839,
+    "▁अब": 837,
+    "▁अपनी": 837,
+    "▁क्या": 837,
+    "ने": 834,
+    "▁वे": 831,
+    "▁बहुत": 827,
+    "▁सरकार": 820,
+    "▁जब": 808,
+    "▁सकता": 808,
+    "▁मैं": 807,
+    "▁बात": 800,
+    "▁गए": 791,
+    "▁सभी": 785,
+    "ो": 785,
+    "▁व": 778,
+    "▁काम": 772,
+    "स": 759,
+    "ं": 743,
+    "▁द्वारा": 741,
+    "▁सकते": 739
+  },
+  "token_length_distribution": {
+    "1": 53072,
+    "2": 62697,
+    "3": 196220,
+    "4": 111226,
+    "5": 108766,
+    "6": 67569,
+    "7": 41320,
+    "8": 22604,
+    "9": 13027,
+    "10": 5885,
+    "11": 3074,
+    "12": 1590,
+    "13": 671,
+    "14": 225,
+    "15": 71,
+    "16": 4
+  },
+  "unicode_script_coverage": {},
+  "decodability_accuracy": 0.139,
+  "special_tokens_count": 0
+}
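The evaluation script itself is not part of this commit. For orientation, a hedged sketch of how stats of this shape could be produced over num_samples texts; only the field names follow the JSON above, everything else (counting method, piece-length convention) is an assumption:

from collections import Counter
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("tokenizer.model")

def evaluate_tokenizer(texts):
    token_freq = Counter()
    length_dist = Counter()
    total_tokens = 0
    total_chars = 0
    for text in texts:
        pieces = sp.EncodeAsPieces(text)
        total_tokens += len(pieces)
        for piece in pieces:
            token_freq[piece] += 1
            length_dist[len(piece)] += 1  # assumption: raw piece length, "▁" marker included
            total_chars += len(piece)
    return {
        "vocab_size": sp.GetPieceSize(),
        "num_samples": len(texts),
        "avg_tokens_per_text": total_tokens / len(texts),
        "avg_chars_per_token": total_chars / total_tokens,
        "token_frequency": dict(token_freq.most_common(100)),
        "token_length_distribution": {str(k): v for k, v in sorted(length_dist.items())},
    }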
training_config.json
CHANGED
@@ -1,9 +1,7 @@
 {
   "output_dir": "hindi_tokenizer_sota",
-  "vocab_size": 50000,
   "model_type": "unigram",
   "character_coverage": 0.9995,
-  "num_iterations": 2000000,
   "max_examples": 300000,
   "sample_factor": 1.0,
   "create_hf_config": true,
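This commit drops the vocab_size (50000) and num_iterations (2000000) entries from the tokenizer training config; the shipped tokenizer actually has a 16,000-piece vocabulary. For orientation, a minimal sketch of how a config like this maps onto SentencePiece training; the input path and model_prefix are illustrative, and only options visible in this config (plus the actual vocab size) are used:

import sentencepiece as spm

spm.SentencePieceTrainer.Train(
    input="hindi_corpus.txt",                       # illustrative input path
    model_prefix="hindi_tokenizer_sota/tokenizer",  # illustrative output prefix
    vocab_size=16000,                               # matches tokenizer_config.json after this commit
    model_type="unigram",                           # from training_config.json
    character_coverage=0.9995,                      # from training_config.json
)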