import gradio as gr
import tiktoken
from transformers import AutoTokenizer
import os

# Model mappings (UI key -> Hugging Face repo id)
MODEL_MAP = {
    'llama-2': 'meta-llama/Llama-2-7b-hf',
    'llama-3': 'meta-llama/Meta-Llama-3-8B',
    'gemma-2': 'google/gemma-2-2b',
    'qwen3': 'Qwen/Qwen2.5-0.5B',  # the Qwen2.5 tokenizer is used for the 'qwen3' option
    'bert': 'bert-base-uncased'
}


def tokenize_with_tiktoken(text, model):
    # GPT-4 uses the cl100k_base encoding; everything else falls back to the gpt2 encoding
    encoding = 'cl100k_base' if model == 'gpt-4' else 'gpt2'
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)
    token_texts = [enc.decode([token]) for token in tokens]

    return {
        'model': 'GPT-4' if model == 'gpt-4' else 'GPT-2',
        'token_count': len(tokens),
        'tokens': token_texts,
        'token_ids': tokens  # enc.encode() already returns a plain list of ints
    }


def tokenize_with_hf(text, model):
    try:
        model_name = MODEL_MAP.get(model, 'gpt2')
        # Gated repos (e.g. LLaMA, Gemma) require an HF access token
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.getenv('HF_TOKEN'))
        tokens = tokenizer.encode(text)
        token_texts = [tokenizer.decode([token], skip_special_tokens=False) for token in tokens]

        return {
            'model': model.upper(),
            'token_count': len(tokens),
            'tokens': token_texts,
            'token_ids': tokens
        }
    except Exception as e:
        # Surface the error in the UI instead of crashing the app
        return {
            'model': model.upper(),
            'token_count': 0,
            'tokens': [f"Error: {str(e)}"],
            'token_ids': []
        }


def compare_tokenizers(text, selected_models):
    if not text.strip():
        return "Please enter some text to tokenize."

    results = []
    for model in selected_models:
        if model in ['gpt-4', 'gpt-2']:
            result = tokenize_with_tiktoken(text, model)
        else:
            result = tokenize_with_hf(text, model)

        # Format output: show the first 20 tokens, rendering whitespace-only tokens as "·"
        tokens_display = ' | '.join([
            f'"{token}"' if token.strip() else '"·"'
            for token in result['tokens'][:20]
        ])
        if len(result['tokens']) > 20:
            tokens_display += f" ... (+{len(result['tokens']) - 20} more)"

        results.append(f"""
**{result['model']}**
- Token Count: **{result['token_count']}**
- Tokens: {tokens_display}
- Token IDs: {str(result['token_ids'][:10])}{'...' if len(result['token_ids']) > 10 else ''}
""")

    return "\n\n---\n".join(results)


# Create Gradio interface
with gr.Blocks(
    title="🔤 Tokenizer Comparison Tool",
    theme=gr.themes.Soft()
) as demo:
    gr.Markdown("""
    # 🔤 Tokenizer Comparison Tool

    Compare how different LLM tokenizers split text into tokens.
    See the differences between GPT, LLaMA, Gemma, and other models.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Hello world! This is a test with some subwords and punctuation.",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation."
            )
        with gr.Column(scale=1):
            model_selector = gr.CheckboxGroup(
                choices=['gpt-4', 'gpt-2', 'llama-2', 'llama-3', 'gemma-2', 'qwen3', 'bert'],
                value=['gpt-4', 'llama-3', 'gpt-2'],
                label="Select tokenizers to compare"
            )

    output = gr.Markdown(
        label="Tokenization Results",
        value="Enter text above to see tokenization results..."
    )

    # Auto-update on text or model change
    text_input.change(
        fn=compare_tokenizers,
        inputs=[text_input, model_selector],
        outputs=output
    )
    model_selector.change(
        fn=compare_tokenizers,
        inputs=[text_input, model_selector],
        outputs=output
    )

    gr.Markdown("""
    ### Legend:
    - **Token Count**: Number of tokens the model uses
    - **Tokens**: The actual text pieces (subwords)
    - **Token IDs**: Numerical IDs in the vocabulary
    - **"·"**: Represents spaces/whitespace

    ### Models:
    - **GPT-4/GPT-2**: OpenAI tokenizers (tiktoken)
    - **LLaMA**: Meta's models (SentencePiece)
    - **Gemma**: Google's models
    - **Qwen**: Alibaba's models
    - **BERT**: Google's BERT tokenizer
    """)

if __name__ == "__main__":
    demo.launch()
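
# A minimal smoke test for the tokenizer helpers outside the Gradio UI.
# This is a sketch, not part of the app: it exercises only the GPT-2 path,
# which needs no HF_TOKEN, and assumes tiktoken is installed. Left commented
# out so importing this module never runs it:
#
#   result = tokenize_with_tiktoken("Hello world!", "gpt-2")
#   print(result['token_count'], result['tokens'], result['token_ids'])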