import html

import gradio as gr
from transformers import BertTokenizer, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from tokenizers import ByteLevelBPETokenizer
from gensim.models import FastText

# Load tokenizers (the PolyLM and ByT5 models are loaded as well, but only their
# tokenizers are used below).
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
mbert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
bpe_tokenizer = ByteLevelBPETokenizer()
fasttext_model = FastText(vector_size=100, window=5, min_count=1)
polylm_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-MT/polylm-1.7b")
polylm_model = AutoModelForCausalLM.from_pretrained("DAMO-NLP-MT/polylm-1.7b")
byt5_tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
byt5_model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")


def process_text(input_text, show_tokens, tokenizer_type, display_mode):
    """Tokenize the input with the selected tokenizer and return (display, token count)."""
    tokens = []
    if tokenizer_type == "BERT":
        tokens = bert_tokenizer.tokenize(input_text)
    elif tokenizer_type == "Multilingual BERT":
        tokens = mbert_tokenizer.tokenize(input_text)
    elif tokenizer_type == "BPE":
        # Train a small byte-level BPE vocabulary on the fly from the input text itself.
        bpe_tokenizer.train_from_iterator([input_text], vocab_size=1000, min_frequency=1)
        tokens = bpe_tokenizer.encode(input_text).tokens
    elif tokenizer_type == "FastText":
        # FastText works on whitespace-separated words, so simply split the input.
        tokens = input_text.split()
    elif tokenizer_type == "PolyLM":
        tokens = polylm_tokenizer.tokenize(input_text)
    elif tokenizer_type == "ByT5":
        tokens = byt5_tokenizer.tokenize(input_text)

    token_count = len(tokens)

    if display_mode == "Tokens":
        if show_tokens:
            # Wrap each token in a colored <span> so individual tokens are easy to tell apart.
            token_html = ""
            for idx, token in enumerate(tokens):
                color = f"hsl({(idx * 50) % 360}, 70%, 40%)"
                token_html += f'<span style="color: {color}; font-weight: bold;">{html.escape(token)}</span> '
            return token_html, token_count
        else:
            return " ".join(tokens), token_count
    elif display_mode == "Token Values":
        return str(tokens), token_count


# Build the Gradio UI.
with gr.Blocks() as demo:
    gr.Markdown("# Tokenizer Explorer")
    gr.Markdown("Choose a tokenizer and see how your text is tokenized. Toggle 'Show Tokens' to view highlighted tokens.")

    with gr.Row():
        input_text = gr.Textbox(label="Input Text", placeholder="Type your text here...", lines=5)
        output_display = gr.HTML(label="Output Display")

    with gr.Row():
        token_count_display = gr.Number(label="Number of Tokens", value=0, interactive=False)
        tokenizer_type = gr.Radio(
            ["BERT", "Multilingual BERT", "BPE", "FastText", "PolyLM", "ByT5"],
            label="Choose Tokenizer",
            value="BERT",
        )
        display_mode = gr.Radio(
            ["Tokens", "Token Values"],
            label="Display Mode",
            value="Tokens",
        )
        show_tokens = gr.Checkbox(label="Show Tokens", value=True)

    def update_output(input_text, show_tokens, tokenizer_type, display_mode):
        token_output, token_count = process_text(input_text, show_tokens, tokenizer_type, display_mode)
        return token_output, token_count

    # Re-run tokenization whenever any input or option changes.
    input_text.change(
        fn=update_output,
        inputs=[input_text, show_tokens, tokenizer_type, display_mode],
        outputs=[output_display, token_count_display],
    )
    show_tokens.change(
        fn=update_output,
        inputs=[input_text, show_tokens, tokenizer_type, display_mode],
        outputs=[output_display, token_count_display],
    )
    tokenizer_type.change(
        fn=update_output,
        inputs=[input_text, show_tokens, tokenizer_type, display_mode],
        outputs=[output_display, token_count_display],
    )
    display_mode.change(
        fn=update_output,
        inputs=[input_text, show_tokens, tokenizer_type, display_mode],
        outputs=[output_display, token_count_display],
    )

demo.launch()