# Tokenizers / app.py
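"""Tokenizer Explorer: a small Gradio app for comparing tokenizers.

Type some text, pick a tokenizer (BERT, multilingual BERT, byte-level BPE,
FastText-style whitespace splitting, PolyLM, or ByT5), and see the resulting
tokens highlighted along with a token count.
"""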
import gradio as gr
from transformers import BertTokenizer, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from tokenizers import ByteLevelBPETokenizer
from gensim.models import FastText
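# Implied dependencies (not pinned in this file): gradio, transformers, tokenizers,
# and gensim; the PolyLM and ByT5 model loads below also assume PyTorch is installed.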
# Pre-load all tokenizers (and, for PolyLM/ByT5, the full models) at startup.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
mbert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
bpe_tokenizer = ByteLevelBPETokenizer()
# Untrained gensim FastText model; the "FastText" option below falls back to
# whitespace splitting and never actually uses this model.
fasttext_model = FastText(vector_size=100, window=5, min_count=1)
polylm_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-MT/polylm-1.7b")
polylm_model = AutoModelForCausalLM.from_pretrained("DAMO-NLP-MT/polylm-1.7b")
byt5_tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
byt5_model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")
# Note: only the tokenizers are used below; polylm_model and byt5_model are
# loaded but never called.
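# Optional alternative (not part of the original app): the tokenizers could be
# loaded lazily on first use to cut startup time and memory, for example:
#
#     from functools import lru_cache
#
#     @lru_cache(maxsize=None)
#     def get_tokenizer(name):
#         return AutoTokenizer.from_pretrained(name)
#
# Eager loading is kept above so the app's behavior is unchanged.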
def process_text(input_text, show_tokens, tokenizer_type, display_mode):
    """Tokenize input_text with the selected tokenizer and return (output, token count)."""
    tokens = []
    if tokenizer_type == "BERT":
        tokens = bert_tokenizer.tokenize(input_text)
    elif tokenizer_type == "Multilingual BERT":
        tokens = mbert_tokenizer.tokenize(input_text)
    elif tokenizer_type == "BPE":
        # Retrain a small byte-level BPE vocabulary from the input text alone,
        # then encode that same text with it.
        bpe_tokenizer.train_from_iterator([input_text], vocab_size=1000, min_frequency=1)
        tokens = bpe_tokenizer.encode(input_text).tokens
    elif tokenizer_type == "FastText":
        # The gensim FastText model is word-based, so simple whitespace splitting
        # stands in for tokenization here (the model itself is not used).
        tokens = input_text.split()
    elif tokenizer_type == "PolyLM":
        tokens = polylm_tokenizer.tokenize(input_text)
    elif tokenizer_type == "ByT5":
        tokens = byt5_tokenizer.tokenize(input_text)

    token_count = len(tokens)

    if display_mode == "Tokens":
        if show_tokens:
            # Render each token as a colored span, cycling hues so adjacent
            # tokens are visually distinct.
            token_html = ""
            for idx, token in enumerate(tokens):
                color = f"hsl({(idx * 50) % 360}, 70%, 40%)"
                token_html += f'<span style="background-color:{color}; padding:2px; border-radius:5px; color: black;">{token}</span> '
            return token_html, token_count
        return " ".join(tokens), token_count

    # "Token Values": show the raw token list as its Python repr.
    return str(tokens), token_count
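# Rough example: process_text("Hello world!", False, "BERT", "Tokens") returns
# something like ("Hello world !", 3), since bert-base-cased keeps these short
# words as single WordPiece tokens (exact output may vary by model revision).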
with gr.Blocks() as demo:
    gr.Markdown("# Tokenizer Explorer")
    gr.Markdown("Choose a tokenizer and see how your text is tokenized. Toggle 'Show Tokens' to view highlighted tokens.")

    with gr.Row():
        input_text = gr.Textbox(label="Input Text", placeholder="Type your text here...", lines=5)
        output_display = gr.HTML(label="Output Display")

    with gr.Row():
        token_count_display = gr.Number(label="Number of Tokens", value=0, interactive=False)
        tokenizer_type = gr.Radio(
            ["BERT", "Multilingual BERT", "BPE", "FastText", "PolyLM", "ByT5"],
            label="Choose Tokenizer",
            value="BERT",
        )
        display_mode = gr.Radio(
            ["Tokens", "Token Values"],
            label="Display Mode",
            value="Tokens",
        )
        show_tokens = gr.Checkbox(label="Show Tokens", value=True)

    # Thin wrapper so all event listeners below share a single callback.
    def update_output(input_text, show_tokens, tokenizer_type, display_mode):
        token_output, token_count = process_text(input_text, show_tokens, tokenizer_type, display_mode)
        return token_output, token_count
    # Re-run tokenization whenever the text, tokenizer, display mode, or
    # "Show Tokens" checkbox changes.
    for component in (input_text, show_tokens, tokenizer_type, display_mode):
        component.change(
            fn=update_output,
            inputs=[input_text, show_tokens, tokenizer_type, display_mode],
            outputs=[output_display, token_count_display],
        )
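# launch() starts the app; on a Hugging Face Space this is all that is needed.
# For a local run, gr.Blocks.launch() also accepts options such as share=True
# or server_port, which are left at their defaults here.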
demo.launch()