# Tokenizer Explorer — Hugging Face Spaces / Gradio demo.
# (The original paste carried Spaces page chrome here: "Spaces: / Build error".)
import html

import gradio as gr
from gensim.models import FastText
from tokenizers import ByteLevelBPETokenizer
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BertTokenizer,
)
# Load every tokenizer/model once at import time so per-request handlers
# don't pay the download/initialization cost. NOTE(review): these
# from_pretrained calls hit the network/cache on first run — confirm the
# deployment environment allows that at startup.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
mbert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Starts untrained; process_text trains it on the fly for each request.
bpe_tokenizer = ByteLevelBPETokenizer()
# Untrained FastText instance; process_text only whitespace-splits for the
# "FastText" option, so this model object is currently unused by the UI.
fasttext_model = FastText(vector_size=100, window=5, min_count=1)
polylm_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-MT/polylm-1.7b")
# Loaded but not invoked by process_text — only the tokenizer is used.
polylm_model = AutoModelForCausalLM.from_pretrained("DAMO-NLP-MT/polylm-1.7b")
byt5_tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
# Loaded but not invoked by process_text — only the tokenizer is used.
byt5_model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")
def process_text(input_text, show_tokens, tokenizer_type, display_mode):
    """Tokenize ``input_text`` with the selected tokenizer and format the result.

    Parameters
    ----------
    input_text : str
        Raw text typed by the user.
    show_tokens : bool
        When True and ``display_mode == "Tokens"``, render each token as a
        colored HTML chip; otherwise return plain space-joined tokens.
    tokenizer_type : str
        One of "BERT", "Multilingual BERT", "BPE", "FastText", "PolyLM",
        "ByT5". Any other value yields an empty token list.
    display_mode : str
        "Tokens" for readable output, "Token Values" for the raw token list.

    Returns
    -------
    tuple[str, int]
        (rendered output, number of tokens). Always returns a 2-tuple —
        the original fell through and returned None for unexpected modes,
        which would break the Gradio output unpacking.
    """
    if tokenizer_type == "BERT":
        tokens = bert_tokenizer.tokenize(input_text)
    elif tokenizer_type == "Multilingual BERT":
        tokens = mbert_tokenizer.tokenize(input_text)
    elif tokenizer_type == "BPE":
        # The BPE tokenizer is (re)trained on just the current input string
        # each call, so its merges reflect only this text.
        bpe_tokenizer.train_from_iterator([input_text], vocab_size=1000, min_frequency=1)
        tokens = bpe_tokenizer.encode(input_text).tokens
    elif tokenizer_type == "FastText":
        # FastText itself has no tokenizer API used here; whitespace split.
        tokens = input_text.split()
    elif tokenizer_type == "PolyLM":
        tokens = polylm_tokenizer.tokenize(input_text)
    elif tokenizer_type == "ByT5":
        tokens = byt5_tokenizer.tokenize(input_text)
    else:
        tokens = []

    token_count = len(tokens)

    if display_mode == "Tokens" and show_tokens:
        # Build colored chips; escape each token so tokens containing
        # '<', '>' or '&' cannot inject markup into the gr.HTML pane.
        chips = []
        for idx, token in enumerate(tokens):
            color = f"hsl({(idx * 50) % 360}, 70%, 40%)"
            chips.append(
                f'<span style="background-color:{color}; padding:2px; '
                f'border-radius:5px; color: black;">{html.escape(token)}</span>'
            )
        return " ".join(chips), token_count
    if display_mode == "Token Values":
        return str(tokens), token_count
    # "Tokens" with show_tokens off, or any unexpected mode: plain join.
    return " ".join(tokens), token_count
with gr.Blocks() as demo:
    gr.Markdown("# Tokenizer Explorer")
    gr.Markdown("Choose a tokenizer and see how your text is tokenized. Toggle 'Show Tokens' to view highlighted tokens.")

    with gr.Row():
        input_text = gr.Textbox(label="Input Text", placeholder="Type your text here...", lines=5)
        output_display = gr.HTML(label="Output Display")

    with gr.Row():
        token_count_display = gr.Number(label="Number of Tokens", value=0, interactive=False)
        tokenizer_type = gr.Radio(
            ["BERT", "Multilingual BERT", "BPE", "FastText", "PolyLM", "ByT5"],
            label="Choose Tokenizer",
            value="BERT",
        )
        display_mode = gr.Radio(
            ["Tokens", "Token Values"],
            label="Display Mode",
            value="Tokens",
        )
        show_tokens = gr.Checkbox(label="Show Tokens", value=True)

    # Re-render on any control change. process_text's parameter order matches
    # the widget list exactly, so it is wired directly as the callback instead
    # of going through a pass-through wrapper.
    controls = [input_text, show_tokens, tokenizer_type, display_mode]
    results = [output_display, token_count_display]
    for control in controls:
        control.change(fn=process_text, inputs=controls, outputs=results)

demo.launch()