from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path

# Tokenizers used to populate the leaderboard on first launch.
initial_list_of_models = [
    "Xenova/gpt-4o",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
]

# Arabic sentences used to count how many tokens each tokenizer produces.
dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"

if dataframe_path.exists():
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(
        columns=["Models", "Total Number of Tokens", "Vocab Size", "Tokenizer Class"]
    )

# Benchmark the initial models that are not already in the leaderboard.
for model_name in tqdm(initial_list_of_models):
    if model_name in df["Models"].values:
        continue
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = df._append(
        {
            "Models": model_name,
            "Vocab Size": vocab_size,
            "Total Number of Tokens": number_of_tokens,
            "Tokenizer Class": tokenizer.__class__.__name__,
        },
        ignore_index=True,
    )

# Sort the dataframe by the number of tokens
df = df.sort_values(by="Total Number of Tokens", ascending=True)

# Save the dataframe to a jsonl file
df.to_json(dataframe_path, lines=True, orient="records")


# Gradio Functions
def refresh():
    global df
    df = df.sort_values(by="Total Number of Tokens", ascending=True)
    return gr.Dataframe(df), gr.BarPlot(df)


def submit(model_name):
    global df
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = df._append(
        {
            "Models": model_name,
            "Vocab Size": vocab_size,
            "Total Number of Tokens": number_of_tokens,
            "Tokenizer Class": tokenizer.__class__.__name__,
        },
        ignore_index=True,
    )


def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")

    # To ensure colors are distinct, calculate an appropriate distance between colors.
    # The cube root of the total color space divided by the cube root of n gives a
    # crude initial spacing estimate.
    spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
    max_val = 256 - spacing

    # Set to keep track of used colors
    used_colors = set()
    # List to store the result colors
    result = []

    attempts = 0
    while len(result) < n:
        # Generate a color with a random start and controlled spacing
        r = random.randint(0, max_val)
        g = random.randint(0, max_val)
        b = random.randint(0, max_val)

        # Scale up by spacing to ensure minimum distance between colors
        r = min(255, r * spacing)
        g = min(255, g * spacing)
        b = min(255, b * spacing)

        # Format the color in hexadecimal
        color = f"#{r:02X}{g:02X}{b:02X}"

        # Ensure this color hasn't been used
        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Dynamically adjust spacing if stuck
                spacing = max(1, spacing - 1)
                max_val = 256 - spacing
                attempts = 0

    return result


def decode_bpe_tokens(tokens):
    fixed_tokens = []
    for token in tokens:
        # Check if the token starts with the special BPE space character 'Ġ'
        if token.startswith('Ġ'):
            try:
                # Replace 'Ġ' with a space and round-trip the rest of the token through UTF-8
                fixed_token = ' ' + token[1:].encode('utf-8').decode('utf-8')
            except UnicodeDecodeError:
                fixed_token = token  # Use the original token if decoding fails
        else:
            try:
                # Directly encode and decode without misinterpretation steps
                fixed_token = token.encode('utf-8').decode('utf-8')
            except UnicodeDecodeError:
                fixed_token = token  # Use the original token if decoding fails
        fixed_tokens.append(fixed_token)
    return fixed_tokens


def decode_arabic_tokens(tokens):
    decoded_tokens = []
    for token in tokens:
        decoded_token = token.encode('latin-1', 'backslashreplace').decode('unicode-escape')
        decoded_tokens.append(decoded_token)
    return decoded_tokens


def tokenize_text(text, chosen_model):
    tokenizer = AutoTokenizer.from_pretrained(chosen_model)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    # tokenized_text = decode_arabic_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))
    print(tokenized_text)

    output = []
    color_map = {}
    for idx, token in enumerate(tokenized_text):
        output.append((token, str(idx)))
        color_map[str(idx)] = random_colors[idx % len(random_colors)]
    return gr.HighlightedText(output, color_map=color_map)


leaderboard_description = """The numbers in this leaderboard are the total number of tokens each tokenizer produces for the Arabic dataset [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations).
"""

with gr.Blocks() as demo:

    gr.HTML("<h1 style='text-align: center;'>Arabic Tokenizers Leaderboard</h1>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)

    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="Models",
                y="Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["Vocab Size", "Total Number of Tokens"],
                vertical=False,
                x_label_angle=30,
                caption="Total Number of Tokens",
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        with gr.Row():
            refresh_btn = gr.Button(value="Refresh")
            submit_new_model_btn = gr.Button(value="Submit", variant="primary")

    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(
            label="Enter a text",
            lines=5,
            value="السلام عليكم ورحمة الله",
            rtl=True,
            text_align="right",
        )
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["Models"].tolist(),
            value=df["Models"].tolist()[0],
        )
        submit_text_btn = gr.Button(value="Submit", variant="primary")
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")

    # Wire up the buttons
    submit_new_model_btn.click(submit, model_name)
    refresh_btn.click(refresh, outputs=[dataframe, barplot])
    submit_text_btn.click(
        tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox]
    )

demo.launch()