Spaces:

MohamedRashad
/

arabic-tokenizers-leaderboard

Running

File size: 8,298 Bytes

from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path

# Tokenizers that are benchmarked automatically when the app starts.
initial_list_of_models = [
    "Xenova/gpt-4o",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
]

# The "arabic" column of the train split is the text every tokenizer is run on.
dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]

# Leaderboard results are cached as JSON Lines next to this script; start from
# the cached table when it exists, otherwise from an empty table with the
# leaderboard columns.
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
df = (
    pd.read_json(dataframe_path, lines=True)
    if dataframe_path.exists()
    else pd.DataFrame(
        columns=[
            "📛 Models",
            "➕ Total Number of Tokens",
            "📘 Vocab Size",
            "Tokenizer Class",
        ]
    )
)

# Benchmark every default tokenizer that is not already in the cached leaderboard.
for model_name in tqdm(initial_list_of_models):
    if model_name in df["📛 Models"].values:
        continue  # already scored, nothing to recompute
    # NOTE: trust_remote_code=True runs code shipped with the model repository;
    # in this loop it is only applied to the hard-coded list of models.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    # Score: total number of token ids produced over the whole dataset.
    number_of_tokens = sum(len(ids) for ids in tokenizer(dataset).input_ids)
    new_row = pd.DataFrame(
        [
            {
                "📛 Models": model_name,
                "📘 Vocab Size": vocab_size,
                "➕ Total Number of Tokens": number_of_tokens,
                "Tokenizer Class": tokenizer.__class__.__name__,
            }
        ]
    )
    # pd.concat is the public API; DataFrame._append is private and may be
    # removed or changed without notice.
    df = pd.concat([df, new_row], ignore_index=True)

# Sort the dataframe by the number of tokens (fewest tokens first)
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)

# Save the dataframe as a JSON Lines file (one record per line, emoji kept as-is)
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)

def _leaderboard_outputs(frame):
    """Build the (table, bar plot, dropdown) components returned to the UI."""
    return (
        gr.Dataframe(frame),
        gr.BarPlot(frame),
        gr.Dropdown(choices=frame["📛 Models"].tolist()),
    )


def submit(model_name):
    """Add a Hugging Face tokenizer to the leaderboard and refresh the UI.

    The tokenizer is loaded, run over the whole ``dataset``, and appended to
    the module-level ``df``, which is then re-sorted and written back to
    ``dataframe_path``. A model that is already listed is not recomputed.

    Args:
        model_name: Hugging Face model id typed by the user. Surrounding
            whitespace is ignored.

    Returns:
        A tuple ``(gr.Dataframe, gr.BarPlot, gr.Dropdown)`` built from the
        current leaderboard.

    Raises:
        gr.Error: If the name is empty or the tokenizer cannot be loaded.
    """
    global df
    model_name = (model_name or "").strip()
    if not model_name:
        raise gr.Error("Please enter a model name.")
    if model_name in df["📛 Models"].values:
        return _leaderboard_outputs(df)

    # SECURITY NOTE(review): model_name is user input and trust_remote_code=True
    # lets the referenced repository execute arbitrary Python in this process.
    # Kept as-is because some tokenizers need it, but this should be reviewed.
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, use_fast=True, trust_remote_code=True
        )
    except (OSError, ValueError) as err:
        # Surface a readable message in the UI instead of an opaque failure.
        raise gr.Error(
            f"Could not load a tokenizer for '{model_name}': {err}"
        ) from err

    vocab_size = tokenizer.vocab_size
    # Score: total number of token ids produced over the whole dataset.
    number_of_tokens = sum(len(ids) for ids in tokenizer(dataset).input_ids)
    new_row = pd.DataFrame(
        [
            {
                "📛 Models": model_name,
                "➕ Total Number of Tokens": number_of_tokens,
                "📘 Vocab Size": vocab_size,
                "Tokenizer Class": tokenizer.__class__.__name__,
            }
        ]
    )
    # pd.concat is the public replacement for the private DataFrame._append.
    df = pd.concat([df, new_row], ignore_index=True)
    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
    df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
    return _leaderboard_outputs(df)

def generate_distinct_colors(n):
    """Generate n distinct colors in hexadecimal ``#RRGGBB`` format.

    Channel values are taken from an evenly spaced grid whose step is derived
    from ``n`` so that the colors are spread over the RGB cube; ``n`` distinct
    grid points are then drawn at random.

    Args:
        n: Number of colors requested.

    Returns:
        A list of ``n`` unique color strings in random order. An empty list
        is returned when ``n`` is zero or negative.

    Raises:
        ValueError: If ``n`` exceeds the 256**3 colors that exist.
    """
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")
    if n <= 0:
        # n ** (1 / 3) is 0 (or complex for negatives), which would break the
        # spacing computation below; there is simply nothing to generate.
        return []

    # A grid with about n ** (1/3) levels per channel holds roughly n colors.
    # Clamp the step to [1, 255] so every channel has at least two levels.
    spacing = max(1, min(255, int(256 / n ** (1 / 3))))
    levels = range(0, 256, spacing)
    # Guard against rounding leaving the grid slightly too small. A step of 1
    # gives 256 levels per channel, which always suffices after the check above.
    while len(levels) ** 3 < n:
        spacing -= 1
        levels = range(0, 256, spacing)

    count = len(levels)
    # Sampling indices from a lazy range guarantees uniqueness without a retry
    # loop and without materializing the whole grid.
    result = []
    for pick in random.sample(range(count**3), n):
        rest, b_idx = divmod(pick, count)
        r_idx, g_idx = divmod(rest, count)
        r, g, b = levels[r_idx], levels[g_idx], levels[b_idx]
        result.append(f"#{r:02X}{g:02X}{b:02X}")
    return result

def _build_byte_decoder():
    """Return the inverse of the GPT-2 byte-to-unicode table (char -> byte)."""
    # Bytes that GPT-2 style byte-level BPE represents by the character with
    # the same code point: '!'..'~', U+00A1..U+00AC and U+00AE..U+00FF.
    printable = (
        list(range(0x21, 0x7F)) + list(range(0xA1, 0xAD)) + list(range(0xAE, 0x100))
    )
    decoder = {chr(byte): byte for byte in printable}
    printable_set = set(printable)
    shift = 0
    # Every remaining byte is represented by the next unused code point
    # starting at U+0100 (so the space byte 0x20 becomes 'Ġ', U+0120).
    for byte in range(256):
        if byte not in printable_set:
            decoder[chr(256 + shift)] = byte
            shift += 1
    return decoder


_BYTE_DECODER = _build_byte_decoder()


def decode_bpe_tokens(tokens):
    """Turn byte-level BPE token strings back into readable text.

    Byte-level BPE tokenizers expose each UTF-8 byte of a token as one
    printable character, which makes non-ASCII text (e.g. Arabic) unreadable.
    Each token is mapped back to its bytes and decoded as UTF-8.

    A token is left as it was when it cannot be decoded: it contains a
    character outside the byte table (the tokenizer is not byte-level), or its
    bytes are not valid UTF-8 on their own (the token covers only part of a
    multi-byte character). In that case only a leading 'Ġ' is replaced by a
    space, as before.

    NOTE(review): a token from a non-byte-level tokenizer that happens to
    consist only of table characters forming valid UTF-8 would also be
    rewritten; this is expected to be rare.

    Args:
        tokens: Iterable of token strings as returned by ``tokenizer.tokenize``.

    Returns:
        A list with one string per input token, in the same order.
    """
    fixed_tokens = []
    for token in tokens:
        try:
            fixed_token = bytes(_BYTE_DECODER[char] for char in token).decode("utf-8")
        except (KeyError, UnicodeDecodeError):
            # Not decodable on its own: only normalize the BPE space marker.
            fixed_token = " " + token[1:] if token.startswith("Ġ") else token
        fixed_tokens.append(fixed_token)
    return fixed_tokens

def tokenize_text(text, chosen_model, better_tokenization=False):
    """Tokenize ``text`` with ``chosen_model`` and return it color-coded.

    Args:
        text: The text to tokenize.
        chosen_model: Hugging Face model id whose tokenizer is loaded.
        better_tokenization: When true, each displayed piece is cut from the
            original text instead of using the tokenizer's token string: the
            remaining text is grown one character at a time until the first
            token of that prefix equals the expected token. Tokens for which
            no prefix matches are omitted from the output.

    Returns:
        A ``gr.HighlightedText`` whose value is a list of ``(piece, label)``
        pairs, with one distinct color per label.
    """
    tokenizer = AutoTokenizer.from_pretrained(chosen_model)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    if not tokenized_text:
        # Empty input: nothing to color, and no colors need to be generated.
        return gr.HighlightedText([])
    random_colors = generate_distinct_colors(len(tokenized_text))

    if better_tokenization:
        final_tokenized_text = []
        remaining = text
        for token in tokenized_text:
            for end in range(1, len(remaining) + 1):
                prefix = remaining[:end]
                leading = decode_bpe_tokens(tokenizer.tokenize(prefix))
                # A prefix may tokenize to nothing; guard before indexing.
                if leading and leading[0] == token:
                    final_tokenized_text.append(prefix)
                    remaining = remaining[end:]
                    break
    else:
        final_tokenized_text = tokenized_text
    print(final_tokenized_text)

    output = []
    color_map = {}
    # final_tokenized_text never has more entries than random_colors, so zip
    # pairs every piece with its own color.
    for idx, (token, color) in enumerate(zip(final_tokenized_text, random_colors)):
        # The label attached to the piece and the color_map key must be the
        # same string, otherwise the piece is not given its color.
        label = str(idx)
        output.append((token, label))
        color_map[label] = color

    # color_map is passed by keyword so the call does not depend on the
    # positional parameter order of gr.HighlightedText.
    return gr.HighlightedText(output, color_map=color_map)

# Markdown text explaining how the leaderboard score is computed; it is passed
# to gr.Markdown when the interface is built.
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens summed on the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner.
A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.
"""

# Build the Gradio interface: a "Leaderboard" tab showing the results table and
# bar plot, and a "Try tokenizers" tab for tokenizing user-provided text.
with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)
    with gr.Tab(label="Leaderboard"):
        # Table and plot are both fed from the module-level leaderboard df.
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "➕ Total Number of Tokens"],
                vertical=False,
                x_label_angle=30,
            )
        # Free-form model id; handed to submit() when the button is clicked.
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        submit_new_model_btn = gr.Button(value="Submit", variant="primary")
    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
        # Choices are the models currently in df; the default is the first row,
        # which is the model with the fewest tokens because df was sorted
        # ascending before the interface is built.
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["📛 Models"].tolist(),
            value=df["📛 Models"].tolist()[0],
        )
        with gr.Row():
            submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
            checkbox = gr.Checkbox(label="Better tokenization for Arabic Text", value=False, scale=1)
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")

    # Wire the buttons: submit() returns new values for the table, the plot and
    # the model dropdown; tokenize_text() fills the highlighted text box.
    submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
    submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])


# Launch the app; this runs whenever the module is executed or imported, since
# there is no __main__ guard.
demo.launch()