from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path

initial_list_of_models = [
    "Xenova/gpt-4o",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
]

dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
if dataframe_path.exists():
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(columns=["📛 Models", "➕ Total Number of Tokens", "📘 Vocab Size", "Tokenizer Class"])

for model_name in tqdm(initial_list_of_models):
    if model_name in df["📛 Models"].values:
        continue
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "📛 Models": model_name,
                        "📘 Vocab Size": vocab_size,
                        "➕ Total Number of Tokens": number_of_tokens,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )
# Sort the dataframe so the most token-efficient tokenizers come first
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
# Save the dataframe to a JSONL file
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
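
# For reference, each line of the JSONL leaderboard file holds one record shaped
# like the dict appended above; the values below are illustrative placeholders,
# not real measurements:
# {"📛 Models": "Xenova/gpt-4o", "➕ Total Number of Tokens": 100000,
#  "📘 Vocab Size": 200000, "Tokenizer Class": "GPT2TokenizerFast"}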

def submit(model_name):
    global df
    if model_name in df["📛 Models"].values:
        return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "📛 Models": model_name,
                        "➕ Total Number of Tokens": number_of_tokens,
                        "📘 Vocab Size": vocab_size,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )
    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
    df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
    return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())

def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")
    # To keep colors apart, derive a crude channel spacing estimate from the
    # cube root of the total color space divided by the cube root of n
    spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
    max_val = 256 - spacing
    # Set to keep track of used colors
    used_colors = set()
    # List to store the result colors
    result = []
    attempts = 0
    while len(result) < n:
        # Generate a color with a random start and controlled spacing
        r = random.randint(0, max_val)
        g = random.randint(0, max_val)
        b = random.randint(0, max_val)
        # Scale up by spacing to enforce a minimum distance between colors
        r = min(255, r * spacing)
        g = min(255, g * spacing)
        b = min(255, b * spacing)
        # Format the color in hexadecimal
        color = f"#{r:02X}{g:02X}{b:02X}"
        # Ensure this color hasn't been used
        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Dynamically adjust spacing if stuck
                spacing = max(1, spacing - 1)
                max_val = 256 - spacing
                attempts = 0
    return result
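
# Illustrative usage (output is random on every call, so these hex values only
# show the shape of the result, not actual values):
# generate_distinct_colors(3)  ->  ["#4F00FF", "#00A32E", "#FF6200"]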

def decode_bpe_tokens(tokens):
    """Map byte-level BPE token strings back to readable text where possible."""
    fixed_tokens = []
    for token in tokens:
        # Check if the token starts with the special BPE space character 'Ġ'
        if token.startswith("Ġ"):
            # Reinterpret the rest of the token: its characters stand for raw
            # UTF-8 bytes, so re-encode them as Latin-1 and decode as UTF-8
            try:
                fixed_token = " " + token[1:].encode("latin-1").decode("utf-8")
            except (UnicodeDecodeError, UnicodeEncodeError):
                fixed_token = token  # Use the original token if decoding fails
        else:
            try:
                fixed_token = token.encode("latin-1").decode("utf-8")
            except (UnicodeDecodeError, UnicodeEncodeError):
                fixed_token = token  # Use the original token if decoding fails
        fixed_tokens.append(fixed_token)
    return fixed_tokens
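
# A quick sanity check of the Latin-1 round trip above: the Arabic letter
# 'ا' (U+0627) is the UTF-8 byte pair 0xD8 0xA7, which a byte-level BPE
# tokenizer renders as the two characters 'Ø§'. Reversing that:
# 'Ø§'.encode('latin-1').decode('utf-8')  ->  'ا'
# Tokens containing mapped codepoints above U+00FF (such as 'Ħ') cannot be
# Latin-1 encoded and simply fall back to their original form.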

def tokenize_text(text, chosen_model, better_tokenization=False):
    tokenizer = AutoTokenizer.from_pretrained(chosen_model, trust_remote_code=True)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))
    if better_tokenization:
        # Recover each token's surface form by growing a character prefix of the
        # remaining text until its first token matches the expected token
        final_tokenized_text = []
        for token in tokenized_text:
            correct_tokenized_text = ""
            for char in text:
                correct_tokenized_text += char
                current_tokens = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
                if current_tokens and current_tokens[0] == token:
                    final_tokenized_text.append(correct_tokenized_text)
                    text = text[len(correct_tokenized_text):]
                    break
    else:
        final_tokenized_text = tokenized_text
    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        # Label each token with its index and give every label its own color
        output.append((token, str(idx)))
        color_map[str(idx)] = random_colors[idx % len(random_colors)]
    return gr.HighlightedText(output, color_map=color_map)
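
# For illustration, with three tokens the HighlightedText value is a list of
# (token, label) pairs such as [("السلام", "0"), ("عليكم", "1"), ("ورحمة", "2")],
# and the color_map keys must match those labels, e.g.
# {"0": "#4F00FF", "1": "#00A32E", "2": "#FF6200"}
# (the token strings and hex values here are made-up placeholders).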

leaderboard_description = """The `Total Number of Tokens` in this leaderboard is the sum of tokens produced over the Arabic portion of the [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
This dataset was chosen because it represents Arabic Fusha text in a small and concentrated form.
A tokenizer that ranks highly on this leaderboard (fewer total tokens) encodes Arabic efficiently across its different dialects and forms.
"""

with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)
    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "➕ Total Number of Tokens"],
                vertical=False,
                x_label_angle=30,
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        submit_new_model_btn = gr.Button(value="Submit", variant="primary")
    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["📛 Models"].tolist(),
            value=df["📛 Models"].tolist()[0],
        )
        with gr.Row():
            submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
            checkbox = gr.Checkbox(label="Better tokenization for Arabic Text", value=False, scale=1)
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")
    submit_new_model_btn.click(submit, inputs=model_name, outputs=[dataframe, barplot, dropdown])
    submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])

demo.launch()