from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path

initial_list_of_models = [
    "Xenova/gpt-4o",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
]

dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
if dataframe_path.exists():
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(columns=["📛 Models", "➕ Total Number of Tokens", "📘 Vocab Size", "Tokenizer Class"])

for model_name in tqdm(initial_list_of_models):
    if model_name in df["📛 Models"].values:
        continue
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "📛 Models": model_name,
                        "📘 Vocab Size": vocab_size,
                        "➕ Total Number of Tokens": number_of_tokens,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )
# Sort the dataframe so the most token-efficient tokenizers come first
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
# Save the dataframe to a JSONL file
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
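
# For reference, each line of the JSONL leaderboard file holds one record shaped
# like the dict appended above; the values below are illustrative placeholders,
# not real measurements:
# {"📛 Models": "Xenova/gpt-4o", "➕ Total Number of Tokens": 100000,
#  "📘 Vocab Size": 200000, "Tokenizer Class": "GPT2TokenizerFast"}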

def submit(model_name):
    global df
    if model_name in df["📛 Models"].values:
        return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "📛 Models": model_name,
                        "➕ Total Number of Tokens": number_of_tokens,
                        "📘 Vocab Size": vocab_size,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )
    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
    df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
    return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())

def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")
    # To keep colors apart, derive a crude channel spacing estimate from the
    # cube root of the total color space divided by the cube root of n
    spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
    max_val = 256 - spacing
    # Set to keep track of used colors
    used_colors = set()
    # List to store the result colors
    result = []
    attempts = 0
    while len(result) < n:
        # Generate a color with a random start and controlled spacing
        r = random.randint(0, max_val)
        g = random.randint(0, max_val)
        b = random.randint(0, max_val)
        # Scale up by spacing to enforce a minimum distance between colors
        r = min(255, r * spacing)
        g = min(255, g * spacing)
        b = min(255, b * spacing)
        # Format the color in hexadecimal
        color = f"#{r:02X}{g:02X}{b:02X}"
        # Ensure this color hasn't been used
        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Dynamically adjust spacing if stuck
                spacing = max(1, spacing - 1)
                max_val = 256 - spacing
                attempts = 0
    return result
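
# Illustrative usage (output is random on every call, so these hex values only
# show the shape of the result, not actual values):
# generate_distinct_colors(3)  ->  ["#4F00FF", "#00A32E", "#FF6200"]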

def decode_bpe_tokens(tokens):
    """Map byte-level BPE token strings back to readable text where possible."""
    fixed_tokens = []
    for token in tokens:
        # Check if the token starts with the special BPE space character 'Ġ'
        if token.startswith("Ġ"):
            # Reinterpret the rest of the token: its characters stand for raw
            # UTF-8 bytes, so re-encode them as Latin-1 and decode as UTF-8
            try:
                fixed_token = " " + token[1:].encode("latin-1").decode("utf-8")
            except (UnicodeDecodeError, UnicodeEncodeError):
                fixed_token = token  # Use the original token if decoding fails
        else:
            try:
                fixed_token = token.encode("latin-1").decode("utf-8")
            except (UnicodeDecodeError, UnicodeEncodeError):
                fixed_token = token  # Use the original token if decoding fails
        fixed_tokens.append(fixed_token)
    return fixed_tokens
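
# A quick sanity check of the Latin-1 round trip above: the Arabic letter
# 'ا' (U+0627) is the UTF-8 byte pair 0xD8 0xA7, which a byte-level BPE
# tokenizer renders as the two characters 'Ø§'. Reversing that:
# 'Ø§'.encode('latin-1').decode('utf-8')  ->  'ا'
# Tokens containing mapped codepoints above U+00FF (such as 'Ħ') cannot be
# Latin-1 encoded and simply fall back to their original form.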

def tokenize_text(text, chosen_model, better_tokenization=False):
    tokenizer = AutoTokenizer.from_pretrained(chosen_model, trust_remote_code=True)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))
    if better_tokenization:
        # Recover each token's surface form by growing a character prefix of the
        # remaining text until its first token matches the expected token
        final_tokenized_text = []
        for token in tokenized_text:
            correct_tokenized_text = ""
            for char in text:
                correct_tokenized_text += char
                current_tokens = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
                if current_tokens and current_tokens[0] == token:
                    final_tokenized_text.append(correct_tokenized_text)
                    text = text[len(correct_tokenized_text):]
                    break
    else:
        final_tokenized_text = tokenized_text
    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        # Label each token with its index and give every label its own color
        output.append((token, str(idx)))
        color_map[str(idx)] = random_colors[idx % len(random_colors)]
    return gr.HighlightedText(output, color_map=color_map)
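
# For illustration, with three tokens the HighlightedText value is a list of
# (token, label) pairs such as [("السلام", "0"), ("عليكم", "1"), ("ورحمة", "2")],
# and the color_map keys must match those labels, e.g.
# {"0": "#4F00FF", "1": "#00A32E", "2": "#FF6200"}
# (the token strings and hex values here are made-up placeholders).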

leaderboard_description = """The `Total Number of Tokens` in this leaderboard is the sum of tokens produced over the Arabic portion of the [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
This dataset was chosen because it represents Arabic Fusha text in a small and concentrated form.
A tokenizer that ranks highly on this leaderboard (fewer total tokens) encodes Arabic efficiently across its different dialects and forms.
"""

with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)
    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "➕ Total Number of Tokens"],
                vertical=False,
                x_label_angle=30,
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        submit_new_model_btn = gr.Button(value="Submit", variant="primary")
    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["📛 Models"].tolist(),
            value=df["📛 Models"].tolist()[0],
        )
        with gr.Row():
            submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
            checkbox = gr.Checkbox(label="Better tokenization for Arabic Text", value=False, scale=1)
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")
    submit_new_model_btn.click(submit, inputs=model_name, outputs=[dataframe, barplot, dropdown])
    submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])

demo.launch()