Commit a9aadc2
Parent(s): 0384cfb
Refactor tokenize_text function to include better tokenization for Arabic Text
app.py CHANGED
@@ -133,28 +133,24 @@ def decode_bpe_tokens(tokens):
         fixed_tokens.append(fixed_token)
     return fixed_tokens
 
-def
-    decoded_tokens = []
-    for token in tokens:
-        decoded_token = token.encode('latin-1', 'backslashreplace').decode('unicode-escape')
-        decoded_tokens.append(decoded_token)
-    return decoded_tokens
-
-def tokenize_text(text, chosen_model):
+def tokenize_text(text, chosen_model, better_tokenization=False):
     tokenizer = AutoTokenizer.from_pretrained(chosen_model)
     tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
     random_colors = generate_distinct_colors(len(tokenized_text))
 
-
-
-
-
-
-
-
-
-
-
+    if better_tokenization:
+        final_tokenized_text = []
+        for token in tokenized_text:
+            correct_tokenized_text = ""
+            for char in text:
+                correct_tokenized_text += char
+                current_token = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
+                if current_token[0] == token:
+                    final_tokenized_text.append(correct_tokenized_text)
+                    text = text[len(correct_tokenized_text):]
+                    break
+    else:
+        final_tokenized_text = tokenized_text
     print(final_tokenized_text)
 
     output = []
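Note on the hunk above: when better_tokenization is enabled, each decoded BPE token is mapped back to the exact span of the original input it came from. The loop grows a prefix of the remaining text one character at a time, re-tokenizes that prefix, and once the prefix's first token equals the current token it records the prefix as that token's surface form and drops it from the text, so the highlighted pieces read as the input does rather than as decoded byte-level fragments. Below is a minimal, self-contained sketch of the same greedy alignment idea; toy_tokenize is a hypothetical whitespace tokenizer standing in for decode_bpe_tokens(tokenizer.tokenize(...)) so the example runs without loading a model.

# A minimal sketch of the greedy prefix alignment used by the
# better_tokenization branch. toy_tokenize is a made-up stand-in for
# decode_bpe_tokens(tokenizer.tokenize(...)).

def toy_tokenize(text):
    # Stand-in tokenizer: splits on whitespace; a real BPE tokenizer
    # would return subword pieces instead.
    return text.split()

def align_tokens_to_text(text, tokens):
    # For each token, grow a prefix of the remaining text one character
    # at a time until the prefix's first token matches, then record the
    # prefix as that token's surface form and consume it from the text.
    surface_forms = []
    for token in tokens:
        prefix = ""
        for char in text:
            prefix += char
            current = toy_tokenize(prefix)
            if current and current[0] == token:
                surface_forms.append(prefix)
                text = text[len(prefix):]
                break
    return surface_forms

if __name__ == "__main__":
    sample = "مرحبا بالعالم"  # "Hello, world" in Arabic
    tokens = toy_tokenize(sample)
    print(align_tokens_to_text(sample, tokens))  # ['مرحبا', ' بالعالم']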
@@ -199,11 +195,13 @@ with gr.Blocks() as demo:
         choices=df["📛 Models"].tolist(),
         value=df["📛 Models"].tolist()[0],
     )
-
+    with gr.Row():
+        submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
+        checkbox = gr.Checkbox(label="Better tokenization for Arabic Text", value=False, scale=1)
     tokenized_textbox = gr.HighlightedText(label="Tokenized text")
 
     submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
-    submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])
+    submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])
 
 
 demo.launch()
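The UI change above places the Submit button and the new checkbox in one gr.Row() and passes the checkbox value as the third input of tokenize_text, where it becomes the better_tokenization argument. A minimal, self-contained sketch of the same wiring, with a hypothetical echo handler in place of tokenize_text:

# Minimal sketch of the checkbox wiring: the Checkbox value is passed as
# an extra boolean input to the click handler. echo is a hypothetical
# stand-in for tokenize_text.
import gradio as gr

def echo(text, better_tokenization):
    # The real handler would tokenize text; here we only report the flag.
    return f"better_tokenization={better_tokenization}: {text}"

with gr.Blocks() as demo:
    text = gr.Textbox(label="Text")
    with gr.Row():
        submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
        checkbox = gr.Checkbox(label="Better tokenization for Arabic Text", value=False, scale=1)
    output_box = gr.Textbox(label="Output")
    submit_text_btn.click(echo, inputs=[text, checkbox], outputs=[output_box])

if __name__ == "__main__":
    demo.launch()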