MohamedRashad committed on
Commit
a9aadc2
·
1 Parent(s): 0384cfb

Refactor tokenize_text function to include better tokenization for Arabic Text

Browse files
Files changed (1) hide show
  1. app.py +18 -20
app.py CHANGED
@@ -133,28 +133,24 @@ def decode_bpe_tokens(tokens):
133
  fixed_tokens.append(fixed_token)
134
  return fixed_tokens
135
 
136
- def decode_arabic_tokens(tokens):
137
- decoded_tokens = []
138
- for token in tokens:
139
- decoded_token = token.encode('latin-1', 'backslashreplace').decode('unicode-escape')
140
- decoded_tokens.append(decoded_token)
141
- return decoded_tokens
142
-
143
- def tokenize_text(text, chosen_model):
144
  tokenizer = AutoTokenizer.from_pretrained(chosen_model)
145
  tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
146
  random_colors = generate_distinct_colors(len(tokenized_text))
147
 
148
- final_tokenized_text = []
149
- for token in tokenized_text:
150
- correct_tokenized_text = ""
151
- for char in text:
152
- correct_tokenized_text += char
153
- current_token = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
154
- if current_token[0] == token:
155
- final_tokenized_text.append(correct_tokenized_text)
156
- text = text[len(correct_tokenized_text):]
157
- break
 
 
 
158
  print(final_tokenized_text)
159
 
160
  output = []
@@ -199,11 +195,13 @@ with gr.Blocks() as demo:
199
  choices=df["📛 Models"].tolist(),
200
  value=df["📛 Models"].tolist()[0],
201
  )
202
- submit_text_btn = gr.Button(value="Submit", variant="primary")
 
 
203
  tokenized_textbox = gr.HighlightedText(label="Tokenized text")
204
 
205
  submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
206
- submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])
207
 
208
 
209
  demo.launch()
 
133
  fixed_tokens.append(fixed_token)
134
  return fixed_tokens
135
 
136
+ def tokenize_text(text, chosen_model, better_tokenization=False):
 
 
 
 
 
 
 
137
  tokenizer = AutoTokenizer.from_pretrained(chosen_model)
138
  tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
139
  random_colors = generate_distinct_colors(len(tokenized_text))
140
 
141
+ if better_tokenization:
142
+ final_tokenized_text = []
143
+ for token in tokenized_text:
144
+ correct_tokenized_text = ""
145
+ for char in text:
146
+ correct_tokenized_text += char
147
+ current_token = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
148
+ if current_token[0] == token:
149
+ final_tokenized_text.append(correct_tokenized_text)
150
+ text = text[len(correct_tokenized_text):]
151
+ break
152
+ else:
153
+ final_tokenized_text = tokenized_text
154
  print(final_tokenized_text)
155
 
156
  output = []
 
195
  choices=df["📛 Models"].tolist(),
196
  value=df["📛 Models"].tolist()[0],
197
  )
198
+ with gr.Row():
199
+ submit_text_btn = gr.Button(value="Submit", variant="primary", scale=3)
200
+ checkbox = gr.Checkbox(label="Better tokenization for Arabic Text", value=False, scale=1)
201
  tokenized_textbox = gr.HighlightedText(label="Tokenized text")
202
 
203
  submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
204
+ submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])
205
 
206
 
207
  demo.launch()