Spaces:
Sleeping
Sleeping
rearrange
Browse files
app.py
CHANGED
@@ -1,29 +1,28 @@
|
|
1 |
import streamlit as st
|
2 |
from utils import get_res
|
3 |
|
4 |
-
st.title('Tokenizers demo')
|
5 |
|
6 |
#x = st.slider('Select a value')
|
7 |
#st.write(x, 'squared is', x * x)
|
8 |
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
'Choose a tokenizer',
|
12 |
-
['狗', '貓', '鸚鵡', '天竺鼠'])
|
13 |
-
'你的答案:', option
|
14 |
|
|
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
'Your choice:', model_name
|
19 |
-
|
20 |
-
input_data = st.text_input('Input Sentence', 'Hello world!!!')
|
21 |
|
22 |
|
23 |
res = get_res(model_name=model_name, input_sentence=input_data, single_print=False)
|
24 |
|
25 |
#st.markdown('<style></style>')
|
26 |
|
27 |
-
|
28 |
-
st.markdown(
|
29 |
-
unsafe_allow_html=True)
|
|
|
# Streamlit demo app: choose a tokenizer model in the sidebar, enter a
# sentence, and render the colour-coded tokenization produced by
# utils.get_res as HTML.
import streamlit as st

from utils import get_res

st.sidebar.title('Tokenizers demo')

#x = st.slider('Select a value')
#st.write(x, 'squared is', x * x)

st.sidebar.subheader('Choose the tokenizer', divider='grey')
# NOTE(review): the selectbox value is assigned but never read below —
# `model_name` from the text input is what actually drives get_res.
# Confirm whether the selectbox is meant to pre-fill the text input.
option = st.sidebar.selectbox(
    'model_name',
    ['deepseek-ai/deepseek-coder-1.3b-instruct',
     'bigcode/starcoder'])

model_name = st.sidebar.text_input('Model Name', 'deepseek-ai/deepseek-coder-1.3b-instruct')

#'Your choice:', model_name

st.sidebar.subheader('Write the input sentence', divider='grey')
input_data = st.sidebar.text_input('Input Sentence', 'Hello world!!!')

# Tokenize the sentence with the chosen model; single_print=False suppresses
# get_res's console print path (presumably returning the markup instead —
# TODO confirm against utils.get_res, whose return is not visible here).
res = get_res(model_name=model_name, input_sentence=input_data, single_print=False)

#st.markdown('<style></style>')

st.subheader('Tokenized result', divider='grey')
st.markdown(res, unsafe_allow_html=True)
|
utils.py
CHANGED
@@ -5,7 +5,8 @@ import itertools
|
|
5 |
|
6 |
|
7 |
def get_color():
|
8 |
-
colors = [
|
|
|
9 |
return itertools.cycle(colors)
|
10 |
|
11 |
def get_res(model_name, input_sentence, single_print=True):
|
@@ -15,7 +16,7 @@ def get_res(model_name, input_sentence, single_print=True):
|
|
15 |
out = tokenizer.encode(input_sentence, add_special_tokens=False)
|
16 |
token_num = len(out)
|
17 |
|
18 |
-
w = [ '
|
19 |
res = ''.join(w) + f' {str(token_num)}'
|
20 |
if single_print:
|
21 |
print(res)
|
|
|
5 |
|
6 |
|
7 |
def get_color():
    """Return an endless iterator that cycles through the highlight palette.

    Each call produces a fresh, independent cycle over the six fixed
    background colours used to colour-code adjacent tokens.
    """
    palette = (
        '#df7b55',
        '#2c7482',
        '#2c8234',
        '#5581df',
        '#822c63',
        '#b355df',
    )
    return itertools.cycle(palette)
|
11 |
|
12 |
def get_res(model_name, input_sentence, single_print=True):
|
|
|
16 |
out = tokenizer.encode(input_sentence, add_special_tokens=False)
|
17 |
token_num = len(out)
|
18 |
|
19 |
+
w = [ f'<span style="background-color:{next(color_iterator)}">{tokenizer.decode(x)}</span>' for x in out ]
|
20 |
res = ''.join(w) + f' {str(token_num)}'
|
21 |
if single_print:
|
22 |
print(res)
|