Spaces:
Sleeping
Sleeping
rearrange
Browse files
app.py
CHANGED
@@ -1,29 +1,28 @@
|
|
1 |
import streamlit as st
|
2 |
from utils import get_res
|
3 |
|
4 |
-
st.title('Tokenizers demo')
|
5 |
|
6 |
#x = st.slider('Select a value')
|
7 |
#st.write(x, 'squared is', x * x)
|
8 |
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
'Choose a tokenizer',
|
12 |
-
['狗', '貓', '鸚鵡', '天竺鼠'])
|
13 |
-
'你的答案:', option
|
14 |
|
|
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
'Your choice:', model_name
|
19 |
-
|
20 |
-
input_data = st.text_input('Input Sentence', 'Hello world!!!')
|
21 |
|
22 |
|
23 |
res = get_res(model_name=model_name, input_sentence=input_data, single_print=False)
|
24 |
|
25 |
#st.markdown('<style></style>')
|
26 |
|
27 |
-
|
28 |
-
st.markdown(
|
29 |
-
unsafe_allow_html=True)
|
|
|
# Streamlit demo app: choose a tokenizer model in the sidebar, enter a
# sentence, and render the colour-coded tokenization produced by
# utils.get_res as HTML.
import streamlit as st

from utils import get_res

st.sidebar.title('Tokenizers demo')

#x = st.slider('Select a value')
#st.write(x, 'squared is', x * x)

st.sidebar.subheader('Choose the tokenizer', divider='grey')
# NOTE(review): the selectbox value is assigned but never read below —
# `model_name` from the text input is what actually drives get_res.
# Confirm whether the selectbox is meant to pre-fill the text input.
option = st.sidebar.selectbox(
    'model_name',
    ['deepseek-ai/deepseek-coder-1.3b-instruct',
     'bigcode/starcoder'])

model_name = st.sidebar.text_input('Model Name', 'deepseek-ai/deepseek-coder-1.3b-instruct')

#'Your choice:', model_name

st.sidebar.subheader('Write the input sentence', divider='grey')
input_data = st.sidebar.text_input('Input Sentence', 'Hello world!!!')

# Tokenize the sentence with the chosen model; single_print=False suppresses
# get_res's console print path (presumably returning the markup instead —
# TODO confirm against utils.get_res, whose return is not visible here).
res = get_res(model_name=model_name, input_sentence=input_data, single_print=False)

#st.markdown('<style></style>')

st.subheader('Tokenized result', divider='grey')
st.markdown(res, unsafe_allow_html=True)
|
utils.py
CHANGED
@@ -5,7 +5,8 @@ import itertools
|
|
5 |
|
6 |
|
7 |
def get_color():
|
8 |
-
colors = [
|
|
|
9 |
return itertools.cycle(colors)
|
10 |
|
11 |
def get_res(model_name, input_sentence, single_print=True):
|
@@ -15,7 +16,7 @@ def get_res(model_name, input_sentence, single_print=True):
|
|
15 |
out = tokenizer.encode(input_sentence, add_special_tokens=False)
|
16 |
token_num = len(out)
|
17 |
|
18 |
-
w = [ '
|
19 |
res = ''.join(w) + f' {str(token_num)}'
|
20 |
if single_print:
|
21 |
print(res)
|
|
|
5 |
|
6 |
|
7 |
def get_color():
    """Return an endless iterator that cycles through the highlight palette.

    Each call produces a fresh, independent cycle over the six fixed
    background colours used to colour-code adjacent tokens.
    """
    palette = (
        '#df7b55',
        '#2c7482',
        '#2c8234',
        '#5581df',
        '#822c63',
        '#b355df',
    )
    return itertools.cycle(palette)
|
11 |
|
12 |
def get_res(model_name, input_sentence, single_print=True):
|
|
|
16 |
out = tokenizer.encode(input_sentence, add_special_tokens=False)
|
17 |
token_num = len(out)
|
18 |
|
19 |
+
w = [ f'<span style="background-color:{next(color_iterator)}">{tokenizer.decode(x)}</span>' for x in out ]
|
20 |
res = ''.join(w) + f' {str(token_num)}'
|
21 |
if single_print:
|
22 |
print(res)
|