Spaces:
Runtime error
Runtime error
taka-yamakoshi
committed on
Commit
·
ed9112c
1
Parent(s):
9240bf4
add instructions
Browse files
app.py
CHANGED
|
@@ -98,15 +98,19 @@ if __name__=='__main__':
|
|
| 98 |
st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
|
| 99 |
|
| 100 |
# Select and load the tokenizer
|
| 101 |
-
tokenizer_name = st.sidebar.selectbox('Choose the tokenizer from below',
|
| 102 |
('bert-base-uncased','bert-large-cased',
|
| 103 |
'gpt2','gpt2-large',
|
| 104 |
'roberta-base','roberta-large',
|
| 105 |
'albert-base-v2','albert-xxlarge-v2'),index=7)
|
| 106 |
tokenizer = load_model(tokenizer_name)
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
comparison_mode = st.sidebar.checkbox('Compare two texts')
|
| 109 |
-
detokenize = st.sidebar.checkbox('de-tokenize
|
| 110 |
if comparison_mode:
|
| 111 |
sent_cols = st.columns(2)
|
| 112 |
num_tokens = {}
|
|
@@ -122,7 +126,7 @@ if __name__=='__main__':
|
|
| 122 |
sents[f'sent_{sent_id+1}'] = sentence
|
| 123 |
|
| 124 |
if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
|
| 125 |
-
st.markdown(generate_markdown('
|
| 126 |
if num_tokens[f'sent_1']==num_tokens[f'sent_2']:
|
| 127 |
st.markdown(generate_markdown('Matched! ',color='MediumAquamarine'), unsafe_allow_html=True)
|
| 128 |
else:
|
|
|
|
| 98 |
st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
|
| 99 |
|
| 100 |
# Select and load the tokenizer
|
| 101 |
+
tokenizer_name = st.sidebar.selectbox('1. Choose the tokenizer from below',
|
| 102 |
('bert-base-uncased','bert-large-cased',
|
| 103 |
'gpt2','gpt2-large',
|
| 104 |
'roberta-base','roberta-large',
|
| 105 |
'albert-base-v2','albert-xxlarge-v2'),index=7)
|
| 106 |
tokenizer = load_model(tokenizer_name)
|
| 107 |
|
| 108 |
+
st.sidebar.write('2. Optional settings')
|
| 109 |
+
st.sidebar.write(f'"Compare two texts" compares # tokens for two pieces of text '\
|
| 110 |
+
+f'and "de-tokenize" converts a list of tokenized indices back to strings.')
|
| 111 |
+
st.sidebar.write(f'For "de-tokenize", make sure to type in integers, separated by single spaces')
|
| 112 |
comparison_mode = st.sidebar.checkbox('Compare two texts')
|
| 113 |
+
detokenize = st.sidebar.checkbox('de-tokenize')
|
| 114 |
if comparison_mode:
|
| 115 |
sent_cols = st.columns(2)
|
| 116 |
num_tokens = {}
|
|
|
|
| 126 |
sents[f'sent_{sent_id+1}'] = sentence
|
| 127 |
|
| 128 |
if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
|
| 129 |
+
st.markdown(generate_markdown('# Tokens: ',size=16), unsafe_allow_html=True)
|
| 130 |
if num_tokens[f'sent_1']==num_tokens[f'sent_2']:
|
| 131 |
st.markdown(generate_markdown('Matched! ',color='MediumAquamarine'), unsafe_allow_html=True)
|
| 132 |
else:
|