Spaces:
Sleeping
Sleeping
Commit
·
023db76
1
Parent(s):
09d8da8
Update app.py
Browse files
app.py
CHANGED
@@ -7,17 +7,20 @@ import matplotlib.pyplot as plt
|
|
7 |
st.set_page_config(layout="wide")
|
8 |
|
9 |
with st.sidebar:
|
10 |
-
subset = st.selectbox('
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
15 |
|
16 |
fig, (axl, axr) = plt.subplots(1, 2, figsize=(10,3))
|
17 |
-
axl.hist(
|
18 |
-
axl.set_title('eng mistral tokens')
|
19 |
-
axr.hist(
|
20 |
-
axr.set_title('ukr mistral tokens')
|
21 |
st.pyplot(fig)
|
22 |
|
23 |
st.dataframe(pd.DataFrame(dataset))
|
|
|
7 |
st.set_page_config(layout="wide")
|
8 |
|
9 |
with st.sidebar:
|
10 |
+
subset = st.selectbox('subset', ('dev', 'devtest'))
|
11 |
|
12 |
+
with st.echo():
|
13 |
+
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
14 |
+
flores = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
|
15 |
+
dataset = flores[subset]
|
16 |
+
eng_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_eng_Latn'])['input_ids'])})['num_tokens']
|
17 |
+
ukr_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_ukr_Cyrl'])['input_ids'])})['num_tokens']
|
18 |
|
19 |
fig, (axl, axr) = plt.subplots(1, 2, figsize=(10,3))
|
20 |
+
axl.hist(eng_num_tokens)
|
21 |
+
axl.set_title(f'eng mistral tokens ({np.sum(eng_num_tokens)} total)')
|
22 |
+
axr.hist(ukr_num_tokens)
|
23 |
+
axr.set_title(f'ukr mistral tokens ({np.sum(ukr_num_tokens)} total)')
|
24 |
st.pyplot(fig)
|
25 |
|
26 |
st.dataframe(pd.DataFrame(dataset))
|