darkproger committed on
Commit
023db76
·
1 Parent(s): 09d8da8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -8
app.py CHANGED
@@ -7,17 +7,20 @@ import matplotlib.pyplot as plt
7
  st.set_page_config(layout="wide")
8
 
9
  with st.sidebar:
10
- subset = st.selectbox('Flores eng_Latn-ukr_Cyrl subset', ('dev', 'devtest'))
11
 
12
- tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
13
- flores = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
14
- dataset = flores[subset]
 
 
 
15
 
16
  fig, (axl, axr) = plt.subplots(1, 2, figsize=(10,3))
17
- axl.hist(dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_eng_Latn'])['input_ids'])})['num_tokens'])
18
- axl.set_title('eng mistral tokens')
19
- axr.hist(dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_ukr_Cyrl'])['input_ids'])})['num_tokens'])
20
- axr.set_title('ukr mistral tokens')
21
  st.pyplot(fig)
22
 
23
  st.dataframe(pd.DataFrame(dataset))
 
7
  st.set_page_config(layout="wide")
8
 
9
  with st.sidebar:
10
+ subset = st.selectbox('subset', ('dev', 'devtest'))
11
 
12
+ with st.echo():
13
+ tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
14
+ flores = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
15
+ dataset = flores[subset]
16
+ eng_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_eng_Latn'])['input_ids'])})['num_tokens']
17
+ ukr_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_ukr_Cyrl'])['input_ids'])})['num_tokens']
18
 
19
  fig, (axl, axr) = plt.subplots(1, 2, figsize=(10,3))
20
+ axl.hist(eng_num_tokens)
21
+ axl.set_title(f'eng mistral tokens ({np.sum(eng_num_tokens)} total)')
22
+ axr.hist(ukr_num_tokens)
23
+ axr.set_title(f'ukr mistral tokens ({np.sum(ukr_num_tokens)} total)')
24
  st.pyplot(fig)
25
 
26
  st.dataframe(pd.DataFrame(dataset))