File size: 849 Bytes
7e6bce5
 
 
b2c2c22
 
7e6bce5
b3c0d6f
 
b1bc515
 
 
fbb3b12
b2c2c22
 
 
09d8da8
62d5b02
b2c2c22
62d5b02
b2c2c22
d4a7a03
b2c2c22
e516cb2
7e6bce5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from datasets import load_dataset
import pandas as pd
import streamlit as st
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

# Kept as the first Streamlit call: Streamlit documents that the page
# configuration must be set before any other Streamlit command.
st.set_page_config(layout="wide")

TOKENIZER_NAME = "mistralai/Mistral-7B-v0.1"
FLORES_CONFIG = "eng_Latn-ukr_Cyrl"


@st.cache_resource
def _load_tokenizer():
    """Return the tokenizer, loading it only once per server process.

    Streamlit re-executes this script on every widget interaction; without
    caching, the tokenizer would be re-instantiated on each rerun.
    """
    return AutoTokenizer.from_pretrained(TOKENIZER_NAME)


@st.cache_resource
def _load_flores():
    """Return the FLORES dataset dict, loading it only once per server process."""
    return load_dataset("facebook/flores", FLORES_CONFIG)


@st.cache_data
def _token_counts(split: str, column: str):
    """Return the number of tokenizer input ids for each sentence.

    Args:
        split: Name of the dataset split to read (e.g. 'dev' or 'devtest').
        column: Name of the text column to tokenize.

    Returns:
        A list of ints, one per row of the split, in dataset order.

    The result is cached per (split, column) so that switching back and forth
    in the sidebar does not re-tokenize the same sentences.
    """
    tokenizer = _load_tokenizer()
    sentences = _load_flores()[split][column]
    return [len(tokenizer(text)["input_ids"]) for text in sentences]


with st.sidebar:
    subset = st.selectbox('Flores eng_Latn-ukr_Cyrl subset', ('dev', 'devtest'))

dataset = _load_flores()[subset]

# Side-by-side histograms of per-sentence token counts for both languages.
fig, (axl, axr) = plt.subplots(1, 2, figsize=(10, 3))
axl.hist(_token_counts(subset, 'sentence_eng_Latn'))
axl.set_title('eng mistral tokens')
axr.hist(_token_counts(subset, 'sentence_ukr_Cyrl'))
axr.set_title('ukr mistral tokens')
st.pyplot(fig)
# pyplot keeps every figure it creates in a global registry; close this one
# explicitly so figures do not pile up across script reruns.
plt.close(fig)

st.dataframe(pd.DataFrame(dataset))