# Spaces:
# Sleeping
# Sleeping
# File size: 1,259 Bytes
# 7e6bce5 d02038d 7e6bce5 b2c2c22 7e6bce5 b3c0d6f b1bc515 023db76 b1bc515 023db76 b2c2c22 4f9f819 1c8f137 4f9f819 b7118cc 4f9f819 b2c2c22 4f9f819 7e6bce5 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
from datasets import load_dataset
import pandas as pd
import numpy as np
import streamlit as st
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
# Streamlit app: compare Mistral tokenizer token counts for the English and
# Ukrainian sides of the FLORES "eng_Latn-ukr_Cyrl" dataset, and browse or
# filter the sentence pairs.
#
# NOTE(review): the nesting of the `with` / `if` bodies below was reconstructed
# from a copy of this file that had lost its indentation. Confirm that the
# st.echo() block spans exactly these five statements and that the filter box
# belongs in the main pane rather than the sidebar.
st.set_page_config(layout="wide")

with st.sidebar:
    subset = st.selectbox('subset', ('dev', 'devtest'))

# st.echo() renders the source of its body in the app before running it, so the
# statements inside are user-visible text; explanatory comments stay out here.
# Each map() call adds a 'num_tokens' column holding the length of the
# tokenizer's 'input_ids' for one language, then reads that column back.
with st.echo():
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
    flores = load_dataset("facebook/flores", "eng_Latn-ukr_Cyrl")
    dataset = flores[subset]
    eng_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_eng_Latn'])['input_ids'])})['num_tokens']
    ukr_num_tokens = dataset.map(lambda x: {'num_tokens':len(tokenizer(x['sentence_ukr_Cyrl'])['input_ids'])})['num_tokens']

with st.sidebar:
    # Two stacked histograms (2 rows x 1 column): English on top, Ukrainian below.
    fig, (eng_ax, ukr_ax) = plt.subplots(2, 1, figsize=(3,6))
    eng_ax.hist(eng_num_tokens)
    eng_ax.set_title(f'eng mistral tokens ({np.sum(eng_num_tokens)} total)')
    ukr_ax.hist(ukr_num_tokens)
    ukr_ax.set_title(f'ukr mistral tokens ({np.sum(ukr_num_tokens)} total)')
    st.pyplot(fig)
    # Streamlit re-executes this script on every widget interaction, and pyplot
    # keeps each figure registered until it is closed; release it explicitly so
    # figures do not accumulate across reruns.
    plt.close(fig)

keyword = st.text_input("Filter by text", value="")

if not keyword:
    # No filter entered: show every row of the selected split.
    st.dataframe(pd.DataFrame(dataset))
else:
    # Case-sensitive substring match against either language's sentence.
    matching_rows = dataset.filter(
        lambda x: keyword in x['sentence_eng_Latn'] or keyword in x['sentence_ukr_Cyrl']
    )
    st.dataframe(pd.DataFrame(matching_rows))
# |