Spaces:
Running
Running
from datasets import load_dataset
import streamlit as st

# Use the full browser width for the page.
st.set_page_config(layout="wide")

# Fetch the DivEMT corpus and turn its training split into a DataFrame.
dataset = load_dataset("GroNLP/divemt")
df = dataset["train"].to_pandas()

# One row per distinct item_id, keeping only the id and the source sentence.
source_columns = df[["item_id", "src_text"]]
unique_src = source_columns.drop_duplicates(subset="item_id")

# Distinct language identifiers found in the data, in order of appearance.
langs = [*df["lang_id"].unique()]
st.title("DivEMT Explorer")

# The introduction goes in the wider (2:1) left column; the right column
# is left empty.
intro_col, _ = st.columns([2, 1])
intro_col.write("""
The DivEMT Explorer is a tool to explore translations and edits contained in the DivEMT corpus.
Use the expandable section "Explore examples" below to visualize some of the original source sentences. When you found a sentence that you might be interested in, insert its numeric id (between 0 and 429) in the box below, and select all the languages for which you want to visualize the results.
Inside every generated section you will find the translations for all the available settings, alongside aligned edits and a collection of collected metadata. You can filter the showed settings to better see the aligned edits annotations.
""")
# Collapsible preview of the source sentences: a slider picks where the
# window starts and a numeric input picks how many rows it contains.
with st.expander("Explore examples"):
    offset_col, count_col, _ = st.columns([3, 2, 5])
    offset = offset_col.slider(
        "Select an offset",
        min_value=0,
        max_value=len(unique_src) - 5,
        value=0,
    )
    count = count_col.number_input(
        'Select the number of examples to display',
        min_value=3,
        max_value=len(unique_src),
        value=5,
    )
    # Rows [offset, offset + count) of the de-duplicated source sentences.
    window_end = int(offset + count)
    st.table(unique_src[offset:window_end])
# Main controls: which item to inspect and for which languages.
id_col, lang_col, _ = st.columns([1, 1, 3])
item_id = id_col.number_input(
    'Select an item (0-429) to inspect',
    min_value=0,
    max_value=len(unique_src) - 1,
)
# ``langs`` is rebound from "all available languages" to the user's selection.
langs = lang_col.multiselect(
    'Select languages',
    options=langs
)

# Show the source sentence of the chosen item, highlighted in red.
selected_source = unique_src.iloc[int(item_id)]["src_text"]
source_html = "".join([
    "<b>Source text:</b> <span style='color: #ff4b4b'> ",
    selected_source,
    "</span>",
])
st.markdown(source_html, unsafe_allow_html=True)
# Display name of each translation setting, mapped to the ``task_type``
# value that identifies its rows in the dataset.
task_types = {
    "From Scratch (HT)": "ht",
    "Google PE (PE1)": "pe1",
    "mBART PE (PE2)": "pe2",
}
task_names = list(task_types)

# Fields that are rendered on their own (or not at all) and are therefore
# left out of the metadata JSON view.
text_fields = {"src_text", "mt_text", "tgt_text", "aligned_edit"}

# The inspected item is the same for every language, so resolve it once.
selected_item_id = unique_src.iloc[int(item_id)]["item_id"]

# One expander per selected language, with one column per selected setting.
for lang in langs:
    with st.expander(f"View {lang.upper()} data"):
        c1, _ = st.columns([1, 2])
        with c1:
            tasks = st.multiselect(
                'Select settings',
                options=task_names,
                default=task_names,
                key=f"{lang}_tasks"
            )
        if not tasks:
            # st.columns() does not accept a column count of zero, so stop
            # here when the user has deselected every setting.
            st.info("Select at least one setting to display.")
            continue
        columns = st.columns(len(tasks))

        lang_data = df[(df["item_id"] == selected_item_id) & (df["lang_id"] == lang)]
        lang_dicts = lang_data.to_dict("records")

        # Keep the first record of each task type; a type with no record is
        # simply absent instead of raising an IndexError.
        records_by_type = {}
        for record in lang_dicts:
            records_by_type.setdefault(record["task_type"], record)

        # Longest MT output for this item; sizes the invisible placeholder
        # below. ``default=0`` covers the case where no record has MT text.
        max_mt_length = max(
            (len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None),
            default=0,
        )

        for task_name, col in zip(tasks, columns):
            dic = records_by_type.get(task_types[task_name])
            with col:
                st.header(task_name)
                if dic is None:
                    st.warning("No data available for this setting.")
                    continue
                st.markdown(f"<b>Translator</b>: {dic['subject_id']}", unsafe_allow_html=True)
                mt_text = dic["mt_text"]
                if mt_text is None:
                    # No MT output for this record: emit transparent filler
                    # about as long as the longest MT text so the columns
                    # stay vertically aligned.
                    mt_text = "<span style='opacity:0'>" + "O " * (max_mt_length // 2) + "</span>"
                st.markdown(f"<b>MT</b>: {mt_text}", unsafe_allow_html=True)
                st.markdown(f"<b>PE</b>: {dic['tgt_text']}", unsafe_allow_html=True)
                st.markdown("<b>Aligned edits</b>:", unsafe_allow_html=True)
                if dic["aligned_edit"] is not None:
                    # Unescape literal "\n" sequences and relabel the rows
                    # from REF/HYP to MT/PE.
                    st.text(dic["aligned_edit"].replace("\\n", "\n").replace("REF:", "MT :").replace("HYP:", "PE :"))
                else:
                    st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")
                st.markdown("<b>Metadata</b>:", unsafe_allow_html=True)
                st.json({k: v for k, v in dic.items() if k not in text_fields})