# Hugging Face Space: GroNLP/divemt_explorer (status: Sleeping, file size: 5,984 bytes)

import os
import tempfile
import urllib
import urllib.error
import urllib.request

import streamlit as st
from datasets import load_dataset
from inseq import FeatureAttributionOutput

# Configure the page with the "wide" layout.
st.set_page_config(layout="wide")

# URL template of the gzipped attribution files; the `idx`, `setting` and
# `sentence_type` placeholders are filled in with `str.format` per request.
attribution_path = (
    "https://huggingface.co/datasets/inseq/divemt_attributions/resolve/main/"
    "divemt-attributions/it/{idx}_it_gradl2_{setting}_{sentence_type}.json.gz"
)

# Load the DivEMT dataset and convert its "train" split to a DataFrame.
dataset = load_dataset("GroNLP/divemt")
train_split = dataset["train"]
df = train_split.to_pandas()

# One (item_id, src_text) row per distinct item_id.
source_columns = df[["item_id", "src_text"]]
unique_src = source_columns.drop_duplicates(subset="item_id")

# Distinct values of the `lang_id` column, offered later as language options.
langs = list(df["lang_id"].unique())

# Page title followed by the usage instructions.
intro_markdown = """
##### The DivEMT Explorer is a tool to explore translations and edits in the DivEMT corpus.

##### Use the expandable section "Explore examples" below to visualize some of the original source sentences. When you find an interesting sentence, insert its numeric id (between 0 and 429) in the box below, and select all the available languages you want to use for visualizing the results.

##### Inside every generated language section, you will find the translations for all the available settings, alongside aligned edits and a collection of collected metadata. You can filter the shown settings to see the aligned edits annotations.
"""
st.title("DivEMT Explorer 🔍 🌍")
st.markdown(intro_markdown)

# Collapsible browser over the source sentences: a slider picks the first row
# to show and a number box picks how many rows are listed from there.
with st.expander("Explore examples"):
    n_sources = len(unique_src)
    offset_col, count_col, _ = st.columns([3, 2, 5])
    with offset_col:
        # The last selectable offset is 5 rows before the end — presumably so
        # that the default window of 5 examples stays within the table.
        offset = st.slider("Select an offset", min_value=0, max_value=n_sources - 5, value=0)
    with count_col:
        count = st.number_input(
            'Select the number of examples to display', min_value=3, max_value=n_sources, value=5
        )
    window_end = int(offset + count)
    st.table(unique_src[offset:window_end])
# Main controls: which source sentence to inspect and for which languages.
item_col, lang_col, _ = st.columns([1, 1, 3])
with item_col:
    # Positional index into `unique_src`, bounded by its number of rows.
    last_item_index = len(unique_src) - 1
    item_id = st.number_input('Select an item (0-429) to inspect', min_value=0, max_value=last_item_index)
with lang_col:
    # `langs` is rebound from "all available languages" to "languages picked by the user".
    langs = st.multiselect('Select languages', options=langs)

# Show the selected source sentence inside a colored span.
selected_source = unique_src.iloc[int(item_id)]
span_open = "##### <span style='color: #ff4b4b'> "
span_close = "</span>"
st.markdown("##### Source text")
st.markdown(span_open + selected_source["src_text"] + span_close, unsafe_allow_html=True)
# Display name of every translation setting mapped to the `task_type` value
# that identifies its rows in the dataset.
task_types = {
    "From Scratch (HT)": "ht",
    "Google PE (PE1)": "pe1",
    "mBART PE (PE2)": "pe2",
}
task_names = list(task_types)


def _bdi(text, lang):
    """Return `text` wrapped in an HTML <bdi> element when `lang` is "ara", else unchanged."""
    return f"<bdi>{text}</bdi>" if lang == "ara" else text


def _format_aligned_edit(aligned_edit, lang):
    """Return the text shown under "Aligned edits" for a raw `aligned_edit` value.

    For `lang == "ara"`, when the value contains exactly one "EVAL: " marker,
    the part after the marker is reversed character by character. Literal
    backslash-n sequences are then turned into real newlines and the
    "REF:" / "HYP:" labels are renamed to "MT :" / "PE :".
    """
    if lang == "ara":
        parts = aligned_edit.split("EVAL: ")
        if len(parts) == 2:
            aligned_edit = parts[0] + "EVAL: " + parts[1][::-1]
    return aligned_edit.replace("\\n", "\n").replace("REF:", "MT :").replace("HYP:", "PE :")


def _load_attribution(url, filename):
    """Download the gzipped attribution file at `url` and load it with inseq.

    The payload is stored as `filename` inside a throw-away temporary
    directory instead of a fixed path in the working directory, so runs do
    not overwrite each other's download and nothing is left behind.

    Raises:
        urllib.error.URLError: if the file cannot be fetched (HTTP errors
            are reported as its subclass `urllib.error.HTTPError`).
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        fpath = os.path.join(tmp_dir, filename)
        # Both the HTTP response and the local file are closed on exit.
        with urllib.request.urlopen(url) as response, open(fpath, "wb") as f:
            f.write(response.read())
        return FeatureAttributionOutput.load(fpath, decompress=True)


# Dataset item id of the selected source sentence; it does not depend on the
# language, so it is looked up once for all language sections.
selected_item_id = unique_src.iloc[int(item_id)]["item_id"]

for lang in langs:
    with st.expander(f"View {lang.upper()} data"):
        c1, _ = st.columns([1, 2])
        with c1:
            tasks = st.multiselect(
                'Select settings',
                options=task_names,
                default=task_names,
                key=f"{lang}_tasks"
            )
        lang_data = df[(df["item_id"] == selected_item_id) & (df["lang_id"] == lang)]
        lang_dicts = lang_data.to_dict("records")
        # First record of every task type for this item/language pair.
        task_dict = {
            name: [x for x in lang_dicts if x["task_type"] == task_type][0]
            for name, task_type in task_types.items()
        }
        # Longest MT output among the records; 0 when none of them has one.
        max_mt_length = max(
            (len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None),
            default=0,
        )
        for task_name in tasks:
            dic = task_dict[task_name]
            st.header(task_name)
            st.markdown(f"<b>Translator</b>: {dic['subject_id']}", unsafe_allow_html=True)
            mt_text = dic["mt_text"]
            if mt_text is None:
                # Invisible filler sized after the longest MT text, shown in
                # place of the missing MT output.
                mt_text = "<span style='opacity:0'>" + "O " * (max_mt_length // 2) + "</span>"
            st.markdown(f"<b>MT</b>: {_bdi(mt_text, lang)}", unsafe_allow_html=True)
            st.markdown(f"<b>PE</b>: {_bdi(dic['tgt_text'], lang)}", unsafe_allow_html=True)
            st.markdown("<b>Aligned edits</b>:", unsafe_allow_html=True)
            if dic["aligned_edit"] is not None:
                st.text(_format_aligned_edit(dic["aligned_edit"], lang))
            else:
                st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")
            st.markdown("<b>Metadata</b>:", unsafe_allow_html=True)
            # Everything except the texts that are already displayed above.
            shown_elsewhere = ["src_text", "mt_text", "tgt_text", "aligned_edit"]
            st.json({k: v for k, v in dic.items() if k not in shown_elsewhere}, expanded=False)

            setting = task_types[task_name]
            if setting == "ht":
                # No attribution section for the from-scratch setting.
                continue
            st.markdown("<b>Attributions</b>:", unsafe_allow_html=True)
            st.text("Click on checkboxes to show/hide the respective attributions computed with mBART 1-to-50.")
            # NOTE(review): `attribution_path` is hard-coded to the "it" folder
            # and `lang` is not part of the URL, so the same files are requested
            # for every language section — confirm this is intended.
            for sentence_type in ["mt", "pe", "diff"]:
                url = attribution_path.format(idx=item_id, setting=setting, sentence_type=sentence_type)
                checkbox_key = f"{lang}_{task_name}_{sentence_type}"
                try:
                    attr = _load_attribution(url, f"attr_{sentence_type}.json.gz")
                except urllib.error.URLError:
                    # Also covers urllib.error.HTTPError, which subclasses URLError.
                    st.checkbox(sentence_type.upper() + " (NOT AVAILABLE)", key=checkbox_key, disabled=True)
                else:
                    if st.checkbox(sentence_type.upper(), key=checkbox_key):
                        st.markdown(
                            f"{attr.show(return_html=True, display=False, do_aggregation=False)}",
                            unsafe_allow_html=True,
                        )