File size: 6,021 Bytes
02e892d
6c35910
02e892d
74105b6
6c35910
 
 
 
6a50007
6c35910
 
affa8ee
c08f926
 
 
 
 
 
 
 
6c35910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c08f926
 
6c35910
 
 
 
 
 
 
 
 
 
 
bd9bf85
affa8ee
6c35910
 
 
 
 
 
bd9bf85
 
 
 
 
 
 
 
 
 
448a3a8
6a50007
 
 
448a3a8
 
bd9bf85
 
 
74105b6
d3fc5d0
74105b6
 
 
 
6a50007
d3fc5d0
 
6a50007
d3fc5d0
 
 
 
 
9bc55ff
d3fc5d0
 
 
74105b6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import tempfile
import urllib
import urllib.error
import urllib.request

import streamlit as st
from datasets import load_dataset
from inseq import FeatureAttributionOutput

# Configure the page to use the wide layout.
st.set_page_config(layout="wide")

# Load the DivEMT corpus and turn its "train" split into a pandas DataFrame.
dataset = load_dataset("GroNLP/divemt")
df = dataset["train"].to_pandas()

# URL template of the gzipped attribution files; the {idx}, {lang}, {setting}
# and {sentence_type} placeholders are filled in when a file is requested.
attribution_path = (
    "https://huggingface.co/datasets/inseq/divemt_attributions/resolve/main/"
    "divemt-attributions/it/{idx}_{lang}_gradl2_{setting}_{sentence_type}.json.gz"
)

# One (item_id, src_text) row per distinct item_id (first occurrence kept),
# and the list of language ids present in the data.
unique_src = df.drop_duplicates(subset="item_id")[["item_id", "src_text"]]
langs = list(df["lang_id"].unique())
# Page heading.
st.title("DivEMT Explorer 🔍 🌍")
# Introductory instructions. The upper bound of the id range is computed from
# the data so the text stays in sync with the number of available sentences.
st.markdown(f"""
##### The DivEMT Explorer is a tool to explore translations and edits in the DivEMT corpus.

##### Use the expandable section "Explore examples" below to visualize some of the original source sentences. When you find an interesting sentence, insert its numeric id (between 0 and {len(unique_src) - 1}) in the box below, and select all the available languages you want to use for visualizing the results.

##### Inside every generated language section, you will find the translations for all the available settings, alongside aligned edits and a collection of collected metadata. You can filter the shown settings to see the aligned edits annotations.
""")

# Collapsible preview of the unique source sentences: a slider picks where the
# preview starts and a number box picks how many rows are shown.
with st.expander("Explore examples"):
    n_sources = len(unique_src)
    offset_col, count_col, _ = st.columns([3, 2, 5])
    with offset_col:
        offset = st.slider(
            "Select an offset",
            min_value=0,
            max_value=n_sources - 5,
            value=0,
        )
    with count_col:
        count = st.number_input(
            'Select the number of examples to display',
            min_value=3,
            max_value=n_sources,
            value=5,
        )
    # Rows [offset, offset + count) of the source table.
    window_end = int(offset + count)
    st.table(unique_src[offset:window_end])
# Selectors for the sentence to inspect and the languages to display.
# The largest valid position is computed once so that the label shown to the
# user and the widget's actual bound cannot drift apart.
last_item = len(unique_src) - 1
col1_main, col2_main, _ = st.columns([1, 1, 3])
with col1_main:
    # Position of the sentence within `unique_src` (not necessarily its item_id value).
    item_id = st.number_input(
        f'Select an item (0-{last_item}) to inspect',
        min_value=0,
        max_value=last_item,
    )
with col2_main:
    # NOTE: `langs` is rebound here from "all available languages" to
    # "languages chosen by the user"; the code below iterates the selection.
    langs = st.multiselect(
        'Select languages',
        options=langs
    )
st.markdown("##### Source text")
# Show the selected source sentence highlighted in red.
selected_src_text = unique_src.iloc[int(item_id)]["src_text"]
st.markdown(f"##### <span style='color: #ff4b4b'> {selected_src_text}</span>", unsafe_allow_html=True)
# Display name of every translation setting mapped to its `task_type` value
# in the dataset. `task_names` keeps the display order used by the widgets.
task_codes = {
    "From Scratch (HT)": "ht",
    "Google PE (PE1)": "pe1",
    "mBART PE (PE2)": "pe2",
}
task_names = list(task_codes)

# Render one collapsible section per selected language.
for lang in langs:
    with st.expander(f"View {lang.upper()} data"):
        c1, _ = st.columns([1, 2])
        with c1:
            tasks = st.multiselect(
                'Select settings',
                options=task_names,
                default=task_names,
                key=f"{lang}_tasks"
            )

        # Rows of the selected sentence for this language.
        selected_item_id = unique_src.iloc[int(item_id)]["item_id"]
        lang_data = df[(df["item_id"] == selected_item_id) & (df["lang_id"] == lang)]
        lang_dicts = lang_data.to_dict("records")

        # First record found for each task type; settings without a record
        # are simply absent instead of raising.
        records_by_type = {}
        for record in lang_dicts:
            records_by_type.setdefault(record["task_type"], record)
        task_dict = {
            name: records_by_type[code]
            for name, code in task_codes.items()
            if code in records_by_type
        }

        # Length of the longest MT output, used to size the hidden placeholder
        # below. `default=0` covers the case where no record has MT text.
        max_mt_length = max(
            (len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None),
            default=0,
        )

        # <bdi> isolates bidirectional text; it is applied only for the "ara"
        # language id (presumably Arabic, written right-to-left).
        bdi_open, bdi_close = ("<bdi>", "</bdi>") if lang == "ara" else ("", "")

        for task_name in tasks:
            dic = task_dict.get(task_name)
            if dic is None:
                st.warning(f"No {task_name} data available for this item in {lang.upper()}.")
                continue

            st.header(task_name)
            st.markdown(f"<b>Translator</b>: {dic['subject_id']}", unsafe_allow_html=True)

            mt_text = dic["mt_text"]
            if mt_text is None:
                # Invisible filler sized from the longest MT text; presumably
                # keeps the layout of settings without MT output comparable.
                mt_text = "<span style='opacity:0'>" + "O " * (max_mt_length // 2) + "</span>"
            st.markdown(f"<b>MT</b>: {bdi_open}{mt_text}{bdi_close}", unsafe_allow_html=True)
            st.markdown(f"<b>PE</b>: {bdi_open}{dic['tgt_text']}{bdi_close}", unsafe_allow_html=True)

            st.markdown("<b>Aligned edits</b>:", unsafe_allow_html=True)
            if dic["aligned_edit"] is not None:
                # Turn literal backslash-n sequences into real line breaks and
                # relabel the REF/HYP rows as MT/PE.
                aligned_edit = (
                    dic["aligned_edit"]
                    .replace("\\n", "\n")
                    .replace("REF:", "MT :")
                    .replace("HYP:", "PE :")
                )
                st.text(aligned_edit)
            else:
                st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")

            st.markdown("<b>Metadata</b>:", unsafe_allow_html=True)
            text_fields = {"src_text", "mt_text", "tgt_text", "aligned_edit"}
            st.json({k: v for k, v in dic.items() if k not in text_fields}, expanded=False)

            # Attributions are only offered for the two post-editing settings.
            setting = task_codes[task_name]
            if setting == "ht":
                continue

            st.markdown("<b>Attributions</b>:", unsafe_allow_html=True)
            st.text("Click on checkboxes to show/hide the respective attributions computed with mBART 1-to-50.")
            for sentence_type in ["mt", "pe", "diff"]:
                url = attribution_path.format(idx=item_id, setting=setting, sentence_type=sentence_type, lang=lang)
                checkbox_key = f"{lang}_{task_name}_{sentence_type}"

                # Opening the URL is what tells us whether the file exists.
                # HTTPError is a subclass of URLError, so one clause covers both.
                try:
                    response = urllib.request.urlopen(url)
                except urllib.error.URLError:
                    st.checkbox(sentence_type.upper() + " (NOT AVAILABLE)", key=checkbox_key, disabled=True)
                    continue

                # Always close the connection; only download the body when the
                # user actually asked to see this attribution.
                with response:
                    if not st.checkbox(sentence_type.upper(), key=checkbox_key):
                        continue
                    payload = response.read()

                # The loader is given a file path, so stage the payload in a
                # private temporary directory: concurrent sessions cannot
                # overwrite each other's file and nothing is left behind.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    fpath = os.path.join(tmp_dir, f"attr_{lang}_{sentence_type}.json.gz")
                    with open(fpath, "wb") as f:
                        f.write(payload)
                    attr = FeatureAttributionOutput.load(fpath, decompress=True)

                st.markdown(
                    attr.show(return_html=True, display=False, do_aggregation=False),
                    unsafe_allow_html=True,
                )