Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit app to browse translations and edits of the DivEMT corpus.

The script loads the ``GroNLP/divemt`` dataset, lets the user pick a source
sentence by its position among the unique sources and one or more target
languages, and renders, for every selected language, the translation
settings side by side together with their aligned edits and metadata.
"""
import html

from datasets import load_dataset
import streamlit as st


def _escape(value):
    """Return ``value`` as text that is safe to embed in raw HTML markup."""
    return html.escape(str(value), quote=False)


st.set_page_config(layout="wide")

# NOTE(review): Streamlit re-executes the whole script on every widget
# interaction, so the dataset is loaded and converted again each time.
# Consider wrapping this step with the caching decorator provided by the
# installed Streamlit version.
dataset = load_dataset("GroNLP/divemt")
df = dataset["train"].to_pandas()
# One row per source sentence; row positions in this frame are the numeric
# ids the user types into the item selector below.
unique_src = df[["item_id", "src_text"]].drop_duplicates(subset="item_id")
all_langs = list(df["lang"].unique())
last_item = len(unique_src) - 1

st.title("DivEMT Explorer")

cc1, _ = st.columns([2, 1])
with cc1:
    st.write("""
    The DivEMT Explorer is a tool to explore translations and edits contained in the DivEMT corpus.
    Use the expandable section "Explore examples" below to visualize some of the original source sentences. When you found a sentence that you might be interested in, insert its numeric id (between 0 and 429) in the box below, and select all the languages for which you want to visualize the results.
    Inside every generated section you will find the translations for all the available settings, alongside aligned edits and a collection of collected metadata. You can filter the showed settings to better see the aligned edits annotations.
    """)

with st.expander("Explore examples"):
    col1, col2, _ = st.columns([3, 2, 5])
    with col1:
        offset = st.slider(
            "Select an offset",
            min_value=0,
            max_value=len(unique_src) - 5,
            value=0,
        )
    with col2:
        count = st.number_input(
            'Select the number of examples to display',
            min_value=3,
            max_value=len(unique_src),
            value=5,
        )
    # Positional slice; a window running past the end is simply truncated.
    st.table(unique_src.iloc[offset:int(offset + count)])

col1_main, col2_main, _ = st.columns([1, 1, 3])
with col1_main:
    item_id = st.number_input(
        # Upper bound is derived from the data instead of being hard-coded.
        f'Select an item (0-{last_item}) to inspect',
        min_value=0,
        max_value=last_item,
    )
with col2_main:
    langs = st.multiselect(
        'Select languages',
        options=all_langs
    )

selected_src = unique_src.iloc[int(item_id)]
st.markdown(
    "<b>Source text:</b> <span style='color: #ff4b4b'> "
    + _escape(selected_src["src_text"])
    + "</span>",
    unsafe_allow_html=True,
)

task_names = ["From Scratch (HT)", "Google PE (PE1)", "mBART PE (PE2)"]
# Display name of each setting -> value stored in the "task_type" column.
task_types = dict(zip(task_names, ["ht", "pe1", "pe2"]))
# Fields already rendered explicitly, hence left out of the metadata dump.
shown_fields = {"src_text", "mt_text", "tgt_text", "aligned_edit"}

for lang in langs:
    with st.expander(f"View {lang.upper()} data"):
        c1, _ = st.columns([1, 2])
        with c1:
            tasks = st.multiselect(
                'Select settings',
                options=task_names,
                default=task_names,
                key=f"{lang}_tasks"
            )
        if not tasks:
            # st.columns() rejects a column count of zero, so stop here
            # when every setting has been deselected.
            st.info("Select at least one setting to display.")
            continue

        lang_data = df[
            (df["item_id"] == selected_src["item_id"]) & (df["lang"] == lang)
        ]
        lang_dicts = lang_data.to_dict("records")
        # Keep the first record of each task type; a type may be absent.
        records_by_type = {}
        for record in lang_dicts:
            records_by_type.setdefault(record["task_type"], record)
        max_mt_length = max(
            (len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None),
            default=0,
        )

        columns = st.columns(len(tasks))
        for task_name, col in zip(tasks, columns):
            dic = records_by_type.get(task_types[task_name])
            with col:
                st.header(task_name)
                if dic is None:
                    st.warning("No data available for this setting.")
                    continue
                st.markdown(
                    f"<b>Translator</b>: {_escape(dic['subject_id'])}",
                    unsafe_allow_html=True,
                )
                mt_text = dic["mt_text"]
                if mt_text is None:
                    # Invisible filler so columns without MT output keep
                    # roughly the same height as the ones that have it.
                    mt_text = (
                        "<span style='opacity:0'>"
                        + "O " * (max_mt_length // 2)
                        + "</span>"
                    )
                else:
                    mt_text = _escape(mt_text)
                st.markdown(f"<b>MT</b>: {mt_text}", unsafe_allow_html=True)
                st.markdown(
                    f"<b>PE</b>: {_escape(dic['tgt_text'])}",
                    unsafe_allow_html=True,
                )
                st.markdown("<b>Aligned edits</b>:", unsafe_allow_html=True)
                if dic["aligned_edit"] is not None:
                    # The stored string uses literal "\n" sequences as line
                    # breaks; REF/HYP are relabelled as MT/PE for display.
                    st.text(
                        dic["aligned_edit"]
                        .replace("\\n", "\n")
                        .replace("REF:", "MT :")
                        .replace("HYP:", "PE :")
                    )
                else:
                    st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")
                st.markdown("<b>Metadata</b>:", unsafe_allow_html=True)
                st.json({k: v for k, v in dic.items() if k not in shown_fields})
|