Spaces:

GroNLP
/

divemt_explorer

Sleeping

File size: 6,804 Bytes

02e892d
6c35910
02e892d
497b1c6
74105b6
6c35910
 
 
 
228c5fe
6c35910
497b1c6
affa8ee
c08f926
 
497b1c6
c08f926
497b1c6
c08f926
497b1c6
c08f926
6c35910
228c5fe
 
 
 
 
 
 
 
 
497b1c6
 
 
 
 
 
 
 
 
 
 
 
6c35910
 
 
 
 
 
 
 
 
 
497b1c6
 
6c35910
c08f926
497b1c6
6c35910
 
497b1c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd9bf85
 
 
 
497b1c6
bd9bf85
 
497b1c6
448a3a8
2e7caf3
 
1f483bd
497b1c6
2e7caf3
448a3a8
 
bd9bf85
 
 
74105b6
497b1c6
d3fc5d0
74105b6
497b1c6
74105b6
228c5fe
d3fc5d0
 
6a50007
d3fc5d0
 
 
 
 
9bc55ff
d3fc5d0
497b1c6
 
 
 
d3fc5d0
 
74105b6

import math
import os
import tempfile
import urllib
import urllib.error
import urllib.request

import streamlit as st
from datasets import load_dataset
from inseq import FeatureAttributionOutput

# Use the wide page layout for the whole app.
st.set_page_config(layout="wide")

# Load the DivEMT corpus and flatten its "train" split into a DataFrame.
dataset = load_dataset("GroNLP/divemt")
df = dataset["train"].to_pandas()

# URL template of the attribution files; the placeholders are filled in
# with the item index, language code, setting and sentence type.
attribution_path = (
    "https://huggingface.co/datasets/inseq/divemt_attributions/resolve/main/"
    "divemt-attributions/{lang}/{idx}_{lang}_gradl2_{setting}_{sentence_type}.json.gz"
)

# One row per item id, with display-friendly column names for the overview table.
source_columns = df[["item_id", "src_text"]]
unique_src = source_columns.drop_duplicates(subset="item_id").rename(
    columns={"item_id": "Item ID", "src_text": "Source text"}
)

# Language codes present in the data, in order of first appearance.
lang_ids = df["lang_id"].unique()
langs = list(lang_ids)

st.title("DivEMT Explorer 🔍 🌍")
st.markdown("""
##### The DivEMT Explorer is a tool to explore translations, edits and errors in the [DivEMT dataset](https://huggingface.co/datasets/GroNLP/divemt).

The table below shows the 430 source sentences taken from Flores-101 and translated into six typologically diverse languages to build the DivEMT corpus. When you find a sentence you would like to inspect closely, insert its numeric id (between 0 and 429) in the box below, and select all the available languages you want to use for visualizing the results.

Inside every language section, you will find the translations for all the available settings, alongside aligned edits and all collected metadata. You can filter the settings to see only cases you are interested in. In the **Attributions** section, you can find attribution maps computed using the [Inseq library](https://github.com/inseq-team/inseq) and the mBART model.
""")

# Three-letter DivEMT language ids mapped to two-letter language codes
# (used when building attribution file URLs).
divemt_to_spacy_lang_map = dict(
    ara="ar",
    nld="nl",
    ita="it",
    tur="tr",
    ukr="uk",
    vie="vi",
)

# Three-letter DivEMT language ids mapped to the English language names
# shown in the UI.
divemt_to_labels_lang_map = dict(
    ara="Arabic",
    nld="Dutch",
    ita="Italian",
    tur="Turkish",
    ukr="Ukrainian",
    vie="Vietnamese",
)

# Overview table listing every source sentence with its item id.
st.dataframe(unique_src)

# Item and language pickers sit side by side; the third, wider column is
# left empty as a spacer.
col1_main, col2_main, _ = st.columns([1, 1, 3])
item_id = col1_main.number_input(
    "Select an item (0-429) to inspect",
    min_value=0,
    max_value=len(unique_src) - 1,
)
langs = col2_main.multiselect(
    "Select languages",
    options=langs,
    format_func=lambda code: divemt_to_labels_lang_map[code],
)

# Show the selected source sentence, highlighted in red.
st.markdown("##### Source text")
selected_source = unique_src.iloc[int(item_id)]["Source text"]
st.markdown(
    f"##### <span style='color: #ff4b4b'> {selected_source}</span>",
    unsafe_allow_html=True,
)

# Display names of the three translation settings, in ht / pe1 / pe2 order.
task_names = ["From Scratch (HT)", "Google PE (PE1)", "mBART PE (PE2)"]
# Render one section per selected language: a settings filter followed by one
# expander per selected setting with its texts, aligned edits, metadata and
# (for the post-editing settings) attribution maps.

# Display name -> task_type value; relies on task_names being in ht/pe1/pe2 order.
task_types = dict(zip(task_names, ("ht", "pe1", "pe2")))

for lang in langs:
    lang_label = divemt_to_labels_lang_map[lang]
    st.markdown(f"## {lang_label}")
    c1, _ = st.columns([1.5, 1])
    with c1:
        tasks = st.multiselect(
            "Select settings",
            options=task_names,
            default=task_names,
            key=f"{lang}_tasks",
        )

    # All rows of the selected item for this language.
    selected_item = unique_src.iloc[int(item_id)]["Item ID"]
    lang_data = df[(df["item_id"] == selected_item) & (df["lang_id"] == lang)]
    lang_dicts = lang_data.to_dict("records")

    # Keep the first record of each task type; a missing type is reported in
    # the UI below instead of raising IndexError.
    records_by_type = {}
    for record in lang_dicts:
        records_by_type.setdefault(record["task_type"], record)

    # Longest MT output, used to size the invisible placeholder shown when a
    # record has no MT text. default=0 covers the case of no MT text at all.
    max_mt_length = max(
        (len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None),
        default=0,
    )

    # Arabic text is wrapped in <bdi> so it is isolated from the surrounding
    # left-to-right markup.
    bdi_open, bdi_close = ("<bdi>", "</bdi>") if lang == "ara" else ("", "")

    for task_name in tasks:
        setting = task_types[task_name]
        dic = records_by_type.get(setting)
        if dic is None:
            st.warning(f"No {task_name} entry is available for this item in {lang_label}.")
            continue

        with st.expander(f"{task_name}"):
            st.markdown(f"### {task_name}")
            st.markdown(f"<b>Translator</b>: {dic['subject_id']}", unsafe_allow_html=True)

            mt_text = dic["mt_text"]
            if mt_text is None:
                # Transparent filler text standing in for the missing MT output.
                mt_text = "<span style='opacity:0'>" + "O " * (max_mt_length // 2) + "</span>"
            shown_mt = mt_text if mt_text != "nan" else "N/A"
            st.markdown(f"<b>MT</b>: {bdi_open}{shown_mt}{bdi_close}", unsafe_allow_html=True)
            st.markdown(f"<b>PE</b>: {bdi_open}{dic['tgt_text']}{bdi_close}", unsafe_allow_html=True)

            st.markdown("<b>Aligned edits</b>:", unsafe_allow_html=True)
            aligned_edit = dic["aligned_edit"]
            if isinstance(aligned_edit, str) and aligned_edit != "nan":
                eval_parts = aligned_edit.split("EVAL: ")
                if lang == "ara" and len(eval_parts) == 2:
                    # For Arabic the EVAL row is padded and reversed so that it
                    # lines up with the right-to-left text rows.
                    head, edits_reverse = eval_parts
                    # HACK: the -10 offset makes things aligned most of the
                    # time; it is grounded in empirical observation only.
                    # A negative padding simply adds nothing.
                    first_row_length = len(aligned_edit.split("\\n")[0])
                    edits_reverse += " " * (first_row_length - len(edits_reverse) - 10)
                    aligned_edit = head + "EVAL: " + edits_reverse[::-1]
                # The data stores line breaks as a literal backslash-n sequence.
                aligned_edit = (
                    aligned_edit.replace("\\n", "\n")
                    .replace("REF:", "MT :")
                    .replace("HYP:", "PE :")
                )
                st.text(aligned_edit)
            else:
                st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")

            st.markdown("<b>Metadata</b>:", unsafe_allow_html=True)
            hidden_keys = ("src_text", "mt_text", "tgt_text", "aligned_edit")
            st.json({k: v for k, v in dic.items() if k not in hidden_keys}, expanded=False)

            st.markdown("<b>Attributions</b>:", unsafe_allow_html=True)
            if setting == "ht":
                st.markdown(
                    "<i>Attributions are available only for machine-translated outputs.</i>",
                    unsafe_allow_html=True,
                )
                continue

            st.markdown(
                "<i>Click on checkboxes to show/hide the respective attributions computed with mBART.</i>",
                unsafe_allow_html=True,
            )
            for sentence_type in ["mt", "pe", "diff"]:
                url = attribution_path.format(
                    idx=item_id,
                    setting=setting,
                    sentence_type=sentence_type,
                    lang=divemt_to_spacy_lang_map[lang],
                )
                checkbox_key = f"{lang}_{task_name}_{sentence_type}"
                # Only the download sits inside the try block; the response is
                # closed as soon as its payload has been read.
                try:
                    with urllib.request.urlopen(url) as response:
                        payload = response.read()
                except (urllib.error.HTTPError, urllib.error.URLError):
                    st.checkbox(
                        sentence_type.upper() + " (NOT AVAILABLE)",
                        key=checkbox_key,
                        disabled=True,
                    )
                    continue

                if st.checkbox(sentence_type.upper(), key=checkbox_key):
                    # The loader takes a file path, so the payload is written to
                    # a private temporary directory that is removed afterwards.
                    # This avoids clashes on a shared file name in the working
                    # directory and leaves no files behind.
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        fpath = os.path.join(tmp_dir, f"attr_{lang}_{setting}_{sentence_type}.json.gz")
                        with open(fpath, "wb") as f:
                            f.write(payload)
                        attr = FeatureAttributionOutput.load(fpath, decompress=True)
                    st.markdown(
                        f"{attr.show(return_html=True, display=False, do_aggregation=False)}",
                        unsafe_allow_html=True,
                    )
# Page footer: a line break followed by the author credit.
footer_spacer_html = "</br>"
footer_credit = "*Built by [Gabriele Sarti](https://gsarti.com)*"
st.markdown(footer_spacer_html, unsafe_allow_html=True)
st.markdown(footer_credit)