# Streamlit app that loads the "GroNLP/divemt" dataset and lets the user pick a
# source sentence and a set of languages, then shows for each language the
# translations, aligned edits, metadata and (for the post-editing settings)
# feature attributions downloaded from the Hugging Face Hub.

import urllib.error
import urllib.request

import streamlit as st
from datasets import load_dataset
from inseq import FeatureAttributionOutput

st.set_page_config(layout="wide")

dataset = load_dataset("GroNLP/divemt")

# NOTE(review): the directory segment is hard-coded to "it" while the file name
# uses {lang}; confirm this matches the layout of the attributions repository.
attribution_path = "https://huggingface.co/datasets/inseq/divemt_attributions/resolve/main/divemt-attributions/it/{idx}_{lang}_gradl2_{setting}_{sentence_type}.json.gz"

# Display name of each setting -> value it has in the "task_type" column.
TASK_TYPES = {
    "From Scratch (HT)": "ht",
    "Google PE (PE1)": "pe1",
    "mBART PE (PE2)": "pe2",
}
task_names = list(TASK_TYPES)

# Record fields that are rendered on their own and therefore left out of the
# collapsed metadata view.
TEXT_FIELDS = ("src_text", "mt_text", "tgt_text", "aligned_edit")


def _load_attribution(url, fpath):
    """Download ``url`` to ``fpath`` and load it as a FeatureAttributionOutput.

    Returns None when the download fails with a urllib error, so the caller
    can show the entry as unavailable. Errors raised while writing or parsing
    the file are not caught.
    """
    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(url) as response:
            payload = response.read()
    except urllib.error.URLError:
        # HTTPError is a subclass of URLError, so this covers both.
        return None
    with open(fpath, "wb") as f:
        f.write(payload)
    return FeatureAttributionOutput.load(fpath, decompress=True)


df = dataset["train"].to_pandas()
unique_src = df[["item_id", "src_text"]].drop_duplicates(subset="item_id")
available_langs = list(df["lang_id"].unique())

st.title("DivEMT Explorer 🔍 🌍")
st.markdown("""
##### The DivEMT Explorer is a tool to explore translations and edits in the DivEMT corpus.
##### Use the expandable section "Explore examples" below to visualize some of the original source sentences. When you find an interesting sentence, insert its numeric id (between 0 and 429) in the box below, and select all the available languages you want to use for visualizing the results.
##### Inside every generated language section, you will find the translations for all the available settings, alongside aligned edits and a collection of collected metadata. You can filter the shown settings to see the aligned edits annotations.
""")

with st.expander("Explore examples"):
    col1, col2, _ = st.columns([3, 2, 5])
    with col1:
        offset = st.slider(
            "Select an offset",
            min_value=0,
            max_value=len(unique_src) - 5,
            value=0,
        )
    with col2:
        count = st.number_input(
            'Select the number of examples to display',
            min_value=3,
            max_value=len(unique_src),
            value=5,
        )
    # Positional window of `count` source sentences starting at `offset`.
    st.table(unique_src.iloc[offset:int(offset + count)])

col1_main, col2_main, _ = st.columns([1, 1, 3])
with col1_main:
    item_id = st.number_input(
        'Select an item (0-429) to inspect',
        min_value=0,
        max_value=len(unique_src) - 1,
    )
with col2_main:
    langs = st.multiselect(
        'Select languages',
        options=available_langs,
    )

# `item_id` is a position into `unique_src`; the row's "item_id" column is the
# identifier used to filter the full dataframe below.
selected_src = unique_src.iloc[int(item_id)]

st.markdown("##### Source text")
st.markdown("##### " + selected_src["src_text"], unsafe_allow_html=True)

for lang in langs:
    with st.expander(f"View {lang.upper()} data"):
        c1, _ = st.columns([1, 2])
        with c1:
            tasks = st.multiselect(
                'Select settings',
                options=task_names,
                default=task_names,
                key=f"{lang}_tasks",
            )

        lang_data = df[(df["item_id"] == selected_src["item_id"]) & (df["lang_id"] == lang)]
        lang_dicts = lang_data.to_dict("records")

        # First record per task type (setdefault keeps the earliest one).
        records_by_type = {}
        for record in lang_dicts:
            records_by_type.setdefault(record["task_type"], record)

        # Longest MT output for this item/language; 0 when no record has one.
        max_mt_length = max(
            (len(x["mt_text"]) for x in lang_dicts if x["mt_text"] is not None),
            default=0,
        )

        for task_name in tasks:
            setting = TASK_TYPES[task_name]
            dic = records_by_type.get(setting)
            if dic is None:
                st.warning(f"No {task_name} record found for {lang.upper()}.")
                continue

            st.header(task_name)
            st.markdown(f"Translator: {dic['subject_id']}", unsafe_allow_html=True)

            mt_text = dic["mt_text"]
            if mt_text is None:
                # Placeholder of "O " repeated to roughly half the longest MT
                # length, shown when the record has no MT output.
                mt_text = "O " * (max_mt_length // 2)

            # NOTE(review): these two lines previously interpolated
            # `'' if lang == 'ara' else ''` around the text, which had no
            # effect; presumably right-to-left markup for Arabic was intended.
            st.markdown(f"MT: {mt_text}", unsafe_allow_html=True)
            st.markdown(f"PE: {dic['tgt_text']}", unsafe_allow_html=True)

            st.markdown("Aligned edits:", unsafe_allow_html=True)
            if dic["aligned_edit"] is not None:
                # The stored value uses literal "\n" sequences as line breaks;
                # relabel REF/HYP as MT/PE for display.
                aligned_edit = (
                    dic["aligned_edit"]
                    .replace("\\n", "\n")
                    .replace("REF:", "MT :")
                    .replace("HYP:", "PE :")
                )
                st.text(aligned_edit)
            else:
                st.text("MT : N/A\nPE : N/A\nEVAL: N/A\n")

            st.markdown("Metadata:", unsafe_allow_html=True)
            st.json(
                {k: v for k, v in dic.items() if k not in TEXT_FIELDS},
                expanded=False,
            )

            # Attributions are only requested for the post-editing settings.
            if setting == "ht":
                continue

            st.markdown("Attributions:", unsafe_allow_html=True)
            st.text("Click on checkboxes to show/hide the respective attributions computed with mBART 1-to-50.")
            for sentence_type in ["mt", "pe", "diff"]:
                url = attribution_path.format(
                    idx=item_id,
                    setting=setting,
                    sentence_type=sentence_type,
                    lang=lang,
                )
                fpath = f"attr_{lang}_{sentence_type}.json.gz"
                checkbox_key = f"{lang}_{task_name}_{sentence_type}"

                attr = _load_attribution(url, fpath)
                if attr is None:
                    st.checkbox(
                        sentence_type.upper() + " (NOT AVAILABLE)",
                        key=checkbox_key,
                        disabled=True,
                    )
                elif st.checkbox(sentence_type.upper(), key=checkbox_key):
                    st.markdown(
                        f"{attr.show(return_html=True, display=False, do_aggregation=False)}",
                        unsafe_allow_html=True,
                    )