# Spaces: robustness-gym / summvis
# Runtime error
# File size: 25,210 Bytes
import logging
import os
from argparse import ArgumentParser
from ast import literal_eval
from types import SimpleNamespace
from typing import List

from robustnessgym import Dataset, Spacy, CachedOperation
from robustnessgym.core.constants import CACHEDOPS
from robustnessgym.core.tools import strings_as_json
from robustnessgym.logging.utils import set_logging_level
from spacy import load
from spacy.attrs import DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER, TAG, SENT_END, \
    SENT_START, ORTH, POS, ENT_IOB
from spacy.tokens import Doc

from align import BertscoreAligner, NGramAligner, StaticEmbeddingAligner
from utils import preprocess_text

# Set Robustness Gym's logging level to 'critical' via its own helper
# (presumably this suppresses its lower-severity output -- TODO confirm).
set_logging_level('critical')
# Module-level logger; only CRITICAL records pass its level check.
logger = logging.getLogger(__name__)
logger.setLevel(logging.CRITICAL)


def _spacy_encode(self, x):
    """Turn a parsed document into a dict of plain, storable pieces.

    Installed on `Spacy` as its `encode` method further down in this module.

    Args:
        x: the object to encode. It must provide `to_array(attrs)` and iterate
            over tokens exposing `.text` (expected to be a spaCy `Doc`).

    Returns:
        A dict with the flattened attribute array under 'arr', the array's
        original shape (as a list) under 'shape', and the token texts under
        'words'.
    """
    # Token attributes exported per token; `_spacy_decode` uses the same order
    attrs = [
        DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER,
        TAG, SENT_END, SENT_START, ORTH, POS, ENT_IOB,
    ]
    attr_matrix = x.to_array(attrs)

    encoded = {}
    encoded['arr'] = attr_matrix.flatten()
    encoded['shape'] = list(attr_matrix.shape)
    encoded['words'] = [token.text for token in x]
    return encoded


def _spacy_decode(self, x):
    """Rebuild a `Doc` from the dict produced by `_spacy_encode`.

    Installed on `Spacy` as its `decode` method further down in this module.

    Args:
        x: mapping with 'words' (token texts), 'arr' (flattened attribute
            array) and 'shape' (the shape to restore before loading).

    Returns:
        The result of `Doc.from_array` on a `Doc` created from `x['words']`
        with the vocabulary of `self.nlp`.
    """
    # Same attribute order as used when the array was exported
    attrs = [
        DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER,
        TAG, SENT_END, SENT_START, ORTH, POS, ENT_IOB,
    ]
    rebuilt = Doc(self.nlp.vocab, words=x['words'])
    attr_matrix = x['arr'].reshape(x['shape'])
    return rebuilt.from_array(attrs, attr_matrix)


# Monkey-patch the imported `Spacy` class: replace its `encode` / `decode`
# attributes with the array-based implementations defined above.
Spacy.encode = _spacy_encode
Spacy.decode = _spacy_decode


class AlignerCap(CachedOperation):
    """Cached operation that runs an aligner on spaCy-processed columns.

    The first column passed to `apply` is aligned against the remaining
    column(s); the spaCy results are looked up through `self.spacy.retrieve`.
    """

    def __init__(self, aligner, spacy, **kwargs):
        """Store the aligner and the spaCy operation.

        Args:
            aligner: object exposing `align(source, targets)`.
            spacy: operation exposing `retrieve(batch, column)`.
            **kwargs: forwarded unchanged to `CachedOperation.__init__`.
        """
        super().__init__(**kwargs)
        self.spacy = spacy
        self.aligner = aligner

    @classmethod
    def encode(cls, x):
        """Convert alignments to built-in types, then defer to the parent `encode`.

        Keys are stringified and every match becomes an `(int, float)` tuple,
        replacing np.int / np.float values.
        """
        plain = []
        for alignment in x:
            converted = {}
            for key, matches in alignment.items():
                converted[str(key)] = [
                    (int(match[0]), float(match[1])) for match in matches
                ]
            plain.append(converted)
        return super().encode(plain)

    @classmethod
    def decode(cls, x):
        """Decode with the parent `decode`, then turn string keys back into literals."""
        decoded = super().decode(x)
        return [
            {literal_eval(key): value for key, value in alignment.items()}
            for alignment in decoded
        ]

    def apply(self, batch, columns, *args, **kwargs):
        """Align the first example of the batch.

        Args:
            batch: batch handed to `self.spacy.retrieve`.
            columns: `columns[0]` is the source column, the rest are targets.

        Returns:
            A one-element list holding the aligner's output.
        """
        # Only index 0 of each retrieved column is used
        source = self.spacy.retrieve(batch, columns[0])[0]
        if len(columns) > 2:
            targets = [self.spacy.retrieve(batch, col)[0] for col in columns[1:]]
        else:
            targets = [self.spacy.retrieve(batch, columns[1])[0]]
        return [self.aligner.align(source, targets)]


class BertscoreAlignerCap(AlignerCap):
    """`AlignerCap` whose aligner is a `BertscoreAligner`."""

    def __init__(self, threshold: float, top_k: int, spacy):
        """Build the aligner and pass `threshold` / `top_k` on to the parent too.

        Args:
            threshold: forwarded to `BertscoreAligner` and to the parent init.
            top_k: forwarded to `BertscoreAligner` and to the parent init.
            spacy: spaCy operation used to retrieve processed columns.
        """
        bertscore_aligner = BertscoreAligner(threshold=threshold, top_k=top_k)
        super().__init__(
            aligner=bertscore_aligner,
            spacy=spacy,
            threshold=threshold,
            top_k=top_k,
        )


class NGramAlignerCap(AlignerCap):
    """`AlignerCap` whose aligner is an `NGramAligner`."""

    def __init__(self, spacy):
        """Build a default `NGramAligner` and hand it to the parent.

        Args:
            spacy: spaCy operation used to retrieve processed columns.
        """
        ngram_aligner = NGramAligner()
        super().__init__(aligner=ngram_aligner, spacy=spacy)


class StaticEmbeddingAlignerCap(AlignerCap):
    """`AlignerCap` whose aligner is a `StaticEmbeddingAligner`."""

    def __init__(self, threshold: float, top_k: int, spacy):
        """Build the aligner and pass `threshold` / `top_k` on to the parent too.

        Args:
            threshold: forwarded to `StaticEmbeddingAligner` and to the parent init.
            top_k: forwarded to `StaticEmbeddingAligner` and to the parent init.
            spacy: spaCy operation used to retrieve processed columns.
        """
        embedding_aligner = StaticEmbeddingAligner(threshold=threshold, top_k=top_k)
        super().__init__(
            aligner=embedding_aligner,
            spacy=spacy,
            threshold=threshold,
            top_k=top_k,
        )


def _run_aligners(
        dataset: Dataset,
        aligners: List[CachedOperation],
        doc_column: str,
        reference_column: str,
        summary_columns: List[str] = None,
):
    """Run every aligner on the dataset and split grouped results per column pair.

    For each aligner, the document column is aligned against the reference
    (if any) and all summary columns in one call; when a reference and at
    least one summary exist, the reference is additionally aligned against
    the summaries. Whenever one call covered more than one target column,
    its combined output column is replaced by one column per
    (source, target) pair and the combined entry is dropped from the
    dataset's cached-operation history.

    Args:
        dataset: dataset passed to (and returned by) each aligner call.
        aligners: operations that are callable as
            `aligner(dataset, columns, batch_size=1)` and provide
            `retrieve`, `encode` and `identifier`.
        doc_column: name of the document column.
        reference_column: name of the reference summary column, or None.
        summary_columns: names of the other summary columns; None or empty
            is treated as an empty list.

    Returns:
        The dataset after all aligners have run and columns were split.
    """
    if not summary_columns:
        summary_columns = []

    # All columns the document is compared against: reference first, then summaries
    to_columns = []
    if reference_column is not None:
        to_columns.append(reference_column)
    to_columns.extend(summary_columns)

    for aligner in aligners:

        # Run the aligner on (document, summary) pairs

        dataset = aligner(
            dataset,
            [doc_column] + to_columns,
            # Must use `batch_size = 1`
            batch_size=1,
        )

        if reference_column is not None and len(summary_columns):
            # Run the aligner on (reference, summary) pairs
            dataset = aligner(
                dataset,
                [reference_column] + summary_columns,
                # Must use `batch_size = 1`
                batch_size=1,
            )

        # With a single target column there is nothing to split
        if len(to_columns) > 1:
            # Instead of having one column for (document, summary) comparisons, split
            # off into (1 + |summary_columns|) total columns, one for each comparison

            # Retrieve the (document, summary) column
            # (the retrieved mapping is indexed by the tuple of column names)
            doc_summary_column = aligner.retrieve(
                dataset[:],
                [doc_column] + to_columns,
            )[tuple([doc_column] + to_columns)]

            for i, col in enumerate(to_columns):
                # Add as a new column after encoding with the aligner's `encode` method
                # (`row[i]` is the entry for the i-th target column of that row)
                dataset.add_column(
                    column=str(aligner.identifier(columns=[doc_column, col])),
                    values=[aligner.encode([row[i]]) for row in doc_summary_column],
                )

            # Remove the (document, summary) column
            dataset.remove_column(
                str(
                    aligner.identifier(
                        columns=[doc_column] + to_columns
                    )
                )
            )
            # Also drop the combined entry from the cached-operation history
            del dataset.interactions[CACHEDOPS].history[
                (
                    aligner.identifier,
                    strings_as_json(
                        strings=[doc_column] + to_columns
                    )
                )
            ]

        # Same splitting for the (reference, summaries) call, which only
        # needs it when more than one summary was compared
        if reference_column is not None and len(summary_columns) > 1:
            # Instead of having one column for (reference, summary) comparisons, split
            # off into (|summary_columns|) total columns, one for each comparison

            # Retrieve the (reference, summary) column
            reference_summary_column = aligner.retrieve(
                dataset[:],
                [reference_column] + summary_columns,
            )[tuple([reference_column] + summary_columns)]

            for i, col in enumerate(summary_columns):
                # Add as a new column
                dataset.add_column(
                    column=str(aligner.identifier(columns=[reference_column, col])),
                    values=[
                        aligner.encode([row[i]]) for row in reference_summary_column
                    ]
                )

            # Remove the (reference, summary) column
            dataset.remove_column(
                str(
                    aligner.identifier(
                        columns=[reference_column] + summary_columns
                    )
                )
            )
            # Also drop the combined entry from the cached-operation history
            del dataset.interactions[CACHEDOPS].history[
                (
                    aligner.identifier,
                    strings_as_json(
                        strings=[reference_column] + summary_columns
                    )
                )
            ]

    return dataset


def deanonymize_dataset(
        rg_path: str,
        standardized_dataset: Dataset,
        processed_dataset_path: str = None,
        n_samples: int = None,
):
    """Put the original text columns back into an anonymized dataset.

    The 'document' column (and 'summary:reference', when the standardized
    dataset has it) is copied over, preprocessed, run through spaCy, and the
    result is saved to `processed_dataset_path`.

    Args:
        rg_path: path of the anonymized dataset, read with
            `Dataset.load_from_disk`.
        standardized_dataset: dataset providing the original text columns.
        processed_dataset_path: where the resulting dataset is saved; required.
        n_samples: if truthy, both datasets are restricted to their first
            `n_samples` rows.

    Returns:
        The processed dataset.

    Raises:
        AssertionError: if `processed_dataset_path` is None.
    """
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Read the anonymized dataset from disk
    restored = Dataset.load_from_disk(rg_path)

    if n_samples:
        for ds in (restored, standardized_dataset):
            ds.set_visible_rows(list(range(n_samples)))

    # Text columns to bring back: the document always, the reference if present
    text_columns = ['document']
    if 'summary:reference' in standardized_dataset.column_names:
        text_columns.append('summary:reference')
    for name in text_columns:
        restored.add_column(name, standardized_dataset[name])

    # Add a `preprocessed_<column>` entry for every restored text column
    restored = restored.update(
        lambda example: {
            f'preprocessed_{col}': preprocess_text(example[col])
            for col in text_columns
        }
    )

    # Prefer the large English model; use the small one if loading raises OSError
    try:
        nlp = load('en_core_web_lg')
    except OSError:
        nlp = load('en_core_web_sm')
    nlp.add_pipe('sentencizer', before="parser")

    # Run the spaCy operation on all preprocessed text columns
    spacy = Spacy(nlp=nlp)
    preprocessed_columns = [f'preprocessed_{col}' for col in text_columns]
    restored = spacy(restored, preprocessed_columns, batch_size=100)

    # Persist the result
    restored.save_to_disk(processed_dataset_path)

    return restored


def run_workflow(
        jsonl_path: str = None,
        dataset: Dataset = None,
        doc_column: str = None,
        reference_column: str = None,
        summary_columns: List[str] = None,
        bert_aligner_threshold: float = 0.5,
        bert_aligner_top_k: int = 3,
        embedding_aligner_threshold: float = 0.5,
        embedding_aligner_top_k: int = 3,
        processed_dataset_path: str = None,
        n_samples: int = None,
        anonymize: bool = False,
):
    """Preprocess, parse and align a summarization dataset, then save it.

    Steps: load the dataset (from `jsonl_path` unless `dataset` is given),
    resolve the document / reference / summary column names, preprocess the
    text columns, run spaCy on them, run the three aligners, and save the
    result (optionally after removing the document and reference columns).

    Args:
        jsonl_path: path to a jsonl file; exactly one of `jsonl_path` and
            `dataset` must be given.
        dataset: an already loaded dataset.
        doc_column: document column; defaults to 'document'.
        reference_column: reference summary column; defaults to
            'summary:reference' if that column exists, otherwise no
            reference is used.
        summary_columns: other summary columns; when None or empty, every
            column starting with "summary:" except "summary:reference" is used.
        bert_aligner_threshold: threshold for `BertscoreAlignerCap`.
        bert_aligner_top_k: top-k for `BertscoreAlignerCap`.
        embedding_aligner_threshold: threshold for `StaticEmbeddingAlignerCap`.
        embedding_aligner_top_k: top-k for `StaticEmbeddingAlignerCap`.
        processed_dataset_path: where to save the result; required. With
            `anonymize`, '.anonymized' is appended.
        n_samples: if truthy, restrict the dataset to its first `n_samples` rows.
        anonymize: drop the document and reference columns (raw,
            preprocessed and spaCy output) before saving.

    Returns:
        The processed dataset.

    Raises:
        AssertionError: on invalid `jsonl_path` / `dataset` combination,
            missing `processed_dataset_path`, or a missing default document
            column.
        ValueError: if there is neither a summary column nor a reference.
    """
    assert (jsonl_path is None) != (dataset is None), \
        "One of `jsonl_path` and `dataset` must be specified."
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Read the jsonl file when no dataset object was handed in
    if jsonl_path is not None:
        dataset = Dataset.from_jsonl(jsonl_path)

    # Fall back to the conventional document column name
    if doc_column is None:
        doc_column = 'document'
        assert doc_column in dataset.column_names, \
            f"`doc_column={doc_column}` is not a column in dataset."
        print("Assuming `doc_column` is called 'document'.")

    # Fall back to the conventional reference column name, if it exists
    if reference_column is None:
        print("Assuming `reference_column` is called 'summary:reference'.")
        if 'summary:reference' in dataset.column_names:
            reference_column = 'summary:reference'
        else:
            print("No reference summary loaded")

    # Discover summary columns by their "summary:" prefix
    if not summary_columns:
        summary_columns = [
            name for name in dataset.column_names
            if name.startswith("summary:") and name != "summary:reference"
        ]
        print(f"Reading summary columns from dataset. Found {summary_columns}.")

    if reference_column is None and len(summary_columns) == 0:
        raise ValueError("At least one summary is required")

    # Keep only the first `n_samples` rows visible
    if n_samples:
        dataset.set_visible_rows(list(range(n_samples)))

    # Document first, then the reference (if any), then the summaries
    text_columns = [doc_column]
    if reference_column:
        text_columns.append(reference_column)
    text_columns = text_columns + summary_columns

    # Add a `preprocessed_<column>` entry for every text column
    dataset = dataset.update(
        lambda example: {
            f'preprocessed_{name}': preprocess_text(example[name])
            for name in text_columns
        }
    )

    # Parse every preprocessed column with spaCy
    nlp = load('en_core_web_lg')
    nlp.add_pipe('sentencizer', before="parser")
    spacy = Spacy(nlp=nlp)
    preprocessed_columns = [f'preprocessed_{name}' for name in text_columns]
    dataset = spacy(dataset, preprocessed_columns, batch_size=100)

    # The three aligners, in the order they are run
    aligners = [
        BertscoreAlignerCap(
            threshold=bert_aligner_threshold,
            top_k=bert_aligner_top_k,
            spacy=spacy,
        ),
        StaticEmbeddingAlignerCap(
            threshold=embedding_aligner_threshold,
            top_k=embedding_aligner_top_k,
            spacy=spacy,
        ),
        NGramAlignerCap(spacy=spacy),
    ]

    preprocessed_reference = None
    if reference_column:
        preprocessed_reference = f'preprocessed_{reference_column}'

    dataset = _run_aligners(
        dataset=dataset,
        aligners=aligners,
        doc_column=f'preprocessed_{doc_column}',
        reference_column=preprocessed_reference,
        summary_columns=[f'preprocessed_{name}' for name in summary_columns],
    )

    save_path = processed_dataset_path
    if anonymize:
        # Strip the document and reference text, in raw, preprocessed and
        # spaCy-parsed form, together with the matching history entries
        for name in (doc_column, reference_column):
            if name is None:
                continue
            preprocessed_name = f'preprocessed_{name}'
            dataset.remove_column(name)
            dataset.remove_column(preprocessed_name)
            dataset.remove_column(
                str(spacy.identifier(columns=[preprocessed_name]))
            )
            del dataset.interactions[CACHEDOPS].history[
                (spacy.identifier, preprocessed_name)
            ]
        save_path = f'{processed_dataset_path}.anonymized'

    # Write the result to disk
    dataset.save_to_disk(save_path)

    return dataset


def parse_prediction_jsonl_name(prediction_jsonl: str):
    """Parse the name of the prediction_jsonl to extract useful information.

    The file name (the part after the last "/") must have the form
    `<model>-<training dataset>.<eval dataset>.<eval split>.results.anonymized`.
    The model / training-dataset boundary is the LAST hyphen, so hyphenated
    model names such as `t5-small-xsum` parse as model `t5-small` trained on
    `xsum` (this assumes the training dataset name itself has no hyphen).

    Args:
        prediction_jsonl: path or file name of a prediction file.

    Returns:
        A `SimpleNamespace` with `model_train_dataset`, `model`,
        `train_dataset`, `eval_dataset` and `eval_split`.

    Raises:
        NotImplementedError: if the file name does not follow the format,
            including names that carry the right suffix but the wrong number
            of dot-separated fields or no hyphen in the first field.
    """
    # Analyze the name of the prediction_jsonl
    filename = prediction_jsonl.split("/")[-1]
    suffix = ".results.anonymized"

    if filename.endswith(suffix):
        # Fmt: <model>-<training dataset>.<eval dataset>.<eval split>.results.anonymized
        fields = filename[:-len(suffix)].split(".")

        if len(fields) == 3:
            model_train_dataset, eval_dataset, eval_split = fields
            # Split on the last hyphen only; `hyphen` is empty when none exists
            model, hyphen, train_dataset = model_train_dataset.rpartition("-")

            if hyphen:
                return SimpleNamespace(
                    model_train_dataset=model_train_dataset,
                    model=model,
                    train_dataset=train_dataset,
                    eval_dataset=eval_dataset,
                    eval_split=eval_split,
                )

    # Every malformed name ends up here with the same actionable message
    raise NotImplementedError(
        "Prediction files must be named "
        "<model>-<training dataset>.<eval dataset>.<eval split>.results.anonymized. "
        f"Please rename the prediction file {filename} and run again."
    )


def join_predictions(
        dataset_jsonl: str = None,
        prediction_jsonls: List[str] = None,
        save_jsonl_path: str = None,
):
    """Join predictions with a dataset.

    Every column of every prediction file is added to the dataset, except
    the indexing columns 'index', 'ix' and 'id'. A 'decoded' column becomes
    `summary:<prefix>`, any other column becomes `<prefix>:<column>`, where
    `<prefix>` is the `<model>-<training dataset>` part of the file name.

    Args:
        dataset_jsonl: path to the jsonl file of the dataset.
        prediction_jsonls: paths to the prediction jsonl files. A single
            path given as a plain string is treated as a one-element list.
        save_jsonl_path: where to write the joined dataset; if falsy the
            dataset is not saved and a notice is printed.

    Returns:
        The dataset with the prediction columns added.

    Raises:
        AssertionError: if `prediction_jsonls` is None.
        NotImplementedError: if a prediction file name cannot be parsed.
    """
    assert prediction_jsonls is not None, "Must have prediction jsonl files."

    # A bare string would otherwise be iterated character by character
    if isinstance(prediction_jsonls, str):
        prediction_jsonls = [prediction_jsonls]

    print(
        "> Warning: please inspect the prediction .jsonl file to make sure that "
        "predictions are aligned with the examples in the dataset. "
        "Use `get_dataset` to inspect the dataset."
    )

    # Load the dataset
    dataset = get_dataset(dataset_jsonl=dataset_jsonl)

    # Parse names of all prediction files to get metadata
    metadata = [
        parse_prediction_jsonl_name(prediction_jsonl)
        for prediction_jsonl in prediction_jsonls
    ]

    # Load the predictions
    predictions = [
        Dataset.from_jsonl(json_path=prediction_jsonl)
        for prediction_jsonl in prediction_jsonls
    ]

    # Columns that only carry indexing information the dataset already has
    index_columns = {'index', 'ix', 'id'}

    # Predictions for a model, paired with the metadata of the same file
    for file_metadata, prediction_data in zip(metadata, predictions):
        # Prefix for columns added to the dataset for this prediction file
        prefix = file_metadata.model_train_dataset

        for col in prediction_data.column_names:
            if col in index_columns:
                continue
            # `add_column` will automatically ensure that column lengths match
            if col == 'decoded':  # rename decoded to summary
                dataset.add_column(f'summary:{prefix}', prediction_data[col])
            else:
                dataset.add_column(f'{prefix}:{col}', prediction_data[col])

    # Save the dataset back to disk
    if save_jsonl_path:
        dataset.to_jsonl(save_jsonl_path)
    else:
        print("Dataset with predictions was not saved since `save_jsonl_path` "
              "was not specified.")

    return dataset


def standardize_dataset(
        dataset_name: str = None,
        dataset_version: str = None,
        dataset_split: str = 'test',
        dataset_jsonl: str = None,
        doc_column: str = None,
        reference_column: str = None,
        save_jsonl_path: str = None,
        no_save: bool = False,
):
    """Load a dataset and rename its text columns to the standard names.

    The document column is renamed to 'document' and the reference summary
    column (if there is one) to 'summary:reference'. Columns that already
    carry the standard name are left untouched.

    Args:
        dataset_name: Huggingface dataset name; for 'cnn_dailymail' and
            'xsum' the column names are known and need not be passed.
        dataset_version: Huggingface dataset version.
        dataset_split: Huggingface dataset split.
        dataset_jsonl: path to a jsonl file, as an alternative to
            `dataset_name`.
        doc_column: name of the document column.
        reference_column: name of the reference summary column; may be None
            when `doc_column` is given and the dataset has no reference.
        save_jsonl_path: explicit path for the standardized jsonl file.
        no_save: when True and `save_jsonl_path` is None, nothing is written.

    Returns:
        The standardized dataset.

    Raises:
        ValueError: if `reference_column` is given without `doc_column`.
        NotImplementedError: if `doc_column` is missing and `dataset_name`
            is not one of the known datasets.
    """
    # Load the dataset (from Huggingface or from the jsonl file)
    dataset = get_dataset(
        dataset_name=dataset_name,
        dataset_version=dataset_version,
        dataset_split=dataset_split,
        dataset_jsonl=dataset_jsonl,
    )

    if doc_column is None:
        if reference_column is not None:
            raise ValueError("You must specify `doc_column` if you specify `reference_column`")
        try:
            doc_column, reference_column = {
                'cnn_dailymail': ('article', 'highlights'),
                'xsum': ('document', 'summary')
            }[dataset_name]
        except KeyError:
            # Only an unknown dataset name is expected here; anything else
            # should surface as the real error instead of being masked
            raise NotImplementedError(
                "Please specify `doc_column`."
            ) from None

    # Rename the columns (copy under the standard name, then drop the old one)
    if doc_column != 'document':
        dataset.add_column('document', dataset[doc_column])
        dataset.remove_column(doc_column)
    # Skip when there is no reference, or when it already has the standard
    # name (copy-then-remove would then target the very same column)
    if reference_column is not None and reference_column != 'summary:reference':
        dataset.add_column('summary:reference', dataset[reference_column])
        dataset.remove_column(reference_column)

    # Save the dataset back to disk
    if save_jsonl_path:
        dataset.to_jsonl(save_jsonl_path)

    elif (save_jsonl_path is None) and not no_save:
        # Auto-create a path to save the standardized dataset
        os.makedirs('preprocessing', exist_ok=True)
        if not dataset_jsonl:
            dataset.to_jsonl(
                f'preprocessing/'
                f'standardized_{dataset_name}_{dataset_version}_{dataset_split}.jsonl'
            )
        else:
            dataset.to_jsonl(
                f'preprocessing/'
                f'standardized_{dataset_jsonl.split("/")[-1]}'
            )

    return dataset


def get_dataset(
        dataset_name: str = None,
        dataset_version: str = None,
        dataset_split: str = 'test',
        dataset_jsonl: str = None,
):
    """Load a dataset from Huggingface or from a jsonl file.

    Exactly one of `dataset_name` and `dataset_jsonl` must be given.

    Args:
        dataset_name: Huggingface dataset name.
        dataset_version: Huggingface dataset version (used with `dataset_name`).
        dataset_split: Huggingface dataset split (used with `dataset_name`).
        dataset_jsonl: path to a jsonl file.

    Returns:
        The loaded dataset.

    Raises:
        AssertionError: if both or neither source is specified.
    """
    from_hub = dataset_name is not None
    from_file = dataset_jsonl is not None
    assert from_hub != from_file, \
        "Specify one of `dataset_name` or `dataset_jsonl`."

    if not from_hub:
        # Only the jsonl path was given
        return Dataset.from_jsonl(json_path=dataset_jsonl)

    return get_hf_dataset(dataset_name, dataset_version, dataset_split)


def get_hf_dataset(name: str, version: str = None, split: str = 'test'):
    """Load a dataset through `Dataset.load_dataset`.

    Args:
        name: dataset name, always passed as the first positional argument.
        version: dataset version; passed as the second positional argument
            only when truthy.
        split: dataset split, passed as the `split` keyword.

    Returns:
        Whatever `Dataset.load_dataset` returns.
    """
    # Leave the version out entirely when none was given
    positional = (name, version) if version else (name,)
    return Dataset.load_dataset(*positional, split=split)


if __name__ == '__main__':
    # Command-line entry point. The selected actions run in a fixed order:
    # standardize, join predictions, workflow, deanonymize.
    parser = ArgumentParser()

    # Dataset sources and jsonl output
    parser.add_argument(
        '--dataset', type=str, choices=['cnn_dailymail', 'xsum'],
        help="Huggingface dataset name.",
    )
    parser.add_argument(
        '--version', type=str,
        help="Huggingface dataset version.",
    )
    parser.add_argument(
        '--split', type=str, default='test',
        help="Huggingface dataset split.",
    )
    parser.add_argument(
        '--dataset_jsonl', type=str,
        help="Path to a jsonl file for the dataset.",
    )
    parser.add_argument(
        '--dataset_rg', type=str,
        help="Path to a dataset stored in the Robustness Gym format. "
             "All processed datasets are stored in this format.",
    )
    parser.add_argument(
        '--prediction_jsonls', nargs='+', default=[],
        help="Path to one or more jsonl files for the predictions.",
    )
    parser.add_argument(
        '--save_jsonl_path', type=str,
        help="Path to save the processed jsonl dataset.",
    )

    # Column names
    parser.add_argument(
        '--doc_column', type=str,
        help="Name of the document column in the dataset.",
    )
    parser.add_argument(
        '--reference_column', type=str,
        help="Name of the reference summary column in the dataset.",
    )
    parser.add_argument(
        '--summary_columns', nargs='+', default=[],
        help="Name of other summary columns in/added to the dataset.",
    )

    # Aligner settings and workflow output
    parser.add_argument(
        '--bert_aligner_threshold', type=float, default=0.1,
        help="Minimum threshold for BERT alignment.",
    )
    parser.add_argument(
        '--bert_aligner_top_k', type=int, default=10,
        help="Top-k for BERT alignment.",
    )
    parser.add_argument(
        '--embedding_aligner_threshold', type=float, default=0.1,
        help="Minimum threshold for embedding alignment.",
    )
    parser.add_argument(
        '--embedding_aligner_top_k', type=int, default=10,
        help="Top-k for embedding alignment.",
    )
    parser.add_argument(
        '--processed_dataset_path', type=str,
        help="Path to store the final processed dataset.",
    )
    parser.add_argument(
        '--n_samples', type=int,
        help="Number of dataset samples to process.",
    )

    # On/off switches, registered in the order they appear in `--help`
    for flag, flag_help in (
            ('--workflow', "Whether to run the preprocessing workflow."),
            ('--standardize',
             "Whether to standardize the dataset and save to jsonl."),
            ('--join_predictions',
             "Whether to add predictions to the dataset and save to "
             "jsonl."),
            ('--try_it',
             "`Try it` mode is faster and runs processing on 10 "
             "examples."),
            ('--deanonymize', "Deanonymize the dataset provided by summvis."),
            ('--anonymize',
             "Anonymize by removing document and reference summary "
             "columns of the original dataset."),
    ):
        parser.add_argument(flag, action='store_true', default=False,
                            help=flag_help)

    args = parser.parse_args()

    # `--try_it` overrides `--n_samples` with 10 examples
    n_samples = 10 if args.try_it else args.n_samples

    # Arguments shared by both `standardize_dataset` calls below
    standardize_kwargs = dict(
        dataset_name=args.dataset,
        dataset_version=args.version,
        dataset_split=args.split,
        dataset_jsonl=args.dataset_jsonl,
        doc_column=args.doc_column,
        reference_column=args.reference_column,
    )

    if args.standardize:
        # Standardize a dataset and dump it to jsonl on disk
        standardize_dataset(
            save_jsonl_path=args.save_jsonl_path,
            **standardize_kwargs,
        )

    if args.join_predictions:
        # Add the predictions to the dataset
        dataset = join_predictions(
            dataset_jsonl=args.dataset_jsonl,
            prediction_jsonls=args.prediction_jsonls,
            save_jsonl_path=args.save_jsonl_path,
        )

    if args.workflow:
        # Processing workflow; a Robustness Gym dataset is loaded directly
        # when `--dataset_rg` was passed
        if args.dataset_rg:
            dataset = Dataset.load_from_disk(args.dataset_rg)
        else:
            dataset = None

        run_workflow(
            jsonl_path=args.dataset_jsonl,
            dataset=dataset,
            doc_column=args.doc_column,
            reference_column=args.reference_column,
            summary_columns=args.summary_columns,
            bert_aligner_threshold=args.bert_aligner_threshold,
            bert_aligner_top_k=args.bert_aligner_top_k,
            embedding_aligner_threshold=args.embedding_aligner_threshold,
            embedding_aligner_top_k=args.embedding_aligner_top_k,
            processed_dataset_path=args.processed_dataset_path,
            n_samples=n_samples,
            anonymize=args.anonymize,
        )

    if args.deanonymize:
        # Deanonymizing needs the anonymized dataset plus exactly one source
        # for the original text
        assert args.dataset_rg is not None, \
            "Must specify `dataset_rg` path to be deanonymized."
        assert args.dataset_rg.endswith('anonymized'), \
            "`dataset_rg` must end in 'anonymized'."
        assert (args.dataset is None) != (args.dataset_jsonl is None), \
            "`dataset_rg` points to an anonymized dataset that will be " \
            "deanonymized. Please pass in relevant arguments: either " \
            "`dataset`, `version` and `split` OR `dataset_jsonl`."

        # Standardized original dataset, not written to disk
        standardized_dataset = standardize_dataset(
            no_save=True,
            **standardize_kwargs,
        )
        # Use it to restore the text columns
        dataset = deanonymize_dataset(
            rg_path=args.dataset_rg,
            standardized_dataset=standardized_dataset,
            processed_dataset_path=args.processed_dataset_path,
            n_samples=n_samples,
        )