# Source: Hugging Face Space "HuggingFaceM4/IDEFICS_Data_Measurement_Tool"
# (Space status at capture time: Runtime error).
# File size: 9,928 Bytes; revision 46df0b6.

import evaluate
import logging
import os
import pandas as pd
import plotly.express as px
import utils
import utils.dataset_utils as ds_utils
from collections import Counter
from os.path import exists, isdir
from os.path import join as pjoin

# String constants for label handling. LABEL_NAMES and LABEL_MEASUREMENT are
# the keys of the label results dictionary built in this module.
# NOTE(review): LABEL_FIELD and LABEL_LIST are not referenced by the code in
# this module -- presumably used by importers; confirm before removing.
LABEL_FIELD = "labels"
LABEL_NAMES = "label_names"
LABEL_LIST = "label_list"
LABEL_MEASUREMENT = "label_measurement"
# Specific to the evaluate library
# EVAL_LABEL_MEASURE is both the measurement name handed to evaluate.load()
# and the key under which the label ids / fractions are read back.
EVAL_LABEL_MEASURE = "label_distribution"
EVAL_LABEL_ID = "labels"
EVAL_LABEL_FRAC = "fractions"
# TODO: This should ideally be in what's returned from the evaluate library
# Key under which this module stores the per-label counts it computes itself.
EVAL_LABEL_SUM = "sums"

# Module-level logger, configured by the project's logging helper.
logs = utils.prepare_logging(__file__)


def map_labels(label_field, ds_name_to_dict, ds_name, config_name):
    """Returns the label names stored for one dataset configuration.

    Looks up ds_name_to_dict[ds_name][config_name]["features"][label_field]
    and, when that entry is non-empty, unpacks its first element as a
    (field, names) pair and returns the names.

    Args:
        label_field: Name of the label column to look up.
        ds_name_to_dict: Nested dataset-info mapping keyed by dataset name,
            then by config name.
        ds_name: Dataset name.
        config_name: Dataset configuration name.

    Returns:
        The label names, or an empty list when the entry is empty or any
        of the lookups raises KeyError (which is logged).
    """
    try:
        field_entries = (
            ds_name_to_dict[ds_name][config_name]["features"][label_field])
        if len(field_entries) > 0:
            # Only the first (field, names) pair is used.
            _, names = field_entries[0]
        else:
            names = []
    except KeyError as err:
        logs.exception(err)
        logs.warning("Not returning a label-name mapping")
        return []
    return names


def make_label_results_dict(label_measurement, label_names):
    """Bundles the label measurement and the label names into one dict.

    The result is keyed by LABEL_MEASUREMENT and LABEL_NAMES.
    """
    return {
        LABEL_MEASUREMENT: label_measurement,
        LABEL_NAMES: label_names,
    }


def make_label_fig(label_results, chart_type="pie"):
    """Builds a plotly figure of the label distribution.

    Args:
        label_results: Mapping holding the label names under LABEL_NAMES and
            the label measurement under LABEL_MEASUREMENT. The measurement
            must provide the per-label counts under EVAL_LABEL_SUM and, for
            bar charts, the label ids / fractions under EVAL_LABEL_MEASURE.
        chart_type: "pie" (default) or "bar". Any other value is logged and
            the default pie chart is produced instead.

    Returns:
        The plotly figure, or False when a required key is missing or when
        the number of label names does not match the number of label sums
        (pie chart only).
    """
    try:
        label_names = label_results[LABEL_NAMES]
        label_measurement = label_results[LABEL_MEASUREMENT]
        label_sums = label_measurement[EVAL_LABEL_SUM]
        if chart_type == "bar":
            label_distribution = label_measurement[EVAL_LABEL_MEASURE]
            # plotly.express is the only plotting library this module
            # imports; the previous `plt.bar` call raised NameError.
            fig_labels = px.bar(x=label_distribution[EVAL_LABEL_ID],
                                y=label_distribution[EVAL_LABEL_FRAC])
        else:
            if chart_type != "pie":
                logs.info("Oops! Don't have that chart-type implemented.")
                logs.info("Making the default pie chart")
            # IMDB - unsupervised has a labels column where all values are -1,
            # which breaks the assumption that
            # the number of label_names == the number of label_sums.
            # This handles that case, assuming it will happen in other datasets.
            if len(label_names) != len(label_sums):
                logs.warning("Can't make a figure with the given label names: "
                             "We don't have the right amount of label types "
                             "to apply them to!")
                return False
            fig_labels = px.pie(names=label_names, values=label_sums)
    except KeyError:
        logs.info("Input label data missing required key(s).")
        logs.info("We require %s, %s", LABEL_NAMES, LABEL_MEASUREMENT)
        # str() each key so that a non-string key cannot break the diagnostic.
        logs.info("We found: %s", ",".join(map(str, label_results.keys())))
        return False
    return fig_labels


def extract_label_names(label_field, ds_name, config_name):
    """Fetches the dataset info for ds_name and returns its label names.

    The info dicts come from ds_utils.get_dataset_info_dicts; the lookup
    itself (and its empty-list fallback) is done by map_labels.
    """
    return map_labels(
        label_field,
        ds_utils.get_dataset_info_dicts(ds_name),
        ds_name,
        config_name,
    )


class DMTHelper:
    """Helper class for the Data Measurements Tool.
    This allows us to keep all variables and functions related to labels
    in one file.
    """

    def __init__(self, dstats, load_only, save):
        """Copies the label-related settings from the DMT statistics object.

        Args:
            dstats: Object providing label_results, fig_labels, use_cache,
                dataset_cache_dir, label_field, dset, dset_name, dset_config
                and label_names attributes.
            load_only: When truthy, nothing is computed; only cached results
                are used.
            save: When truthy, results are written to the cache after
                processing.
        """
        logs.info("Initializing labels.")
        # -- Data Measurements Tool variables
        self.label_results = dstats.label_results
        self.fig_labels = dstats.fig_labels
        self.use_cache = dstats.use_cache
        self.cache_dir = dstats.dataset_cache_dir
        self.load_only = load_only
        self.save = save
        # -- Hugging Face Dataset variables
        self.label_field = dstats.label_field
        # Input HuggingFace dataset
        self.dset = dstats.dset
        self.dset_name = dstats.dset_name
        self.dset_config = dstats.dset_config
        self.label_names = dstats.label_names
        # -- Filenames
        # All label artifacts live in <cache_dir>/labels/.
        self.label_dir = "labels"
        label_json = "labels.json"
        label_fig_json = "labels_fig.json"
        label_fig_html = "labels_fig.html"
        self.labels_json_fid = pjoin(self.cache_dir, self.label_dir,
                                     label_json)
        self.labels_fig_json_fid = pjoin(self.cache_dir, self.label_dir,
                                         label_fig_json)
        self.labels_fig_html_fid = pjoin(self.cache_dir, self.label_dir,
                                         label_fig_html)

    def run_DMT_processing(self):
        """
        Loads or prepares the Labels measurements and figure as specified by
        the DMT options.
        """
        # First look to see what we can load from cache.
        if self.use_cache:
            logs.info("Trying to load labels.")
            self.fig_labels, self.label_results = self._load_label_cache()
            if self.fig_labels:
                logs.info("Loaded cached label figure.")
            if self.label_results:
                logs.info("Loaded cached label results.")
        # If we can prepare the results afresh...
        if not self.load_only:
            # If we didn't load them already, compute label statistics.
            if not self.label_results:
                logs.info("Preparing labels.")
                self.label_results = self._prepare_labels()
            # If we didn't load it already, create figure.
            if not self.fig_labels:
                logs.info("Creating label figure.")
                self.fig_labels = make_label_fig(self.label_results)
            # Finish
            if self.save:
                self._write_label_cache()

    def _load_label_cache(self):
        """Reads whatever label artifacts exist in the cache.

        Returns:
            A (fig_labels, label_results) tuple; each element is an empty
            dict when its cache file does not exist.
        """
        fig_labels = {}
        label_results = {}
        # Measurements exist. Load them.
        if exists(self.labels_json_fid):
            # Loads the label list, names, and results
            label_results = ds_utils.read_json(self.labels_json_fid)
        # Image exists. Load it.
        if exists(self.labels_fig_json_fid):
            fig_labels = ds_utils.read_plotly(self.labels_fig_json_fid)
        return fig_labels, label_results

    def _prepare_labels(self):
        """Computes label statistics for the configured label column.

        Returns:
            The label results produced by Labels.prepare_labels, or an empty
            dict when the label column name is missing or has an unexpected
            type.
        """
        # TODO: Handle the case where there are multiple label columns.
        # The logic throughout the code assumes only one.
        if isinstance(self.label_field, tuple):
            # An empty tuple means there is no label column to index;
            # taking [0] would raise IndexError.
            if not self.label_field:
                logs.warning("No label column name given. "
                             "Not computing label statistics.")
                return {}
            label_field = self.label_field[0]
        elif isinstance(self.label_field, str):
            label_field = self.label_field
        else:
            logs.warning("Unexpected format %s for label column name(s). "
                         "Not computing label statistics." %
                         type(self.label_field))
            return {}
        # Label object for the dataset
        label_obj = Labels(dataset=self.dset,
                           dataset_name=self.dset_name,
                           config_name=self.dset_config)
        return label_obj.prepare_labels(label_field, self.label_names)

    def _write_label_cache(self):
        """Writes the label results and figure (JSON and HTML) to the cache.

        Empty/falsy results or figures are skipped.
        """
        ds_utils.make_path(pjoin(self.cache_dir, self.label_dir))
        if self.label_results:
            ds_utils.write_json(self.label_results, self.labels_json_fid)
        if self.fig_labels:
            ds_utils.write_plotly(self.fig_labels, self.labels_fig_json_fid)
            self.fig_labels.write_html(self.labels_fig_html_fid)

    def get_label_filenames(self):
        """Returns the cache file paths for the statistics and the figure."""
        return {"statistics": self.labels_json_fid,
                "figure json": self.labels_fig_json_fid,
                "figure html": self.labels_fig_html_fid}


class Labels:
    """Generic class for label processing.
    Uses the Dataset to extract the label column and compute label measurements.
    """

    def __init__(self, dataset, dataset_name=None, config_name=None):
        """Stores the dataset and the identifiers used to find label names.

        Args:
            dataset: Input HuggingFace Dataset.
            dataset_name: Dataset name, used to look up label names.
            config_name: Dataset configuration name, used likewise.
        """
        # Input HuggingFace Dataset.
        self.dset = dataset
        # These are used to extract label names, when the label names
        # are stored in the Dataset object but not in the "label" column
        # we are working with, which may instead just be ints corresponding to
        # the names
        self.ds_name = dataset_name
        self.config_name = config_name
        # For measurement data and additional metadata.
        self.label_results_dict = {}

    def prepare_labels(self, label_field, label_names=None):
        """Uses the evaluate library to return the label distribution.

        Args:
            label_field: Name of the label column in the dataset.
            label_names: Optional label names. When empty or None, they are
                looked up via extract_label_names.

        Returns:
            A dict with the label measurement (including per-label counts
            stored under EVAL_LABEL_SUM) and the label names, or an empty
            dict when the dataset has no such label column.
        """
        logs.info("Inside main label calculation function.")
        logs.debug("Looking for label field called '%s'", label_field)
        # The input Dataset object
        # When the label field is not found, there is nothing to measure:
        # warn and return an empty result.
        if label_field not in self.dset.features:
            logs.warning("No label column found -- nothing to do. Returning.")
            logs.debug(self.dset.features)
            return {}
        label_list = self.dset[label_field]
        # Get the evaluate library's measurement for label distro.
        label_distribution = evaluate.load(EVAL_LABEL_MEASURE)
        # Measure the label distro.
        label_measurement = label_distribution.compute(data=label_list)
        # TODO: Incorporate this summation into what the evaluate library returns.
        label_sum_dict = Counter(label_list)
        # Counts are listed in sorted label order.
        label_sums = [label_sum_dict[key] for key in sorted(label_sum_dict)]
        # Use the shared constant: make_label_fig reads this same key.
        label_measurement[EVAL_LABEL_SUM] = label_sums
        if not label_names:
            # Have to extract the label names from the Dataset object when the
            # actual dataset columns are just ints representing the label names.
            label_names = extract_label_names(label_field, self.ds_name,
                                              self.config_name)
        return make_label_results_dict(label_measurement, label_names)