# Source code for smif.data_layer.data_array

"""DataArray provides a thin wrapper around multidimensional arrays and metadata
"""
from logging import getLogger

import numpy as np  # type: ignore
from smif.exception import SmifDataError, SmifDataMismatchError
from smif.metadata.spec import Spec

# Import pandas, xarray if available (optional dependencies)
try:
    import pandas  # type: ignore
    import xarray  # type: ignore
except ImportError:
    pass


# User-facing hint used as the message of the SmifDataError raised when a
# pandas/xarray conversion hits a NameError, i.e. when the optional imports
# above failed and the `pandas` / `xarray` names were never bound.
INSTALL_WARNING = """\
Please install pandas and xarray to access smif.DataArray
data as pandas.DataFrame or xarray.DataArray. Try running:
    pip install smif[data]
or:
    conda install pandas xarray
"""


class DataArray:
    """DataArray provides access to input/parameter/results data, with conversions to common
    python data libraries (for example: numpy, pandas, xarray).

    Parameters
    ----------
    spec : smif.metadata.spec.Spec
        Metadata describing the data. Only duck-typed here: it must expose a ``shape``
        attribute, otherwise ``TypeError`` is raised.
    data : numpy.ndarray or array-like
        The values. Anything without a ``shape`` attribute is converted with
        ``numpy.array``.

    Attributes
    ----------
    spec: smif.metadata.spec.Spec
    data: numpy.ndarray

    Raises
    ------
    TypeError
        If ``spec`` has no ``shape`` attribute.
    SmifDataMismatchError
        If ``data.shape`` differs from ``spec.shape``. The single exception is a
        one-element 1D array given for a zero-dimensional spec, which is coerced to 0D.

    Notes
    -----
    Because ``__eq__`` is defined without ``__hash__``, instances are unhashable.
    """

    # Annotations naming Spec / pandas types are written as strings so that they are
    # never evaluated when the class body executes; in particular the module must stay
    # importable when the optional pandas dependency is missing.
    def __init__(self, spec: "Spec", data: np.ndarray):
        self.logger = getLogger(__name__)

        if not hasattr(data, "shape"):
            self.logger.debug("Data is not an numpy.ndarray")
            data = np.array(data)

        if not hasattr(spec, "shape"):
            self.logger.error("spec argument is not a Spec")
            raise TypeError("spec argument is not a Spec")

        if not data.shape == spec.shape:
            # special case for scalar - allow a single-value 1D array, here coerced to single
            # value 0D array. Then simpler to create from DataFrame or xarray.DataArray
            if data.shape == (1,) and spec.shape == ():
                data = np.array(data[0])
            else:
                msg = "Data shape {} does not match spec {}"
                raise SmifDataMismatchError(msg.format(data.shape, spec.shape))

        self.spec = spec
        self.data = data

    def __eq__(self, other):
        """Equal when both specs are equal and the data match (NaN compares equal to NaN)"""
        if not isinstance(other, DataArray):
            # let Python try the reflected comparison / fall back to identity instead of
            # failing with AttributeError on `other.spec`
            return NotImplemented
        return self.spec == other.spec and _array_equal_nan(self.data, other.data)

    def __repr__(self):
        return "<DataArray('{}', '{}')>".format(self.spec, self.data)

    # The string form is the same text as the repr.
    __str__ = __repr__

    def as_dict(self):
        """Return the result of ``self.spec.as_dict()``

        Only the spec is serialised here; the data values are not part of the result.
        """
        return self.spec.as_dict()

    @property
    def name(self):
        """The name of the data that this spec describes."""
        return self.spec.name

    @name.setter
    def name(self, value):
        # writes through to the underlying spec
        self.spec.name = value

    @property
    def description(self):
        """A human-friendly description"""
        return self.spec.description

    @property
    def dims(self):
        """Names for each dimension"""
        return self.spec.dims

    @property
    def coords(self):
        """Coordinate labels for each dimension."""
        return self.spec.coords

    def dim_coords(self, dim):
        """Coordinates for a given dimension"""
        return self.spec.dim_coords(dim)

    def dim_names(self, dim):
        """Coordinate names for a given dimension"""
        return self.spec.dim_names(dim)

    def dim_elements(self, dim):
        """Coordinate elements for a given dimension"""
        return self.spec.dim_elements(dim)

    @property
    def unit(self):
        """The unit for all data points."""
        return self.spec.unit

    @property
    def shape(self):
        """The shape of the data array"""
        return self.data.shape

    def as_ndarray(self) -> np.ndarray:
        """Access as a :class:`numpy.ndarray`

        Returns the stored array itself, not a copy.
        """
        return self.data

    def as_df(self) -> "pandas.DataFrame":
        """Access DataArray as a :class:`pandas.DataFrame`

        Returns
        -------
        pandas.DataFrame
            A single data column named after ``self.name``. With dims and coords the
            frame is indexed by the coordinate ids (an ``Index`` for one dimension, the
            product ``MultiIndex`` for several); otherwise it is a one-row frame holding
            the scalar value.

        Raises
        ------
        SmifDataMismatchError
            If there are no dims/coords but the data is not zero-dimensional.
        SmifDataError
            If pandas could not be imported.
        """
        dims = self.dims
        coords = [c.ids for c in self.coords]

        try:
            if dims and coords:
                if len(dims) == 1:
                    index = pandas.Index(coords[0], name=dims[0])
                else:
                    index = pandas.MultiIndex.from_product(coords, names=dims)
                # flatten the data to line up with the flat (product) index
                return pandas.DataFrame(
                    {self.name: np.reshape(self.data, self.data.size)}, index=index
                )

            # with no dims or coords, should be in the zero-dimensional case
            if self.data.shape != ():
                msg = "Expected zero-dimensional data, got %s" % self.data.shape
                raise SmifDataMismatchError(msg)
            return pandas.DataFrame([{self.name: self.data[()]}])
        except NameError as ex:
            # `pandas` is unbound when the optional import at module level failed
            raise SmifDataError(INSTALL_WARNING) from ex

    @classmethod
    def from_df(cls, spec, dataframe):
        """Create a DataArray from a :class:`pandas.DataFrame`

        Parameters
        ----------
        spec : smif.metadata.spec.Spec
        dataframe : pandas.DataFrame
            Must have a data column named ``spec.name`` and, when the spec has dims,
            index level names equal (as a set) to ``spec.dims``. An unindexed frame
            that carries the dims as ordinary columns is indexed automatically.

        Raises
        ------
        SmifDataMismatchError
            If the column/index names do not match the spec, or if the conversion
            fails and the frame contains duplicate index entries.
        """
        name = spec.name
        dims = spec.dims

        data_columns = dataframe.columns.values.tolist()
        index_names = dataframe.index.names

        if dims and len(index_names) == 1 and index_names[0] is None:
            # case when an unindexed dataframe was passed in, try to recover automagically
            if set(dims).issubset(set(data_columns)):
                dataframe = dataframe.set_index(dims)
                data_columns = dataframe.columns.values.tolist()
                index_names = dataframe.index.names

        if len(dims) == 1 and isinstance(dataframe.index, pandas.MultiIndex):
            # case with one-level MultiIndex which xarray seems to reorder unless cast to
            # simple Index
            dataframe = dataframe.reset_index().set_index(dims[0])

        if name not in data_columns or (dims and set(dims) != set(index_names)):
            msg = (
                "Data for '{name}' expected a data column called '{name}' and index "
                + "names {dims}, instead got data columns {data_columns} and index names "
                + "{index_names}"
            )
            raise SmifDataMismatchError(
                msg.format(
                    name=name,
                    dims=dims,
                    data_columns=data_columns,
                    index_names=index_names,
                )
            )

        try:
            # convert to dataset
            xr_dataset = dataframe.to_xarray()

            # extract xr.DataArray
            xr_data_array = xr_dataset[spec.name]

            # reindex to ensure data order and fill out NaNs
            xr_data_array = _reindex_xr_data_array(spec, xr_data_array)

        # xarray raises Exception in v0.10 (narrowed to ValueError in v0.11)
        except Exception as ex:  # pylint: disable=broad-except
            # report duplicates when they are present; otherwise let the original
            # error propagate unchanged
            dups = find_duplicate_indices(dataframe)
            if dups:
                msg = "Data for '{name}' contains duplicate values at {dups}"
                raise SmifDataMismatchError(msg.format(name=name, dups=dups)) from ex
            raise

        return cls(spec, xr_data_array.data)

    def as_xarray(self):
        """Access DataArray as a :class:`xarray.DataArray`

        The spec dict, minus its ``dims`` and ``coords`` entries, is attached as the
        xarray ``attrs``.

        Raises
        ------
        SmifDataError
            If xarray could not be imported.
        """
        metadata = self.spec.as_dict()
        # dims and coords are passed to xarray explicitly below
        del metadata["dims"]
        del metadata["coords"]

        dims = self.dims
        coords = {c.name: c.ids for c in self.coords}

        try:
            return xarray.DataArray(
                self.data, coords=coords, dims=dims, name=self.name, attrs=metadata
            )
        except NameError as ex:
            # `xarray` is unbound when the optional import at module level failed
            raise SmifDataError(INSTALL_WARNING) from ex

    @classmethod
    def from_xarray(cls, spec, xr_data_array):
        """Create a DataArray from a :class:`xarray.DataArray`"""
        # reindex to ensure data order and fill out NaNs
        xr_data_array = _reindex_xr_data_array(spec, xr_data_array)
        return cls(spec, xr_data_array.data)

    def update(self, other):
        """Update data values with any from other which are non-null

        Raises
        ------
        AssertionError
            If the two specs are not equal.
        """
        # Raised explicitly rather than via `assert` so that the check still runs
        # under `python -O`; the exception type is unchanged for existing callers.
        if not self.spec == other.spec:
            raise AssertionError("Specs must match when updating DataArray")
        # convert self and other to xarray representation
        self_xr = self.as_xarray()
        other_xr = other.as_xarray()
        # use xarray.combine_first convenience function
        overridden = other_xr.combine_first(self_xr)
        # assign result back to self
        self.data = overridden.data

    def validate_as_full(self):
        """Check that the data array contains no NaN values

        Raises
        ------
        SmifDataMismatchError
            If any value is null; the message reports how many values were read
            against how many were expected.
        """
        dataframe = self.as_df()
        if dataframe.isnull().values.any():
            expected_len = len(dataframe)
            missing_data = show_null(dataframe)
            actual_len = expected_len - len(missing_data)
            dim_lens = (
                "{"
                + ", ".join(
                    "{}: {}".format(dim, len_)
                    for dim, len_ in zip(self.dims, self.shape)
                )
                + "}"
            )
            self.logger.debug("Missing data:\n\n    %s", missing_data)
            msg = (
                "Data for '{name}' had missing values - read {actual_len} but expected "
                + "{expected_len} in total, from dims of length {dim_lens}"
            )
            raise SmifDataMismatchError(
                msg.format(
                    name=self.name,
                    actual_len=actual_len,
                    expected_len=expected_len,
                    dim_lens=dim_lens,
                )
            )


def show_null(dataframe) -> "pandas.DataFrame":
    """Shows missing data

    Selects the rows of ``dataframe`` that contain at least one null value. Each such
    row appears once, however many of its columns are null.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    # The return annotation is a string so that it is not evaluated at definition
    # time - the module must stay importable when pandas is not installed.
    try:
        # one boolean per row: True where any column is null
        has_null = dataframe.isnull().any(axis=1)
        missing_data = dataframe[has_null]
    except NameError as ex:
        # kept from the original for parity with the other pandas conversions
        raise SmifDataError(INSTALL_WARNING) from ex
    return missing_data


def find_duplicate_indices(dataframe):
    """Find duplicate indices in a DataFrame

    Only the second and later occurrences of an index entry are reported (the default
    behaviour of ``Index.duplicated``), so an entry that appears twice is listed once.

    Parameters
    ----------
    dataframe : pandas.DataFrame

    Returns
    -------
    list[dict]
        One dict per duplicated row, mapping each index level name to its value.
        Empty when the index has no duplicates.
    """
    # rows whose index entry has already been seen earlier in the frame
    dups_df = dataframe[dataframe.index.duplicated()]
    # drop data columns, reset index to promote index to values
    dups_index_df = dups_df.drop(columns=dups_df.columns).reset_index()
    return dups_index_df.to_dict("records")


def _array_equal_nan(a, b):
    """Compare numpy arrays for equality, allowing NaN to be considered equal to itself

    Parameters
    ----------
    a, b : numpy.ndarray

    Returns
    -------
    bool
        True when both arrays have the same shape and every pair of elements is equal
        (or, for numeric dtypes, both are NaN).
    """
    # Arrays of different shape are never equal. Without this guard a broadcastable
    # pair such as [1, 1] and [1] would compare equal, and a non-broadcastable pair
    # makes `a == b` raise on recent NumPy.
    if a.shape != b.shape:
        return False
    if np.issubdtype(a.dtype, np.number) and np.issubdtype(b.dtype, np.number):
        # np.isnan is only applied on the numeric branch
        return bool(np.all((a == b) | (np.isnan(a) & np.isnan(b))))
    return bool(np.all(a == b))


def _reindex_xr_data_array(spec, xr_data_array):
    """Reindex ``xr_data_array`` onto the coordinates given by ``spec``

    Every label found in the array for a spec dimension must be one of that
    dimension's names; otherwise SmifDataMismatchError is raised. The array is then
    reindexed with ``{coord.name: coord.ids}`` built from ``spec.coords`` and the
    result of that ``reindex`` call is returned.
    """
    # Check labels up front - reindexing alone would silently drop any extras.
    for dim in spec.dims:
        present = set(xr_data_array.coords[dim].values)
        allowed = set(spec.dim_names(dim))
        unexpected = present - allowed
        if not unexpected:
            continue
        template = (
            "Data for '{name}' contained unexpected values in the set of "
            + "coordinates for dimension '{dim}': {extras}"
        )
        raise SmifDataMismatchError(
            template.format(name=spec.name, dim=dim, extras=list(unexpected))
        )

    target = {}
    for coord in spec.coords:
        target[coord.name] = coord.ids

    return xr_data_array.reindex(indexers=target)
# Source code for smif.data_layer.data_array
#
# smif
#
# Navigation
#
# Related Topics