"""DataArray provides a thin wrapper around multidimensional arrays and metadata
"""
from logging import getLogger
import numpy as np # type: ignore
from smif.exception import SmifDataError, SmifDataMismatchError
from smif.metadata.spec import Spec
# Import pandas, xarray if available (optional dependencies)
try:
import pandas # type: ignore
import xarray # type: ignore
except ImportError:
pass
# Message passed to SmifDataError when a NameError shows that the optional pandas/xarray
# imports above did not succeed.
INSTALL_WARNING = """\
Please install pandas and xarray to access smif.DataArray
data as pandas.DataFrame or xarray.DataArray. Try running:
pip install smif[data]
or:
conda install pandas xarray
"""
class DataArray:
    """DataArray provides access to input/parameter/results data, with conversions to common
    python data libraries (for example: numpy, pandas, xarray).

    Annotations that mention ``pandas`` are written as strings so that defining this class
    does not require the optional dependency to be importable.

    Attributes
    ----------
    spec: smif.metadata.spec.Spec
    data: numpy.ndarray
    """

    def __init__(self, spec: "Spec", data: np.ndarray):
        """Store ``data`` alongside ``spec`` after checking that their shapes agree

        Parameters
        ----------
        spec : smif.metadata.spec.Spec
            Must expose a ``shape`` attribute
        data : numpy.ndarray
            Anything without a ``shape`` attribute is converted with ``numpy.array``

        Raises
        ------
        TypeError
            If ``spec`` has no ``shape`` attribute
        SmifDataMismatchError
            If the data shape differs from the spec shape (a single-value 1D array is
            accepted for a zero-dimensional spec)
        """
        self.logger = getLogger(__name__)

        if not hasattr(data, "shape"):
            self.logger.debug("Data is not an numpy.ndarray")
            data = np.array(data)

        if not hasattr(spec, "shape"):
            self.logger.error("spec argument is not a Spec")
            raise TypeError("spec argument is not a Spec")

        if not data.shape == spec.shape:
            # special case for scalar - allow a single-value 1D array, here coerced to single
            # value 0D array. Then simpler to create from DataFrame or xarray.DataArray
            if data.shape == (1,) and spec.shape == ():
                data = np.array(data[0])
            else:
                msg = "Data shape {} does not match spec {}"
                raise SmifDataMismatchError(msg.format(data.shape, spec.shape))

        self.spec = spec
        self.data = data

    def __eq__(self, other):
        """Equal when specs match and data match (NaN compares equal to NaN)"""
        try:
            other_spec, other_data = other.spec, other.data
        except AttributeError:
            # Not DataArray-like: let Python try the reflected comparison and fall back to
            # identity, rather than failing with AttributeError
            return NotImplemented
        return self.spec == other_spec and _array_equal_nan(self.data, other_data)

    def __repr__(self):
        return "<DataArray('{}', '{}')>".format(self.spec, self.data)

    def __str__(self):
        return "<DataArray('{}', '{}')>".format(self.spec, self.data)

    def as_dict(self):
        """Return ``self.spec.as_dict()``; the data values are not part of the result"""
        return self.spec.as_dict()

    @property
    def name(self):
        """The name of the data that this spec describes."""
        return self.spec.name

    @name.setter
    def name(self, value):
        # the name lives on the spec, so renaming writes through to it
        self.spec.name = value

    @property
    def description(self):
        """A human-friendly description"""
        return self.spec.description

    @property
    def dims(self):
        """Names for each dimension"""
        return self.spec.dims

    @property
    def coords(self):
        """Coordinate labels for each dimension."""
        return self.spec.coords

    def dim_coords(self, dim):
        """Coordinates for a given dimension"""
        return self.spec.dim_coords(dim)

    def dim_names(self, dim):
        """Coordinate names for a given dimension"""
        return self.spec.dim_names(dim)

    def dim_elements(self, dim):
        """Coordinate elements for a given dimension"""
        return self.spec.dim_elements(dim)

    @property
    def unit(self):
        """The unit for all data points."""
        return self.spec.unit

    @property
    def shape(self):
        """The shape of the data array"""
        return self.data.shape

    def as_ndarray(self) -> np.ndarray:
        """Access as a :class:`numpy.ndarray`"""
        return self.data

    def as_df(self) -> "pandas.DataFrame":
        """Access DataArray as a :class:`pandas.DataFrame`

        The frame has a single data column named after this DataArray. With dims and
        coords it is indexed by the coordinate ids (a MultiIndex built from their product
        when there is more than one dim); otherwise it holds one row with the scalar value.

        Raises
        ------
        SmifDataMismatchError
            If there are no dims/coords but the data is not zero-dimensional
        SmifDataError
            If pandas is not available
        """
        dims = self.dims
        coords = [c.ids for c in self.coords]

        try:
            if dims and coords:
                if len(dims) == 1:
                    index = pandas.Index(coords[0], name=dims[0])
                else:
                    index = pandas.MultiIndex.from_product(coords, names=dims)

                return pandas.DataFrame(
                    {self.name: np.reshape(self.data, self.data.size)}, index=index
                )
            else:
                # with no dims or coords, should be in the zero-dimensional case
                if self.data.shape != ():
                    msg = "Expected zero-dimensional data, got %s" % self.data.shape
                    raise SmifDataMismatchError(msg)
                return pandas.DataFrame([{self.name: self.data[()]}])
        except NameError as ex:
            # the name `pandas` is unbound when the optional import failed
            raise SmifDataError(INSTALL_WARNING) from ex

    @classmethod
    def from_df(cls, spec, dataframe):
        """Create a DataArray from a :class:`pandas.DataFrame`

        Parameters
        ----------
        spec : smif.metadata.spec.Spec
        dataframe : pandas.DataFrame
            Expected to have a data column called ``spec.name`` and index levels named as
            ``spec.dims``. An unindexed frame that carries the dims as columns is indexed
            automatically.

        Raises
        ------
        SmifDataMismatchError
            If the expected column or index names are missing, or if conversion fails and
            the index contains duplicate entries
        """
        name = spec.name
        dims = spec.dims
        data_columns = dataframe.columns.values.tolist()
        index_names = dataframe.index.names

        if dims and len(index_names) == 1 and index_names[0] is None:
            # case when an unindexed dataframe was passed in, try to recover automagically
            if set(dims).issubset(set(data_columns)):
                dataframe = dataframe.set_index(dims)
                data_columns = dataframe.columns.values.tolist()
                index_names = dataframe.index.names

        if len(dims) == 1 and isinstance(dataframe.index, pandas.MultiIndex):
            # case with one-level MultiIndex which xarray seems to reorder unless cast to
            # simple Index
            dataframe = dataframe.reset_index().set_index(dims[0])

        if name not in data_columns or (dims and set(dims) != set(index_names)):
            msg = (
                "Data for '{name}' expected a data column called '{name}' and index "
                + "names {dims}, instead got data columns {data_columns} and index names "
                + "{index_names}"
            )
            raise SmifDataMismatchError(
                msg.format(
                    name=name,
                    dims=dims,
                    data_columns=data_columns,
                    index_names=index_names,
                )
            )

        try:
            # convert to dataset
            xr_dataset = dataframe.to_xarray()
            # extract xr.DataArray
            xr_data_array = xr_dataset[spec.name]
            # reindex to ensure data order and fill out NaNs
            xr_data_array = _reindex_xr_data_array(spec, xr_data_array)

        # xarray raises Exception in v0.10 (narrowed to ValueError in v0.11)
        except Exception as ex:  # pylint: disable=broad-except
            dups = find_duplicate_indices(dataframe)
            if dups:
                msg = "Data for '{name}' contains duplicate values at {dups}"
                raise SmifDataMismatchError(msg.format(name=name, dups=dups)) from ex
            # not caused by duplicates: propagate the original error untouched
            raise

        return cls(spec, xr_data_array.data)

    def as_xarray(self):
        """Access DataArray as a :class:`xarray.DataArray`

        The spec's dictionary form, minus its ``dims`` and ``coords`` entries, is attached
        as ``attrs``.

        Raises
        ------
        SmifDataError
            If xarray is not available
        """
        metadata = self.spec.as_dict()
        # dims and coords are passed to xarray explicitly, so keep them out of attrs
        del metadata["dims"]
        del metadata["coords"]

        dims = self.dims
        coords = {c.name: c.ids for c in self.coords}

        try:
            return xarray.DataArray(
                self.data, coords=coords, dims=dims, name=self.name, attrs=metadata
            )
        except NameError as ex:
            # the name `xarray` is unbound when the optional import failed
            raise SmifDataError(INSTALL_WARNING) from ex

    @classmethod
    def from_xarray(cls, spec, xr_data_array):
        """Create a DataArray from a :class:`xarray.DataArray`"""
        # reindex to ensure data order and fill out NaNs
        xr_data_array = _reindex_xr_data_array(spec, xr_data_array)
        return cls(spec, xr_data_array.data)

    def update(self, other):
        """Update data values with any from other which are non-null

        Raises
        ------
        AssertionError
            If the two specs are not equal
        """
        # Raised explicitly (not via `assert`) so the check still runs under `python -O`;
        # the exception type is kept for existing callers.
        if not self.spec == other.spec:
            raise AssertionError("Specs must match when updating DataArray")

        # convert self and other to xarray representation
        self_xr = self.as_xarray()
        other_xr = other.as_xarray()
        # use xarray.combine_first convenience function
        overridden = other_xr.combine_first(self_xr)
        # assign result back to self
        self.data = overridden.data

    def validate_as_full(self):
        """Check that the data array contains no NaN values

        Raises
        ------
        SmifDataMismatchError
            If any value is null; the rows with missing data are logged at debug level
        """
        dataframe = self.as_df()
        if dataframe.isnull().values.any():
            expected_len = len(dataframe)
            missing_data = show_null(dataframe)
            actual_len = expected_len - len(missing_data)
            dim_lens = (
                "{"
                + ", ".join(
                    "{}: {}".format(dim, len_)
                    for dim, len_ in zip(self.dims, self.shape)
                )
                + "}"
            )
            self.logger.debug("Missing data:\n\n    %s", missing_data)
            msg = (
                "Data for '{name}' had missing values - read {actual_len} but expected "
                + "{expected_len} in total, from dims of length {dim_lens}"
            )
            raise SmifDataMismatchError(
                msg.format(
                    name=self.name,
                    actual_len=actual_len,
                    expected_len=expected_len,
                    dim_lens=dim_lens,
                )
            )
def show_null(dataframe) -> "pandas.DataFrame":
    """Shows missing data

    Rows are selected using the boolean array ``dataframe.isnull().values``.

    The return annotation is a string so that defining this function does not require the
    optional pandas dependency to be importable.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        NOTE(review): callers in this module pass the single-data-column frame produced by
        ``DataArray.as_df`` - behaviour for several data columns is not established here

    Returns
    -------
    pandas.DataFrame
    """
    try:
        missing_data = dataframe[dataframe.isnull().values]
    except NameError as ex:
        raise SmifDataError(INSTALL_WARNING) from ex

    return missing_data
def find_duplicate_indices(dataframe):
    """List the index entries that occur more than once in a DataFrame

    Returns
    -------
    list[dict]
        One record per repeated row (the first occurrence is not reported), holding only
        the index values
    """
    # mask of rows whose index entry has already been seen
    is_repeat = dataframe.index.duplicated()
    repeated_rows = dataframe[is_repeat]

    # strip every data column so that only the index is left, then promote it to values
    index_only = repeated_rows.drop(repeated_rows.columns, axis="columns")
    return index_only.reset_index().to_dict("records")
def _array_equal_nan(a, b):
"""Compare numpy arrays for equality, allowing NaN to be considerd equal to itself"""
if np.issubdtype(a.dtype, np.number) and np.issubdtype(b.dtype, np.number):
return np.all((a == b) | (np.isnan(a) & np.isnan(b)))
else:
return np.all(a == b)
def _reindex_xr_data_array(spec, xr_data_array):
    """Reindex an xarray object onto the spec's coordinates, giving full data in spec order

    Raises
    ------
    SmifDataMismatchError
        If any dimension's index holds values that are not among the spec's names for it
    """
    # reject unknown index values up front - reindexing would otherwise silently drop them
    for dim in spec.dims:
        present = set(xr_data_array.coords[dim].values)
        unexpected = present - set(spec.dim_names(dim))
        if not unexpected:
            continue

        template = (
            "Data for '{name}' contained unexpected values in the set of "
            "coordinates for dimension '{dim}': {extras}"
        )
        raise SmifDataMismatchError(
            template.format(dim=dim, extras=list(unexpected), name=spec.name)
        )

    target_coords = {coord.name: coord.ids for coord in spec.coords}
    return xr_data_array.reindex(indexers=target_coords)