diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 93f335e625b..460188ab648 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -159,6 +159,18 @@ Deprecations ~~~~~~~~~~~~ +Performance +~~~~~~~~~~~ + +- Skip data reads during CF datetime dtype inference when the output dtype is + determinable from metadata alone (``use_cftime`` explicitly set, non-standard + calendar, non-nanosecond ``time_unit``, lazy backend arrays, or when + ``valid_max`` / ``time_coverage_end`` / a narrow storage dtype bounds the + possible values). This can significantly speed up + :py:func:`open_dataset` / :py:func:`open_datatree` on remote-backed stores + (zarr on S3, icechunk) with many time-encoded variables (:issue:`11303`). + By `Alfonso Ladino `_. + Bug Fixes ~~~~~~~~~ diff --git a/xarray/coding/times.py b/xarray/coding/times.py index f0a4c195263..ca2ab96a7bd 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -336,39 +336,210 @@ def _unpack_time_units_and_ref_date_cftime(units: str, calendar: str): return time_units, ref_date -def _decode_cf_datetime_dtype( +class _CannotInferFromMetadata(Exception): + """Raised by _infer_dtype_from_metadata when data reads are required.""" + + +# Edges of the datetime64[ns] representable range (the value range of a +# pandas.Timestamp at nanosecond resolution). +_DATETIME64_NS_LOWER = pd.Timestamp("1677-09-21 00:12:43.145224193") +_DATETIME64_NS_UPPER = pd.Timestamp("2262-04-11 23:47:16.854775807") + + +def _max_for_storage_dtype(dtype: np.dtype) -> int | None: + """Maximum absolute value representable in ``dtype``, or None if the + bound is too loose to be useful (int64, floats).""" + if dtype.kind in "iu" and dtype.itemsize <= 4: + info = np.iinfo(dtype) + return int(max(abs(info.min), info.max)) + return None + + +def _bound_fits_datetime64_ns(max_value: float | int, units: str) -> bool: + """True if ``ref_date ± max_value * unit_seconds`` fits in datetime64[ns].""" + time_unit, ref_date = _unpack_time_unit_and_ref_date(units) + try: + delta = pd.Timedelta(int(max_value), unit=time_unit) + upper = ref_date + delta + lower = ref_date - delta + except (OutOfBoundsDatetime, OutOfBoundsTimedelta, OverflowError, ValueError): + return False + return _DATETIME64_NS_LOWER <= lower and upper <= _DATETIME64_NS_UPPER + + +def _is_backed_by_lazy_backend(data) -> bool: + """Walk intermediate coder wrappers to find a backend-lazy array. + + Returns True when some level of the wrapper chain is a + ``LazilyIndexedArray`` (zarr/netcdf/DAP backends). Returns False for + in-memory data (plain numpy, ``PandasIndexingAdapter``) where reads + are essentially free and we want to keep the fail-fast semantics. + """ + # Guard against cycles; bound the depth to a small number. + for _ in range(16): + if isinstance(data, indexing.LazilyIndexedArray): + return True + inner = getattr(data, "array", None) + if inner is None or inner is data: + return False + data = inner + return False + + +def _bound_from_attrs(attrs: dict, units: str) -> bool: + """True if ``valid_max`` or ``time_coverage_end`` in attrs proves the + decoded values fit in datetime64[ns].""" + bound = attrs.get("valid_max") + if bound is not None and _bound_fits_datetime64_ns(bound, units): + return True + end = attrs.get("time_coverage_end") + if end is not None: + try: + ts = pd.Timestamp(end) + except (ValueError, TypeError): + return False + # Strip any timezone for comparison with tz-naive ns bounds. + if ts.tzinfo is not None: + ts = ts.tz_convert(None) if ts.tz is not None else ts.tz_localize(None) + return bool(_DATETIME64_NS_LOWER <= ts <= _DATETIME64_NS_UPPER) + return False + + +def _is_overflow_safe_from_metadata( data, units: str, calendar: str | None, use_cftime: bool | None, - time_unit: PDDatetimeUnitOptions = "ns", + time_unit: PDDatetimeUnitOptions, + attrs: dict | None, +) -> bool: + """Whether decode_cf_datetime's output dtype is knowable from metadata. + + Output dtype depends on data values only for ``use_cftime=None`` + + standard calendar + ``time_unit="ns"`` with a backend-lazy source + (where reads are expensive) and no attr/dtype bound that proves + values fit the datetime64[ns] range. + """ + if use_cftime is not None: + return True + # Non-standard calendars always decode to cftime (object dtype). + if calendar is not None and not _is_standard_calendar(calendar): + return True + # Non-"ns" datetime64 resolutions have ranges ~±10¹¹ years — overflow + # is unreachable in practice. + if time_unit != "ns": + return True + # Backend-lazy arrays: reads go through the backend's indexing + # pipeline (zarr sync/async, DAP, etc.) and can be expensive even + # when values are cached. We walk through intermediate coder wrappers + # (_ElementwiseFunctionArray) to find the backing array. In-memory + # wrappers like PandasIndexingAdapter are not covered here — reading + # them is free and preserves fail-fast on invalid values. + if _is_backed_by_lazy_backend(data): + return True + # Chunked + ns + auto: need a proven bound. + if attrs is not None and _bound_from_attrs(attrs, units): + return True + storage_dtype = getattr(data, "dtype", None) + if storage_dtype is not None: + max_val = _max_for_storage_dtype(storage_dtype) + if max_val is not None and _bound_fits_datetime64_ns(max_val, units): + return True + return False + + +def _infer_dtype_from_metadata( + data, + units: str, + calendar: str | None, + use_cftime: bool | None, + time_unit: PDDatetimeUnitOptions, + attrs: dict | None, +) -> np.dtype: + """Infer the decoded dtype from metadata alone, without reading data. + + Raises ``_CannotInferFromMetadata`` when the dtype depends on values + that must be read. For deterministic cases (``use_cftime`` explicit, + non-standard calendar) returns the dtype directly so ``cftime`` does + not need to be importable. Otherwise runs ``decode_cf_datetime`` on + a synthetic ``[0, 1]`` array so resolution-selection logic matches + real-data output. + """ + # Validate units — raises ValueError on malformed strings. + _unpack_time_unit_and_ref_date(units) + + if use_cftime is True: + return np.dtype("object") + if calendar is not None and not _is_standard_calendar(calendar): + return np.dtype("object") + + if not _is_overflow_safe_from_metadata( + data, units, calendar, use_cftime, time_unit, attrs + ): + raise _CannotInferFromMetadata + + result = decode_cf_datetime( + np.array([0, 1]), units, calendar, use_cftime, time_unit + ) + return getattr(result, "dtype", np.dtype("object")) + + +def _infer_dtype_from_first_last( + data, + units: str, + calendar: str | None, + use_cftime: bool | None, + time_unit: PDDatetimeUnitOptions, ) -> np.dtype: - # Verify that at least the first and last date can be decoded - # successfully. Otherwise, tracebacks end up swallowed by - # Dataset.__repr__ when users try to view their lazily decoded array. + """Fallback dtype inference that reads the first and last data values. + + Used only when metadata cannot prove the dtype (chunked int64/float64 + arrays with ``use_cftime=None`` and no value bounds). Reads are + expensive on remote-backed stores but unavoidable in this case. + """ values = indexing.ImplicitToExplicitIndexingAdapter(indexing.as_indexable(data)) example_value = np.concatenate( [to_numpy(first_n_items(values, 1)), to_numpy(last_item(values))] ) + result = decode_cf_datetime(example_value, units, calendar, use_cftime, time_unit) + return getattr(result, "dtype", np.dtype("object")) + +def _wrap_decode_error(err: Exception, units: str, calendar: str | None) -> ValueError: + calendar_msg = ( + "the default calendar" if calendar is None else f"calendar {calendar!r}" + ) + msg = ( + f"unable to decode time units {units!r} with {calendar_msg!r}. Try " + "opening your dataset with decode_times=False or installing cftime " + "if it is not installed." + ) + return ValueError(msg) + + +def _decode_cf_datetime_dtype( + data, + units: str, + calendar: str | None, + use_cftime: bool | None, + time_unit: PDDatetimeUnitOptions = "ns", + attrs: dict | None = None, +) -> np.dtype: try: - result = decode_cf_datetime( - example_value, units, calendar, use_cftime, time_unit + return _infer_dtype_from_metadata( + data, units, calendar, use_cftime, time_unit, attrs ) + except _CannotInferFromMetadata: + pass except Exception as err: - calendar_msg = ( - "the default calendar" if calendar is None else f"calendar {calendar!r}" - ) - msg = ( - f"unable to decode time units {units!r} with {calendar_msg!r}. Try " - "opening your dataset with decode_times=False or installing cftime " - "if it is not installed." - ) - raise ValueError(msg) from err - else: - dtype = getattr(result, "dtype", np.dtype("object")) + raise _wrap_decode_error(err, units, calendar) from err - return dtype + try: + return _infer_dtype_from_first_last( + data, units, calendar, use_cftime, time_unit + ) + except Exception as err: + raise _wrap_decode_error(err, units, calendar) from err def _decode_datetime_with_cftime( @@ -1412,7 +1583,12 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: units = pop_to(attrs, encoding, "units") calendar = pop_to(attrs, encoding, "calendar") dtype = _decode_cf_datetime_dtype( - data, units, calendar, self.use_cftime, self.time_unit + data, + units, + calendar, + self.use_cftime, + self.time_unit, + attrs=attrs, ) transform = partial( decode_cf_datetime, diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index e03f15a5119..db57340b53a 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -20,6 +20,7 @@ ) from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder from xarray.coding.times import ( + _decode_cf_datetime_dtype, _encode_datetime_with_cftime, _netcdf_to_numpy_timeunit, _numpy_to_netcdf_timeunit, @@ -35,6 +36,7 @@ ) from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder +from xarray.core import indexing from xarray.core.common import contains_cftime_datetimes from xarray.core.types import PDDatetimeUnitOptions from xarray.core.utils import is_duck_dask_array @@ -46,6 +48,7 @@ _STANDARD_CALENDARS, DuckArrayWrapper, FirstElementAccessibleArray, + InaccessibleArray, _all_cftime_date_types, arm_xfail, assert_array_equal, @@ -2217,3 +2220,165 @@ def test_roundtrip_empty_datetime64_array(time_unit: PDDatetimeUnitOptions) -> N ) assert_identical(variable, roundtripped) assert roundtripped.dtype == variable.dtype + + +class TestDecodeCFDatetimeDtype: + """Metadata-first dtype inference skips store reads when safe.""" + + @staticmethod + def _inaccessible(dtype: str = "int64", shape: tuple[int, ...] = (3,)): + """An array-like that raises UnexpectedDataAccess on any read. + + Used to prove ``_decode_cf_datetime_dtype`` does not touch data + when metadata alone is sufficient. + """ + return InaccessibleArray(np.zeros(shape, dtype=dtype)) + + @pytest.mark.parametrize( + "use_cftime, expected_kind", + [(True, "O"), (False, "M")], + ids=["use_cftime=True", "use_cftime=False"], + ) + def test_explicit_use_cftime_no_reads(self, use_cftime, expected_kind): + data = self._inaccessible() + dtype = _decode_cf_datetime_dtype( + data, "days since 2000-01-01", "standard", use_cftime + ) + assert dtype.kind == expected_kind + + @requires_cftime + def test_nonstandard_calendar_no_reads(self): + data = self._inaccessible() + dtype = _decode_cf_datetime_dtype( + data, "days since 2000-01-01", "noleap", use_cftime=None + ) + assert dtype == np.dtype("object") + + @pytest.mark.parametrize("time_unit", ["s", "ms", "us"]) + def test_wide_time_unit_no_reads(self, time_unit): + data = self._inaccessible() + dtype = _decode_cf_datetime_dtype( + data, + "days since 2000-01-01", + "standard", + use_cftime=None, + time_unit=time_unit, + ) + assert dtype.kind == "M" + + def test_lazy_backend_array_no_reads(self): + # Backend-lazy wrappers (LazilyIndexedArray) skip reads because + # indexing them would trigger expensive backend I/O. + inner = self._inaccessible(dtype="int64") + lazy = indexing.LazilyIndexedArray(inner) + dtype = _decode_cf_datetime_dtype( + lazy, + "days since 2000-01-01", + "standard", + use_cftime=None, + time_unit="ns", + ) + assert dtype == np.dtype("datetime64[ns]") + + def test_coder_wrapped_lazy_backend_no_reads(self): + # When earlier coders wrap the backend-lazy array in + # _ElementwiseFunctionArray, the fast path still applies — the + # wrapper chain is walked to find the underlying LazilyIndexedArray. + from xarray.coding.common import lazy_elemwise_func + + inner = self._inaccessible(dtype="int64") + lazy = indexing.LazilyIndexedArray(inner) + wrapped = lazy_elemwise_func(lazy, lambda x: x, np.dtype("int64")) + dtype = _decode_cf_datetime_dtype( + wrapped, + "days since 2000-01-01", + "standard", + use_cftime=None, + time_unit="ns", + ) + assert dtype == np.dtype("datetime64[ns]") + + def test_pandas_index_coord_reads_are_performed(self): + # In-memory coord data (PandasIndexingAdapter) is NOT treated as + # backend-lazy — we keep the eager first/last reads so fail-fast + # semantics on overflow values are preserved for in-memory data. + pd = pytest.importorskip("pandas") + adapter = indexing.PandasIndexingAdapter( + pd.Index(np.array([0, 1], dtype="int64")) + ) + dtype = _decode_cf_datetime_dtype( + adapter, + "days since 2000-01-01", + "standard", + use_cftime=None, + time_unit="ns", + ) + # No assertion on "reads" here — we rely on the fallback path + # being reached; overflow detection is covered by existing + # conventions tests (e.g. test_decode_cf_datetime_transition_to_invalid). + assert dtype == np.dtype("datetime64[ns]") + + def test_bounded_storage_dtype_no_reads(self): + # Chunked int32 seconds from 2000 cannot overflow datetime64[ns], + # so the bounds check lets us skip reads without falling through + # to the unchunked shortcut. + dask = pytest.importorskip("dask.array") + chunked = dask.from_array(np.zeros(3, dtype="int32")) + dtype = _decode_cf_datetime_dtype( + chunked, "seconds since 2000-01-01", "standard", use_cftime=None + ) + assert dtype == np.dtype("datetime64[ns]") + + @requires_dask + def test_valid_max_attr_no_reads(self): + import dask.array as da + + # int64 storage would normally trigger a read, but valid_max + # proves no overflow is possible. + chunked = da.from_array(np.zeros(3, dtype="int64")) + chunked_inacc = InaccessibleArray(chunked) + attrs = {"valid_max": 3650} # ~10 years, well within range + dtype = _decode_cf_datetime_dtype( + chunked_inacc, + "days since 2000-01-01", + "standard", + use_cftime=None, + attrs=attrs, + ) + assert dtype == np.dtype("datetime64[ns]") + + @requires_dask + def test_time_coverage_end_attr_no_reads(self): + import dask.array as da + + chunked = da.from_array(np.zeros(3, dtype="int64")) + chunked_inacc = InaccessibleArray(chunked) + attrs = {"time_coverage_end": "2025-12-31T23:59:59Z"} + dtype = _decode_cf_datetime_dtype( + chunked_inacc, + "days since 2000-01-01", + "standard", + use_cftime=None, + attrs=attrs, + ) + assert dtype == np.dtype("datetime64[ns]") + + @requires_dask + def test_chunked_int64_falls_back_to_reads(self): + # Chunked + ns + use_cftime=None + int64 + no bounds: metadata + # cannot prove anything, so we fall back to reading first/last. + import dask.array as da + + chunked = da.from_array(np.array([0, 1, 2], dtype="int64"), chunks=3) + dtype = _decode_cf_datetime_dtype( + chunked, + "days since 2000-01-01", + "standard", + use_cftime=None, + ) + assert dtype == np.dtype("datetime64[ns]") + + def test_invalid_units_raises_value_error(self): + data = self._inaccessible() + with pytest.raises(ValueError, match="unable to decode time units"): + _decode_cf_datetime_dtype(data, "not a valid unit string", None, None)