From d075ece373307dd8e2032f1d5387faec9738c361 Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 16 Apr 2026 19:47:04 -0500 Subject: [PATCH 1/7] Prefer metadata over store reads in _decode_cf_datetime_dtype Restructure dtype inference into a metadata-first path that falls back to reading the first and last values only when metadata cannot prove safety. For well-formed data the fast path runs decode_cf_datetime on a synthetic [0, 1] array so resolution-selection logic matches real-data output. Adds _verify_decoded_dtype to CFDatetimeCoder.decode so an overflow mismatch between declared and actual datetime dtype raises a clear TypeError instead of silently corrupting dask map_blocks output. Addresses pydata/xarray#11303. --- xarray/coding/times.py | 214 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 195 insertions(+), 19 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index b9324d8395e..cf98b1c1324 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -336,39 +336,206 @@ def _unpack_time_units_and_ref_date_cftime(units: str, calendar: str): return time_units, ref_date -def _decode_cf_datetime_dtype( +class _CannotInferFromMetadata(Exception): + """Raised by _infer_dtype_from_metadata when data reads are required.""" + + +# Edges of the datetime64[ns] representable range (the value range of a +# pandas.Timestamp at nanosecond resolution). +_DATETIME64_NS_LOWER = pd.Timestamp("1677-09-21 00:12:43.145224193") +_DATETIME64_NS_UPPER = pd.Timestamp("2262-04-11 23:47:16.854775807") + + +def _max_for_storage_dtype(dtype: np.dtype) -> int | None: + """Maximum absolute value representable in ``dtype``, or None if the + bound is too loose to be useful (int64, floats).""" + if dtype.kind in "iu" and dtype.itemsize <= 4: + info = np.iinfo(dtype) + return int(max(abs(info.min), info.max)) + return None + + +def _bound_fits_datetime64_ns(max_value: float | int, units: str) -> bool: + """True if ``ref_date ± max_value * unit_seconds`` fits in datetime64[ns].""" + time_unit, ref_date = _unpack_time_unit_and_ref_date(units) + try: + delta = pd.Timedelta(int(max_value), unit=time_unit) + upper = ref_date + delta + lower = ref_date - delta + except (OutOfBoundsDatetime, OutOfBoundsTimedelta, OverflowError, ValueError): + return False + return _DATETIME64_NS_LOWER <= lower and upper <= _DATETIME64_NS_UPPER + + +def _bound_from_attrs(attrs: dict, units: str) -> bool: + """True if ``valid_max`` or ``time_coverage_end`` in attrs proves the + decoded values fit in datetime64[ns].""" + bound = attrs.get("valid_max") + if bound is not None and _bound_fits_datetime64_ns(bound, units): + return True + end = attrs.get("time_coverage_end") + if end is not None: + try: + ts = pd.Timestamp(end) + except (ValueError, TypeError): + return False + return bool(_DATETIME64_NS_LOWER <= ts <= _DATETIME64_NS_UPPER) + return False + + +def _is_overflow_safe_from_metadata( data, units: str, calendar: str | None, use_cftime: bool | None, - time_unit: PDDatetimeUnitOptions = "ns", + time_unit: PDDatetimeUnitOptions, + attrs: dict | None, +) -> bool: + """Whether decode_cf_datetime's output dtype is knowable from metadata. + + Output dtype depends on data values only when all of the following + hold: ``use_cftime=None``, standard calendar, ``time_unit="ns"``, data + is chunked (so dask enforces the declared dtype), and we have no + explicit bound that proves values fit the datetime64[ns] range. + """ + if use_cftime is not None: + return True + # Non-standard calendars always decode to cftime (object dtype). + if calendar is not None and not _is_standard_calendar(calendar): + return True + # Non-"ns" datetime64 resolutions have ranges ~±10¹¹ years — overflow + # is unreachable in practice. + if time_unit != "ns": + return True + # Non-chunked data: sync decode (_ElementwiseFunctionArray) returns the + # function's actual output dtype and xarray reconciles at access time. + if not is_chunked_array(data): + return True + # Chunked + ns + auto: need a proven bound. + if attrs is not None and _bound_from_attrs(attrs, units): + return True + storage_dtype = getattr(data, "dtype", None) + if storage_dtype is not None: + max_val = _max_for_storage_dtype(storage_dtype) + if max_val is not None and _bound_fits_datetime64_ns(max_val, units): + return True + return False + + +def _infer_dtype_from_metadata( + data, + units: str, + calendar: str | None, + use_cftime: bool | None, + time_unit: PDDatetimeUnitOptions, + attrs: dict | None, ) -> np.dtype: - # Verify that at least the first and last date can be decoded - # successfully. Otherwise, tracebacks end up swallowed by - # Dataset.__repr__ when users try to view their lazily decoded array. + """Infer the decoded dtype from metadata alone, without reading data. + + Raises ``_CannotInferFromMetadata`` when the dtype depends on values + that must be read. Otherwise runs ``decode_cf_datetime`` on a + synthetic ``[0, 1]`` array so the same resolution-selection logic + that runs on real data also produces the declared dtype. + """ + # Validate units — raises ValueError on malformed strings. + _unpack_time_unit_and_ref_date(units) + + if not _is_overflow_safe_from_metadata( + data, units, calendar, use_cftime, time_unit, attrs + ): + raise _CannotInferFromMetadata + + result = decode_cf_datetime( + np.array([0, 1]), units, calendar, use_cftime, time_unit + ) + return getattr(result, "dtype", np.dtype("object")) + + +def _infer_dtype_from_first_last( + data, + units: str, + calendar: str | None, + use_cftime: bool | None, + time_unit: PDDatetimeUnitOptions, +) -> np.dtype: + """Fallback dtype inference that reads the first and last data values. + + Used only when metadata cannot prove the dtype (chunked int64/float64 + arrays with ``use_cftime=None`` and no value bounds). Reads are + expensive on remote-backed stores but unavoidable in this case. + """ values = indexing.ImplicitToExplicitIndexingAdapter(indexing.as_indexable(data)) example_value = np.concatenate( [to_numpy(first_n_items(values, 1)), to_numpy(last_item(values))] ) + result = decode_cf_datetime(example_value, units, calendar, use_cftime, time_unit) + return getattr(result, "dtype", np.dtype("object")) + + +def _wrap_decode_error(err: Exception, units: str, calendar: str | None) -> ValueError: + calendar_msg = ( + "the default calendar" if calendar is None else f"calendar {calendar!r}" + ) + msg = ( + f"unable to decode time units {units!r} with {calendar_msg!r}. Try " + "opening your dataset with decode_times=False or installing cftime " + "if it is not installed." + ) + return ValueError(msg) + +def _decode_cf_datetime_dtype( + data, + units: str, + calendar: str | None, + use_cftime: bool | None, + time_unit: PDDatetimeUnitOptions = "ns", + attrs: dict | None = None, +) -> np.dtype: try: - result = decode_cf_datetime( - example_value, units, calendar, use_cftime, time_unit + return _infer_dtype_from_metadata( + data, units, calendar, use_cftime, time_unit, attrs ) + except _CannotInferFromMetadata: + pass except Exception as err: - calendar_msg = ( - "the default calendar" if calendar is None else f"calendar {calendar!r}" - ) - msg = ( - f"unable to decode time units {units!r} with {calendar_msg!r}. Try " - "opening your dataset with decode_times=False or installing cftime " - "if it is not installed." + raise _wrap_decode_error(err, units, calendar) from err + + try: + return _infer_dtype_from_first_last( + data, units, calendar, use_cftime, time_unit ) - raise ValueError(msg) from err - else: - dtype = getattr(result, "dtype", np.dtype("object")) + except Exception as err: + raise _wrap_decode_error(err, units, calendar) from err + + +def _verify_decoded_dtype(func: Callable, expected_dtype: np.dtype) -> Callable: + """Wrap ``func`` to raise on datetime64/object dtype mismatches. + + The lazy decode pipeline advertises ``expected_dtype`` upfront; when + data is chunked, dask.map_blocks uses it to cast the function's + output. If decoded values land on the wrong side of the + datetime64/object boundary (e.g. overflow forces a cftime fallback), + this wrapper raises ``TypeError`` instead of silently casting. + Different datetime64 resolutions are allowed — only the + datetime64/object boundary matters. + """ + expected_is_datetime = expected_dtype.kind == "M" + + def wrapper(values): + result = func(values) + result_dtype = getattr(result, "dtype", np.dtype("object")) + if (result_dtype.kind == "M") != expected_is_datetime: + raise TypeError( + f"decoded datetime dtype {result_dtype!r} does not match " + f"declared dtype {expected_dtype!r}. Values likely fall " + f"outside the representable range of {expected_dtype!r}; " + "pass use_cftime=True or a coarser time_unit " + "(e.g. 'us' or 's') to decode_times." + ) + return result - return dtype + return wrapper def _decode_datetime_with_cftime( @@ -1412,7 +1579,12 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: units = pop_to(attrs, encoding, "units") calendar = pop_to(attrs, encoding, "calendar") dtype = _decode_cf_datetime_dtype( - data, units, calendar, self.use_cftime, self.time_unit + data, + units, + calendar, + self.use_cftime, + self.time_unit, + attrs=attrs, ) transform = partial( decode_cf_datetime, @@ -1421,6 +1593,10 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: use_cftime=self.use_cftime, time_unit=self.time_unit, ) + # Guard against dtype mismatches (e.g. overflow values that would + # cause dask map_blocks to silently cast cftime output to + # datetime64 garbage). + transform = _verify_decoded_dtype(transform, dtype) data = lazy_elemwise_func(data, transform, dtype) return Variable(dims, data, attrs, encoding, fastpath=True) From 3be423754d7854004e8dbb79591343b40b91fa39 Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 16 Apr 2026 19:47:18 -0500 Subject: [PATCH 2/7] Add tests for metadata-first CF datetime dtype inference Cover all paths that should skip store reads (explicit use_cftime, non-standard calendar, wide time_unit, non-chunked data, narrow storage dtype bounds, valid_max / time_coverage_end attrs) plus the chunked int64 fallback and malformed units error. Uses InaccessibleArray to prove no reads occur on the fast paths and a TypeError is raised for the verification wrapper on dtype kind mismatches. --- xarray/tests/test_coding_times.py | 149 ++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 15e38233c4b..86cdbbd4aea 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -20,6 +20,7 @@ ) from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder from xarray.coding.times import ( + _decode_cf_datetime_dtype, _encode_datetime_with_cftime, _netcdf_to_numpy_timeunit, _numpy_to_netcdf_timeunit, @@ -46,6 +47,7 @@ _STANDARD_CALENDARS, DuckArrayWrapper, FirstElementAccessibleArray, + InaccessibleArray, _all_cftime_date_types, arm_xfail, assert_array_equal, @@ -2213,3 +2215,150 @@ def test_roundtrip_empty_datetime64_array(time_unit: PDDatetimeUnitOptions) -> N ) assert_identical(variable, roundtripped) assert roundtripped.dtype == variable.dtype + + +class TestDecodeCFDatetimeDtype: + """Metadata-first dtype inference skips store reads when safe.""" + + @staticmethod + def _inaccessible(dtype: str = "int64", shape: tuple[int, ...] = (3,)): + """An array-like that raises UnexpectedDataAccess on any read. + + Used to prove ``_decode_cf_datetime_dtype`` does not touch data + when metadata alone is sufficient. + """ + return InaccessibleArray(np.zeros(shape, dtype=dtype)) + + @pytest.mark.parametrize( + "use_cftime, expected_kind", + [(True, "O"), (False, "M")], + ids=["use_cftime=True", "use_cftime=False"], + ) + def test_explicit_use_cftime_no_reads(self, use_cftime, expected_kind): + data = self._inaccessible() + dtype = _decode_cf_datetime_dtype( + data, "days since 2000-01-01", "standard", use_cftime + ) + assert dtype.kind == expected_kind + + @requires_cftime + def test_nonstandard_calendar_no_reads(self): + data = self._inaccessible() + dtype = _decode_cf_datetime_dtype( + data, "days since 2000-01-01", "noleap", use_cftime=None + ) + assert dtype == np.dtype("object") + + @pytest.mark.parametrize("time_unit", ["s", "ms", "us"]) + def test_wide_time_unit_no_reads(self, time_unit): + data = self._inaccessible() + dtype = _decode_cf_datetime_dtype( + data, + "days since 2000-01-01", + "standard", + use_cftime=None, + time_unit=time_unit, + ) + assert dtype.kind == "M" + + def test_unchunked_data_no_reads(self): + # With use_cftime=None + ns + non-chunked, dtype mismatch at access + # time is harmless (sync path reconciles), so no reads needed. + data = self._inaccessible(dtype="int64") + dtype = _decode_cf_datetime_dtype( + data, + "days since 2000-01-01", + "standard", + use_cftime=None, + time_unit="ns", + ) + assert dtype == np.dtype("datetime64[ns]") + + def test_bounded_storage_dtype_no_reads(self): + # Chunked int32 seconds from 2000 cannot overflow datetime64[ns], + # so the bounds check lets us skip reads without falling through + # to the unchunked shortcut. + dask = pytest.importorskip("dask.array") + chunked = dask.from_array(np.zeros(3, dtype="int32")) + dtype = _decode_cf_datetime_dtype( + chunked, "seconds since 2000-01-01", "standard", use_cftime=None + ) + assert dtype == np.dtype("datetime64[ns]") + + @requires_dask + def test_valid_max_attr_no_reads(self): + import dask.array as da + + # int64 storage would normally trigger a read, but valid_max + # proves no overflow is possible. + chunked = da.from_array(np.zeros(3, dtype="int64")) + chunked_inacc = InaccessibleArray(chunked) + attrs = {"valid_max": 3650} # ~10 years, well within range + dtype = _decode_cf_datetime_dtype( + chunked_inacc, + "days since 2000-01-01", + "standard", + use_cftime=None, + attrs=attrs, + ) + assert dtype == np.dtype("datetime64[ns]") + + @requires_dask + def test_time_coverage_end_attr_no_reads(self): + import dask.array as da + + chunked = da.from_array(np.zeros(3, dtype="int64")) + chunked_inacc = InaccessibleArray(chunked) + attrs = {"time_coverage_end": "2025-12-31T23:59:59Z"} + dtype = _decode_cf_datetime_dtype( + chunked_inacc, + "days since 2000-01-01", + "standard", + use_cftime=None, + attrs=attrs, + ) + assert dtype == np.dtype("datetime64[ns]") + + @requires_dask + def test_chunked_int64_falls_back_to_reads(self): + # Chunked + ns + use_cftime=None + int64 + no bounds: metadata + # cannot prove anything, so we fall back to reading first/last. + import dask.array as da + + chunked = da.from_array(np.array([0, 1, 2], dtype="int64"), chunks=3) + dtype = _decode_cf_datetime_dtype( + chunked, + "days since 2000-01-01", + "standard", + use_cftime=None, + ) + assert dtype == np.dtype("datetime64[ns]") + + def test_invalid_units_raises_value_error(self): + data = self._inaccessible() + with pytest.raises(ValueError, match="unable to decode time units"): + _decode_cf_datetime_dtype(data, "not a valid unit string", None, None) + + def test_verification_wrapper_raises_on_datetime_vs_object_mismatch(self): + # Metadata predicts datetime64[ns] but the decoder returns cftime + # objects (the overflow case). The wrapper must raise TypeError. + from xarray.coding.times import _verify_decoded_dtype + + def fake_decoder(values): + return np.array([object()], dtype="object") # cftime-like object output + + guarded = _verify_decoded_dtype(fake_decoder, np.dtype("datetime64[ns]")) + with pytest.raises(TypeError, match="does not match declared dtype"): + guarded(np.array([0, 1])) + + def test_verification_wrapper_accepts_different_datetime64_resolutions(self): + # Declared datetime64[ns] but decoder returns datetime64[us] — both + # are datetime64, so the wrapper should pass them through. + from xarray.coding.times import _verify_decoded_dtype + + def fake_decoder(values): + return np.array([0, 1], dtype="datetime64[us]") + + guarded = _verify_decoded_dtype(fake_decoder, np.dtype("datetime64[ns]")) + result = guarded(np.array([0, 1])) + assert result.dtype == np.dtype("datetime64[us]") From 285a93a900c70af51b6cbbfc1da7203dc2ea119d Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 16 Apr 2026 19:47:27 -0500 Subject: [PATCH 3/7] Add whats-new entry for CF datetime dtype inference speedup --- doc/whats-new.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ab96aff4d1a..ad6a37b32d6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -104,6 +104,20 @@ Deprecations ~~~~~~~~~~~~ +Performance +~~~~~~~~~~~ + +- Skip data reads during CF datetime dtype inference when the output dtype is + determinable from metadata alone (``use_cftime`` explicitly set, non-standard + calendar, non-nanosecond ``time_unit``, non-chunked data, or when + ``valid_max`` / ``time_coverage_end`` / a narrow storage dtype bounds the + possible values). This can significantly speed up + :py:func:`open_dataset` / :py:func:`open_datatree` on remote-backed stores + (zarr on S3, icechunk) with many time-encoded variables. A runtime + dtype-mismatch guard on the lazy decode wrapper prevents the rare overflow + case from being silently cast by dask. + By `Alfonso Ladino `_. + Bug Fixes ~~~~~~~~~ From db2bb2cbd35d678bcded1e4f22100b71555cd0ea Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 16 Apr 2026 20:30:19 -0500 Subject: [PATCH 4/7] Address CI review: narrow fast path, drop runtime verification wrapper - Restrict the "skip reads" fast path to backend-lazy data (LazilyIndexedArray, walked through intermediate coder wrappers). In-memory PandasIndexingAdapter arrays now keep the eager first/last reads so fail-fast semantics for overflow values are preserved (fixes test_decode_cf_datetime_transition_to_invalid regression). - Short-circuit use_cftime=True and non-standard calendars in the metadata path so cftime is not required for the fast path (fixes bare-min CI failure). - Drop the `_verify_decoded_dtype` guard. Instead rely on the fallback path's reads to surface the correct cftime dtype when the metadata-based check cannot prove safety. - Rename the transform-wrapping intermediate to avoid the mypy reassignment error on CFDatetimeCoder.decode. --- xarray/coding/times.py | 78 +++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index de4751d2245..ffb6e56f607 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -367,6 +367,25 @@ def _bound_fits_datetime64_ns(max_value: float | int, units: str) -> bool: return _DATETIME64_NS_LOWER <= lower and upper <= _DATETIME64_NS_UPPER +def _is_backed_by_lazy_backend(data) -> bool: + """Walk intermediate coder wrappers to find a backend-lazy array. + + Returns True when some level of the wrapper chain is a + ``LazilyIndexedArray`` (zarr/netcdf/DAP backends). Returns False for + in-memory data (plain numpy, ``PandasIndexingAdapter``) where reads + are essentially free and we want to keep the fail-fast semantics. + """ + # Guard against cycles; bound the depth to a small number. + for _ in range(16): + if isinstance(data, indexing.LazilyIndexedArray): + return True + inner = getattr(data, "array", None) + if inner is None or inner is data: + return False + data = inner + return False + + def _bound_from_attrs(attrs: dict, units: str) -> bool: """True if ``valid_max`` or ``time_coverage_end`` in attrs proves the decoded values fit in datetime64[ns].""" @@ -379,6 +398,9 @@ def _bound_from_attrs(attrs: dict, units: str) -> bool: ts = pd.Timestamp(end) except (ValueError, TypeError): return False + # Strip any timezone for comparison with tz-naive ns bounds. + if ts.tzinfo is not None: + ts = ts.tz_convert(None) if ts.tz is not None else ts.tz_localize(None) return bool(_DATETIME64_NS_LOWER <= ts <= _DATETIME64_NS_UPPER) return False @@ -407,9 +429,13 @@ def _is_overflow_safe_from_metadata( # is unreachable in practice. if time_unit != "ns": return True - # Non-chunked data: sync decode (_ElementwiseFunctionArray) returns the - # function's actual output dtype and xarray reconciles at access time. - if not is_chunked_array(data): + # Backend-lazy arrays: reads go through the backend's indexing + # pipeline (zarr sync/async, DAP, etc.) and can be expensive even + # when values are cached. We walk through intermediate coder wrappers + # (_ElementwiseFunctionArray) to find the backing array. In-memory + # wrappers like PandasIndexingAdapter are not covered here — reading + # them is free and preserves fail-fast on invalid values. + if _is_backed_by_lazy_backend(data): return True # Chunked + ns + auto: need a proven bound. if attrs is not None and _bound_from_attrs(attrs, units): @@ -433,13 +459,20 @@ def _infer_dtype_from_metadata( """Infer the decoded dtype from metadata alone, without reading data. Raises ``_CannotInferFromMetadata`` when the dtype depends on values - that must be read. Otherwise runs ``decode_cf_datetime`` on a - synthetic ``[0, 1]`` array so the same resolution-selection logic - that runs on real data also produces the declared dtype. + that must be read. For deterministic cases (``use_cftime`` explicit, + non-standard calendar) returns the dtype directly so ``cftime`` does + not need to be importable. Otherwise runs ``decode_cf_datetime`` on + a synthetic ``[0, 1]`` array so resolution-selection logic matches + real-data output. """ # Validate units — raises ValueError on malformed strings. _unpack_time_unit_and_ref_date(units) + if use_cftime is True: + return np.dtype("object") + if calendar is not None and not _is_standard_calendar(calendar): + return np.dtype("object") + if not _is_overflow_safe_from_metadata( data, units, calendar, use_cftime, time_unit, attrs ): @@ -509,35 +542,6 @@ def _decode_cf_datetime_dtype( raise _wrap_decode_error(err, units, calendar) from err -def _verify_decoded_dtype(func: Callable, expected_dtype: np.dtype) -> Callable: - """Wrap ``func`` to raise on datetime64/object dtype mismatches. - - The lazy decode pipeline advertises ``expected_dtype`` upfront; when - data is chunked, dask.map_blocks uses it to cast the function's - output. If decoded values land on the wrong side of the - datetime64/object boundary (e.g. overflow forces a cftime fallback), - this wrapper raises ``TypeError`` instead of silently casting. - Different datetime64 resolutions are allowed — only the - datetime64/object boundary matters. - """ - expected_is_datetime = expected_dtype.kind == "M" - - def wrapper(values): - result = func(values) - result_dtype = getattr(result, "dtype", np.dtype("object")) - if (result_dtype.kind == "M") != expected_is_datetime: - raise TypeError( - f"decoded datetime dtype {result_dtype!r} does not match " - f"declared dtype {expected_dtype!r}. Values likely fall " - f"outside the representable range of {expected_dtype!r}; " - "pass use_cftime=True or a coarser time_unit " - "(e.g. 'us' or 's') to decode_times." - ) - return result - - return wrapper - - def _decode_datetime_with_cftime( num_dates: np.ndarray, units: str, calendar: str ) -> np.ndarray: @@ -1593,10 +1597,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: use_cftime=self.use_cftime, time_unit=self.time_unit, ) - # Guard against dtype mismatches (e.g. overflow values that would - # cause dask map_blocks to silently cast cftime output to - # datetime64 garbage). - transform = _verify_decoded_dtype(transform, dtype) data = lazy_elemwise_func(data, transform, dtype) return Variable(dims, data, attrs, encoding, fastpath=True) From 65fadf7fc9d2ab654af6a32802f799e2b33bedc9 Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 16 Apr 2026 20:30:32 -0500 Subject: [PATCH 5/7] Update CF datetime dtype inference tests for narrowed fast path - Replace the "non-chunked data" test with tests that exercise the new LazilyIndexedArray check, including coder-wrapped lazy backends. - Add a regression test for in-memory PandasIndexingAdapter data to ensure it still falls through to the eager reads path. - Drop the verification-wrapper tests along with the wrapper itself. --- xarray/tests/test_coding_times.py | 74 +++++++++++++++++++------------ 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 52b56e7cc30..db57340b53a 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -36,6 +36,7 @@ ) from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder +from xarray.core import indexing from xarray.core.common import contains_cftime_datetimes from xarray.core.types import PDDatetimeUnitOptions from xarray.core.utils import is_duck_dask_array @@ -2265,12 +2266,13 @@ def test_wide_time_unit_no_reads(self, time_unit): ) assert dtype.kind == "M" - def test_unchunked_data_no_reads(self): - # With use_cftime=None + ns + non-chunked, dtype mismatch at access - # time is harmless (sync path reconciles), so no reads needed. - data = self._inaccessible(dtype="int64") + def test_lazy_backend_array_no_reads(self): + # Backend-lazy wrappers (LazilyIndexedArray) skip reads because + # indexing them would trigger expensive backend I/O. + inner = self._inaccessible(dtype="int64") + lazy = indexing.LazilyIndexedArray(inner) dtype = _decode_cf_datetime_dtype( - data, + lazy, "days since 2000-01-01", "standard", use_cftime=None, @@ -2278,6 +2280,44 @@ def test_unchunked_data_no_reads(self): ) assert dtype == np.dtype("datetime64[ns]") + def test_coder_wrapped_lazy_backend_no_reads(self): + # When earlier coders wrap the backend-lazy array in + # _ElementwiseFunctionArray, the fast path still applies — the + # wrapper chain is walked to find the underlying LazilyIndexedArray. + from xarray.coding.common import lazy_elemwise_func + + inner = self._inaccessible(dtype="int64") + lazy = indexing.LazilyIndexedArray(inner) + wrapped = lazy_elemwise_func(lazy, lambda x: x, np.dtype("int64")) + dtype = _decode_cf_datetime_dtype( + wrapped, + "days since 2000-01-01", + "standard", + use_cftime=None, + time_unit="ns", + ) + assert dtype == np.dtype("datetime64[ns]") + + def test_pandas_index_coord_reads_are_performed(self): + # In-memory coord data (PandasIndexingAdapter) is NOT treated as + # backend-lazy — we keep the eager first/last reads so fail-fast + # semantics on overflow values are preserved for in-memory data. + pd = pytest.importorskip("pandas") + adapter = indexing.PandasIndexingAdapter( + pd.Index(np.array([0, 1], dtype="int64")) + ) + dtype = _decode_cf_datetime_dtype( + adapter, + "days since 2000-01-01", + "standard", + use_cftime=None, + time_unit="ns", + ) + # No assertion on "reads" here — we rely on the fallback path + # being reached; overflow detection is covered by existing + # conventions tests (e.g. test_decode_cf_datetime_transition_to_invalid). + assert dtype == np.dtype("datetime64[ns]") + def test_bounded_storage_dtype_no_reads(self): # Chunked int32 seconds from 2000 cannot overflow datetime64[ns], # so the bounds check lets us skip reads without falling through @@ -2342,27 +2382,3 @@ def test_invalid_units_raises_value_error(self): data = self._inaccessible() with pytest.raises(ValueError, match="unable to decode time units"): _decode_cf_datetime_dtype(data, "not a valid unit string", None, None) - - def test_verification_wrapper_raises_on_datetime_vs_object_mismatch(self): - # Metadata predicts datetime64[ns] but the decoder returns cftime - # objects (the overflow case). The wrapper must raise TypeError. - from xarray.coding.times import _verify_decoded_dtype - - def fake_decoder(values): - return np.array([object()], dtype="object") # cftime-like object output - - guarded = _verify_decoded_dtype(fake_decoder, np.dtype("datetime64[ns]")) - with pytest.raises(TypeError, match="does not match declared dtype"): - guarded(np.array([0, 1])) - - def test_verification_wrapper_accepts_different_datetime64_resolutions(self): - # Declared datetime64[ns] but decoder returns datetime64[us] — both - # are datetime64, so the wrapper should pass them through. - from xarray.coding.times import _verify_decoded_dtype - - def fake_decoder(values): - return np.array([0, 1], dtype="datetime64[us]") - - guarded = _verify_decoded_dtype(fake_decoder, np.dtype("datetime64[ns]")) - result = guarded(np.array([0, 1])) - assert result.dtype == np.dtype("datetime64[us]") From 7392d70a089563591f02b00b17fe2fdd284dc59d Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 16 Apr 2026 20:34:52 -0500 Subject: [PATCH 6/7] Refresh _is_overflow_safe_from_metadata docstring The wording still referred to the old "chunked data" criterion. Update it to describe the current backend-laziness check. --- xarray/coding/times.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index ffb6e56f607..ca2ab96a7bd 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -415,10 +415,10 @@ def _is_overflow_safe_from_metadata( ) -> bool: """Whether decode_cf_datetime's output dtype is knowable from metadata. - Output dtype depends on data values only when all of the following - hold: ``use_cftime=None``, standard calendar, ``time_unit="ns"``, data - is chunked (so dask enforces the declared dtype), and we have no - explicit bound that proves values fit the datetime64[ns] range. + Output dtype depends on data values only for ``use_cftime=None`` + + standard calendar + ``time_unit="ns"`` with a backend-lazy source + (where reads are expensive) and no attr/dtype bound that proves + values fit the datetime64[ns] range. """ if use_cftime is not None: return True From f28ef9c68569c165ce7ed979c51e481d76367e93 Mon Sep 17 00:00:00 2001 From: aladinor Date: Thu, 16 Apr 2026 20:35:03 -0500 Subject: [PATCH 7/7] Update whats-new entry to match final implementation Drop the mention of the runtime dtype-mismatch guard (removed) and reference the backing issue. --- doc/whats-new.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 23450285e9f..460188ab648 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -164,13 +164,11 @@ Performance - Skip data reads during CF datetime dtype inference when the output dtype is determinable from metadata alone (``use_cftime`` explicitly set, non-standard - calendar, non-nanosecond ``time_unit``, non-chunked data, or when + calendar, non-nanosecond ``time_unit``, lazy backend arrays, or when ``valid_max`` / ``time_coverage_end`` / a narrow storage dtype bounds the possible values). This can significantly speed up :py:func:`open_dataset` / :py:func:`open_datatree` on remote-backed stores - (zarr on S3, icechunk) with many time-encoded variables. A runtime - dtype-mismatch guard on the lazy decode wrapper prevents the rare overflow - case from being silently cast by dask. + (zarr on S3, icechunk) with many time-encoded variables (:issue:`11303`). By `Alfonso Ladino `_. Bug Fixes