diff --git a/docs/releases.md b/docs/releases.md index 1b7f6434..3da187b8 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -7,6 +7,9 @@ - `ChunkManifest` can now hold inlined chunks — raw chunk bytes carried directly in memory rather than as references to external files. Intended for parser authors (e.g., loading Kerchunk references with inlined data); not exposed via `loadable_variables`. ([#938](https://github.com/zarr-developers/VirtualiZarr/pull/938)). By [Max Jones](https://github.com/maxrjones) and [Tom Nicholas](https://github.com/TomNicholas). +- `KerchunkJSONParser` and `KerchunkParquetParser` now parse inline chunk data (both raw-string and `base64:`-prefixed forms) into inlined `ChunkManifest` entries, instead of raising `NotImplementedError`. Fixes the read side of [#489](https://github.com/zarr-developers/VirtualiZarr/issues/489); writer support for inlined chunks will follow in a separate PR. + ([#979](https://github.com/zarr-developers/VirtualiZarr/pull/979)). + By [Tom Nicholas](https://github.com/TomNicholas). ### Breaking changes diff --git a/virtualizarr/parsers/kerchunk/translator.py b/virtualizarr/parsers/kerchunk/translator.py index aea14a3b..6556e33d 100644 --- a/virtualizarr/parsers/kerchunk/translator.py +++ b/virtualizarr/parsers/kerchunk/translator.py @@ -1,5 +1,6 @@ from __future__ import annotations +import base64 import math from collections.abc import Iterable from typing import cast @@ -215,14 +216,20 @@ def manifest_from_kerchunk_chunk_dict( chunk_entries: dict[ChunkKey, ChunkEntry] = {} for k, v in kerchunk_chunk_dict.items(): if isinstance(v, (str, bytes)): - raise NotImplementedError( - "Reading inlined reference data is currently not supported." - "See https://github.com/zarr-developers/VirtualiZarr/issues/489", + # Kerchunk inline form: a raw string (interpreted as bytes) or, when + # prefixed with "base64:", a base64-encoded payload. + raw = v.encode() if isinstance(v, str) else v + if raw.startswith(b"base64:"): + data = base64.b64decode(raw[len(b"base64:") :]) + else: + data = raw + chunk_entries[k] = ChunkEntry( + path="", offset=0, length=len(data), data=data ) - elif not isinstance(v, (tuple, list)): + elif isinstance(v, (tuple, list)): + chunk_entries[k] = chunkentry_from_kerchunk(v, fs_root=fs_root) + else: raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}") - - chunk_entries[k] = chunkentry_from_kerchunk(v, fs_root=fs_root) return ChunkManifest(entries=chunk_entries, shape=shape) diff --git a/virtualizarr/tests/test_parsers/test_kerchunk.py b/virtualizarr/tests/test_parsers/test_kerchunk.py index be985352..3b1d8dd5 100644 --- a/virtualizarr/tests/test_parsers/test_kerchunk.py +++ b/virtualizarr/tests/test_parsers/test_kerchunk.py @@ -374,55 +374,127 @@ def test_open_virtual_dataset_existing_kerchunk_refs( @requires_kerchunk -def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref, local_registry): - # For now, we raise a NotImplementedError if we read existing references that have inlined data - # https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932 - - ref_filepath = tmp_path / "ref.json" +def _refs_with_one_inlined_one_virtual(inline_repr: str) -> dict: + """Kerchunk refs describing a (2, 4) int32 array with two (1, 4) chunks: + position (0, 0) is inlined (carrying ``inline_repr``), position (1, 0) is virtual.""" + return { + "version": 1, + "refs": { + ".zgroup": '{"zarr_format":2}', + "a/.zarray": ( + '{"chunks":[1,4],"compressor":null,"dtype":"