diff --git a/docs/data_structures.md b/docs/data_structures.md index d21252b0..9892d39c 100644 --- a/docs/data_structures.md +++ b/docs/data_structures.md @@ -94,6 +94,58 @@ lengths = np.asarray([100, 100], dtype=np.uint64) manifest = ChunkManifest.from_arrays(paths=paths, offsets=offsets, lengths=lengths) ``` +### Chunk states + +Every position in a `ChunkManifest` is in one of three states, distinguished by the value of `path` in its entry: + +| State | `path` | Meaning | +|----------|---------------------------------------|----------------------------------------------------------------------------------------------| +| Virtual | a real URI (e.g., `"s3://bucket/foo.nc"`) | Chunk lives at the given byte range in an external file. | +| Missing | `""` (`MISSING_CHUNK_PATH`) | Chunk is absent. Reads return the array's `fill_value`. | +| Inlined | `"__inlined__"` (`INLINED_CHUNK_PATH`)| Raw bytes for the chunk are stored in memory in the manifest's `_inlined` dict (see below). | + +Parser authors are free to mix all three states within a single manifest. + +### Inlined chunks + +So far every chunk in the manifest has pointed to a byte range in some external file. +A `ChunkManifest` can also hold **inlined chunks**: the raw chunk bytes are carried directly inside the manifest itself, rather than referenced from an external file. + +Inlined chunks are useful for small variables — coordinate arrays, dimension labels, scalar metadata — where the overhead of a remote read exceeds the cost of just carrying the bytes along. + +Inlined chunks are produced by [parsers](custom_parsers.md), not by end users; there is no way to request them via `loadable_variables`. If you are writing a custom parser for a format that stores small inlined references (e.g., Kerchunk JSON), you can emit them using the constructors below. + +Internally, inlined chunks live in a sparse dictionary `_inlined: dict[tuple[int, ...], bytes]` on the `ChunkManifest`, keyed by chunk grid index. 
The corresponding entry in the paths array is set to the `INLINED_CHUNK_PATH` sentinel. + +To create a manifest with inlined chunks, pass entries with a `data` key: + +```python +from virtualizarr.manifests import ChunkManifest + +manifest = ChunkManifest( + entries={ + "0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.1": {"path": "", "offset": 0, "length": 4, "data": b"\x00\x01\x02\x03"}, + } +) +``` + +Or via `from_arrays` with the `inlined` parameter: + +```python +import numpy as np +from virtualizarr.manifests import ChunkManifest + +manifest = ChunkManifest.from_arrays( + paths=np.asarray(["s3://bucket/foo.nc", ""], dtype=np.dtypes.StringDType()), + offsets=np.asarray([100, 0], dtype=np.uint64), + lengths=np.asarray([100, 4], dtype=np.uint64), + inlined={(1,): b"\x00\x01\x02\x03"}, +) +``` + +Inlined chunks participate in all manifest operations: concatenation and stacking shift their indices, broadcasting prepends singleton dimensions to their keys, equality compares the inlined bytes, pickling carries the data along (for Dask/multiprocessing), `ManifestStore` reads return them directly from memory, and `nbytes` includes their size. + ## `ManifestArray` class A Zarr array is defined not just by the location of its constituent chunk data, but by its array-level attributes such as `shape` and `dtype`. diff --git a/docs/releases.md b/docs/releases.md index b1c376c2..1b7f6434 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -4,6 +4,10 @@ ### New Features +- `ChunkManifest` can now hold inlined chunks — raw chunk bytes carried directly in memory rather than as references to external files. Intended for parser authors (e.g., loading Kerchunk references with inlined data); not exposed via `loadable_variables`. + ([#938](https://github.com/zarr-developers/VirtualiZarr/pull/938)). + By [Max Jones](https://github.com/maxrjones) and [Tom Nicholas](https://github.com/TomNicholas). 
+ ### Breaking changes ### Bug fixes diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 65e0524a..e3ac8525 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,3 +1,4 @@ +import itertools from typing import TYPE_CHECKING, Any, Callable, Union, cast import numpy as np @@ -214,11 +215,23 @@ def _concat_manifests(manifests: list[ChunkManifest], axis: int) -> ChunkManifes ) concatenated_offsets = np.concatenate([m._offsets for m in manifests], axis=axis) concatenated_lengths = np.concatenate([m._lengths for m in manifests], axis=axis) + + # merge inlined chunk dicts with index shifting along the concat axis + concatenated_inlined: dict[tuple[int, ...], bytes] = {} + grid_offset = 0 + for m in manifests: + for key, data in m._inlined.items(): + shifted = list(key) + shifted[axis] += grid_offset + concatenated_inlined[tuple(shifted)] = data + grid_offset += m._paths.shape[axis] + return ChunkManifest.from_arrays( paths=concatenated_paths, offsets=concatenated_offsets, lengths=concatenated_lengths, validate_paths=False, + inlined=concatenated_inlined if concatenated_inlined else None, ) @@ -230,11 +243,21 @@ def _stack_manifests(manifests: list[ChunkManifest], axis: int) -> ChunkManifest ) stacked_offsets = np.stack([m._offsets for m in manifests], axis=axis) stacked_lengths = np.stack([m._lengths for m in manifests], axis=axis) + + # merge inlined chunk dicts, inserting the new stacked axis + stacked_inlined: dict[tuple[int, ...], bytes] = {} + for i, m in enumerate(manifests): + for key, data in m._inlined.items(): + shifted = list(key) + shifted.insert(axis, i) + stacked_inlined[tuple(shifted)] = data + return ChunkManifest.from_arrays( paths=stacked_paths, offsets=stacked_offsets, lengths=stacked_lengths, validate_paths=False, + inlined=stacked_inlined if stacked_inlined else None, ) @@ -248,11 +271,29 @@ def _broadcast_manifest( ) broadcasted_offsets = 
np.broadcast_to(manifest._offsets, shape=shape) broadcasted_lengths = np.broadcast_to(manifest._lengths, shape=shape) + + # broadcast inlined chunks: prepend singleton dims to each key, then replicate + # the entry across every target position along any axis that was size 1 in the + # source (matching np.broadcast_to semantics for the paths/offsets/lengths arrays). + broadcasted_inlined: dict[tuple[int, ...], bytes] = {} + if manifest._inlined: + n_prepended = len(shape) - manifest._paths.ndim + source_shape_padded = (1,) * n_prepended + manifest._paths.shape + for key, data in manifest._inlined.items(): + padded_key = (0,) * n_prepended + key + axis_ranges = [ + range(shape[i]) if source_shape_padded[i] == 1 else (padded_key[i],) + for i in range(len(shape)) + ] + for target_key in itertools.product(*axis_ranges): + broadcasted_inlined[target_key] = data + return ChunkManifest.from_arrays( paths=broadcasted_paths, offsets=broadcasted_offsets, lengths=broadcasted_lengths, validate_paths=False, + inlined=broadcasted_inlined if broadcasted_inlined else None, ) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 618b13ac..ca3abc5b 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -10,7 +10,7 @@ ValuesView, ) from pathlib import PosixPath -from typing import Any, NewType, TypedDict, cast +from typing import Any, NewType, NotRequired, TypedDict, cast from urllib.parse import urlparse import numpy as np @@ -28,14 +28,27 @@ def _is_uri(path: str) -> bool: return bool(urlparse(path).scheme) +# Sentinel path values used in the paths array to distinguish chunk states. +# Missing chunks have no data anywhere; inlined chunks have data in _inlined dict. 
+MISSING_CHUNK_PATH = "" +INLINED_CHUNK_PATH = "__inlined__" + + class ChunkEntry(TypedDict): path: str offset: int length: int + data: NotRequired[bytes] @classmethod # type: ignore[misc] def with_validation( - cls, *, path: str, offset: int, length: int, fs_root: str | None = None + cls, + *, + path: str, + offset: int, + length: int, + inlined_data: bytes | None = None, + fs_root: str | None = None, ) -> "ChunkEntry": """ Constructor which validates each part of the chunk entry. @@ -45,10 +58,19 @@ def with_validation( fs_root The root of the filesystem on which these references were generated. Required if any (likely kerchunk-generated) paths are relative in order to turn them into absolute paths (which virtualizarr requires). + inlined_data + Raw bytes for inlined (in-memory) chunks. When present, path/offset/length are ignored. """ # note: we can't just use `__init__` or a dataclass' `__post_init__` because we need `fs_root` to be an optional kwarg - if path != "": + if inlined_data is not None: + return ChunkEntry( + path=INLINED_CHUNK_PATH, + offset=0, + length=len(inlined_data), + data=inlined_data, + ) + if path != MISSING_CHUNK_PATH: path = validate_and_normalize_path_to_uri(path, fs_root=fs_root) validate_byte_range(offset=offset, length=length) return ChunkEntry(path=path, offset=offset, length=length) @@ -66,8 +88,8 @@ def validate_and_normalize_path_to_uri(path: str, fs_root: str | None = None) -> The root of the filesystem on which these references were generated. Required if any (likely kerchunk-generated) paths are relative in order to turn them into absolute paths (which virtualizarr requires). 
""" - if path == "": - # (empty paths are allowed through as they represent missing chunks) + if path in (MISSING_CHUNK_PATH, INLINED_CHUNK_PATH): + # sentinel values are allowed through (missing chunks and inlined chunks) return path elif _is_uri(path): return path # path is already in URI form @@ -167,6 +189,7 @@ class ChunkManifest: _paths: np.ndarray[Any, np.dtypes.StringDType] _offsets: np.ndarray[Any, np.dtype[np.uint64]] _lengths: np.ndarray[Any, np.dtype[np.uint64]] + _inlined: dict[tuple[int, ...], bytes] def __init__( self, @@ -190,6 +213,11 @@ def __init__( "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } ``` + + Entries may also include a ``data`` key with raw bytes for inlined (in-memory) chunks:: + + {"0.0": {"path": "", "offset": 0, "length": 4, "data": b"\\x00\\x01\\x02\\x03"}} + separator The chunk key separator, as specified by the array's chunk_key_encoding metadata. Either "." (default/v2 encoding) or "/" (default encoding). @@ -203,7 +231,7 @@ def __init__( if shape is None: shape = get_chunk_grid_shape(entries.keys(), separator) - # Initializing to empty implies that entries with path='' are treated as missing chunks + # Initializing to empty implies that entries with path=MISSING_CHUNK_PATH are treated as missing chunks paths = cast( # `np.empty` apparently is type hinted as if the output could have Any dtype np.ndarray[Any, np.dtypes.StringDType], np.empty(shape=shape, dtype=np.dtypes.StringDType()), @@ -211,26 +239,45 @@ def __init__( offsets = np.empty(shape=shape, dtype=np.dtype("uint64")) lengths = np.empty(shape=shape, dtype=np.dtype("uint64")) + inlined: dict[tuple[int, ...], bytes] = {} + + _virtual_keys = {"path", "offset", "length"} + _inlined_keys = {"path", "offset", "length", "data"} + # populate the arrays for key, entry in entries.items(): - if not isinstance(entry, dict) or len(entry) != 3: + if not isinstance(entry, dict) or set(entry) not in ( + _virtual_keys, + _inlined_keys, + ): msg = ( - "Each 
chunk entry must be of the form dict(path=, offset=, length=), " - f"but got {entry}" + "Each chunk entry must be a dict with keys 'path', 'offset', 'length' " + f"(and optionally 'data'), but got {entry}" ) raise ValueError(msg) - path, offset, length = entry.values() - entry = ChunkEntry.with_validation(path=path, offset=offset, length=length) # type: ignore[attr-defined] - split_key = () if shape == () else parse_manifest_index(key, separator) - paths[split_key] = entry["path"] - offsets[split_key] = entry["offset"] - lengths[split_key] = entry["length"] + + if "data" in entry: + # Inlined chunk: store bytes in the sparse dict + inlined[split_key] = entry["data"] + paths[split_key] = INLINED_CHUNK_PATH + offsets[split_key] = 0 + lengths[split_key] = len(entry["data"]) + else: + entry = ChunkEntry.with_validation( # type: ignore[attr-defined] + path=entry["path"], + offset=entry["offset"], + length=entry["length"], + ) + paths[split_key] = entry["path"] + offsets[split_key] = entry["offset"] + lengths[split_key] = entry["length"] self._paths = paths self._offsets = offsets self._lengths = lengths + self._inlined = inlined @classmethod def from_arrays( @@ -240,6 +287,7 @@ def from_arrays( offsets: np.ndarray[Any, np.dtype[np.uint64]], lengths: np.ndarray[Any, np.dtype[np.uint64]], validate_paths: bool = True, + inlined: dict[tuple[int, ...], bytes] | None = None, ) -> "ChunkManifest": """ Create manifest directly from numpy arrays containing the path and byte range information. @@ -258,6 +306,9 @@ def from_arrays( validate_paths Check that entries in the manifest are valid paths (e.g. that local paths are absolute not relative). Set to False to skip validation for performance reasons. + inlined + Dictionary mapping chunk grid indices to raw bytes for inlined (in-memory) chunks. + Paths at these indices should be ``INLINED_CHUNK_PATH``. 
""" # check types @@ -307,6 +358,7 @@ def from_arrays( obj._paths = paths obj._offsets = offsets obj._lengths = lengths + obj._inlined = inlined if inlined is not None else {} return obj @@ -329,6 +381,9 @@ def shape_chunk_grid(self) -> tuple[int, ...]: return self._paths.shape def __repr__(self) -> str: + n_inlined = len(self._inlined) + if n_inlined: + return f"ChunkManifest" return f"ChunkManifest" @property @@ -339,11 +394,24 @@ def nbytes(self) -> int: Note this is not the size of the referenced chunks if they were actually loaded into memory, this is only the size of the pointers to the chunk locations. If you were to load the data into memory it would be ~1e6x larger for 1MB chunks. + + For inlined chunks, includes the size of the actual chunk data stored in memory. """ - return self._paths.nbytes + self._offsets.nbytes + self._lengths.nbytes + inlined_bytes = sum(len(v) for v in self._inlined.values()) + return ( + self._paths.nbytes + + self._offsets.nbytes + + self._lengths.nbytes + + inlined_bytes + ) def __getitem__(self, key: ChunkKey) -> ChunkEntry: indices = parse_manifest_index(key) + if indices in self._inlined: + data = self._inlined[indices] + return ChunkEntry( + path=INLINED_CHUNK_PATH, offset=0, length=len(data), data=data + ) path = self._paths[indices] offset = self._offsets[indices] length = self._lengths[indices] @@ -382,23 +450,31 @@ def dict(self) -> ChunkDict: # type: ignore[override] } ``` - Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary. + Entries whose path is ``MISSING_CHUNK_PATH`` will be interpreted as missing chunks and omitted from the dictionary. + Entries whose path is ``INLINED_CHUNK_PATH`` have their data stored in memory and are included. 
""" coord_vectors = np.mgrid[ tuple(slice(None, length) for length in self.shape_chunk_grid) ] # TODO consolidate each occurrence of this np.nditer pattern - d = { - join(inds): dict( - path=path.item(), offset=offset.item(), length=length.item() - ) - for *inds, path, offset, length in np.nditer( - [*coord_vectors, self._paths, self._offsets, self._lengths], - flags=("refs_ok",), - ) - if path.item() != "" # don't include entry if path='' (i.e. empty chunk) - } + d = {} + for *inds, path, offset, length in np.nditer( + [*coord_vectors, self._paths, self._offsets, self._lengths], + flags=("refs_ok",), + ): + idx = tuple(int(i) for i in inds) + if idx in self._inlined: + d[join(inds)] = ChunkEntry( + path=INLINED_CHUNK_PATH, + offset=0, + length=len(self._inlined[idx]), + data=self._inlined[idx], + ) + elif path.item() != MISSING_CHUNK_PATH: + d[join(inds)] = dict( + path=path.item(), offset=offset.item(), length=length.item() + ) return cast( ChunkDict, @@ -410,7 +486,8 @@ def __eq__(self, other: Any) -> bool: paths_equal = (self._paths == other._paths).all() offsets_equal = (self._offsets == other._offsets).all() lengths_equal = (self._lengths == other._lengths).all() - return paths_equal and offsets_equal and lengths_equal + inlined_equal = self._inlined == other._inlined + return paths_equal and offsets_equal and lengths_equal and inlined_equal def get_entry(self, indices: tuple[int, ...]) -> ChunkEntry | None: """Look up a chunk entry by grid indices. Returns None for missing chunks (empty path).""" @@ -423,16 +500,22 @@ def get_entry(self, indices: tuple[int, ...]) -> ChunkEntry | None: def elementwise_eq(self, other: "ChunkManifest") -> np.ndarray: """Return boolean array where True means that chunk entry matches.""" - return ( + equal = ( (self._paths == other._paths) & (self._offsets == other._offsets) & (self._lengths == other._lengths) ) + # For inlined chunks, paths/offsets/lengths can agree while bytes differ. 
+ # Force False at any position where the two manifests disagree on inlined data. + for chunk_key in set(self._inlined) | set(other._inlined): + if self._inlined.get(chunk_key) != other._inlined.get(chunk_key): + equal[chunk_key] = False + return equal def iter_nonempty_paths(self) -> Iterator[str]: - """Yield all non-empty paths in the manifest.""" + """Yield all real (non-missing, non-inlined) paths in the manifest.""" for path in self._paths.flat: - if path: + if path and path != INLINED_CHUNK_PATH: yield path def iter_refs(self) -> Iterator[tuple[tuple[int, ...], ChunkEntry]]: @@ -505,6 +588,7 @@ def rename_paths( offsets=self._offsets, lengths=self._lengths, validate_paths=True, + inlined=dict(self._inlined), ) diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 3671037f..349b14d1 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -164,6 +164,20 @@ async def get( ) chunk_indexes = parse_manifest_index(key, separator, expand_pattern=True) + # Check for inlined (in-memory) chunks first + if chunk_indexes in manifest._inlined: + inlined_data = manifest._inlined[chunk_indexes] + if byte_range is not None: + inlined_byte_range = _transform_byte_range( + byte_range, + chunk_start=0, + chunk_end_exclusive=len(inlined_data), + ) + inlined_data = inlined_data[ + inlined_byte_range.start : inlined_byte_range.end + ] + return prototype.buffer.from_bytes(inlined_data) + entry = manifest.get_entry(chunk_indexes) if entry is None: return None diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index c3e3cdfa..ba46e001 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -112,6 +112,34 @@ def test_not_equal_chunk_entries(self, array_v3_metadata): @pytest.mark.skip(reason="Not Implemented") def test_partly_equals(self): ... 
+ def test_equal_inlined_data(self, array_v3_metadata): + metadata = array_v3_metadata(shape=(1,), chunks=(1,)) + chunks = {"0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}} + marr1 = ManifestArray( + metadata=metadata, chunkmanifest=ChunkManifest(entries=chunks) + ) + marr2 = ManifestArray( + metadata=metadata, chunkmanifest=ChunkManifest(entries=chunks) + ) + assert (marr1 == marr2).all() + + def test_not_equal_different_inlined_data(self, array_v3_metadata): + # same paths/offsets/lengths, but different inlined bytes → not equal + metadata = array_v3_metadata(shape=(1,), chunks=(1,)) + marr1 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={"0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}} + ), + ) + marr2 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={"0": {"path": "", "offset": 0, "length": 4, "data": b"bbbb"}} + ), + ) + assert not (marr1 == marr2).all() + def test_equals_nan_fill_value(self, array_v3_metadata): # regression test for https://github.com/zarr-developers/VirtualiZarr/issues/501 chunks_dict = { @@ -274,6 +302,113 @@ def test_broadcast_empty( assert expanded.manifest.dict() == {} +class TestBroadcastInlined: + def test_broadcast_existing_axis(self, array_v3_metadata): + # inlined chunks should be replicated to every position along an expanded axis + metadata = array_v3_metadata(shape=(1, 2), chunks=(1, 1)) + manifest = ChunkManifest( + entries={ + "0.0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}, + "0.1": {"path": "", "offset": 0, "length": 4, "data": b"bbbb"}, + } + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + + expanded = np.broadcast_to(marr, shape=(3, 2)) + assert expanded.shape == (3, 2) + assert expanded.manifest._inlined == { + (0, 0): b"aaaa", + (1, 0): b"aaaa", + (2, 0): b"aaaa", + (0, 1): b"bbbb", + (1, 1): b"bbbb", + (2, 1): b"bbbb", + } + + def test_broadcast_new_axis(self, array_v3_metadata): + # prepending a 
size-1 axis should rewrite inlined keys without replicating bytes + metadata = array_v3_metadata(shape=(2,), chunks=(1,)) + manifest = ChunkManifest( + entries={ + "0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}, + "1": {"path": "", "offset": 0, "length": 4, "data": b"bbbb"}, + } + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + + expanded = np.broadcast_to(marr, shape=(1, 2)) + assert expanded.shape == (1, 2) + assert expanded.manifest._inlined == { + (0, 0): b"aaaa", + (0, 1): b"bbbb", + } + + def test_broadcast_prepended_and_expanded(self, array_v3_metadata): + # prepend a new axis AND expand it; inlined bytes should be replicated along the new axis + metadata = array_v3_metadata(shape=(2,), chunks=(1,)) + manifest = ChunkManifest( + entries={ + "0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}, + "1": {"path": "", "offset": 0, "length": 4, "data": b"bbbb"}, + } + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + + expanded = np.broadcast_to(marr, shape=(3, 2)) + assert expanded.shape == (3, 2) + assert expanded.manifest._inlined == { + (0, 0): b"aaaa", + (1, 0): b"aaaa", + (2, 0): b"aaaa", + (0, 1): b"bbbb", + (1, 1): b"bbbb", + (2, 1): b"bbbb", + } + + def test_broadcast_preserves_bytes_identity(self, array_v3_metadata): + # replicated inlined entries should share the same bytes object, not copies + metadata = array_v3_metadata(shape=(1,), chunks=(1,)) + payload = b"x" * 16 + marr = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={"0": {"path": "", "offset": 0, "length": 16, "data": payload}} + ), + ) + + expanded = np.broadcast_to(marr, shape=(4,)) + assert expanded.manifest._inlined[(0,)] is payload + assert expanded.manifest._inlined[(1,)] is payload + assert expanded.manifest._inlined[(2,)] is payload + assert expanded.manifest._inlined[(3,)] is payload + + def test_broadcast_mixed_inlined_and_virtual(self, array_v3_metadata): + # inlined and virtual chunks in 
the same manifest both replicate along the expanded axis + metadata = array_v3_metadata(shape=(1, 2), chunks=(1, 1)) + manifest = ChunkManifest( + entries={ + "0.0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}, + "0.1": {"path": "file:///foo.nc", "offset": 100, "length": 4}, + } + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + + expanded = np.broadcast_to(marr, shape=(3, 2)) + assert expanded.manifest._inlined == { + (0, 0): b"aaaa", + (1, 0): b"aaaa", + (2, 0): b"aaaa", + } + assert expanded.manifest.dict() == { + "0.0": {"path": "__inlined__", "offset": 0, "length": 4, "data": b"aaaa"}, + "1.0": {"path": "__inlined__", "offset": 0, "length": 4, "data": b"aaaa"}, + "2.0": {"path": "__inlined__", "offset": 0, "length": 4, "data": b"aaaa"}, + "0.1": {"path": "file:///foo.nc", "offset": 100, "length": 4}, + "1.1": {"path": "file:///foo.nc", "offset": 100, "length": 4}, + "2.1": {"path": "file:///foo.nc", "offset": 100, "length": 4}, + } + + # TODO we really need some kind of fixtures to generate useful example data # The hard part is having an alternative way to get to the expected result of concatenation class TestConcat: @@ -333,6 +468,86 @@ def test_concat_empty(self, array_v3_metadata): assert result.metadata.fill_value == metadata.fill_value +class TestConcatInlined: + def test_concat_two_inlined_along_axis_0(self, array_v3_metadata): + metadata = array_v3_metadata(shape=(1, 2), chunks=(1, 1)) + marr1 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={ + "0.0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}, + "0.1": {"path": "", "offset": 0, "length": 4, "data": b"bbbb"}, + } + ), + ) + marr2 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={ + "0.0": {"path": "", "offset": 0, "length": 4, "data": b"cccc"}, + "0.1": {"path": "", "offset": 0, "length": 4, "data": b"dddd"}, + } + ), + ) + + result = np.concatenate([marr1, marr2], axis=0) + assert result.shape == 
(2, 2) + assert result.manifest._inlined == { + (0, 0): b"aaaa", + (0, 1): b"bbbb", + (1, 0): b"cccc", + (1, 1): b"dddd", + } + + def test_concat_mixed_inlined_and_virtual(self, array_v3_metadata): + # one array inlined, the other virtual — concat along axis 1 + metadata = array_v3_metadata(shape=(1, 1), chunks=(1, 1)) + marr_inlined = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={ + "0.0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}, + } + ), + ) + marr_virtual = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={ + "0.0": {"path": "/foo.nc", "offset": 100, "length": 4}, + } + ), + ) + + result = np.concatenate([marr_inlined, marr_virtual], axis=1) + assert result.shape == (1, 2) + # inlined entry stays at column 0; virtual shifted to column 1 + assert result.manifest._inlined == {(0, 0): b"aaaa"} + assert result.manifest.dict() == { + "0.0": {"path": "__inlined__", "offset": 0, "length": 4, "data": b"aaaa"}, + "0.1": {"path": "file:///foo.nc", "offset": 100, "length": 4}, + } + + def test_concat_all_virtual_leaves_inlined_empty(self, array_v3_metadata): + # regression: concat of two virtual-only arrays must not populate _inlined + metadata = array_v3_metadata(shape=(1, 1), chunks=(1, 1)) + marr1 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={"0.0": {"path": "/a.nc", "offset": 0, "length": 4}} + ), + ) + marr2 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={"0.0": {"path": "/b.nc", "offset": 0, "length": 4}} + ), + ) + + result = np.concatenate([marr1, marr2], axis=0) + assert result.manifest._inlined == {} + + class TestStack: def test_stack(self, array_v3_metadata): # both manifest arrays in this example have the same metadata @@ -403,6 +618,94 @@ def test_stack_empty(self, array_v3_metadata): assert result.metadata.fill_value == metadata.fill_value +class TestStackInlined: + def test_stack_two_inlined_along_new_axis(self, 
array_v3_metadata): + metadata = array_v3_metadata(shape=(2,), chunks=(1,)) + marr1 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={ + "0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}, + "1": {"path": "", "offset": 0, "length": 4, "data": b"bbbb"}, + } + ), + ) + marr2 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={ + "0": {"path": "", "offset": 0, "length": 4, "data": b"cccc"}, + "1": {"path": "", "offset": 0, "length": 4, "data": b"dddd"}, + } + ), + ) + + result = np.stack([marr1, marr2], axis=0) + assert result.shape == (2, 2) + assert result.manifest._inlined == { + (0, 0): b"aaaa", + (0, 1): b"bbbb", + (1, 0): b"cccc", + (1, 1): b"dddd", + } + + def test_stack_mixed_inlined_and_virtual_axis_1(self, array_v3_metadata): + # insert the new stack axis at position 1 + metadata = array_v3_metadata(shape=(2,), chunks=(1,)) + marr_inlined = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={ + "0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}, + "1": {"path": "", "offset": 0, "length": 4, "data": b"bbbb"}, + } + ), + ) + marr_virtual = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={ + "0": {"path": "/foo.nc", "offset": 0, "length": 4}, + "1": {"path": "/foo.nc", "offset": 4, "length": 4}, + } + ), + ) + + result = np.stack([marr_inlined, marr_virtual], axis=1) + assert result.shape == (2, 2) + assert result.manifest._inlined == { + (0, 0): b"aaaa", + (1, 0): b"bbbb", + } + assert result.manifest.dict() == { + "0.0": {"path": "__inlined__", "offset": 0, "length": 4, "data": b"aaaa"}, + "1.0": {"path": "__inlined__", "offset": 0, "length": 4, "data": b"bbbb"}, + "0.1": {"path": "file:///foo.nc", "offset": 0, "length": 4}, + "1.1": {"path": "file:///foo.nc", "offset": 4, "length": 4}, + } + + def test_stack_preserves_bytes_identity(self, array_v3_metadata): + # bytes objects should be shared by reference, not copied + metadata 
= array_v3_metadata(shape=(1,), chunks=(1,)) + payload = b"x" * 16 + marr1 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={"0": {"path": "", "offset": 0, "length": 16, "data": payload}} + ), + ) + marr2 = ManifestArray( + metadata=metadata, + chunkmanifest=ChunkManifest( + entries={"0": {"path": "", "offset": 0, "length": 16, "data": payload}} + ), + ) + + result = np.stack([marr1, marr2], axis=0) + assert result.manifest._inlined[(0, 0)] is payload + assert result.manifest._inlined[(1, 0)] is payload + + def test_refuse_combine(array_v3_metadata): # TODO test refusing to concatenate arrays that have conflicting shapes / chunk sizes chunks = (5, 1, 10) @@ -655,3 +958,16 @@ def test_to_xarray(array_v3_metadata): assert vv.data.metadata.dimension_names is None assert vv.attrs == {"ham": "sandwich"} assert vv.data.metadata.attributes == {} + + +def test_to_virtual_variable_preserves_inlined(array_v3_metadata): + metadata = array_v3_metadata(shape=(2,), chunks=(1,), dimension_names=["x"]) + manifest = ChunkManifest( + entries={ + "0": {"path": "", "offset": 0, "length": 4, "data": b"aaaa"}, + "1": {"path": "", "offset": 0, "length": 4, "data": b"bbbb"}, + } + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + vv = marr.to_virtual_variable() + assert vv.data.manifest._inlined == {(0,): b"aaaa", (1,): b"bbbb"} diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 2c179d54..12ed222b 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -145,7 +147,19 @@ def test_invalid_chunk_entries(self): chunks = { "0.0.0": {"path": "s3://bucket/foo.nc"}, } - with pytest.raises(ValueError, match="must be of the form"): + with pytest.raises(ValueError, match="must be a dict with keys"): + ChunkManifest(entries=chunks) + + def 
test_invalid_chunk_entry_extra_keys(self): + chunks = { + "0.0.0": { + "path": "s3://bucket/foo.nc", + "offset": 0, + "length": 4, + "garbage": "x", + }, + } + with pytest.raises(ValueError, match="must be a dict with keys"): ChunkManifest(entries=chunks) def test_invalid_chunk_keys(self): @@ -326,3 +340,219 @@ def test_catch_malformed_paths(self): with pytest.raises(ValueError): # list is an invalid arg type manifest.rename_paths("./foo.nc") + + +class TestInlinedChunks: + """Tests for inlined (in-memory) chunk support in ChunkManifest.""" + + def test_create_manifest_with_inlined_chunks(self): + chunks = { + "0.0": { + "path": "", + "offset": 0, + "length": 4, + "data": b"\x00\x01\x02\x03", + }, + } + manifest = ChunkManifest(entries=chunks) + assert len(manifest._inlined) == 1 + assert (0, 0) in manifest._inlined + assert manifest._inlined[(0, 0)] == b"\x00\x01\x02\x03" + + def test_mixed_virtual_and_inlined(self): + chunks = { + "0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.1": { + "path": "", + "offset": 0, + "length": 3, + "data": b"\x01\x02\x03", + }, + } + manifest = ChunkManifest(entries=chunks) + assert len(manifest._inlined) == 1 + assert manifest._paths[(0, 0)] == "s3://bucket/foo.nc" + assert manifest._paths[(0, 1)] == "__inlined__" + + def test_dict_includes_inlined_data(self): + chunks = { + "0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.1": { + "path": "", + "offset": 0, + "length": 4, + "data": b"\x00\x01\x02\x03", + }, + } + manifest = ChunkManifest(entries=chunks) + d = manifest.dict() + assert len(d) == 2 + assert d["0.0"] == { + "path": "s3://bucket/foo.nc", + "offset": 100, + "length": 100, + } + assert d["0.1"]["data"] == b"\x00\x01\x02\x03" + + def test_getitem_inlined_chunk(self): + chunks = { + "0.0": { + "path": "", + "offset": 0, + "length": 4, + "data": b"\x00\x01\x02\x03", + }, + } + manifest = ChunkManifest(entries=chunks) + entry = manifest["0.0"] + assert "data" in entry + 
assert entry["data"] == b"\x00\x01\x02\x03"
+        assert entry["path"] == "__inlined__"
+
+    def test_nbytes_includes_inlined_data(self):
+        chunks = {
+            "0.0": {
+                "path": "",
+                "offset": 0,
+                "length": 4,
+                "data": b"\x00\x01\x02\x03",
+            },
+        }
+        manifest = ChunkManifest(entries=chunks)
+        # nbytes should include both the numpy arrays and the inlined data
+        assert manifest.nbytes > 4
+
+    def test_equals_with_inlined(self):
+        chunks = {
+            "0": {
+                "path": "",
+                "offset": 0,
+                "length": 3,
+                "data": b"\x01\x02\x03",
+            },
+        }
+        m1 = ChunkManifest(entries=chunks)
+        m2 = ChunkManifest(entries=chunks)
+        assert m1 == m2
+
+    def test_not_equals_different_inlined_data(self):
+        m1 = ChunkManifest(
+            entries={
+                "0": {
+                    "path": "",
+                    "offset": 0,
+                    "length": 3,
+                    "data": b"\x01\x02\x03",
+                },
+            }
+        )
+        m2 = ChunkManifest(
+            entries={
+                "0": {
+                    "path": "",
+                    "offset": 0,
+                    "length": 3,
+                    "data": b"\x04\x05\x06",
+                },
+            }
+        )
+        assert m1 != m2
+
+    def test_from_arrays_with_inlined(self):
+        paths = np.asarray(["s3://bucket/foo.nc", ""], dtype=np.dtypes.StringDType())
+        offsets = np.asarray([100, 0], dtype=np.uint64)
+        lengths = np.asarray([100, 4], dtype=np.uint64)
+        inlined = {(1,): b"\x00\x01\x02\x03"}
+        manifest = ChunkManifest.from_arrays(
+            paths=paths,
+            offsets=offsets,
+            lengths=lengths,
+            validate_paths=False,
+            inlined=inlined,
+        )
+        assert manifest._inlined == inlined
+        entry = manifest["1"]
+        assert "data" in entry
+        assert entry["data"] == b"\x00\x01\x02\x03"
+
+    def test_virtual_only_has_empty_inlined(self):
+        chunks = {
+            "0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
+        }
+        manifest = ChunkManifest(entries=chunks)
+        assert manifest._inlined == {}
+
+    def test_repr_with_inlined(self):
+        chunks = {
+            "0": {
+                "path": "",
+                "offset": 0,
+                "length": 3,
+                "data": b"\x01\x02\x03",
+            },
+        }
+        manifest = ChunkManifest(entries=chunks)
+        assert "inlined_chunks=1" in repr(manifest)
+
+    def test_repr_without_inlined(self):
+        chunks = {
+            "0": {"path": 
"s3://bucket/foo.nc", "offset": 100, "length": 100}, + } + manifest = ChunkManifest(entries=chunks) + assert "inlined_chunks" not in repr(manifest) + + def test_rename_paths_preserves_inlined(self): + chunks = { + "0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "1": { + "path": "", + "offset": 0, + "length": 3, + "data": b"\x01\x02\x03", + }, + } + manifest = ChunkManifest(entries=chunks) + renamed = manifest.rename_paths("s3://bucket/bar.nc") + assert renamed._inlined == {(1,): b"\x01\x02\x03"} + assert renamed.dict()["0"]["path"] == "s3://bucket/bar.nc" + + def test_pickle_roundtrip(self): + chunks = { + "0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + "0.1": { + "path": "", + "offset": 0, + "length": 4, + "data": b"\x00\x01\x02\x03", + }, + } + manifest = ChunkManifest(entries=chunks) + pickled = pickle.dumps(manifest) + restored = pickle.loads(pickled) + assert manifest == restored + assert restored._inlined == {(0, 1): b"\x00\x01\x02\x03"} + + def test_scalar_inlined_chunk(self): + """Scalar arrays use key 'c' which maps to empty tuple ().""" + chunks = { + "c": { + "path": "", + "offset": 0, + "length": 8, + "data": b"\x00" * 8, + }, + } + manifest = ChunkManifest(entries=chunks, shape=()) + assert () in manifest._inlined + assert manifest._inlined[()] == b"\x00" * 8 + entry = manifest["c"] + assert "data" in entry + assert entry["data"] == b"\x00" * 8 + + def test_chunk_entry_with_validation_inlined(self): + entry = ChunkEntry.with_validation( + path="", offset=0, length=0, inlined_data=b"\x01\x02\x03" + ) + assert entry["data"] == b"\x01\x02\x03" + assert entry["path"] == "__inlined__" + assert entry["length"] == 3 diff --git a/virtualizarr/tests/test_manifests/test_store.py b/virtualizarr/tests/test_manifests/test_store.py index 125e9279..c7167c1e 100644 --- a/virtualizarr/tests/test_manifests/test_store.py +++ b/virtualizarr/tests/test_manifests/test_store.py @@ -207,6 +207,39 @@ def s3_store(minio_bucket): ) 
+@pytest.fixture() +def local_store_with_inlined(tmpdir): + """ManifestStore with one array holding three virtual chunks + one inlined chunk at 0.0.""" + store = obs.store.LocalStore() + filepath = f"{tmpdir}/data.tmp" + prefix = "file://" + obs.put( + store, + filepath, + b"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12", + ) + chunk_dict = { + "0.0": {"path": "", "offset": 0, "length": 4, "data": b"\xaa\xbb\xcc\xdd"}, + "0.1": {"path": f"{prefix}/{filepath}", "offset": 0, "length": 4}, + "1.0": {"path": f"{prefix}/{filepath}", "offset": 4, "length": 4}, + "1.1": {"path": f"{prefix}/{filepath}", "offset": 8, "length": 4}, + } + manifest = ChunkManifest(entries=chunk_dict) + codecs = [{"configuration": {"endian": "little"}, "name": "bytes"}] + metadata = create_v3_array_metadata( + shape=(4, 4), + chunk_shape=(2, 2), + data_type=np.dtype("int32"), + codecs=codecs, + chunk_key_encoding={"name": "default", "separator": "."}, + fill_value=0, + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + group = ManifestGroup(arrays={"foo": marr}) + registry = ObjectStoreRegistry({prefix: store}) + return ManifestStore(registry=registry, group=group) + + @pytest.fixture() def empty_memory_store(): import obstore as obs @@ -357,6 +390,50 @@ async def test_list_dir(self, manifest_store, request) -> None: observed = await _collect_aiterator(store.list_dir("subgroup/foo/")) assert observed == ("zarr.json", "c.0.0", "c.0.1", "c.1.0", "c.1.1") + @pytest.mark.asyncio + async def test_get_inlined_chunk(self, local_store_with_inlined): + observed = await local_store_with_inlined.get( + "foo/c.0.0", prototype=default_buffer_prototype() + ) + assert observed.to_bytes() == b"\xaa\xbb\xcc\xdd" + + @pytest.mark.asyncio + async def test_get_inlined_chunk_byte_ranges(self, local_store_with_inlined): + observed = await local_store_with_inlined.get( + "foo/c.0.0", + prototype=default_buffer_prototype(), + byte_range=RangeByteRequest(start=1, end=3), + ) + assert 
observed.to_bytes() == b"\xbb\xcc" + observed = await local_store_with_inlined.get( + "foo/c.0.0", + prototype=default_buffer_prototype(), + byte_range=OffsetByteRequest(offset=1), + ) + assert observed.to_bytes() == b"\xbb\xcc\xdd" + observed = await local_store_with_inlined.get( + "foo/c.0.0", + prototype=default_buffer_prototype(), + byte_range=SuffixByteRequest(suffix=2), + ) + assert observed.to_bytes() == b"\xcc\xdd" + + @pytest.mark.asyncio + async def test_get_mixed_inlined_and_virtual(self, local_store_with_inlined): + inlined = await local_store_with_inlined.get( + "foo/c.0.0", prototype=default_buffer_prototype() + ) + assert inlined.to_bytes() == b"\xaa\xbb\xcc\xdd" + virtual = await local_store_with_inlined.get( + "foo/c.0.1", prototype=default_buffer_prototype() + ) + assert virtual.to_bytes() == b"\x01\x02\x03\x04" + + @pytest.mark.asyncio + async def test_list_dir_includes_inlined_chunks(self, local_store_with_inlined): + observed = await _collect_aiterator(local_store_with_inlined.list_dir("foo/")) + assert observed == ("zarr.json", "c.0.0", "c.0.1", "c.1.0", "c.1.1") + @pytest.mark.asyncio async def test_store_raises(self, local_store) -> None: with pytest.raises(NotImplementedError):