Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/releases.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
- `ChunkManifest` can now hold inlined chunks — raw chunk bytes carried directly in memory rather than as references to external files. Intended for parser authors (e.g., loading Kerchunk references with inlined data); not exposed via `loadable_variables`.
([#938](https://github.com/zarr-developers/VirtualiZarr/pull/938)).
By [Max Jones](https://github.com/maxrjones) and [Tom Nicholas](https://github.com/TomNicholas).
- `KerchunkJSONParser` and `KerchunkParquetParser` now parse inline chunk data (both raw-string and `base64:`-prefixed forms) into inlined `ChunkManifest` entries, instead of raising `NotImplementedError`. Fixes the read side of [#489](https://github.com/zarr-developers/VirtualiZarr/issues/489); writer support for inlined chunks will follow in a separate PR.
([#979](https://github.com/zarr-developers/VirtualiZarr/pull/979)).
By [Tom Nicholas](https://github.com/TomNicholas).

### Breaking changes

Expand Down
19 changes: 13 additions & 6 deletions virtualizarr/parsers/kerchunk/translator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import base64
import math
from collections.abc import Iterable
from typing import cast
Expand Down Expand Up @@ -215,14 +216,20 @@ def manifest_from_kerchunk_chunk_dict(
chunk_entries: dict[ChunkKey, ChunkEntry] = {}
for k, v in kerchunk_chunk_dict.items():
if isinstance(v, (str, bytes)):
raise NotImplementedError(
"Reading inlined reference data is currently not supported."
"See https://github.com/zarr-developers/VirtualiZarr/issues/489",
# Kerchunk inline form: a raw string (interpreted as bytes) or, when
# prefixed with "base64:", a base64-encoded payload.
raw = v.encode() if isinstance(v, str) else v
if raw.startswith(b"base64:"):
data = base64.b64decode(raw[len(b"base64:") :])
else:
data = raw
chunk_entries[k] = ChunkEntry(
path="", offset=0, length=len(data), data=data
)
elif not isinstance(v, (tuple, list)):
elif isinstance(v, (tuple, list)):
chunk_entries[k] = chunkentry_from_kerchunk(v, fs_root=fs_root)
else:
raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")

chunk_entries[k] = chunkentry_from_kerchunk(v, fs_root=fs_root)
return ChunkManifest(entries=chunk_entries, shape=shape)


Expand Down
140 changes: 106 additions & 34 deletions virtualizarr/tests/test_parsers/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,55 +374,127 @@ def test_open_virtual_dataset_existing_kerchunk_refs(


@requires_kerchunk
def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref, local_registry):
# For now, we raise a NotImplementedError if we read existing references that have inlined data
# https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932

ref_filepath = tmp_path / "ref.json"
def _refs_with_one_inlined_one_virtual(inline_repr: str) -> dict:
"""Kerchunk refs describing a (2, 4) int32 array with two (1, 4) chunks:
position (0, 0) is inlined (carrying ``inline_repr``), position (1, 0) is virtual."""
return {
"version": 1,
"refs": {
".zgroup": '{"zarr_format":2}',
"a/.zarray": (
'{"chunks":[1,4],"compressor":null,"dtype":"<i4",'
'"fill_value":null,"filters":null,"order":"C",'
'"shape":[2,4],"zarr_format":2}'
),
"a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
"a/0.0": inline_repr,
"a/1.0": ["/test.nc", 6144, 16],
},
}

import ujson

with open(ref_filepath, "w") as json_file:
ujson.dump(netcdf4_inlined_ref, json_file)
@pytest.mark.parametrize(
    "inline_repr, expected_bytes",
    [
        ("base64:AQIDBAUGBwg=", b"\x01\x02\x03\x04\x05\x06\x07\x08"),
        ("hello, w!", b"hello, w!"),
    ],
    ids=["base64", "raw_string"],
)
def test_parse_inline_refs_json(inline_repr, expected_bytes):
    # Parse refs with one inlined and one virtual chunk; the inlined chunk
    # must land as an in-memory manifest entry while the virtual chunk stays
    # an ordinary file reference.
    refs = _refs_with_one_inlined_one_virtual(inline_repr)

    store = obstore.store.MemoryStore()
    store.put("refs.json", ujson.dumps(refs).encode())

    manifeststore = KerchunkJSONParser()(
        "memory:///refs.json",
        registry=ObjectStoreRegistry({"memory://": store}),
    )

    manifest = manifeststore._group._members["a"].manifest
    assert manifest._inlined == {(0, 0): expected_bytes}

    expected_entries = {
        "0.0": {
            "path": "__inlined__",
            "offset": 0,
            "length": len(expected_bytes),
            "data": expected_bytes,
        },
        "1.0": {"path": "file:///test.nc", "offset": 6144, "length": 16},
    }
    assert manifest.dict() == expected_entries


@requires_kerchunk
@pytest.mark.skipif(not has_fastparquet, reason="fastparquet not installed")
def test_notimplemented_read_inline_refs_parquet(
tmp_path, netcdf4_inlined_ref, local_registry
):
# Test that parquet references with inlined data raise NotImplementedError
# https://github.com/zarr-developers/VirtualiZarr/issues/489
def test_parse_inline_refs_parquet(tmp_path, local_registry):
    # Same scenario as the JSON case, but routed through kerchunk's parquet
    # reference format: the base64-inlined chunk must survive the parquet
    # round-trip and come back as an inlined manifest entry.
    import pandas as pd
    from kerchunk.df import refs_to_dataframe

    expected_bytes = b"\x01\x02\x03\x04\x05\x06\x07\x08"
    refs = _refs_with_one_inlined_one_virtual("base64:AQIDBAUGBwg=")

    ref_filepath = tmp_path / "ref.parquet"
    # NOTE(review): presumably disables pandas' string-dtype inference so
    # refs_to_dataframe writes the expected column types — confirm.
    with pd.option_context("future.infer_string", False):
        refs_to_dataframe(fo=refs, url=ref_filepath.as_posix())

    manifeststore = KerchunkParquetParser()(
        ref_filepath.as_posix(), registry=local_registry
    )

    manifest = manifeststore._group._members["a"].manifest
    assert manifest._inlined == {(0, 0): expected_bytes}

    expected_entries = {
        "0.0": {
            "path": "__inlined__",
            "offset": 0,
            "length": len(expected_bytes),
            "data": expected_bytes,
        },
        "1.0": {"path": "file:///test.nc", "offset": 6144, "length": 16},
    }
    assert manifest.dict() == expected_entries


@pytest.mark.asyncio
async def test_read_inline_and_virtual_refs_end_to_end():
    # Parse refs containing one inlined chunk and one virtual chunk, then
    # round-trip both through ManifestStore.get: the inlined chunk must come
    # back from memory and the virtual chunk must be fetched from the target
    # file via the object store registry.
    from zarr.core.buffer import default_buffer_prototype

    inlined_bytes = b"\x01\x02\x03\x04\x05\x06\x07\x08"
    virtual_bytes = b"\xaa\xbb\xcc\xdd\xee\xff\x11\x22"

    zarray_metadata = (
        '{"chunks":[1,4],"compressor":null,"dtype":"<i4",'
        '"fill_value":null,"filters":null,"order":"C",'
        '"shape":[2,4],"zarr_format":2}'
    )
    refs = {
        "version": 1,
        "refs": {
            ".zgroup": '{"zarr_format":2}',
            "a/.zarray": zarray_metadata,
            "a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
            "a/0.0": "base64:AQIDBAUGBwg=",
            "a/1.0": ["memory:///test.nc", 0, len(virtual_bytes)],
        },
    }

    store = obstore.store.MemoryStore()
    store.put("refs.json", ujson.dumps(refs).encode())
    store.put("test.nc", virtual_bytes)

    manifeststore = KerchunkJSONParser()(
        "memory:///refs.json",
        registry=ObjectStoreRegistry({"memory://": store}),
    )

    prototype = default_buffer_prototype()

    inlined_result = await manifeststore.get("a/c/0/0", prototype=prototype)
    assert inlined_result.to_bytes() == inlined_bytes

    virtual_result = await manifeststore.get("a/c/1/0", prototype=prototype)
    assert virtual_result.to_bytes() == virtual_bytes


@pytest.mark.parametrize("skip_variables", ["a", ["a"]])
Expand Down
Loading