Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
54e9b48
feat: generalize ChunkManifest to hold native chunks
maxrjones Mar 20, 2026
1f1ead8
Rename native to inlined
maxrjones Mar 20, 2026
13adb46
Move docs to explanation
maxrjones Mar 20, 2026
8516604
Rename data to inlined_data
maxrjones Mar 20, 2026
04a420f
Better sentinel values
maxrjones Mar 20, 2026
e4ebc28
Improve required entry validation
maxrjones Mar 20, 2026
8fc3de3
Add scalar test
maxrjones Mar 20, 2026
9223350
Revert changes that should be a separate PR
maxrjones Mar 20, 2026
6e37005
Merge branch 'main' into store-native-chunks
maxrjones Apr 20, 2026
c97cf39
Fix mypy: avoid narrowing StringDType on np.where reassignment
TomNicholas Apr 22, 2026
d7b0abd
Revert icechunk writer changes; handle inlined chunks in a follow-up PR
TomNicholas Apr 23, 2026
e75c7f7
Move inlined chunks docs into data_structures.md
TomNicholas Apr 23, 2026
83b5c78
Add failing tests for broadcasting manifests with inlined chunks
TomNicholas Apr 23, 2026
0ba0f20
Replicate inlined chunks across expanded axes in broadcast_to
TomNicholas Apr 23, 2026
7e2506d
Add tests for concat and stack with inlined chunks
TomNicholas Apr 23, 2026
90aeeee
Add bytes-identity test for broadcasting inlined chunks
TomNicholas Apr 23, 2026
8e9f8af
Add failing test for ManifestArray equality with differing inlined bytes
TomNicholas Apr 23, 2026
96d8f17
Compare inlined bytes in ChunkManifest.elementwise_eq
TomNicholas Apr 23, 2026
c8475e2
Add ManifestStore read tests for inlined chunks
TomNicholas Apr 23, 2026
06345da
Smoke test that to_virtual_variable preserves inlined chunks
TomNicholas Apr 23, 2026
5122636
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2026
f328b11
Merge branch 'main' into store-native-chunks
maxrjones Apr 23, 2026
a51602c
Merge branch 'main' into store-native-chunks
TomNicholas Apr 24, 2026
b3cdfb7
Reject ChunkManifest entries with extra keys
TomNicholas Apr 24, 2026
746c779
Document the three chunk states (virtual, missing, inlined) in a table
TomNicholas Apr 24, 2026
de587a9
Add release note for inlined chunks support
TomNicholas Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import h5py # type: ignore[import]
import numpy as np
import pytest
import ujson
import xarray as xr
import zarr
from obspec_utils.registry import ObjectStoreRegistry
Expand Down Expand Up @@ -312,11 +313,16 @@ def netcdf4_virtual_dataset(netcdf4_file):


@pytest.fixture
def netcdf4_inlined_ref(tmp_path, netcdf4_file):
    """Write a kerchunk reference file for a NetCDF4 file and return its URL.

    Uses kerchunk's ``SingleHdf5ToZarr`` with ``inline_threshold=1000`` so
    that small chunks are embedded in the JSON as base64-encoded strings
    rather than byte-range references.

    Parameters
    ----------
    tmp_path
        pytest-provided temporary directory for the generated JSON file.
    netcdf4_file
        Path to the source NetCDF4 file (sibling fixture).

    Returns
    -------
    str
        A ``file://`` URL pointing at the generated ``ref.json``.
    """
    # Local import keeps kerchunk an optional test-time dependency.
    from kerchunk.hdf import SingleHdf5ToZarr

    refs = SingleHdf5ToZarr(netcdf4_file, inline_threshold=1000).translate()

    ref_filepath = tmp_path / "ref.json"
    with open(ref_filepath, "w") as json_file:
        ujson.dump(refs, json_file)

    return f"file://{ref_filepath}"


# HDF5 file fixtures
Expand Down
130 changes: 130 additions & 0 deletions docs/inlined_references.md
Comment thread
TomNicholas marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# Loading inlined Kerchunk references

Kerchunk reference files can contain two kinds of chunk references:

- **Virtual references** point to byte ranges in external files (e.g., `["s3://bucket/data.nc", 1024, 512]`)
- **Inlined references** embed the raw chunk data directly in the JSON as base64-encoded strings (e.g., `"base64:AAAB..."`)

Inlined references are common for small variables like coordinate arrays, dimension labels, and scalar metadata. Kerchunk inlines any chunk whose encoded size falls below a configurable `inline_threshold`.

VirtualiZarr can read both kinds of references. Inlined data is stored as **native chunks** directly in the [ChunkManifest][virtualizarr.manifests.ChunkManifest], so it travels with the manifest through concatenation, serialization, and pickling without needing access to any external file.

## Roundtrip example

This example demonstrates that the full pipeline — from NetCDF, to kerchunk JSON (with inlined coordinates), and back to an xarray Dataset — produces results identical to loading the NetCDF file directly.

### 1. Create a sample NetCDF file

```python
import tempfile, os, json
import numpy as np
import xarray as xr

tmpdir = tempfile.mkdtemp()
nc_path = os.path.join(tmpdir, "example.nc")

ds = xr.Dataset(
{"temperature": xr.DataArray(
np.arange(12, dtype="float32").reshape(3, 4),
dims=["time", "x"],
)},
coords={
"time": np.array([0, 1, 2], dtype="int64"),
"x": np.array([10, 20, 30, 40], dtype="int64"),
},
)
ds.to_netcdf(nc_path, format="NETCDF4")
```

### 2. Virtualize and write to kerchunk JSON

Use the HDF parser to read the NetCDF file. Specify `loadable_variables` for the
coordinate arrays so they are loaded into memory as numpy arrays. When serialized
to kerchunk format, these loaded variables are automatically base64-encoded as
inlined references.

```python
from virtualizarr import open_virtual_dataset
from virtualizarr.parsers import HDFParser
from obspec_utils.registry import ObjectStoreRegistry
from obstore.store import LocalStore

store = LocalStore(prefix="/")
registry = ObjectStoreRegistry({"file://": store})

with open_virtual_dataset(
url=f"file://{nc_path}",
registry=registry,
parser=HDFParser(),
loadable_variables=["time", "x"],
) as vds:
refs = vds.vz.to_kerchunk(format="dict")

# Write to disk
ref_path = os.path.join(tmpdir, "refs.json")
with open(ref_path, "w") as f:
json.dump(refs, f)
```

The resulting JSON has a mix of virtual and inlined references:

```python
for key, value in refs["refs"].items():
if isinstance(value, str) and value.startswith("base64:"):
print(f" Inlined: {key}")
elif isinstance(value, list):
print(f" Virtual: {key} -> {value[0]}")
```

```
Inlined: time/0
Inlined: x/0
Virtual: temperature/0.0 -> /tmp/.../example.nc
```

### 3. Load the kerchunk JSON back

Use the `KerchunkJSONParser` to read the reference file. Inlined data is decoded
from base64 and stored as native chunks in the manifest.

```python
from virtualizarr.parsers import KerchunkJSONParser

parser = KerchunkJSONParser()
manifest_store = parser(url=f"file://{ref_path}", registry=registry)
```

Open the manifest store as an xarray Dataset via the Zarr engine:

```python
loaded = xr.open_dataset(
manifest_store, engine="zarr", consolidated=False, zarr_format=3
).load()
```

### 4. Verify the roundtrip

```python
direct = xr.open_dataset(nc_path).load()
xr.testing.assert_identical(direct, loaded)
```

The two datasets are identical: coordinate values, data values, attributes, and dtypes all match.

## How it works

When the kerchunk parser encounters a base64-encoded inlined reference, it decodes the bytes and stores them as a **native chunk** on the `ChunkManifest`. Native chunks are held in a sparse dictionary keyed by chunk grid index:

```python
# After parsing, the manifest for 'time' has one native chunk:
time_manifest = manifest_store._group.arrays["time"].manifest
print(time_manifest._native)
# {(0,): b'\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00...'}
```

Native chunks participate in all manifest operations:

- **Concatenation and stacking**: indices are shifted to their new positions
- **Serialization**: included when writing back to kerchunk (re-encoded as base64) or Icechunk (written as real data)
- **Pickling**: travel with the manifest for distributed workflows (Dask, multiprocessing)
- **ManifestStore reads**: returned directly from memory without any network or disk I/O
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ nav:
- "installation.md"
- How-to guides:
- "usage.md"
- "inlined_references.md"
- "examples.md"
- Explanation:
- "data_structures.md"
Expand Down
31 changes: 31 additions & 0 deletions virtualizarr/manifests/array_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,23 @@ def concatenate(
[arr.manifest._lengths for arr in arrays],
axis=axis,
)

# merge native chunk dicts with index shifting along the concat axis
concatenated_native: dict[tuple[int, ...], bytes] = {}
grid_offset = 0
for arr in arrays:
for key, data in arr.manifest._native.items():
shifted = list(key)
shifted[axis] += grid_offset
concatenated_native[tuple(shifted)] = data
grid_offset += arr.manifest._paths.shape[axis]

concatenated_manifest = ChunkManifest.from_arrays(
paths=concatenated_paths,
offsets=concatenated_offsets,
lengths=concatenated_lengths,
validate_paths=False,
native=concatenated_native if concatenated_native else None,
)

new_metadata = copy_and_replace_metadata(
Expand Down Expand Up @@ -180,11 +192,21 @@ def stack(
[arr.manifest._lengths for arr in arrays],
axis=axis,
)

# merge native chunk dicts, inserting the new stacked axis
stacked_native: dict[tuple[int, ...], bytes] = {}
for i, arr in enumerate(arrays):
for key, data in arr.manifest._native.items():
shifted = list(key)
shifted.insert(axis, i)
stacked_native[tuple(shifted)] = data

stacked_manifest = ChunkManifest.from_arrays(
paths=stacked_paths,
offsets=stacked_offsets,
lengths=stacked_lengths,
validate_paths=False,
native=stacked_native if stacked_native else None,
)

# chunk shape has changed because a length-1 axis has been inserted
Expand Down Expand Up @@ -251,11 +273,20 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra
x.manifest._lengths,
shape=new_chunk_grid_shape,
)
# broadcast native chunks by prepending singleton dimensions to their keys
broadcasted_native: dict[tuple[int, ...], bytes] = {}
if x.manifest._native:
n_prepended = len(new_shape) - x.ndim
for key, data in x.manifest._native.items():
new_key = (0,) * n_prepended + key
broadcasted_native[new_key] = data

broadcasted_manifest = ChunkManifest.from_arrays(
paths=broadcasted_paths,
offsets=broadcasted_offsets,
lengths=broadcasted_lengths,
validate_paths=False,
native=broadcasted_native if broadcasted_native else None,
)

new_metadata = copy_and_replace_metadata(
Expand Down
Loading
Loading