Skip to content

Commit 5ffd01f

Browse files
TomNicholas and claude
committed
Support writing inlined ChunkManifest entries to icechunk
Icechunk's set_virtual_refs_arr rejects the INLINED_CHUNK_PATH sentinel ('__inlined__') as a malformed URL. write_manifest_to_icechunk now writes inlined chunks first as native chunks via store.set, then rewrites those positions to empty strings in the paths array before calling set_virtual_refs_arr with the cleaned view. A cheap numpy-level check skips the virtual-refs call entirely for all-inlined or all-missing manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3d7aedb commit 5ffd01f

2 files changed

Lines changed: 82 additions & 16 deletions

File tree

docs/releases.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
- The kerchunk writer now serializes inlined `ChunkManifest` entries as kerchunk's `base64:`-prefixed inline form, rather than emitting broken `["__inlined__", 0, length]` triples. Together with the read-side support added in #979, this means a virtual dataset with inlined chunks can be round-tripped through both `to_kerchunk(format="json"/"parquet")` and the corresponding `KerchunkJSONParser`/`KerchunkParquetParser`.
88
By [Tom Nicholas](https://github.com/TomNicholas).
9+
- The icechunk writer now handles `ChunkManifest` entries containing inlined chunk data. For arrays with no inlined chunks the existing fast bulk `set_virtual_refs_arr` path is unchanged; otherwise inlined positions are sent to icechunk as empty (missing) virtual refs and the inlined bytes are written separately as managed chunks. A virtual dataset with inlined chunks can now be `to_icechunk`'d and re-opened via `xr.open_zarr` without data loss.
10+
By [Tom Nicholas](https://github.com/TomNicholas).
911

1012
- `ChunkManifest` can now hold inlined chunks — raw chunk bytes carried directly in memory rather than as references to external files. Intended for parser authors (e.g., loading Kerchunk references with inlined data); not exposed via `loadable_variables`.
1113
([#938](https://github.com/zarr-developers/VirtualiZarr/pull/938)).

virtualizarr/writers/icechunk.py

Lines changed: 80 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
1+
import asyncio
12
from collections.abc import Mapping
23
from datetime import datetime, timedelta, timezone
3-
from typing import TYPE_CHECKING, Iterable, List, Literal, Optional, Union, cast
4+
from typing import TYPE_CHECKING, Any, Iterable, List, Literal, Optional, Union, cast
45

6+
import numpy as np
57
import xarray as xr
68
from xarray.backends.zarr import ZarrStore as XarrayZarrStore
79
from xarray.backends.zarr import encode_zarr_attr_value
810
from zarr import Array, Group
11+
from zarr.core.buffer import default_buffer_prototype
12+
from zarr.core.sync import sync
913

1014
from virtualizarr.codecs import extract_codecs, get_codecs
1115
from virtualizarr.manifests import ChunkManifest, ManifestArray
16+
from virtualizarr.manifests.manifest import INLINED_CHUNK_PATH
1217
from virtualizarr.manifests.utils import (
1318
check_compatible_encodings,
1419
check_same_chunk_shapes,
@@ -526,7 +531,7 @@ def write_virtual_variable_to_icechunk(
526531

527532
update_attributes(arr, var.attrs, encoding=var.encoding)
528533

529-
write_manifest_virtual_refs(
534+
write_manifest_to_icechunk(
530535
store=store,
531536
group=group,
532537
arr_name=name,
@@ -536,15 +541,20 @@ def write_virtual_variable_to_icechunk(
536541
)
537542

538543

539-
def write_manifest_virtual_refs(
544+
def write_manifest_to_icechunk(
540545
store: "IcechunkStore",
541546
group: "Group",
542547
arr_name: str,
543548
manifest: ChunkManifest,
544549
chunk_index_offsets: tuple[int, ...],
545550
last_updated_at: Optional[datetime] = None,
546551
) -> None:
547-
"""Write all the virtual references for one array manifest at once."""
552+
"""
553+
Write all the chunks (virtual and/or inlined) for one array manifest at once.
554+
555+
Virtual chunks are written as virtual chunks, and inlined chunks are written as native
556+
(which Icechunk may then choose to inline in its manifests).
557+
"""
548558

549559
if group.name == "/":
550560
key_prefix = arr_name
@@ -559,15 +569,69 @@ def write_manifest_virtual_refs(
559569
# In practice this should only really come up in synthetic examples, e.g. tests and docs.
560570
last_updated_at = datetime.now(timezone.utc) + timedelta(seconds=1)
561571

562-
# Pass manifest arrays directly to Rust, avoiding per-chunk Python object creation.
563-
# Empty paths represent missing chunks and are skipped on the Rust side.
564-
store.set_virtual_refs_arr(
565-
array_path=key_prefix,
566-
chunk_grid_shape=manifest.shape_chunk_grid,
567-
locations=manifest._paths.flatten().tolist(),
568-
offsets=manifest._offsets.flatten(),
569-
lengths=manifest._lengths.flatten(),
570-
validate_containers=False,
571-
arr_offset=chunk_index_offsets if any(chunk_index_offsets) else None,
572-
checksum=last_updated_at,
573-
)
572+
paths_flat = manifest._paths.flatten()
573+
574+
if manifest._inlined:
575+
# Write inlined chunks first, then erase them from the paths array so the
576+
# virtual-refs write below doesn't see the INLINED_CHUNK_PATH sentinel
577+
# (which Icechunk's `.set_virtual_refs_arr` would reject as a malformed URL).
578+
arr = group[arr_name]
579+
580+
# Use of zarr's `sync` here is to avoid a serial high-latency loop over chunks.
581+
# Would prefer if zarr-python had a public API for setting many chunks at once concurrently.
582+
sync(
583+
write_inlined_chunks_as_native(
584+
store=store,
585+
key_prefix=key_prefix,
586+
chunk_key_encoding=arr.metadata.chunk_key_encoding,
587+
inlined=manifest._inlined,
588+
chunk_index_offsets=chunk_index_offsets,
589+
)
590+
)
591+
virtual_paths = np.where(paths_flat == INLINED_CHUNK_PATH, "", paths_flat)
592+
else:
593+
virtual_paths = paths_flat
594+
595+
# Cheap numpy-level check so we can skip the .tolist() allocation and the
596+
# Python->Rust call entirely when no position holds a real virtual ref
597+
# (e.g. an all-inlined or all-missing manifest).
598+
if (virtual_paths != "").any():
599+
# Pass flat per-chunk arrays (or a list) to Rust in one call, avoiding Python-side
600+
# per-chunk dict construction. Empty paths are skipped on the Rust side.
601+
store.set_virtual_refs_arr(
602+
array_path=key_prefix,
603+
chunk_grid_shape=manifest.shape_chunk_grid,
604+
locations=virtual_paths.tolist(),
605+
offsets=manifest._offsets.flatten(),
606+
lengths=manifest._lengths.flatten(),
607+
validate_containers=False,
608+
arr_offset=chunk_index_offsets if any(chunk_index_offsets) else None,
609+
checksum=last_updated_at,
610+
)
611+
612+
613+
async def write_inlined_chunks_as_native(
614+
store: "IcechunkStore",
615+
key_prefix: str,
616+
chunk_key_encoding: Any,
617+
inlined: Mapping[tuple[int, ...], bytes],
618+
chunk_index_offsets: tuple[int, ...],
619+
) -> None:
620+
"""Write each inlined chunk as a native chunk at its zarr chunk key."""
621+
prototype = default_buffer_prototype()
622+
has_offset = any(chunk_index_offsets)
623+
coros = []
624+
for chunk_idx, data in inlined.items():
625+
shifted_idx = (
626+
tuple(c + o for c, o in zip(chunk_idx, chunk_index_offsets))
627+
if has_offset
628+
else chunk_idx
629+
)
630+
encoded_chunk_key = chunk_key_encoding.encode_chunk_key(shifted_idx)
631+
coros.append(
632+
store.set(
633+
f"{key_prefix}/{encoded_chunk_key}",
634+
prototype.buffer.from_bytes(data),
635+
)
636+
)
637+
await asyncio.gather(*coros)

0 commit comments

Comments
 (0)