Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9509c05
add empty release notes
TomNicholas Oct 24, 2025
4214d39
Merge branch 'main' of https://github.com/TomNicholas/VirtualiZarr
TomNicholas Oct 28, 2025
a44956c
Merge branch 'main' of https://github.com/zarr-developers/VirtualiZarr
TomNicholas Jan 24, 2026
f6dc640
Merge branch 'main' of https://github.com/zarr-developers/VirtualiZarr
TomNicholas Jan 24, 2026
3e413e2
Merge branch 'main' of https://github.com/zarr-developers/VirtualiZarr
TomNicholas Jan 25, 2026
eeb4b1e
rewrite to pass pyarrow arrays to icechunk
TomNicholas Jan 25, 2026
e6b3f59
delete generate_chunk_key
TomNicholas Jan 25, 2026
46a8802
linting
TomNicholas Jan 25, 2026
ac85e64
todos about using pyarrow for validation
TomNicholas Jan 25, 2026
e63c460
add temporary performance measuring print statements
TomNicholas Jan 26, 2026
572baaa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2026
9e61707
add temporary performance measuring print statements
TomNicholas Jan 26, 2026
703975b
remove tests of generate_chunk_key
TomNicholas Jan 26, 2026
f00a487
turn pyarrow into a python list
TomNicholas Jan 26, 2026
af052a2
Merge branch 'pyarrow-to-icechunk' of https://github.com/TomNicholas/…
TomNicholas Jan 26, 2026
8699fc3
todo about deps
TomNicholas Jan 26, 2026
58858f5
todo about iterators and skipping empty paths
TomNicholas Jan 26, 2026
338d744
remove todo about using an iterator
TomNicholas Jan 26, 2026
3f20e97
use pyarrow for validation check too
TomNicholas Jan 26, 2026
af93787
make pyarrow imports deferred
TomNicholas Jan 26, 2026
51c07d7
refactor into arrow.py
TomNicholas Jan 27, 2026
722a356
Add arrow.py module and fix scalar array handling in test fixtures
TomNicholas Jan 27, 2026
6866ac9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
dd01697
Merge branch 'main' into pyarrow-to-icechunk
TomNicholas Mar 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ def _generate_chunk_entries(
)

if chunk_grid_shape == ():
return {"0": entry_generator((0,), (0,), itemsize)}
# Scalar arrays should not use this function - use ChunkManifest.from_arrays instead
raise ValueError("Use ChunkManifest.from_arrays for scalar arrays")

all_possible_combos = itertools.product(
*[range(length) for length in chunk_grid_shape]
Expand Down Expand Up @@ -403,10 +404,24 @@ def _manifest_array(
codecs=codecs,
dimension_names=dimension_names,
)
entries = _generate_chunk_entries(
shape, chunks, data_type.itemsize, _entry_from_chunk_key
)
chunkmanifest = ChunkManifest(entries=entries)

if shape == ():
# For scalar arrays, use from_arrays with 0-dimensional numpy arrays.
# This avoids ambiguity around chunk keys ("c" vs "0") - the dict-based
# ChunkManifest constructor infers shape from chunk keys, but scalar
# chunk key conventions differ between zarr versions. Using from_arrays
# with shape=() arrays is unambiguous, matching what the HDF parser does.
chunkmanifest = ChunkManifest.from_arrays(
paths=np.array("file:///foo.nc", dtype=np.dtypes.StringDType()),
offsets=np.array(0, dtype=np.uint64),
lengths=np.array(data_type.itemsize, dtype=np.uint64),
)
else:
entries = _generate_chunk_entries(
shape, chunks, data_type.itemsize, _entry_from_chunk_key
)
chunkmanifest = ChunkManifest(entries=entries)

return ManifestArray(chunkmanifest=chunkmanifest, metadata=metadata)

return _manifest_array
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ all_parsers = [

# writers
icechunk = [
# TODO this will need to become icechunk 2.0 for passing arrow arrays
"icechunk>=1.1.2",
"pyarrow>=23.0.0",
]


Expand Down
5 changes: 3 additions & 2 deletions virtualizarr/tests/test_manifests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,15 +196,16 @@ def test_broadcast_scalar(self, manifest_array):
marr = manifest_array(shape=(), chunks=())
assert marr.shape == ()
assert marr.chunks == ()
# Scalar manifests created with from_arrays have empty string key
assert marr.manifest.dict() == {
"0": {"path": "file:///foo.0.nc", "offset": 0, "length": 0},
"": {"path": "file:///foo.nc", "offset": 0, "length": 4},
}

expanded = np.broadcast_to(marr, shape=(1,))
assert expanded.shape == (1,)
assert expanded.chunks == (1,)
assert expanded.manifest.dict() == {
"0": {"path": "file:///foo.0.nc", "offset": 0, "length": 0},
"0": {"path": "file:///foo.nc", "offset": 0, "length": 4},
}

@pytest.mark.parametrize(
Expand Down
46 changes: 1 addition & 45 deletions virtualizarr/tests/test_writers/test_icechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.tests.utils import PYTEST_TMP_DIRECTORY_URL_PREFIX
from virtualizarr.writers.icechunk import generate_chunk_key

icechunk = pytest.importorskip("icechunk")

Expand Down Expand Up @@ -258,9 +257,7 @@ def test_validate_containers(
)

# assert that an error is raised when attempting to write to icechunk
with pytest.raises(
ValueError, match="No Virtual Chunk Container set which supports prefix"
):
with pytest.raises(ValueError, match="does not start with any supported prefix"):
vds.vz.to_icechunk(icechunk_filestore)

# assert that no uncommitted changes have been written to Icechunk session
Expand Down Expand Up @@ -394,47 +391,6 @@ def test_checksum(
npt.assert_equal(pressure_array, arr)


def test_generate_chunk_key_no_offset():
# Test case without any offset (append_axis and existing_num_chunks are None)
index = (1, 2, 3)
result = generate_chunk_key(index)
assert result == [1, 2, 3], (
"The chunk key should match the index without any offset."
)


def test_generate_chunk_key_with_offset():
# Test case with offset on append_axis 1
index = (1, 2, 3)
append_axis = 1
existing_num_chunks = 5
result = generate_chunk_key(
index, append_axis=append_axis, existing_num_chunks=existing_num_chunks
)
assert result == [1, 7, 3], "The chunk key should offset the second index by 5."


def test_generate_chunk_key_zero_offset():
# Test case where existing_num_chunks is 0 (no offset should be applied)
index = (4, 5, 6)
append_axis = 1
existing_num_chunks = 0
result = generate_chunk_key(
index, append_axis=append_axis, existing_num_chunks=existing_num_chunks
)
assert result == [4, 5, 6], (
"No offset should be applied when existing_num_chunks is 0."
)


def test_generate_chunk_key_append_axis_out_of_bounds():
# Edge case where append_axis is out of bounds
index = (3, 4)
append_axis = 2 # This is out of bounds for a 2D index
with pytest.raises(ValueError):
generate_chunk_key(index, append_axis=append_axis, existing_num_chunks=1)


def test_roundtrip_coords(
manifest_array, icechunk_filestore: "IcechunkStore", icechunk_repo: "Repository"
):
Expand Down
104 changes: 104 additions & 0 deletions virtualizarr/writers/arrow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterable, cast

import xarray as xr

from virtualizarr.manifests import ChunkManifest, ManifestArray

if TYPE_CHECKING:
import pyarrow as pa


@dataclass(frozen=True)
class ArrowChunkManifest:
    """Arrow-backed chunk manifest for efficient validation and writing to icechunk."""

    # Chunk file locations; missing chunks are represented as nulls.
    locations: "pa.StringArray"
    # Byte offset of each chunk within its file (null for missing chunks).
    offsets: "pa.UInt64Array"
    # Byte length of each chunk (null for missing chunks).
    lengths: "pa.UInt64Array"
    # Shape of the chunk grid the flat arrays were ravelled from.
    shape_chunk_grid: tuple[int, ...]

    @classmethod
    def from_manifest(cls, manifest: ChunkManifest) -> "ArrowChunkManifest":
        """Convert a ChunkManifest to Arrow arrays.

        Empty paths (representing missing chunks) are converted to nulls.
        """
        # Deferred so that importing this module does not require pyarrow.
        import pyarrow as pa

        flat_paths = manifest._paths.ravel()
        num_entries = len(manifest)

        # An empty path string marks a missing chunk; those entries become
        # nulls in every arrow column (True in the mask means null).
        missing = flat_paths == ""

        # Build each column separately, applying the mask at construction
        # time so no extra copies are made.
        location_column = pa.array(
            flat_paths.tolist(), type=pa.string(), size=num_entries, mask=missing
        )
        offset_column = pa.array(
            manifest._offsets.ravel(), type=pa.uint64(), size=num_entries, mask=missing
        )
        length_column = pa.array(
            manifest._lengths.ravel(), type=pa.uint64(), size=num_entries, mask=missing
        )

        return cls(
            locations=location_column,
            offsets=offset_column,
            lengths=length_column,
            shape_chunk_grid=manifest.shape_chunk_grid,
        )


def extract_arrow_manifests(vds: xr.Dataset) -> dict[str, ArrowChunkManifest]:
    """Extract all manifests from a dataset and convert to Arrow format.

    Variables whose underlying data is not a ManifestArray are skipped.
    """
    manifests: dict[str, ArrowChunkManifest] = {}
    for name, var in vds.variables.items():
        data = var.data
        if isinstance(data, ManifestArray):
            manifests[name] = ArrowChunkManifest.from_manifest(data.manifest)
    return manifests


def validate_location_prefixes(
    arrow_manifests: Iterable["ArrowChunkManifest"],
    valid_prefixes: list[str],
) -> None:
    """
    Validate that all chunk locations start with one of the valid prefixes.

    Uses PyArrow compute for efficient validation of large manifests.

    Parameters
    ----------
    arrow_manifests
        Manifests to validate.
    valid_prefixes
        List of allowed location prefixes. If empty, validation is skipped.

    Raises
    ------
    ValueError
        If any non-null location does not start with one of ``valid_prefixes``.
    """
    # An empty prefix list means "no restriction", as documented above.
    # Without this guard, valid_prefixes[0] below raises IndexError.
    if not valid_prefixes:
        return

    # Deferred so pyarrow is only required when validation actually runs.
    import pyarrow.compute as pc

    for manifest in arrow_manifests:
        locations = manifest.locations

        # Build a mask of locations that match at least one prefix.
        # Nulls (missing chunks) become null in the result and are skipped.
        matches = pc.starts_with(locations, valid_prefixes[0])
        for prefix in valid_prefixes[1:]:
            matches = pc.or_(matches, pc.starts_with(locations, prefix))

        # True iff every non-null location matched some prefix; a null scalar
        # (is_valid == False) means there were no non-null locations at all,
        # in which case there is nothing to report.
        all_match = pc.all(matches, skip_nulls=True)

        # If any don't match then do more work to find the first offender
        if all_match.is_valid and not all_match.as_py():
            # Find first invalid location to report in error
            invalid = pc.invert(pc.fill_null(matches, True))
            invalid_indices = pc.indices_nonzero(invalid)
            first_invalid_idx = invalid_indices[0].as_py()
            invalid_location = locations[first_invalid_idx].as_py()
            raise ValueError(
                f"Location {invalid_location!r} does not start with any supported prefix: {valid_prefixes}"
            )
Loading
Loading