Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
aa93b8b
updates zarr-parser to use obstore list_async instead of concurrent_map
norlandrhagen Feb 26, 2026
37dff68
removes the zarr vendor code
norlandrhagen Feb 26, 2026
2fa25a7
adds arro3-core to zarr group
norlandrhagen Feb 26, 2026
626d0b9
adds _from_arrow method
norlandrhagen Feb 27, 2026
9d6a312
adds type_checking for pa type hint + import in _from_arrow
norlandrhagen Feb 27, 2026
bab147d
extra import removed
norlandrhagen Feb 27, 2026
17e35cc
adds zarr to test-py31* test group
norlandrhagen Feb 27, 2026
6cbb7c0
Update virtualizarr/manifests/manifest.py
norlandrhagen Feb 27, 2026
b400a34
updates _from_arrow method to have paths, offsets, lengths and opt[sh…
norlandrhagen Feb 27, 2026
19122a7
merge w/ main
norlandrhagen Mar 6, 2026
e22981f
update releases.md
norlandrhagen Mar 6, 2026
fda8ce6
mypy
norlandrhagen Mar 6, 2026
bbd6a1f
mypy-2
norlandrhagen Mar 6, 2026
9cba9e8
update pyproj
norlandrhagen Mar 6, 2026
f50b724
adds new zarr parser deps and fix to accessor
norlandrhagen Mar 6, 2026
1be91cc
Merge branch 'kerchunk_parquet_writer_pyarrow_fx' into zarr-parser-ob…
norlandrhagen Mar 6, 2026
4ed8295
fix double pyproj def
norlandrhagen Mar 6, 2026
9114613
adds requires pyarrow decorator to the test_zarr so mins deps are ok
norlandrhagen Mar 6, 2026
31c8ed0
add strange pyarrow pandas context override to more test_kerchunk.py …
norlandrhagen Mar 6, 2026
e0ddfc2
mypy again
norlandrhagen Mar 6, 2026
d96d5c5
incorporate feedback
norlandrhagen Mar 6, 2026
716a0bb
removed separator normalization and added a method to get chunk separ…
norlandrhagen Mar 9, 2026
7e76088
Merge branch 'main' into zarr-parser-obstore-list
norlandrhagen Mar 9, 2026
5df7705
de-dup pyproj
norlandrhagen Mar 9, 2026
08232a8
mypy
norlandrhagen Mar 9, 2026
3d7ebfc
Merge branch 'main' into zarr-parser-obstore-list
norlandrhagen Mar 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ hdf = [
"imagecodecs-numcodecs==2024.6.1",
]

zarr = ["arro3-core", "pyarrow"]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feel free to disregard, but ideally you shouldn't need to depend on both dependencies. Pyarrow is very large. I looked recently and it looks like it's gotten even larger

Image

50MB compressed is huge.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah thanks for the feedback @kylebarron. Good to know about pyarrow.


# kerchunk-based parsers
netcdf3 = [
"virtualizarr[remote]",
Expand Down Expand Up @@ -201,14 +203,17 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov
min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs
# Inherit from min-deps to get all the test commands, along with optional dependencies
test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py312"] # test against python 3.12
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py312"] # test against python 3.12
minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"]
minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"]
upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"]
all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"]
docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]

[tool.pixi.dependencies]
pytest = "*"

# Define commands to run within the docs environment
[tool.pixi.feature.docs.tasks]
serve-docs = { cmd = "mkdocs serve" }
Expand Down
56 changes: 55 additions & 1 deletion virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import re
from collections.abc import (
Callable,
Expand All @@ -8,13 +10,16 @@
ValuesView,
)
from pathlib import PosixPath
from typing import Any, NewType, TypedDict, cast
from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast

import numpy as np

from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index
from virtualizarr.types import ChunkKey

if TYPE_CHECKING:
import pyarrow as pa

# doesn't guarantee that writers actually handle these
VALID_URI_PREFIXES = {
"s3://",
Expand Down Expand Up @@ -322,6 +327,55 @@ def from_arrays(

return obj

@classmethod
def _from_arrow(
    cls,
    *,
    chunk_keys: "pa.Array",
    paths: "pa.Array",
    sizes: "pa.Array",
    chunk_grid_shape: tuple[int, ...],
) -> "ChunkManifest":
    """
    Create a ChunkManifest directly from PyArrow arrays.

    Parameters
    ----------
    chunk_keys
        Normalized dot-separated chunk keys e.g. "1.2", as a PyArrow Utf8 array.
    paths
        Full paths to chunks, as a PyArrow Utf8 array.
    sizes
        Chunk sizes in bytes, as a PyArrow UInt64 array.
    chunk_grid_shape
        Shape of the chunk grid, used to pre-allocate numpy arrays.

    Raises
    ------
    ValueError
        If the chunk keys do not split into exactly ``len(chunk_grid_shape)``
        dot-separated parts each.
    """
    # Imported lazily so pyarrow remains an optional dependency.
    import pyarrow as pa
    import pyarrow.compute as pc

    paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
    offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
    lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))

    if len(chunk_keys) > 0:
        # split "1.2" into [1, 2], then reshape to (n_chunks, ndim), use for indexing
        ndim = len(chunk_grid_shape)
        indices_flat = pc.cast(
            pc.split_pattern(chunk_keys, pattern=".").flatten(), pa.int64()
        ).to_numpy()
        # Validate before reshape: a malformed key (wrong number of parts)
        # would otherwise raise a cryptic reshape error or silently misalign.
        if ndim == 0 or indices_flat.size != len(chunk_keys) * ndim:
            raise ValueError(
                f"chunk keys must each have {ndim} dot-separated parts to match "
                f"chunk_grid_shape={chunk_grid_shape}; got {indices_flat.size} "
                f"total parts for {len(chunk_keys)} keys"
            )
        indices_2d = indices_flat.reshape(-1, ndim)
        idx_tuple = tuple(indices_2d[:, i] for i in range(ndim))

        # offsets stay zero: each chunk is stored as a whole object,
        # so every entry starts at byte 0 of its file.
        paths_arr[idx_tuple] = paths.to_numpy(zero_copy_only=False)
        lengths_arr[idx_tuple] = sizes.to_numpy()

    return cls.from_arrays(
        paths=paths_arr,
        offsets=offsets_arr,
        lengths=lengths_arr,
    )

@property
def ndim_chunk_grid(self) -> int:
"""
Expand Down
140 changes: 87 additions & 53 deletions virtualizarr/parsers/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast

import numpy as np
Comment thread
norlandrhagen marked this conversation as resolved.
import zarr
from obspec_utils.registry import ObjectStoreRegistry
from zarr.api.asynchronous import open_group as open_group_async
Expand All @@ -19,10 +20,13 @@
ManifestGroup,
ManifestStore,
)
from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
from virtualizarr.vendor.zarr.core.common import _concurrent_map
from virtualizarr.manifests.manifest import (
parse_manifest_index,
validate_and_normalize_path_to_uri,
)

if TYPE_CHECKING:
import pyarrow as pa
import zarr

ZarrArrayType = zarr.AsyncArray | zarr.Array
Expand Down Expand Up @@ -90,41 +94,67 @@ async def _handle_scalar_array(


async def _build_chunk_mapping(
    zarr_array: ZarrArrayType, path: str, prefix: str
) -> tuple["pa.Array", "pa.Array", "pa.Array"] | None:
    """
    Build chunk mapping by listing the object store with obstore.

    Uses obstore's list_async with Arrow output to get chunk paths and sizes
    in a single Rust-level call, avoiding per-chunk getsize calls.

    Parameters
    ----------
    zarr_array
        The Zarr array.
    path
        Base path for constructing chunk paths.
    prefix
        Prefix to list and strip from chunk keys.

    Returns
    -------
    Tuple of (normalized_keys, full_paths, sizes) as PyArrow arrays, or None
    if no chunks were found under the prefix.
    """
    # Imported lazily so pyarrow remains an optional dependency.
    import pyarrow as pa
    import pyarrow.compute as pc

    path_batches = []
    size_batches = []
    # NOTE(review): reaches through zarr's ObjectStore wrapper to the
    # underlying obstore store — assumes store.store is an obstore instance.
    stream = zarr_array.store.store.list_async(prefix=prefix, return_arrow=True)
    async for batch in stream:
        pa_path_col = pa.array(batch.column("path"))
        # Drop metadata entries: anything starting with "." (e.g. ".zarray")
        # or containing "/." (dotfiles in subdirectories).
        not_metadata = pc.invert(
            pc.or_(
                pc.match_substring(pa_path_col, pattern="/."),
                pc.starts_with(pa_path_col, "."),
            )
        )

        path_batches.append(pa_path_col.filter(not_metadata))
        size_batches.append(pa.array(batch.column("size")).filter(not_metadata))

    if not path_batches:
        return None

    all_paths = pa.concat_arrays(path_batches)
    all_sizes = pa.concat_arrays(size_batches)

    # Batches may exist but be empty after metadata filtering.
    if len(all_paths) == 0:
        return None

    # normalize: strip prefix, replace / with .
    stripped = pc.utf8_replace_slice(
        all_paths, start=0, stop=len(prefix), replacement=""
    )
    normalized_keys = pc.replace_substring(stripped, pattern="/", replacement=".")

    # construct full paths by joining the base path with each store key
    full_paths = pc.binary_join_element_wise(
        pa.scalar(path.rstrip("/")), all_paths, "/"
    )

    return normalized_keys, full_paths, all_sizes


class ZarrVersionStrategy(ABC):
Expand Down Expand Up @@ -158,24 +188,7 @@ async def get_chunk_mapping(
scalar_key = f"{prefix}0"
return await _handle_scalar_array(zarr_array, path, scalar_key)

# List all keys under the array prefix, filtering out metadata files
prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
if not prefix_keys:
return {}

metadata_files = {".zarray", ".zattrs", ".zgroup", ".zmetadata"}
chunk_keys = []
for key_tuple in prefix_keys:
key = key_tuple[0]
file_name = (
key[len(prefix) :]
if prefix and key.startswith(prefix)
else key.split("/")[-1]
)
if file_name not in metadata_files:
chunk_keys.append(key)

return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
return await _build_chunk_mapping(zarr_array, path, prefix)

def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
"""Convert V2 metadata to V3 format."""
Expand Down Expand Up @@ -272,12 +285,7 @@ async def get_chunk_mapping(

# List chunk keys under the c/ subdirectory
prefix = f"{name}/c/" if name else "c/"
prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
if not prefix_keys:
return {}

chunk_keys = [x[0] for x in prefix_keys]
return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
return await _build_chunk_mapping(zarr_array, path, prefix)

def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
"""Return V3 metadata as-is (no conversion needed)."""
Expand Down Expand Up @@ -321,18 +329,44 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
missing, Zarr will return the fill_value for those regions when the array is read.
"""
strategy = get_strategy(zarr_array)
chunk_grid_shape = zarr_array._chunk_grid_shape

# For scalar arrays use _from_arrays
chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
if zarr_array.shape == ():
if not chunk_map:
return ChunkManifest(chunk_map, shape=chunk_grid_shape)
paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
for key, entry in chunk_map.items():
idx = parse_manifest_index(key)
paths_arr[idx] = entry["path"]
offsets_arr[idx] = entry["offset"]
lengths_arr[idx] = entry["length"]
return ChunkManifest.from_arrays(
paths=paths_arr, offsets=offsets_arr, lengths=lengths_arr
)

if not chunk_map:
import math
# for non scalar arrays, use the new _from_arrow method
name = _get_array_name(zarr_array)
if zarr_array.metadata.zarr_format == 3:
prefix = f"{name}/c/" if name else "c/"
else:
prefix = f"{name}/" if name else ""

if zarr_array.shape and zarr_array.chunks:
chunk_grid_shape = tuple(
math.ceil(s / c) for s, c in zip(zarr_array.shape, zarr_array.chunks)
)
return ChunkManifest(chunk_map, shape=chunk_grid_shape)
result = await _build_chunk_mapping(zarr_array, path, prefix)

if result is None:
return ChunkManifest({}, shape=chunk_grid_shape)

return ChunkManifest(chunk_map)
normalized_keys, full_paths, sizes = result
return ChunkManifest._from_arrow(
chunk_keys=normalized_keys,
paths=full_paths,
sizes=sizes,
chunk_grid_shape=chunk_grid_shape,
)


def get_metadata(zarr_array: ZarrArrayType) -> ArrayV3Metadata:
Expand Down
Loading
Loading