From 43d694262fdaefe305d8782c3c793bdf1f5a4ca4 Mon Sep 17 00:00:00 2001
From: Tyler <31015976+tylanderson@users.noreply.github.com>
Date: Tue, 31 Mar 2026 12:42:25 -0400
Subject: [PATCH] add docs examples for region write and initializing blank
 dataset

---
 docs/usage.md | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/docs/usage.md b/docs/usage.md
index 678ab6de..78db0119 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -468,7 +468,97 @@ print(snapshot_id)
 
 See the [Icechunk documentation](https://icechunk.io/en/latest/virtual/) for more details.
 
+#### Writing to a region of an Icechunk Store
+
+Region writing allows you to write virtual references into a specific slice of an already-initialized store.
+This is useful for parallel or out-of-order writes, where each worker fills in its own time slice independently.
+
+```python
+session = repo.writable_session("main")
+vds1.vz.to_icechunk(session.store, region="auto")
+snapshot_id = session.commit("Wrote region")
+print(snapshot_id)
+```
+
+You can also specify regions explicitly using slices:
+
+```python
+session = repo.writable_session("main")
+n = len(vds1.time)
+vds1.vz.to_icechunk(session.store, region={"time": slice(0, n)})
+snapshot_id = session.commit("Wrote explicit region")
+print(snapshot_id)
+```
+
+##### Initializing blank datasets
+
+If you are using region writing to create a new dataset, you should initialize the dataset first.
+You can do this by using a reference file to initialize the shape and attributes of the dataset,
+while filling it with empty references, as shown in the example below.
+
+```python
+import pandas as pd
+import xarray as xr
+
+from virtualizarr import open_virtual_dataset
+from virtualizarr.parsers import HDFParser
+from zarr.core.metadata import ArrayV3Metadata
+from virtualizarr.manifests import ChunkManifest, ManifestArray
+
+# Use an existing virtual dataset as the reference template
+reference_vds = open_virtual_dataset(
+    url=f"{bucket}/{path}",
+    parser=HDFParser(),
+    registry=registry,
+)
+
+# Build a blank scaffold that matches the final coordinate space
+full_time = pd.date_range(
+    start="2015-01-01",
+    end="2026-01-01",
+    freq="1d",
+)
+target_shape = (len(full_time), 600, 1440)
+
+blank_ds = xr.Dataset(
+    coords={
+        "time": full_time,
+        "lat": reference_vds["lat"],
+        "lon": reference_vds["lon"],
+    },
+    attrs=reference_vds.attrs,
+)
+
+for var_name in reference_vds.data_vars:
+    donor_arr = reference_vds[var_name].data
+
+    # get the index of the time coord, and update the shape
+    # of the manifest chunk grid for the blank dataset
+    time_axis = reference_vds[var_name].dims.index("time")
+    chunk_grid_shape = tuple(
+        len(full_time) if i == time_axis else s
+        for i, s in enumerate(donor_arr.manifest.shape_chunk_grid)
+    )
+    empty_manifest = ChunkManifest({}, shape=chunk_grid_shape)
+
+    # update the shape in the array metadata
+    meta_dict = donor_arr.metadata.to_dict()
+    meta_dict["shape"] = target_shape
+    new_metadata = ArrayV3Metadata.from_dict(meta_dict)
+
+    blank_ds[var_name] = xr.Variable(
+        dims=reference_vds[var_name].dims,
+        data=ManifestArray(chunkmanifest=empty_manifest, metadata=new_metadata),
+        attrs=reference_vds[var_name].attrs,
+    )
+
+# Initialize the store: this writes all array metadata and coordinates, but no data refs
+session = repo.writable_session("main")
+blank_ds.vz.to_icechunk(session.store)
+session.commit("Initialized empty store")
+```
+
 ### Writing to Kerchunk's format and reading data via fsspec
 
 The [kerchunk library](https://github.com/fsspec/kerchunk) has its own [specification](https://fsspec.github.io/kerchunk/spec.html) for
 serializing virtual datasets as a JSON file or Parquet directory.