diff --git a/dace/codegen/control_flow.py b/dace/codegen/control_flow.py index 063ca6f542..7903654e66 100644 --- a/dace/codegen/control_flow.py +++ b/dace/codegen/control_flow.py @@ -13,6 +13,7 @@ from dace.sdfg.sdfg import SDFG, InterstateEdge from dace.sdfg.graph import Edge from dace.codegen.common import unparse_interstate_edge +from dace.codegen.targets.cpp import sym2cpp if TYPE_CHECKING: from dace.codegen.targets.framecode import DaCeCodeGenerator @@ -34,6 +35,52 @@ def _child_of(node: SDFGState, parent: SDFGState, ptree: Dict[SDFGState, SDFGSta return False +def _generate_explicit_alloc_free(edge: Edge[InterstateEdge], sdfg: SDFG) -> str: + """ + Emit ``new[]`` / ``delete[]`` statements for any arrays listed in the + ``alloc`` / ``free`` properties of *edge*, plus pointer hand-overs for ``reuse``. + + These arrays must have ``AllocationLifetime.Explicit``; their pointers live + in the SDFG state struct (``__state->__{cfg_id}_{arr_name}``). The element count + is ``arr.total_size``, converted from symbolic form via :func:`sym2cpp`. + + :param edge: The interstate edge being processed. + :param sdfg: The enclosing SDFG (provides ``cfg_id`` and ``arrays``). + :returns: C++ source fragment (may be empty string).
+ """ + code = '' + + for arr_name in edge.data.alloc: + arr = sdfg.arrays[arr_name] + size_expr = sym2cpp(arr.total_size) # total_size covers padded / non-packed strides; the product of arr.shape can under-allocate + code += ( + f'__state->__{sdfg.cfg_id}_{arr_name} = ' + f'new {arr.dtype.ctype}[{size_expr}];\n' + ) + + for entry in edge.data.reuse: + if len(entry) == 2: + new_arr, donor_arr = entry + code += ( + f'__state->__{sdfg.cfg_id}_{new_arr} = ' + f'__state->__{sdfg.cfg_id}_{donor_arr};\n' + f'__state->__{sdfg.cfg_id}_{donor_arr} = nullptr;\n' + ) + else: + new_arr, donor_arr, offset_bytes = entry + dtype = sdfg.arrays[new_arr].dtype.ctype + code += ( + f'__state->__{sdfg.cfg_id}_{new_arr} = ' + f'({dtype}*)((char*)__state->__{sdfg.cfg_id}_{donor_arr} ' + f'+ {offset_bytes});\n' + ) + + for arr_name in edge.data.free: + code += f'delete[] __state->__{sdfg.cfg_id}_{arr_name};\n' + + return code + + def _generate_interstate_edge_code(edge: Edge[InterstateEdge], sdfg: SDFG, cfg: ControlFlowRegion, @@ -62,6 +109,8 @@ def _generate_interstate_edge_code(edge: Edge[InterstateEdge], for variable, value in edge.data.assignments.items() ] + ['']) + expr += _generate_explicit_alloc_free(edge, sdfg) + if not assignments_only: dst: ControlFlowBlock = edge.dst expr += 'goto __state_{}_{};\n'.format(cfg.cfg_id, re.sub(r'\s+', '_', dst.label)) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index ce896ded8e..12d75e7dd4 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -482,7 +482,10 @@ def dispatch_allocate(self, if datadesc.lifetime == dtypes.AllocationLifetime.Persistent: declaration_stream = CodeIOStream() callsite_stream = self.frame._initcode - elif datadesc.lifetime == dtypes.AllocationLifetime.External: + elif datadesc.lifetime in (dtypes.AllocationLifetime.External, dtypes.AllocationLifetime.Explicit): + # External: managed outside generated code.
+ # Explicit: managed by alloc/free on interstate edges; discard streams so no + # auto new[]/delete[] is emitted but defined_vars tracking still runs. declaration_stream = CodeIOStream() callsite_stream = CodeIOStream() else: @@ -504,7 +507,7 @@ def dispatch_deallocate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: ScopeSubg if datadesc.lifetime == dtypes.AllocationLifetime.Persistent: callsite_stream = self.frame._exitcode - elif datadesc.lifetime == dtypes.AllocationLifetime.External: + elif datadesc.lifetime in (dtypes.AllocationLifetime.External, dtypes.AllocationLifetime.Explicit): return self._array_dispatchers[datadesc.storage].deallocate_array(sdfg, cfg, dfg, state_id, node, datadesc, diff --git a/dace/codegen/instrumentation/allocation.py b/dace/codegen/instrumentation/allocation.py new file mode 100644 index 0000000000..92ec7a91c0 --- /dev/null +++ b/dace/codegen/instrumentation/allocation.py @@ -0,0 +1,81 @@ +from __future__ import annotations +import json +import os + +import dace +from dace.codegen.instrumentation.papi import MapEntry +from dace.codegen.instrumentation.provider import SDFG +from dace.sdfg.nodes import AccessNode, EntryNode, Node +from dace.transformation.passes.symbol_propagation import SDFGState + + +StateAlloc = dict[AccessNode, list[SDFGState]] # plain alias: the PEP 695 ``type`` statement is a SyntaxError before Python 3.12 +NodeAlloc = dict[AccessNode, list[Node]] + + +def inScope(scopedict: dict[Node, SDFGState | Node | None], node: Node, scope: Node) -> bool: # True when *scope* is reached by following scopedict upward from *node* + node_scope = scopedict[node] + return node_scope is not None and ((node_scope == scope or inScope(scopedict, node_scope, scope)) if isinstance(node_scope, Node) else False) + + + +def create_allocation_report(to : dict[SDFG | SDFGState | EntryNode, list[tuple[SDFG, SDFGState | None,AccessNode | None, bool, bool, bool]]]): + + #state_alloc: dict[AccessNode,list[SDFGState]] = {} + #node_alloc: dict[AccessNode, list[Node]] = {} + + state_alloc: dict[str, list[str]] = {} + node_alloc: dict[str, list[str]] = {} + + all_alloc: dict[str, list[str]] = {} + +
report: dict[SDFG, dict[str, list[str]]] = {} + + for scope in to: + for alloc_info in to[scope]: + + sdfg: SDFG = alloc_info[0] + state : SDFGState | None = alloc_info[1] + access_node =alloc_info[2] + + nodes_allocated: list[Node] = [] + states_allocated: list[SDFGState] = [] + + if isinstance(scope, SDFG): + #TODO: find example where SDFG is the scope and implement + pass + elif isinstance(scope, SDFGState): + #highlight all nodes and the state itself + nodes_allocated = list(scope.nodes()) + states_allocated = [scope] + elif isinstance(scope, EntryNode): + if isinstance(scope, MapEntry): + nodes_allocated = [] + scope_dict = state.scope_dict() if state is not None else {} + for node in state.nodes() if state is not None else []: + if inScope(scope_dict, node, scope) or node == scope: + nodes_allocated.append(node) + states_allocated = [state] if state is not None else [] + + if access_node is not None: + state_alloc[access_node.guid] = [s.guid for s in states_allocated] + node_alloc[access_node.guid] = [n.guid for n in nodes_allocated] + all_alloc[access_node.guid] = state_alloc[access_node.guid] + node_alloc[access_node.guid] + # Keep a separate dict per SDFG: storing ``all_alloc`` itself aliased every + # SDFG's report to the same, ever-growing mapping. + if access_node is not None: + report.setdefault(sdfg, {})[access_node.guid] = all_alloc[access_node.guid] + + + + + for sdfg in report: + os.makedirs(f"{sdfg.build_folder}/perf", exist_ok=True) + with open(f"{sdfg.build_folder}/perf/allocation-report-{str(hash(str(report[sdfg])))}.json", "w") as f: # "w", not "x": regenerating an identical report in one process must not raise FileExistsError + json.dump(report[sdfg],f) + + + + + + return diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index ed7a5047fd..2d00ed021f 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -90,7 +90,7 @@ def copy_expr( dt = "" is_global = data_desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, - dtypes.AllocationLifetime.External) + dtypes.AllocationLifetime.External,
dtypes.AllocationLifetime.Explicit) defined_types = None # Non-free symbol dependent Arrays due to their shape dependent_shape = (isinstance(data_desc, data.Array) and not isinstance(data_desc, data.View) and any( @@ -247,9 +247,10 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode: 'DaCeCodeGener if root in sdfg.arrays and isinstance(sdfg.arrays[root], data.Structure): name = name.replace('.', '->') - # Special case: If memory is persistent and defined in this SDFG, add state - # struct to name - if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): + # Special case: If memory is persistent/external/explicit and defined in this + # SDFG, add state struct to name (the pointer lives in the state struct). + if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External, + dtypes.AllocationLifetime.Explicit)): if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays return f'__{sdfg.cfg_id}_{name}' diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 5bb2bc50a5..e3bf8ba8b4 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -23,6 +23,8 @@ from dace.sdfg.state import ControlFlowBlock, ControlFlowRegion, LoopRegion from dace.transformation.passes.analysis import StateReachability, loop_analysis +from dace.codegen.instrumentation.allocation import create_allocation_report + def _get_or_eval_sdfg_first_arg(func, sdfg): if callable(func): @@ -636,6 +638,26 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): self.to_allocate[top_sdfg].append((sdfg, first_state_instance, first_node_instance, True, True, True)) self.where_allocated[(sdfg, name)] = top_sdfg continue + elif top_lifetime is dtypes.AllocationLifetime.Explicit: + # Explicit lifetime: the pointer is declared in the state struct, + # but allocation/deallocation 
is handled via alloc/free on interstate edges. + # We register the variable in defined_vars (so references compile) but + # emit no auto new[]/delete[]. + + definition = desc.as_arg(name=f'__{sdfg.cfg_id}_{name}') + ';' + + if top_storage != dtypes.StorageType.CPU_ThreadLocal: + self.statestruct.append(definition) + + alloc_node = first_node_instance if first_node_instance is not None else nodes.AccessNode(name) + alloc_state = first_state_instance + + # allocate=True so defined_vars is populated; dispatcher discards streams + # (same as External). deallocate=False so no auto-deallocation is attempted. + self.to_allocate[top_sdfg].append( + (sdfg, alloc_state, alloc_node, True, True, False)) + self.where_allocated[(sdfg, name)] = top_sdfg + continue elif top_lifetime is dtypes.AllocationLifetime.Global: # Global memory is allocated in the beginning of the program # exists in the library state structure (to be passed along @@ -820,6 +842,7 @@ def determine_allocation_lifetime(self, top_sdfg: SDFG): self.where_allocated[(sdfg, name)] = curscope else: self.where_allocated[(sdfg, name)] = cursdfg + create_allocation_report(self.to_allocate) def allocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Union[nodes.EntryNode, SDFGState, SDFG], diff --git a/dace/dtypes.py b/dace/dtypes.py index 1f9b507935..ae571c4f2e 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -120,6 +120,7 @@ class AllocationLifetime(Enum): Global = auto() #: Allocated throughout the entire program (outer SDFG) Persistent = auto() #: Allocated throughout multiple invocations (init/exit) External = auto() #: Allocated and managed outside the generated code + Explicit = auto() #: Allocated/Deallocated via explicit alloc/free on interstate edges @undefined_safe_enum diff --git a/dace/libraries/allocation/__init__.py b/dace/libraries/allocation/__init__.py new file mode 100644 index 0000000000..a07e849dcb --- /dev/null +++ b/dace/libraries/allocation/__init__.py @@ -0,0 +1,4 @@ +# 
Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +from .make_explicit import make_explicit +from .hoist import hoist_alloc_out_of_loop +from .reuse import _apply_reuse, buffer_reuse_same_pass, buffer_reuse_same_pass_ua, buffer_reuse_cross_pass diff --git a/dace/libraries/allocation/hoist.py b/dace/libraries/allocation/hoist.py new file mode 100644 index 0000000000..e3f1c541c1 --- /dev/null +++ b/dace/libraries/allocation/hoist.py @@ -0,0 +1,166 @@ +""" +hoist: optimization that lifts allocations out of LoopRegions. + +If a transient array is allocated (and freed) inside a ``LoopRegion``, every +loop iteration pays the full allocation/deallocation cost. This pass moves +the allocation to before the loop and the deallocation to after the loop, +so each happens only once. + +Arrays that already carry ``AllocationLifetime.Explicit`` with ``alloc``/``free`` +annotations on loop-internal edges are hoisted directly. + +Arrays that do not yet use explicit allocation are first converted via +``make_explicit`` (which scopes the alloc/free to the loop's internal edges +for ``Scope``/``State``-lifetime arrays) and then hoisted. Arrays whose +declared lifetime is ``SDFG`` or ``Global`` are already allocated outside +the loop after ``make_explicit`` and are silently skipped. + +Only ``LoopRegion`` blocks are accepted. Passing a Map scope raises a +``TypeError`` immediately: Maps are parallel — hoisting an allocation out of a +Map would change the memory layout visible to every parallel thread. 
+ +Usage:: + + from dace.libraries.allocation.hoist import hoist_alloc_out_of_loop + # already-explicit arrays + hoist_alloc_out_of_loop(loop_region, ['dx', 'dy', 'dz']) + # automatic-lifetime arrays (make_explicit + hoist in one call) + hoist_alloc_out_of_loop(loop_region, ['tmp']) +""" + +from typing import List, Tuple + +import dace +from dace import dtypes +from dace.sdfg.state import LoopRegion, ControlFlowRegion + +from .make_explicit import make_explicit, _alloc_on_incoming_edges, _free_on_outgoing_edges + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + +def _collect_alloc_free_edges( + region: ControlFlowRegion, + name: str, +) -> Tuple[list, list]: + """Recursively collect every edge inside *region* that references *name*. + + Returns ``(alloc_edges, free_edges)`` — both are flat lists of + ``dace.sdfg.graph.Edge`` objects whose ``data`` is an + ``InterstateEdge``. + + The search recurses into nested ``ControlFlowRegion`` blocks (e.g. a + ``LoopRegion`` nested inside another loop, or a ``ConditionalBlock``). 
+ """ + alloc_edges: list = [] + free_edges: list = [] + + for edge in region.edges(): + if name in edge.data.alloc: + alloc_edges.append(edge) + if name in edge.data.free: + free_edges.append(edge) + + for node in region.nodes(): + if isinstance(node, ControlFlowRegion): + sub_alloc, sub_free = _collect_alloc_free_edges(node, name) + alloc_edges.extend(sub_alloc) + free_edges.extend(sub_free) + + return alloc_edges, free_edges + + +def _root_sdfg(region: ControlFlowRegion) -> "dace.SDFG": + """Walk the parent_graph chain upward until we reach the first enclosing SDFG.""" + node = region + while not isinstance(node, dace.SDFG): + node = node.parent_graph + return node + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def hoist_alloc_out_of_loop(loop: LoopRegion, array_names: List[str]) -> None: + """Hoist explicit allocations of *array_names* from inside a loop to its + surrounding CFG. + + For each array the function: + + 1. Validates that *loop* is a ``LoopRegion`` (raises ``TypeError`` + otherwise — Maps are parallel and must not be used here). + 2. Converts arrays that are not yet ``Explicit`` via ``make_explicit``, then collects every edge + inside *loop* (recursively) that has *name* in its ``alloc`` or ``free`` list. + 3. Skips *name* silently (no exception) if no ``alloc`` edge for it is found inside + the loop — there is nothing to hoist in that case. + 4. Removes *name* from the ``alloc`` list of every internal edge found. + 5. Removes *name* from the ``free`` list of every internal edge found. + 6. Adds *name* to the ``alloc`` list of every incoming edge of *loop* in + its parent CFG (inserting a thin predecessor state when the loop is + the SDFG start and has no incoming edges). + 7. If internal ``free`` edges were found, adds *name* to the ``free`` + list of every outgoing edge of *loop* in its parent CFG (inserting a + thin successor state when the loop is a CFG sink).
+ + :param loop: The ``LoopRegion`` to hoist allocations out of. + :param array_names: Names of transient arrays to hoist. + :raises TypeError: If *loop* is not a ``LoopRegion``. + :raises ValueError: If an array name is unknown or not transient; also propagated from + ``make_explicit`` (e.g. for single-element arrays) when a conversion is needed. + """ + if not isinstance(loop, LoopRegion): + raise TypeError( + f"hoist_alloc_out_of_loop expects a LoopRegion, got " + f"'{type(loop).__name__}'. " + "Maps are parallel — hoisting allocations out of a Map would " + "change per-iteration memory layout visible to every parallel " + "thread and is therefore not supported." + ) + + parent = loop.parent_graph + sdfg = _root_sdfg(loop) + + for name in array_names: + # --- validate --- + if name not in sdfg.arrays: + raise ValueError(f"'{name}' is not in the SDFG's array descriptor table.") + desc = sdfg.arrays[name] + if not desc.transient: + raise ValueError( + f"'{name}' is not a transient data container. " + "Only transients can be explicitly allocated." + ) + + # --- make explicit if needed --- + # For Scope/State-lifetime arrays this places alloc/free on edges + # inside the loop (at the LCA of uses), so the hoist logic below + # can find and move them. For SDFG/Global-lifetime arrays + # make_explicit places alloc/free at the SDFG level already, so + # there will be nothing to hoist — we skip those silently. + if desc.lifetime != dtypes.AllocationLifetime.Explicit: + make_explicit(sdfg, [name]) + + # --- find alloc / free edges inside the loop --- + alloc_edges, free_edges = _collect_alloc_free_edges(loop, name) + + if not alloc_edges: + # Either the array was SDFG/Global-lifetime (already outside the + # loop after make_explicit) or it genuinely has no uses inside + # this loop — nothing to do.
+ continue + + # --- remove from internal edges --- + for edge in alloc_edges: + edge.data.alloc.remove(name) + for edge in free_edges: + edge.data.free.remove(name) + + # --- place alloc before the loop --- + _alloc_on_incoming_edges(parent, loop, name) + + # --- place free after the loop (only if there were internal frees) --- + if free_edges: + _free_on_outgoing_edges(parent, loop, name) diff --git a/dace/libraries/allocation/make_explicit.py b/dace/libraries/allocation/make_explicit.py new file mode 100644 index 0000000000..fd1ce2e1be --- /dev/null +++ b/dace/libraries/allocation/make_explicit.py @@ -0,0 +1,217 @@ +""" +make_explicit: utility to convert transient data containers from automatic +(scope-based lifetime) allocation to explicit allocation via the +AllocationLifetime.Explicit / interstate-edge alloc/free mechanism. + +For each array the function: + 1. Collects all SDFGStates that contain an access node for the array. + 2. Finds the lowest common ancestor (LCA) ControlFlowRegion of those states. + This preserves scope semantics: an array used only inside a LoopRegion + gets alloc/free on edges *within* that loop, not on edges surrounding it + in the parent CFG. + 3. Within the LCA region, finds the first and last direct child blocks (in + topological execution order) that contain a use of the array. + 4. Adds the array name to the ``alloc`` list of every incoming interstate + edge of the first-use block inside the LCA. A new thin state is inserted + only when the first-use block has no incoming edges in the LCA. + 5. Adds the array name to the ``free`` list of every outgoing interstate + edge of the last-use block inside the LCA. A new thin successor state is + inserted only when the last-use block has no outgoing edges in the LCA. + 6. Sets the array's ``lifetime`` to ``AllocationLifetime.Explicit`` so that + DaCe's normal codegen skips the automatic ``new``/``delete[]`` and the + control-flow code-generator emits them on the edge traversal instead. 
+ +Note: Map-scoped arrays (AllocationLifetime.Scope where the innermost scope is +a Map node within a state) cannot be expressed via interstate edge annotations, +since Maps have no CFG-level boundaries. make_explicit does not special-case +this; callers should not pass such arrays. + +Usage:: + + from dace.libraries.allocation.make_explicit import make_explicit + make_explicit(sdfg, ['dx', 'dy', 'dz']) +""" + +from typing import List + +from dace import dtypes +from dace.sdfg import SDFG, SDFGState, InterstateEdge +from dace.sdfg.state import ControlFlowRegion +from dace.sdfg.analysis import cfg as cfg_analysis + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + +def _ancestor_chain(state, sdfg: SDFG) -> list: + """Return [sdfg, ..., direct_parent_of_state] — ancestor regions from root + down to (but not including) *state* itself.""" + chain = [] + node = state.parent_graph + while node is not sdfg: + chain.append(node) + node = node.parent_graph + chain.append(sdfg) + return list(reversed(chain)) # sdfg first, deepest parent last + + +def _lca_region(states: list, sdfg: SDFG) -> ControlFlowRegion: + """Return the lowest common ancestor ControlFlowRegion of *states*. + + The LCA is the innermost region that contains every state in *states*. + For states all inside the same LoopRegion this returns that LoopRegion; + for states spread across the SDFG it returns the SDFG itself. + """ + if not states: + return sdfg + + chains = [_ancestor_chain(s, sdfg) for s in states] + lca = sdfg + for regions in zip(*chains): + if len({id(r) for r in regions}) == 1: + lca = regions[0] + else: + break + return lca + + +def _top_level_block_in(region: ControlFlowRegion, node): + """Walk up the parent_graph chain to find the direct child of *region* + that contains *node*. 
*node* may already be a direct child.""" + while node.parent_graph is not region: + node = node.parent_graph + return node + + +def _blocks_using_in(region: ControlFlowRegion, name: str) -> list: + """Return the direct children of *region* (states or nested regions) that + contain an access node for *name*, in topological execution order + (duplicates removed).""" + seen = set() + result: list = [] + for state in cfg_analysis.blockorder_topological_sort(region, ignore_nonstate_blocks=True): + if any(n.data == name for n in state.data_nodes()): + top = _top_level_block_in(region, state) + if top not in seen: + result.append(top) + seen.add(top) + return result + + +def _alloc_on_incoming_edges(region: ControlFlowRegion, block, name: str) -> None: + """Add *name* to the ``alloc`` list of every incoming edge of *block* + within *region*. + + If *block* has no incoming edges (i.e. it is the region's start block), a + new predecessor state is inserted and the alloc is placed on the edge + from the new state to *block*. + """ + in_edges = region.in_edges(block) + if in_edges: + for edge in in_edges: + if name not in edge.data.alloc: + edge.data.alloc.append(name) + else: + # block is the start — insert a thin predecessor to carry the edge + new_pre = region.add_state_before(block, label=f'_alloc_pre_{name}', is_start_block=True) + alloc_edge = region.edges_between(new_pre, block)[0] + alloc_edge.data.alloc.append(name) + + +def _free_on_outgoing_edges(region: ControlFlowRegion, block, name: str) -> None: + """Add *name* to the ``free`` list of every outgoing edge of *block* + within *region*. + + If *block* has no outgoing edges (i.e. it is a sink), a new successor + state is inserted and the free is placed on the edge from *block* to the + new state. 
+ """ + out_edges = region.out_edges(block) + if out_edges: + for edge in out_edges: + if name not in edge.data.free: + edge.data.free.append(name) + else: + # block is a sink — insert a thin successor to carry the edge + new_suc = region.add_state_after(block, label=f'_free_suc_{name}') + free_edge = region.edges_between(block, new_suc)[0] + free_edge.data.free.append(name) + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def make_explicit(sdfg: SDFG, array_names: List[str]) -> None: + """Convert transient data containers to explicit interstate-edge allocation. + + Allocation and deallocation are placed at the tightest scope that covers + all uses of the array — for example, inside a LoopRegion if the array is + only used there, or at the SDFG level if uses span multiple top-level + blocks. + + **Limitation — size-1 arrays**: DaCe's state-struct codegen represents + single-element transients as plain scalar fields (e.g. ``double x;``) + rather than pointer fields (``double *x;``). Placing them on alloc/free + edges would make the C++ emitter emit ``__state->x = new double[1];`` + against a scalar declaration, which is a compile-time type error. + ``make_explicit`` therefore raises :class:`ValueError` for any array + whose total element count is 1. Callers that discover such an array + should either skip it or use a different strategy (e.g. scalar alias + rather than pointer reuse). + + :param sdfg: The SDFG to modify in-place. + :param array_names: Names of transient arrays to make explicitly allocated. + :raises ValueError: If a name is not a transient array in *sdfg*, or if + the array has exactly one element (size-1 scalar limitation). + """ + # Validate ALL arrays before modifying anything so that a late failure + # cannot leave the SDFG in a partially-modified state. 
+ for name in array_names: + if name not in sdfg.arrays: + raise ValueError(f"'{name}' not found in sdfg.arrays") + desc = sdfg.arrays[name] + if not desc.transient: + raise ValueError(f"'{name}' is not a transient data container") + if desc.total_size == 1: + raise ValueError( + f"'{name}' has total_size=1. DaCe stores single-element " + f"transients as scalar state-struct fields, not pointers; " + f"explicit heap allocation is not supported for them." + ) + + for name in array_names: # second pass, at the same level as the validation loop: nothing is mutated until every name has passed + desc = sdfg.arrays[name] + # Remember the original lifetime before overwriting — used below to decide the target scope region. + original_lifetime = desc.lifetime + # Switch lifetime so validation passes when we later call validate() + desc.lifetime = dtypes.AllocationLifetime.Explicit + + # Collect every state that contains an access node for this array + all_states = [ + s for s in cfg_analysis.blockorder_topological_sort(sdfg, ignore_nonstate_blocks=True) + if any(n.data == name for n in s.data_nodes()) + ] + if not all_states: + # Array is unused — lifetime is flipped but no edges to annotate + continue + + # Determine the target scope region. + # SDFG/Global lifetime: always allocate at the SDFG level regardless + # of where the actual uses are. + # State/Scope/other: use the LCA of actual uses to preserve the + # per-iteration or per-state semantics of the original lifetime.
+ if original_lifetime in (dtypes.AllocationLifetime.SDFG, + dtypes.AllocationLifetime.Global): + lca: ControlFlowRegion = sdfg + else: + lca = _lca_region(all_states, sdfg) + + # First/last direct children of lca that contain uses + using = _blocks_using_in(lca, name) + if not using: + continue + + _alloc_on_incoming_edges(lca, using[0], name) + _free_on_outgoing_edges(lca, using[-1], name) diff --git a/dace/libraries/allocation/reuse.py b/dace/libraries/allocation/reuse.py new file mode 100644 index 0000000000..9abe4f08d3 --- /dev/null +++ b/dace/libraries/allocation/reuse.py @@ -0,0 +1,790 @@ +""" +reuse: apply buffer-reuse to a pair of explicitly-allocated SDFG arrays. + +_apply_reuse(sdfg, new_arr, donor_arr) rewires the alloc/free annotations +placed by make_explicit so that new_arr receives donor_arr's heap pointer +instead of allocating fresh memory. + +After the call: + - donor_arr: allocated (alloc edge), never freed (free entry removed). + - new_arr: not allocated; receives donor pointer via a reuse edge entry. + - new_arr: freed normally (free edge retained from make_explicit). + +The invariant is safe because the cats trace guarantees donor_arr's lifetime +ends before new_arr's lifetime begins. + +buffer_reuse_same_pass(sdfg, symbols, sym_max_vals) is the top-level automated +pass that runs the CATS trace extractor, identifies same-size same-dtype +transient pairs whose lifetimes do not overlap, and applies _apply_reuse for +each discovered pair. 
+""" + +import dataclasses +from typing import Any, Dict, List, Optional, Tuple, Union + +from dace import dtypes +from dace.sdfg import SDFG + +from .make_explicit import make_explicit + + +# --------------------------------------------------------------------------- +# Layer 1: _apply_reuse +# --------------------------------------------------------------------------- + +def _apply_reuse(sdfg: SDFG, new_arr: str, donor_arr: str) -> None: + """Rewrite the SDFG so that *new_arr* reuses *donor_arr*'s allocation. + + :param sdfg: The SDFG to modify in-place. + :param new_arr: Name of the array that should receive donor's pointer. + :param donor_arr: Name of the array whose allocation is reused. + :raises ValueError: If either array is not a transient in *sdfg*. + """ + for name in (new_arr, donor_arr): + if name not in sdfg.arrays: + raise ValueError(f"'{name}' not found in sdfg.arrays") + if not sdfg.arrays[name].transient: + raise ValueError(f"'{name}' is not a transient data container") + + # Step 1: ensure both arrays have Explicit lifetime and alloc/free on edges. + # Skip arrays that are already Explicit — make_explicit is not idempotent + # when an array is already in a reuse entry: calling it again would add the + # array back to alloc, creating an "in both alloc and reuse" contradiction. + needs_explicit = [ + name for name in (new_arr, donor_arr) + if sdfg.arrays[name].lifetime != dtypes.AllocationLifetime.Explicit + ] + if needs_explicit: + make_explicit(sdfg, needs_explicit) + + # make_explicit may place alloc/free on interstate edges inside nested + # control-flow regions (e.g. loop bodies), so iterate recursively. 
def _run_cats(
    sdfg: SDFG,
    symbols: Dict[str, int],
    symbol_max_vals: Dict[str, int],
) -> Tuple[Any, Dict[str, str]]:
    """Run CATS trace extraction on *sdfg*.

    Returns the CATS generator together with a reverse map from CATS
    allocation names to DaCe array names.

    The CATS codegen registers its own CPU target (``@autoregister_params``),
    which shadows DaCe's built-in one.  The 'cpu' registry slot is therefore
    snapshotted before the trace runs and restored afterwards.  The restore
    lives in a ``finally`` clause so that a failing trace extraction cannot
    leave the CATS target registered and silently hijack later DaCe
    compilations in the same process.

    :param sdfg: SDFG to trace (callers pass a renamed deep copy).
    :param symbols: Concrete symbol values handed to ``CATSTraceExtractor``.
    :param symbol_max_vals: Upper-bound symbol values handed to
                            ``CATSTraceExtractor``.
    :returns: ``(generator, alloc_name_to_array)``.
    """
    from dace.codegen.target import TargetCodeGenerator
    from dace.codegen.targets.cpp import ptr as cpp_ptr

    from cats_dace.cats_trace_extractor import CATSTraceExtractor, _get_codegen_targets
    from cats_dace.codegen.targets.cpu import CPUCodeGen as CATSCPUCodeGen

    # Snapshot the 'cpu' slot AFTER the CATS import so that if the caller
    # already imported cats_dace elsewhere (e.g. from cats_dace.utils), the
    # CATS class is excluded from the restore list -- otherwise we would
    # re-register it.
    orig_cpu_entry = {
        cls: kwargs
        for cls, kwargs in TargetCodeGenerator.extensions().items()
        if kwargs.get('name') == 'cpu' and cls is not CATSCPUCodeGen
    }

    try:
        generator = CATSTraceExtractor(sdfg, symbols, symbol_max_vals,
                                       transients_only=True)
        # Instantiated for its construction side effects; the dict itself is
        # intentionally unused.
        _targets = {'cpu': CATSCPUCodeGen(generator, sdfg)}  # noqa: F841
        _get_codegen_targets(sdfg, generator)
        generator.generate_timeline(sdfg, None)
    finally:
        # Always put the registry back, even when CATS raised.
        if CATSCPUCodeGen in TargetCodeGenerator.extensions():
            TargetCodeGenerator.unregister(CATSCPUCodeGen)
        for cls, kwargs in orig_cpu_entry.items():
            if cls not in TargetCodeGenerator.extensions():
                TargetCodeGenerator.register(cls, **kwargs)

    # CATS encodes each allocation as str(sdfg.cfg_id) + '_' + cpp_ptr(...)
    alloc_name_to_array: Dict[str, str] = {}
    for arr_name, desc in sdfg.arrays.items():
        if not desc.transient:
            continue
        ptr_str = cpp_ptr(arr_name, desc, sdfg, generator)
        alloc_name_to_array[f'{sdfg.cfg_id}_{ptr_str}'] = arr_name

    return generator, alloc_name_to_array
+ """ + from cats_dace.utils import AllocationEvent, DeallocationEvent, DataAccessEvent + + generator, alloc_name_to_array = _run_cats(sdfg, symbols, symbol_max_vals) + + alloc_idx: Dict[str, int] = {} + dealloc_idx: Dict[str, int] = {} + first_access_idx: Dict[str, int] = {} + last_access_idx: Dict[str, int] = {} + size_bytes: Dict[str, int] = {} + dtype_of: Dict[str, Any] = {} + + for i, event in enumerate(generator.access_timeline): + if isinstance(event, AllocationEvent): + for alloc_name, nbytes in event.data: + arr_name = alloc_name_to_array.get(alloc_name) + if arr_name is None or nbytes <= 0: + continue + alloc_idx[arr_name] = i + size_bytes[arr_name] = nbytes + dtype_of[arr_name] = sdfg.arrays[arr_name].dtype + elif isinstance(event, DeallocationEvent): + for alloc_name in event.data: + arr_name = alloc_name_to_array.get(alloc_name) + if arr_name is None: + continue + dealloc_idx[arr_name] = i + elif isinstance(event, DataAccessEvent): + arr_name = alloc_name_to_array.get(event.alloc_name) + if arr_name is None: + continue + if arr_name not in first_access_idx: + first_access_idx[arr_name] = i + last_access_idx[arr_name] = i + + # Build (sort_key, kind, arr_name, ua_ratio) tuples, then emit in order. 
def _collect_scopes(
    sdfg: SDFG,
    names: set,
) -> Dict[str, Tuple[frozenset, frozenset]]:
    """Record, per array, which control-flow regions hold its alloc/free edges.

    Every region reachable from *sdfg* is scanned; for each interstate edge
    the ``alloc`` and ``free`` lists are inspected and, for names contained
    in *names*, ``id(region)`` is recorded.

    Callers compare the resulting tuples of two arrays: a reuse pair is only
    considered when both arrays have identical alloc-region and free-region
    sets.  This check alone is insufficient for the arena pass -- see
    ``_edge_order_safe``.

    :param sdfg: SDFG whose interstate edges are scanned.
    :param names: Array names of interest; other names are ignored.
    :returns: name -> (frozenset of alloc-region ids, frozenset of
              free-region ids).  Names without any edge map to two empty
              frozensets.
    """
    alloc_regions: Dict[str, set] = {name: set() for name in names}
    free_regions: Dict[str, set] = {name: set() for name in alloc_regions}

    for nested in sdfg.all_sdfgs_recursive():
        for region in nested.all_control_flow_regions(recursive=True):
            region_key = id(region)
            for edge in region.edges():
                annotations = edge.data
                for name in annotations.alloc:
                    if name in alloc_regions:
                        alloc_regions[name].add(region_key)
                for name in annotations.free:
                    if name in free_regions:
                        free_regions[name].add(region_key)

    return {
        name: (frozenset(alloc_regions[name]), frozenset(free_regions[name]))
        for name in alloc_regions
    }
def _greedy_donor_candidates(
    liveness: List[_LivenessEvent],
) -> Dict[str, List[str]]:
    """List every available donor for each consumer, most-recently-freed first.

    The scan walks *liveness* once.  A free event pushes the array onto the
    pool bucket keyed by ``(size_bytes, dtype)``.  An alloc event whose bucket
    is non-empty records a snapshot of that bucket in reverse (LIFO) order.
    Nothing is ever removed from the pool: no donor is committed here, so
    callers can fall back to the next candidate when the first one fails a
    safety check.

    :param liveness: Ordered alloc/free events.
    :returns: Dict mapping each consumer array name to its ordered donor list.
              Consumers without any candidate are absent.
    """
    freed_by_key: Dict[Tuple[int, Any], List[str]] = {}
    donors_for: Dict[str, List[str]] = {}

    for entry in liveness:
        if isinstance(entry, _AllocEntry):
            available = freed_by_key.get((entry.size_bytes, entry.dtype), [])
            if not available:
                continue
            # Copy in reverse so later pushes do not alter this snapshot.
            donors_for[entry.array_name] = available[::-1]
        elif isinstance(entry, _FreeEntry):
            pool_key = (entry.size_bytes, entry.dtype)
            freed_by_key.setdefault(pool_key, []).append(entry.array_name)

    return donors_for
def buffer_reuse_same_pass(
    sdfg: SDFG,
    symbols: Dict[str, int],
    symbol_max_vals: Dict[str, int],
) -> List[Tuple[str, str]]:
    """Automatically reuse same-size same-dtype transient buffers in *sdfg*.

    1. Deep-copies *sdfg* into a renamed probe and extracts a liveness trace
       from it via CATS.
    2. Collects, per consumer, all same-size same-dtype donors in LIFO order
       (:func:`_greedy_donor_candidates`).
    3. Makes the involved arrays explicit on the probe and, for each
       consumer, applies :func:`_apply_reuse` on *sdfg* with the first donor
       that is still unused, shares the consumer's alloc/free regions and
       passes :func:`_edge_order_safe`.

    :param sdfg: Root SDFG to optimise in-place.
    :param symbols: Concrete symbol values for CATS size computation.
    :param symbol_max_vals: Upper-bound values for unresolved symbols.
    :returns: The list of ``(new_array, donor_array)`` reuse pairs applied.
    """
    import copy as _copy

    # CATS compiles with its own CPUCodeGen; running it directly on *sdfg*
    # would poison the build folder, so use a separately-named deep copy.
    probe = _copy.deepcopy(sdfg)
    probe._name = probe._name + "_cats_probe"
    donor_candidates = _greedy_donor_candidates(
        _extract_liveness(probe, symbols, symbol_max_vals))

    # Every consumer and candidate donor that exists in the probe.
    involved: set = set()
    for consumer, donors in donor_candidates.items():
        for name in [consumer, *donors]:
            if name in probe.arrays:
                involved.add(name)

    # Arrays that cannot be made explicit drop out of consideration.
    for name in list(involved):
        if probe.arrays[name].lifetime == dtypes.AllocationLifetime.Explicit:
            continue
        try:
            make_explicit(probe, [name])
        except ValueError:
            involved.discard(name)
    scopes = _collect_scopes(probe, involved)

    applied: List[Tuple[str, str]] = []
    used_donors: set = set()
    for new_arr, donors in donor_candidates.items():
        if new_arr not in scopes:
            continue
        for donor_arr in donors:
            rejected = (
                donor_arr in used_donors
                or donor_arr not in scopes
                or scopes[new_arr] != scopes[donor_arr]
                or not _edge_order_safe(probe, new_arr, donor_arr)
            )
            if rejected:
                continue
            try:
                _apply_reuse(sdfg, new_arr, donor_arr)
            except ValueError:
                # Best effort: try the next candidate donor.
                continue
            applied.append((new_arr, donor_arr))
            used_donors.add(donor_arr)
            break

    return applied
def _greedy_cross_size_scan(
    liveness: List[_LivenessEvent],
) -> List[Tuple[str, str, int]]:
    """Cross-size, cross-dtype best-fit sub-allocation scan (strict bump allocator).

    On each :class:`_AllocEntry`, find the available donor block with the
    smallest *remaining* capacity that still fits the consumer and satisfies
    the alignment constraint (``block.dtype.bytes >= consumer.dtype.bytes``).
    The consumer is placed at the block's bump pointer **rounded up to a
    multiple of the consumer's element size**; the pointer then advances past
    the consumer.  Multiple consumers can share one donor block at
    non-overlapping byte offsets.

    The rounding matters because the codegen lowers the offset to
    ``(T_new*)((char*)donor + offset)``: without it, a consumer placed after
    a smaller-element consumer (e.g. float32 after 3 bytes of int8) would be
    accessed through a misaligned pointer.  When every offset is already
    aligned the result is identical to the unrounded scan.

    A donor block enters the pool when its :class:`_FreeEntry` is processed.
    Consumer free events are silently dropped -- storage is owned by the donor.

    :param liveness: Output of :func:`_extract_liveness`.
    :returns: List of ``(consumer, donor, offset_bytes)`` triples in liveness order.
    """
    # pool: each entry is [total_size, dtype, donor_name, used_bytes] (mutable)
    pool: List[list] = []
    plan: List[Tuple[str, str, int]] = []
    is_consumer: set = set()

    for event in liveness:
        if isinstance(event, _AllocEntry):
            # Guard against a zero/odd element size so the rounding below
            # can never divide by zero.
            align = max(1, int(event.dtype.bytes))

            best_idx: Optional[int] = None
            best_remaining: Optional[int] = None
            best_offset = 0
            for idx, (total, dt, _nm, used) in enumerate(pool):
                if dt.bytes < event.dtype.bytes:
                    continue
                # Round the bump pointer up to the consumer's alignment.
                offset = -(-used // align) * align
                remaining = total - offset
                if remaining < event.size_bytes:
                    continue
                if best_remaining is None or remaining < best_remaining:
                    best_idx = idx
                    best_remaining = remaining
                    best_offset = offset

            if best_idx is not None:
                donor = pool[best_idx][2]
                plan.append((event.array_name, donor, best_offset))
                pool[best_idx][3] = best_offset + event.size_bytes
                is_consumer.add(event.array_name)

        elif isinstance(event, _FreeEntry):
            if event.array_name not in is_consumer:
                pool.append([event.size_bytes, event.dtype, event.array_name, 0])

    return plan
+ + Emits a 3-tuple reuse entry ``[new_arr, donor_arr, offset_bytes]`` which + the codegen lowers to ``(T_new*)((char*)donor_ptr + offset)``. The donor + retains ownership of the heap block: its ``free`` entry is moved to the + edge that previously held the consumer's ``free`` entry, so the typed + ``delete[] donor_ptr`` fires only after both arrays are done. The + consumer's ``free`` entry is removed because the consumer never owned + the storage. + """ + for name in (new_arr, donor_arr): + if name not in sdfg.arrays: + raise ValueError(f"'{name}' not found in sdfg.arrays") + if not sdfg.arrays[name].transient: + raise ValueError(f"'{name}' is not a transient data container") + + needs_explicit = [ + name for name in (new_arr, donor_arr) + if sdfg.arrays[name].lifetime != dtypes.AllocationLifetime.Explicit + ] + if needs_explicit: + make_explicit(sdfg, needs_explicit) + + all_edges = list(sdfg.all_interstate_edges(recursive=True)) + + # Remember where consumer's free lived so we can re-home the donor's free. + new_free_edge = None + for edge in all_edges: + if new_arr in edge.data.free: + new_free_edge = edge + break + + # Replace consumer's alloc with a 3-tuple reuse entry. + for edge in all_edges: + while new_arr in edge.data.alloc: + edge.data.alloc.remove(new_arr) + edge.data.reuse.append([new_arr, donor_arr, int(offset_bytes)]) + + # Strip consumer's free entries — consumer never owns the heap block. + for edge in all_edges: + while new_arr in edge.data.free: + edge.data.free.remove(new_arr) + + # Re-home donor's free to the consumer's old free site, so the typed + # delete[] donor_ptr happens after the consumer's last use. 
def _edge_order_safe(probe: SDFG, new_arr: str, donor_arr: str) -> bool:
    """Return True iff donor's last-use block executes *strictly before*
    consumer's first-use block in topological order, within the same parent
    ControlFlowRegion.

    Strict ordering is required: if both blocks are the same, donor is
    accessed *inside* the block whose entry edge fires the rebind, so the
    donor's typed accesses race against consumer's writes through the
    aliased pointer.  This catches the case where ``make_explicit`` placed
    alloc/free edges around a block whose internal accesses CATS knew
    were disjoint, but whose block-level boundary still overlaps.

    The check is conservative and returns False when:

    - *new_arr* does not have exactly one alloc edge, or *donor_arr* does not
      have exactly one free edge;
    - the two edges live in different regions;
    - the region's graph is not a DAG;
    - either endpoint block is missing from the topological order.

    :param probe: SDFG on which both arrays were already made explicit.
    :param new_arr: Consumer array name.
    :param donor_arr: Donor array name.
    """
    import networkx as _nx

    new_alloc: List[Tuple[Any, Any]] = []
    donor_free: List[Tuple[Any, Any]] = []
    # The nested traversal below can yield the same region more than once
    # (a nested SDFG is reached both through its parent's recursive region
    # walk and as its own entry of all_sdfgs_recursive()).  Visit each region
    # once so a single edge is never counted twice -- a double count would
    # make the "exactly one" test below reject a valid pair.
    visited_regions: set = set()
    for ns in probe.all_sdfgs_recursive():
        for region in ns.all_control_flow_regions(recursive=True):
            if id(region) in visited_regions:
                continue
            visited_regions.add(id(region))
            for edge in region.edges():
                if new_arr in edge.data.alloc:
                    new_alloc.append((region, edge))
                if donor_arr in edge.data.free:
                    donor_free.append((region, edge))

    if len(new_alloc) != 1 or len(donor_free) != 1:
        return False

    new_region, new_alloc_edge = new_alloc[0]
    donor_region, donor_free_edge = donor_free[0]
    if new_region is not donor_region:
        return False

    g = new_region.nx
    if not _nx.is_directed_acyclic_graph(g):
        return False
    order = {n: i for i, n in enumerate(_nx.topological_sort(g))}

    # Consumer is first live in the block the alloc edge leads into; donor is
    # last live in the block its free edge leaves from.
    new_first = new_alloc_edge.dst
    donor_last = donor_free_edge.src
    if new_first not in order or donor_last not in order:
        return False
    return order[donor_last] < order[new_first]
+ """ + import copy as _copy + from collections import defaultdict + + # CATS compiles with its own CPUCodeGen; isolate it on a renamed copy + # so the working SDFG's build folder stays clean. + probe = _copy.deepcopy(sdfg) + probe._name = probe._name + "_cats_probe" + liveness = _extract_liveness(probe, symbols, symbol_max_vals) + reuse_plan = _greedy_cross_size_scan(liveness) + + # Build a free-event index for ordering consumers of the same donor. + free_idx: Dict[str, int] = { + e.array_name: i + for i, e in enumerate(liveness) + if isinstance(e, _FreeEntry) + } + + # Scope and edge-order checks use an unmodified second probe. + check_probe = _copy.deepcopy(sdfg) + involved = {n for triple in reuse_plan for n in triple[:2] + if n in check_probe.arrays} + for name in list(involved): + if check_probe.arrays[name].lifetime != dtypes.AllocationLifetime.Explicit: + try: + make_explicit(check_probe, [name]) + except ValueError: + involved.discard(name) + scopes = _collect_scopes(check_probe, involved) + + # Group consumers by donor; filter on scope/safety. + consumers_by_donor: Dict[str, List[Tuple[str, int, int]]] = defaultdict(list) + for consumer, donor, offset in reuse_plan: + if consumer not in scopes or donor not in scopes: + continue + if scopes[consumer] != scopes[donor]: + continue + if not _edge_order_safe(check_probe, consumer, donor): + continue + consumers_by_donor[donor].append( + (consumer, offset, free_idx.get(consumer, 0)) + ) + + # Apply each donor's consumers sorted by free order (earliest first) so + # _apply_arena_reuse leaves the donor's free at the latest consumer's site. 
+ applied: List[Tuple[str, str]] = [] + for donor, consumers in consumers_by_donor.items(): + consumers.sort(key=lambda t: t[2]) + for consumer, offset, _ in consumers: + actual_donor, chain_off = _resolve_donor_root(sdfg, donor) + try: + _apply_arena_reuse(sdfg, consumer, actual_donor, + offset_bytes=chain_off + offset) + applied.append((consumer, donor)) + except ValueError: + pass + + return applied diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 16db581c7c..1a245a5846 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -179,10 +179,18 @@ class InterstateEdge(object): assignments = Property(dtype=dict, desc="Assignments to perform upon transition (e.g., 'x=x+1; y = 0')") condition = CodeProperty(desc="Transition condition", default=CodeBlock("1")) guid = Property(dtype=str, allow_none=False) + alloc = ListProperty(element_type=str, desc="Arrays to allocate upon this transition (AllocationLifetime.Explicit)") + free = ListProperty(element_type=str, desc="Arrays to free upon this transition (AllocationLifetime.Explicit)") + reuse = ListProperty(element_type=list, + desc="Buffer reuse pairs [[new_array, donor_array]] — new_array receives " + "donor_array's pointer instead of a fresh allocation") def __init__(self, condition: Optional[Union[CodeBlock, str, ast.AST, list]] = None, - assignments: Optional[Dict] = None): + assignments: Optional[Dict] = None, + alloc: Optional[List] = None, + free: Optional[List] = None, + reuse: Optional[List] = None): if condition is None: condition = CodeBlock("1") @@ -198,6 +206,9 @@ def __init__(self, else: self.condition = condition self.assignments = {k: InterstateEdge._convert_assignment(v) for k, v in assignments.items()} + self.alloc = list(alloc) if alloc is not None else [] + self.free = list(free) if free is not None else [] + self.reuse = list(reuse) if reuse is not None else [] self._cond_sympy = None self._uncond = None @@ -217,6 +228,7 @@ def __deepcopy__(self, memo): if k == 'guid': # Skip ID continue 
setattr(result, k, copy.deepcopy(v, memo)) + result.guid = generate_element_id(result) return result @staticmethod diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index c7862f759e..fe5e3c4985 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -213,6 +213,10 @@ def validate_control_flow_region(sdfg: 'SDFG', # Check for interstate edges that write to scalars or arrays _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg) + # Check for explicit alloc/free lists on interstate edges + _validate_interstate_edge_explicit_alloc(sdfg) + _validate_explicit_allocation_balance(sdfg) + def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context: bool): """ Verifies the correctness of an SDFG by applying multiple tests. @@ -1155,3 +1159,186 @@ def _no_writes_to_scalars_or_arrays_on_interstate_edges(cfg: 'dace.ControlFlowRe raise InvalidSDFGInterstateEdgeError( f'Assignment to a scalar or an array detected in an interstate edge: "{edge}"', cfg.sdfg, cfg.edge_id(edge)) + + +def _validate_interstate_edge_explicit_alloc(sdfg: 'dace.sdfg.SDFG'): + """Validate the explicit-allocation annotations on *sdfg*'s interstate edges. + + For every ``InterstateEdge``, every array name referenced in its + ``alloc``, ``free``, or ``reuse`` lists must: + + - be declared in ``sdfg.arrays``, and + - carry ``AllocationLifetime.Explicit``. + + ``reuse`` entries may take either of two shapes: + + - 2-element ``[new_array, donor_array]``: same-dtype overlap; the + donor is consumed and the consumer takes over the heap block (the + donor's ``free`` is removed, the consumer's ``free`` lives on + somewhere else and frees via the consumer's pointer). + - 3-element ``[new_array, donor_array, offset_bytes]``: cross-dtype + or sub-block reuse; the consumer points into the donor's block at + ``offset_bytes`` via ``(T_new*)((char*)donor + offset)``. 
The + donor retains ownership (so the donor's ``free`` lives on + somewhere) and the consumer must never appear in any ``free`` + list (it does not own the storage). + + :raises InvalidSDFGInterstateEdgeError: on the first rule violation found. + """ + from dace.sdfg import InterstateEdge + for edge in sdfg.edges(): + if edge.data is None or not isinstance(edge.data, InterstateEdge): + continue + eid = sdfg.edge_id(edge) + for arr_name in edge.data.alloc: + if arr_name not in sdfg.arrays: + raise InvalidSDFGInterstateEdgeError( + f"Edge alloc references non-existent array '{arr_name}'", sdfg, eid) + if sdfg.arrays[arr_name].lifetime is not dtypes.AllocationLifetime.Explicit: + raise InvalidSDFGInterstateEdgeError( + f"Edge alloc references array '{arr_name}' with lifetime " + f"{sdfg.arrays[arr_name].lifetime} — only AllocationLifetime.Explicit allowed", sdfg, eid) + for arr_name in edge.data.free: + if arr_name not in sdfg.arrays: + raise InvalidSDFGInterstateEdgeError( + f"Edge free references non-existent array '{arr_name}'", sdfg, eid) + if sdfg.arrays[arr_name].lifetime is not dtypes.AllocationLifetime.Explicit: + raise InvalidSDFGInterstateEdgeError( + f"Edge free references array '{arr_name}' with lifetime " + f"{sdfg.arrays[arr_name].lifetime} — only AllocationLifetime.Explicit allowed", sdfg, eid) + for entry in edge.data.reuse: + if len(entry) == 2: + new_arr, donor_arr = entry + offset_bytes = None + elif len(entry) == 3: + new_arr, donor_arr, offset_bytes = entry + if not isinstance(offset_bytes, int) or offset_bytes < 0: + raise InvalidSDFGInterstateEdgeError( + f"Edge reuse entry {entry!r} has invalid offset_bytes " + f"{offset_bytes!r} — must be a non-negative int", + sdfg, eid) + else: + raise InvalidSDFGInterstateEdgeError( + f"Edge reuse entry {entry!r} must be a 2-element list " + f"[new_array, donor_array] or a 3-element list " + f"[new_array, donor_array, offset_bytes]", + sdfg, eid) + for role, arr_name in [('new', new_arr), ('donor', 
donor_arr)]: + if arr_name not in sdfg.arrays: + raise InvalidSDFGInterstateEdgeError( + f"Edge reuse references non-existent array '{arr_name}' (role: {role})", + sdfg, eid) + if sdfg.arrays[arr_name].lifetime is not dtypes.AllocationLifetime.Explicit: + raise InvalidSDFGInterstateEdgeError( + f"Edge reuse references array '{arr_name}' with lifetime " + f"{sdfg.arrays[arr_name].lifetime} — only AllocationLifetime.Explicit allowed", + sdfg, eid) + if new_arr in edge.data.alloc: + raise InvalidSDFGInterstateEdgeError( + f"Edge has '{new_arr}' in both alloc and reuse — " + f"remove alloc entry to avoid double allocation", + sdfg, eid) + if offset_bytes is None: + # 2-tuple: consumer takes over the heap block, so donor must + # not be freed on this edge (donor's free was removed). + if donor_arr in edge.data.free: + raise InvalidSDFGInterstateEdgeError( + f"Edge has '{donor_arr}' in both reuse (as donor) and free — " + f"remove free entry; new_arr's free takes over ownership", + sdfg, eid) + else: + # 3-tuple: donor retains ownership, so consumer must not + # appear in this edge's free (consumer never owns storage). 
def _validate_explicit_allocation_balance(sdfg: 'dace.sdfg.SDFG'):
    """Check that Explicit-lifetime arrays are allocated and freed consistently.

    Per interstate edge:

    - no name may appear twice in ``alloc`` or twice in ``free``;
    - no name may appear in both ``alloc`` and ``free``.

    Per array with ``AllocationLifetime.Explicit``:

    - an array without access nodes only triggers a ``UserWarning``;
    - otherwise it must be covered by at least one alloc and one free.

    ``reuse`` entries count as follows: the consumer is always treated as
    allocated; for a 2-element entry the donor is treated as freed, for a
    longer entry the consumer is treated as freed (the donor keeps its own
    ``free`` entry).

    :raises InvalidSDFGInterstateEdgeError: on a per-edge violation.
    :raises InvalidSDFGError: when an array lacks an alloc or a free.
    """
    explicit = dtypes.AllocationLifetime.Explicit
    allocated, freed = set(), set()

    for owner, edge in _all_interstate_edges(sdfg):
        annotations = edge.data

        # Duplicate names inside a single list (alloc first, then free).
        for kind, listed in (('alloc', annotations.alloc), ('free', annotations.free)):
            if len(listed) == len(set(listed)):
                continue
            already = set()
            dup = None
            for candidate in listed:
                if candidate in already:
                    dup = candidate
                    break
                already.add(candidate)
            raise InvalidSDFGInterstateEdgeError(
                f"InterstateEdge.{kind} contains duplicate array name {dup!r}. "
                f"Each array may appear at most once in the {kind} list of a "
                f"single edge.", owner, owner.edge_id(edge))

        overlap = set(annotations.alloc) & set(annotations.free)
        if overlap:
            name = next(iter(overlap))
            raise InvalidSDFGInterstateEdgeError(
                f"InterstateEdge has {name!r} in both alloc and free — "
                f"an allocate-and-free on the same edge has no well-defined "
                f"order and is ambiguous. Remove one entry.",
                owner, owner.edge_id(edge))

        allocated.update(annotations.alloc)
        freed.update(annotations.free)
        for entry in annotations.reuse:
            consumer, donor = entry[0], entry[1]
            allocated.add(consumer)
            # 2-element entry: the donor is consumed.  Longer entry: the
            # consumer never owns storage, so it is marked as covered here.
            freed.add(donor if len(entry) == 2 else consumer)

    used = set()
    for state in sdfg.all_states():
        for node in state.data_nodes():
            used.add(node.data)
    all_edge_ids = [owner.edge_id(e) for owner, e in _all_interstate_edges(sdfg)]

    for name, desc in sdfg.arrays.items():
        if desc.lifetime is not explicit:
            continue
        if name not in used:
            warnings.warn(
                f"Explicit array {name!r} is declared with "
                f"AllocationLifetime.Explicit but has no access nodes in "
                f"this SDFG. Remove the array or change its lifetime to "
                f"avoid a dead declaration.",
                UserWarning)
            continue
        if name not in allocated:
            raise InvalidSDFGError(
                f"Explicit array {name!r} has no alloc edge anywhere in "
                f"its SDFG. Add {name!r} to InterstateEdge.alloc on one of "
                f"the SDFG's interstate edges, e.g. edge ids: {all_edge_ids}. "
                f"(Alternatively: change the array's lifetime off "
                f"AllocationLifetime.Explicit.)",
                sdfg, None)
        if name not in freed:
            raise InvalidSDFGError(
                f"Explicit array {name!r} has no free edge anywhere in "
                f"its SDFG. Add {name!r} to InterstateEdge.free on one of "
                f"the SDFG's interstate edges, e.g. edge ids: {all_edge_ids}.",
                sdfg, None)
def test_validation_fails_reuse_non_explicit_lifetime():
    """Validation must fail once array 'A' of the reuse pair is no longer Explicit."""
    sdfg, _ = _make_two_array_sdfg()
    descriptor = sdfg.arrays['A']
    # Switch 'A' to Scope lifetime; the fixture's edge still lists it in reuse.
    descriptor.lifetime = dtypes.AllocationLifetime.Scope
    with pytest.raises(Exception, match="AllocationLifetime.Explicit"):
        sdfg.validate()
def _make_sequential_sdfg():
    """
    Build the linear SDFG ``init -> use_A -> use_B -> done``.

    ``A`` and ``B`` are transient ``float64[10]`` arrays whose lifetime is
    left at the default.  ``use_A`` holds a tasklet writing ``A[0]`` and
    ``use_B`` holds a tasklet writing ``B[0]``, so neither access node is
    isolated.  Both arrays share shape and dtype.

    :returns: The constructed SDFG.
    """
    sdfg = dace.SDFG('seq_test')
    for arr in ('A', 'B'):
        sdfg.add_array(arr, [10], dace.float64, transient=True)

    # States are created in chain order, then linked pairwise.
    chain = [sdfg.add_state('init', is_start_block=True)]
    chain.extend(sdfg.add_state(label) for label in ('use_A', 'use_B', 'done'))
    for src, dst in zip(chain, chain[1:]):
        sdfg.add_edge(src, dst, InterstateEdge())

    # One tasklet -> access-node write per array, in its own state.
    for state, arr in zip(chain[1:3], ('A', 'B')):
        access = state.add_access(arr)
        writer = state.add_tasklet(f'write_{arr}', {}, {'out'}, 'out = 0.0')
        state.add_edge(writer, 'out', access, None, dace.Memlet(f'{arr}[0]'))

    return sdfg
def _build_fill_read_sdfg():
    """
    Build the four-state chain ``init -> fill_A -> read_B -> done``.

    * ``fill_A``: a map over ``i in 0:10`` whose tasklet stores ``(double)i``
      into ``A[i]``.
    * ``read_B``: a map over ``i in 0:10`` that forwards ``B[i]`` into
      ``out[0]`` through an additive write-conflict resolution.

    ``A`` and ``B`` are transient ``float64[10]``; ``out`` is a non-transient
    ``float64[1]``.  Callers apply ``_apply_reuse(sdfg, 'B', 'A')`` afterwards
    and expect ``out[0] == 45`` (0 + 1 + ... + 9).

    :returns: The constructed SDFG.
    """
    sdfg = dace.SDFG('fill_read')
    for arr, extent, is_transient in (('A', 10, True), ('B', 10, True), ('out', 1, False)):
        sdfg.add_array(arr, [extent], dace.float64, transient=is_transient)

    st_init = sdfg.add_state('init', is_start_block=True)
    st_fill, st_read, st_done = (sdfg.add_state(label) for label in ('fill_A', 'read_B', 'done'))
    for src, dst in ((st_init, st_fill), (st_fill, st_read), (st_read, st_done)):
        sdfg.add_edge(src, dst, InterstateEdge())

    # Producer state: A[i] = (double)i
    fill_entry, fill_exit = st_fill.add_map('fill', {'i': '0:10'})
    fill_tasklet = st_fill.add_tasklet('fill_t', {}, {'a'}, 'a = (double)i;',
                                       language=dace.Language.CPP)
    a_sink = st_fill.add_write('A')
    # Empty memlet keeps the input-less tasklet inside the map scope.
    st_fill.add_edge(fill_entry, None, fill_tasklet, None, dace.Memlet())
    st_fill.add_memlet_path(fill_tasklet, fill_exit, a_sink,
                            src_conn='a', memlet=dace.Memlet('A[i]'))

    # Consumer state: out[0] accumulates every B[i]
    sum_entry, sum_exit = st_read.add_map('sum', {'i': '0:10'})
    sum_tasklet = st_read.add_tasklet('sum_t', {'b'}, {'o'}, 'o = b;',
                                      language=dace.Language.CPP)
    b_source = st_read.add_read('B')
    out_sink = st_read.add_write('out')
    st_read.add_memlet_path(b_source, sum_entry, sum_tasklet,
                            dst_conn='b', memlet=dace.Memlet('B[i]'))
    st_read.add_memlet_path(sum_tasklet, sum_exit, out_sink,
                            src_conn='o',
                            memlet=dace.Memlet('out[0]', wcr='lambda a, b: a + b'))

    return sdfg
def test_compile_run_simple_reuse():
    """Compile and run the fill/read SDFG after reuse; ``out[0]`` must equal 45."""
    sdfg = _build_fill_read_sdfg()
    _apply_reuse(sdfg, 'B', 'A')
    sdfg.validate()

    out = np.zeros(1, dtype=np.float64)
    compiled = sdfg.compile()
    compiled(out=out)

    # 0 + 1 + ... + 9, kept as a float like the original expression 10 * 9 / 2.
    expected = float(sum(range(10)))
    assert out[0] == expected, f"Expected {expected}, got {out[0]}"
return a - - -@dc.program -def getEnergy(pos: dc.float64[N, 3], vel: dc.float64[N, 3], mass: dc.float64[N], G: dc.float64): - """ - Get kinetic energy (KE) and potential energy (PE) of simulation - pos is N x 3 matrix of positions - vel is N x 3 matrix of velocities - mass is an N x 1 vector of masses - G is Newton's Gravitational constant - KE is the kinetic energy of the system - PE is the potential energy of the system - """ - # Kinetic Energy: - # KE = 0.5 * np.sum(np.sum( mass * vel**2 )) - # KE = 0.5 * np.sum( mass * vel**2 ) - KE = 0.5 * np.sum(np.reshape(mass, (N, 1)) * vel**2) - - # Potential Energy: - - # positions r = [x,y,z] for all particles - x = pos[:, 0:1] - y = pos[:, 1:2] - z = pos[:, 2:3] - - # matrix that stores all pairwise particle separations: r_j - r_i - dx = np.add.outer(-x, x) - dy = np.add.outer(-y, y) - dz = np.add.outer(-z, z) - - # matrix that stores 1/r for all particle pairwise particle separations - inv_r = np.sqrt(dx**2 + dy**2 + dz**2) - # inv_r[inv_r>0] = 1.0/inv_r[inv_r>0] - I = inv_r > 0 - np.divide(1.0, inv_r, out=inv_r, where=I) - - # sum over upper triangle, to count each interaction only once - tmp = -np.multiply.outer(mass, mass) * inv_r - PE = 0.0 - for j in range(N): - for k in range(j + 1, N): - PE += tmp[j, k] - PE *= G - - return KE, PE - - -@dc.program -def nbody(mass: dc.float64[N], pos: dc.float64[N, 3], vel: dc.float64[N, 3], dt: dc.float64, G: dc.float64, - softening: dc.float64): - - # Convert to Center-of-Mass frame - np.subtract(vel, np.mean(np.reshape(mass, (N, 1)) * vel, axis=0) / np.mean(mass), out=vel) - - # calculate initial gravitational accelerations - acc = getAcc(pos, mass, G, softening) - - # calculate initial energy of system - KE = np.ndarray(Nt + 1, dtype=np.float64) - PE = np.ndarray(Nt + 1, dtype=np.float64) - KE[0], PE[0] = getEnergy(pos, vel, mass, G) - - t = 0.0 - - # Simulation Main Loop - for i in range(Nt): - # (1/2) kick - vel += acc * dt / 2.0 - - # drift - pos += vel * dt - - # update 
accelerations - acc[:] = getAcc(pos, mass, G, softening) - - # (1/2) kick - vel += acc * dt / 2.0 - - # update time - t += dt - - # get energy of system - KE[i + 1], PE[i + 1] = getEnergy(pos, vel, mass, G) - - return KE, PE - - -def initialize(N, tEnd, dt): - from numpy.random import default_rng - rng = default_rng(42) - mass = 20.0 * np.ones((N, 1)) / N # total mass of particles is 20 - pos = rng.random((N, 3)) # randomly selected positions and velocities - vel = rng.random((N, 3)) - Nt = int(np.ceil(tEnd / dt)) - return mass, pos, vel, Nt - - -### Ground Truth - - -def getAcc_np(pos, mass, G, softening): - """ - Calculate the acceleration on each particle due to Newton's Law - pos is an N x 3 matrix of positions - mass is an N x 1 vector of masses - G is Newton's Gravitational constant - softening is the softening length - a is N x 3 matrix of accelerations - """ - # positions r = [x,y,z] for all particles - x = pos[:, 0:1] - y = pos[:, 1:2] - z = pos[:, 2:3] - - # matrix that stores all pairwise particle separations: r_j - r_i - dx = x.T - x - dy = y.T - y - dz = z.T - z - - # matrix that stores 1/r^3 for all particle pairwise particle separations - inv_r3 = (dx**2 + dy**2 + dz**2 + softening**2) - inv_r3[inv_r3 > 0] = inv_r3[inv_r3 > 0]**(-1.5) - - ax = G * (dx * inv_r3) @ mass - ay = G * (dy * inv_r3) @ mass - az = G * (dz * inv_r3) @ mass - - # pack together the acceleration components - a = np.hstack((ax, ay, az)) - - return a - - -def getEnergy_np(pos, vel, mass, G): - """ - Get kinetic energy (KE) and potential energy (PE) of simulation - pos is N x 3 matrix of positions - vel is N x 3 matrix of velocities - mass is an N x 1 vector of masses - G is Newton's Gravitational constant - KE is the kinetic energy of the system - PE is the potential energy of the system - """ - # Kinetic Energy: - # KE = 0.5 * np.sum(np.sum( mass * vel**2 )) - KE = 0.5 * np.sum(mass * vel**2) - - # Potential Energy: - - # positions r = [x,y,z] for all particles - x = pos[:, 0:1] 
- y = pos[:, 1:2] - z = pos[:, 2:3] - - # matrix that stores all pairwise particle separations: r_j - r_i - dx = x.T - x - dy = y.T - y - dz = z.T - z - - # matrix that stores 1/r for all particle pairwise particle separations - inv_r = np.sqrt(dx**2 + dy**2 + dz**2) - inv_r[inv_r > 0] = 1.0 / inv_r[inv_r > 0] - - # sum over upper triangle, to count each interaction only once - # PE = G * np.sum(np.sum(np.triu(-(mass*mass.T)*inv_r,1))) - PE = G * np.sum(np.triu(-(mass * mass.T) * inv_r, 1)) - - return KE, PE - - -def nbody_np(mass, pos, vel, N, Nt, dt, G, softening): - - # Convert to Center-of-Mass frame - vel -= np.mean(mass * vel, axis=0) / np.mean(mass) - - # calculate initial gravitational accelerations - acc = getAcc_np(pos, mass, G, softening) - - # calculate initial energy of system - KE = np.ndarray(Nt + 1, dtype=np.float64) - PE = np.ndarray(Nt + 1, dtype=np.float64) - KE[0], PE[0] = getEnergy_np(pos, vel, mass, G) - - t = 0.0 - - # Simulation Main Loop - for i in range(Nt): - # (1/2) kick - vel += acc * dt / 2.0 - - # drift - pos += vel * dt - - # update accelerations - acc = getAcc_np(pos, mass, G, softening) - - # (1/2) kick - vel += acc * dt / 2.0 - - # update time - t += dt - - # get energy of system - KE[i + 1], PE[i + 1] = getEnergy_np(pos, vel, mass, G) - - return KE, PE - - -def run_nbody(device_type: dace.dtypes.DeviceType): - ''' - Runs nbody for the given device - :return: the SDFG - ''' - - # Initialize data (npbench small size) - N, tEnd, dt, softening, G = 25, 2.0, 0.05, 0.1, 1.0 - mass, pos, vel, Nt = initialize(N, tEnd, dt) - mass_ref = np.copy(mass) - pos_ref = np.copy(pos) - vel_ref = np.copy(vel) - - if device_type in {dace.dtypes.DeviceType.CPU, dace.dtypes.DeviceType.GPU}: - # Parse the SDFG and apply auto-opt - sdfg = nbody.to_sdfg() - sdfg = auto_optimize(sdfg, device_type) - KE, PE = sdfg(mass, pos, vel, dt, G, softening, N=N, Nt=Nt) - else: - raise ValueError(f"Unsupported device type: {device_type}") - - # Compute ground truth 
def _two_size_sequential_sdfg(name: str, big: int = 16, small: int = 8) -> SDFG:
    """Build a sequential SDFG in which A's reads and B's writes live in
    different states.

        init -> write_A -> read_A -> write_B -> read_B -> done

    ``A`` is a transient ``float64[big]``, ``B`` a transient
    ``float64[small]`` and ``out`` a non-transient ``float64[small]``.
    ``read_A`` is the last state touching ``A`` and ``write_B`` the first
    state touching ``B``.

    NOTE(review): the original docstring states that ``make_explicit`` puts
    A's free and B's alloc on the same edge (read_A -> write_B) and that this
    satisfies the edge-order safety check of ``buffer_reuse_cross_pass``;
    that cannot be confirmed from this helper alone.

    :param name: Name of the SDFG.
    :param big: Number of elements of ``A``.
    :param small: Number of elements of ``B`` and ``out``.
    :returns: The constructed SDFG.
    """
    sdfg = SDFG(name)
    sdfg.add_array('A', [big], dace.float64, transient=True)
    sdfg.add_array('B', [small], dace.float64, transient=True)
    sdfg.add_array('out', [small], dace.float64, transient=False)

    st_init = sdfg.add_state('init', is_start_block=True)
    st_wA, st_rA, st_wB, st_rB, st_done = (
        sdfg.add_state(label)
        for label in ('write_A', 'read_A', 'write_B', 'read_B', 'done'))
    chain = (st_init, st_wA, st_rA, st_wB, st_rB, st_done)
    for src, dst in zip(chain, chain[1:]):
        sdfg.add_edge(src, dst, InterstateEdge())

    def _fill(state, label, arr, conn, extent):
        # arr[i] = (double)i for i in [0, extent)
        entry, exit_ = state.add_map(label, {'i': f'0:{extent}'})
        tasklet = state.add_tasklet(label, {}, {conn}, f'{conn} = (double)i;',
                                    language=dace.Language.CPP)
        sink = state.add_write(arr)
        state.add_edge(entry, None, tasklet, None, dace.Memlet())
        state.add_memlet_path(tasklet, exit_, sink, src_conn=conn,
                              memlet=dace.Memlet(f'{arr}[i]'))

    def _to_out(state, label, arr, conn, code):
        # out[i] = f(arr[i]) for i in [0, small)
        entry, exit_ = state.add_map(label, {'i': f'0:{small}'})
        tasklet = state.add_tasklet(label, {conn}, {'o'}, code,
                                    language=dace.Language.CPP)
        source = state.add_read(arr)
        sink = state.add_write('out')
        state.add_memlet_path(source, entry, tasklet, dst_conn=conn,
                              memlet=dace.Memlet(f'{arr}[i]'))
        state.add_memlet_path(tasklet, exit_, sink, src_conn='o',
                              memlet=dace.Memlet('out[i]'))

    _fill(st_wA, 'wA', 'A', 'a', big)
    # Uses A; the values written to out here are overwritten by read_B.
    _to_out(st_rA, 'rA', 'A', 'a', 'o = a * 2.0;')
    _fill(st_wB, 'wB', 'B', 'b', small)
    _to_out(st_rB, 'rB', 'B', 'b', 'o = b + 1.0;')

    return sdfg
def _three_array_sdfg(name: str, d_size: int = 20,
                      b_size: int = 10, c_size: int = 8) -> SDFG:
    """Build a sequential SDFG that uses D, then B, then C.

        init -> write_D -> read_D -> write_B -> read_B -> write_C -> read_C -> done

    ``D``, ``B`` and ``C`` are transient float64 arrays of ``d_size``,
    ``b_size`` and ``c_size`` elements; ``out`` is a non-transient
    ``float64[1]``.  Each ``write_X`` state fills ``X[i] = (double)i`` and
    each ``read_X`` state copies ``X[0]`` into ``out[0]``.  The last state
    touching ``D`` (read_D) comes before the first states touching ``B`` and
    ``C`` in the chain.

    :param name: Name of the SDFG.
    :param d_size: Number of elements of ``D``.
    :param b_size: Number of elements of ``B``.
    :param c_size: Number of elements of ``C``.
    :returns: The constructed SDFG.
    """
    extents = (('D', d_size), ('B', b_size), ('C', c_size))

    sdfg = SDFG(name)
    for arr, extent in extents:
        sdfg.add_array(arr, [extent], dace.float64, transient=True)
    sdfg.add_array('out', [1], dace.float64, transient=False)

    labels = ['init']
    for arr, _ in extents:
        labels += [f'write_{arr}', f'read_{arr}']
    labels.append('done')

    states = []
    for idx, label in enumerate(labels):
        states.append(sdfg.add_state(label, is_start_block=(idx == 0)))
    for src, dst in zip(states, states[1:]):
        sdfg.add_edge(src, dst, dace.InterstateEdge())

    # states[1:-1] alternates write/read states in the same order as extents.
    inner = iter(states[1:-1])
    for arr, extent in extents:
        w_state = next(inner)
        r_state = next(inner)

        # write_X: X[i] = (double)i
        w_entry, w_exit = w_state.add_map(f'w{arr}', {'i': f'0:{extent}'})
        w_task = w_state.add_tasklet(f'w{arr}', {}, {'v'},
                                     'v = (double)i;', language=dace.Language.CPP)
        target = w_state.add_write(arr)
        w_state.add_edge(w_entry, None, w_task, None, dace.Memlet())
        w_state.add_memlet_path(w_task, w_exit, target, src_conn='v',
                                memlet=dace.Memlet(f'{arr}[i]'))

        # read_X: out[0] = X[0]
        r_entry, r_exit = r_state.add_map(f'r{arr}', {'i': '0:1'})
        r_task = r_state.add_tasklet(f'r{arr}', {'v'}, {'o'},
                                     'o = v;', language=dace.Language.CPP)
        source = r_state.add_read(arr)
        sink = r_state.add_write('out')
        r_state.add_memlet_path(source, r_entry, r_task, dst_conn='v',
                                memlet=dace.Memlet(f'{arr}[0]'))
        r_state.add_memlet_path(r_task, r_exit, sink, src_conn='o',
                                memlet=dace.Memlet('out[0]'))

    return sdfg
def test_end_to_end_correctness(self):
    """Run the two-size SDFG with and without cross-size reuse.

    The reuse pass must apply the pair ``('B', 'A')``, and both runs must
    produce the expected output.  The final state of the SDFG writes
    ``out[i] = B[i] + 1.0`` with ``B[i] = (double)i``, so the expected
    result is ``[1.0, 2.0, ..., small]``.
    """
    big, small = 32, 16
    sdfg_base = _two_size_sequential_sdfg('test_arena_e2e_base', big=big, small=small)
    sdfg_arena = _two_size_sequential_sdfg('test_arena_e2e_arena', big=big, small=small)

    applied = buffer_reuse_cross_pass(sdfg_arena, {}, {})
    assert ('B', 'A') in applied, f"expected (B,A); got {applied}"

    out_base = np.zeros(small, dtype=np.float64)
    out_arena = np.zeros(small, dtype=np.float64)
    sdfg_base(out=out_base)
    sdfg_arena(out=out_arena)

    # Ground truth: comparing the two runs only against each other would
    # still pass if both produced the same wrong values.
    expected = np.arange(small, dtype=np.float64) + 1.0
    assert np.array_equal(out_base, expected), (
        f"baseline wrong: expected={expected}, got={out_base}"
    )
    assert np.array_equal(out_base, out_arena), (
        f"diverged: base={out_base}, arena={out_arena}"
    )
After the pass both B and + C should have reuse entries pointing to D, with C at a non-zero offset.""" + sdfg = _three_array_sdfg('test_suballoc', d_size=20, b_size=10, c_size=8) + + applied = buffer_reuse_cross_pass(sdfg, {}, {}) + + consumers = {c for c, _ in applied} + assert 'B' in consumers, f"B not applied; applied={applied}" + assert 'C' in consumers, f"C not applied; applied={applied}" + + reuse_entries = [r for e in sdfg.all_interstate_edges(recursive=True) + for r in e.data.reuse] + b_entry = next((r for r in reuse_entries if r[0] == 'B'), None) + c_entry = next((r for r in reuse_entries if r[0] == 'C'), None) + + assert b_entry is not None, "no reuse entry for B" + assert c_entry is not None, "no reuse entry for C" + assert b_entry[1] == 'D', f"B should reuse D; got {b_entry[1]}" + assert c_entry[1] == 'D', f"C should reuse D; got {c_entry[1]}" + assert int(b_entry[2]) == 0, f"B offset should be 0; got {b_entry[2]}" + assert int(c_entry[2]) == 80, f"C offset should be 80 (B uses 10×8 B); got {c_entry[2]}" + + print("\n Applied pairs:", applied) + print(" Edge annotations:") + for e in sdfg.all_interstate_edges(recursive=True): + d = e.data + if d.alloc or d.free or d.reuse: + print(f" {e.src.label} -> {e.dst.label}:" + f" alloc={d.alloc} free={d.free} reuse={d.reuse}") diff --git a/tests/passes/buffer_reuse_same_ua_test.py b/tests/passes/buffer_reuse_same_ua_test.py new file mode 100644 index 0000000000..71a537cea8 --- /dev/null +++ b/tests/passes/buffer_reuse_same_ua_test.py @@ -0,0 +1,319 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +"""Tests for buffer_reuse_same_pass_ua — U/A-ratio variant of buffer_reuse_same_pass. + +Difference from buffer_reuse_same_pass: the effective free point of each array is +its last CATS DataAccessEvent rather than its DeallocationEvent. 
@dace.program
def _sobel(src: dace.float64[_N, _N], dst: dace.float64[_N, _N]):
    """Write the 3x3 Sobel gradient magnitude of ``src`` into ``dst``.

    Only interior elements (indices ``1`` to ``_N - 2`` in both dimensions)
    of ``dst`` are written; border elements are left untouched.
    """
    for i, j in dace.map[1:_N-1, 1:_N-1]:
        # Horizontal gradient: right column minus left column, weights 1/2/1.
        gx = (
            -1.0 * src[i-1, j-1] + 1.0 * src[i-1, j+1] +
            -2.0 * src[i, j-1] + 2.0 * src[i, j+1] +
            -1.0 * src[i+1, j-1] + 1.0 * src[i+1, j+1]
        )
        # Vertical gradient: bottom row minus top row, weights 1/2/1.
        gy = (
            -1.0 * src[i-1, j-1] + -2.0 * src[i-1, j] + -1.0 * src[i-1, j+1] +
            1.0 * src[i+1, j-1] + 2.0 * src[i+1, j] + 1.0 * src[i+1, j+1]
        )
        # Euclidean norm of the two gradient components.
        dst[i, j] = (gx * gx + gy * gy) ** 0.5
def _make_pipeline_sdfg(mode: str) -> dace.SDFG:
    """Return an image-pipeline SDFG in baseline, explicit, or reused mode.

    * ``'baseline'``: the SDFG of ``_pipeline`` as parsed (no simplification).
    * ``'explicit'``: additionally passes ``tmp1``, ``tmp2`` and ``tmp3`` to
      ``make_explicit``.
    * ``'reused'``: like ``'explicit'``, then applies
      ``_apply_reuse(new_arr='tmp3', donor_arr='tmp1')``.

    :param mode: One of ``'baseline'``, ``'explicit'``, ``'reused'``.
    :returns: The SDFG prepared for the requested mode.
    :raises ValueError: If *mode* is not one of the supported values.
    """
    # Reject bad input before parsing the program or mutating any SDFG.
    if mode not in ('baseline', 'explicit', 'reused'):
        raise ValueError(f"unknown mode: {mode!r}")

    sdfg = _pipeline.to_sdfg(simplify=False)
    if mode != 'baseline':
        make_explicit(sdfg, ['tmp1', 'tmp2', 'tmp3'])
    if mode == 'reused':
        _apply_reuse(sdfg, new_arr='tmp3', donor_arr='tmp1')
    return sdfg
+ """ + sdfg = SDFG(name) + sdfg.add_array('A', [n], dace.float64, transient=True) + sdfg.add_array('B', [n], dace.float64, transient=True) + sdfg.add_array('out', [n], dace.float64, transient=False) + + init = sdfg.add_state('init', is_start_block=True) + wA = sdfg.add_state('write_A') + rAwB = sdfg.add_state('read_A_write_B') + rB = sdfg.add_state('read_B') + done = sdfg.add_state('done') + sdfg.add_edge(init, wA, InterstateEdge()) + sdfg.add_edge(wA, rAwB, InterstateEdge()) + sdfg.add_edge(rAwB, rB, InterstateEdge()) + sdfg.add_edge(rB, done, InterstateEdge()) + + # write_A: A[i] = i + m1, x1 = wA.add_map('wA', {'i': f'0:{n}'}) + t1 = wA.add_tasklet('wA', {}, {'a'}, 'a = (double)i;', language=dace.Language.CPP) + aw = wA.add_write('A') + wA.add_edge(m1, None, t1, None, dace.Memlet()) + wA.add_memlet_path(t1, x1, aw, src_conn='a', memlet=dace.Memlet('A[i]')) + + # read_A_write_B: B[i] = A[i] * 2 + m2, x2 = rAwB.add_map('rAwB', {'i': f'0:{n}'}) + t2 = rAwB.add_tasklet('rAwB', {'a'}, {'b'}, 'b = a * 2.0;', language=dace.Language.CPP) + ar = rAwB.add_read('A') + bw = rAwB.add_write('B') + rAwB.add_memlet_path(ar, m2, t2, dst_conn='a', memlet=dace.Memlet('A[i]')) + rAwB.add_memlet_path(t2, x2, bw, src_conn='b', memlet=dace.Memlet('B[i]')) + + # read_B: out[i] = B[i] + 1 + m3, x3 = rB.add_map('rB', {'i': f'0:{n}'}) + t3 = rB.add_tasklet('rB', {'b'}, {'o'}, 'o = b + 1.0;', language=dace.Language.CPP) + br = rB.add_read('B') + ow = rB.add_write('out') + rB.add_memlet_path(br, m3, t3, dst_conn='b', memlet=dace.Memlet('B[i]')) + rB.add_memlet_path(t3, x3, ow, src_conn='o', memlet=dace.Memlet('out[i]')) + + return sdfg + + +# --------------------------------------------------------------------------- +# Pure-function tests for _extract_liveness (tightened access-window variant) +# --------------------------------------------------------------------------- + +class TestExtractLivenessUA: + + def test_returns_alloc_and_free_for_every_explicit_array(self): + sdfg = 
_sequential_two_array_sdfg('test_ua_sizes', n=16) + make_explicit(sdfg, ['A', 'B']) + events = _extract_liveness(sdfg, {}, {}) + allocs = [e for e in events if isinstance(e, _AllocEntry)] + frees = [e for e in events if isinstance(e, _FreeEntry)] + assert {a.array_name for a in allocs} == {'A', 'B'} + assert {f.array_name for f in frees} == {'A', 'B'} + + def test_free_of_A_precedes_alloc_of_B(self): + sdfg = _sequential_two_array_sdfg('test_ua_order', n=16) + make_explicit(sdfg, ['A', 'B']) + events = _extract_liveness(sdfg, {}, {}) + def _first(name, cls): + return next(i for i, e in enumerate(events) + if isinstance(e, cls) and e.array_name == name) + free_A = _first('A', _FreeEntry) + alloc_B = _first('B', _AllocEntry) + assert free_A < alloc_B, ( + f"Expected free(A) before alloc(B); events={events}" + ) + + +# --------------------------------------------------------------------------- +# Integration tests — compare UA vs greedy on scenarios where CATS misreports. +# --------------------------------------------------------------------------- + +class TestUAvsGreedy: + + def test_unified_liveness_finds_disjoint_pair(self): + """After unification, _extract_liveness already tightens windows so + the plain greedy scan finds the disjoint pair — no separate UA liveness + function needed.""" + sdfg = _sequential_two_array_sdfg('test_unified_finds_pair', n=16) + make_explicit(sdfg, ['A', 'B']) + pairs = _greedy_same_size_scan(_extract_liveness(sdfg, {}, {})) + assert pairs == [('B', 'A')], f"Expected [('B','A')], got {pairs}" + + def test_both_passes_find_tmp3_tmp1_in_pipeline(self): + """Both buffer_reuse_same_pass and buffer_reuse_same_pass_ua use tightened liveness + now, so both should find (tmp3, tmp1) in the image pipeline.""" + n = 32 + sdfg_g = _make_pipeline_sdfg('explicit') + greedy_pairs = buffer_reuse_same_pass(sdfg_g, {'N': n}, {'N': n}) + assert ('tmp3', 'tmp1') in set(greedy_pairs), ( + f"buffer_reuse_same_pass did not find (tmp3, tmp1); got 
{greedy_pairs}" + ) + + sdfg_u = _make_pipeline_sdfg('explicit') + ua_pairs = buffer_reuse_same_pass_ua(sdfg_u, {'N': n}, {'N': n}) + assert ('tmp3', 'tmp1') in set(ua_pairs), ( + f"buffer_reuse_same_pass_ua did not find (tmp3, tmp1); got {ua_pairs}" + ) + + +# --------------------------------------------------------------------------- +# Unit tests for _FreeEntry.ua_ratio field +# --------------------------------------------------------------------------- + +class TestFreeEntryRatio: + + def test_free_entry_accepts_ua_ratio(self): + fe = _FreeEntry('x', 100, dace.float64, ua_ratio=0.3) + assert fe.ua_ratio == 0.3 + + def test_free_entry_ua_ratio_defaults_to_one(self): + fe = _FreeEntry('y', 50, dace.float64) + assert fe.ua_ratio == 1.0 + + +# --------------------------------------------------------------------------- +# Unit tests for _ua_greedy_same_size_scan +# --------------------------------------------------------------------------- + +class TestUAGreedySameSize: + + def test_ua_picks_lowest_ratio_over_lifo(self): + """Two same-size donors: A freed first (ratio 0.1), B freed second (ratio 0.9). 
+ LIFO picks B (last freed); ua greedy picks A (lowest ratio).""" + events = [ + _FreeEntry('A', 100, dace.float64, ua_ratio=0.1), + _FreeEntry('B', 100, dace.float64, ua_ratio=0.9), + _AllocEntry('C', 100, dace.float64), + ] + lifo_pairs = _greedy_same_size_scan(events) + ua_pairs = _ua_greedy_same_size_scan(events) + assert lifo_pairs == [('C', 'B')], f"LIFO expected B: {lifo_pairs}" + assert ua_pairs == [('C', 'A')], f"UA expected A: {ua_pairs}" + + def test_ua_same_size_no_donor_available(self): + """No donors: no pairs produced.""" + events = [_AllocEntry('X', 100, dace.float64)] + assert _ua_greedy_same_size_scan(events) == [] + + def test_ua_same_size_single_donor(self): + """Single donor: same result as LIFO.""" + events = [ + _FreeEntry('A', 100, dace.float64, ua_ratio=0.5), + _AllocEntry('B', 100, dace.float64), + ] + assert _ua_greedy_same_size_scan(events) == [('B', 'A')] + + +# --------------------------------------------------------------------------- +# New unified liveness tests (TDD: written before implementation) +# --------------------------------------------------------------------------- + +class TestExtractLivenessUnified: + + def test_tightens_to_access_window(self): + """free(A) must precede alloc(B) — A's last access is before B's first.""" + sdfg = _sequential_two_array_sdfg('test_unified_tight', n=16) + make_explicit(sdfg, ['A', 'B']) + events = _extract_liveness(sdfg, {}, {}) + def _first(name, cls): + return next(i for i, e in enumerate(events) + if isinstance(e, cls) and e.array_name == name) + assert _first('A', _FreeEntry) < _first('B', _AllocEntry) + + def test_free_entries_carry_ua_ratio(self): + sdfg = _sequential_two_array_sdfg('test_unified_ratio', n=16) + make_explicit(sdfg, ['A', 'B']) + events = _extract_liveness(sdfg, {}, {}) + free_events = [e for e in events if isinstance(e, _FreeEntry)] + assert len(free_events) == 2 + for fe in free_events: + assert 0.0 <= fe.ua_ratio <= 1.0 + + +# 
--------------------------------------------------------------------------- +# Wiring test: buffer_reuse_same_pass_ua uses _ua_greedy_same_size_scan +# --------------------------------------------------------------------------- + +class TestUAPassWiring: + + def test_ua_end_to_end_pipeline_correct(self): + """Run baseline and UA on the image pipeline; outputs must match.""" + n = 32 + rng = np.random.default_rng(7) + img = rng.random((n, n)) + + out_base = np.zeros((n, n), dtype=np.float64) + sdfg_base = _make_pipeline_sdfg('baseline') + sdfg_base(img=img, out=out_base, N=n) + + out_ua = np.zeros((n, n), dtype=np.float64) + sdfg_ua = _make_pipeline_sdfg('explicit') + applied = buffer_reuse_same_pass_ua(sdfg_ua, {'N': n}, {'N': n}) + sdfg_ua(img=img, out=out_ua, N=n) + + assert len(applied) >= 1, f"UA applied no pairs" + assert np.array_equal(out_base, out_ua), ( + f"UA output diverges; max diff={np.abs(out_base - out_ua).max()}" + ) diff --git a/tests/passes/explicit_alloc_edge_cases_test.py b/tests/passes/explicit_alloc_edge_cases_test.py new file mode 100644 index 0000000000..0ef96e2470 --- /dev/null +++ b/tests/passes/explicit_alloc_edge_cases_test.py @@ -0,0 +1,435 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +"""Production-readiness edge-case tests for make_explicit() and the +AllocationLifetime.Explicit codegen pipeline. 
+ +Covers scenarios not present in the main test suite: + - Idempotency: calling make_explicit twice produces no duplicates + - Already-Explicit array (pre-set lifetime) is accepted without error + - 0-dimensional (scalar-like) array with shape=() + - Array used in only one state that is both first AND last use + - SDFG with branch: array only on one branch + - Array in a loop body (LoopRegion) is allocated before/freed after the loop + - Codegen helper _generate_explicit_alloc_free emits correct C++ fragments + - InterstateEdge alloc/free serialisation round-trip (to_json / from_json) + - validation._validate_interstate_edge_explicit_alloc rejects bad states +""" + +import re +import json +import pytest +import dace +from dace import dtypes +from dace.sdfg import SDFG, SDFGState, InterstateEdge +from dace.libraries.allocation import make_explicit +from dace.libraries.allocation.make_explicit import ( + _blocks_using_in as _blocks_using, + _top_level_block_in as _top_level_block, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _simple_sdfg(array_name='buf', shape=(10,), dtype=dace.float64): + """Three-state SDFG: init → use → done. 
'use' contains an access node.""" + sdfg = dace.SDFG('simple') + sdfg.add_array(array_name, shape, dtype, transient=True) + init = sdfg.add_state('init') + use = sdfg.add_state('use') + done = sdfg.add_state('done') + e0 = sdfg.add_edge(init, use, InterstateEdge()) + e1 = sdfg.add_edge(use, done, InterstateEdge()) + t = use.add_tasklet('fill', {}, {'out'}, 'out = 1.0') + use.add_edge(t, 'out', use.add_write(array_name), None, + dace.Memlet(f'{array_name}[0]')) + return sdfg, init, use, done, e0, e1 + + +def _get_cpp(sdfg: SDFG) -> str: + return sdfg.generate_code()[0].clean_code + + +# --------------------------------------------------------------------------- +# Idempotency +# --------------------------------------------------------------------------- + +class TestIdempotency: + + def test_make_explicit_twice_no_duplicate_alloc(self): + """Calling make_explicit twice must not add the name twice.""" + sdfg, _, _, _, e0, e1 = _simple_sdfg() + make_explicit(sdfg, ['buf']) + make_explicit(sdfg, ['buf']) + assert e0.data.alloc.count('buf') == 1 + + def test_make_explicit_twice_no_duplicate_free(self): + sdfg, _, _, _, e0, e1 = _simple_sdfg() + make_explicit(sdfg, ['buf']) + make_explicit(sdfg, ['buf']) + assert e1.data.free.count('buf') == 1 + + def test_already_explicit_lifetime_accepted(self): + """An array whose lifetime is already Explicit must be processed + without error; the edge annotation must still be added if missing.""" + sdfg, _, _, _, e0, e1 = _simple_sdfg() + sdfg.arrays['buf'].lifetime = dtypes.AllocationLifetime.Explicit + # Should not raise even though lifetime is pre-set + make_explicit(sdfg, ['buf']) + assert 'buf' in e0.data.alloc + assert 'buf' in e1.data.free + + def test_make_explicit_empty_list_is_noop(self): + """An empty array_names list must leave the SDFG unchanged.""" + sdfg, _, _, _, e0, e1 = _simple_sdfg() + original_state_count = len(sdfg.states()) + make_explicit(sdfg, []) + assert len(sdfg.states()) == original_state_count + assert 
e0.data.alloc == [] + assert e1.data.free == [] + + +# --------------------------------------------------------------------------- +# Scalar / 0-dimensional / unusual shapes +# --------------------------------------------------------------------------- + +class TestUnusualShapes: + + def test_scalar_array_shape_1(self): + """Shape (1,) raises ValueError — DaCe stores size-1 transients as scalars, not pointers.""" + sdfg, _, _, _, e0, e1 = _simple_sdfg('s', shape=(1,)) + with pytest.raises(ValueError, match='total_size=1'): + make_explicit(sdfg, ['s']) + + def test_multidim_shape(self): + """3-D array: shape expression uses all three dimensions.""" + sdfg = dace.SDFG('multi3d') + sdfg.add_array('arr', (4, 8, 16), dace.float32, transient=True) + init = sdfg.add_state('init') + use = sdfg.add_state('use') + done = sdfg.add_state('done') + sdfg.add_edge(init, use, InterstateEdge()) + sdfg.add_edge(use, done, InterstateEdge()) + t = use.add_tasklet('fill', {}, {'out'}, 'out = 0.0') + use.add_edge(t, 'out', use.add_write('arr'), None, dace.Memlet('arr[0,0,0]')) + make_explicit(sdfg, ['arr']) + cpp = _get_cpp(sdfg) + # Shape must be 4 * 8 * 16 = 512 or expressed as 4 * 8 * 16 + assert 'new float' in cpp + assert '512' in cpp or ('4' in cpp and '8' in cpp and '16' in cpp) + + def test_symbolic_shape_n_m(self): + """Two-symbol shape N×M must appear as a product in C++.""" + sdfg = dace.SDFG('sym2d') + N = dace.symbol('N') + M = dace.symbol('M') + sdfg.add_symbol('N', dace.int64) + sdfg.add_symbol('M', dace.int64) + sdfg.add_array('mat', (N, M), dace.float64, transient=True) + init = sdfg.add_state('init') + use = sdfg.add_state('use') + done = sdfg.add_state('done') + sdfg.add_edge(init, use, InterstateEdge()) + sdfg.add_edge(use, done, InterstateEdge()) + t = use.add_tasklet('fill', {}, {'out'}, 'out = 0.0') + use.add_edge(t, 'out', use.add_write('mat'), None, dace.Memlet('mat[0,0]')) + make_explicit(sdfg, ['mat']) + cpp = _get_cpp(sdfg) + assert 'new double' in cpp + # 
Both symbols must appear in the size expression + assert 'N' in cpp and 'M' in cpp + + +# --------------------------------------------------------------------------- +# Single-state that is both first AND last use +# --------------------------------------------------------------------------- + +class TestSingleUseState: + + def test_alloc_and_free_on_same_state_edges(self): + """When first_state == last_state, alloc goes on incoming edge and + free on outgoing edge of the SAME state.""" + sdfg, init, use, done, e0, e1 = _simple_sdfg() + make_explicit(sdfg, ['buf']) + # e0 is incoming to 'use', e1 is outgoing from 'use' + assert 'buf' in e0.data.alloc + assert 'buf' in e1.data.free + # Alloc must NOT be on e1, free must NOT be on e0 + assert 'buf' not in e1.data.alloc + assert 'buf' not in e0.data.free + + def test_validates_after_single_use_state(self): + sdfg, *_ = _simple_sdfg() + make_explicit(sdfg, ['buf']) + sdfg.validate() # must not raise + + +# --------------------------------------------------------------------------- +# Branching SDFG +# --------------------------------------------------------------------------- + +class TestBranchingSdfg: + + def _make_branch_sdfg(self): + """SDFG with two branches: + init → branch_a → merge + → branch_b → merge + 'buf' is only used in branch_a. 
+ """ + sdfg = dace.SDFG('branch') + sdfg.add_array('buf', (10,), dace.float64, transient=True) + init = sdfg.add_state('init', is_start_block=True) + branch_a = sdfg.add_state('branch_a') + branch_b = sdfg.add_state('branch_b') + merge = sdfg.add_state('merge') + + e_a = sdfg.add_edge(init, branch_a, InterstateEdge()) + e_b = sdfg.add_edge(init, branch_b, InterstateEdge()) + e_am = sdfg.add_edge(branch_a, merge, InterstateEdge()) + e_bm = sdfg.add_edge(branch_b, merge, InterstateEdge()) + + t = branch_a.add_tasklet('fill', {}, {'out'}, 'out = 0.0') + branch_a.add_edge(t, 'out', branch_a.add_write('buf'), None, + dace.Memlet('buf[0]')) + + return sdfg, e_a, e_b, e_am, e_bm + + def test_alloc_on_incoming_edge_of_branch(self): + sdfg, e_a, e_b, e_am, e_bm = self._make_branch_sdfg() + make_explicit(sdfg, ['buf']) + # buf is first used in branch_a; its incoming edge from init is e_a + assert 'buf' in e_a.data.alloc + # The other branch edge must NOT carry the alloc + assert 'buf' not in e_b.data.alloc + + def test_free_on_outgoing_edge_of_branch(self): + sdfg, e_a, e_b, e_am, e_bm = self._make_branch_sdfg() + make_explicit(sdfg, ['buf']) + # Last use is also branch_a; its outgoing edge to merge is e_am + assert 'buf' in e_am.data.free + assert 'buf' not in e_bm.data.free + + def test_branch_sdfg_validates(self): + sdfg, *_ = self._make_branch_sdfg() + make_explicit(sdfg, ['buf']) + sdfg.validate() + + +# --------------------------------------------------------------------------- +# _blocks_using helper — top-level block resolution +# --------------------------------------------------------------------------- + +class TestBlocksUsing: + + def test_blocks_using_returns_direct_states(self): + """For a flat SDFG, _blocks_using returns SDFGState objects.""" + sdfg, init, use, done, e0, e1 = _simple_sdfg() + blocks = _blocks_using(sdfg, 'buf') + assert use in blocks + assert len(blocks) == 1 + + def test_blocks_using_empty_for_unused_array(self): + """An array declared but 
never accessed returns an empty list.""" + sdfg = dace.SDFG('empty') + sdfg.add_array('unused', (5,), dace.float32, transient=True) + sdfg.add_state('s0') + assert _blocks_using(sdfg, 'unused') == [] + + def test_blocks_using_topological_order(self): + """First element must be the topologically earlier state.""" + sdfg, init, use, done, e0, e1 = _simple_sdfg() + # add a second access node in 'done' + t2 = done.add_tasklet('read', {'inp'}, {}, '') + done.add_edge(done.add_read('buf'), None, t2, 'inp', + dace.Memlet('buf[0]')) + blocks = _blocks_using(sdfg, 'buf') + assert blocks[0] is use + assert blocks[-1] is done + + +# --------------------------------------------------------------------------- +# _generate_explicit_alloc_free helper +# --------------------------------------------------------------------------- + +class TestGenerateExplicitAllocFree: + + def test_alloc_fragment_contains_new(self): + """Edge with non-empty alloc list must produce new[] statement.""" + from dace.codegen.control_flow import _generate_explicit_alloc_free + from dace.sdfg.graph import Edge + + sdfg = dace.SDFG('frag') + sdfg.add_array('x', (16,), dace.float32, transient=True) + sdfg.arrays['x'].lifetime = dtypes.AllocationLifetime.Explicit + + init = sdfg.add_state('i') + use = sdfg.add_state('u') + edge = sdfg.add_edge(init, use, InterstateEdge(alloc=['x'])) + + result = _generate_explicit_alloc_free(edge, sdfg) + assert 'new float[16]' in result + assert f'__state->__{sdfg.cfg_id}_x' in result + + def test_free_fragment_contains_delete(self): + from dace.codegen.control_flow import _generate_explicit_alloc_free + + sdfg = dace.SDFG('frag2') + sdfg.add_array('x', (4,), dace.float64, transient=True) + sdfg.arrays['x'].lifetime = dtypes.AllocationLifetime.Explicit + + use = sdfg.add_state('u') + done = sdfg.add_state('d') + edge = sdfg.add_edge(use, done, InterstateEdge(free=['x'])) + + result = _generate_explicit_alloc_free(edge, sdfg) + assert f'delete[] __state->__{sdfg.cfg_id}_x' 
in result + + def test_empty_alloc_free_returns_empty_string(self): + from dace.codegen.control_flow import _generate_explicit_alloc_free + + sdfg = dace.SDFG('frag3') + s0 = sdfg.add_state('s0') + s1 = sdfg.add_state('s1') + edge = sdfg.add_edge(s0, s1, InterstateEdge()) + + assert _generate_explicit_alloc_free(edge, sdfg) == '' + + def test_symbolic_shape_in_alloc_fragment(self): + from dace.codegen.control_flow import _generate_explicit_alloc_free + + sdfg = dace.SDFG('sym_frag') + N = dace.symbol('N') + sdfg.add_symbol('N', dace.int64) + sdfg.add_array('v', (N,), dace.float64, transient=True) + sdfg.arrays['v'].lifetime = dtypes.AllocationLifetime.Explicit + + s0 = sdfg.add_state('s0') + s1 = sdfg.add_state('s1') + edge = sdfg.add_edge(s0, s1, InterstateEdge(alloc=['v'])) + + result = _generate_explicit_alloc_free(edge, sdfg) + assert 'new double[N]' in result + + +# --------------------------------------------------------------------------- +# InterstateEdge serialisation round-trip +# --------------------------------------------------------------------------- + +class TestInterstateEdgeSerialisation: + + def test_alloc_free_survive_json_roundtrip(self): + """alloc / free lists must be preserved through to_json / from_json.""" + edge = InterstateEdge(alloc=['a', 'b'], free=['c']) + as_json = edge.to_json() + restored = InterstateEdge.from_json(as_json) + assert restored.alloc == ['a', 'b'] + assert restored.free == ['c'] + + def test_empty_alloc_free_json_roundtrip(self): + edge = InterstateEdge() + as_json = edge.to_json() + restored = InterstateEdge.from_json(as_json) + assert restored.alloc == [] + assert restored.free == [] + + def test_sdfg_json_roundtrip_preserves_alloc_free(self): + """Full SDFG serialisation must preserve alloc/free on edges.""" + sdfg, _, _, _, e0, e1 = _simple_sdfg() + make_explicit(sdfg, ['buf']) + + as_json = sdfg.to_json() + restored = SDFG.from_json(as_json) + + restored_edges = restored.edges() + alloc_edges = [e for e in 
restored_edges if e.data.alloc] + free_edges = [e for e in restored_edges if e.data.free] + + assert any('buf' in e.data.alloc for e in alloc_edges) + assert any('buf' in e.data.free for e in free_edges) + + +# --------------------------------------------------------------------------- +# Validation error messages +# --------------------------------------------------------------------------- + +class TestValidationErrorMessages: + + def test_nonexistent_array_on_alloc_edge_raises(self): + sdfg, init, use, done, e0, e1 = _simple_sdfg() + sdfg.arrays['buf'].lifetime = dtypes.AllocationLifetime.Explicit + e0.data.alloc.append('DOES_NOT_EXIST') + with pytest.raises(Exception, match='non-existent'): + sdfg.validate() + + def test_wrong_lifetime_on_alloc_edge_raises(self): + sdfg, init, use, done, e0, e1 = _simple_sdfg() + # lifetime stays Scope (not Explicit) + e0.data.alloc.append('buf') + with pytest.raises(Exception): + sdfg.validate() + + def test_wrong_lifetime_on_free_edge_raises(self): + sdfg, init, use, done, e0, e1 = _simple_sdfg() + e1.data.free.append('buf') + with pytest.raises(Exception): + sdfg.validate() + + def test_nonexistent_array_on_free_edge_raises(self): + sdfg, init, use, done, e0, e1 = _simple_sdfg() + sdfg.arrays['buf'].lifetime = dtypes.AllocationLifetime.Explicit + e1.data.free.append('ALSO_DOES_NOT_EXIST') + with pytest.raises(Exception, match='non-existent'): + sdfg.validate() + + +# --------------------------------------------------------------------------- +# Codegen integration: placement in output C++ +# --------------------------------------------------------------------------- + +class TestCodegenPlacement: + + def test_alloc_appears_before_tasklet_body(self): + """new[] must appear before the tasklet code that uses the buffer.""" + sdfg, init, use, done, e0, e1 = _simple_sdfg() + make_explicit(sdfg, ['buf']) + cpp = _get_cpp(sdfg) + alloc_pos = cpp.find('new double') + tasklet_pos = cpp.find('out = 1.0') # from _simple_sdfg's tasklet 
+ assert alloc_pos != -1, "new double not found in generated C++" + assert tasklet_pos != -1, "tasklet body not found in generated C++" + assert alloc_pos < tasklet_pos, ( + "new[] must appear before the tasklet that uses the buffer") + + def test_free_appears_after_tasklet_body(self): + """delete[] must come after the tasklet that uses the buffer.""" + sdfg, init, use, done, e0, e1 = _simple_sdfg() + make_explicit(sdfg, ['buf']) + cpp = _get_cpp(sdfg) + free_pos = cpp.find('delete[]') + tasklet_pos = cpp.find('out = 1.0') + assert free_pos != -1, "delete[] not found in generated C++" + assert tasklet_pos != -1, "tasklet body not found in generated C++" + assert free_pos > tasklet_pos, ( + "delete[] must appear after the tasklet that uses the buffer") + + def test_multiple_arrays_all_allocated_in_cpp(self): + """All arrays in a batch must be allocated in the generated C++.""" + sdfg = dace.SDFG('multi') + for name in ('a', 'b', 'c'): + sdfg.add_array(name, (4,), dace.float64, transient=True) + + init = sdfg.add_state('init') + use = sdfg.add_state('use') + done = sdfg.add_state('done') + sdfg.add_edge(init, use, InterstateEdge()) + sdfg.add_edge(use, done, InterstateEdge()) + + for name in ('a', 'b', 'c'): + t = use.add_tasklet(f't_{name}', {}, {'out'}, 'out = 0.0') + use.add_edge(t, 'out', use.add_write(name), None, + dace.Memlet(f'{name}[0]')) + + make_explicit(sdfg, ['a', 'b', 'c']) + cpp = _get_cpp(sdfg) + assert cpp.count('new double') == 3 + assert cpp.count('delete[]') == 3 diff --git a/tests/passes/explicit_alloc_interstate_test.py b/tests/passes/explicit_alloc_interstate_test.py new file mode 100644 index 0000000000..1ffb4baa48 --- /dev/null +++ b/tests/passes/explicit_alloc_interstate_test.py @@ -0,0 +1,426 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +"""Tests for make_explicit() — converting transient arrays to AllocationLifetime.Explicit +with alloc/free annotations on interstate edges. 
+ +Test coverage: + - Single-state array: alloc on incoming edge, free on outgoing edge + - Multi-state array (SDFG-scope): alloc on incoming edge of first-use state, + free on outgoing edge of last-use state + - Start-state array (no incoming edges): thin predecessor state inserted + - Sink-state array (no outgoing edges): thin successor state inserted + - Multiple arrays sharing the same first/last states batch onto the same edges + - Lifetime is set to Explicit + - Error cases: non-existent array, non-transient array + - Code generation: new[]/delete[] appear on the correct edges in the output C++ + - Validation: SDFG validates after make_explicit +""" + +import re + +import pytest +import dace +from dace import dtypes +from dace.sdfg import SDFG, SDFGState, InterstateEdge +from dace.sdfg.state import LoopRegion +from dace.libraries.allocation import make_explicit + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_simple_sdfg(array_name='buf', shape=(10,), dtype=dace.float64): + """Three-state SDFG: init --[e0]--> use --[e1]--> done + The 'use' state contains one access node for *array_name*. 
+ """ + sdfg = dace.SDFG('simple') + sdfg.add_array(array_name, shape, dtype, transient=True) + + init = sdfg.add_state('init') + use = sdfg.add_state('use') + done = sdfg.add_state('done') + + e0 = sdfg.add_edge(init, use, InterstateEdge()) + e1 = sdfg.add_edge(use, done, InterstateEdge()) + + # Minimal tasklet that writes into buf so the access node is present + t = use.add_tasklet('fill', {}, {'out'}, 'out = 1.0') + use.add_edge(t, 'out', use.add_write(array_name), None, + dace.Memlet(f'{array_name}[0]')) + return sdfg, init, use, done, e0, e1 + + +def _get_cpp(sdfg): + """Return the generated CPU C++ as a string.""" + codes = sdfg.generate_code() + return codes[0].clean_code + + +# --------------------------------------------------------------------------- +# Lifetime tests +# --------------------------------------------------------------------------- + +def test_lifetime_set_to_explicit(): + sdfg, *_ = _make_simple_sdfg() + make_explicit(sdfg, ['buf']) + assert sdfg.arrays['buf'].lifetime is dtypes.AllocationLifetime.Explicit + + +def test_error_nonexistent_array(): + sdfg, *_ = _make_simple_sdfg() + with pytest.raises(ValueError, match='not found'): + make_explicit(sdfg, ['does_not_exist']) + + +def test_error_non_transient(): + sdfg, *_ = _make_simple_sdfg() + sdfg.arrays['buf'].transient = False + with pytest.raises(ValueError, match='not a transient'): + make_explicit(sdfg, ['buf']) + + +# --------------------------------------------------------------------------- +# Edge annotation tests (single-state array) +# --------------------------------------------------------------------------- + +def test_alloc_on_incoming_edge(): + sdfg, init, use, done, e0, e1 = _make_simple_sdfg() + make_explicit(sdfg, ['buf']) + # 'buf' is first (and only) used in 'use'; its alloc must be on e0 (init→use) + assert 'buf' in e0.data.alloc + assert 'buf' not in e1.data.alloc + + +def test_free_on_outgoing_edge(): + sdfg, init, use, done, e0, e1 = _make_simple_sdfg() + 
make_explicit(sdfg, ['buf']) + # 'buf' is last used in 'use'; its free must be on e1 (use→done) + assert 'buf' in e1.data.free + assert 'buf' not in e0.data.free + + +def test_no_duplicate_on_double_call(): + """Calling make_explicit twice must not add the name twice to the same edge.""" + sdfg, init, use, done, e0, e1 = _make_simple_sdfg() + make_explicit(sdfg, ['buf']) + make_explicit(sdfg, ['buf']) + assert e0.data.alloc.count('buf') == 1 + assert e1.data.free.count('buf') == 1 + + +# --------------------------------------------------------------------------- +# Edge annotation tests (multi-state array) +# --------------------------------------------------------------------------- + +def test_multistate_alloc_on_first_use_edge(): + """Array used in both 'use1' and 'use2': init→use1→use2→done + Alloc must be on init→use1, free on use2→done. + """ + sdfg = dace.SDFG('multi') + sdfg.add_array('buf', (10,), dace.float64, transient=True) + + init = sdfg.add_state('init') + use1 = sdfg.add_state('use1') + use2 = sdfg.add_state('use2') + done = sdfg.add_state('done') + + e0 = sdfg.add_edge(init, use1, InterstateEdge()) + e1 = sdfg.add_edge(use1, use2, InterstateEdge()) + e2 = sdfg.add_edge(use2, done, InterstateEdge()) + + for st in (use1, use2): + t = st.add_tasklet('t', {}, {'out'}, 'out = 0.0') + st.add_edge(t, 'out', st.add_write('buf'), None, dace.Memlet('buf[0]')) + + make_explicit(sdfg, ['buf']) + + assert 'buf' in e0.data.alloc # before first use + assert 'buf' not in e1.data.alloc + assert 'buf' in e2.data.free # after last use + assert 'buf' not in e1.data.free + + +# --------------------------------------------------------------------------- +# Start-state / sink-state edge cases +# --------------------------------------------------------------------------- + +def test_start_state_inserts_predecessor(): + """Array first used in the SDFG start state (no incoming edges). + make_explicit must insert a thin predecessor state. 
+ """ + sdfg = dace.SDFG('startstate') + sdfg.add_array('buf', (5,), dace.float32, transient=True) + + start = sdfg.add_state('start', is_start_block=True) + end = sdfg.add_state('end') + sdfg.add_edge(start, end, InterstateEdge()) + + t = start.add_tasklet('t', {}, {'out'}, 'out = 0.0') + start.add_edge(t, 'out', start.add_write('buf'), None, dace.Memlet('buf[0]')) + + make_explicit(sdfg, ['buf']) + + # A new predecessor state must have been inserted before 'start' + preds = sdfg.in_edges(start) + assert len(preds) == 1 + alloc_edge = preds[0] + assert 'buf' in alloc_edge.data.alloc + + +def test_sink_state_inserts_successor(): + """Array last used in a sink state (no outgoing edges). + make_explicit must insert a thin successor state. + """ + sdfg = dace.SDFG('sinkstate') + sdfg.add_array('buf', (5,), dace.float32, transient=True) + + init = sdfg.add_state('init', is_start_block=True) + sink = sdfg.add_state('sink') + sdfg.add_edge(init, sink, InterstateEdge()) + + t = sink.add_tasklet('t', {}, {'out'}, 'out = 0.0') + sink.add_edge(t, 'out', sink.add_write('buf'), None, dace.Memlet('buf[0]')) + + make_explicit(sdfg, ['buf']) + + succs = sdfg.out_edges(sink) + assert len(succs) == 1 + free_edge = succs[0] + assert 'buf' in free_edge.data.free + + +# --------------------------------------------------------------------------- +# Loop-scoped arrays: semantics preservation +# --------------------------------------------------------------------------- + +def _make_loop_sdfg(lifetime=dtypes.AllocationLifetime.Scope): + """SDFG with a single LoopRegion; 'buf' is used only inside the loop.""" + sdfg = dace.SDFG('loop_scoped') + sdfg.add_array('buf', (10,), dace.float32, transient=True, lifetime=lifetime) + sdfg.add_array('out', (10,), dace.float32, transient=False) + + loop = LoopRegion('loop', 'i < 10', 'i', 'i = 0', 'i = i + 1') + sdfg.add_node(loop, is_start_block=True) + + body = loop.add_state('body', is_start_block=True) + t = body.add_tasklet('compute', {}, {'a'}, 
'a = 42.0') + body.add_edge(t, 'a', body.add_access('buf'), None, dace.Memlet('buf[i]')) + body.add_edge(body.add_access('buf'), None, body.add_access('out'), None, + dace.Memlet('out[i]')) + + exit_state = sdfg.add_state('exit') + sdfg.add_edge(loop, exit_state, dace.InterstateEdge()) + return sdfg, loop + + +def test_loop_scope_alloc_inside_loop(): + """Scope-lifetime array used only in a loop: alloc/free must go on edges + *inside* the LoopRegion, not on the SDFG-level edges surrounding the loop. + """ + sdfg, loop = _make_loop_sdfg(dtypes.AllocationLifetime.Scope) + make_explicit(sdfg, ['buf']) + + # No alloc/free on the SDFG-level edges + for edge in sdfg.edges(): + assert 'buf' not in edge.data.alloc, "alloc must not appear on SDFG-level edge" + assert 'buf' not in edge.data.free, "free must not appear on SDFG-level edge" + + # alloc/free must be on edges inside the LoopRegion + loop_alloc = any('buf' in e.data.alloc for e in loop.edges()) + loop_free = any('buf' in e.data.free for e in loop.edges()) + assert loop_alloc, "alloc must be on an edge inside the LoopRegion" + assert loop_free, "free must be on an edge inside the LoopRegion" + + +def test_loop_sdfg_lifetime_alloc_outside_loop(): + """SDFG-lifetime array used only in a loop: alloc/free must be placed at + the SDFG level (hoisted), since the declared scope is the whole SDFG. 
+ """ + sdfg, loop = _make_loop_sdfg(dtypes.AllocationLifetime.SDFG) + make_explicit(sdfg, ['buf']) + + sdfg_alloc = any('buf' in e.data.alloc for e in sdfg.edges()) + sdfg_free = any('buf' in e.data.free for e in sdfg.edges()) + assert sdfg_alloc, "SDFG-lifetime alloc must be on a SDFG-level edge" + assert sdfg_free, "SDFG-lifetime free must be on a SDFG-level edge" + + +def test_loop_scope_validates(): + sdfg, _ = _make_loop_sdfg(dtypes.AllocationLifetime.Scope) + make_explicit(sdfg, ['buf']) + sdfg.validate() + + +# --------------------------------------------------------------------------- +# Multiple arrays batch onto the same edges +# --------------------------------------------------------------------------- + +def test_multiple_arrays_same_edges(): + """Two arrays with the same first/last state end up on the same edges.""" + sdfg = dace.SDFG('multi_arr') + sdfg.add_array('a', (4,), dace.float64, transient=True) + sdfg.add_array('b', (4,), dace.float64, transient=True) + + init = sdfg.add_state('init') + use = sdfg.add_state('use') + done = sdfg.add_state('done') + e0 = sdfg.add_edge(init, use, InterstateEdge()) + e1 = sdfg.add_edge(use, done, InterstateEdge()) + + for arr in ('a', 'b'): + t = use.add_tasklet(f't_{arr}', {}, {'out'}, 'out = 0.0') + use.add_edge(t, 'out', use.add_write(arr), None, dace.Memlet(f'{arr}[0]')) + + make_explicit(sdfg, ['a', 'b']) + + assert 'a' in e0.data.alloc and 'b' in e0.data.alloc + assert 'a' in e1.data.free and 'b' in e1.data.free + + +# --------------------------------------------------------------------------- +# Code generation +# --------------------------------------------------------------------------- + +def test_codegen_new_in_edge_code(): + sdfg, init, use, done, e0, e1 = _make_simple_sdfg('buf', (10,), dace.float64) + make_explicit(sdfg, ['buf']) + cpp = _get_cpp(sdfg) + assert 'new double' in cpp, "Expected heap allocation in generated C++" + + +def test_codegen_delete_in_edge_code(): + sdfg, init, use, done, e0, 
e1 = _make_simple_sdfg('buf', (10,), dace.float64) + make_explicit(sdfg, ['buf']) + cpp = _get_cpp(sdfg) + assert 'delete[]' in cpp, "Expected deallocation in generated C++" + + +def test_codegen_no_auto_alloc(): + """With Explicit lifetime, the normal auto new[] at function scope must not appear.""" + sdfg, init, use, done, e0, e1 = _make_simple_sdfg('buf', (10,), dace.float64) + make_explicit(sdfg, ['buf']) + cpp = _get_cpp(sdfg) + # The auto-alloc path would declare `buf` as a local variable and assign it + # with `new` at function scope. With Explicit lifetime the pointer lives in + # the state struct, so no function-local `double *buf = new ...` should appear. + assert 'double *buf' not in cpp, "Auto-alloc declaration should not appear" + + +def test_codegen_state_struct_declaration(): + """Array pointer must be declared inside the state struct.""" + sdfg, init, use, done, e0, e1 = _make_simple_sdfg('buf', (10,), dace.float64) + make_explicit(sdfg, ['buf']) + cpp = _get_cpp(sdfg) + # State struct contains __{cfg_id}_buf + assert '__0_buf' in cpp or f'__{sdfg.cfg_id}_buf' in cpp + + +def test_codegen_symbolic_shape(): + """Symbolic shape N must be emitted as the symbol name, not a literal.""" + sdfg = dace.SDFG('sym_shape') + N = dace.symbol('N') + sdfg.add_symbol('N', dace.int64) + sdfg.add_array('buf', (N,), dace.float64, transient=True) + + init = sdfg.add_state('init') + use = sdfg.add_state('use') + done = sdfg.add_state('done') + sdfg.add_edge(init, use, InterstateEdge()) + sdfg.add_edge(use, done, InterstateEdge()) + + t = use.add_tasklet('t', {}, {'out'}, 'out = 1.0') + use.add_edge(t, 'out', use.add_write('buf'), None, dace.Memlet('buf[0]')) + + make_explicit(sdfg, ['buf']) + cpp = _get_cpp(sdfg) + assert 'new double' in cpp + assert '[N]' in cpp or 'N]' in cpp, "Symbolic size N must appear in allocation" + + +def test_codegen_alloc_before_free_ordering(): + """new[] must appear before delete[] in the generated output.""" + sdfg, init, use, done, e0, e1 = 
_make_simple_sdfg('buf', (10,), dace.float64) + make_explicit(sdfg, ['buf']) + cpp = _get_cpp(sdfg) + alloc_pos = cpp.find('new double') + free_pos = cpp.find('delete[]') + assert alloc_pos != -1 and free_pos != -1 + assert alloc_pos < free_pos, "new[] must precede delete[]" + + +# --------------------------------------------------------------------------- +# Validation +# --------------------------------------------------------------------------- + +def test_sdfg_validates_after_make_explicit(): + sdfg, *_ = _make_simple_sdfg() + make_explicit(sdfg, ['buf']) + sdfg.validate() # must not raise + + +def test_validation_rejects_non_explicit_on_alloc_edge(): + """Manually annotating an edge with a non-Explicit array must fail validation.""" + sdfg, init, use, done, e0, e1 = _make_simple_sdfg() + # Do NOT call make_explicit — lifetime stays Scope + e0.data.alloc.append('buf') + with pytest.raises(Exception): + sdfg.validate() + + +def test_validation_rejects_non_explicit_on_free_edge(): + sdfg, init, use, done, e0, e1 = _make_simple_sdfg() + e1.data.free.append('buf') + with pytest.raises(Exception): + sdfg.validate() + + +def test_validation_rejects_nonexistent_array_on_edge(): + sdfg, init, use, done, e0, e1 = _make_simple_sdfg() + sdfg.arrays['buf'].lifetime = dtypes.AllocationLifetime.Explicit + e0.data.alloc.append('DOES_NOT_EXIST') + with pytest.raises(Exception): + sdfg.validate() + + +# --------------------------------------------------------------------------- +# Size-1 array limitation +# --------------------------------------------------------------------------- + +def test_error_size_1_array(): + """Arrays with exactly one element must raise ValueError.""" + sdfg = dace.SDFG('size1') + sdfg.add_array('scalar', (1,), dace.float64, transient=True) + init = sdfg.add_state('init', is_start_block=True) + use = sdfg.add_state('use') + done = sdfg.add_state('done') + sdfg.add_edge(init, use, InterstateEdge()) + sdfg.add_edge(use, done, InterstateEdge()) + t = 
use.add_tasklet('t', {}, {'out'}, 'out = 1.0') + use.add_edge(t, 'out', use.add_write('scalar'), None, dace.Memlet('scalar[0]')) + with pytest.raises(ValueError, match='total_size=1'): + make_explicit(sdfg, ['scalar']) + + +def test_unused_array_lifetime_set_no_edges_annotated(): + """Array with no access nodes anywhere: lifetime becomes Explicit but no + edges are annotated (the all_states list is empty, so we continue).""" + sdfg, init, use, done, e0, e1 = _make_simple_sdfg() + sdfg.add_array('unused', (10,), dace.float64, transient=True) + make_explicit(sdfg, ['unused']) + assert sdfg.arrays['unused'].lifetime is dtypes.AllocationLifetime.Explicit + all_allocs = [n for e in sdfg.edges() for n in e.data.alloc] + all_frees = [n for e in sdfg.edges() for n in e.data.free] + assert 'unused' not in all_allocs + assert 'unused' not in all_frees + + +if __name__ == '__main__': + print("Running basic make_explicit tests...") + test_lifetime_set_to_explicit() + test_alloc_on_incoming_edge() + test_free_on_outgoing_edge() + test_codegen_new_in_edge_code() + test_codegen_delete_in_edge_code() + test_codegen_no_auto_alloc() + test_sdfg_validates_after_make_explicit() + print("Basic tests PASSED") diff --git a/tests/passes/explicit_alloc_serialization_test.py b/tests/passes/explicit_alloc_serialization_test.py new file mode 100644 index 0000000000..2203c257da --- /dev/null +++ b/tests/passes/explicit_alloc_serialization_test.py @@ -0,0 +1,104 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +"""Round-trip tests for the Explicit-allocation extensions. + +Asserts that the new InterstateEdge properties (alloc / free / reuse) and the +AllocationLifetime.Explicit enum value survive an SDFG -> JSON -> SDFG cycle. +This is the serialisation contract the design relies on: passes that mutate +allocation state (make_explicit, buffer_reuse_same_pass, buffer_reuse_same_pass_ua) must +be observable through .sdfg files saved to disk and reloaded by code-gen. 
+""" +import json + +import dace +from dace import dtypes +from dace.sdfg import SDFG, InterstateEdge + + +def _explicit_array(sdfg: SDFG, name: str, shape=(4,)): + sdfg.add_array(name, shape, dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + +def _three_state_with_annotations() -> SDFG: + """SDFG carrying every annotation the new properties expose: + alloc, free, and a reuse pair.""" + sdfg = SDFG('rtrip') + _explicit_array(sdfg, 'tmp1') + _explicit_array(sdfg, 'tmp2') + s0 = sdfg.add_state('init') + s1 = sdfg.add_state('use') + s2 = sdfg.add_state('done') + e0 = sdfg.add_edge(s0, s1, InterstateEdge()) + e1 = sdfg.add_edge(s1, s2, InterstateEdge()) + s1.add_access('tmp1') + s1.add_access('tmp2') + e0.data.alloc = ['tmp1'] + e0.data.reuse = [['tmp2', 'tmp1']] # tmp2 rebinds onto tmp1 + e1.data.free = ['tmp1'] + return sdfg + + +def _only_edge(sdfg: SDFG, src_label: str, dst_label: str) -> InterstateEdge: + src = next(s for s in sdfg.nodes() if s.label == src_label) + dst = next(s for s in sdfg.nodes() if s.label == dst_label) + return sdfg.edges_between(src, dst)[0].data + + +def test_alloc_free_reuse_round_trip(): + sdfg = _three_state_with_annotations() + blob = sdfg.to_json() + # ensure we are exercising the JSON path (not in-memory aliasing) + blob = json.loads(json.dumps(blob)) + rt = SDFG.from_json(blob) + + e0 = _only_edge(rt, 'init', 'use') + e1 = _only_edge(rt, 'use', 'done') + + assert e0.alloc == ['tmp1'] + assert e0.free == [] + assert e0.reuse == [['tmp2', 'tmp1']] + assert e1.alloc == [] + assert e1.free == ['tmp1'] + assert e1.reuse == [] + + +def test_explicit_lifetime_round_trip(): + sdfg = _three_state_with_annotations() + blob = json.loads(json.dumps(sdfg.to_json())) + rt = SDFG.from_json(blob) + + for name in ('tmp1', 'tmp2'): + assert rt.arrays[name].lifetime == dtypes.AllocationLifetime.Explicit, ( + f'{name} lost its Explicit lifetime through the JSON round-trip' + ) + + +def test_save_load_round_trip(tmp_path): 
+ """Same contract via SDFG.save / SDFG.from_file — the path code-gen uses.""" + sdfg = _three_state_with_annotations() + fname = tmp_path / 'rtrip.sdfg' + sdfg.save(str(fname)) + rt = SDFG.from_file(str(fname)) + + e0 = _only_edge(rt, 'init', 'use') + e1 = _only_edge(rt, 'use', 'done') + + assert e0.alloc == ['tmp1'] + assert e0.reuse == [['tmp2', 'tmp1']] + assert e1.free == ['tmp1'] + assert rt.arrays['tmp1'].lifetime == dtypes.AllocationLifetime.Explicit + assert rt.arrays['tmp2'].lifetime == dtypes.AllocationLifetime.Explicit + + +def test_empty_lists_default_round_trip(): + """Edges without annotations must come back with empty alloc/free/reuse — + not None, not missing — so downstream code can iterate unconditionally.""" + sdfg = SDFG('empty_edges') + s0 = sdfg.add_state('a'); s1 = sdfg.add_state('b') + sdfg.add_edge(s0, s1, InterstateEdge()) + + rt = SDFG.from_json(json.loads(json.dumps(sdfg.to_json()))) + e = _only_edge(rt, 'a', 'b') + assert e.alloc == [] + assert e.free == [] + assert e.reuse == [] diff --git a/tests/passes/explicit_alloc_validator_test.py b/tests/passes/explicit_alloc_validator_test.py new file mode 100644 index 0000000000..9a05ddeb55 --- /dev/null +++ b/tests/passes/explicit_alloc_validator_test.py @@ -0,0 +1,182 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +"""Tests for _validate_explicit_allocation_balance — alloc/free balance +checks for explicit allocation annotations on interstate edges. 
+""" +import pytest +import warnings +import dace +from dace import dtypes +from dace.sdfg import SDFG, SDFGState, InterstateEdge +from dace.sdfg.state import LoopRegion, ConditionalBlock, ControlFlowRegion +from dace.properties import CodeBlock +from dace.sdfg.validation import ( + InvalidSDFGError, + InvalidSDFGInterstateEdgeError, + _validate_explicit_allocation_balance, +) + + +def _explicit_array(sdfg: SDFG, name: str, shape=(4,)): + sdfg.add_array(name, shape, dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + +def test_validator_is_callable_on_empty_sdfg(): + sdfg = SDFG('empty') + sdfg.add_state('only') + _validate_explicit_allocation_balance(sdfg) # must not raise + + +def _sdfg_with_use_and(opt_free: bool, opt_alloc: bool): + """3-state SDFG with an Explicit array 'tmp1' used in the middle state. + Conditionally attach alloc to edge0 and/or free to edge1.""" + sdfg = SDFG('b1') + _explicit_array(sdfg, 'tmp1') + s0 = sdfg.add_state('init'); s1 = sdfg.add_state('use'); s2 = sdfg.add_state('done') + e0 = sdfg.add_edge(s0, s1, InterstateEdge()) + e1 = sdfg.add_edge(s1, s2, InterstateEdge()) + s1.add_access('tmp1') # ensure 'tmp1' is "used" + if opt_alloc: e0.data.alloc = ['tmp1'] + if opt_free: e1.data.free = ['tmp1'] + return sdfg + + +def test_b1_missing_alloc_raises(): + sdfg = _sdfg_with_use_and(opt_free=True, opt_alloc=False) + with pytest.raises(InvalidSDFGError, match="tmp1"): + _validate_explicit_allocation_balance(sdfg) + + +def test_b2_missing_free_raises(): + sdfg = _sdfg_with_use_and(opt_free=False, opt_alloc=True) + with pytest.raises(InvalidSDFGError, match="tmp1"): + _validate_explicit_allocation_balance(sdfg) + + +def test_b3_unused_explicit_warns(): + sdfg = SDFG('b3') + _explicit_array(sdfg, 'scratch') # declared Explicit, never used + sdfg.add_state('only') + with pytest.warns(UserWarning, match="scratch"): + _validate_explicit_allocation_balance(sdfg) + + +def test_b4_duplicate_in_alloc_raises(): + sdfg = 
_sdfg_with_use_and(opt_free=True, opt_alloc=False) + # Put tmp1 in alloc twice on the init->use edge + init, use = list(sdfg.nodes())[0], list(sdfg.nodes())[1] + e = sdfg.edges_between(init, use)[0] + e.data.alloc = ['tmp1', 'tmp1'] + with pytest.raises(InvalidSDFGInterstateEdgeError, match="duplicate"): + _validate_explicit_allocation_balance(sdfg) + + +def test_b4_duplicate_in_free_raises(): + sdfg = _sdfg_with_use_and(opt_free=False, opt_alloc=True) + init, use = list(sdfg.nodes())[0], list(sdfg.nodes())[1] + use_done = sdfg.edges_between(use, list(sdfg.nodes())[2])[0] + use_done.data.free = ['tmp1', 'tmp1'] + with pytest.raises(InvalidSDFGInterstateEdgeError, match="duplicate"): + _validate_explicit_allocation_balance(sdfg) + + +def test_b5_alloc_free_overlap_raises(): + sdfg = _sdfg_with_use_and(opt_free=False, opt_alloc=False) + init, use = list(sdfg.nodes())[0], list(sdfg.nodes())[1] + e = sdfg.edges_between(init, use)[0] + e.data.alloc = ['tmp1'] + e.data.free = ['tmp1'] + with pytest.raises(InvalidSDFGInterstateEdgeError, match="alloc and free"): + _validate_explicit_allocation_balance(sdfg) + + +def test_happy_path_linear(): + sdfg = _sdfg_with_use_and(opt_free=True, opt_alloc=True) + with warnings.catch_warnings(): + warnings.simplefilter('error') # warnings → errors + _validate_explicit_allocation_balance(sdfg) + + +def test_happy_path_loop_region(): + """Explicit array 'acc' allocated on the loop-internal header edge, + freed on the loop-internal exit edge. 
If _all_interstate_edges did NOT + recurse into LoopRegion, this would trip B1/B2.""" + sdfg = SDFG('b7') + _explicit_array(sdfg, 'acc') + pre = sdfg.add_state('pre') + post = sdfg.add_state('post') + + loop = LoopRegion('L', condition_expr='i < 10', + loop_var='i', initialize_expr='i = 0', + update_expr='i = i + 1') + sdfg.add_node(loop) + sdfg.add_edge(pre, loop, InterstateEdge()) + sdfg.add_edge(loop, post, InterstateEdge()) + + body_a = loop.add_state('body_a', is_start_block=True) + body_b = loop.add_state('body_b') + body_c = loop.add_state('body_c') + ea = loop.add_edge(body_a, body_b, InterstateEdge(assignments={})) + eb = loop.add_edge(body_b, body_c, InterstateEdge(assignments={})) + ea.data.alloc = ['acc'] + eb.data.free = ['acc'] + body_b.add_access('acc') + + with warnings.catch_warnings(): + warnings.simplefilter('error') + _validate_explicit_allocation_balance(sdfg) + + +def test_b4_inside_loop_region_error_renders(): + """Regression: nested-region edge ids must be resolvable by the exception's + __str__ path. Before the owner-arg fix, str(exc) would report the wrong edge + (a top-level SDFG edge) rather than the nested ba->bb edge where the + violation actually lives.""" + sdfg = SDFG('nested_b4') + _explicit_array(sdfg, 'nx') + pre = sdfg.add_state('pre'); post = sdfg.add_state('post') + loop = LoopRegion('L', condition_expr='i < 2', loop_var='i', + initialize_expr='i = 0', update_expr='i = i + 1') + sdfg.add_node(loop) + sdfg.add_edge(pre, loop, InterstateEdge()) + sdfg.add_edge(loop, post, InterstateEdge()) + ba = loop.add_state('ba', is_start_block=True) + bb = loop.add_state('bb') + e = loop.add_edge(ba, bb, InterstateEdge(assignments={})) + e.data.alloc = ['nx', 'nx'] # B4 violation on nested edge + bb.add_access('nx') + # The fix guarantees both: (a) the error is raised, and (b) str() identifies + # the correct nested edge (ba->bb), not a top-level edge (pre->L). 
+ with pytest.raises(InvalidSDFGInterstateEdgeError, match="duplicate") as excinfo: + _validate_explicit_allocation_balance(sdfg) + rendered = str(excinfo.value) # must not raise + assert 'ba' in rendered and 'bb' in rendered, ( + f"str(exc) should reference the nested ba->bb edge, got: {rendered!r}" + ) + + +def test_happy_path_conditional_block(): + """alloc on pre-branch edge, free on post-branch edge, use inside one + branch. Top-level scan already sees pre/post edges, so this test mostly + confirms that a ConditionalBlock doesn't somehow swallow top-level edges.""" + sdfg = SDFG('b8') + _explicit_array(sdfg, 'tmp') + + pre = sdfg.add_state('pre') + post = sdfg.add_state('post') + cond = ConditionalBlock('C') + sdfg.add_node(cond) + + true_body = ControlFlowRegion('true_body', sdfg=sdfg) + t_use = true_body.add_state('t_use', is_start_block=True) + t_use.add_access('tmp') + cond.add_branch(CodeBlock('True'), true_body) + + e_pre = sdfg.add_edge(pre, cond, InterstateEdge()) + e_post = sdfg.add_edge(cond, post, InterstateEdge()) + e_pre.data.alloc = ['tmp'] + e_post.data.free = ['tmp'] + + with warnings.catch_warnings(): + warnings.simplefilter('error') + _validate_explicit_allocation_balance(sdfg) diff --git a/tests/passes/hoist_alloc_test.py b/tests/passes/hoist_alloc_test.py new file mode 100644 index 0000000000..ca3d950a12 --- /dev/null +++ b/tests/passes/hoist_alloc_test.py @@ -0,0 +1,478 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +"""Tests for hoist_alloc_out_of_loop() — lifting explicit allocations out of +LoopRegions so that memory is allocated once before the loop instead of on every +iteration. 
+ +Test coverage: + - Basic hoist: alloc/free move from inside loop to parent edges + - Loop is SDFG start (no incoming edges): thin predecessor inserted + - Loop is SDFG sink (no outgoing edges): thin successor inserted + - Multiple arrays hoisted in one call + - Nested loop: alloc on inner-loop edge hoisted to outer parent CFG + - TypeError when a non-LoopRegion is passed (Map, plain state, etc.) + - ValueError when array is not found in the SDFG + - ValueError when array is not transient + - Explicit array with no alloc annotation inside the loop: silently skipped + - Alloc without matching free inside: only alloc is moved, no free added + - SDFG validates after hoist + - Code generation: new[] before the loop, delete[] after the loop +""" + +import re + +import pytest +import dace +from dace import dtypes +from dace.sdfg import SDFG, SDFGState, InterstateEdge +from dace.sdfg.state import LoopRegion +from dace.libraries.allocation import make_explicit, hoist_alloc_out_of_loop + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_loop_sdfg(array_name='buf', shape=(10,), dtype=dace.float64): + """Build an SDFG of the form: + + init --[e_before]--> [LoopRegion] --[e_after]--> done + + Inside the LoopRegion: + body_start --[e_body]--> body_end + (body_end loops back via the region's implicit back-edge) + + The body_start state contains a write access node for *array_name*. + Returns (sdfg, loop, e_before, e_after, e_body). 
+ """ + sdfg = dace.SDFG('loop_hoist_test') + sdfg.add_array(array_name, shape, dtype, transient=True) + + init = sdfg.add_state('init') + loop = LoopRegion('myloop', + condition_expr='i < 10', + loop_var='i', + initialize_expr='i = 0', + update_expr='i = i + 1', + sdfg=sdfg) + sdfg.add_node(loop) + done = sdfg.add_state('done') + + e_before = sdfg.add_edge(init, loop, InterstateEdge()) + e_after = sdfg.add_edge(loop, done, InterstateEdge()) + + # Loop body: two states with an edge inside the loop + body_start = loop.add_state('body_start', is_start_block=True) + body_end = loop.add_state('body_end') + e_body = loop.add_edge(body_start, body_end, InterstateEdge()) + + # Access node so the array is "used" inside the loop + t = body_start.add_tasklet('fill', {}, {'out'}, 'out = 1.0') + body_start.add_edge(t, 'out', body_start.add_write(array_name), None, + dace.Memlet(f'{array_name}[0]')) + + return sdfg, loop, e_before, e_after, e_body + + +def _annotate_inside(loop, e_body, name): + """Manually place alloc and free annotations on the body edge and set + the array's lifetime to Explicit so hoist does not call make_explicit.""" + node = loop + while not isinstance(node, dace.SDFG): + node = node.parent_graph + node.arrays[name].lifetime = dtypes.AllocationLifetime.Explicit + e_body.data.alloc.append(name) + e_body.data.free.append(name) + + +def _get_cpp(sdfg): + """Return the generated CPU C++ as a string.""" + codes = sdfg.generate_code() + return codes[0].clean_code + + +# --------------------------------------------------------------------------- +# Basic functionality +# --------------------------------------------------------------------------- + +def test_alloc_moved_to_before_loop(): + """alloc on a body edge is removed and added to the incoming loop edge.""" + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + _annotate_inside(loop, e_body, 'buf') + + hoist_alloc_out_of_loop(loop, ['buf']) + + assert 'buf' not in e_body.data.alloc + assert 'buf' in 
e_before.data.alloc + + +def test_free_moved_to_after_loop(): + """free on a body edge is removed and added to the outgoing loop edge.""" + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + _annotate_inside(loop, e_body, 'buf') + + hoist_alloc_out_of_loop(loop, ['buf']) + + assert 'buf' not in e_body.data.free + assert 'buf' in e_after.data.free + + +def test_alloc_only_no_free_inside(): + """If alloc is inside but free is not, hoist alloc only; no free is added.""" + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + sdfg.arrays['buf'].lifetime = dtypes.AllocationLifetime.Explicit + e_body.data.alloc.append('buf') + # No free annotation inside + + hoist_alloc_out_of_loop(loop, ['buf']) + + assert 'buf' in e_before.data.alloc + assert 'buf' not in e_after.data.free # nothing added + + +def test_loop_is_sdfg_start_thin_predecessor_inserted(): + """When the loop has no incoming edges, a thin predecessor state is inserted.""" + sdfg = dace.SDFG('start_loop') + sdfg.add_array('buf', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + loop = LoopRegion('myloop', condition_expr='i < 5', loop_var='i', + initialize_expr='i = 0', update_expr='i = i + 1', sdfg=sdfg) + sdfg.add_node(loop, is_start_block=True) + done = sdfg.add_state('done') + e_after = sdfg.add_edge(loop, done, InterstateEdge()) + + body = loop.add_state('body', is_start_block=True) + e_body = loop.add_edge(body, loop.add_state('body_end'), InterstateEdge()) + e_body.data.alloc.append('buf') + + hoist_alloc_out_of_loop(loop, ['buf']) + + in_edges = sdfg.in_edges(loop) + assert len(in_edges) == 1 + assert 'buf' in in_edges[0].data.alloc + + +def test_loop_is_sdfg_sink_thin_successor_inserted(): + """When the loop has no outgoing edges, a thin successor state is inserted.""" + sdfg = dace.SDFG('sink_loop') + sdfg.add_array('buf', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + init = sdfg.add_state('init') + loop = 
LoopRegion('myloop', condition_expr='i < 5', loop_var='i', + initialize_expr='i = 0', update_expr='i = i + 1', sdfg=sdfg) + sdfg.add_node(loop) + e_before = sdfg.add_edge(init, loop, InterstateEdge()) + + body = loop.add_state('body', is_start_block=True) + e_body = loop.add_edge(body, loop.add_state('body_end'), InterstateEdge()) + e_body.data.alloc.append('buf') + e_body.data.free.append('buf') + + hoist_alloc_out_of_loop(loop, ['buf']) + + out_edges = sdfg.out_edges(loop) + assert len(out_edges) == 1 + assert 'buf' in out_edges[0].data.free + + +def test_multiple_arrays_hoisted(): + """Multiple arrays can be hoisted in a single call.""" + sdfg = dace.SDFG('multi') + sdfg.add_array('dx', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + sdfg.add_array('dy', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + init = sdfg.add_state('init') + loop = LoopRegion('myloop', condition_expr='i < 10', loop_var='i', + initialize_expr='i = 0', update_expr='i = i + 1', sdfg=sdfg) + sdfg.add_node(loop) + done = sdfg.add_state('done') + e_before = sdfg.add_edge(init, loop, InterstateEdge()) + e_after = sdfg.add_edge(loop, done, InterstateEdge()) + + body = loop.add_state('body', is_start_block=True) + e_body = loop.add_edge(body, loop.add_state('body_end'), InterstateEdge()) + e_body.data.alloc = ['dx', 'dy'] + e_body.data.free = ['dx', 'dy'] + + hoist_alloc_out_of_loop(loop, ['dx', 'dy']) + + assert 'dx' in e_before.data.alloc + assert 'dy' in e_before.data.alloc + assert 'dx' in e_after.data.free + assert 'dy' in e_after.data.free + assert e_body.data.alloc == [] + assert e_body.data.free == [] + + +# --------------------------------------------------------------------------- +# Nested loops +# --------------------------------------------------------------------------- + +def test_nested_loop_inner_alloc_hoisted_to_outer_parent(): + """Alloc on an edge inside an inner loop is hoisted out of the outer loop + 
when hoist_alloc_out_of_loop is called on the outer loop.""" + sdfg = dace.SDFG('nested') + sdfg.add_array('buf', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + init = sdfg.add_state('init') + outer = LoopRegion('outer', condition_expr='i < 3', loop_var='i', + initialize_expr='i = 0', update_expr='i = i + 1', sdfg=sdfg) + sdfg.add_node(outer) + done = sdfg.add_state('done') + e_before = sdfg.add_edge(init, outer, InterstateEdge()) + e_after = sdfg.add_edge(outer, done, InterstateEdge()) + + inner = LoopRegion('inner', condition_expr='j < 3', loop_var='j', + initialize_expr='j = 0', update_expr='j = j + 1', + sdfg=sdfg) + outer.add_node(inner, is_start_block=True) + + inner_body = inner.add_state('inner_body', is_start_block=True) + e_inner = inner.add_edge(inner_body, + inner.add_state('inner_end'), InterstateEdge()) + e_inner.data.alloc.append('buf') + e_inner.data.free.append('buf') + + # Hoist out of the OUTER loop — should reach into nested region + hoist_alloc_out_of_loop(outer, ['buf']) + + assert 'buf' not in e_inner.data.alloc + assert 'buf' in e_before.data.alloc + assert 'buf' in e_after.data.free + + +# --------------------------------------------------------------------------- +# Error cases +# --------------------------------------------------------------------------- + +def test_type_error_for_non_loop_region(): + """Passing anything other than a LoopRegion raises TypeError.""" + sdfg = dace.SDFG('err') + state = sdfg.add_state('s') + with pytest.raises(TypeError, match="LoopRegion"): + hoist_alloc_out_of_loop(state, ['buf']) + + +def test_type_error_message_mentions_maps(): + """The TypeError message specifically explains that Maps are parallel.""" + sdfg = dace.SDFG('err') + # MapEntry is not a LoopRegion — simulate by passing a random object + class FakeMap: + pass + with pytest.raises(TypeError, match="Map"): + hoist_alloc_out_of_loop(FakeMap(), ['buf']) + + +def test_value_error_unknown_array(): + sdfg = 
dace.SDFG('err') + loop = LoopRegion('l', condition_expr='True', sdfg=sdfg) + sdfg.add_node(loop, is_start_block=True) + with pytest.raises(ValueError, match="not in the SDFG"): + hoist_alloc_out_of_loop(loop, ['nonexistent']) + + +def test_value_error_non_transient_array(): + sdfg = dace.SDFG('err') + sdfg.add_array('inp', [10], dace.float64, transient=False) + loop = LoopRegion('l', condition_expr='True', sdfg=sdfg) + sdfg.add_node(loop, is_start_block=True) + with pytest.raises(ValueError, match="not a transient"): + hoist_alloc_out_of_loop(loop, ['inp']) + + +def test_explicit_no_alloc_inside_loop_is_skipped(): + """An Explicit-lifetime array with no alloc annotation inside the loop + (e.g. allocated outside the loop) is silently skipped — no error raised.""" + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + sdfg.arrays['buf'].lifetime = dtypes.AllocationLifetime.Explicit + # Alloc is on the edge before the loop, not inside — nothing to hoist + e_before.data.alloc.append('buf') + e_after.data.free.append('buf') + + hoist_alloc_out_of_loop(loop, ['buf']) # must not raise + + # Edges unchanged + assert 'buf' in e_before.data.alloc + assert 'buf' in e_after.data.free + + +# --------------------------------------------------------------------------- +# Auto make_explicit + hoist +# --------------------------------------------------------------------------- + +def test_non_explicit_array_is_made_explicit_and_hoisted(): + """An array with Scope lifetime (not yet Explicit) is automatically + converted to Explicit and hoisted out of the loop in one call.""" + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + # buf has default Scope lifetime — no alloc/free annotations yet + + hoist_alloc_out_of_loop(loop, ['buf']) + + assert sdfg.arrays['buf'].lifetime is dtypes.AllocationLifetime.Explicit + assert 'buf' in e_before.data.alloc, "alloc must be on the edge before the loop" + assert 'buf' in e_after.data.free, "free must be on the edge after the 
loop" + # No alloc/free must remain inside the loop + for e in loop.edges(): + assert 'buf' not in e.data.alloc + assert 'buf' not in e.data.free + + +def test_non_explicit_array_validates_after_hoist(): + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + hoist_alloc_out_of_loop(loop, ['buf']) + sdfg.validate() + + +def test_non_explicit_sdfg_lifetime_array_skipped(): + """An array with SDFG lifetime is placed at SDFG level by make_explicit — + no alloc inside the loop, so hoist silently skips it.""" + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + sdfg.arrays['buf'].lifetime = dtypes.AllocationLifetime.SDFG + + hoist_alloc_out_of_loop(loop, ['buf']) # must not raise + + # make_explicit placed alloc/free at SDFG level already + assert sdfg.arrays['buf'].lifetime is dtypes.AllocationLifetime.Explicit + sdfg_alloc = any('buf' in e.data.alloc for e in sdfg.edges()) + assert sdfg_alloc, "SDFG-lifetime array should be allocated at SDFG level" + + +# --------------------------------------------------------------------------- +# SDFG validation +# --------------------------------------------------------------------------- + +def test_sdfg_validates_after_hoist(): + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + _annotate_inside(loop, e_body, 'buf') + sdfg.arrays['buf'].lifetime = dtypes.AllocationLifetime.Explicit + + hoist_alloc_out_of_loop(loop, ['buf']) + sdfg.validate() + + +# --------------------------------------------------------------------------- +# Code-generation smoke test +# --------------------------------------------------------------------------- + +def test_codegen_new_before_loop_delete_after(): + """After hoisting, new[] must appear before the loop body and delete[] after.""" + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + _annotate_inside(loop, e_body, 'buf') + sdfg.arrays['buf'].lifetime = dtypes.AllocationLifetime.Explicit + + hoist_alloc_out_of_loop(loop, ['buf']) + + cpp = _get_cpp(sdfg) + + # new[] 
must appear before the loop keyword + new_pos = cpp.find('new double') + loop_pos = cpp.find('for (') + delete_pos = cpp.find('delete[]') + + assert new_pos != -1, "new[] not found in generated C++" + assert loop_pos != -1, "for-loop not found in generated C++" + assert delete_pos != -1, "delete[] not found in generated C++" + + assert new_pos < loop_pos, "new[] should appear before the loop" + assert loop_pos < delete_pos, "delete[] should appear after the loop" + + +# --------------------------------------------------------------------------- +# Multiple incoming / outgoing edges +# --------------------------------------------------------------------------- + +def test_multiple_incoming_edges_alloc_on_all(): + """When the loop has two incoming edges (from a conditional branch), alloc + is added to every incoming edge so the array is allocated on both paths.""" + sdfg = dace.SDFG('multi_in') + sdfg.add_array('buf', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + init = sdfg.add_state('init', is_start_block=True) + true_br = sdfg.add_state('true_br') + false_br = sdfg.add_state('false_br') + loop = LoopRegion('myloop', 'i < 10', 'i', 'i = 0', 'i = i + 1', sdfg=sdfg) + sdfg.add_node(loop) + done = sdfg.add_state('done') + + sdfg.add_edge(init, true_br, InterstateEdge(condition='1 == 1')) + sdfg.add_edge(init, false_br, InterstateEdge(condition='1 == 0')) + e1 = sdfg.add_edge(true_br, loop, InterstateEdge()) + e2 = sdfg.add_edge(false_br, loop, InterstateEdge()) + sdfg.add_edge(loop, done, InterstateEdge()) + + body = loop.add_state('body', is_start_block=True) + e_body = loop.add_edge(body, loop.add_state('body_end'), InterstateEdge()) + e_body.data.alloc.append('buf') + e_body.data.free.append('buf') + + hoist_alloc_out_of_loop(loop, ['buf']) + + assert 'buf' in e1.data.alloc, "alloc must appear on the true-branch incoming edge" + assert 'buf' in e2.data.alloc, "alloc must appear on the false-branch incoming edge" + assert 
'buf' not in e_body.data.alloc + + +def test_multiple_outgoing_edges_free_on_all(): + """When the loop has two outgoing edges, free is added to every outgoing + edge so the array is released on every exit path.""" + sdfg = dace.SDFG('multi_out') + sdfg.add_array('buf', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + init = sdfg.add_state('init', is_start_block=True) + loop = LoopRegion('myloop', 'i < 10', 'i', 'i = 0', 'i = i + 1', sdfg=sdfg) + sdfg.add_node(loop) + done1 = sdfg.add_state('done1') + done2 = sdfg.add_state('done2') + + sdfg.add_edge(init, loop, InterstateEdge()) + e_out1 = sdfg.add_edge(loop, done1, InterstateEdge(condition='1 == 1')) + e_out2 = sdfg.add_edge(loop, done2, InterstateEdge(condition='1 == 0')) + + body = loop.add_state('body', is_start_block=True) + e_body = loop.add_edge(body, loop.add_state('body_end'), InterstateEdge()) + e_body.data.alloc.append('buf') + e_body.data.free.append('buf') + + hoist_alloc_out_of_loop(loop, ['buf']) + + assert 'buf' in e_out1.data.free, "free must appear on first outgoing edge" + assert 'buf' in e_out2.data.free, "free must appear on second outgoing edge" + assert 'buf' not in e_body.data.free + + +# --------------------------------------------------------------------------- +# Constructed example: quantitative hoisting check +# --------------------------------------------------------------------------- + +def test_hoist_moves_alloc_out_of_loop_body_completely(): + """Constructed example: an array allocated on every loop body edge has + all internal alloc/free annotations removed after hoisting, and gains + exactly one alloc annotation on the pre-loop edge.""" + sdfg, loop, e_before, e_after, e_body = _make_loop_sdfg() + _annotate_inside(loop, e_body, 'buf') + + # Sanity: one alloc inside the loop, none outside + inner_allocs_before = sum(1 for e in loop.edges() if 'buf' in e.data.alloc) + outer_allocs_before = sum(1 for e in sdfg.edges() if 'buf' in e.data.alloc) + 
assert inner_allocs_before == 1 + assert outer_allocs_before == 0 + + hoist_alloc_out_of_loop(loop, ['buf']) + + # After hoist: no alloc/free remain inside the loop + inner_allocs_after = sum(1 for e in loop.edges() if 'buf' in e.data.alloc) + inner_frees_after = sum(1 for e in loop.edges() if 'buf' in e.data.free) + assert inner_allocs_after == 0, "no alloc must remain inside the loop body" + assert inner_frees_after == 0, "no free must remain inside the loop body" + + # Exactly one alloc on the pre-loop edge and one free on the post-loop edge + outer_allocs_after = sum(1 for e in sdfg.edges() if 'buf' in e.data.alloc) + outer_frees_after = sum(1 for e in sdfg.edges() if 'buf' in e.data.free) + assert outer_allocs_after == 1 + assert outer_frees_after == 1 diff --git a/tests/passes/reuse_internals_test.py b/tests/passes/reuse_internals_test.py new file mode 100644 index 0000000000..c5101b4fb7 --- /dev/null +++ b/tests/passes/reuse_internals_test.py @@ -0,0 +1,502 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +"""Unit tests for internal helpers in reuse.py and quantitative constructed +examples for every reuse pass. 
+ +Internal helpers covered: + - _greedy_same_size_scan: empty, no-donor, size mismatch, dtype mismatch, LIFO + - _collect_scopes: region-id sets, cross-scope detection + - _edge_order_safe: safe (strictly ordered), unsafe (different regions) + - _resolve_donor_root: no chain, 2-tuple chain, 3-tuple chain with offset + - _apply_arena_reuse: 3-tuple emitted, donor free moved, consumer free removed + +Quantitative examples (alloc count and footprint reduction): + - _apply_reuse: 2 arrays → 1 alloc (-1 count, -50% bytes) + - buffer_reuse_cross_pass: large→small cross-size (-1 count, -small_bytes footprint) +""" + +import pytest +import dace +from dace import dtypes +from dace.sdfg import SDFG, InterstateEdge +from dace.sdfg.state import LoopRegion +from dace.libraries.allocation import make_explicit +from dace.libraries.allocation.reuse import ( + _AllocEntry, + _FreeEntry, + _greedy_same_size_scan, + _collect_scopes, + _edge_order_safe, + _resolve_donor_root, + _apply_arena_reuse, + _apply_reuse, + buffer_reuse_cross_pass, +) + + +# --------------------------------------------------------------------------- +# Metric helpers +# --------------------------------------------------------------------------- + +def _count_allocs(sdfg: SDFG) -> int: + """Number of array names across all alloc lists in the SDFG.""" + return sum(len(e.data.alloc) + for e in sdfg.all_interstate_edges(recursive=True)) + + +def _alloc_footprint_bytes(sdfg: SDFG) -> int: + """Sum of bytes for every array that has at least one alloc annotation.""" + names = set() + for e in sdfg.all_interstate_edges(recursive=True): + names.update(e.data.alloc) + return sum( + sdfg.arrays[n].total_size * sdfg.arrays[n].dtype.bytes + for n in names if n in sdfg.arrays + ) + + +# --------------------------------------------------------------------------- +# Shared SDFG factory helpers +# --------------------------------------------------------------------------- + +def _sequential_sdfg(name: str, big: int, small: 
int, + dtype_big=dace.float64, dtype_small=dace.float64) -> SDFG: + """init → use_A → use_B → done with A[big] and B[small]. + + A and B are used in separate states so make_explicit places their + alloc/free edges strictly around each state (no overlap). + """ + sdfg = SDFG(name) + sdfg.add_array('A', [big], dtype_big, transient=True) + sdfg.add_array('B', [small], dtype_small, transient=True) + + init = sdfg.add_state('init', is_start_block=True) + use_A = sdfg.add_state('use_A') + use_B = sdfg.add_state('use_B') + done = sdfg.add_state('done') + sdfg.add_edge(init, use_A, InterstateEdge()) + sdfg.add_edge(use_A, use_B, InterstateEdge()) + sdfg.add_edge(use_B, done, InterstateEdge()) + + t_A = use_A.add_tasklet('a', {}, {'o'}, 'o = 1.0') + use_A.add_edge(t_A, 'o', use_A.add_write('A'), None, dace.Memlet('A[0]')) + t_B = use_B.add_tasklet('b', {}, {'o'}, 'o = 1.0') + use_B.add_edge(t_B, 'o', use_B.add_write('B'), None, dace.Memlet('B[0]')) + + make_explicit(sdfg, ['A', 'B']) + return sdfg + + +# --------------------------------------------------------------------------- +# _greedy_same_size_scan unit tests +# --------------------------------------------------------------------------- + +class TestGreedySameSizeScan: + + def test_empty_liveness_returns_empty(self): + assert _greedy_same_size_scan([]) == [] + + def test_alloc_only_no_donors(self): + events = [_AllocEntry('A', 100, dace.float64)] + assert _greedy_same_size_scan(events) == [] + + def test_free_only_no_alloc(self): + events = [_FreeEntry('A', 100, dace.float64)] + assert _greedy_same_size_scan(events) == [] + + def test_size_mismatch_no_pair(self): + events = [ + _FreeEntry('A', 100, dace.float64), + _AllocEntry('B', 200, dace.float64), + ] + assert _greedy_same_size_scan(events) == [] + + def test_dtype_mismatch_no_pair(self): + events = [ + _FreeEntry('A', 100, dace.float64), + _AllocEntry('B', 100, dace.float32), + ] + assert _greedy_same_size_scan(events) == [] + + def 
test_matching_pair_produces_reuse(self): + events = [ + _FreeEntry('A', 100, dace.float64), + _AllocEntry('B', 100, dace.float64), + ] + assert _greedy_same_size_scan(events) == [('B', 'A')] + + def test_lifo_within_same_bucket(self): + """A freed first, B freed second; LIFO picks B for next alloc of C.""" + events = [ + _FreeEntry('A', 100, dace.float64), + _FreeEntry('B', 100, dace.float64), + _AllocEntry('C', 100, dace.float64), + ] + assert _greedy_same_size_scan(events) == [('C', 'B')] + + def test_two_sequential_pairs(self): + events = [ + _FreeEntry('A', 100, dace.float64), + _AllocEntry('B', 100, dace.float64), + _FreeEntry('B', 100, dace.float64), + _AllocEntry('C', 100, dace.float64), + ] + pairs = _greedy_same_size_scan(events) + assert ('B', 'A') in pairs + assert ('C', 'B') in pairs + + +# --------------------------------------------------------------------------- +# _collect_scopes unit tests +# --------------------------------------------------------------------------- + +class TestCollectScopes: + + def test_returns_region_ids_for_alloc_and_free(self): + sdfg = SDFG('scopes_basic') + sdfg.add_array('A', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + init = sdfg.add_state('init', is_start_block=True) + use = sdfg.add_state('use') + done = sdfg.add_state('done') + e0 = sdfg.add_edge(init, use, InterstateEdge()) + e1 = sdfg.add_edge(use, done, InterstateEdge()) + e0.data.alloc.append('A') + e1.data.free.append('A') + + scopes = _collect_scopes(sdfg, {'A'}) + + assert 'A' in scopes + alloc_regions, free_regions = scopes['A'] + assert len(alloc_regions) == 1 + assert len(free_regions) == 1 + assert alloc_regions == free_regions # both on same SDFG + + def test_cross_scope_gives_different_region_sets(self): + """Alloc at SDFG level, free inside a LoopRegion → mismatched sets.""" + sdfg = SDFG('scopes_cross') + sdfg.add_array('A', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + 
init = sdfg.add_state('init', is_start_block=True) + loop = LoopRegion('myloop', 'i < 5', 'i', 'i = 0', 'i = i + 1', sdfg=sdfg) + sdfg.add_node(loop) + done = sdfg.add_state('done') + e_before = sdfg.add_edge(init, loop, InterstateEdge()) + sdfg.add_edge(loop, done, InterstateEdge()) + + body = loop.add_state('body', is_start_block=True) + body_end = loop.add_state('body_end') + e_inner = loop.add_edge(body, body_end, InterstateEdge()) + + e_before.data.alloc.append('A') + e_inner.data.free.append('A') + + scopes = _collect_scopes(sdfg, {'A'}) + alloc_regions, free_regions = scopes['A'] + assert alloc_regions != free_regions + + +# --------------------------------------------------------------------------- +# _edge_order_safe unit tests +# --------------------------------------------------------------------------- + +class TestEdgeOrderSafe: + + def test_returns_true_when_strictly_ordered(self): + """B reuses A: A's last-use block (use_A) is strictly before B's + first-use block (use_B) → safe.""" + sdfg = _sequential_sdfg('eos_true', big=10, small=10) + assert _edge_order_safe(sdfg, 'B', 'A') is True + + def test_returns_false_when_different_regions(self): + """A allocated at SDFG level, B allocated inside LoopRegion → not safe.""" + sdfg = SDFG('eos_diff') + sdfg.add_array('A', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + sdfg.add_array('B', [10], dace.float64, transient=True, + lifetime=dtypes.AllocationLifetime.Explicit) + + init = sdfg.add_state('init', is_start_block=True) + loop = LoopRegion('myloop', 'i < 5', 'i', 'i = 0', 'i = i + 1', sdfg=sdfg) + sdfg.add_node(loop) + done = sdfg.add_state('done') + e0 = sdfg.add_edge(init, loop, InterstateEdge()) + e1 = sdfg.add_edge(loop, done, InterstateEdge()) + + body = loop.add_state('body', is_start_block=True) + body_end = loop.add_state('body_end') + e_body = loop.add_edge(body, body_end, InterstateEdge()) + + # A at SDFG level, B inside loop → different parent regions + 
e0.data.alloc.append('A') + e1.data.free.append('A') + e_body.data.alloc.append('B') + e_body.data.free.append('B') + + assert _edge_order_safe(sdfg, 'B', 'A') is False + + def test_returns_false_when_multiple_alloc_edges(self): + """If new_arr has more than one alloc edge the check returns False + (ambiguous ordering).""" + sdfg = _sequential_sdfg('eos_multi', big=10, small=10) + # Inject a second alloc edge for B + extra = sdfg.add_state('extra') + use_B = next(s for s in sdfg.states() if s.label == 'use_B') + e_extra = sdfg.add_edge(extra, use_B, InterstateEdge()) + e_extra.data.alloc.append('B') + + assert _edge_order_safe(sdfg, 'B', 'A') is False + + +# --------------------------------------------------------------------------- +# _resolve_donor_root unit tests +# --------------------------------------------------------------------------- + +class TestResolveDonorRoot: + + def test_non_reuse_consumer_returns_self(self): + sdfg = SDFG('resolve_base') + sdfg.add_array('A', [10], dace.float64, transient=True) + sdfg.add_state('s0', is_start_block=True) + + root, off = _resolve_donor_root(sdfg, 'A') + assert root == 'A' + assert off == 0 + + def test_2tuple_chain_returns_root_zero_offset(self): + """B → A (2-tuple): root of B is A at offset 0.""" + sdfg = SDFG('resolve_2t') + sdfg.add_array('A', [20], dace.float64, transient=True) + sdfg.add_array('B', [10], dace.float64, transient=True) + s0 = sdfg.add_state('s0', is_start_block=True) + s1 = sdfg.add_state('s1') + e = sdfg.add_edge(s0, s1, InterstateEdge()) + e.data.reuse.append(['B', 'A']) + + root, off = _resolve_donor_root(sdfg, 'B') + assert root == 'A' + assert off == 0 + + def test_3tuple_chain_accumulates_offset(self): + """C → B at 8; B → A at 16 → root of C is A at offset 24.""" + sdfg = SDFG('resolve_3t') + sdfg.add_array('A', [40], dace.float64, transient=True) + sdfg.add_array('B', [20], dace.float64, transient=True) + sdfg.add_array('C', [10], dace.float64, transient=True) + s0 = sdfg.add_state('s0', 
is_start_block=True) + s1 = sdfg.add_state('s1') + e = sdfg.add_edge(s0, s1, InterstateEdge()) + e.data.reuse.append(['B', 'A', 16]) + e.data.reuse.append(['C', 'B', 8]) + + root, off = _resolve_donor_root(sdfg, 'C') + assert root == 'A' + assert off == 24 + + +# --------------------------------------------------------------------------- +# _apply_arena_reuse unit tests +# --------------------------------------------------------------------------- + +class TestApplyArenaReuse: + + def test_consumer_alloc_replaced_with_3tuple(self): + sdfg = _sequential_sdfg('arena_3t', big=20, small=10) + _apply_arena_reuse(sdfg, 'B', 'A', offset_bytes=0) + + all_allocs = [n for e in sdfg.all_interstate_edges(recursive=True) + for n in e.data.alloc] + all_reuses = [r for e in sdfg.all_interstate_edges(recursive=True) + for r in e.data.reuse] + + assert 'B' not in all_allocs + assert any(r[0] == 'B' and r[1] == 'A' and int(r[2]) == 0 + for r in all_reuses) + + def test_consumer_free_removed(self): + sdfg = _sequential_sdfg('arena_nofree', big=20, small=10) + _apply_arena_reuse(sdfg, 'B', 'A', offset_bytes=0) + + all_frees = [n for e in sdfg.all_interstate_edges(recursive=True) + for n in e.data.free] + assert 'B' not in all_frees + + def test_donor_free_moved_to_consumer_old_free_site(self): + sdfg = _sequential_sdfg('arena_moved', big=20, small=10) + # Capture B's free edge before the call + b_free_edge = next( + e for e in sdfg.all_interstate_edges(recursive=True) + if 'B' in e.data.free + ) + + _apply_arena_reuse(sdfg, 'B', 'A', offset_bytes=0) + + assert 'A' in b_free_edge.data.free + + def test_donor_alloc_retained(self): + sdfg = _sequential_sdfg('arena_alloc', big=20, small=10) + _apply_arena_reuse(sdfg, 'B', 'A', offset_bytes=0) + + all_allocs = [n for e in sdfg.all_interstate_edges(recursive=True) + for n in e.data.alloc] + assert 'A' in all_allocs + + def test_nonzero_offset_stored(self): + sdfg = _sequential_sdfg('arena_off', big=20, small=10) + _apply_arena_reuse(sdfg, 
'B', 'A', offset_bytes=32) + + all_reuses = [r for e in sdfg.all_interstate_edges(recursive=True) + for r in e.data.reuse] + assert any(r[0] == 'B' and int(r[2]) == 32 for r in all_reuses) + + def test_error_unknown_array(self): + sdfg = _sequential_sdfg('arena_err', big=20, small=10) + with pytest.raises(ValueError, match='not found'): + _apply_arena_reuse(sdfg, 'NOPE', 'A') + + def test_sdfg_validates_after_arena_reuse(self): + sdfg = _sequential_sdfg('arena_valid', big=20, small=10) + _apply_arena_reuse(sdfg, 'B', 'A', offset_bytes=0) + sdfg.validate() + + +# --------------------------------------------------------------------------- +# Quantitative constructed examples +# --------------------------------------------------------------------------- + +class TestApplyReuseQuantitative: + """_apply_reuse on a two-array same-size SDFG: verify alloc count and + footprint decrease by the expected amounts.""" + + N = 10 # elements per array; dtype = float64 → 8 bytes each + + def _make(self, name): + return _sequential_sdfg(name, big=self.N, small=self.N) + + def test_alloc_count_decreases_by_one(self): + sdfg = self._make('reuse_count') + count_before = _count_allocs(sdfg) # 2: one for A, one for B + + _apply_reuse(sdfg, 'B', 'A') + + count_after = _count_allocs(sdfg) # 1: only A + assert count_after == count_before - 1, ( + f"Expected alloc count {count_before - 1}, got {count_after}" + ) + + def test_footprint_decreases_by_B_size(self): + sdfg = self._make('reuse_footprint') + b_bytes = self.N * dace.float64.bytes + footprint_before = _alloc_footprint_bytes(sdfg) # 2*N*8 + + _apply_reuse(sdfg, 'B', 'A') + + footprint_after = _alloc_footprint_bytes(sdfg) # N*8 + assert footprint_after == footprint_before - b_bytes, ( + f"Footprint should decrease by {b_bytes}; " + f"was {footprint_before}, now {footprint_after}" + ) + + def test_reuse_entry_created_for_B(self): + sdfg = self._make('reuse_entry') + _apply_reuse(sdfg, 'B', 'A') + reuse_entries = [r for e in 
sdfg.all_interstate_edges(recursive=True) + for r in e.data.reuse] + assert any(r[0] == 'B' and r[1] == 'A' for r in reuse_entries) + + +class TestBufferArenaPassQuantitative: + """buffer_reuse_cross_pass on a large-then-small SDFG: verify alloc count and + footprint after the pass match expected values (only A's memory survives + as an alloc entry; B's allocation is replaced by a reuse into A).""" + + BIG = 16 + SMALL = 8 + + def _make_sequential_cross_size(self, name): + """Sequential: A[big] used, then B[small] used — non-overlapping.""" + sdfg = SDFG(name) + sdfg.add_array('A', [self.BIG], dace.float64, transient=True) + sdfg.add_array('B', [self.SMALL], dace.float64, transient=True) + sdfg.add_array('out', [self.SMALL], dace.float64, transient=False) + + init = sdfg.add_state('init', is_start_block=True) + wA = sdfg.add_state('write_A') + rA = sdfg.add_state('read_A') + wB = sdfg.add_state('write_B') + rB = sdfg.add_state('read_B') + done = sdfg.add_state('done') + sdfg.add_edge(init, wA, InterstateEdge()) + sdfg.add_edge(wA, rA, InterstateEdge()) + sdfg.add_edge(rA, wB, InterstateEdge()) + sdfg.add_edge(wB, rB, InterstateEdge()) + sdfg.add_edge(rB, done, InterstateEdge()) + + big, small = self.BIG, self.SMALL + + m1, x1 = wA.add_map('wA', {'i': f'0:{big}'}) + t1 = wA.add_tasklet('wA', {}, {'a'}, 'a = (double)i;', + language=dace.Language.CPP) + aw = wA.add_write('A') + wA.add_edge(m1, None, t1, None, dace.Memlet()) + wA.add_memlet_path(t1, x1, aw, src_conn='a', memlet=dace.Memlet('A[i]')) + + m2, x2 = rA.add_map('rA', {'i': f'0:{small}'}) + t2 = rA.add_tasklet('rA', {'a'}, {'o'}, 'o = a;', + language=dace.Language.CPP) + ar = rA.add_read('A') + ow2 = rA.add_write('out') + rA.add_memlet_path(ar, m2, t2, dst_conn='a', memlet=dace.Memlet('A[i]')) + rA.add_memlet_path(t2, x2, ow2, src_conn='o', memlet=dace.Memlet('out[i]')) + + m3, x3 = wB.add_map('wB', {'i': f'0:{small}'}) + t3 = wB.add_tasklet('wB', {}, {'b'}, 'b = (double)i;', + language=dace.Language.CPP) + 
bw = wB.add_write('B') + wB.add_edge(m3, None, t3, None, dace.Memlet()) + wB.add_memlet_path(t3, x3, bw, src_conn='b', memlet=dace.Memlet('B[i]')) + + m4, x4 = rB.add_map('rB', {'i': f'0:{small}'}) + t4 = rB.add_tasklet('rB', {'b'}, {'o'}, 'o = b + 1.0;', + language=dace.Language.CPP) + br = rB.add_read('B') + ow4 = rB.add_write('out') + rB.add_memlet_path(br, m4, t4, dst_conn='b', memlet=dace.Memlet('B[i]')) + rB.add_memlet_path(t4, x4, ow4, src_conn='o', memlet=dace.Memlet('out[i]')) + + return sdfg + + def test_alloc_count_decreases_by_one(self): + """Before the pass both arrays carry explicit alloc annotations (2 total). + After, B's alloc is replaced by a reuse entry — only A's alloc survives.""" + sdfg = self._make_sequential_cross_size('arena_q_count') + make_explicit(sdfg, ['A', 'B']) + count_before = _count_allocs(sdfg) # 2 + + applied = buffer_reuse_cross_pass(sdfg, {}, {}) + assert len(applied) == 1, f"Expected 1 pair applied; got {applied}" + + count_after = _count_allocs(sdfg) + assert count_after == count_before - 1, ( + f"Expected alloc count {count_before - 1}, got {count_after}" + ) + + def test_footprint_decreases_by_small_array_bytes(self): + """Before: A_bytes + B_bytes allocated; after: only A_bytes (B reuses A).""" + sdfg = self._make_sequential_cross_size('arena_q_fp') + make_explicit(sdfg, ['A', 'B']) + b_bytes = self.SMALL * dace.float64.bytes + footprint_before = _alloc_footprint_bytes(sdfg) + + buffer_reuse_cross_pass(sdfg, {}, {}) + + footprint_after = _alloc_footprint_bytes(sdfg) + assert footprint_after == footprint_before - b_bytes, ( + f"Footprint should decrease by {b_bytes}; " + f"was {footprint_before}, now {footprint_after}" + ) + + def test_pair_applied_is_b_reuses_a(self): + sdfg = self._make_sequential_cross_size('arena_q_pair') + applied = buffer_reuse_cross_pass(sdfg, {}, {}) + assert ('B', 'A') in applied