From 29514f9f14fef2eb757b0c560599d394ff090921 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 26 May 2026 14:02:59 +0000 Subject: [PATCH] perf(gcm): avoid redundant get_ordered_predecessors calls and pre-allocated DataFrame in fitting_sampling In fit_causal_model_of_target, get_ordered_predecessors was called twice for every non-root node (once for fitting, once for PARENTS_DURING_FIT). Store the result once per node. In draw_samples, the pre-allocated pd.DataFrame(np.empty(...)) was filled column-by-column, which triggers repeated copy operations in pandas 2.x (copy-on-write). Switch to a dict of numpy arrays and construct the DataFrame once at the end. Also removes the now-unnecessary _parent_samples_of helper. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: github-actions[bot] --- dowhy/gcm/fitting_sampling.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/dowhy/gcm/fitting_sampling.py b/dowhy/gcm/fitting_sampling.py index e3dbe4bd7e..8303df059f 100644 --- a/dowhy/gcm/fitting_sampling.py +++ b/dowhy/gcm/fitting_sampling.py @@ -1,6 +1,6 @@ """This module provides functionality for fitting probabilistic causal models and drawing samples from them.""" -from typing import Any +from typing import Any, Dict import networkx as nx import numpy as np @@ -91,9 +91,11 @@ def fit_causal_model_of_target( if is_root_node(causal_model.graph, target_node): causal_model.causal_mechanism(target_node).fit(X=training_data[target_node].to_numpy()[~y_nan_mask]) + ordered_predecessors: list = [] else: + ordered_predecessors = get_ordered_predecessors(causal_model.graph, target_node) causal_model.causal_mechanism(target_node).fit( - X=training_data[get_ordered_predecessors(causal_model.graph, target_node)].to_numpy()[~y_nan_mask], + X=training_data[ordered_predecessors].to_numpy()[~y_nan_mask], Y=training_data[target_node].to_numpy()[~y_nan_mask], ) @@ -102,9 +104,7 @@ def fit_causal_model_of_target( # this would automatically fail when the number of parents is different, there are other more subtle cases, # where the number is still the same, but it's different parents, and therefore different data. That would yield # wrong results, but would not fail. - causal_model.graph.nodes[target_node][PARENTS_DURING_FIT] = get_ordered_predecessors( - causal_model.graph, target_node - ) + causal_model.graph.nodes[target_node][PARENTS_DURING_FIT] = ordered_predecessors def draw_samples(causal_model: ProbabilisticCausalModel, num_samples: int) -> pd.DataFrame: @@ -118,7 +118,7 @@ def draw_samples(causal_model: ProbabilisticCausalModel, num_samples: int) -> pd validate_causal_dag(causal_model.graph) sorted_nodes = list(nx.topological_sort(causal_model.graph)) - drawn_samples = pd.DataFrame(np.empty((num_samples, len(sorted_nodes))), columns=sorted_nodes) + drawn_samples: Dict[Any, np.ndarray] = {} for node in sorted_nodes: causal_mechanism = causal_model.causal_mechanism(node) @@ -126,12 +126,8 @@ def draw_samples(causal_model: ProbabilisticCausalModel, num_samples: int) -> pd if is_root_node(causal_model.graph, node): drawn_samples[node] = causal_mechanism.draw_samples(num_samples).squeeze() else: - drawn_samples[node] = causal_mechanism.draw_samples( - _parent_samples_of(node, causal_model, drawn_samples) - ).squeeze() - - return drawn_samples - + predecessors = get_ordered_predecessors(causal_model.graph, node) + parent_data = np.column_stack([drawn_samples[p] for p in predecessors]) + drawn_samples[node] = causal_mechanism.draw_samples(parent_data).squeeze() -def _parent_samples_of(node: Any, scm: ProbabilisticCausalModel, samples: pd.DataFrame) -> np.ndarray: - return samples[get_ordered_predecessors(scm.graph, node)].to_numpy() + return pd.DataFrame(drawn_samples, columns=sorted_nodes)