From 98b091beb7032e2395de78bb3f892dbe8afde1e0 Mon Sep 17 00:00:00 2001 From: Tasfin Mahmud Date: Tue, 26 May 2026 23:24:11 +0600 Subject: [PATCH 1/3] Fix data subset refuter index misalignment for categorical columns (Issue #1372) Signed-off-by: Tasfin Mahmud --- dowhy/causal_estimators/distance_matching_estimator.py | 5 ++++- dowhy/utils/encoding.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dowhy/causal_estimators/distance_matching_estimator.py b/dowhy/causal_estimators/distance_matching_estimator.py index 3d0bd70f2a..48a530d483 100644 --- a/dowhy/causal_estimators/distance_matching_estimator.py +++ b/dowhy/causal_estimators/distance_matching_estimator.py @@ -179,9 +179,12 @@ def estimate_effect( self._target_units = target_units self._treatment_value = treatment_value self._control_value = control_value + # Encode new data based on fitted encoders + observed_common_causes = self._encode(data[self._observed_common_causes_names], "observed_common_causes") + updated_df = pd.concat( [ - self._observed_common_causes, + observed_common_causes, data[[self._target_estimand.outcome_variable[0], self._target_estimand.treatment_variable[0]]], ], axis=1, diff --git a/dowhy/utils/encoding.py b/dowhy/utils/encoding.py index 0d91f48256..ce3615ffe3 100644 --- a/dowhy/utils/encoding.py +++ b/dowhy/utils/encoding.py @@ -38,7 +38,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e # Columns to keep in the result - not encoded. columns_to_keep = data.columns.difference(data_to_encode.columns) - df_columns_to_keep = data[columns_to_keep].reset_index(drop=True) + df_columns_to_keep = data[columns_to_keep] if encoder is None: # Create new encoder drop = None @@ -54,7 +54,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e # Convert the encoded data to a DataFrame columns_encoded = encoder.get_feature_names_out(data_to_encode.columns) - df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded).reset_index(drop=True) # drop index from original + df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded, index=data_to_encode.index) # Concatenate the encoded DataFrame with the original non-categorical columns df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1) From b58e9433860f0f7675878be4f52a74a5473b20dd Mon Sep 17 00:00:00 2001 From: Tasfin Mahmud Date: Tue, 2 Jun 2026 16:20:57 +0600 Subject: [PATCH 2/3] Add regression test for DistanceMatchingEstimator + DataSubsetRefuter with categorical columns (#1372) Signed-off-by: Tasfin Mahmud --- .../test_distance_matching_estimator.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/causal_estimators/test_distance_matching_estimator.py b/tests/causal_estimators/test_distance_matching_estimator.py index 7549a6a39c..1d5e8cda5f 100644 --- a/tests/causal_estimators/test_distance_matching_estimator.py +++ b/tests/causal_estimators/test_distance_matching_estimator.py @@ -135,6 +135,65 @@ def test_non_binary_treatment_raises(self): with pytest.raises(Exception, match="binary"): model.estimate_effect(estimand, method_name="backdoor.distance_matching", target_units="att") + def test_data_subset_refuter_with_categorical_columns(self): + """Regression test for Issue #1372. + + DataSubsetRefuter samples a subset of the DataFrame, which changes the + index. When common causes contain categorical columns, one_hot_encode + must preserve the original index so that the encoded DataFrame aligns + with the subsetted treatment/outcome columns. Additionally, + estimate_effect() must re-encode the (subsetted) data rather than + reusing the stale encoded cache from fit(). + + Before the fix, this raised: + ValueError: Unalignable boolean Series provided as indexer + """ + rng = np.random.default_rng(1372) + n = 400 + # Create a categorical common cause + w_cat = pd.Categorical(rng.choice(["low", "mid", "high"], size=n)) + w_num = rng.standard_normal(n) + treatment = ((w_num + (w_cat == "high").astype(int) + rng.standard_normal(n)) > 0).astype(int) + outcome = 5 * treatment + 2 * w_num + 3 * (w_cat == "high").astype(int) + rng.standard_normal(n) + + df = pd.DataFrame({ + "W_cat": w_cat, + "W_num": w_num, + "v0": treatment, + "y": outcome, + }) + + gml = ( + 'graph [directed 1 ' + 'node [id "W_cat" label "W_cat"] ' + 'node [id "W_num" label "W_num"] ' + 'node [id "v0" label "v0"] ' + 'node [id "y" label "y"] ' + 'edge [source "W_cat" target "v0"] edge [source "W_cat" target "y"] ' + 'edge [source "W_num" target "v0"] edge [source "W_num" target "y"] ' + 'edge [source "v0" target "y"]]' + ) + + model = CausalModel(data=df, treatment="v0", outcome="y", graph=gml) + estimand = model.identify_effect(proceed_when_unidentifiable=True) + estimate = model.estimate_effect( + estimand, + method_name="backdoor.distance_matching", + target_units="att", + ) + + # The refutation must complete without raising ValueError + refutation = model.refute_estimate( + estimand, + estimate, + method_name="data_subset_refuter", + subset_fraction=0.8, + num_simulations=3, + ) + + assert refutation is not None + assert np.isfinite(refutation.new_effect), "Refuted effect should be finite" + def test_average_treatment_effect_via_simple_estimator(self): """Smoke test using the shared SimpleEstimator harness.""" tester = SimpleEstimator(error_tolerance=0.3, Estimator=DistanceMatchingEstimator) From 47f7f7a09cda0d466a73f6b12ceaa874f4e7f63d Mon Sep 17 00:00:00 2001 From: Tasfin Mahmud Date: Sat, 6 Jun 2026 23:21:08 +0600 Subject: [PATCH 3/3] style: fix black formatting --- .../test_distance_matching_estimator.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/causal_estimators/test_distance_matching_estimator.py b/tests/causal_estimators/test_distance_matching_estimator.py index 1d5e8cda5f..875338aaa7 100644 --- a/tests/causal_estimators/test_distance_matching_estimator.py +++ b/tests/causal_estimators/test_distance_matching_estimator.py @@ -156,15 +156,17 @@ def test_data_subset_refuter_with_categorical_columns(self): treatment = ((w_num + (w_cat == "high").astype(int) + rng.standard_normal(n)) > 0).astype(int) outcome = 5 * treatment + 2 * w_num + 3 * (w_cat == "high").astype(int) + rng.standard_normal(n) - df = pd.DataFrame({ - "W_cat": w_cat, - "W_num": w_num, - "v0": treatment, - "y": outcome, - }) + df = pd.DataFrame( + { + "W_cat": w_cat, + "W_num": w_num, + "v0": treatment, + "y": outcome, + } + ) gml = ( - 'graph [directed 1 ' + "graph [directed 1 " 'node [id "W_cat" label "W_cat"] ' 'node [id "W_num" label "W_num"] ' 'node [id "v0" label "v0"] '