From 98b091beb7032e2395de78bb3f892dbe8afde1e0 Mon Sep 17 00:00:00 2001
From: Tasfin Mahmud <tasfinmahmud1@gmail.com>
Date: Tue, 26 May 2026 23:24:11 +0600
Subject: [PATCH 1/3] Fix data subset refuter index misalignment for
 categorical columns (Issue #1372)

Signed-off-by: Tasfin Mahmud <tasfinmahmud1@gmail.com>
---
 dowhy/causal_estimators/distance_matching_estimator.py | 5 ++++-
 dowhy/utils/encoding.py                                | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/dowhy/causal_estimators/distance_matching_estimator.py b/dowhy/causal_estimators/distance_matching_estimator.py
index 3d0bd70f2a..48a530d483 100644
--- a/dowhy/causal_estimators/distance_matching_estimator.py
+++ b/dowhy/causal_estimators/distance_matching_estimator.py
@@ -179,9 +179,12 @@ def estimate_effect(
         self._target_units = target_units
         self._treatment_value = treatment_value
         self._control_value = control_value
+        # Re-encode the data passed in (with the fitted encoders) instead of reusing the copy cached by fit()
+        observed_common_causes = self._encode(data[self._observed_common_causes_names], "observed_common_causes")
+
         updated_df = pd.concat(
             [
-                self._observed_common_causes,
+                observed_common_causes,
                 data[[self._target_estimand.outcome_variable[0], self._target_estimand.treatment_variable[0]]],
             ],
             axis=1,
diff --git a/dowhy/utils/encoding.py b/dowhy/utils/encoding.py
index 0d91f48256..ce3615ffe3 100644
--- a/dowhy/utils/encoding.py
+++ b/dowhy/utils/encoding.py
@@ -38,7 +38,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
 
     # Columns to keep in the result - not encoded.
     columns_to_keep = data.columns.difference(data_to_encode.columns)
-    df_columns_to_keep = data[columns_to_keep].reset_index(drop=True)
+    df_columns_to_keep = data[columns_to_keep]
 
     if encoder is None:  # Create new encoder
         drop = None
@@ -54,7 +54,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
     # Convert the encoded data to a DataFrame
     columns_encoded = encoder.get_feature_names_out(data_to_encode.columns)
 
-    df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded).reset_index(drop=True)  # drop index from original
+    df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded, index=data_to_encode.index)
 
     # Concatenate the encoded DataFrame with the original non-categorical columns
     df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)

From b58e9433860f0f7675878be4f52a74a5473b20dd Mon Sep 17 00:00:00 2001
From: Tasfin Mahmud <tasfinmahmud1@gmail.com>
Date: Tue, 2 Jun 2026 16:20:57 +0600
Subject: [PATCH 2/3] Add regression test for DistanceMatchingEstimator +
 DataSubsetRefuter with categorical columns (#1372)

Signed-off-by: Tasfin Mahmud <tasfinmahmud1@gmail.com>
---
 .../test_distance_matching_estimator.py       | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/tests/causal_estimators/test_distance_matching_estimator.py b/tests/causal_estimators/test_distance_matching_estimator.py
index 7549a6a39c..1d5e8cda5f 100644
--- a/tests/causal_estimators/test_distance_matching_estimator.py
+++ b/tests/causal_estimators/test_distance_matching_estimator.py
@@ -135,6 +135,65 @@ def test_non_binary_treatment_raises(self):
         with pytest.raises(Exception, match="binary"):
             model.estimate_effect(estimand, method_name="backdoor.distance_matching", target_units="att")
 
+    def test_data_subset_refuter_with_categorical_columns(self):
+        """Regression test for Issue #1372.
+
+        DataSubsetRefuter samples a subset of the DataFrame, which changes the
+        index. When common causes contain categorical columns, one_hot_encode
+        must preserve the original index so that the encoded DataFrame aligns
+        with the subsetted treatment/outcome columns. Additionally,
+        estimate_effect() must re-encode the (subsetted) data rather than
+        reusing the stale encoded cache from fit().
+
+        Before the fix, this raised:
+            pandas.errors.IndexingError: Unalignable boolean Series provided as indexer
+        """
+        rng = np.random.default_rng(1372)
+        n = 400
+        # Create a categorical common cause
+        w_cat = pd.Categorical(rng.choice(["low", "mid", "high"], size=n))
+        w_num = rng.standard_normal(n)
+        treatment = ((w_num + (w_cat == "high").astype(int) + rng.standard_normal(n)) > 0).astype(int)
+        outcome = 5 * treatment + 2 * w_num + 3 * (w_cat == "high").astype(int) + rng.standard_normal(n)
+
+        df = pd.DataFrame({
+            "W_cat": w_cat,
+            "W_num": w_num,
+            "v0": treatment,
+            "y": outcome,
+        })
+
+        gml = (
+            'graph [directed 1 '
+            'node [id "W_cat" label "W_cat"] '
+            'node [id "W_num" label "W_num"] '
+            'node [id "v0" label "v0"] '
+            'node [id "y" label "y"] '
+            'edge [source "W_cat" target "v0"] edge [source "W_cat" target "y"] '
+            'edge [source "W_num" target "v0"] edge [source "W_num" target "y"] '
+            'edge [source "v0" target "y"]]'
+        )
+
+        model = CausalModel(data=df, treatment="v0", outcome="y", graph=gml)
+        estimand = model.identify_effect(proceed_when_unidentifiable=True)
+        estimate = model.estimate_effect(
+            estimand,
+            method_name="backdoor.distance_matching",
+            target_units="att",
+        )
+
+        # The refutation must complete without raising an index-alignment error
+        refutation = model.refute_estimate(
+            estimand,
+            estimate,
+            method_name="data_subset_refuter",
+            subset_fraction=0.8,
+            num_simulations=3,
+        )
+
+        assert refutation is not None
+        assert np.isfinite(refutation.new_effect), "Refuted effect should be finite"
+
     def test_average_treatment_effect_via_simple_estimator(self):
         """Smoke test using the shared SimpleEstimator harness."""
         tester = SimpleEstimator(error_tolerance=0.3, Estimator=DistanceMatchingEstimator)

From 47f7f7a09cda0d466a73f6b12ceaa874f4e7f63d Mon Sep 17 00:00:00 2001
From: Tasfin Mahmud <tasfinmahmud1@gmail.com>
Date: Sat, 6 Jun 2026 23:21:08 +0600
Subject: [PATCH 3/3] style: fix black formatting

---
 .../test_distance_matching_estimator.py          | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/causal_estimators/test_distance_matching_estimator.py b/tests/causal_estimators/test_distance_matching_estimator.py
index 1d5e8cda5f..875338aaa7 100644
--- a/tests/causal_estimators/test_distance_matching_estimator.py
+++ b/tests/causal_estimators/test_distance_matching_estimator.py
@@ -156,15 +156,17 @@ def test_data_subset_refuter_with_categorical_columns(self):
         treatment = ((w_num + (w_cat == "high").astype(int) + rng.standard_normal(n)) > 0).astype(int)
         outcome = 5 * treatment + 2 * w_num + 3 * (w_cat == "high").astype(int) + rng.standard_normal(n)
 
-        df = pd.DataFrame({
-            "W_cat": w_cat,
-            "W_num": w_num,
-            "v0": treatment,
-            "y": outcome,
-        })
+        df = pd.DataFrame(
+            {
+                "W_cat": w_cat,
+                "W_num": w_num,
+                "v0": treatment,
+                "y": outcome,
+            }
+        )
 
         gml = (
-            'graph [directed 1 '
+            "graph [directed 1 "
             'node [id "W_cat" label "W_cat"] '
             'node [id "W_num" label "W_num"] '
             'node [id "v0" label "v0"] '