py-why · TasfinMahmud · May 26, 2026 · Jun 2, 2026 · Jun 6, 2026
diff --git a/dowhy/causal_estimators/distance_matching_estimator.py b/dowhy/causal_estimators/distance_matching_estimator.py
@@ -179,9 +179,12 @@ def estimate_effect(
         self._target_units = target_units
         self._treatment_value = treatment_value
         self._control_value = control_value
+        # Re-encode the data passed in (it may differ from the fit data) using the fitted encoders
+        observed_common_causes = self._encode(data[self._observed_common_causes_names], "observed_common_causes")
+
         updated_df = pd.concat(
             [
-                self._observed_common_causes,
+                observed_common_causes,
                 data[[self._target_estimand.outcome_variable[0], self._target_estimand.treatment_variable[0]]],
             ],
             axis=1,

diff --git a/dowhy/utils/encoding.py b/dowhy/utils/encoding.py
@@ -38,7 +38,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
 
     # Columns to keep in the result - not encoded.
     columns_to_keep = data.columns.difference(data_to_encode.columns)
-    df_columns_to_keep = data[columns_to_keep].reset_index(drop=True)
+    df_columns_to_keep = data[columns_to_keep]
 
     if encoder is None:  # Create new encoder
         drop = None
@@ -54,7 +54,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
     # Convert the encoded data to a DataFrame
     columns_encoded = encoder.get_feature_names_out(data_to_encode.columns)
 
-    df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded).reset_index(drop=True)  # drop index from original
+    df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded, index=data_to_encode.index)
 
     # Concatenate the encoded DataFrame with the original non-categorical columns
     df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)

diff --git a/tests/causal_estimators/test_distance_matching_estimator.py b/tests/causal_estimators/test_distance_matching_estimator.py
@@ -135,6 +135,67 @@ def test_non_binary_treatment_raises(self):
         with pytest.raises(Exception, match="binary"):
             model.estimate_effect(estimand, method_name="backdoor.distance_matching", target_units="att")
 
+    def test_data_subset_refuter_with_categorical_columns(self):
+        """Regression test for Issue #1372.
+
+        DataSubsetRefuter samples a subset of the DataFrame, which changes the
+        index. When common causes contain categorical columns, one_hot_encode
+        must preserve the original index so that the encoded DataFrame aligns
+        with the subsetted treatment/outcome columns. Additionally,
+        estimate_effect() must re-encode the (subsetted) data rather than
+        reusing the stale encoded cache from fit().
+
+        Before the fix, this raised:
+            ValueError: Unalignable boolean Series provided as indexer
+        """
+        rng = np.random.default_rng(1372)
+        n = 400
+        # Create a categorical common cause
+        w_cat = pd.Categorical(rng.choice(["low", "mid", "high"], size=n))
+        w_num = rng.standard_normal(n)
+        treatment = ((w_num + (w_cat == "high").astype(int) + rng.standard_normal(n)) > 0).astype(int)
+        outcome = 5 * treatment + 2 * w_num + 3 * (w_cat == "high").astype(int) + rng.standard_normal(n)
+
+        df = pd.DataFrame(
+            {
+                "W_cat": w_cat,
+                "W_num": w_num,
+                "v0": treatment,
+                "y": outcome,
+            }
+        )
+
+        gml = (
+            "graph [directed 1 "
+            'node [id "W_cat" label "W_cat"] '
+            'node [id "W_num" label "W_num"] '
+            'node [id "v0" label "v0"] '
+            'node [id "y" label "y"] '
+            'edge [source "W_cat" target "v0"] edge [source "W_cat" target "y"] '
+            'edge [source "W_num" target "v0"] edge [source "W_num" target "y"] '
+            'edge [source "v0" target "y"]]'
+        )
+
+        model = CausalModel(data=df, treatment="v0", outcome="y", graph=gml)
+        estimand = model.identify_effect(proceed_when_unidentifiable=True)
+        estimate = model.estimate_effect(
+            estimand,
+            method_name="backdoor.distance_matching",
+            target_units="att",
+        )
+
+        # The refutation must complete without raising ValueError
+        refutation = model.refute_estimate(
+            estimand,
+            estimate,
+            method_name="data_subset_refuter",
+            subset_fraction=0.8,
+            num_simulations=3,
+        )
+
+        assert refutation is not None
+        assert np.isfinite(refutation.new_effect), "Refuted effect should be finite"
+
     def test_average_treatment_effect_via_simple_estimator(self):
         """Smoke test using the shared SimpleEstimator harness."""
         tester = SimpleEstimator(error_tolerance=0.3, Estimator=DistanceMatchingEstimator)