Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion dowhy/causal_estimators/distance_matching_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,12 @@ def estimate_effect(
self._target_units = target_units
self._treatment_value = treatment_value
self._control_value = control_value
# Encode new data based on fitted encoders
observed_common_causes = self._encode(data[self._observed_common_causes_names], "observed_common_causes")

updated_df = pd.concat(
[
self._observed_common_causes,
observed_common_causes,
data[[self._target_estimand.outcome_variable[0], self._target_estimand.treatment_variable[0]]],
],
axis=1,
Expand Down
4 changes: 2 additions & 2 deletions dowhy/utils/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e

# Columns to keep in the result - not encoded.
columns_to_keep = data.columns.difference(data_to_encode.columns)
df_columns_to_keep = data[columns_to_keep].reset_index(drop=True)
df_columns_to_keep = data[columns_to_keep]

if encoder is None: # Create new encoder
drop = None
Expand All @@ -54,7 +54,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
# Convert the encoded data to a DataFrame
columns_encoded = encoder.get_feature_names_out(data_to_encode.columns)

df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded).reset_index(drop=True) # drop index from original
df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded, index=data_to_encode.index)

# Concatenate the encoded DataFrame with the original non-categorical columns
df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)
Expand Down
61 changes: 61 additions & 0 deletions tests/causal_estimators/test_distance_matching_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,67 @@ def test_non_binary_treatment_raises(self):
with pytest.raises(Exception, match="binary"):
model.estimate_effect(estimand, method_name="backdoor.distance_matching", target_units="att")

def test_data_subset_refuter_with_categorical_columns(self):
    """Regression test for Issue #1372: subset refutation with categorical causes.

    The data-subset refuter re-estimates the effect on a sampled subset of
    the rows, so the subset no longer carries a contiguous 0..n-1 index.
    With a categorical common cause in the graph, the one-hot encoded
    frame has to keep the index of the rows it was built from, and
    estimate_effect() has to encode the data it is handed instead of
    reusing the frame cached by fit(); otherwise the encoded causes do not
    line up with the treatment/outcome columns.

    Prior to the fix the refutation failed with:
        ValueError: Unalignable boolean Series provided as indexer
    """
    generator = np.random.default_rng(1372)
    sample_size = 400

    # One categorical and one numeric confounder. The draw order below
    # (category, numeric, treatment noise, outcome noise) determines the
    # generated data set for the fixed seed.
    category_cause = pd.Categorical(generator.choice(["low", "mid", "high"], size=sample_size))
    numeric_cause = generator.standard_normal(sample_size)
    is_high = (category_cause == "high").astype(int)

    treatment_noise = generator.standard_normal(sample_size)
    treatment = ((numeric_cause + is_high + treatment_noise) > 0).astype(int)

    outcome_noise = generator.standard_normal(sample_size)
    outcome = 5 * treatment + 2 * numeric_cause + 3 * is_high + outcome_noise

    frame = pd.DataFrame({"W_cat": category_cause, "W_num": numeric_cause, "v0": treatment, "y": outcome})

    # Both confounders point at the treatment and at the outcome.
    graph_spec = (
        "graph [directed 1 "
        'node [id "W_cat" label "W_cat"] '
        'node [id "W_num" label "W_num"] '
        'node [id "v0" label "v0"] '
        'node [id "y" label "y"] '
        'edge [source "W_cat" target "v0"] edge [source "W_cat" target "y"] '
        'edge [source "W_num" target "v0"] edge [source "W_num" target "y"] '
        'edge [source "v0" target "y"]]'
    )

    causal_model = CausalModel(data=frame, treatment="v0", outcome="y", graph=graph_spec)
    identified = causal_model.identify_effect(proceed_when_unidentifiable=True)
    att_estimate = causal_model.estimate_effect(
        identified,
        method_name="backdoor.distance_matching",
        target_units="att",
    )

    # Running the refuter to completion is the regression check itself:
    # the ValueError quoted in the docstring must no longer be raised.
    result = causal_model.refute_estimate(
        identified,
        att_estimate,
        method_name="data_subset_refuter",
        subset_fraction=0.8,
        num_simulations=3,
    )

    assert result is not None
    assert np.isfinite(result.new_effect), "Refuted effect should be finite"

def test_average_treatment_effect_via_simple_estimator(self):
"""Smoke test using the shared SimpleEstimator harness."""
tester = SimpleEstimator(error_tolerance=0.3, Estimator=DistanceMatchingEstimator)
Expand Down