Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 38 additions & 7 deletions dowhy/causal_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,16 +203,47 @@ def get_new_estimator_object(
return new_estimator

def estimate_effect_naive(self, data: pd.DataFrame):
    """Compute a naive (unadjusted) observational difference as a baseline for effect strength.

    Estimates E[Y | T = treatment_value] - E[Y | T = control_value] from the raw data without
    any causal adjustment. This is used internally by :meth:`evaluate_effect_strength` as the
    denominator when computing the ``fraction-effect`` statistic.

    The treatment and control levels are read from ``self._treatment_value`` and
    ``self._control_value`` rather than being hardcoded to 1 and 0.

    :param data: Pandas dataframe to estimate effect
    :returns: CausalEstimate with the naive observational difference
    """
    treatment_var = self._target_estimand.treatment_variable
    outcome_var = self._target_estimand.outcome_variable

    if len(treatment_var) == 1:
        # Single treatment: index by the column name (not the one-element list) so the
        # comparison yields a 1-D boolean Series that ``.loc`` accepts as a row mask.
        treatment_col = data[treatment_var[0]]
        mask_with = treatment_col == self._treatment_value
        mask_without = treatment_col == self._control_value
    else:
        # Multiple treatments: broadcast scalar or per-treatment values and combine row-wise
        t_val = self._treatment_value
        c_val = self._control_value
        if not isinstance(t_val, (list, tuple)):
            t_val = [t_val] * len(treatment_var)
        if not isinstance(c_val, (list, tuple)):
            c_val = [c_val] * len(treatment_var)
        # A row is selected only when every treatment column matches its target value.
        mask_with = (data[treatment_var] == t_val).all(axis=1)
        mask_without = (data[treatment_var] == c_val).all(axis=1)

    df_withtreatment = data.loc[mask_with]
    df_notreatment = data.loc[mask_without]
    est = np.mean(df_withtreatment[outcome_var]) - np.mean(df_notreatment[outcome_var])
    return CausalEstimate(
        data,
        None,
        None,
        est,
        None,
        None,
        control_value=self._control_value,
        treatment_value=self._treatment_value,
    )

def _estimate_effect_fn(self, data_df):
"""Function used in conditional effect estimation. This function is to be overridden by each child estimator.
Expand Down
64 changes: 64 additions & 0 deletions tests/causal_estimators/test_linear_regression_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,3 +296,67 @@ def test_invalid_identifier_method_raises(self, invalid_method):
estimator = LinearRegressionEstimator(identified_estimand=target_estimand)
with pytest.raises(ValueError, match="only supports backdoor and general_adjustment"):
estimator.fit(data["df"])

def test_evaluate_effect_strength_binary_treatment(self):
    """Check that evaluate_effect_strength runs cleanly for one binary treatment.

    Regression test for #416: estimate_effect_naive indexed the frame with a list of
    column names instead of a single column name, so the mask was a DataFrame and
    `.loc[]` failed with `ValueError: Cannot index with multidimensional key`.
    """
    dataset = dowhy.datasets.linear_dataset(
        beta=10,
        num_common_causes=1,
        num_instruments=0,
        num_treatments=1,
        num_samples=500,
        treatment_is_binary=True,
    )
    frame = dataset["df"]

    estimand = identify_effect_auto(
        build_graph_from_str(dataset["gml_graph"]),
        observed_nodes=list(frame.columns),
        action_nodes=dataset["treatment_name"],
        outcome_nodes=dataset["outcome_name"],
        estimand_type=EstimandType.NONPARAMETRIC_ATE,
    )
    estimand.set_identifier_method("backdoor")

    estimator = LinearRegressionEstimator(identified_estimand=estimand)
    estimator.fit(frame)
    ate_estimate = estimator.estimate_effect(frame, control_value=0, treatment_value=1)

    # The call below is the one that used to raise the multidimensional-key ValueError.
    result = estimator.evaluate_effect_strength(frame, ate_estimate)
    assert "fraction-effect" in result
    assert np.isfinite(result["fraction-effect"])

def test_evaluate_effect_strength_non_binary_treatment(self):
    """estimate_effect_naive must respect actual treatment_value / control_value, not hardcoded 0/1.

    Regression test for #416: the old code used hardcoded ``== 1`` and ``== 0``, so
    treatments coded with other levels (here control_value=1, treatment_value=2) would
    silently compute the wrong effect-strength ratio: ``== 0`` selects no rows, which
    makes the naive estimate (and therefore ``fraction-effect``) non-finite.
    """
    data = dowhy.datasets.linear_dataset(
        beta=10,
        num_common_causes=1,
        num_instruments=0,
        num_treatments=1,
        num_samples=1000,
        treatment_is_binary=False,
    )
    df = data["df"]
    treatment_name = data["treatment_name"][0]
    # Recode the continuous treatment to the two levels {1, 2} (deliberately NOT {0, 1})
    # so that both groups are populated and a hardcoded 0/1 comparison cannot pass.
    df[treatment_name] = np.where(df[treatment_name] > 0, 2, 1)
    target_estimand = identify_effect_auto(
        build_graph_from_str(data["gml_graph"]),
        observed_nodes=list(df.columns),
        action_nodes=data["treatment_name"],
        outcome_nodes=data["outcome_name"],
        estimand_type=EstimandType.NONPARAMETRIC_ATE,
    )
    target_estimand.set_identifier_method("backdoor")
    estimator = LinearRegressionEstimator(identified_estimand=target_estimand)
    estimator.fit(df)
    ate_estimate = estimator.estimate_effect(df, control_value=1, treatment_value=2)
    # Should not raise; fraction-effect must be a finite number
    strength = estimator.evaluate_effect_strength(df, ate_estimate)
    assert "fraction-effect" in strength
    assert np.isfinite(strength["fraction-effect"])