diff --git a/dowhy/utils/encoding.py b/dowhy/utils/encoding.py
index 0d91f48256..b024af8df0 100644
--- a/dowhy/utils/encoding.py
+++ b/dowhy/utils/encoding.py
@@ -38,7 +38,7 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
 
     # Columns to keep in the result - not encoded.
     columns_to_keep = data.columns.difference(data_to_encode.columns)
-    df_columns_to_keep = data[columns_to_keep].reset_index(drop=True)
+    df_columns_to_keep = data[columns_to_keep]
 
     if encoder is None:  # Create new encoder
         drop = None
@@ -51,10 +51,12 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
     else:  # Use existing encoder
         encoded_data = encoder.transform(data_to_encode)
 
-    # Convert the encoded data to a DataFrame
+    # Convert the encoded data to a DataFrame, preserving the original index so that
+    # callers relying on index alignment (e.g. distance matching with a data subset) work
+    # correctly.
     columns_encoded = encoder.get_feature_names_out(data_to_encode.columns)
 
-    df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded).reset_index(drop=True)  # drop index from original
+    df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded, index=data_to_encode.index)
 
     # Concatenate the encoded DataFrame with the original non-categorical columns
     df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)
diff --git a/tests/utils/test_encoding.py b/tests/utils/test_encoding.py
index 3fb69062f7..4ecdca9349 100644
--- a/tests/utils/test_encoding.py
+++ b/tests/utils/test_encoding.py
@@ -85,3 +85,29 @@ def test_one_hot_encode_consistent_with_new_data():
     c_z2 = df_encoded2["C_Z"]
     assert c_z1[2] == c_z2[1]
     assert c_z1[5] == c_z2[5]
+
+
+def test_one_hot_encode_preserves_index():
+    """Regression test for https://github.com/py-why/dowhy/issues/1372.
+
+    When a DataFrame with a non-default (non-sequential) index is encoded,
+    the output must retain the original index so that index-aligned operations
+    downstream (e.g. pd.concat or boolean .loc indexing) continue to work
+    correctly. This scenario occurs whenever a data-subset refuter passes a
+    sampled subset of the original DataFrame to an estimator like
+    DistanceMatchingEstimator.
+    """
+    data = pd.DataFrame({"cat": ["a", "b", "a", "c", "b"], "num": [1.0, 2.0, 3.0, 4.0, 5.0]})
+    # Simulate a data-subset refuter that samples rows without resetting the index.
+    subset = data.iloc[[1, 3, 4]]
+
+    result, _ = one_hot_encode(subset)
+
+    assert list(result.index) == [1, 3, 4], "Index must be preserved after encoding"
+
+
+def test_one_hot_encode_preserves_index_no_categorical():
+    """Index must be preserved even when there are no categorical columns."""
+    data = pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=[5, 10, 15])
+    result, _ = one_hot_encode(data)
+    assert list(result.index) == [5, 10, 15], "Index must be preserved when no encoding is needed"