From b6a31f61206a2f3a98e7cde1ee1f3fd471edae8d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 30 Mar 2026 17:21:23 +0000 Subject: [PATCH] fix: remove numpy/scipy symbol re-exports from datasets module (issue #981) Remove bare 'from numpy.random import choice' and replace the three call sites with 'np.random.choice(...)'. This stops the numpy 'choice' symbol from appearing as part of dowhy.datasets's public API. Remove bare 'from scipy.stats import bernoulli, halfnorm, poisson, uniform' and replace all call sites in sales_dataset() with the already-imported alias 'ss.*' (scipy.stats is already imported as 'ss'). This stops four scipy distribution objects from polluting the module namespace and confusing Sphinx source-link generation. No functional change; behaviour is identical. Closes #981 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- dowhy/datasets.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/dowhy/datasets.py b/dowhy/datasets.py index 6d92167864..9be4e3ebf2 100755 --- a/dowhy/datasets.py +++ b/dowhy/datasets.py @@ -7,8 +7,6 @@ import numpy as np import pandas as pd import scipy.stats as ss -from numpy.random import choice -from scipy.stats import bernoulli, halfnorm, poisson, uniform from sklearn.neural_network import MLPRegressor from dowhy.utils.graph_operations import add_edge, del_edge, get_random_node_pair, get_simple_ordered_tree @@ -25,14 +23,14 @@ def sigmoid(x): def convert_to_binary(x, stochastic=True): p = sigmoid(x) if stochastic: - return choice([0, 1], p=[1 - p, p]) + return np.random.choice([0, 1], p=[1 - p, p]) else: return int(p > 0.5) def stochastically_convert_to_three_level_categorical(x): p = sigmoid(x) - return choice([0, 1, 2], p=[0.8 * (1 - p), 0.8 * p, 0.2]) + return np.random.choice([0, 1, 2], p=[0.8 * (1 - p), 0.8 * p, 0.2]) def convert_to_categorical(arr, num_vars, num_discrete_vars, quantiles=[0.25, 0.5, 0.75], one_hot_encode=False): @@ -590,7 +588,7 @@ def create_discrete_column(num_samples, std_dev=1): xL, scale=std_dev ) # probability of selecting a number x is p(x-0.5 < x < x+0.5) where x is a normal random variable with mean 0 and standard deviation std_dev prob = prob / prob.sum() # normalize the probabilities so their sum is 1 - nums = choice(a=x, size=num_samples, p=prob) # pick up an element + nums = np.random.choice(a=x, size=num_samples, p=prob) # pick up an element return nums @@ -1099,22 +1097,22 @@ def sales_dataset( df[ad_spend_col] = ( based_ad_spending - + df[shopping_event_col] * uniform.rvs(loc=1000, scale=1000, size=df.shape[0]) - + (1 - df[shopping_event_col]) * uniform.rvs(loc=100, scale=400, size=df.shape[0]) + + df[shopping_event_col] * ss.uniform.rvs(loc=1000, scale=1000, size=df.shape[0]) + + (1 - df[shopping_event_col]) * ss.uniform.rvs(loc=100, scale=400, size=df.shape[0]) ) df[page_visit_col] = ( - poisson.rvs(mu=10000 * page_visitor_factor, size=df.shape[0]) - + uniform.rvs(loc=5000 * page_visitor_factor, scale=5000, size=df.shape[0]) * df[shopping_event_col] - + halfnorm.rvs(loc=0.5 * page_visitor_factor, scale=0.01, size=df.shape[0]) * df[ad_spend_col] - + halfnorm.rvs(loc=1000 * page_visitor_factor, scale=100, size=df.shape[0]) + ss.poisson.rvs(mu=10000 * page_visitor_factor, size=df.shape[0]) + + ss.uniform.rvs(loc=5000 * page_visitor_factor, scale=5000, size=df.shape[0]) * df[shopping_event_col] + + ss.halfnorm.rvs(loc=0.5 * page_visitor_factor, scale=0.01, size=df.shape[0]) * df[ad_spend_col] + + ss.halfnorm.rvs(loc=1000 * page_visitor_factor, scale=100, size=df.shape[0]) ) df[page_visit_col] = df[page_visit_col].astype(int) df[price_col] = ( base_price - + uniform.rvs(loc=-200, scale=200, size=df.shape[0]) * df[shopping_event_col] - + bernoulli.rvs(p=0.02, size=df.shape[0]) * uniform.rvs(loc=-20, scale=20, size=df.shape[0]) + + ss.uniform.rvs(loc=-200, scale=200, size=df.shape[0]) * df[shopping_event_col] + + ss.bernoulli.rvs(p=0.02, size=df.shape[0]) * ss.uniform.rvs(loc=-20, scale=20, size=df.shape[0]) ) price_changes = 1 - df[price_col] / original_product_price @@ -1123,9 +1121,9 @@ def sales_dataset( price_changes[price_changes == 0] = 1 df[units_sold_col] = [ - poisson.rvs( + ss.poisson.rvs( mu=demand_changes.iloc[i] / price_changes.iloc[i] * 0.2 * df[page_visit_col].iloc[i] - + uniform.rvs(loc=100, scale=1000) * df[shopping_event_col].iloc[i] + + ss.uniform.rvs(loc=100, scale=1000) * df[shopping_event_col].iloc[i] ) for i in range(df.shape[0]) ] @@ -1136,7 +1134,7 @@ def sales_dataset( df[operation_col] = ( df[ad_spend_col] + product_production_cost * df[units_sold_col] - + halfnorm.rvs(loc=500000, scale=10, size=df.shape[0]) + + ss.halfnorm.rvs(loc=500000, scale=10, size=df.shape[0]) ) df[profit_col] = df[revenue_col] - df[operation_col]