diff --git a/neuracore/core/endpoint.py b/neuracore/core/endpoint.py index 6ca4f40e0..f3a90276d 100644 --- a/neuracore/core/endpoint.py +++ b/neuracore/core/endpoint.py @@ -33,8 +33,8 @@ from neuracore.core.get_latest_sync_point import get_latest_sync_point from neuracore.core.utils.download import download_with_progress from neuracore.ml.logging.endpoint_log_streamer import EndpointLogStreamer +from neuracore.ml.preprocessing.base import PreprocessingConfiguration from neuracore.ml.utils.endpoint_storage_handler import EndpointStorageHandler -from neuracore.ml.utils.preprocessing_utils import PreprocessingConfiguration from .auth import get_auth from .const import API_URL, PING_ENDPOINT, PREDICT_ENDPOINT, SET_CHECKPOINT_ENDPOINT diff --git a/neuracore/core/utils/server.py b/neuracore/core/utils/server.py index cd409b267..239a9ab89 100644 --- a/neuracore/core/utils/server.py +++ b/neuracore/core/utils/server.py @@ -29,10 +29,8 @@ ) from neuracore.core.exceptions import InsufficientSynchronizedPointError from neuracore.ml.logging.json_line_formatter import JsonLineLogFormatter -from neuracore.ml.utils.preprocessing_utils import ( - PreprocessingConfiguration, - resolve_preprocessing_config, -) +from neuracore.ml.preprocessing.base import PreprocessingConfiguration +from neuracore.ml.utils.preprocessing_utils import resolve_preprocessing_config logger = logging.getLogger(__name__) diff --git a/neuracore/ml/datasets/pytorch_synchronized_dataset.py b/neuracore/ml/datasets/pytorch_synchronized_dataset.py index 37c65e3ed..0a8bbe8da 100644 --- a/neuracore/ml/datasets/pytorch_synchronized_dataset.py +++ b/neuracore/ml/datasets/pytorch_synchronized_dataset.py @@ -29,12 +29,10 @@ ) from neuracore.ml import BatchedTrainingSamples from neuracore.ml.datasets.pytorch_neuracore_dataset import PytorchNeuracoreDataset +from neuracore.ml.preprocessing.base import PreprocessingConfiguration from neuracore.ml.utils.json_serialization import JsonValue, to_json_serializable from neuracore.ml.utils.memory_monitor import MemoryMonitor -from neuracore.ml.utils.preprocessing_utils import ( - PreprocessingConfiguration, - apply_preprocessing_methods, -) +from neuracore.ml.utils.preprocessing_utils import apply_preprocessing_methods logger = logging.getLogger(__name__) diff --git a/neuracore/ml/preprocessing/__init__.py b/neuracore/ml/preprocessing/__init__.py index a692dfe5e..6ac677bd7 100644 --- a/neuracore/ml/preprocessing/__init__.py +++ b/neuracore/ml/preprocessing/__init__.py @@ -1,7 +1,8 @@ """Preprocessing runtime utilities.""" -from .base import PreprocessingMethod +from .base import PreprocessingConfiguration, PreprocessingMethod __all__ = [ + "PreprocessingConfiguration", "PreprocessingMethod", ] diff --git a/neuracore/ml/preprocessing/base.py b/neuracore/ml/preprocessing/base.py index 3f4fecd6e..9c7345948 100644 --- a/neuracore/ml/preprocessing/base.py +++ b/neuracore/ml/preprocessing/base.py @@ -37,3 +37,32 @@ def to_dict(self) -> dict[str, Any]: params[param_name] = getattr(self, param_name, None) return {"_target_": target_name, **params} + + def __str__(self) -> str: + """Return a human-readable representation of the preprocessing method.""" + params = {k: v for k, v in self.to_dict().items() if k != "_target_"} + param_str = ", ".join(f"{name}={value!r}" for name, value in params.items()) + return f"{self.__class__.__name__}({param_str})" + + def __repr__(self) -> str: + """Return a human-readable representation for debugging.""" + return self.__str__() + + +class PreprocessingConfiguration(dict[DataType, list[PreprocessingMethod]]): + """Runtime preprocessing pipeline keyed by data type.""" + + def __str__(self) -> str: + """Return a human-readable representation of the preprocessing pipeline.""" + if not self: + return "PreprocessingConfiguration({})" + lines = [] + for data_type in sorted(self, key=lambda dt: dt.value): + methods = self[data_type] + method_strs = ", ".join(str(method) for method in methods) + lines.append(f" {data_type.value}: [{method_strs}]") + return "PreprocessingConfiguration({\n" + "\n".join(lines) + "\n})" + + def __repr__(self) -> str: + """Return a human-readable representation for debugging.""" + return self.__str__() diff --git a/neuracore/ml/train.py b/neuracore/ml/train.py index 91e188ba8..2d6359299 100644 --- a/neuracore/ml/train.py +++ b/neuracore/ml/train.py @@ -37,6 +37,7 @@ from neuracore.ml.logging.cloud_training_logger import CloudTrainingLogger from neuracore.ml.logging.json_line_formatter import JsonLineLogFormatter from neuracore.ml.logging.tensorboard_training_logger import TensorboardTrainingLogger +from neuracore.ml.preprocessing.base import PreprocessingConfiguration from neuracore.ml.trainers.batch_autotuner import ( find_optimal_batch_size, is_valid_batch_size, @@ -49,10 +50,7 @@ from neuracore.ml.utils.algorithm_loader import AlgorithmLoader from neuracore.ml.utils.algorithm_storage_handler import AlgorithmStorageHandler from neuracore.ml.utils.device_utils import cpu_count, get_default_device -from neuracore.ml.utils.preprocessing_utils import ( - PreprocessingConfiguration, - resolve_preprocessing_config, -) +from neuracore.ml.utils.preprocessing_utils import resolve_preprocessing_config from neuracore.ml.utils.training_config import ( resolve_to_complete_config, resolve_user_input_config, diff --git a/neuracore/ml/utils/nc_archive.py b/neuracore/ml/utils/nc_archive.py index d8ad3f618..262f934b0 100644 --- a/neuracore/ml/utils/nc_archive.py +++ b/neuracore/ml/utils/nc_archive.py @@ -18,13 +18,11 @@ from omegaconf import OmegaConf from neuracore.ml.core.neuracore_model import NeuracoreModel +from neuracore.ml.preprocessing.base import PreprocessingConfiguration from neuracore.ml.utils.algorithm_loader import AlgorithmLoader from neuracore.ml.utils.device_utils import get_default_device from neuracore.ml.utils.json_serialization import to_json_serializable -from neuracore.ml.utils.preprocessing_utils import ( - PreprocessingConfiguration, - resolve_preprocessing_config, -) +from neuracore.ml.utils.preprocessing_utils import resolve_preprocessing_config logger = logging.getLogger(__name__) @@ -387,7 +385,9 @@ def load_model_from_nc_archive( _archive_path(extracted_files, "output_cross_embodiment_description") ) as f: output_cross_embodiment_description = json.load(f) - input_preprocessing_config: PreprocessingConfiguration = {} + input_preprocessing_config: PreprocessingConfiguration = ( + PreprocessingConfiguration() + ) if "input_preprocessing_config" in extracted_files: with open( _archive_path(extracted_files, "input_preprocessing_config") @@ -401,7 +401,9 @@ def load_model_from_nc_archive( logger.warning( "Input preprocessing config in model archive is empty" ) - output_preprocessing_config: PreprocessingConfiguration = {} + output_preprocessing_config: PreprocessingConfiguration = ( + PreprocessingConfiguration() + ) if "output_preprocessing_config" in extracted_files: with open( _archive_path(extracted_files, "output_preprocessing_config") diff --git a/neuracore/ml/utils/policy_inference.py b/neuracore/ml/utils/policy_inference.py index 67d8cc553..5f9b2c522 100644 --- a/neuracore/ml/utils/policy_inference.py +++ b/neuracore/ml/utils/policy_inference.py @@ -23,10 +23,10 @@ ) from neuracore.core.utils.http_session import thread_local_session from neuracore.ml import BatchedInferenceInputs +from neuracore.ml.preprocessing.base import PreprocessingConfiguration from neuracore.ml.utils.device_utils import get_default_device from neuracore.ml.utils.nc_archive import load_model_from_nc_archive from neuracore.ml.utils.preprocessing_utils import ( - PreprocessingConfiguration, apply_preprocessing_methods, validate_preprocessing_configuration, ) diff --git a/neuracore/ml/utils/preprocessing_utils.py b/neuracore/ml/utils/preprocessing_utils.py index 5515c1895..344c1c66f 100644 --- a/neuracore/ml/utils/preprocessing_utils.py +++ b/neuracore/ml/utils/preprocessing_utils.py @@ -10,9 +10,10 @@ if TYPE_CHECKING: from neuracore_types import BatchedNCData -from neuracore.ml.preprocessing.base import PreprocessingMethod - -PreprocessingConfiguration = dict[DataType, list[PreprocessingMethod]] +from neuracore.ml.preprocessing.base import ( + PreprocessingConfiguration, + PreprocessingMethod, +) def validate_preprocessing_configuration( @@ -56,10 +57,10 @@ def resolve_preprocessing_config( from hydra.utils import instantiate preprocessing_methods = instantiate(config_dict, _convert_="all") - resolved_config = { + resolved_config = PreprocessingConfiguration({ DataType(data_type): methods for data_type, methods in preprocessing_methods.items() - } + }) validate_preprocessing_configuration(preprocessing_config=resolved_config) return resolved_config diff --git a/neuracore/ml/utils/training_storage_handler.py b/neuracore/ml/utils/training_storage_handler.py index 5d787527a..5f328457f 100644 --- a/neuracore/ml/utils/training_storage_handler.py +++ b/neuracore/ml/utils/training_storage_handler.py @@ -13,8 +13,8 @@ from neuracore.core.config.get_current_org import get_current_org from neuracore.core.const import API_URL from neuracore.core.utils.http_session import thread_local_session +from neuracore.ml.preprocessing.base import PreprocessingConfiguration from neuracore.ml.utils.nc_archive import create_nc_archive -from neuracore.ml.utils.preprocessing_utils import PreprocessingConfiguration from neuracore.ml.utils.upload_storage_mixin import UploadStorageMixin logger = logging.getLogger(__name__) @@ -30,8 +30,12 @@ def __init__( algorithm_config: dict = {}, input_cross_embodiment_description: dict[str, Any] = {}, output_cross_embodiment_description: dict[str, Any] = {}, - input_preprocessing_config: PreprocessingConfiguration = {}, - output_preprocessing_config: PreprocessingConfiguration = {}, + input_preprocessing_config: PreprocessingConfiguration = ( + PreprocessingConfiguration() + ), + output_preprocessing_config: PreprocessingConfiguration = ( + PreprocessingConfiguration() + ), ) -> None: """Initialize the storage handler. diff --git a/neuracore/ml/utils/validate.py b/neuracore/ml/utils/validate.py index 0313ccf1f..085b86b71 100644 --- a/neuracore/ml/utils/validate.py +++ b/neuracore/ml/utils/validate.py @@ -27,9 +27,9 @@ import neuracore as nc from neuracore.ml.logging.json_line_formatter import JsonLineLogFormatter +from neuracore.ml.preprocessing.base import PreprocessingConfiguration from neuracore.ml.preprocessing.methods.resize_pad import ResizePad from neuracore.ml.utils.device_utils import get_default_device -from neuracore.ml.utils.preprocessing_utils import PreprocessingConfiguration from ..core.ml_types import BatchedTrainingOutputs, BatchedTrainingSamples from ..datasets.pytorch_dummy_dataset import MAX_LEN_PER_DATA_TYPE, PytorchDummyDataset @@ -150,14 +150,18 @@ def run_validation( logger.info(f"Supported output data types: {supported_output_data_types}") # Build validation preprocessing configuration - input_preprocessing_config: PreprocessingConfiguration = { - DataType.RGB_IMAGES: [ResizePad(size=(224, 224))], - DataType.DEPTH_IMAGES: [ResizePad(size=(224, 224))], - } - output_preprocessing_config: PreprocessingConfiguration = { - DataType.RGB_IMAGES: [ResizePad(size=(224, 224))], - DataType.DEPTH_IMAGES: [ResizePad(size=(224, 224))], - } + input_preprocessing_config: PreprocessingConfiguration = ( + PreprocessingConfiguration({ + DataType.RGB_IMAGES: [ResizePad(size=(224, 224))], + DataType.DEPTH_IMAGES: [ResizePad(size=(224, 224))], + }) + ) + output_preprocessing_config: PreprocessingConfiguration = ( + PreprocessingConfiguration({ + DataType.RGB_IMAGES: [ResizePad(size=(224, 224))], + DataType.DEPTH_IMAGES: [ResizePad(size=(224, 224))], + }) + ) # Create dummy cross-embodiment descriptions input_cross_embodiment_description = { diff --git a/tests/unit/ml/datasets/test_pytorch_synchronized_dataset.py b/tests/unit/ml/datasets/test_pytorch_synchronized_dataset.py index 9f87aeb89..372392f07 100644 --- a/tests/unit/ml/datasets/test_pytorch_synchronized_dataset.py +++ b/tests/unit/ml/datasets/test_pytorch_synchronized_dataset.py @@ -25,8 +25,8 @@ PytorchSynchronizedDataset, _cacheable_cross_embodiment_description, ) +from neuracore.ml.preprocessing.base import PreprocessingConfiguration from neuracore.ml.preprocessing.methods.resize_pad import ResizePad -from neuracore.ml.utils.preprocessing_utils import PreprocessingConfiguration DATA_ITEMS = 3 diff --git a/tests/unit/ml/preprocessing/test_preprocessing_utils.py b/tests/unit/ml/preprocessing/test_preprocessing_utils.py index 0bbe03085..e2a787a5a 100644 --- a/tests/unit/ml/preprocessing/test_preprocessing_utils.py +++ b/tests/unit/ml/preprocessing/test_preprocessing_utils.py @@ -40,3 +40,26 @@ def test_resolve_preprocessing_config_to_dict_is_json_serializable(): # before the fix. serialized = json.dumps(method.to_dict()) assert '"size"' in serialized + + +def test_preprocessing_configuration_str_is_human_readable(): + pytest.importorskip("hydra") + pytest.importorskip("torch") + cfg = OmegaConf.create({ + "RGB_IMAGES": [{ + "_target_": "neuracore.ml.preprocessing.methods.resize_pad.ResizePad", + "size": [224, 224], + }], + "DEPTH_IMAGES": [{ + "_target_": "neuracore.ml.preprocessing.methods.resize_pad.ResizePad", + "size": [224, 224], + }], + }) + resolved = resolve_preprocessing_config(cfg) + + rendered = str(resolved) + assert "PreprocessingConfiguration({" in rendered + assert "RGB_IMAGES: [ResizePad(size=[224, 224])]" in rendered + assert "DEPTH_IMAGES: [ResizePad(size=[224, 224])]" in rendered + assert "object at 0x" not in rendered + assert "DataType." not in rendered