From 6dae6be8c5591a08fd52bc950c346442666e551f Mon Sep 17 00:00:00 2001 From: Yevgeni Litvin Date: Sun, 25 Jul 2021 21:48:01 -0700 Subject: [PATCH 1/2] Remove very old pickle compatibility code modifying old atg package names The code was meant to support opening some old datasets that contained pickled data with pre-petastorm names. --- petastorm/etl/dataset_metadata.py | 4 +- petastorm/etl/legacy.py | 79 ------------------- petastorm/etl/rowgroup_indexing.py | 4 +- petastorm/etl/safe_pickle.py | 43 ++++++++++ .../tests/test_reading_legacy_datasets.py | 38 --------- 5 files changed, 47 insertions(+), 121 deletions(-) delete mode 100644 petastorm/etl/legacy.py create mode 100644 petastorm/etl/safe_pickle.py delete mode 100644 petastorm/tests/test_reading_legacy_datasets.py diff --git a/petastorm/etl/dataset_metadata.py b/petastorm/etl/dataset_metadata.py index 77c192ca3..48dd782cf 100644 --- a/petastorm/etl/dataset_metadata.py +++ b/petastorm/etl/dataset_metadata.py @@ -25,7 +25,7 @@ from six.moves import cPickle as pickle from petastorm import utils -from petastorm.etl.legacy import depickle_legacy_package_name_compatible +from petastorm.etl.safe_pickle import safe_loads from petastorm.fs_utils import FilesystemResolver, get_filesystem_and_path_or_paths, get_dataset_path from petastorm.unischema import Unischema @@ -380,7 +380,7 @@ def get_schema(dataset): # Since we have moved the unischema class around few times, unpickling old schemas will not work. In this case we # override the old import path to get backwards compatibility - schema = depickle_legacy_package_name_compatible(ser_schema) + schema = safe_loads(ser_schema) return schema diff --git a/petastorm/etl/legacy.py b/petastorm/etl/legacy.py deleted file mode 100644 index 2a14ca614..000000000 --- a/petastorm/etl/legacy.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2017-2018 Uber Technologies, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging - -from six.moves import cPickle as pickle - -import io - -# List of packages which symbols are allowed to be loaded -safe_modules = { - "petastorm", - "collections", - "numpy", - "pyspark", - "decimal", - "builtins", - "copy_reg", - "__builtin__", -} - - -class RestrictedUnpickler(pickle.Unpickler): - - def find_class(self, module, name): - """Only allow safe classes from builtins""" - package_name = module.split(".")[0] - if package_name in safe_modules: - return super().find_class(module, name) - else: - # Forbid everything else - raise pickle.UnpicklingError("global '%s.%s' is forbidden" % (module, name)) - - -def restricted_loads(s): - """Helper function analogous to pickle.loads()""" - return RestrictedUnpickler(io.BytesIO(s)).load() - - -logger = logging.getLogger(__name__) - - -def depickle_legacy_package_name_compatible(pickled_string): - """Backward compatible way of depickling old pickled strings. - - Previously petastorm package was named differently. In order to be able to load older datasets, we modify - module names in the pickled stream with the new ones. - - :param pickled_string: A pickled string to be passed to pickle.loads - :return: - """ - LEGACY_PACKAGE_NAMES = ['av.experimental.deepdrive.dataset_toolkit', 'av.ml.dataset_toolkit'] - LEGACY_MODULES = ['codecs', 'unischema', 'sequence'] - - for legacy_package_name in LEGACY_PACKAGE_NAMES: - for legacy_module in LEGACY_MODULES: - # Substitute module names directly in the pickled stream. Encode as 'ascii' to make sure no non-ascii - # character made its way into package/module name - legacy_package_entry = '\n(c{}.{}\n'.format(legacy_package_name, legacy_module).encode('ascii') - new_module_name = '\n(cpetastorm.{}\n'.format(legacy_module).encode('ascii') - modified_pickled_string = pickled_string.replace(legacy_package_entry, new_module_name) - if modified_pickled_string != pickled_string: - logger.warning('Depickling "%s.%s" which has moved to "petastorm.%s". ' - 'Regenerate metadata.', legacy_package_name, legacy_module, legacy_module) - - pickled_string = modified_pickled_string - - return restricted_loads(pickled_string) diff --git a/petastorm/etl/rowgroup_indexing.py b/petastorm/etl/rowgroup_indexing.py index f0f8b067f..2f218ce07 100644 --- a/petastorm/etl/rowgroup_indexing.py +++ b/petastorm/etl/rowgroup_indexing.py @@ -22,7 +22,7 @@ from petastorm import utils from petastorm.etl import dataset_metadata -from petastorm.etl.legacy import depickle_legacy_package_name_compatible +from petastorm.etl.safe_pickle import safe_loads from petastorm.fs_utils import FilesystemResolver logger = logging.getLogger(__name__) @@ -154,5 +154,5 @@ def get_row_group_indexes(dataset): serialized_indexes = dataset_metadata_dict[ROWGROUPS_INDEX_KEY] - index_dict = depickle_legacy_package_name_compatible(serialized_indexes) + index_dict = safe_loads(serialized_indexes) return index_dict diff --git a/petastorm/etl/safe_pickle.py b/petastorm/etl/safe_pickle.py new file mode 100644 index 000000000..889006422 --- /dev/null +++ b/petastorm/etl/safe_pickle.py @@ -0,0 +1,43 @@ +# Copyright (c) 2017-2018 Uber Technologies, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import pickle + +# List of packages which symbols are allowed to be loaded +safe_modules = { + "petastorm", + "collections", + "numpy", + "pyspark", + "decimal", + "builtins", +} + + +class RestrictedUnpickler(pickle.Unpickler): + + def find_class(self, module, name): + """Only allow safe classes from builtins""" + package_name = module.split(".")[0] + if package_name in safe_modules: + return super().find_class(module, name) + else: + # Forbid everything else + raise pickle.UnpicklingError(f"global {module, name} is forbidden") + + +def safe_loads(s): + """Helper function analogous to pickle.loads()""" + return RestrictedUnpickler(io.BytesIO(s)).load() diff --git a/petastorm/tests/test_reading_legacy_datasets.py b/petastorm/tests/test_reading_legacy_datasets.py deleted file mode 100644 index e62431e27..000000000 --- a/petastorm/tests/test_reading_legacy_datasets.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2017-2018 Uber Technologies, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -import pytest - -from petastorm import make_reader - - -def dataset_urls(): - """Returns a list of legacy datasets available for testing""" - legacy_data_directory = os.path.join(os.path.dirname(__file__), 'data', 'legacy') - versions = os.listdir(legacy_data_directory) - urls = ['file://' + os.path.join(legacy_data_directory, v) for v in versions] - return urls - - -@pytest.mark.parametrize('legacy_dataset_url', dataset_urls()) -def test_reading_legacy_dataset(legacy_dataset_url): - """The test runs for a single legacy dataset. Opens the dataset using `make_reader` and reads all records from it""" - with make_reader(legacy_dataset_url, workers_count=1) as reader: - all_data = list(reader) - - # Some basic check on the data - assert len(all_data) == 100 - assert len(all_data[0]._fields) > 5 - assert all_data[0].matrix.shape == (32, 16, 3) From 513533b77cb4e5532804e2d02a201823f0b6151d Mon Sep 17 00:00:00 2001 From: Yevgeni Litvin Date: Mon, 26 Jul 2021 10:49:00 -0700 Subject: [PATCH 2/2] Resurrecting test_reading_legacy_dataset test It was removed by mistake. The test actually tests an ability to read a dataset created with previous version of petastorm. --- petastorm/etl/safe_pickle.py | 2 + .../tests/test_reading_legacy_datasets.py | 38 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 petastorm/tests/test_reading_legacy_datasets.py diff --git a/petastorm/etl/safe_pickle.py b/petastorm/etl/safe_pickle.py index 889006422..835ff6318 100644 --- a/petastorm/etl/safe_pickle.py +++ b/petastorm/etl/safe_pickle.py @@ -23,6 +23,8 @@ "pyspark", "decimal", "builtins", + "copy_reg", + "__builtin__", } diff --git a/petastorm/tests/test_reading_legacy_datasets.py b/petastorm/tests/test_reading_legacy_datasets.py new file mode 100644 index 000000000..e62431e27 --- /dev/null +++ b/petastorm/tests/test_reading_legacy_datasets.py @@ -0,0 +1,38 @@ +# Copyright (c) 2017-2018 Uber Technologies, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import pytest + +from petastorm import make_reader + + +def dataset_urls(): + """Returns a list of legacy datasets available for testing""" + legacy_data_directory = os.path.join(os.path.dirname(__file__), 'data', 'legacy') + versions = os.listdir(legacy_data_directory) + urls = ['file://' + os.path.join(legacy_data_directory, v) for v in versions] + return urls + + +@pytest.mark.parametrize('legacy_dataset_url', dataset_urls()) +def test_reading_legacy_dataset(legacy_dataset_url): + """The test runs for a single legacy dataset. Opens the dataset using `make_reader` and reads all records from it""" + with make_reader(legacy_dataset_url, workers_count=1) as reader: + all_data = list(reader) + + # Some basic check on the data + assert len(all_data) == 100 + assert len(all_data[0]._fields) > 5 + assert all_data[0].matrix.shape == (32, 16, 3)