From 6dae6be8c5591a08fd52bc950c346442666e551f Mon Sep 17 00:00:00 2001
From: Yevgeni Litvin <yevgeni@aurora.tech>
Date: Sun, 25 Jul 2021 21:48:01 -0700
Subject: [PATCH 1/2] Remove very old pickle compatibility code modifying old
 atg package names

The code was meant to support opening some old datasets that contained
pickled data with pre-petastorm names.
---
 petastorm/etl/dataset_metadata.py             |  4 +-
 petastorm/etl/legacy.py                       | 79 -------------------
 petastorm/etl/rowgroup_indexing.py            |  4 +-
 petastorm/etl/safe_pickle.py                  | 43 ++++++++++
 .../tests/test_reading_legacy_datasets.py     | 38 ---------
 5 files changed, 47 insertions(+), 121 deletions(-)
 delete mode 100644 petastorm/etl/legacy.py
 create mode 100644 petastorm/etl/safe_pickle.py
 delete mode 100644 petastorm/tests/test_reading_legacy_datasets.py

diff --git a/petastorm/etl/dataset_metadata.py b/petastorm/etl/dataset_metadata.py
index 77c192ca3..48dd782cf 100644
--- a/petastorm/etl/dataset_metadata.py
+++ b/petastorm/etl/dataset_metadata.py
@@ -25,7 +25,7 @@
 from six.moves import cPickle as pickle
 
 from petastorm import utils
-from petastorm.etl.legacy import depickle_legacy_package_name_compatible
+from petastorm.etl.safe_pickle import safe_loads
 from petastorm.fs_utils import FilesystemResolver, get_filesystem_and_path_or_paths, get_dataset_path
 from petastorm.unischema import Unischema
 
@@ -380,7 +380,7 @@ def get_schema(dataset):
     # Since we have moved the unischema class around few times, unpickling old schemas will not work. In this case we
     # override the old import path to get backwards compatibility
 
-    schema = depickle_legacy_package_name_compatible(ser_schema)
+    schema = safe_loads(ser_schema)
 
     return schema
 
diff --git a/petastorm/etl/legacy.py b/petastorm/etl/legacy.py
deleted file mode 100644
index 2a14ca614..000000000
--- a/petastorm/etl/legacy.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#  Copyright (c) 2017-2018 Uber Technologies, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-
-from six.moves import cPickle as pickle
-
-import io
-
-# List of packages which symbols are allowed to be loaded
-safe_modules = {
-    "petastorm",
-    "collections",
-    "numpy",
-    "pyspark",
-    "decimal",
-    "builtins",
-    "copy_reg",
-    "__builtin__",
-}
-
-
-class RestrictedUnpickler(pickle.Unpickler):
-
-    def find_class(self, module, name):
-        """Only allow safe classes from builtins"""
-        package_name = module.split(".")[0]
-        if package_name in safe_modules:
-            return super().find_class(module, name)
-        else:
-            # Forbid everything else
-            raise pickle.UnpicklingError("global '%s.%s' is forbidden" % (module, name))
-
-
-def restricted_loads(s):
-    """Helper function analogous to pickle.loads()"""
-    return RestrictedUnpickler(io.BytesIO(s)).load()
-
-
-logger = logging.getLogger(__name__)
-
-
-def depickle_legacy_package_name_compatible(pickled_string):
-    """Backward compatible way of depickling old pickled strings.
-
-    Previously petastorm package was named differently. In order to be able to load older datasets, we modify
-    module names in the pickled stream with the new ones.
-
-    :param pickled_string: A pickled string to be passed to pickle.loads
-    :return:
-    """
-    LEGACY_PACKAGE_NAMES = ['av.experimental.deepdrive.dataset_toolkit', 'av.ml.dataset_toolkit']
-    LEGACY_MODULES = ['codecs', 'unischema', 'sequence']
-
-    for legacy_package_name in LEGACY_PACKAGE_NAMES:
-        for legacy_module in LEGACY_MODULES:
-            # Substitute module names directly in the pickled stream. Encode as 'ascii' to make sure no non-ascii
-            # character made its way into package/module name
-            legacy_package_entry = '\n(c{}.{}\n'.format(legacy_package_name, legacy_module).encode('ascii')
-            new_module_name = '\n(cpetastorm.{}\n'.format(legacy_module).encode('ascii')
-            modified_pickled_string = pickled_string.replace(legacy_package_entry, new_module_name)
-            if modified_pickled_string != pickled_string:
-                logger.warning('Depickling "%s.%s" which has moved to "petastorm.%s". '
-                               'Regenerate metadata.', legacy_package_name, legacy_module, legacy_module)
-
-            pickled_string = modified_pickled_string
-
-    return restricted_loads(pickled_string)
diff --git a/petastorm/etl/rowgroup_indexing.py b/petastorm/etl/rowgroup_indexing.py
index f0f8b067f..2f218ce07 100644
--- a/petastorm/etl/rowgroup_indexing.py
+++ b/petastorm/etl/rowgroup_indexing.py
@@ -22,7 +22,7 @@
 
 from petastorm import utils
 from petastorm.etl import dataset_metadata
-from petastorm.etl.legacy import depickle_legacy_package_name_compatible
+from petastorm.etl.safe_pickle import safe_loads
 from petastorm.fs_utils import FilesystemResolver
 
 logger = logging.getLogger(__name__)
@@ -154,5 +154,5 @@ def get_row_group_indexes(dataset):
 
     serialized_indexes = dataset_metadata_dict[ROWGROUPS_INDEX_KEY]
 
-    index_dict = depickle_legacy_package_name_compatible(serialized_indexes)
+    index_dict = safe_loads(serialized_indexes)
     return index_dict
diff --git a/petastorm/etl/safe_pickle.py b/petastorm/etl/safe_pickle.py
new file mode 100644
index 000000000..889006422
--- /dev/null
+++ b/petastorm/etl/safe_pickle.py
@@ -0,0 +1,43 @@
+#  Copyright (c) 2017-2018 Uber Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import pickle
+
+# List of packages which symbols are allowed to be loaded
+safe_modules = {
+    "petastorm",
+    "collections",
+    "numpy",
+    "pyspark",
+    "decimal",
+    "builtins",
+}
+
+
+class RestrictedUnpickler(pickle.Unpickler):
+
+    def find_class(self, module, name):
+        """Only allow safe classes from builtins"""
+        package_name = module.split(".")[0]
+        if package_name in safe_modules:
+            return super().find_class(module, name)
+        else:
+            # Forbid everything else
+            raise pickle.UnpicklingError(f"global {module, name} is forbidden")
+
+
+def safe_loads(s):
+    """Helper function analogous to pickle.loads()"""
+    return RestrictedUnpickler(io.BytesIO(s)).load()
diff --git a/petastorm/tests/test_reading_legacy_datasets.py b/petastorm/tests/test_reading_legacy_datasets.py
deleted file mode 100644
index e62431e27..000000000
--- a/petastorm/tests/test_reading_legacy_datasets.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#  Copyright (c) 2017-2018 Uber Technologies, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-
-import pytest
-
-from petastorm import make_reader
-
-
-def dataset_urls():
-    """Returns a list of legacy datasets available for testing"""
-    legacy_data_directory = os.path.join(os.path.dirname(__file__), 'data', 'legacy')
-    versions = os.listdir(legacy_data_directory)
-    urls = ['file://' + os.path.join(legacy_data_directory, v) for v in versions]
-    return urls
-
-
-@pytest.mark.parametrize('legacy_dataset_url', dataset_urls())
-def test_reading_legacy_dataset(legacy_dataset_url):
-    """The test runs for a single legacy dataset. Opens the dataset using `make_reader` and reads all records from it"""
-    with make_reader(legacy_dataset_url, workers_count=1) as reader:
-        all_data = list(reader)
-
-        # Some basic check on the data
-        assert len(all_data) == 100
-        assert len(all_data[0]._fields) > 5
-        assert all_data[0].matrix.shape == (32, 16, 3)

From 513533b77cb4e5532804e2d02a201823f0b6151d Mon Sep 17 00:00:00 2001
From: Yevgeni Litvin <yevgeni@aurora.tech>
Date: Mon, 26 Jul 2021 10:49:00 -0700
Subject: [PATCH 2/2] Resurrecting test_reading_legacy_dataset test

It was removed by mistake. The test actually tests an ability to read a
dataset created with previous version of petastorm.
---
 petastorm/etl/safe_pickle.py                  |  2 +
 .../tests/test_reading_legacy_datasets.py     | 38 +++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 petastorm/tests/test_reading_legacy_datasets.py

diff --git a/petastorm/etl/safe_pickle.py b/petastorm/etl/safe_pickle.py
index 889006422..835ff6318 100644
--- a/petastorm/etl/safe_pickle.py
+++ b/petastorm/etl/safe_pickle.py
@@ -23,6 +23,8 @@
     "pyspark",
     "decimal",
     "builtins",
+    "copy_reg",
+    "__builtin__",
 }
 
 
diff --git a/petastorm/tests/test_reading_legacy_datasets.py b/petastorm/tests/test_reading_legacy_datasets.py
new file mode 100644
index 000000000..e62431e27
--- /dev/null
+++ b/petastorm/tests/test_reading_legacy_datasets.py
@@ -0,0 +1,38 @@
+#  Copyright (c) 2017-2018 Uber Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import pytest
+
+from petastorm import make_reader
+
+
+def dataset_urls():
+    """Returns a list of legacy datasets available for testing"""
+    legacy_data_directory = os.path.join(os.path.dirname(__file__), 'data', 'legacy')
+    versions = os.listdir(legacy_data_directory)
+    urls = ['file://' + os.path.join(legacy_data_directory, v) for v in versions]
+    return urls
+
+
+@pytest.mark.parametrize('legacy_dataset_url', dataset_urls())
+def test_reading_legacy_dataset(legacy_dataset_url):
+    """The test runs for a single legacy dataset. Opens the dataset using `make_reader` and reads all records from it"""
+    with make_reader(legacy_dataset_url, workers_count=1) as reader:
+        all_data = list(reader)
+
+        # Some basic check on the data
+        assert len(all_data) == 100
+        assert len(all_data[0]._fields) > 5
+        assert all_data[0].matrix.shape == (32, 16, 3)