From d6b1ec129489c00681de65145d15991c1818e568 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:21:04 -0500 Subject: [PATCH 01/28] refactor: move from deprecated pkg_resources --- dataprofiler/labelers/base_data_labeler.py | 3 +++ dataprofiler/labelers/data_labelers.py | 2 ++ dataprofiler/labelers/data_processing.py | 2 ++ dataprofiler/tests/labelers/test_char_tf_load_model.py | 2 ++ dataprofiler/tests/labelers/test_character_level_cnn_model.py | 2 ++ dataprofiler/tests/labelers/test_column_name_model.py | 2 ++ dataprofiler/tests/labelers/test_data_processing.py | 2 ++ .../labelers/test_integration_column_name_data_labeler.py | 2 ++ .../tests/labelers/test_integration_regex_data_labeler.py | 2 ++ dataprofiler/tests/labelers/test_regex_model.py | 2 ++ 10 files changed, 21 insertions(+) diff --git a/dataprofiler/labelers/base_data_labeler.py b/dataprofiler/labelers/base_data_labeler.py index f9a4a0ab..43585e26 100644 --- a/dataprofiler/labelers/base_data_labeler.py +++ b/dataprofiler/labelers/base_data_labeler.py @@ -2,10 +2,12 @@ from __future__ import annotations +import importlib.resources import json import os import sys import warnings +from pathlib import Path from typing import cast import numpy as np @@ -17,6 +19,7 @@ from . import data_processing, utils from .base_model import BaseModel + default_labeler_dir = utils.find_resources_dir("labelers") diff --git a/dataprofiler/labelers/data_labelers.py b/dataprofiler/labelers/data_labelers.py index 961b45e6..5dda6e62 100644 --- a/dataprofiler/labelers/data_labelers.py +++ b/dataprofiler/labelers/data_labelers.py @@ -2,7 +2,9 @@ from __future__ import annotations +import importlib.resources import os +from pathlib import Path import pandas as pd diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index ba17a3bd..107a7872 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -4,6 +4,7 @@ import abc import copy +import importlib import inspect import json import math @@ -12,6 +13,7 @@ import types import warnings from collections import Counter +from pathlib import Path from typing import Any, Generator, Iterable, TypeVar, cast import numpy as np diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py index 40879e57..ec3c3058 100644 --- a/dataprofiler/tests/labelers/test_char_tf_load_model.py +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -1,7 +1,9 @@ +import importlib.resources import json import os import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index cbc35b13..32f6ac05 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -1,7 +1,9 @@ +import importlib import json import os import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np diff --git a/dataprofiler/tests/labelers/test_column_name_model.py b/dataprofiler/tests/labelers/test_column_name_model.py index dfd4274e..97c81c95 100644 --- a/dataprofiler/tests/labelers/test_column_name_model.py +++ b/dataprofiler/tests/labelers/test_column_name_model.py @@ -1,8 +1,10 @@ +import importlib import json import os import sys import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np diff --git a/dataprofiler/tests/labelers/test_data_processing.py b/dataprofiler/tests/labelers/test_data_processing.py index 7624ccca..7db9d900 100644 --- a/dataprofiler/tests/labelers/test_data_processing.py +++ b/dataprofiler/tests/labelers/test_data_processing.py @@ -1,9 +1,11 @@ +import importlib import json import os import random import re import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np diff --git a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py index 8b19731f..05524e18 100644 --- a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py @@ -1,4 +1,6 @@ +import importlib import unittest +from pathlib import Path import numpy as np diff --git a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py index df72b99e..37aaae33 100644 --- a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py @@ -1,5 +1,7 @@ +import importlib import os import unittest +from pathlib import Path import numpy as np diff --git a/dataprofiler/tests/labelers/test_regex_model.py b/dataprofiler/tests/labelers/test_regex_model.py index 66ac6448..7ff7481d 100644 --- a/dataprofiler/tests/labelers/test_regex_model.py +++ b/dataprofiler/tests/labelers/test_regex_model.py @@ -1,7 +1,9 @@ +import importlib import json import os import unittest from io import StringIO +from pathlib import Path from unittest import mock import numpy as np From 9920fabc14fe184d2d9350a1ad7af7031365c38b Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:22:37 -0500 Subject: [PATCH 02/28] fix: to use func --- dataprofiler/labelers/base_data_labeler.py | 2 -- dataprofiler/labelers/data_labelers.py | 2 -- dataprofiler/labelers/utils.py | 1 + dataprofiler/tests/labelers/test_char_tf_load_model.py | 2 -- .../tests/labelers/test_character_level_cnn_model.py | 2 -- dataprofiler/tests/labelers/test_column_name_model.py | 2 -- dataprofiler/tests/labelers/test_data_processing.py | 5 +++-- .../labelers/test_integration_column_name_data_labeler.py | 2 -- .../tests/labelers/test_integration_regex_data_labeler.py | 2 -- dataprofiler/tests/labelers/test_regex_model.py | 2 -- 10 files changed, 4 insertions(+), 18 deletions(-) diff --git a/dataprofiler/labelers/base_data_labeler.py b/dataprofiler/labelers/base_data_labeler.py index 43585e26..b2038b43 100644 --- a/dataprofiler/labelers/base_data_labeler.py +++ b/dataprofiler/labelers/base_data_labeler.py @@ -2,12 +2,10 @@ from __future__ import annotations -import importlib.resources import json import os import sys import warnings -from pathlib import Path from typing import cast import numpy as np diff --git a/dataprofiler/labelers/data_labelers.py b/dataprofiler/labelers/data_labelers.py index 5dda6e62..961b45e6 100644 --- a/dataprofiler/labelers/data_labelers.py +++ b/dataprofiler/labelers/data_labelers.py @@ -2,9 +2,7 @@ from __future__ import annotations -import importlib.resources import os -from pathlib import Path import pandas as pd diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index 99553869..d393fa6a 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -2,6 +2,7 @@ import importlib.resources import sys +import sysconfig import warnings from pathlib import Path from typing import Any, Callable, List diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py index ec3c3058..40879e57 100644 --- a/dataprofiler/tests/labelers/test_char_tf_load_model.py +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -1,9 +1,7 @@ -import importlib.resources import json import os import unittest from io import StringIO -from pathlib import Path from unittest import mock import numpy as np diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index 32f6ac05..cbc35b13 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -1,9 +1,7 @@ -import importlib import json import os import unittest from io import StringIO -from pathlib import Path from unittest import mock import numpy as np diff --git a/dataprofiler/tests/labelers/test_column_name_model.py b/dataprofiler/tests/labelers/test_column_name_model.py index 97c81c95..dfd4274e 100644 --- a/dataprofiler/tests/labelers/test_column_name_model.py +++ b/dataprofiler/tests/labelers/test_column_name_model.py @@ -1,10 +1,8 @@ -import importlib import json import os import sys import unittest from io import StringIO -from pathlib import Path from unittest import mock import numpy as np diff --git a/dataprofiler/tests/labelers/test_data_processing.py b/dataprofiler/tests/labelers/test_data_processing.py index 7db9d900..5ec15594 100644 --- a/dataprofiler/tests/labelers/test_data_processing.py +++ b/dataprofiler/tests/labelers/test_data_processing.py @@ -1,11 +1,12 @@ -import importlib +pass import json import os import random import re import unittest from io import StringIO -from pathlib import Path + +pass from unittest import mock import numpy as np diff --git a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py index 05524e18..8b19731f 100644 --- a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py @@ -1,6 +1,4 @@ -import importlib import unittest -from pathlib import Path import numpy as np diff --git a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py index 37aaae33..df72b99e 100644 --- a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py @@ -1,7 +1,5 @@ -import importlib import os import unittest -from pathlib import Path import numpy as np diff --git a/dataprofiler/tests/labelers/test_regex_model.py b/dataprofiler/tests/labelers/test_regex_model.py index 7ff7481d..66ac6448 100644 --- a/dataprofiler/tests/labelers/test_regex_model.py +++ b/dataprofiler/tests/labelers/test_regex_model.py @@ -1,9 +1,7 @@ -import importlib import json import os import unittest from io import StringIO -from pathlib import Path from unittest import mock import numpy as np From 7592895357dd4bab54dac9917cd4b4c9b809e9c4 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:22:43 -0500 Subject: [PATCH 03/28] fix: add missing change --- dataprofiler/labelers/data_processing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 107a7872..ba17a3bd 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -4,7 +4,6 @@ import abc import copy -import importlib import inspect import json import math @@ -13,7 +12,6 @@ import types import warnings from collections import Counter -from pathlib import Path from typing import Any, Generator, Iterable, TypeVar, cast import numpy as np From a3592fb2b351899aa4a30af9be08775e11088d65 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:23:21 -0500 Subject: [PATCH 04/28] refactor: resources to be in package --- dataprofiler/labelers/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index d393fa6a..99553869 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -2,7 +2,6 @@ import importlib.resources import sys -import sysconfig import warnings from pathlib import Path from typing import Any, Callable, List From 3ecaf6bc1cd0a05145d1e9a4b9a5d3b91ff635a3 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:24:42 -0500 Subject: [PATCH 05/28] fix: tests bc of almost --- dataprofiler/tests/profilers/test_float_column_profile.py | 5 +++++ dataprofiler/tests/profilers/test_int_column_profile.py | 5 +++++ dataprofiler/tests/profilers/test_text_column_profile.py | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index d9ec122c..6061302c 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -1777,6 +1777,11 @@ def test_diff(self): profile_diff.pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Assert type error is properly called diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 960e5318..bb5b136a 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -1097,6 +1097,11 @@ def test_diff(self): profile_diff.pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) # Assert type error is properly called diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 699e35cb..092426bf 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -662,6 +662,11 @@ def test_diff(self): profile_diff.pop("median_absolute_deviation"), places=2, ) + self.assertAlmostEqual( + expected_diff.get("t-test").get("welch").pop("p-value"), + profile_diff.get("t-test").get("welch").pop("p-value"), + places=10, + ) self.assertDictEqual(expected_diff, profile_diff) @mock.patch("time.time", return_value=0.0) From 0a2efd3f791b53eabd3f677bc748b303e69ed4f4 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:24:52 -0500 Subject: [PATCH 06/28] feat: refactor to pass in a path or string or None --- dataprofiler/labelers/base_data_labeler.py | 1 - dataprofiler/labelers/utils.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/labelers/base_data_labeler.py b/dataprofiler/labelers/base_data_labeler.py index b2038b43..f9a4a0ab 100644 --- a/dataprofiler/labelers/base_data_labeler.py +++ b/dataprofiler/labelers/base_data_labeler.py @@ -17,7 +17,6 @@ from . import data_processing, utils from .base_model import BaseModel - default_labeler_dir = utils.find_resources_dir("labelers") diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index 99553869..e10d138f 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -3,6 +3,7 @@ import importlib.resources import sys import warnings +from importlib.resources.abc import Traversable from pathlib import Path from typing import Any, Callable, List From fb321a26ece1425acc81da9f678af4e1bce3ee0b Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:24:52 -0500 Subject: [PATCH 07/28] fix: import for older versions --- dataprofiler/labelers/utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index e10d138f..02a9744f 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -3,9 +3,16 @@ import importlib.resources import sys import warnings -from importlib.resources.abc import Traversable from pathlib import Path -from typing import Any, Callable, List +from typing import TYPE_CHECKING, Any, Callable, List + +if TYPE_CHECKING: + try: + # Newer Pythons / newer typeshed + from importlib.resources.abc import Traversable + except ModuleNotFoundError: + # Older Pythons + from importlib.abc import Traversable try: # Newer Pythons / newer typeshed From 2207920a5827d214d2251e7653796a508714a0a0 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:24:52 -0500 Subject: [PATCH 08/28] fix: Tranversable must be done at runtime --- dataprofiler/labelers/utils.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index 02a9744f..9f39d21b 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -4,15 +4,14 @@ import sys import warnings from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, List - -if TYPE_CHECKING: - try: - # Newer Pythons / newer typeshed - from importlib.resources.abc import Traversable - except ModuleNotFoundError: - # Older Pythons - from importlib.abc import Traversable +from typing import Any, Callable, List + +try: + # Newer Pythons / newer typeshed + from importlib.resources.abc import Traversable +except ModuleNotFoundError: + # Older Pythons + from importlib.abc import Traversable try: # Newer Pythons / newer typeshed From 96344db2be1b302fccbbc5077b30370df4b60d93 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:24:52 -0500 Subject: [PATCH 09/28] refactor: keras reqs and others --- requirements-dev.txt | 2 +- requirements-ml.txt | 7 +++---- requirements-test.txt | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 8c7c7868..163dae50 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ check-manifest>=0.50 black>=24.3.0 isort==5.12.0 -pre-commit==2.19.0 +pre-commit==4.3.0 tox==3.25.1 tox-conda==0.10.2 types-setuptools==67.7.0.1 diff --git a/requirements-ml.txt b/requirements-ml.txt index 31f9ca63..4abb91c4 100644 --- a/requirements-ml.txt +++ b/requirements-ml.txt @@ -1,7 +1,6 @@ scikit-learn>=0.23.2 -keras<=3.4.0 +keras<=3.11.0 rapidfuzz>=2.6.1 -tensorflow>=2.16.0; sys.platform != 'darwin' -tensorflow>=2.16.0; sys_platform == 'darwin' and platform_machine != 'arm64' -tensorflow-macos>=2.16.0; sys_platform == 'darwin' and platform_machine == 'arm64' +tensorflow>=2.16.0 +tensorflow-metal; sys_platform == 'darwin' and platform_machine == 'arm64' tqdm>=4.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 725b2384..cf127b60 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,4 +6,4 @@ pytest-cov>=2.8.1 pytest-xdist>=2.1.0 pytest-forked>=1.3.0 toolz>=0.10.0 -memray>=1.7.0,<1.12.0 +memray>=1.18.0 From 0458e73f65b3665bec3efbf6889d83cc5605a333 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:24:52 -0500 Subject: [PATCH 10/28] refactor: losses for keras and tests --- .pre-commit-config.yaml | 12 ++-- dataprofiler/labelers/base_model.py | 2 +- dataprofiler/labelers/char_load_tf_model.py | 5 +- .../labelers/character_level_cnn_model.py | 66 ++++++++----------- dataprofiler/labelers/data_processing.py | 4 +- .../test_character_level_cnn_model.py | 2 +- .../tests/labelers/test_data_labelers.py | 2 +- requirements-ml.txt | 1 - 8 files changed, 38 insertions(+), 56 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1cd76047..902e256f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: # Flake8: complexity and style checking # https://flake8.pycqa.org/en/latest/user/using-hooks.html - repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 7.3.0 hooks: - id: flake8 additional_dependencies: [flake8-docstrings] @@ -58,7 +58,7 @@ repos: 'chardet>=3.0.4,<7.0.0', fastavro>=1.0.0.post1, python-snappy>=0.7.1, - charset-normalizer>=1.3.6, + 'charset-normalizer>=1.3.6,<7.0.0', psutil>=4.0.0, scipy>=1.4.1, requests>=2.28.1, @@ -82,11 +82,9 @@ repos: # requirements-ml.txt scikit-learn>=0.23.2, - 'keras>=2.4.3,<=3.4.0', + 'keras>=3.11.0', rapidfuzz>=2.6.1, - "tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'", - "tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'", - "tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'", + "tensorflow>=2.16.0", tqdm>=4.0.0, # requirements-reports.txt @@ -101,7 +99,7 @@ repos: pytest-xdist>=2.1.0, pytest-forked>=1.3.0, toolz>=0.10.0, - 'memray>=1.7.0,<1.12.0', + 'memray>=1.18.0', ] # Check-manifest: ensures required non-Python files are included in MANIFEST.in # https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml diff --git a/dataprofiler/labelers/base_model.py b/dataprofiler/labelers/base_model.py index 032c2ea3..08b453ec 100644 --- a/dataprofiler/labelers/base_model.py +++ b/dataprofiler/labelers/base_model.py @@ -78,7 +78,7 @@ def __eq__(self, other: object) -> bool: :rtype: bool """ if ( - type(self) != type(other) + type(self) is not type(other) or not isinstance(other, BaseModel) or self._parameters != other._parameters or self._label_mapping != other._label_mapping diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index a4a44e03..ecd04f6e 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -262,8 +262,7 @@ def _construct_model(self) -> None: # Compile the model w/ metrics softmax_output_layer_name = self._model.output_names[0] - losses = {softmax_output_layer_name: "categorical_crossentropy"} - + losses = ["categorical_crossentropy", None, None] # use f1 score metric f1_score_training = labeler_utils.F1Score( num_classes=num_labels, average="micro" @@ -316,7 +315,7 @@ def _reconstruct_model(self) -> None: # Compile the model softmax_output_layer_name = self._model.output_names[0] - losses = {softmax_output_layer_name: "categorical_crossentropy"} + losses = ["categorical_crossentropy", None, None] # use f1 score metric f1_score_training = labeler_utils.F1Score( diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 2cbb7051..f732c8e0 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -450,6 +450,7 @@ def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel: loaded_model._model_default_ind = loaded_model.label_mapping[ loaded_model._parameters["default_label"] ] + loaded_model._compile_loss(loaded_model._model, loaded_model.num_labels) return loaded_model @staticmethod @@ -475,6 +476,28 @@ def _argmax_threshold_layer( # matrix. return ThreshArgMaxLayer(threshold, num_labels, default_ind) + @staticmethod + def _compile_loss(model: tf.keras.Model, num_labels: int) -> None: + """Compiles the loss for the given model and number of labels.""" + # Compile the model + softmax_output_layer_name = model.output_names[0] + # losses = {softmax_output_layer_name: "categorical_crossentropy"} + losses = ["categorical_crossentropy", None, None] + + # use f1 score metric + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average="micro" + ) + metrics = { + softmax_output_layer_name: [ + "categorical_crossentropy", + "acc", + f1_score_training, + ] + } + + model.compile(loss=losses, optimizer="adam", metrics=metrics) + def _construct_model(self) -> None: """ Construct model for the data labeler. @@ -570,24 +593,7 @@ def _construct_model(self) -> None: final_predicted_layer(argmax_layer, self._model.outputs[0]), ] self._model = tf.keras.Model(self._model.inputs, argmax_outputs) - - # Compile the model - softmax_output_layer_name = self._model.output_names[0] - losses = {softmax_output_layer_name: "categorical_crossentropy"} - - # use f1 score metric - f1_score_training = labeler_utils.F1Score( - num_classes=num_labels, average="micro" - ) - metrics = { - softmax_output_layer_name: [ - "categorical_crossentropy", - "acc", - f1_score_training, - ] - } - - self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + self._compile_loss(self._model, num_labels) self._epoch_id = 0 self._model_num_labels = num_labels @@ -632,24 +638,7 @@ def _reconstruct_model(self) -> None: final_predicted_layer(argmax_layer, final_softmax_layer), ] self._model = tf.keras.Model(self._model.inputs, argmax_outputs) - - # Compile the model - softmax_output_layer_name = self._model.output_names[0] - losses = {softmax_output_layer_name: "categorical_crossentropy"} - - # use f1 score metric - f1_score_training = labeler_utils.F1Score( - num_classes=num_labels, average="micro" - ) - metrics = { - softmax_output_layer_name: [ - "categorical_crossentropy", - "acc", - f1_score_training, - ] - } - - self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + self._compile_loss(self._model, num_labels) self._epoch_id = 0 self._model_num_labels = num_labels self._model_default_ind = default_ind @@ -699,14 +688,11 @@ def fit( f1_report: dict = {} self._model.reset_metrics() - softmax_output_layer_name = self._model.output_names[0] start_time = time.time() batch_id = 0 for x_train, y_train in train_data: - model_results = self._model.train_on_batch( - x_train, {softmax_output_layer_name: y_train} - ) + model_results = self._model.train_on_batch(x_train, y_train) sys.stdout.flush() if verbose: sys.stdout.write( diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index ba17a3bd..2ee1cb81 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -73,7 +73,7 @@ def __eq__(self, other: object) -> bool: :rtype: bool """ if ( - type(self) != type(other) + type(self) is not type(other) or not isinstance(other, BaseDataProcessor) or self._parameters != other._parameters ): @@ -1589,7 +1589,7 @@ def __eq__(self, other: object) -> bool: :rtype: bool """ if ( - type(self) != type(other) + type(self) is not type(other) or not isinstance(other, StructCharPostprocessor) or self._parameters["default_label"] != other._parameters["default_label"] or self._parameters["pad_label"] != other._parameters["pad_label"] diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index cbc35b13..79d1b3f7 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -430,7 +430,7 @@ def test_save(self, mock_open, *mocks): StringIO.close(mock_file) @mock.patch("tensorflow.keras.Model.save", return_value=None) - @mock.patch("tensorflow.keras.models.load_model", return_value=mock.Mock()) + @mock.patch("tensorflow.keras.models.load_model", return_value=mock.MagicMock()) @mock.patch("builtins.open", side_effect=mock_open) def test_load(self, *mocks): dir = os.path.join(_resource_labeler_dir, "unstructured_model/") diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py index b0cd4c7e..e7ef0383 100644 --- a/dataprofiler/tests/labelers/test_data_labelers.py +++ b/dataprofiler/tests/labelers/test_data_labelers.py @@ -399,7 +399,7 @@ def test_has_public_functions(self, *args): @staticmethod def _setup_mock_load_model(mock_load_model): - mock_load_model.return_value = mock.Mock() + mock_load_model.return_value = mock.MagicMock() def test_load_labeler(self, mock_open, mock_load_model): diff --git a/requirements-ml.txt b/requirements-ml.txt index 4abb91c4..c403d5b1 100644 --- a/requirements-ml.txt +++ b/requirements-ml.txt @@ -2,5 +2,4 @@ scikit-learn>=0.23.2 keras<=3.11.0 rapidfuzz>=2.6.1 tensorflow>=2.16.0 -tensorflow-metal; sys_platform == 'darwin' and platform_machine == 'arm64' tqdm>=4.0.0 From 2fe4dddedaca89ed376c262513af1e7892611291 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:24:52 -0500 Subject: [PATCH 11/28] fix: remove unneeded global --- dataprofiler/plugins/decorators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/plugins/decorators.py b/dataprofiler/plugins/decorators.py index c781f430..f099c1aa 100644 --- a/dataprofiler/plugins/decorators.py +++ b/dataprofiler/plugins/decorators.py @@ -1,4 +1,5 @@ """Contains function for generating plugins data.""" + from collections import defaultdict from typing import Any, DefaultDict, Dict @@ -21,7 +22,6 @@ def __inner_factory_function(fn): :param fn: Plugin function :return: function """ - global plugins_dict plugins_dict[typ][name] = fn return fn From c41303e98dc92fc15e43d27917bf5608b904cd87 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:29:50 -0500 Subject: [PATCH 12/28] fix: accidentally duplicated test on rebase --- dataprofiler/tests/profilers/test_float_column_profile.py | 5 ----- dataprofiler/tests/profilers/test_int_column_profile.py | 5 ----- dataprofiler/tests/profilers/test_text_column_profile.py | 5 ----- 3 files changed, 15 deletions(-) diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index 6061302c..d9ec122c 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -1777,11 +1777,6 @@ def test_diff(self): profile_diff.pop("median_absolute_deviation"), places=2, ) - self.assertAlmostEqual( - expected_diff.get("t-test").get("welch").pop("p-value"), - profile_diff.get("t-test").get("welch").pop("p-value"), - places=10, - ) self.assertDictEqual(expected_diff, profile_diff) # Assert type error is properly called diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index bb5b136a..960e5318 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -1097,11 +1097,6 @@ def test_diff(self): profile_diff.pop("median_absolute_deviation"), places=2, ) - self.assertAlmostEqual( - expected_diff.get("t-test").get("welch").pop("p-value"), - profile_diff.get("t-test").get("welch").pop("p-value"), - places=10, - ) self.assertDictEqual(expected_diff, profile_diff) # Assert type error is properly called diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 092426bf..699e35cb 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -662,11 +662,6 @@ def test_diff(self): profile_diff.pop("median_absolute_deviation"), places=2, ) - self.assertAlmostEqual( - expected_diff.get("t-test").get("welch").pop("p-value"), - profile_diff.get("t-test").get("welch").pop("p-value"), - places=10, - ) self.assertDictEqual(expected_diff, profile_diff) @mock.patch("time.time", return_value=0.0) From 06152683b4f14d2f851af0ff9a184321c59865cf Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:36:50 -0500 Subject: [PATCH 13/28] fix: rebase duplicates --- dataprofiler/labelers/utils.py | 7 ------- dataprofiler/tests/labelers/test_data_processing.py | 3 --- 2 files changed, 10 deletions(-) diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py index 9f39d21b..99553869 100644 --- a/dataprofiler/labelers/utils.py +++ b/dataprofiler/labelers/utils.py @@ -13,13 +13,6 @@ # Older Pythons from importlib.abc import Traversable -try: - # Newer Pythons / newer typeshed - from importlib.resources.abc import Traversable -except ModuleNotFoundError: - # Older Pythons - from importlib.abc import Traversable - def warn_missing_module(labeler_function: str, module_name: str) -> None: """ diff --git a/dataprofiler/tests/labelers/test_data_processing.py b/dataprofiler/tests/labelers/test_data_processing.py index 5ec15594..7624ccca 100644 --- a/dataprofiler/tests/labelers/test_data_processing.py +++ b/dataprofiler/tests/labelers/test_data_processing.py @@ -1,12 +1,9 @@ -pass import json import os import random import re import unittest from io import StringIO - -pass from unittest import mock import numpy as np From f08af16c9c76d566dee45c4759b8a2cc38133aa3 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 14:46:00 -0500 Subject: [PATCH 14/28] fix: keras reqs --- requirements-ml.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-ml.txt b/requirements-ml.txt index c403d5b1..b3005a5c 100644 --- a/requirements-ml.txt +++ b/requirements-ml.txt @@ -1,5 +1,5 @@ scikit-learn>=0.23.2 -keras<=3.11.0 +keras>=3.11.0 rapidfuzz>=2.6.1 tensorflow>=2.16.0 tqdm>=4.0.0 From e5f404141570b339f3caa84079f289cc251bb593 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 6 May 2026 15:01:01 -0500 Subject: [PATCH 15/28] refactor: update to be more than 3.4.0 for keras --- .pre-commit-config.yaml | 2 +- requirements-ml.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 902e256f..4dcd82ce 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -82,7 +82,7 @@ repos: # requirements-ml.txt scikit-learn>=0.23.2, - 'keras>=3.11.0', + 'keras>3.4.0', rapidfuzz>=2.6.1, "tensorflow>=2.16.0", tqdm>=4.0.0, diff --git a/requirements-ml.txt b/requirements-ml.txt index b3005a5c..0c02d6bc 100644 --- a/requirements-ml.txt +++ b/requirements-ml.txt @@ -1,5 +1,5 @@ scikit-learn>=0.23.2 -keras>=3.11.0 +keras>3.4.0 rapidfuzz>=2.6.1 tensorflow>=2.16.0 tqdm>=4.0.0 From 052d0581a8c739cf194c75e86786d6ab5f616ee2 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 11 May 2026 13:07:06 -0500 Subject: [PATCH 16/28] refactor: numpy2 and mypy --- .pre-commit-config.yaml | 4 ++-- dataprofiler/data_readers/data_utils.py | 2 +- dataprofiler/labelers/char_load_tf_model.py | 6 +++--- .../labelers/character_level_cnn_model.py | 6 +++--- .../labelers/classification_report_utils.py | 12 ++++++----- dataprofiler/labelers/data_processing.py | 16 +++++++------- dataprofiler/labelers/labeler_utils.py | 15 +++++++------ .../profilers/data_labeler_column_profile.py | 2 +- dataprofiler/profilers/histogram_utils.py | 21 ++++++++++++++----- .../profilers/numerical_column_stats.py | 17 ++++++++------- dataprofiler/profilers/profiler_utils.py | 2 +- requirements.txt | 2 +- 12 files changed, 61 insertions(+), 44 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4dcd82ce..7fd5ca82 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,7 +50,7 @@ repos: # requirements.txt h5py>=2.10.0, wheel>=0.33.1, - numpy<2.0.0, + numpy>=1.0.0, 'pandas>=1.1.2,<3.0.0', python-dateutil>=2.7.5, pytz>=2020.1, @@ -107,7 +107,7 @@ repos: rev: "0.48" hooks: - id: check-manifest - additional_dependencies: ['h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas', + additional_dependencies: ['h5py', 'wheel', 'future', 'numpy>=1.0.0', 'pandas', 'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro', 'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests', 'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3'] diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 611d25dc..6e213810 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -334,7 +334,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list: except StopIteration: break # Append new, replace old with dummy, and keep track of order - remove_index = rng.integers(0, sample_nrows) + remove_index = int(rng.integers(0, sample_nrows)) values[indices[remove_index]] = str(None) indices[remove_index] = len(values) values.append(newval) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index ecd04f6e..be60f358 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -403,11 +403,11 @@ def fit( f1, f1_report = self._validate_training(val_data) # type: ignore history["f1_report"] = f1_report - val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.NAN + val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.nan val_precision = ( - f1_report["weighted avg"]["precision"] if f1_report else np.NAN + f1_report["weighted avg"]["precision"] if f1_report else np.nan ) - val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.NAN + val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.nan epoch_time = time.time() - start_time logger.info( "\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- " diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index f732c8e0..3fe135ac 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -708,11 +708,11 @@ def fit( f1, f1_report = self._validate_training(val_data) # type: ignore history["f1_report"] = f1_report - val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.NAN + val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.nan val_precision = ( - f1_report["weighted avg"]["precision"] if f1_report else np.NAN + f1_report["weighted avg"]["precision"] if f1_report else np.nan ) - val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.NAN + val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.nan epoch_time = time.time() - start_time logger.info( "\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- " diff --git a/dataprofiler/labelers/classification_report_utils.py b/dataprofiler/labelers/classification_report_utils.py index 28e742e3..3146e829 100644 --- a/dataprofiler/labelers/classification_report_utils.py +++ b/dataprofiler/labelers/classification_report_utils.py @@ -31,8 +31,8 @@ def convert_confusion_matrix_to_MCM(conf_matrix: list | np.ndarray) -> np.ndarra """ if not isinstance(conf_matrix, np.ndarray): conf_matrix = np.array(conf_matrix) - num_labels = conf_matrix.shape[0] - num_samples = np.sum(conf_matrix) + num_labels = len(conf_matrix) + num_samples: int = int(np.sum(conf_matrix)) MCM = np.zeros((num_labels, 2, 2), dtype=np.int64) # True Positives @@ -205,6 +205,8 @@ def precision_recall_fscore_support( f_score = (1 + beta2) * precision * recall / denom # Average the results + weights: np.ndarray | None + support: np.ndarray | None = true_sum if average == "weighted": weights = true_sum if weights.sum() == 0: @@ -219,9 +221,9 @@ def precision_recall_fscore_support( precision = np.average(precision, weights=weights) recall = np.average(recall, weights=weights) f_score = np.average(f_score, weights=weights) - true_sum = None # return no support + support = None # return no support - return precision, recall, f_score, true_sum + return precision, recall, f_score, support def classification_report( @@ -300,7 +302,7 @@ def classification_report( """ # ALTERATION: replaced the _check_targets with this if statement since # no y_true, y_pred - y_type = "multiclass" if conf_matrix.shape[0] > 2 else "binary" + y_type = "multiclass" if len(conf_matrix) > 2 else "binary" labels_given = True if labels is None: diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 2ee1cb81..fe67c69a 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -692,7 +692,7 @@ def process( :return batch_data: A dict containing samples of size batch_size :rtype batch_data: dicts """ - num_dim = sum([dim > 1 for dim in data.shape]) + num_dim = sum(dim > 1 for dim in np.shape(data)) if num_dim > 1: raise ValueError( "Multidimensional data given to " @@ -1213,8 +1213,8 @@ def match_sentence_lengths( :type inplace: bool :return: dict(pred=...) or dict(pred=..., conf=...) """ - pred_buffer = [] - conf_buffer = [] + pred_buffer: np.ndarray = np.array([]) + conf_buffer: np.ndarray = np.array([]) result_ind = 0 buffer_add_inds = np.cumsum(list(map(len, results["pred"]))).tolist() separator_len = len(flatten_separator) @@ -1469,14 +1469,14 @@ def process( "If `labels` are specified, `label_mapping` " "must also be specified." ) - if data.shape != labels.shape: + if np.shape(data) != np.shape(labels): raise ValueError( f"Data and labels given to " f"StructCharPreprocessor are of different " - f"shapes, {data.shape} != {labels.shape}" + f"shapes, {np.shape(data)} != {np.shape(labels)}" ) - num_dim = sum([dim > 1 for dim in data.shape]) + num_dim = sum(dim > 1 for dim in np.shape(data)) if num_dim > 1: warnings.warn( "Data given to StructCharPreprocessor was " @@ -1681,8 +1681,8 @@ def match_sentence_lengths( :type inplace: bool :return: dict(pred=...) or dict(pred=..., conf=...) """ - pred_buffer = [] - conf_buffer = [] + pred_buffer: np.ndarray = np.array([]) + conf_buffer: np.ndarray = np.array([]) result_ind = 0 buffer_add_inds = np.cumsum(list(map(len, results["pred"]))).tolist() separator_len = len(flatten_separator) diff --git a/dataprofiler/labelers/labeler_utils.py b/dataprofiler/labelers/labeler_utils.py index 3a24886f..efebaec4 100644 --- a/dataprofiler/labelers/labeler_utils.py +++ b/dataprofiler/labelers/labeler_utils.py @@ -78,8 +78,8 @@ class 1 1.00 0.67 0.80 3 def evaluate_accuracy( - predicted_entities_in_index: list[list[int]], - true_entities_in_index: list[list[int]], + predicted_entities_in_index: list[list[int]] | np.ndarray, + true_entities_in_index: list[list[int]] | np.ndarray, num_labels: int, entity_rev_dict: dict[int, str], verbose: bool = True, @@ -119,13 +119,16 @@ def evaluate_accuracy( if x[1] not in omitted_labels ] - max_len = len(predicted_entities_in_index[0]) - true_labels_padded = np.zeros((len(true_entities_in_index), max_len)) - for i, true_labels_row in enumerate(true_entities_in_index): + predicted_entities = [np.asarray(row) for row in predicted_entities_in_index] + true_entities = [np.asarray(row) for row in true_entities_in_index] + + max_len = len(predicted_entities[0]) + true_labels_padded = np.zeros((len(true_entities), max_len)) + for i, true_labels_row in enumerate(true_entities): true_labels_padded[i][: len(true_labels_row)] = true_labels_row true_labels_flatten = np.hstack(true_labels_padded) # type: ignore - predicted_labels_flatten = np.hstack(predicted_entities_in_index) + predicted_labels_flatten = np.hstack(predicted_entities) all_labels: list[str] = [] if entity_rev_dict: diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py index d9bfe1ee..81f9c0ce 100644 --- a/dataprofiler/profilers/data_labeler_column_profile.py +++ b/dataprofiler/profilers/data_labeler_column_profile.py @@ -427,7 +427,7 @@ def _update_predictions( start_index = 0 if self.data_labeler.model.requires_zero_mapping: start_index = 1 - for i in range(rank_predictions.shape[0]): + for i in range(len(rank_predictions)): sorted_rank = rank_predictions[i][-self._top_k_voting :] sorted_rank = sorted_rank[np.argsort(predictions["conf"][i][sorted_rank])] for rank_position, value in enumerate(sorted_rank): diff --git a/dataprofiler/profilers/histogram_utils.py b/dataprofiler/profilers/histogram_utils.py index df230c4c..d1b42297 100644 --- a/dataprofiler/profilers/histogram_utils.py +++ b/dataprofiler/profilers/histogram_utils.py @@ -11,11 +11,22 @@ from typing import List, Optional, Tuple, Union import numpy as np -from numpy.lib.histograms import ( # type: ignore[attr-defined] - _get_outer_edges, - _hist_bin_selectors, - _unsigned_subtract, -) + +try: + # numpy v2+ + from numpy.lib._histograms_impl import ( # type: ignore[attr-defined] + _get_outer_edges, + _hist_bin_selectors, + _unsigned_subtract, + ) + +except ModuleNotFoundError: + # numpy v1+ + from numpy.lib.histograms import ( + _get_outer_edges, + _hist_bin_selectors, + _unsigned_subtract, + ) def _get_maximum_from_profile(profile): diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index fa0666a6..a9e18d29 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -6,7 +6,7 @@ import copy import itertools import warnings -from typing import Any, Callable, Dict, List, TypeVar, cast +from typing import Any, Callable, Dict, List, TypeAlias, TypeVar, cast import numpy as np import numpy.typing as npt @@ -32,6 +32,7 @@ def __init__(self, function: Callable) -> None: NumericStatsMixinT = TypeVar("NumericStatsMixinT", bound="NumericStatsMixin") +Numeric: TypeAlias = int | float | np.float64 | np.int64 class NumericStatsMixin(BaseColumnProfiler[NumericStatsMixinT], metaclass=abc.ABCMeta): @@ -56,10 +57,10 @@ def __init__(self, options: NumericalOptions = None) -> None: "NumericalStatsMixin parameter 'options' must be " "of type NumericalOptions." ) - self.min: int | float | np.float64 | np.int64 | None = None - self.max: int | float | np.float64 | np.int64 | None = None + self.min: Numeric | None = None + self.max: Numeric | None = None self._top_k_modes: int = 5 # By default, return at max 5 modes - self.sum: int | float | np.float64 | np.int64 = np.float64(0) + self.sum: Numeric = np.float64(0) self._biased_variance: float | np.float64 = np.nan self._biased_skewness: float | np.float64 = np.nan self._biased_kurtosis: float | np.float64 = np.nan @@ -298,14 +299,14 @@ def _add_helper( ) if "min" in self.__calculations.keys(): if other1.min is not None and other2.min is not None: - self.min = min(other1.min, other2.min) + self.min = min(other1.min, other2.min) # type: ignore[type-var] elif other2.min is None: self.min = other1.min else: self.min = other2.min if "max" in self.__calculations.keys(): if other1.max is not None and other2.max is not None: - self.max = max(other1.max, other2.max) + self.max = max(other1.max, other2.max) # type: ignore[type-var] elif other2.max is None: self.max = other1.max else: @@ -1403,7 +1404,7 @@ def _assimilate_histogram( dest_hist_entity_count_per_bin: np.ndarray, dest_hist_bin_edges: np.ndarray, dest_hist_num_bin: int, - ) -> tuple[dict[str, np.ndarray[Any, Any]], float]: + ) -> tuple[dict[str, np.ndarray], float]: """ Assimilates a histogram into another histogram using specifications. @@ -1821,7 +1822,7 @@ def _get_variance( # Suppress any numpy warnings as we have a custom warning for invalid # or infinite data already with np.errstate(all="ignore"): - batch_biased_variance = np.var(df_series) # Obtains biased variance + batch_biased_variance = cast(float | np.float64, np.var(df_series)) subset_properties["biased_variance"] = batch_biased_variance sum_value = subset_properties["sum"] batch_count = subset_properties["match_count"] diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py index e38e1b04..100131f7 100644 --- a/dataprofiler/profilers/profiler_utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -602,7 +602,7 @@ def find_diff_of_matrices( mat1 = np.array(matrix1, dtype=np.float64) mat2 = np.array(matrix2, dtype=np.float64) - if mat1.shape == mat2.shape: + if np.shape(mat1) == np.shape(mat2): diff: np.ndarray = mat1 - mat2 if ((diff == 0) | np.isnan(diff)).all(): return "unchanged" diff --git a/requirements.txt b/requirements.txt index 1036c433..355018fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ h5py>=2.10.0 wheel>=0.33.1 -numpy<2.0.0 +numpy>=1.0.0 pandas>=1.1.2,<3.0.0 python-dateutil>=2.7.5 pytz>=2020.1 From 3965667b5e8099adeab8953c79d666aa6212281c Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 11 May 2026 13:29:55 -0500 Subject: [PATCH 17/28] fix: mypy 3.10 --- dataprofiler/labelers/data_processing.py | 4 ++-- dataprofiler/profilers/numerical_column_stats.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index fe67c69a..33d916cc 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -1216,7 +1216,7 @@ def match_sentence_lengths( pred_buffer: np.ndarray = np.array([]) conf_buffer: np.ndarray = np.array([]) result_ind = 0 - buffer_add_inds = np.cumsum(list(map(len, results["pred"]))).tolist() + buffer_add_inds: list[int] = np.cumsum(list(map(len, results["pred"]))).tolist() separator_len = len(flatten_separator) if not inplace: @@ -1684,7 +1684,7 @@ def match_sentence_lengths( pred_buffer: np.ndarray = np.array([]) conf_buffer: np.ndarray = np.array([]) result_ind = 0 - buffer_add_inds = np.cumsum(list(map(len, results["pred"]))).tolist() + buffer_add_inds: list[int] = np.cumsum(list(map(len, results["pred"]))).tolist() separator_len = len(flatten_separator) if not inplace: diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index a9e18d29..1f799961 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -541,7 +541,7 @@ def mean(self) -> float | np.float64: """Return mean value.""" if self.match_count == 0: return 0.0 - return self.sum / self.match_count + return cast(float | np.float64, self.sum / self.match_count) @property def mode(self) -> list[float]: From f1046a93da358e0e45f136c46f292e52f02853cb Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 11 May 2026 13:50:37 -0500 Subject: [PATCH 18/28] fix: bugs --- dataprofiler/labelers/char_load_tf_model.py | 9 +++------ dataprofiler/profilers/numerical_column_stats.py | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index be60f358..eef03c78 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -262,7 +262,7 @@ def _construct_model(self) -> None: # Compile the model w/ metrics softmax_output_layer_name = self._model.output_names[0] - losses = ["categorical_crossentropy", None, None] + losses = ["categorical_crossentropy", None] # use f1 score metric f1_score_training = labeler_utils.F1Score( num_classes=num_labels, average="micro" @@ -315,7 +315,7 @@ def _reconstruct_model(self) -> None: # Compile the model softmax_output_layer_name = self._model.output_names[0] - losses = ["categorical_crossentropy", None, None] + losses = ["categorical_crossentropy", None] # use f1 score metric f1_score_training = labeler_utils.F1Score( @@ -380,14 +380,11 @@ def fit( f1_report: dict = {} self._model.reset_metrics() - softmax_output_layer_name = self._model.output_names[0] start_time = time.time() batch_id = 0 for x_train, y_train in train_data: - model_results = self._model.train_on_batch( - x_train, {softmax_output_layer_name: y_train} - ) + model_results = self._model.train_on_batch(x_train, y_train) sys.stdout.flush() if verbose: sys.stdout.write( diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 1f799961..40272704 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -35,6 +35,16 @@ def __init__(self, function: Callable) -> None: Numeric: TypeAlias = int | float | np.float64 | np.int64 +def _as_float_scalar(value: Numeric | np.ndarray) -> float: + """Convert a scalar-like numeric value to a Python float.""" + array_value = np.asarray(value) + if array_value.ndim == 0: + return float(array_value) + if array_value.size == 1: + return float(array_value.item()) + raise TypeError("Expected a scalar numeric value.") + + class NumericStatsMixin(BaseColumnProfiler[NumericStatsMixinT], metaclass=abc.ABCMeta): """ Abstract numerical column profile subclass of BaseColumnProfiler. @@ -199,7 +209,7 @@ def _add_helper_merge_profile_histograms( # calculate the min of the first edge and the max of the last edge # between two arrays global_min_of_histogram_edges = ( - float(self.min) + _as_float_scalar(self.min) if self.min is not None else min( other1._stored_histogram["histogram"]["bin_edges"][0], @@ -208,7 +218,7 @@ def _add_helper_merge_profile_histograms( ) global_max_of_histogram_edges = ( - float(self.max) + _as_float_scalar(self.max) if self.max is not None else max( other1._stored_histogram["histogram"]["bin_edges"][-1], From 8f1b4e0a923f5ea09a8e618b4ad5b7cc234a1fdb Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 11 May 2026 14:05:09 -0500 Subject: [PATCH 19/28] fix: float --- dataprofiler/profilers/numerical_column_stats.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 40272704..6e61f7a6 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -634,7 +634,12 @@ def _perform_t_test( ) invalid_stats = True if np.isnan( - [float(mean1), float(mean2), float(var1), float(var2)] + [ + _as_float_scalar(mean1), + _as_float_scalar(mean2), + _as_float_scalar(var1), + _as_float_scalar(var2), + ] ).any() or None in [ mean1, mean2, @@ -1836,7 +1841,9 @@ def _get_variance( subset_properties["biased_variance"] = batch_biased_variance sum_value = subset_properties["sum"] batch_count = subset_properties["match_count"] - batch_mean = 0.0 if not batch_count else float(sum_value) / batch_count + batch_mean = ( + 0.0 if not batch_count else _as_float_scalar(sum_value) / batch_count + ) subset_properties["mean"] = batch_mean self._biased_variance = self._merge_biased_variance( self.match_count, From fdc671edbcd96200eb8fe14564db01cf5c9c964f Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 11 May 2026 14:25:29 -0500 Subject: [PATCH 20/28] refactor: for hist fix too --- dataprofiler/profilers/histogram_utils.py | 16 +++++++++--- .../profilers/numerical_column_stats.py | 26 +++++++------------ dataprofiler/profilers/profiler_utils.py | 12 +++++++++ 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/dataprofiler/profilers/histogram_utils.py b/dataprofiler/profilers/histogram_utils.py index d1b42297..30c3ecd4 100644 --- a/dataprofiler/profilers/histogram_utils.py +++ b/dataprofiler/profilers/histogram_utils.py @@ -12,6 +12,8 @@ import numpy as np +from . import profiler_utils + try: # numpy v2+ from numpy.lib._histograms_impl import ( # type: ignore[attr-defined] @@ -90,7 +92,7 @@ def _ptp(maximum: float, minimum: float): :return: the difference between the maximum and minimum """ - return np.subtract(maximum, minimum) + return profiler_utils.as_float_scalar(np.subtract(maximum, minimum)) def _calc_doane_bin_width_from_profile(profile): @@ -191,7 +193,9 @@ def _calc_fd_bin_width_from_profile(profile): :return: An estimate of the optimal bin width for the given data. """ - iqr = np.subtract(profile._get_percentile([75]), profile._get_percentile([25])) + iqr = profiler_utils.as_float_scalar( + np.subtract(profile._get_percentile([75]), profile._get_percentile([25])) + ) dataset_size = _get_dataset_size_from_profile(profile) return 2.0 * iqr * dataset_size ** (-1.0 / 3.0) @@ -300,7 +304,9 @@ def _get_bin_edges( n_equal_bins = 1 else: # Do not call selectors on empty arrays - width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge)) + width = profiler_utils.as_float_scalar( + _hist_bin_selectors[bin_name](a, (first_edge, last_edge)) + ) if width: n_equal_bins = int( np.ceil(_unsigned_subtract(last_edge, first_edge) / width) @@ -351,7 +357,9 @@ def _calculate_bins_from_profile(profile, bin_method): n_equal_bins = 1 else: # Do not call selectors on empty arrays - width = _hist_bin_width_selectors_for_profile[bin_method](profile) + width = profiler_utils.as_float_scalar( + _hist_bin_width_selectors_for_profile[bin_method](profile) + ) if width and not np.isnan(width): n_equal_bins = int(np.ceil(_ptp(maximum, minimum) / width)) else: diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 6e61f7a6..b0197ed4 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -35,16 +35,6 @@ def __init__(self, function: Callable) -> None: Numeric: TypeAlias = int | float | np.float64 | np.int64 -def _as_float_scalar(value: Numeric | np.ndarray) -> float: - """Convert a scalar-like numeric value to a Python float.""" - array_value = np.asarray(value) - if array_value.ndim == 0: - return float(array_value) - if array_value.size == 1: - return float(array_value.item()) - raise TypeError("Expected a scalar numeric value.") - - class NumericStatsMixin(BaseColumnProfiler[NumericStatsMixinT], metaclass=abc.ABCMeta): """ Abstract numerical column profile subclass of BaseColumnProfiler. @@ -209,7 +199,7 @@ def _add_helper_merge_profile_histograms( # calculate the min of the first edge and the max of the last edge # between two arrays global_min_of_histogram_edges = ( - _as_float_scalar(self.min) + profiler_utils.as_float_scalar(self.min) if self.min is not None else min( other1._stored_histogram["histogram"]["bin_edges"][0], @@ -218,7 +208,7 @@ def _add_helper_merge_profile_histograms( ) global_max_of_histogram_edges = ( - _as_float_scalar(self.max) + profiler_utils.as_float_scalar(self.max) if self.max is not None else max( other1._stored_histogram["histogram"]["bin_edges"][-1], @@ -635,10 +625,10 @@ def _perform_t_test( invalid_stats = True if np.isnan( [ - _as_float_scalar(mean1), - _as_float_scalar(mean2), - _as_float_scalar(var1), - _as_float_scalar(var2), + profiler_utils.as_float_scalar(mean1), + profiler_utils.as_float_scalar(mean2), + profiler_utils.as_float_scalar(var1), + profiler_utils.as_float_scalar(var2), ] ).any() or None in [ mean1, @@ -1842,7 +1832,9 @@ def _get_variance( sum_value = subset_properties["sum"] batch_count = subset_properties["match_count"] batch_mean = ( - 0.0 if not batch_count else _as_float_scalar(sum_value) / batch_count + 0.0 + if not batch_count + else profiler_utils.as_float_scalar(sum_value) / batch_count ) subset_properties["mean"] = batch_mean self._biased_variance = self._merge_biased_variance( diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py index 100131f7..2cc9846b 100644 --- a/dataprofiler/profilers/profiler_utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -39,6 +39,18 @@ from .. import rng_utils +def as_float_scalar( + value: int | float | np.integer | np.floating | np.ndarray | list[float], +) -> float: + """Convert a scalar-like value to a Python float.""" + array_value = np.asarray(value) + if array_value.ndim == 0: + return float(array_value) + if array_value.size == 1: + return float(array_value.item()) + raise TypeError("Expected a scalar-like numeric value.") + + def recursive_dict_update(d: dict, update_d: dict) -> dict: """ Recursive updates nested dictionaries. Updating d with update_d. From 34c47fe2ddaa0a2a2df3e056d3aa466903242cd9 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 11 May 2026 15:04:04 -0500 Subject: [PATCH 21/28] fix: issue with none in hist --- dataprofiler/profilers/histogram_utils.py | 12 +++++----- .../profilers/numerical_column_stats.py | 24 +++++++++---------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/dataprofiler/profilers/histogram_utils.py b/dataprofiler/profilers/histogram_utils.py index 30c3ecd4..a5d2defa 100644 --- a/dataprofiler/profilers/histogram_utils.py +++ b/dataprofiler/profilers/histogram_utils.py @@ -304,9 +304,9 @@ def _get_bin_edges( n_equal_bins = 1 else: # Do not call selectors on empty arrays - width = profiler_utils.as_float_scalar( - _hist_bin_selectors[bin_name](a, (first_edge, last_edge)) - ) + width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge)) + if width is not None: + width = profiler_utils.as_float_scalar(width) if width: n_equal_bins = int( np.ceil(_unsigned_subtract(last_edge, first_edge) / width) @@ -357,9 +357,9 @@ def _calculate_bins_from_profile(profile, bin_method): n_equal_bins = 1 else: # Do not call selectors on empty arrays - width = profiler_utils.as_float_scalar( - _hist_bin_width_selectors_for_profile[bin_method](profile) - ) + width = _hist_bin_width_selectors_for_profile[bin_method](profile) + if width is not None: + width = profiler_utils.as_float_scalar(width) if width and not np.isnan(width): n_equal_bins = int(np.ceil(_ptp(maximum, minimum) / width)) else: diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index b0197ed4..5135bcfd 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -623,19 +623,17 @@ def _perform_t_test( RuntimeWarning, ) invalid_stats = True - if np.isnan( - [ - profiler_utils.as_float_scalar(mean1), - profiler_utils.as_float_scalar(mean2), - profiler_utils.as_float_scalar(var1), - profiler_utils.as_float_scalar(var2), - ] - ).any() or None in [ - mean1, - mean2, - var1, - var2, - ]: + if ( + None in [mean1, mean2, var1, var2] + or np.isnan( + [ + profiler_utils.as_float_scalar(mean1), + profiler_utils.as_float_scalar(mean2), + profiler_utils.as_float_scalar(var1), + profiler_utils.as_float_scalar(var2), + ] + ).any() + ): warnings.warn( "Null value(s) found in mean and/or variance values. " "T-test cannot be performed.", From 57066fbad78e4a048c73a1e577db02df5cf0ed0b Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 22 May 2026 11:04:43 -0500 Subject: [PATCH 22/28] fix: remove comment --- dataprofiler/labelers/character_level_cnn_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 3fe135ac..f1e925d5 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -481,7 +481,6 @@ def _compile_loss(model: tf.keras.Model, num_labels: int) -> None: """Compiles the loss for the given model and number of labels.""" # Compile the model softmax_output_layer_name = model.output_names[0] - # losses = {softmax_output_layer_name: "categorical_crossentropy"} losses = ["categorical_crossentropy", None, None] # use f1 score metric From 5de7abea54ae44547864040fb962ddf03076cc77 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 22 May 2026 11:29:50 -0500 Subject: [PATCH 23/28] refactor: to still utilize dict mapping for losses --- dataprofiler/labelers/char_load_tf_model.py | 159 ++++++++++------- .../labelers/character_level_cnn_model.py | 163 +++++++++++++----- dataprofiler/labelers/labeler_utils.py | 58 +++++++ .../test_character_level_cnn_model.py | 9 +- 4 files changed, 283 insertions(+), 106 deletions(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index eef03c78..c8c40d70 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -17,6 +17,7 @@ from .. import dp_logging from . import labeler_utils from .base_model import AutoSubRegistrationMeta, BaseModel, BaseTrainableModel +from .character_level_cnn_model import ArgMaxLayer _file_dir = os.path.dirname(os.path.abspath(__file__)) @@ -29,6 +30,8 @@ class CharLoadTFModel(BaseTrainableModel, metaclass=AutoSubRegistrationMeta): # boolean if the label mapping requires the mapping for index 0 reserved requires_zero_mapping = False + _SOFTMAX_OUTPUT = "softmax_output" + _ARGMAX_OUTPUT = "argmax_output" def __init__( self, model_path: str, label_mapping: dict[str, int], parameters: dict = None @@ -61,6 +64,35 @@ def __init__( BaseModel.__init__(self, label_mapping, parameters) + @classmethod + def _create_model_outputs( + cls, softmax_output: tf.Tensor, argmax_output: tf.Tensor | None = None + ) -> dict[str, tf.Tensor]: + """Return normalized dict outputs for training and inference.""" + if argmax_output is None: + argmax_output = ArgMaxLayer(name=cls._ARGMAX_OUTPUT)(softmax_output) + return { + cls._SOFTMAX_OUTPUT: softmax_output, + cls._ARGMAX_OUTPUT: argmax_output, + } + + @classmethod + def _normalize_model_outputs(cls, model: tf.keras.Model) -> tf.keras.Model: + """Convert list-style outputs to the normalized dict structure.""" + return labeler_utils.normalize_tf_model_outputs( + model, + [cls._SOFTMAX_OUTPUT, cls._ARGMAX_OUTPUT], + lambda softmax_output, extra_outputs: cls._create_model_outputs( + softmax_output, extra_outputs[0] + ), + ) + + def _new_softmax_head_name(self) -> str: + """Return a layer name unique within the current model graph.""" + return labeler_utils.get_tf_rebuild_layer_name( + self._model, f"{self._SOFTMAX_OUTPUT}_rebuild" + ) + def __eq__(self, other: object) -> bool: """ Check if two models are equal with one another. @@ -215,15 +247,34 @@ def load_from_disk(cls, dirpath: str) -> CharLoadTFModel: tf_model = tf.keras.models.load_model(dirpath) loaded_model = cls(dirpath, label_mapping, parameters) - loaded_model._model = tf_model + loaded_model._model = cls._normalize_model_outputs(tf_model) # load self loaded_model._model_num_labels = loaded_model.num_labels loaded_model._model_default_ind = loaded_model.label_mapping[ loaded_model._parameters["default_label"] ] + loaded_model._compile_model(loaded_model.num_labels) return loaded_model + def _compile_model(self, num_labels: int) -> None: + """Compile the model with dict-based losses and metrics.""" + losses = { + self._SOFTMAX_OUTPUT: "categorical_crossentropy", + self._ARGMAX_OUTPUT: None, + } + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average="micro" + ) + metrics = { + self._SOFTMAX_OUTPUT: [ + "categorical_crossentropy", + "acc", + f1_score_training, + ] + } + self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + def _construct_model(self) -> None: """ Model constructor for the data labeler. @@ -237,45 +288,28 @@ def _construct_model(self) -> None: model_loc = self._parameters["model_path"] self._model: tf.keras.Model = tf.keras.models.load_model(model_loc) - self._model = tf.keras.Model(self._model.inputs, self._model.outputs) - softmax_output_layer_name = self._model.output_names[0] + self._model = self._normalize_model_outputs(self._model) + softmax_output = self._model.output[self._SOFTMAX_OUTPUT] + softmax_layer = softmax_output._keras_history[0] + softmax_output_layer_name = softmax_layer.name softmax_layer_ind = cast( int, labeler_utils.get_tf_layer_index_from_name( self._model, softmax_output_layer_name ), ) - softmax_layer = self._model.get_layer(softmax_output_layer_name) - new_softmax_layer = softmax_layer.output + new_softmax_layer = softmax_output if softmax_layer.weights[0].shape[-1] != num_labels: new_softmax_layer = tf.keras.layers.Dense( - num_labels, activation="softmax", name="softmax_output" + num_labels, + activation="softmax", + name=self._new_softmax_head_name(), )(self._model.layers[softmax_layer_ind - 1].output) - # Add argmax layer to get labels directly as an output - argmax_layer = tf.keras.ops.argmax(new_softmax_layer, axis=2) - - argmax_outputs = [new_softmax_layer, argmax_layer] - self._model = tf.keras.Model(self._model.inputs, argmax_outputs) - self._model = tf.keras.Model(self._model.inputs, self._model.outputs) - - # Compile the model w/ metrics - softmax_output_layer_name = self._model.output_names[0] - losses = ["categorical_crossentropy", None] - # use f1 score metric - f1_score_training = labeler_utils.F1Score( - num_classes=num_labels, average="micro" - ) - metrics = { - softmax_output_layer_name: [ - "categorical_crossentropy", - "acc", - f1_score_training, - ] - } - - self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + output_dict = self._create_model_outputs(new_softmax_layer) + self._model = tf.keras.Model(self._model.inputs, output_dict) + self._compile_model(num_labels) self._epoch_id = 0 self._model_num_labels = num_labels @@ -304,32 +338,14 @@ def _reconstruct_model(self) -> None: # Add the final Softmax layer to the previous spot # self._model.layers[-2] to skip: original softmax final_softmax_layer = tf.keras.layers.Dense( - num_labels, activation="softmax", name="softmax_output" + num_labels, + activation="softmax", + name=self._new_softmax_head_name(), )(self._model.layers[-2].output) - # Add argmax layer to get labels directly as an output - argmax_layer = tf.keras.ops.argmax(final_softmax_layer, axis=2) - - argmax_outputs = [final_softmax_layer, argmax_layer] - self._model = tf.keras.Model(self._model.inputs, argmax_outputs) - - # Compile the model - softmax_output_layer_name = self._model.output_names[0] - losses = ["categorical_crossentropy", None] - - # use f1 score metric - f1_score_training = labeler_utils.F1Score( - num_classes=num_labels, average="micro" - ) - metrics = { - softmax_output_layer_name: [ - "categorical_crossentropy", - "acc", - f1_score_training, - ] - } - - self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + output_dict = self._create_model_outputs(final_softmax_layer) + self._model = tf.keras.Model(self._model.inputs, output_dict) + self._compile_model(num_labels) self._epoch_id = 0 self._model_num_labels = num_labels @@ -383,18 +399,37 @@ def fit( start_time = time.time() batch_id = 0 + target_output = self._SOFTMAX_OUTPUT for x_train, y_train in train_data: - model_results = self._model.train_on_batch(x_train, y_train) + model_results = self._model.train_on_batch( + x_train, + {target_output: y_train}, + return_dict=True, + ) + acc_value = next( + (value for key, value in model_results.items() if key.endswith("acc")), + np.nan, + ) + f1_value = next( + (value for key, value in model_results.items() if "f1" in key.lower()), + np.nan, + ) sys.stdout.flush() if verbose: sys.stdout.write( "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " - "f1_score %f" % (self._epoch_id, batch_id, *model_results[1:]) + "f1_score %f" + % ( + self._epoch_id, + batch_id, + model_results.get("loss", np.nan), + acc_value, + f1_value, + ) ) batch_id += 1 - for i, metric_label in enumerate(self._model.metrics_names): - history[metric_label] = model_results[i] + history.update(model_results) if val_data: f1, f1_report = self._validate_training(val_data) # type: ignore @@ -412,7 +447,9 @@ def fit( % ( self._epoch_id, epoch_time, - *model_results[1:], + model_results.get("loss", np.nan), + acc_value, + f1_value, val_f1, val_precision, val_recall, @@ -459,7 +496,7 @@ def _validate_training( y_val_pred.append( self._model.predict( x_val, batch_size=batch_size_test, verbose=verbose_keras - )[1] + )[self._ARGMAX_OUTPUT] ) y_val_test.append(np.argmax(y_val, axis=-1)) batch_id += 1 @@ -532,10 +569,10 @@ def predict( if show_confidences: confidences[ allocation_index : allocation_index + num_samples_in_batch - ] = model_output[0].numpy() + ] = model_output[self._SOFTMAX_OUTPUT].numpy() predictions[ allocation_index : allocation_index + num_samples_in_batch - ] = model_output[1].numpy() + ] = model_output[self._ARGMAX_OUTPUT].numpy() allocation_index += num_samples_in_batch diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index f1e925d5..9ae1eefe 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -120,6 +120,7 @@ def get_config(self): def call(self, argmax_layer: tf.Tensor, confidence_layer: tf.Tensor) -> tf.Tensor: """Apply the threshold argmax to the input tensor.""" + argmax_layer = tf.cast(argmax_layer, tf.int64) threshold_at_argmax = tf.gather(self.thresh_vec, argmax_layer) confidence_max_layer = tf.keras.backend.max(confidence_layer, axis=2) @@ -146,6 +147,19 @@ def call(self, argmax_layer: tf.Tensor, confidence_layer: tf.Tensor) -> tf.Tenso return final_predicted_layer +@tf.keras.utils.register_keras_serializable(package="CharacterLevelCnnModel") +class ArgMaxLayer(tf.keras.layers.Layer): + """Keras layer returning integer argmax indices.""" + + def call(self, confidence_layer: tf.Tensor) -> tf.Tensor: + """Return argmax indices as int64.""" + return tf.cast(tf.keras.ops.argmax(confidence_layer, axis=2), tf.int64) + + def compute_output_shape(self, input_shape): + """Return the confidence tensor shape without the class dimension.""" + return input_shape[:-1] + + @tf.keras.utils.register_keras_serializable(package="CharacterLevelCnnModel") class EncodingLayer(tf.keras.layers.Layer): """Encodes strings to integers.""" @@ -206,6 +220,9 @@ class CharacterLevelCnnModel(BaseTrainableModel, metaclass=AutoSubRegistrationMe # boolean if the label mapping requires the mapping for index 0 reserved requires_zero_mapping: bool = True + _SOFTMAX_OUTPUT = "softmax_output" + _ARGMAX_OUTPUT = "argmax_output" + _THRESH_OUTPUT = "thresh_argmax_output" def __init__(self, label_mapping: dict[str, int], parameters: dict = None) -> None: """ @@ -242,6 +259,54 @@ def __init__(self, label_mapping: dict[str, int], parameters: dict = None) -> No BaseModel.__init__(self, label_mapping, parameters) + @classmethod + def _create_model_outputs( + cls, + softmax_output: tf.Tensor, + default_ind: int, + num_labels: int, + argmax_output: tf.Tensor | None = None, + threshold_output: tf.Tensor | None = None, + ) -> dict[str, tf.Tensor]: + """Return normalized dict outputs for training and inference.""" + if argmax_output is None: + argmax_output = ArgMaxLayer(name=cls._ARGMAX_OUTPUT)(softmax_output) + if threshold_output is None: + threshold_output = ThreshArgMaxLayer( + threshold_=0.0, + num_labels_=num_labels, + default_ind=default_ind, + name=cls._THRESH_OUTPUT, + )(argmax_output, softmax_output) + return { + cls._SOFTMAX_OUTPUT: softmax_output, + cls._ARGMAX_OUTPUT: argmax_output, + cls._THRESH_OUTPUT: threshold_output, + } + + @classmethod + def _normalize_model_outputs( + cls, model: tf.keras.Model, default_ind: int, num_labels: int + ) -> tf.keras.Model: + """Convert list-style outputs to the normalized dict structure.""" + return labeler_utils.normalize_tf_model_outputs( + model, + [cls._SOFTMAX_OUTPUT, cls._ARGMAX_OUTPUT, cls._THRESH_OUTPUT], + lambda softmax_output, extra_outputs: cls._create_model_outputs( + softmax_output, + default_ind, + num_labels, + extra_outputs[0], + extra_outputs[1], + ), + ) + + def _new_softmax_head_name(self) -> str: + """Return a layer name unique within the current model graph.""" + return labeler_utils.get_tf_rebuild_layer_name( + self._model, f"{self._SOFTMAX_OUTPUT}_rebuild" + ) + def __eq__(self, other: object) -> bool: """ Check if two models are equal with one another. @@ -444,6 +509,12 @@ def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel: loaded_model._construct_model() tf1_weights.append(loaded_model._model.weights[-1].value()) loaded_model._model.set_weights(tf1_weights) + else: + loaded_model._model = cls._normalize_model_outputs( + tf_model, + loaded_model.label_mapping[loaded_model._parameters["default_label"]], + loaded_model.num_labels, + ) # load self loaded_model._model_num_labels = loaded_model.num_labels @@ -479,22 +550,21 @@ def _argmax_threshold_layer( @staticmethod def _compile_loss(model: tf.keras.Model, num_labels: int) -> None: """Compiles the loss for the given model and number of labels.""" - # Compile the model - softmax_output_layer_name = model.output_names[0] - losses = ["categorical_crossentropy", None, None] - - # use f1 score metric + losses = { + CharacterLevelCnnModel._SOFTMAX_OUTPUT: "categorical_crossentropy", + CharacterLevelCnnModel._ARGMAX_OUTPUT: None, + CharacterLevelCnnModel._THRESH_OUTPUT: None, + } f1_score_training = labeler_utils.F1Score( num_classes=num_labels, average="micro" ) metrics = { - softmax_output_layer_name: [ + CharacterLevelCnnModel._SOFTMAX_OUTPUT: [ "categorical_crossentropy", "acc", f1_score_training, ] } - model.compile(loss=losses, optimizer="adam", metrics=metrics) def _construct_model(self) -> None: @@ -577,21 +647,18 @@ def _construct_model(self) -> None: self._model.add(tf.keras.layers.Dropout(self._parameters["dropout"])) # Add the final Softmax layer - self._model.add(tf.keras.layers.Dense(num_labels, activation="softmax")) - - # Add argmax layer to get labels directly as an output - argmax_layer = tf.keras.ops.argmax(self._model.outputs[0], axis=2) - - # Create confidence layers - final_predicted_layer = ThreshArgMaxLayer( - threshold_=0.0, num_labels_=num_labels, default_ind=default_ind + self._model.add( + tf.keras.layers.Dense( + num_labels, + activation="softmax", + name=self._SOFTMAX_OUTPUT, + ) ) - argmax_outputs = self._model.outputs + [ - argmax_layer, - final_predicted_layer(argmax_layer, self._model.outputs[0]), - ] - self._model = tf.keras.Model(self._model.inputs, argmax_outputs) + output_dict = self._create_model_outputs( + self._model.outputs[0], default_ind, num_labels + ) + self._model = tf.keras.Model(self._model.inputs, output_dict) self._compile_loss(self._model, num_labels) self._epoch_id = 0 @@ -621,22 +688,15 @@ def _reconstruct_model(self) -> None: # Add the final Softmax layer to the previous spot # self._model.layers[-3] to skip: thresh and original softmax final_softmax_layer = tf.keras.layers.Dense( - num_labels, activation="softmax", name="dense_2" + num_labels, + activation="softmax", + name=self._new_softmax_head_name(), )(self._model.layers[-3].output) - # Add argmax layer to get labels directly as an output - argmax_layer = tf.keras.ops.argmax(final_softmax_layer, axis=2) - - # Create confidence layers - final_predicted_layer = ThreshArgMaxLayer( - threshold_=0.0, num_labels_=num_labels, default_ind=default_ind + output_dict = self._create_model_outputs( + final_softmax_layer, default_ind, num_labels ) - - argmax_outputs = [final_softmax_layer] + [ - argmax_layer, - final_predicted_layer(argmax_layer, final_softmax_layer), - ] - self._model = tf.keras.Model(self._model.inputs, argmax_outputs) + self._model = tf.keras.Model(self._model.inputs, output_dict) self._compile_loss(self._model, num_labels) self._epoch_id = 0 self._model_num_labels = num_labels @@ -690,18 +750,37 @@ def fit( start_time = time.time() batch_id = 0 + target_output = self._SOFTMAX_OUTPUT for x_train, y_train in train_data: - model_results = self._model.train_on_batch(x_train, y_train) + model_results = self._model.train_on_batch( + x_train, + {target_output: y_train}, + return_dict=True, + ) + acc_value = next( + (value for key, value in model_results.items() if key.endswith("acc")), + np.nan, + ) + f1_value = next( + (value for key, value in model_results.items() if "f1" in key.lower()), + np.nan, + ) sys.stdout.flush() if verbose: sys.stdout.write( "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " - "f1_score %f" % (self._epoch_id, batch_id, *model_results[1:]) + "f1_score %f" + % ( + self._epoch_id, + batch_id, + model_results.get("loss", np.nan), + acc_value, + f1_value, + ) ) batch_id += 1 - for i, metric_label in enumerate(self._model.metrics_names): - history[metric_label] = model_results[i] + history.update(model_results) if val_data: f1, f1_report = self._validate_training(val_data) # type: ignore @@ -719,7 +798,9 @@ def fit( % ( self._epoch_id, epoch_time, - *model_results[1:], + model_results.get("loss", np.nan), + acc_value, + f1_value, val_f1, val_precision, val_recall, @@ -768,7 +849,7 @@ def _validate_training( tf.convert_to_tensor(x_val), batch_size=batch_size_test, verbose=verbose_keras, - )[1] + )[self._ARGMAX_OUTPUT] ) y_val_test.append(np.argmax(y_val, axis=-1)) batch_id += 1 @@ -861,10 +942,10 @@ def predict( if show_confidences: confidences[ allocation_index : allocation_index + num_samples_in_batch - ] = model_output[0].numpy() + ] = model_output[self._SOFTMAX_OUTPUT].numpy() predictions[ allocation_index : allocation_index + num_samples_in_batch - ] = model_output[1].numpy() + ] = model_output[self._ARGMAX_OUTPUT].numpy() sentence_lengths[ allocation_index : allocation_index + num_samples_in_batch ] = list(map(lambda x: len(x[0]), batch_data)) diff --git a/dataprofiler/labelers/labeler_utils.py b/dataprofiler/labelers/labeler_utils.py index efebaec4..2a74b87e 100644 --- a/dataprofiler/labelers/labeler_utils.py +++ b/dataprofiler/labelers/labeler_utils.py @@ -231,6 +231,64 @@ def get_tf_layer_index_from_name(model: tf.keras.Model, layer_name: str) -> int return None +def normalize_tf_model_outputs( + model: tf.keras.Model, + output_names: list[str], + create_outputs_fn: Callable[ + [tf.Tensor, list[tf.Tensor | None]], dict[str, tf.Tensor] + ], +) -> tf.keras.Model: + """Convert a model's outputs into a named dict-output structure when possible.""" + try: + model_output = model.output + except (AttributeError, IndexError): + model_output = None + + try: + model_outputs_list = list(model.outputs) + except (AttributeError, IndexError, TypeError): + model_outputs_list = [] + + if isinstance(model_output, dict): + if set(model_output) == set(output_names): + return model + softmax_output = model_output.get( + output_names[0], next(iter(model_output.values())) + ) + extra_outputs = [model_output.get(name) for name in output_names[1:]] + else: + if not model_outputs_list: + try: + last_output = model.layers[-1].output + except (AttributeError, IndexError): + return model + if not hasattr(last_output, "_keras_history"): + return model + model_outputs_list = [last_output] + softmax_output = model_outputs_list[0] + extra_outputs = [ + model_outputs_list[index] if len(model_outputs_list) > index else None + for index in range(1, len(output_names)) + ] + + try: + output_dict = create_outputs_fn(softmax_output, extra_outputs) + return tf.keras.Model(model.inputs, output_dict) + except (AttributeError, TypeError, ValueError): + return model + + +def get_tf_rebuild_layer_name(model: tf.keras.Model, base_name: str) -> str: + """Return a layer name unique within the current model graph.""" + existing_names = {layer.name for layer in getattr(model, "layers", [])} + if base_name not in existing_names: + return base_name + suffix = 1 + while f"{base_name}_{suffix}" in existing_names: + suffix += 1 + return f"{base_name}_{suffix}" + + def hide_tf_logger_warnings() -> None: """Filter out a set of warnings from the tf logger.""" diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index 79d1b3f7..554b61c8 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -446,8 +446,8 @@ def test_model_construct(self): cnn_model.details() expected_layers = [ - "input_1", - "lambda", + "input_layer", + "encoding_layer", "embedding", "conv1d", "dropout", @@ -465,8 +465,9 @@ def test_model_construct(self): "dropout_4", "dense_1", "dropout_5", - "dense_2", - "thresh_arg_max_layer", + "softmax_output", + "argmax_output", + "thresh_argmax_output", ] model_layers = [layer.name for layer in cnn_model._model.layers] self.assertEqual(len(expected_layers), len(model_layers)) From e1afcf7efdcb84c9854c96c891a61cff57a4258f Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 22 May 2026 12:04:15 -0500 Subject: [PATCH 24/28] fix: int pre-commit --- dataprofiler/profilers/int_column_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index 014465c7..4b8ab728 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -163,7 +163,7 @@ def update(self, df_series: pd.Series) -> IntColumn: df_series = df_series.reset_index(drop=True) is_each_row_int = self._is_each_row_int(df_series) sample_size = len(is_each_row_int) - match_int_count = np.sum(is_each_row_int) + match_int_count: int = int(np.sum(is_each_row_int)) profile = dict(match_count=match_int_count, sample_size=sample_size) BaseColumnProfiler._perform_property_calcs( From 0b00aed1cd3f4d30d9e4aa4910b27a4a558cf622 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 22 May 2026 12:17:07 -0500 Subject: [PATCH 25/28] fix: train labeling --- dataprofiler/labelers/char_load_tf_model.py | 7 +++++-- dataprofiler/labelers/character_level_cnn_model.py | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index c8c40d70..eb7c1b53 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -399,11 +399,14 @@ def fit( start_time = time.time() batch_id = 0 - target_output = self._SOFTMAX_OUTPUT for x_train, y_train in train_data: + y_train_dict = { + self._SOFTMAX_OUTPUT: y_train, + self._ARGMAX_OUTPUT: None, + } model_results = self._model.train_on_batch( x_train, - {target_output: y_train}, + y_train_dict, return_dict=True, ) acc_value = next( diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 9ae1eefe..20c4777c 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -750,11 +750,15 @@ def fit( start_time = time.time() batch_id = 0 - target_output = self._SOFTMAX_OUTPUT for x_train, y_train in train_data: + y_train_dict = { + self._SOFTMAX_OUTPUT: y_train, + self._ARGMAX_OUTPUT: None, + self._THRESH_OUTPUT: None, + } model_results = self._model.train_on_batch( x_train, - {target_output: y_train}, + y_train_dict, return_dict=True, ) acc_value = next( From 8edd1dc72746fac275e948bbd0733f9e2b5eaf63 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 22 May 2026 12:48:25 -0500 Subject: [PATCH 26/28] refactor notes, reqs, and change log --- .pre-commit-config.yaml | 36 +++++++++++++---------- CHANGELOG.md | 10 +++++++ MANIFEST.in | 1 + dataprofiler/profilers/histogram_utils.py | 5 ++++ requirements-ml.txt | 2 +- requirements.txt | 2 +- 6 files changed, 38 insertions(+), 18 deletions(-) create mode 100644 CHANGELOG.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7fd5ca82..cd3dc8be 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ repos: # Black: format Python code # https://github.com/psf/black/blob/master/.pre-commit-hooks.yaml - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 24.3.0 hooks: - id: black exclude: (versioneer.py|dataprofiler/_version.py|_docs/) @@ -50,29 +50,31 @@ repos: # requirements.txt h5py>=2.10.0, wheel>=0.33.1, - numpy>=1.0.0, + 'numpy>=1.22.0,<3.0.0', 'pandas>=1.1.2,<3.0.0', python-dateutil>=2.7.5, pytz>=2020.1, pyarrow>=1.0.1, 'chardet>=3.0.4,<7.0.0', - fastavro>=1.0.0.post1, + fastavro>=1.1.0, python-snappy>=0.7.1, - 'charset-normalizer>=1.3.6,<7.0.0', + charset-normalizer>=1.3.6, psutil>=4.0.0, - scipy>=1.4.1, - requests>=2.28.1, + scipy>=1.10.0, + requests>=2.32.4, networkx>=2.5.1, typing-extensions>=3.10.0.2, HLL>=2.0.3, datasketches>=4.1.0, - boto3>=1.28.61, + packaging>=23.0, + boto3>=1.37.15, + urllib3>=2.5.0, # requirements-dev.txt - check-manifest>=0.48, - black==22.3.0, + check-manifest>=0.50, + black>=24.3.0, isort==5.12.0, - pre-commit==2.19.0, + pre-commit==4.3.0, tox==3.25.1, types-setuptools==67.7.0.1, types-python-dateutil==2.8.19.12, @@ -82,7 +84,7 @@ repos: # requirements-ml.txt scikit-learn>=0.23.2, - 'keras>3.4.0', + 'keras>3.4.0,<4.0.0', rapidfuzz>=2.6.1, "tensorflow>=2.16.0", tqdm>=4.0.0, @@ -104,13 +106,15 @@ repos: # Check-manifest: ensures required non-Python files are included in MANIFEST.in # https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml - repo: https://github.com/mgedmin/check-manifest - rev: "0.48" + rev: "0.50" hooks: - id: check-manifest - additional_dependencies: ['h5py', 'wheel', 'future', 'numpy>=1.0.0', 'pandas', - 'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro', - 'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests', - 'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3'] + additional_dependencies: ['h5py', 'wheel', 'future', 'numpy>=1.22.0,<3.0.0', + 'pandas', 'python-dateutil', 'pytz', 'pyarrow', 'chardet', + 'fastavro>=1.1.0', 'python-snappy', 'charset-normalizer', 'psutil', + 'scipy>=1.10.0', 'requests>=2.32.4', 'networkx', 'typing-extensions', + 'HLL', 'datasketches', 'packaging>=23.0', 'boto3>=1.37.15', + 'urllib3>=2.5.0'] # Pyupgrade - standardize and modernize Python syntax for newer versions of the language - repo: https://github.com/asottile/pyupgrade rev: v3.3.0 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..c7abf30e --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# Changelog + +## Unreleased + +- Added compatibility support for NumPy 2.0 while constraining `numpy` to + `>=1.22.0,<3.0.0` to avoid future breakage from NumPy 3. +- Added compatibility support for Keras versions newer than 3.4.0 while + constraining `keras` to `>3.4.0,<4.0.0` to avoid future breakage from Keras 4. +- Updated the pre-commit configuration to align hook versions and hook + dependencies with the current project requirements. diff --git a/MANIFEST.in b/MANIFEST.in index bafea077..f5ba8819 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,6 +2,7 @@ global-exclude .DS_Store global-exclude */__pycache__/* include *.txt +include CHANGELOG.md include CODEOWNERS recursive-include dataprofiler *.avro recursive-include dataprofiler *.csv diff --git a/dataprofiler/profilers/histogram_utils.py b/dataprofiler/profilers/histogram_utils.py index a5d2defa..b7aed9ad 100644 --- a/dataprofiler/profilers/histogram_utils.py +++ b/dataprofiler/profilers/histogram_utils.py @@ -7,6 +7,7 @@ A copy of the license for numpy is available here: https://github.com/numpy/numpy/blob/main/LICENSE.txt """ + import operator from typing import List, Optional, Tuple, Union @@ -16,6 +17,10 @@ try: # numpy v2+ + # NOTE: `numpy.lib._histograms_impl` is a private module, so this import may + # need to be revisited if NumPy exposes a public replacement for these + # helpers. NumPy's 2.4.0 release notes discuss public APIs replacing modules + # that moved private in 2.x: https://numpy.org/doc/stable/release/2.4.0-notes.html from numpy.lib._histograms_impl import ( # type: ignore[attr-defined] _get_outer_edges, _hist_bin_selectors, diff --git a/requirements-ml.txt b/requirements-ml.txt index 0c02d6bc..c8b373f2 100644 --- a/requirements-ml.txt +++ b/requirements-ml.txt @@ -1,5 +1,5 @@ scikit-learn>=0.23.2 -keras>3.4.0 +keras>3.4.0,<4.0.0 rapidfuzz>=2.6.1 tensorflow>=2.16.0 tqdm>=4.0.0 diff --git a/requirements.txt b/requirements.txt index 355018fe..3db3daad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ h5py>=2.10.0 wheel>=0.33.1 -numpy>=1.0.0 +numpy>=1.22.0,<3.0.0 pandas>=1.1.2,<3.0.0 python-dateutil>=2.7.5 pytz>=2020.1 From 03b4fa1366109636cedbafe7973f21e90bfd643b Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 22 May 2026 12:56:00 -0500 Subject: [PATCH 27/28] fix: pre-commit --- dataprofiler/__init__.py | 1 + dataprofiler/_typing.py | 1 + dataprofiler/data_readers/avro_data.py | 1 + dataprofiler/data_readers/base_data.py | 1 + dataprofiler/data_readers/csv_data.py | 1 + dataprofiler/data_readers/data_utils.py | 1 + .../data_readers/filepath_or_buffer.py | 1 + dataprofiler/data_readers/graph_data.py | 1 + dataprofiler/data_readers/json_data.py | 7 +-- dataprofiler/data_readers/parquet_data.py | 1 + .../data_readers/structured_mixins.py | 1 + dataprofiler/labelers/__init__.py | 1 + dataprofiler/labelers/base_model.py | 1 + dataprofiler/labelers/char_load_tf_model.py | 7 +-- .../labelers/character_level_cnn_model.py | 7 +-- .../labelers/classification_report_utils.py | 1 + dataprofiler/labelers/column_name_model.py | 1 + dataprofiler/labelers/data_processing.py | 16 ++++--- dataprofiler/labelers/labeler_utils.py | 1 + dataprofiler/labelers/regex_model.py | 1 + dataprofiler/profilers/__init__.py | 1 + .../profilers/categorical_column_profile.py | 45 ++++++++++--------- .../profilers/column_profile_compilers.py | 1 + .../profilers/data_labeler_column_profile.py | 1 + .../profilers/datetime_column_profile.py | 5 ++- .../profilers/float_column_profile.py | 1 + dataprofiler/profilers/graph_profiler.py | 1 + dataprofiler/profilers/helpers/__init__.py | 1 + .../profilers/helpers/report_helpers.py | 1 + dataprofiler/profilers/int_column_profile.py | 1 + dataprofiler/profilers/json_decoder.py | 13 +++--- .../profilers/numerical_column_stats.py | 11 ++--- .../profilers/order_column_profile.py | 1 + dataprofiler/profilers/profile_builder.py | 38 +++++++++------- dataprofiler/profilers/profiler_utils.py | 7 ++- dataprofiler/profilers/text_column_profile.py | 1 + .../profilers/unstructured_labeler_profile.py | 1 + .../profilers/unstructured_text_profile.py | 1 + dataprofiler/reports/graphs.py | 1 + dataprofiler/reports/utils.py | 1 + dataprofiler/rng_utils.py | 1 + dataprofiler/settings.py | 1 + dataprofiler/tests/plugins/test_plugins.py | 10 ++--- .../profilers/test_float_column_profile.py | 18 +++++++- .../tests/profilers/test_profiler_utils.py | 1 - .../structured_space_time_analysis.py | 1 + dataprofiler/tests/test_rng_utils.py | 1 + 47 files changed, 138 insertions(+), 81 deletions(-) diff --git a/dataprofiler/__init__.py b/dataprofiler/__init__.py index ed54be24..6b29c5b7 100644 --- a/dataprofiler/__init__.py +++ b/dataprofiler/__init__.py @@ -1,4 +1,5 @@ """Package for dataprofiler.""" + from . import settings from ._version import get_versions from .data_readers.data import Data diff --git a/dataprofiler/_typing.py b/dataprofiler/_typing.py index fa362d1b..b7a62388 100644 --- a/dataprofiler/_typing.py +++ b/dataprofiler/_typing.py @@ -1,4 +1,5 @@ """Contains typing aliases.""" + from typing import Dict, List, NewType, Union import numpy as np diff --git a/dataprofiler/data_readers/avro_data.py b/dataprofiler/data_readers/avro_data.py index 720b9d1f..7f15bdec 100644 --- a/dataprofiler/data_readers/avro_data.py +++ b/dataprofiler/data_readers/avro_data.py @@ -1,4 +1,5 @@ """Contains class for saving and loading spreadsheet data.""" + from io import BytesIO, StringIO from typing import Any, Dict, List, Optional, Union diff --git a/dataprofiler/data_readers/base_data.py b/dataprofiler/data_readers/base_data.py index 27d8d5de..e6e85d3d 100644 --- a/dataprofiler/data_readers/base_data.py +++ b/dataprofiler/data_readers/base_data.py @@ -1,4 +1,5 @@ """Contains abstract class for data loading and saving.""" + import locale import sys from collections import OrderedDict diff --git a/dataprofiler/data_readers/csv_data.py b/dataprofiler/data_readers/csv_data.py index 7e13d407..cb1a2e2d 100644 --- a/dataprofiler/data_readers/csv_data.py +++ b/dataprofiler/data_readers/csv_data.py @@ -1,4 +1,5 @@ """Contains class that saves and loads spreadsheet data.""" + import csv import random import re diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 6e213810..833d650e 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -1,4 +1,5 @@ """Contains functions for data readers.""" + import json import logging import os diff --git a/dataprofiler/data_readers/filepath_or_buffer.py b/dataprofiler/data_readers/filepath_or_buffer.py index 56c21e28..201f690e 100644 --- a/dataprofiler/data_readers/filepath_or_buffer.py +++ b/dataprofiler/data_readers/filepath_or_buffer.py @@ -1,4 +1,5 @@ """Contains functions and classes for handling filepaths and buffers.""" + from io import BytesIO, StringIO, TextIOWrapper from typing import IO, Any, Optional, Type, Union, cast diff --git a/dataprofiler/data_readers/graph_data.py b/dataprofiler/data_readers/graph_data.py index 337408a6..3cc83b04 100644 --- a/dataprofiler/data_readers/graph_data.py +++ b/dataprofiler/data_readers/graph_data.py @@ -1,4 +1,5 @@ """Contains class for identifying, reading, and loading graph data.""" + import csv from typing import Dict, List, Optional, Union, cast diff --git a/dataprofiler/data_readers/json_data.py b/dataprofiler/data_readers/json_data.py index 93e5d7e6..cc71c57e 100644 --- a/dataprofiler/data_readers/json_data.py +++ b/dataprofiler/data_readers/json_data.py @@ -1,4 +1,5 @@ """Contains class to save and load json data.""" + import json import re import warnings @@ -71,9 +72,9 @@ def __init__( self._data_formats["records"] = self._get_data_as_records self._data_formats["json"] = self._get_data_as_json - self._data_formats[ - "flattened_dataframe" - ] = self._get_data_as_flattened_dataframe + self._data_formats["flattened_dataframe"] = ( + self._get_data_as_flattened_dataframe + ) self._selected_data_format: str = options.get( "data_format", "flattened_dataframe" ) diff --git a/dataprofiler/data_readers/parquet_data.py b/dataprofiler/data_readers/parquet_data.py index 4fa567b8..b679431b 100644 --- a/dataprofiler/data_readers/parquet_data.py +++ b/dataprofiler/data_readers/parquet_data.py @@ -1,4 +1,5 @@ """Contains class to save and load parquet data.""" + from io import BytesIO, StringIO from typing import Any, Dict, List, Optional, Union diff --git a/dataprofiler/data_readers/structured_mixins.py b/dataprofiler/data_readers/structured_mixins.py index 3587291f..6b1da157 100644 --- a/dataprofiler/data_readers/structured_mixins.py +++ b/dataprofiler/data_readers/structured_mixins.py @@ -1,4 +1,5 @@ """Contains mixin data class for loading datasets of tye SpreadSheet.""" + from logging import Logger from typing import Any, Dict, List, Optional, Union, cast diff --git a/dataprofiler/labelers/__init__.py b/dataprofiler/labelers/__init__.py index 1b2302fc..a355ead2 100644 --- a/dataprofiler/labelers/__init__.py +++ b/dataprofiler/labelers/__init__.py @@ -26,6 +26,7 @@ 2. structured_model 3. regex_model """ + # import data labelers # import models from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler diff --git a/dataprofiler/labelers/base_model.py b/dataprofiler/labelers/base_model.py index 08b453ec..c5d7aef5 100644 --- a/dataprofiler/labelers/base_model.py +++ b/dataprofiler/labelers/base_model.py @@ -1,4 +1,5 @@ """Contains abstract classes for labeling data.""" + from __future__ import annotations import abc diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index eb7c1b53..8adbfa0b 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -1,4 +1,5 @@ """Contains class for training data labeler model.""" + from __future__ import annotations import copy @@ -573,9 +574,9 @@ def predict( confidences[ allocation_index : allocation_index + num_samples_in_batch ] = model_output[self._SOFTMAX_OUTPUT].numpy() - predictions[ - allocation_index : allocation_index + num_samples_in_batch - ] = model_output[self._ARGMAX_OUTPUT].numpy() + predictions[allocation_index : allocation_index + num_samples_in_batch] = ( + model_output[self._ARGMAX_OUTPUT].numpy() + ) allocation_index += num_samples_in_batch diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 20c4777c..7d78900a 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -1,4 +1,5 @@ """Contains classes for char data labeling.""" + from __future__ import annotations import copy @@ -947,9 +948,9 @@ def predict( confidences[ allocation_index : allocation_index + num_samples_in_batch ] = model_output[self._SOFTMAX_OUTPUT].numpy() - predictions[ - allocation_index : allocation_index + num_samples_in_batch - ] = model_output[self._ARGMAX_OUTPUT].numpy() + predictions[allocation_index : allocation_index + num_samples_in_batch] = ( + model_output[self._ARGMAX_OUTPUT].numpy() + ) sentence_lengths[ allocation_index : allocation_index + num_samples_in_batch ] = list(map(lambda x: len(x[0]), batch_data)) diff --git a/dataprofiler/labelers/classification_report_utils.py b/dataprofiler/labelers/classification_report_utils.py index 3146e829..840c236f 100644 --- a/dataprofiler/labelers/classification_report_utils.py +++ b/dataprofiler/labelers/classification_report_utils.py @@ -1,4 +1,5 @@ """Contains functions for classification.""" + from __future__ import annotations import warnings diff --git a/dataprofiler/labelers/column_name_model.py b/dataprofiler/labelers/column_name_model.py index d698cfd6..1732983c 100644 --- a/dataprofiler/labelers/column_name_model.py +++ b/dataprofiler/labelers/column_name_model.py @@ -1,4 +1,5 @@ """Contains class for column name data labeling model.""" + from __future__ import annotations import json diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index 33d916cc..70c980c3 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -176,9 +176,11 @@ def process( labels: np.ndarray | None = None, label_mapping: dict[str, int] | None = None, batch_size: int = 32, - ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[ - np.ndarray, np.ndarray - ] | np.ndarray: + ) -> ( + Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] + | tuple[np.ndarray, np.ndarray] + | np.ndarray + ): """Preprocess data.""" raise NotImplementedError() @@ -1950,9 +1952,11 @@ def _validate_parameters(self, parameters: dict) -> None: # being changed and is already set aggregation_func = parameters.get( "aggregation_func", - self._parameters.get("aggregation_func") - if hasattr(self, "_parameters") - else None, + ( + self._parameters.get("aggregation_func") + if hasattr(self, "_parameters") + else None + ), ) if value is None and aggregation_func == "priority": errors.append( diff --git a/dataprofiler/labelers/labeler_utils.py b/dataprofiler/labelers/labeler_utils.py index 2a74b87e..3a1097ce 100644 --- a/dataprofiler/labelers/labeler_utils.py +++ b/dataprofiler/labelers/labeler_utils.py @@ -1,4 +1,5 @@ """Contains functions for the data labeler.""" + from __future__ import annotations import logging diff --git a/dataprofiler/labelers/regex_model.py b/dataprofiler/labelers/regex_model.py index c6a690c1..dd74da71 100644 --- a/dataprofiler/labelers/regex_model.py +++ b/dataprofiler/labelers/regex_model.py @@ -1,4 +1,5 @@ """Contains class for regex data labeling model.""" + from __future__ import annotations import copy diff --git a/dataprofiler/profilers/__init__.py b/dataprofiler/profilers/__init__.py index 4b068fcb..14834794 100644 --- a/dataprofiler/profilers/__init__.py +++ b/dataprofiler/profilers/__init__.py @@ -1,4 +1,5 @@ """Package for providing statistics and predictions for a given dataset.""" + from . import json_decoder from .base_column_profilers import BaseColumnProfiler from .categorical_column_profile import CategoricalColumn diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 1ca63090..d64f5aa5 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -1,4 +1,5 @@ """Contains class for categorical column profiler.""" + from __future__ import annotations import math @@ -277,28 +278,28 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: # These stats are only diffed if both profiles are categorical if self.is_match and other_profile.is_match: - differences["statistics"][ - "chi2-test" - ] = profiler_utils.perform_chi_squared_test_for_homogeneity( - self._categories, - self.sample_size, - other_profile._categories, - other_profile.sample_size, + differences["statistics"]["chi2-test"] = ( + profiler_utils.perform_chi_squared_test_for_homogeneity( + self._categories, + self.sample_size, + other_profile._categories, + other_profile.sample_size, + ) ) - differences["statistics"][ - "categories" - ] = profiler_utils.find_diff_of_lists_and_sets( - self.categories, other_profile.categories + differences["statistics"]["categories"] = ( + profiler_utils.find_diff_of_lists_and_sets( + self.categories, other_profile.categories + ) ) - differences["statistics"][ - "gini_impurity" - ] = profiler_utils.find_diff_of_numbers( - self.gini_impurity, other_profile.gini_impurity + differences["statistics"]["gini_impurity"] = ( + profiler_utils.find_diff_of_numbers( + self.gini_impurity, other_profile.gini_impurity + ) ) - differences["statistics"][ - "unalikeability" - ] = profiler_utils.find_diff_of_numbers( - self.unalikeability, other_profile.unalikeability + differences["statistics"]["unalikeability"] = ( + profiler_utils.find_diff_of_numbers( + self.unalikeability, other_profile.unalikeability + ) ) cat_count1 = dict( sorted(self._categories.items(), key=itemgetter(1), reverse=True) @@ -326,9 +327,9 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: ) differences["statistics"]["psi"] = total_psi - differences["statistics"][ - "categorical_count" - ] = profiler_utils.find_diff_of_dicts(self_cat_count, other_cat_count) + differences["statistics"]["categorical_count"] = ( + profiler_utils.find_diff_of_dicts(self_cat_count, other_cat_count) + ) return differences diff --git a/dataprofiler/profilers/column_profile_compilers.py b/dataprofiler/profilers/column_profile_compilers.py index 07edf13d..cfeb8c69 100644 --- a/dataprofiler/profilers/column_profile_compilers.py +++ b/dataprofiler/profilers/column_profile_compilers.py @@ -1,4 +1,5 @@ """For generating a report.""" + from __future__ import annotations import abc diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py index 81f9c0ce..3ce6257f 100644 --- a/dataprofiler/profilers/data_labeler_column_profile.py +++ b/dataprofiler/profilers/data_labeler_column_profile.py @@ -1,4 +1,5 @@ """Contains class for for profiling data labeler col.""" + from __future__ import annotations import operator diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py index af99283a..1042ea0c 100644 --- a/dataprofiler/profilers/datetime_column_profile.py +++ b/dataprofiler/profilers/datetime_column_profile.py @@ -1,4 +1,5 @@ """Contains class for profiling datetime column.""" + from __future__ import annotations import datetime @@ -216,7 +217,7 @@ def _validate_datetime(date: str, date_format: str) -> datetime.datetime | float :return: either the str converted into a date format, or Nan """ try: - converted_date: (datetime.datetime | float) = datetime.datetime.strptime( + converted_date: datetime.datetime | float = datetime.datetime.strptime( date, date_format ) except (ValueError, TypeError): @@ -237,7 +238,7 @@ def _replace_day_suffix(date: str, pattern: re.Pattern) -> str | float: """ try: new_date: str | float = pattern.sub(r"\1", date) - except (TypeError): + except TypeError: new_date = np.nan return new_date diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index bc426a44..3d6ede32 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -1,4 +1,5 @@ """Float profile analysis for individual col within structured profiling.""" + from __future__ import annotations import copy diff --git a/dataprofiler/profilers/graph_profiler.py b/dataprofiler/profilers/graph_profiler.py index 0680a29a..345a0f2e 100644 --- a/dataprofiler/profilers/graph_profiler.py +++ b/dataprofiler/profilers/graph_profiler.py @@ -1,4 +1,5 @@ """Class and functions to calculate and profile properties of graph data.""" + from __future__ import annotations import importlib diff --git a/dataprofiler/profilers/helpers/__init__.py b/dataprofiler/profilers/helpers/__init__.py index 43393433..2c72b2f3 100644 --- a/dataprofiler/profilers/helpers/__init__.py +++ b/dataprofiler/profilers/helpers/__init__.py @@ -1,4 +1,5 @@ """This package provides helper functions for generating reports.""" + from .report_helpers import _prepare_report, calculate_quantiles __all__ = [ diff --git a/dataprofiler/profilers/helpers/report_helpers.py b/dataprofiler/profilers/helpers/report_helpers.py index 0588252c..44ac8fb1 100644 --- a/dataprofiler/profilers/helpers/report_helpers.py +++ b/dataprofiler/profilers/helpers/report_helpers.py @@ -1,4 +1,5 @@ """Contains helper functions for generating report.""" + from __future__ import annotations import math diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index 4b8ab728..ae4ed575 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -1,4 +1,5 @@ """Int profile analysis for individual col within structured profiling.""" + from __future__ import annotations import numpy as np diff --git a/dataprofiler/profilers/json_decoder.py b/dataprofiler/profilers/json_decoder.py index fb4ff8cb..eb09db0d 100644 --- a/dataprofiler/profilers/json_decoder.py +++ b/dataprofiler/profilers/json_decoder.py @@ -1,4 +1,5 @@ """Contains methods to decode components of a Profiler.""" + from __future__ import annotations import warnings @@ -116,9 +117,9 @@ def get_structured_col_profiler_class(class_name: str) -> type[StructuredColProf :type class_name: str representing name of class :return: subclass of StructuredColProfiler object """ - struct_col_profiler_class: None | ( - type[StructuredColProfiler] - ) = _structured_col_profiler.get(class_name) + struct_col_profiler_class: None | (type[StructuredColProfiler]) = ( + _structured_col_profiler.get(class_name) + ) if struct_col_profiler_class is None: raise ValueError( f"Invalid structured col profiler class {class_name} " f"failed to load." @@ -153,9 +154,9 @@ def load_column_profile( JSON """ - column_profiler_cls: type[ - BaseColumnProfiler[BaseColumnProfiler] - ] = get_column_profiler_class(serialized_json["class"]) + column_profiler_cls: type[BaseColumnProfiler[BaseColumnProfiler]] = ( + get_column_profiler_class(serialized_json["class"]) + ) return column_profiler_cls.load_from_dict(serialized_json["data"], config) diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 5135bcfd..9ec2190a 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -258,9 +258,9 @@ def _add_helper_merge_profile_histograms( if self.user_set_histogram_bin is None: for method in self.histogram_bin_method_names: - self.histogram_methods[method][ - "suggested_bin_count" - ] = histogram_utils._calculate_bins_from_profile(self, method) + self.histogram_methods[method]["suggested_bin_count"] = ( + histogram_utils._calculate_bins_from_profile(self, method) + ) self._get_quantiles() @@ -1044,10 +1044,7 @@ def _merge_biased_kurtosis( / N**3 ) third_term = ( - 6 - * delta**2 - * (match_count1**2 * M2_2 + match_count2**2 * M2_1) - / N**2 + 6 * delta**2 * (match_count1**2 * M2_2 + match_count2**2 * M2_1) / N**2 ) fourth_term = 4 * delta * (match_count1 * M3_2 - match_count2 * M3_1) / N M4 = first_term + second_term + third_term + fourth_term diff --git a/dataprofiler/profilers/order_column_profile.py b/dataprofiler/profilers/order_column_profile.py index 30826232..0a437431 100644 --- a/dataprofiler/profilers/order_column_profile.py +++ b/dataprofiler/profilers/order_column_profile.py @@ -1,4 +1,5 @@ """Index profile analysis for individual col within structured profiling.""" + from __future__ import annotations from abc import abstractmethod diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index 6e512658..7d904b6a 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -1919,10 +1919,10 @@ def diff( # type: ignore[override] col_name = other_profile._profile[i].name other_profile_schema[col_name].append(i) - report["global_stats"][ - "profile_schema" - ] = profiler_utils.find_diff_of_dicts_with_diff_keys( - self_profile_schema, other_profile_schema + report["global_stats"]["profile_schema"] = ( + profiler_utils.find_diff_of_dicts_with_diff_keys( + self_profile_schema, other_profile_schema + ) ) # Only find the diff of columns if the schemas are exactly the same @@ -2101,9 +2101,9 @@ def report(self, report_options: dict = None) -> dict: self.options.null_replication_metrics.is_enabled and i in self._null_replication_metrics ): - report["data_stats"][i][ - "null_replication_metrics" - ] = self._null_replication_metrics[i] + report["data_stats"][i]["null_replication_metrics"] = ( + self._null_replication_metrics[i] + ) return _prepare_report(report, output_format, omit_keys) @@ -2610,9 +2610,11 @@ def _update_null_replication_metrics(self, clean_samples: dict) -> None: total_row_sum = np.asarray( [ - get_data_type_profiler(profile).sum - if get_data_type(profile) not in [None, "datetime"] - else np.nan + ( + get_data_type_profiler(profile).sum + if get_data_type(profile) not in [None, "datetime"] + else np.nan + ) for profile in self._profile ] ) @@ -2704,17 +2706,21 @@ def _merge_null_replication_metrics(self, other: StructuredProfiler) -> dict: self_row_sum = np.asarray( [ - get_data_type_profiler(profile).sum - if get_data_type(profile) - else np.nan + ( + get_data_type_profiler(profile).sum + if get_data_type(profile) + else np.nan + ) for profile in self._profile ] ) other_row_sum = np.asarray( [ - get_data_type_profiler(profile).sum - if get_data_type(profile) - else np.nan + ( + get_data_type_profiler(profile).sum + if get_data_type(profile) + else np.nan + ) for profile in other._profile ] ) diff --git a/dataprofiler/profilers/profiler_utils.py b/dataprofiler/profilers/profiler_utils.py index 2cc9846b..7986cec0 100644 --- a/dataprofiler/profilers/profiler_utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -1,4 +1,5 @@ """Contains functions for profilers.""" + from __future__ import annotations import collections @@ -429,13 +430,11 @@ def __sub__(self: T, other: T) -> Any: def find_diff_of_numbers( stat1: int | float | np.float64 | np.int64 | None, stat2: int | float | np.float64 | np.int64 | None, -) -> Any: - ... +) -> Any: ... @overload -def find_diff_of_numbers(stat1: T | None, stat2: T | None) -> Any: - ... +def find_diff_of_numbers(stat1: T | None, stat2: T | None) -> Any: ... def find_diff_of_numbers(stat1, stat2): diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index bea8dbd6..eb79643f 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -1,4 +1,5 @@ """Text profile analysis for individual col within structured profiling..""" + from __future__ import annotations import itertools diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py index 1c7b16c0..22789c4e 100644 --- a/dataprofiler/profilers/unstructured_labeler_profile.py +++ b/dataprofiler/profilers/unstructured_labeler_profile.py @@ -1,4 +1,5 @@ """Profile analysis for applying labels within unstructured profiling.""" + from __future__ import annotations from collections import defaultdict diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py index 96b7d062..3f1b6dd7 100644 --- a/dataprofiler/profilers/unstructured_text_profile.py +++ b/dataprofiler/profilers/unstructured_text_profile.py @@ -1,4 +1,5 @@ """For profiling unstructured text data.""" + from __future__ import annotations import itertools diff --git a/dataprofiler/reports/graphs.py b/dataprofiler/reports/graphs.py index 1f0b4301..4e630a1c 100644 --- a/dataprofiler/reports/graphs.py +++ b/dataprofiler/reports/graphs.py @@ -1,4 +1,5 @@ """Contains functions for generating graph data report.""" + # !/usr/bin/env python3 from __future__ import annotations diff --git a/dataprofiler/reports/utils.py b/dataprofiler/reports/utils.py index a10b8fe5..975dc7d8 100644 --- a/dataprofiler/reports/utils.py +++ b/dataprofiler/reports/utils.py @@ -1,4 +1,5 @@ """Contains functions for checking for installations/dependencies.""" + import sys import warnings from typing import Any, Callable, List, TypeVar, cast diff --git a/dataprofiler/rng_utils.py b/dataprofiler/rng_utils.py index 32906665..2fd14f0f 100644 --- a/dataprofiler/rng_utils.py +++ b/dataprofiler/rng_utils.py @@ -1,4 +1,5 @@ """Create a random number generator using a manual seed DATAPROFILER_SEED.""" + import os import warnings diff --git a/dataprofiler/settings.py b/dataprofiler/settings.py index 1ba017f4..a81c3477 100644 --- a/dataprofiler/settings.py +++ b/dataprofiler/settings.py @@ -1,2 +1,3 @@ """Configure settings for dataprofiler.""" + _seed = None diff --git a/dataprofiler/tests/plugins/test_plugins.py b/dataprofiler/tests/plugins/test_plugins.py index ec148a52..9368975d 100644 --- a/dataprofiler/tests/plugins/test_plugins.py +++ b/dataprofiler/tests/plugins/test_plugins.py @@ -28,8 +28,8 @@ def test_plugin(): @mock.patch("dataprofiler.plugins.__init__.os.path.isdir") @mock.patch("dataprofiler.plugins.__init__.os.listdir") def test_load_plugin(self, mock_listdir, mock_isdir, mock_importlib_util): - mock_listdir.side_effect = ( - lambda folder_dir: ["__pycache__", "py"] + mock_listdir.side_effect = lambda folder_dir: ( + ["__pycache__", "py"] if folder_dir.endswith("plugins") else ["stillnotrealpy", "a.json", None] ) @@ -38,10 +38,8 @@ def test_load_plugin(self, mock_listdir, mock_isdir, mock_importlib_util): load_plugins() mock_importlib_util.spec_from_file_location.assert_not_called() - mock_listdir.side_effect = ( - lambda folder_dir: ["folder"] - if folder_dir.endswith("plugins") - else ["file.py"] + mock_listdir.side_effect = lambda folder_dir: ( + ["folder"] if folder_dir.endswith("plugins") else ["file.py"] ) mock_spec = mock.Mock() mock_importlib_util.spec_from_file_location.return_value = mock_spec diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index d9ec122c..19fe4c8a 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -835,7 +835,14 @@ def test_total_histogram_bin_variance(self): def test_histogram_loss(self): # run time is small - (diff_var, avg_diffvar, total_var, avg_totalvar, run_time, avg_runtime,) = ( + ( + diff_var, + avg_diffvar, + total_var, + avg_totalvar, + run_time, + avg_runtime, + ) = ( 0.3, 0.2, 0.1, @@ -855,7 +862,14 @@ def test_histogram_loss(self): self.assertEqual(expected_loss, est_loss) # run time is big - (diff_var, avg_diffvar, total_var, avg_totalvar, run_time, avg_runtime,) = ( + ( + diff_var, + avg_diffvar, + total_var, + avg_totalvar, + run_time, + avg_runtime, + ) = ( 0.3, 0.2, 0.1, diff --git a/dataprofiler/tests/profilers/test_profiler_utils.py b/dataprofiler/tests/profilers/test_profiler_utils.py index 4eee1963..0ea0c0fc 100644 --- a/dataprofiler/tests/profilers/test_profiler_utils.py +++ b/dataprofiler/tests/profilers/test_profiler_utils.py @@ -472,7 +472,6 @@ def test_odd_merge_profile_list(self, mock_data_labeler, *mocks): class TestAutoMultiProcessToggle(unittest.TestCase): - """ Validate profile_utils.auto_multiprocess_toggle is properly working. """ diff --git a/dataprofiler/tests/space_time_analysis/structured_space_time_analysis.py b/dataprofiler/tests/space_time_analysis/structured_space_time_analysis.py index df57854f..5af7bc2a 100644 --- a/dataprofiler/tests/space_time_analysis/structured_space_time_analysis.py +++ b/dataprofiler/tests/space_time_analysis/structured_space_time_analysis.py @@ -1,4 +1,5 @@ """Contains space and time analysis tests for the Dataprofiler""" + import json import os import random diff --git a/dataprofiler/tests/test_rng_utils.py b/dataprofiler/tests/test_rng_utils.py index 6ee2ed35..8a4c4d22 100644 --- a/dataprofiler/tests/test_rng_utils.py +++ b/dataprofiler/tests/test_rng_utils.py @@ -1,4 +1,5 @@ """Validates that generator intakes DATAPROFILER_SEED properly.""" + import os import unittest import unittest.mock From ffbac1aff2c6fbb36f297b993dab5a6d135e7ab4 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Fri, 22 May 2026 14:44:22 -0500 Subject: [PATCH 28/28] refactor: add unit tests validating usage of the old load format --- .../tests/labelers/test_char_tf_load_model.py | 20 ++++++++++++ .../test_character_level_cnn_model.py | 32 +++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py index 40879e57..6160b8fa 100644 --- a/dataprofiler/tests/labelers/test_char_tf_load_model.py +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -227,6 +227,26 @@ def test_predict(self, *mocks): self.assertIn("conf", result) self.assertEqual((2, 2, model.num_labels), np.array(result["conf"]).shape) + def test_normalize_old_list_output_model(self, *mocks): + inputs = tf.keras.Input(shape=(2,), dtype=tf.int64) + embedded = tf.keras.layers.Embedding(input_dim=100, output_dim=8)(inputs) + softmax_output = tf.keras.layers.Dense( + self.label_mapping["ADDRESS"] + 1, + activation="softmax", + )(embedded) + argmax_output = tf.keras.layers.Lambda( + lambda x: tf.cast(tf.argmax(x, axis=2), tf.int64) + )(softmax_output) + old_format_model = tf.keras.Model(inputs, [softmax_output, argmax_output]) + + normalized_model = CharLoadTFModel._normalize_model_outputs(old_format_model) + + self.assertIsInstance(normalized_model.output, dict) + self.assertSetEqual( + set(normalized_model.output.keys()), + {CharLoadTFModel._SOFTMAX_OUTPUT, CharLoadTFModel._ARGMAX_OUTPUT}, + ) + def test_fit_and_predict(self, *mocks): # model model = CharLoadTFModel(self.model_path, self.label_mapping) diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py index 554b61c8..311da006 100644 --- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py +++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py @@ -10,8 +10,10 @@ from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.character_level_cnn_model import ( + ArgMaxLayer, CharacterLevelCnnModel, EncodingLayer, + ThreshArgMaxLayer, ) _file_dir = os.path.dirname(os.path.abspath(__file__)) @@ -253,6 +255,36 @@ def test_validation_evaluate_and_classification_report(self, *mocks): self.assertIsNotNone(f1_report) self.assertEqual(11, f1_report["ADDRESS"]["support"]) + def test_normalize_old_list_output_model(self): + default_ind = self.label_mapping["UNKNOWN"] + num_labels = max(self.label_mapping.values()) + 1 + inputs = tf.keras.Input(shape=(2, 4)) + hidden = tf.keras.layers.Dense(8, activation="relu")(inputs) + softmax_output = tf.keras.layers.Dense(num_labels, activation="softmax")(hidden) + argmax_output = ArgMaxLayer()(softmax_output) + threshold_output = ThreshArgMaxLayer( + threshold_=0.0, + num_labels_=num_labels, + default_ind=default_ind, + )(argmax_output, softmax_output) + old_format_model = tf.keras.Model( + inputs, [softmax_output, argmax_output, threshold_output] + ) + + normalized_model = CharacterLevelCnnModel._normalize_model_outputs( + old_format_model, default_ind, num_labels + ) + + self.assertIsInstance(normalized_model.output, dict) + self.assertSetEqual( + set(normalized_model.output.keys()), + { + CharacterLevelCnnModel._SOFTMAX_OUTPUT, + CharacterLevelCnnModel._ARGMAX_OUTPUT, + CharacterLevelCnnModel._THRESH_OUTPUT, + }, + ) + def test_fit_and_predict_with_new_labels(self): # Initialize model cnn_model = CharacterLevelCnnModel(self.label_mapping)