Skip to content
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d6b1ec1
refactor: move from deprecated pkg_resources
JGSweets May 6, 2026
9920fab
fix: to use func
JGSweets May 6, 2026
7592895
fix: add missing change
JGSweets May 6, 2026
a3592fb
refactor: resources to be in package
JGSweets May 6, 2026
3ecaf6b
fix: tests bc of almost
JGSweets May 6, 2026
0a2efd3
feat: refactor to pass in a path or string or None
JGSweets May 6, 2026
fb321a2
fix: import for older versions
JGSweets May 6, 2026
2207920
fix: Traversable must be done at runtime
JGSweets May 6, 2026
96344db
refactor: keras reqs and others
JGSweets May 6, 2026
0458e73
refactor: losses for keras and tests
JGSweets May 6, 2026
2fe4ddd
fix: remove unneeded global
JGSweets May 6, 2026
c41303e
fix: accidentally duplicated test on rebase
JGSweets May 6, 2026
0615268
fix: rebase duplicates
JGSweets May 6, 2026
f08af16
fix: keras reqs
JGSweets May 6, 2026
e5f4041
refactor: update to be more than 3.4.0 for keras
JGSweets May 6, 2026
052d058
refactor: numpy2 and mypy
JGSweets May 11, 2026
3965667
fix: mypy 3.10
JGSweets May 11, 2026
f1046a9
fix: bugs
JGSweets May 11, 2026
8f1b4e0
fix: float
JGSweets May 11, 2026
fdc671e
refactor: for hist fix too
JGSweets May 11, 2026
34c47fe
fix: issue with none in hist
JGSweets May 11, 2026
57066fb
fix: remove comment
JGSweets May 22, 2026
5de7abe
refactor: to still utilize dict mapping for losses
JGSweets May 22, 2026
e1afcf7
fix: int pre-commit
JGSweets May 22, 2026
0b00aed
fix: train labeling
JGSweets May 22, 2026
8edd1dc
refactor notes, reqs, and change log
JGSweets May 22, 2026
03b4fa1
fix: pre-commit
JGSweets May 22, 2026
ffbac1a
refactor: add unit tests validating usage of the old load format
JGSweets May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ repos:
# Flake8: complexity and style checking
# https://flake8.pycqa.org/en/latest/user/using-hooks.html
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 7.3.0
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
Expand Down Expand Up @@ -58,7 +58,7 @@ repos:
'chardet>=3.0.4,<7.0.0',
fastavro>=1.0.0.post1,
python-snappy>=0.7.1,
charset-normalizer>=1.3.6,
'charset-normalizer>=1.3.6,<7.0.0',
psutil>=4.0.0,
scipy>=1.4.1,
requests>=2.28.1,
Expand All @@ -82,11 +82,9 @@ repos:

# requirements-ml.txt
scikit-learn>=0.23.2,
'keras>=2.4.3,<=3.4.0',
'keras>=3.11.0',
rapidfuzz>=2.6.1,
"tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
"tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
"tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
"tensorflow>=2.16.0",
tqdm>=4.0.0,

# requirements-reports.txt
Expand All @@ -101,7 +99,7 @@ repos:
pytest-xdist>=2.1.0,
pytest-forked>=1.3.0,
toolz>=0.10.0,
'memray>=1.7.0,<1.12.0',
'memray>=1.18.0',
]
# Check-manifest: ensures required non-Python files are included in MANIFEST.in
# https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/labelers/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseModel)
or self._parameters != other._parameters
or self._label_mapping != other._label_mapping
Expand Down
5 changes: 2 additions & 3 deletions dataprofiler/labelers/char_load_tf_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,7 @@ def _construct_model(self) -> None:

# Compile the model w/ metrics
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

losses = ["categorical_crossentropy", None, None]
# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
Expand Down Expand Up @@ -316,7 +315,7 @@ def _reconstruct_model(self) -> None:

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}
losses = ["categorical_crossentropy", None, None]

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
Expand Down
66 changes: 26 additions & 40 deletions dataprofiler/labelers/character_level_cnn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,7 @@ def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel:
loaded_model._model_default_ind = loaded_model.label_mapping[
loaded_model._parameters["default_label"]
]
loaded_model._compile_loss(loaded_model._model, loaded_model.num_labels)
return loaded_model

@staticmethod
Expand All @@ -475,6 +476,28 @@ def _argmax_threshold_layer(
# matrix.
return ThreshArgMaxLayer(threshold, num_labels, default_ind)

@staticmethod
def _compile_loss(model: tf.keras.Model, num_labels: int) -> None:
"""Compiles the loss for the given model and number of labels."""
# Compile the model
softmax_output_layer_name = model.output_names[0]
# losses = {softmax_output_layer_name: "categorical_crossentropy"}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please remove

losses = ["categorical_crossentropy", None, None]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quick question — the loss assignment changed from dict-based (by output name) to list-based (by position):

CharacterLevelCnnModel (3 outputs)

losses = ["categorical_crossentropy", None, None]

CharLoadTFModel (2 outputs)

losses = ["categorical_crossentropy", None]

Can you confirm that the output ordering is stable and that these align correctly with the model outputs? I just want to make sure the positional assignment matches up, since the dict approach was order-independent.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good callout! I believe the previous layers were list-based, which is why Keras 3 required the list-based losses. This code matched that; however, like you, I prefer the order-independent approach. I am looking into what it requires and into ensuring backward compatibility when loading a model that was originally list-based.


# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

model.compile(loss=losses, optimizer="adam", metrics=metrics)

def _construct_model(self) -> None:
"""
Construct model for the data labeler.
Expand Down Expand Up @@ -570,24 +593,7 @@ def _construct_model(self) -> None:
final_predicted_layer(argmax_layer, self._model.outputs[0]),
]
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
self._compile_loss(self._model, num_labels)

self._epoch_id = 0
self._model_num_labels = num_labels
Expand Down Expand Up @@ -632,24 +638,7 @@ def _reconstruct_model(self) -> None:
final_predicted_layer(argmax_layer, final_softmax_layer),
]
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
self._compile_loss(self._model, num_labels)
self._epoch_id = 0
self._model_num_labels = num_labels
self._model_default_ind = default_ind
Expand Down Expand Up @@ -699,14 +688,11 @@ def fit(
f1_report: dict = {}

self._model.reset_metrics()
softmax_output_layer_name = self._model.output_names[0]

start_time = time.time()
batch_id = 0
for x_train, y_train in train_data:
model_results = self._model.train_on_batch(
x_train, {softmax_output_layer_name: y_train}
)
model_results = self._model.train_on_batch(x_train, y_train)
sys.stdout.flush()
if verbose:
sys.stdout.write(
Expand Down
4 changes: 2 additions & 2 deletions dataprofiler/labelers/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseDataProcessor)
or self._parameters != other._parameters
):
Expand Down Expand Up @@ -1589,7 +1589,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, StructCharPostprocessor)
or self._parameters["default_label"] != other._parameters["default_label"]
or self._parameters["pad_label"] != other._parameters["pad_label"]
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/plugins/decorators.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains function for generating plugins data."""

from collections import defaultdict
from typing import Any, DefaultDict, Dict

Expand All @@ -21,7 +22,6 @@ def __inner_factory_function(fn):
:param fn: Plugin function
:return: function
"""
global plugins_dict
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

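`global` is only needed when a function rebinds a module-level variable, like `plugins_dict = {...}`. The line below only mutates the existing dict in place (`plugins_dict[typ][name] = fn`), so the declaration is unnecessary.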

plugins_dict[typ][name] = fn
return fn

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ def test_save(self, mock_open, *mocks):
StringIO.close(mock_file)

@mock.patch("tensorflow.keras.Model.save", return_value=None)
@mock.patch("tensorflow.keras.models.load_model", return_value=mock.Mock())
@mock.patch("tensorflow.keras.models.load_model", return_value=mock.MagicMock())
@mock.patch("builtins.open", side_effect=mock_open)
def test_load(self, *mocks):
dir = os.path.join(_resource_labeler_dir, "unstructured_model/")
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/tests/labelers/test_data_labelers.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ def test_has_public_functions(self, *args):

@staticmethod
def _setup_mock_load_model(mock_load_model):
mock_load_model.return_value = mock.Mock()
mock_load_model.return_value = mock.MagicMock()

def test_load_labeler(self, mock_open, mock_load_model):

Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
check-manifest>=0.50
black>=24.3.0
isort==5.12.0
pre-commit==2.19.0
pre-commit==4.3.0
tox==3.25.1
tox-conda==0.10.2
types-setuptools==67.7.0.1
Expand Down
6 changes: 2 additions & 4 deletions requirements-ml.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
scikit-learn>=0.23.2
keras<=3.4.0
keras>=3.11.0
rapidfuzz>=2.6.1
tensorflow>=2.16.0; sys.platform != 'darwin'
tensorflow>=2.16.0; sys_platform == 'darwin' and platform_machine != 'arm64'
tensorflow-macos>=2.16.0; sys_platform == 'darwin' and platform_machine == 'arm64'
tensorflow>=2.16.0
tqdm>=4.0.0
2 changes: 1 addition & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ pytest-cov>=2.8.1
pytest-xdist>=2.1.0
pytest-forked>=1.3.0
toolz>=0.10.0
memray>=1.7.0,<1.12.0
memray>=1.18.0
Loading