Skip to content
Open
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d6b1ec1
refactor: move from deprecated pkg_resources
JGSweets May 6, 2026
9920fab
fix: to use func
JGSweets May 6, 2026
7592895
fix: add missing change
JGSweets May 6, 2026
a3592fb
refactor: resources to be in package
JGSweets May 6, 2026
3ecaf6b
fix: tests bc of almost
JGSweets May 6, 2026
0a2efd3
feat: refactor to pass in a path or string or None
JGSweets May 6, 2026
fb321a2
fix: import for older versions
JGSweets May 6, 2026
2207920
fix: Tranversable must be done at runtime
JGSweets May 6, 2026
96344db
refactor: keras reqs and others
JGSweets May 6, 2026
0458e73
refactor: losses for keras and tests
JGSweets May 6, 2026
2fe4ddd
fix: remove unneeded global
JGSweets May 6, 2026
c41303e
fix: accidentally duplicated test on rebase
JGSweets May 6, 2026
0615268
fix: rebase duplicates
JGSweets May 6, 2026
f08af16
fix: keras reqs
JGSweets May 6, 2026
e5f4041
refactor: update to be more than 3.4.0 for keras
JGSweets May 6, 2026
052d058
refactor: numpy2 and mypy
JGSweets May 11, 2026
3965667
fix: mypy 3.10
JGSweets May 11, 2026
f1046a9
fix: bugs
JGSweets May 11, 2026
8f1b4e0
fix: float
JGSweets May 11, 2026
fdc671e
refactor: for hist fix too
JGSweets May 11, 2026
34c47fe
fix: issue with none in hist
JGSweets May 11, 2026
57066fb
fix: remove comment
JGSweets May 22, 2026
5de7abe
refactor: to still utilize dict mapping for losses
JGSweets May 22, 2026
e1afcf7
fix: int pre-commit
JGSweets May 22, 2026
0b00aed
fix: train labeling
JGSweets May 22, 2026
8edd1dc
refactor notes, reqs, and change log
JGSweets May 22, 2026
03b4fa1
fix: pre-commit
JGSweets May 22, 2026
ffbac1a
refactor: add unit tests validating usage of the old load format
JGSweets May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ repos:
# Flake8: complexity and style checking
# https://flake8.pycqa.org/en/latest/user/using-hooks.html
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 7.3.0
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
Expand Down Expand Up @@ -50,15 +50,15 @@ repos:
# requirements.txt
h5py>=2.10.0,
wheel>=0.33.1,
numpy<2.0.0,
numpy>=1.0.0,
'pandas>=1.1.2,<3.0.0',
python-dateutil>=2.7.5,
pytz>=2020.1,
pyarrow>=1.0.1,
'chardet>=3.0.4,<7.0.0',
fastavro>=1.0.0.post1,
python-snappy>=0.7.1,
charset-normalizer>=1.3.6,
'charset-normalizer>=1.3.6,<7.0.0',
psutil>=4.0.0,
scipy>=1.4.1,
requests>=2.28.1,
Expand All @@ -82,11 +82,9 @@ repos:

# requirements-ml.txt
scikit-learn>=0.23.2,
'keras>=2.4.3,<=3.4.0',
'keras>3.4.0',
rapidfuzz>=2.6.1,
"tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
"tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
"tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
"tensorflow>=2.16.0",
tqdm>=4.0.0,

# requirements-reports.txt
Expand All @@ -101,15 +99,15 @@ repos:
pytest-xdist>=2.1.0,
pytest-forked>=1.3.0,
toolz>=0.10.0,
'memray>=1.7.0,<1.12.0',
'memray>=1.18.0',
]
# Check-manifest: ensures required non-Python files are included in MANIFEST.in
# https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
- repo: https://github.com/mgedmin/check-manifest
rev: "0.48"
hooks:
- id: check-manifest
additional_dependencies: ['h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas',
additional_dependencies: ['h5py', 'wheel', 'future', 'numpy>=1.0.0', 'pandas',
'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3']
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/data_readers/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list:
except StopIteration:
break
# Append new, replace old with dummy, and keep track of order
remove_index = rng.integers(0, sample_nrows)
remove_index = int(rng.integers(0, sample_nrows))
values[indices[remove_index]] = str(None)
indices[remove_index] = len(values)
values.append(newval)
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/labelers/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseModel)
or self._parameters != other._parameters
or self._label_mapping != other._label_mapping
Expand Down
165 changes: 99 additions & 66 deletions dataprofiler/labelers/char_load_tf_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .. import dp_logging
from . import labeler_utils
from .base_model import AutoSubRegistrationMeta, BaseModel, BaseTrainableModel
from .character_level_cnn_model import ArgMaxLayer

_file_dir = os.path.dirname(os.path.abspath(__file__))

Expand All @@ -29,6 +30,8 @@ class CharLoadTFModel(BaseTrainableModel, metaclass=AutoSubRegistrationMeta):

# boolean if the label mapping requires the mapping for index 0 reserved
requires_zero_mapping = False
_SOFTMAX_OUTPUT = "softmax_output"
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Normalize the output layer names by defining them once as class constants (`_SOFTMAX_OUTPUT` and `_ARGMAX_OUTPUT`).

_ARGMAX_OUTPUT = "argmax_output"

def __init__(
self, model_path: str, label_mapping: dict[str, int], parameters: dict = None
Expand Down Expand Up @@ -61,6 +64,35 @@ def __init__(

BaseModel.__init__(self, label_mapping, parameters)

@classmethod
def _create_model_outputs(
cls, softmax_output: tf.Tensor, argmax_output: tf.Tensor | None = None
) -> dict[str, tf.Tensor]:
"""Return normalized dict outputs for training and inference."""
if argmax_output is None:
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

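Ensures the model outputs are always returned as a normalized dict keyed by the softmax and argmax output names; the argmax output is created from the softmax output when it is not supplied.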

argmax_output = ArgMaxLayer(name=cls._ARGMAX_OUTPUT)(softmax_output)
return {
cls._SOFTMAX_OUTPUT: softmax_output,
cls._ARGMAX_OUTPUT: argmax_output,
}

@classmethod
def _normalize_model_outputs(cls, model: tf.keras.Model) -> tf.keras.Model:
"""Convert list-style outputs to the normalized dict structure."""
return labeler_utils.normalize_tf_model_outputs(
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Converts models that use the previous list-style outputs to the normalized dict structure, for the output consistency that Keras 3 requires.

model,
[cls._SOFTMAX_OUTPUT, cls._ARGMAX_OUTPUT],
lambda softmax_output, extra_outputs: cls._create_model_outputs(
softmax_output, extra_outputs[0]
),
)

def _new_softmax_head_name(self) -> str:
    """
    Build the name to give a rebuilt softmax head layer.

    The candidate name is the softmax output name suffixed with
    ``_rebuild``; ``labeler_utils.get_tf_rebuild_layer_name`` turns that
    candidate into a name that is unique within the current model graph.

    :return: layer name to use for the new softmax Dense layer
    :rtype: str
    """
    # Base name that the helper resolves against the layers of self._model.
    rebuild_base_name = f"{self._SOFTMAX_OUTPUT}_rebuild"
    return labeler_utils.get_tf_rebuild_layer_name(
        self._model, rebuild_base_name
    )

def __eq__(self, other: object) -> bool:
"""
Check if two models are equal with one another.
Expand Down Expand Up @@ -215,15 +247,34 @@ def load_from_disk(cls, dirpath: str) -> CharLoadTFModel:
tf_model = tf.keras.models.load_model(dirpath)

loaded_model = cls(dirpath, label_mapping, parameters)
loaded_model._model = tf_model
loaded_model._model = cls._normalize_model_outputs(tf_model)

# load self
loaded_model._model_num_labels = loaded_model.num_labels
loaded_model._model_default_ind = loaded_model.label_mapping[
loaded_model._parameters["default_label"]
]
loaded_model._compile_model(loaded_model.num_labels)
return loaded_model

def _compile_model(self, num_labels: int) -> None:
"""Compile the model with dict-based losses and metrics."""
losses = {
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ensures the losses and metrics passed to `compile` are dicts keyed by output name, consistent with the dict-based model outputs.

self._SOFTMAX_OUTPUT: "categorical_crossentropy",
self._ARGMAX_OUTPUT: None,
}
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
self._SOFTMAX_OUTPUT: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}
self._model.compile(loss=losses, optimizer="adam", metrics=metrics)

def _construct_model(self) -> None:
"""
Model constructor for the data labeler.
Expand All @@ -237,46 +288,28 @@ def _construct_model(self) -> None:
model_loc = self._parameters["model_path"]

self._model: tf.keras.Model = tf.keras.models.load_model(model_loc)
self._model = tf.keras.Model(self._model.inputs, self._model.outputs)
softmax_output_layer_name = self._model.output_names[0]
self._model = self._normalize_model_outputs(self._model)
softmax_output = self._model.output[self._SOFTMAX_OUTPUT]
softmax_layer = softmax_output._keras_history[0]
softmax_output_layer_name = softmax_layer.name
softmax_layer_ind = cast(
int,
labeler_utils.get_tf_layer_index_from_name(
self._model, softmax_output_layer_name
),
)
softmax_layer = self._model.get_layer(softmax_output_layer_name)

new_softmax_layer = softmax_layer.output
new_softmax_layer = softmax_output
if softmax_layer.weights[0].shape[-1] != num_labels:
new_softmax_layer = tf.keras.layers.Dense(
num_labels, activation="softmax", name="softmax_output"
num_labels,
activation="softmax",
name=self._new_softmax_head_name(),
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

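Allows the new softmax head to get a fresh layer name on each rebuild instead of reusing the fixed name "softmax_output", since Keras needs layer names to be unique within a model.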

)(self._model.layers[softmax_layer_ind - 1].output)

# Add argmax layer to get labels directly as an output
argmax_layer = tf.keras.ops.argmax(new_softmax_layer, axis=2)

argmax_outputs = [new_softmax_layer, argmax_layer]
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)
self._model = tf.keras.Model(self._model.inputs, self._model.outputs)

# Compile the model w/ metrics
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
output_dict = self._create_model_outputs(new_softmax_layer)
self._model = tf.keras.Model(self._model.inputs, output_dict)
self._compile_model(num_labels)

self._epoch_id = 0
self._model_num_labels = num_labels
Expand Down Expand Up @@ -305,32 +338,14 @@ def _reconstruct_model(self) -> None:
# Add the final Softmax layer to the previous spot
# self._model.layers[-2] to skip: original softmax
final_softmax_layer = tf.keras.layers.Dense(
num_labels, activation="softmax", name="softmax_output"
num_labels,
activation="softmax",
name=self._new_softmax_head_name(),
)(self._model.layers[-2].output)

# Add argmax layer to get labels directly as an output
argmax_layer = tf.keras.ops.argmax(final_softmax_layer, axis=2)

argmax_outputs = [final_softmax_layer, argmax_layer]
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
output_dict = self._create_model_outputs(final_softmax_layer)
self._model = tf.keras.Model(self._model.inputs, output_dict)
self._compile_model(num_labels)

self._epoch_id = 0
self._model_num_labels = num_labels
Expand Down Expand Up @@ -381,42 +396,60 @@ def fit(
f1_report: dict = {}

self._model.reset_metrics()
softmax_output_layer_name = self._model.output_names[0]

start_time = time.time()
batch_id = 0
target_output = self._SOFTMAX_OUTPUT
for x_train, y_train in train_data:
model_results = self._model.train_on_batch(
x_train, {softmax_output_layer_name: y_train}
x_train,
{target_output: y_train},
return_dict=True,
)
acc_value = next(
(value for key, value in model_results.items() if key.endswith("acc")),
np.nan,
)
f1_value = next(
(value for key, value in model_results.items() if "f1" in key.lower()),
np.nan,
)
Comment on lines +413 to +420
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

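Needed because `train_on_batch(..., return_dict=True)` now returns a dict of results, so the accuracy and F1 values are looked up by key rather than by list position.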

sys.stdout.flush()
if verbose:
sys.stdout.write(
"\rEPOCH %d, batch_id %d: loss: %f - acc: %f - "
"f1_score %f" % (self._epoch_id, batch_id, *model_results[1:])
"f1_score %f"
% (
self._epoch_id,
batch_id,
model_results.get("loss", np.nan),
acc_value,
f1_value,
)
)
batch_id += 1

for i, metric_label in enumerate(self._model.metrics_names):
history[metric_label] = model_results[i]
history.update(model_results)

if val_data:
f1, f1_report = self._validate_training(val_data) # type: ignore
history["f1_report"] = f1_report

val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.NAN
val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.nan
val_precision = (
f1_report["weighted avg"]["precision"] if f1_report else np.NAN
f1_report["weighted avg"]["precision"] if f1_report else np.nan
)
val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.NAN
val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.nan
epoch_time = time.time() - start_time
logger.info(
"\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- "
"val_f1: %f - val_precision: %f - val_recall %f"
% (
self._epoch_id,
epoch_time,
*model_results[1:],
model_results.get("loss", np.nan),
acc_value,
f1_value,
val_f1,
val_precision,
val_recall,
Expand Down Expand Up @@ -463,7 +496,7 @@ def _validate_training(
y_val_pred.append(
self._model.predict(
x_val, batch_size=batch_size_test, verbose=verbose_keras
)[1]
)[self._ARGMAX_OUTPUT]
)
y_val_test.append(np.argmax(y_val, axis=-1))
batch_id += 1
Expand Down Expand Up @@ -536,10 +569,10 @@ def predict(
if show_confidences:
confidences[
allocation_index : allocation_index + num_samples_in_batch
] = model_output[0].numpy()
] = model_output[self._SOFTMAX_OUTPUT].numpy()
predictions[
allocation_index : allocation_index + num_samples_in_batch
] = model_output[1].numpy()
] = model_output[self._ARGMAX_OUTPUT].numpy()

allocation_index += num_samples_in_batch

Expand Down
Loading
Loading