Skip to content
Open
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d6b1ec1
refactor: move from deprecated pkg_resources
JGSweets May 6, 2026
9920fab
fix: to use func
JGSweets May 6, 2026
7592895
fix: add missing change
JGSweets May 6, 2026
a3592fb
refactor: resources to be in package
JGSweets May 6, 2026
3ecaf6b
fix: tests bc of almost
JGSweets May 6, 2026
0a2efd3
feat: refactor to pass in a path or string or None
JGSweets May 6, 2026
fb321a2
fix: import for older versions
JGSweets May 6, 2026
2207920
fix: Tranversable must be done at runtime
JGSweets May 6, 2026
96344db
refactor: keras reqs and others
JGSweets May 6, 2026
0458e73
refactor: losses for keras and tests
JGSweets May 6, 2026
2fe4ddd
fix: remove unneeded global
JGSweets May 6, 2026
c41303e
fix: accidentally duplicated test on rebase
JGSweets May 6, 2026
0615268
fix: rebase duplicates
JGSweets May 6, 2026
f08af16
fix: keras reqs
JGSweets May 6, 2026
e5f4041
refactor: update to be more than 3.4.0 for keras
JGSweets May 6, 2026
052d058
refactor: numpy2 and mypy
JGSweets May 11, 2026
3965667
fix: mypy 3.10
JGSweets May 11, 2026
f1046a9
fix: bugs
JGSweets May 11, 2026
8f1b4e0
fix: float
JGSweets May 11, 2026
fdc671e
refactor: for hist fix too
JGSweets May 11, 2026
34c47fe
fix: issue with none in hist
JGSweets May 11, 2026
57066fb
fix: remove comment
JGSweets May 22, 2026
5de7abe
refactor: to still utilize dict mapping for losses
JGSweets May 22, 2026
e1afcf7
fix: int pre-commit
JGSweets May 22, 2026
0b00aed
fix: train labeling
JGSweets May 22, 2026
8edd1dc
refactor notes, reqs, and change log
JGSweets May 22, 2026
03b4fa1
fix: pre-commit
JGSweets May 22, 2026
ffbac1a
refactor: add unit tests validating usage of the old load format
JGSweets May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ repos:
# Flake8: complexity and style checking
# https://flake8.pycqa.org/en/latest/user/using-hooks.html
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 7.3.0
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
Expand Down Expand Up @@ -50,15 +50,15 @@ repos:
# requirements.txt
h5py>=2.10.0,
wheel>=0.33.1,
numpy<2.0.0,
numpy>=1.0.0,
'pandas>=1.1.2,<3.0.0',
python-dateutil>=2.7.5,
pytz>=2020.1,
pyarrow>=1.0.1,
'chardet>=3.0.4,<7.0.0',
fastavro>=1.0.0.post1,
python-snappy>=0.7.1,
charset-normalizer>=1.3.6,
'charset-normalizer>=1.3.6,<7.0.0',
psutil>=4.0.0,
scipy>=1.4.1,
requests>=2.28.1,
Expand All @@ -82,11 +82,9 @@ repos:

# requirements-ml.txt
scikit-learn>=0.23.2,
'keras>=2.4.3,<=3.4.0',
'keras>3.4.0',
rapidfuzz>=2.6.1,
"tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
"tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
"tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
"tensorflow>=2.16.0",
tqdm>=4.0.0,

# requirements-reports.txt
Expand All @@ -101,15 +99,15 @@ repos:
pytest-xdist>=2.1.0,
pytest-forked>=1.3.0,
toolz>=0.10.0,
'memray>=1.7.0,<1.12.0',
'memray>=1.18.0',
]
# Check-manifest: ensures required non-Python files are included in MANIFEST.in
# https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
- repo: https://github.com/mgedmin/check-manifest
rev: "0.48"
hooks:
- id: check-manifest
additional_dependencies: ['h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas',
additional_dependencies: ['h5py', 'wheel', 'future', 'numpy>=1.0.0', 'pandas',
'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3']
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/data_readers/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list:
except StopIteration:
break
# Append new, replace old with dummy, and keep track of order
remove_index = rng.integers(0, sample_nrows)
remove_index = int(rng.integers(0, sample_nrows))
values[indices[remove_index]] = str(None)
indices[remove_index] = len(values)
values.append(newval)
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/labelers/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseModel)
or self._parameters != other._parameters
or self._label_mapping != other._label_mapping
Expand Down
16 changes: 6 additions & 10 deletions dataprofiler/labelers/char_load_tf_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,7 @@ def _construct_model(self) -> None:

# Compile the model w/ metrics
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

losses = ["categorical_crossentropy", None]
# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
Expand Down Expand Up @@ -316,7 +315,7 @@ def _reconstruct_model(self) -> None:

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}
losses = ["categorical_crossentropy", None]

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
Expand Down Expand Up @@ -381,14 +380,11 @@ def fit(
f1_report: dict = {}

self._model.reset_metrics()
softmax_output_layer_name = self._model.output_names[0]

start_time = time.time()
batch_id = 0
for x_train, y_train in train_data:
model_results = self._model.train_on_batch(
x_train, {softmax_output_layer_name: y_train}
)
model_results = self._model.train_on_batch(x_train, y_train)
sys.stdout.flush()
if verbose:
sys.stdout.write(
Expand All @@ -404,11 +400,11 @@ def fit(
f1, f1_report = self._validate_training(val_data) # type: ignore
history["f1_report"] = f1_report

val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.NAN
val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.nan
val_precision = (
f1_report["weighted avg"]["precision"] if f1_report else np.NAN
f1_report["weighted avg"]["precision"] if f1_report else np.nan
)
val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.NAN
val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.nan
epoch_time = time.time() - start_time
logger.info(
"\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- "
Expand Down
72 changes: 29 additions & 43 deletions dataprofiler/labelers/character_level_cnn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,7 @@ def load_from_disk(cls, dirpath: str) -> CharacterLevelCnnModel:
loaded_model._model_default_ind = loaded_model.label_mapping[
loaded_model._parameters["default_label"]
]
loaded_model._compile_loss(loaded_model._model, loaded_model.num_labels)
return loaded_model

@staticmethod
Expand All @@ -475,6 +476,28 @@ def _argmax_threshold_layer(
# matrix.
return ThreshArgMaxLayer(threshold, num_labels, default_ind)

@staticmethod
def _compile_loss(model: tf.keras.Model, num_labels: int) -> None:
"""Compiles the loss for the given model and number of labels."""
# Compile the model
softmax_output_layer_name = model.output_names[0]
# losses = {softmax_output_layer_name: "categorical_crossentropy"}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this commented-out `losses` line.

losses = ["categorical_crossentropy", None, None]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quick question — the loss assignment changed from dict-based (by output name) to list-based (by position):

CharacterLevelCnnModel (3 outputs)

losses = ["categorical_crossentropy", None, None]

CharLoadTFModel (2 outputs)

losses = ["categorical_crossentropy", None]

Can you confirm that the output ordering is stable and that these entries align correctly with the model outputs? I just want to make sure the positional assignment matches up, since the dict approach was order-independent.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good callout! I believe the previous layers were list-based, which is why Keras 3 required list-based losses. This code matched that; however, like you, I prefer the order-independent approach. I am looking into what it requires and into ensuring backward compatibility when loading a model that was originally list-based.


# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

model.compile(loss=losses, optimizer="adam", metrics=metrics)

def _construct_model(self) -> None:
"""
Construct model for the data labeler.
Expand Down Expand Up @@ -570,24 +593,7 @@ def _construct_model(self) -> None:
final_predicted_layer(argmax_layer, self._model.outputs[0]),
]
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
self._compile_loss(self._model, num_labels)

self._epoch_id = 0
self._model_num_labels = num_labels
Expand Down Expand Up @@ -632,24 +638,7 @@ def _reconstruct_model(self) -> None:
final_predicted_layer(argmax_layer, final_softmax_layer),
]
self._model = tf.keras.Model(self._model.inputs, argmax_outputs)

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
num_classes=num_labels, average="micro"
)
metrics = {
softmax_output_layer_name: [
"categorical_crossentropy",
"acc",
f1_score_training,
]
}

self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
self._compile_loss(self._model, num_labels)
self._epoch_id = 0
self._model_num_labels = num_labels
self._model_default_ind = default_ind
Expand Down Expand Up @@ -699,14 +688,11 @@ def fit(
f1_report: dict = {}

self._model.reset_metrics()
softmax_output_layer_name = self._model.output_names[0]

start_time = time.time()
batch_id = 0
for x_train, y_train in train_data:
model_results = self._model.train_on_batch(
x_train, {softmax_output_layer_name: y_train}
)
model_results = self._model.train_on_batch(x_train, y_train)
sys.stdout.flush()
if verbose:
sys.stdout.write(
Expand All @@ -722,11 +708,11 @@ def fit(
f1, f1_report = self._validate_training(val_data) # type: ignore
history["f1_report"] = f1_report

val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.NAN
val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.nan
val_precision = (
f1_report["weighted avg"]["precision"] if f1_report else np.NAN
f1_report["weighted avg"]["precision"] if f1_report else np.nan
)
val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.NAN
val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.nan
epoch_time = time.time() - start_time
logger.info(
"\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- "
Expand Down
12 changes: 7 additions & 5 deletions dataprofiler/labelers/classification_report_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ def convert_confusion_matrix_to_MCM(conf_matrix: list | np.ndarray) -> np.ndarra
"""
if not isinstance(conf_matrix, np.ndarray):
conf_matrix = np.array(conf_matrix)
num_labels = conf_matrix.shape[0]
num_samples = np.sum(conf_matrix)
num_labels = len(conf_matrix)
num_samples: int = int(np.sum(conf_matrix))
MCM = np.zeros((num_labels, 2, 2), dtype=np.int64)

# True Positives
Expand Down Expand Up @@ -205,6 +205,8 @@ def precision_recall_fscore_support(
f_score = (1 + beta2) * precision * recall / denom

# Average the results
weights: np.ndarray | None
support: np.ndarray | None = true_sum
if average == "weighted":
weights = true_sum
if weights.sum() == 0:
Expand All @@ -219,9 +221,9 @@ def precision_recall_fscore_support(
precision = np.average(precision, weights=weights)
recall = np.average(recall, weights=weights)
f_score = np.average(f_score, weights=weights)
true_sum = None # return no support
support = None # return no support

return precision, recall, f_score, true_sum
return precision, recall, f_score, support


def classification_report(
Expand Down Expand Up @@ -300,7 +302,7 @@ def classification_report(
"""
# ALTERATION: replaced the _check_targets with this if statement since
# no y_true, y_pred
y_type = "multiclass" if conf_matrix.shape[0] > 2 else "binary"
y_type = "multiclass" if len(conf_matrix) > 2 else "binary"

labels_given = True
if labels is None:
Expand Down
24 changes: 12 additions & 12 deletions dataprofiler/labelers/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseDataProcessor)
or self._parameters != other._parameters
):
Expand Down Expand Up @@ -692,7 +692,7 @@ def process(
:return batch_data: A dict containing samples of size batch_size
:rtype batch_data: dicts
"""
num_dim = sum([dim > 1 for dim in data.shape])
num_dim = sum(dim > 1 for dim in np.shape(data))
if num_dim > 1:
raise ValueError(
"Multidimensional data given to "
Expand Down Expand Up @@ -1213,10 +1213,10 @@ def match_sentence_lengths(
:type inplace: bool
:return: dict(pred=...) or dict(pred=..., conf=...)
"""
pred_buffer = []
conf_buffer = []
pred_buffer: np.ndarray = np.array([])
conf_buffer: np.ndarray = np.array([])
result_ind = 0
buffer_add_inds = np.cumsum(list(map(len, results["pred"]))).tolist()
buffer_add_inds: list[int] = np.cumsum(list(map(len, results["pred"]))).tolist()
separator_len = len(flatten_separator)

if not inplace:
Expand Down Expand Up @@ -1469,14 +1469,14 @@ def process(
"If `labels` are specified, `label_mapping` "
"must also be specified."
)
if data.shape != labels.shape:
if np.shape(data) != np.shape(labels):
raise ValueError(
f"Data and labels given to "
f"StructCharPreprocessor are of different "
f"shapes, {data.shape} != {labels.shape}"
f"shapes, {np.shape(data)} != {np.shape(labels)}"
)

num_dim = sum([dim > 1 for dim in data.shape])
num_dim = sum(dim > 1 for dim in np.shape(data))
if num_dim > 1:
warnings.warn(
"Data given to StructCharPreprocessor was "
Expand Down Expand Up @@ -1589,7 +1589,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, StructCharPostprocessor)
or self._parameters["default_label"] != other._parameters["default_label"]
or self._parameters["pad_label"] != other._parameters["pad_label"]
Expand Down Expand Up @@ -1681,10 +1681,10 @@ def match_sentence_lengths(
:type inplace: bool
:return: dict(pred=...) or dict(pred=..., conf=...)
"""
pred_buffer = []
conf_buffer = []
pred_buffer: np.ndarray = np.array([])
conf_buffer: np.ndarray = np.array([])
result_ind = 0
buffer_add_inds = np.cumsum(list(map(len, results["pred"]))).tolist()
buffer_add_inds: list[int] = np.cumsum(list(map(len, results["pred"]))).tolist()
separator_len = len(flatten_separator)

if not inplace:
Expand Down
15 changes: 9 additions & 6 deletions dataprofiler/labelers/labeler_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ class 1 1.00 0.67 0.80 3


def evaluate_accuracy(
predicted_entities_in_index: list[list[int]],
true_entities_in_index: list[list[int]],
predicted_entities_in_index: list[list[int]] | np.ndarray,
true_entities_in_index: list[list[int]] | np.ndarray,
num_labels: int,
entity_rev_dict: dict[int, str],
verbose: bool = True,
Expand Down Expand Up @@ -119,13 +119,16 @@ def evaluate_accuracy(
if x[1] not in omitted_labels
]

max_len = len(predicted_entities_in_index[0])
true_labels_padded = np.zeros((len(true_entities_in_index), max_len))
for i, true_labels_row in enumerate(true_entities_in_index):
predicted_entities = [np.asarray(row) for row in predicted_entities_in_index]
true_entities = [np.asarray(row) for row in true_entities_in_index]

max_len = len(predicted_entities[0])
true_labels_padded = np.zeros((len(true_entities), max_len))
for i, true_labels_row in enumerate(true_entities):
true_labels_padded[i][: len(true_labels_row)] = true_labels_row

true_labels_flatten = np.hstack(true_labels_padded) # type: ignore
predicted_labels_flatten = np.hstack(predicted_entities_in_index)
predicted_labels_flatten = np.hstack(predicted_entities)

all_labels: list[str] = []
if entity_rev_dict:
Expand Down
Loading
Loading