Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d6b1ec1
refactor: move from deprecated pkg_resources
JGSweets May 6, 2026
9920fab
fix: to use func
JGSweets May 6, 2026
7592895
fix: add missing change
JGSweets May 6, 2026
a3592fb
refactor: resources to be in package
JGSweets May 6, 2026
3ecaf6b
fix: tests bc of almost
JGSweets May 6, 2026
0a2efd3
feat: refactor to pass in a path or string or None
JGSweets May 6, 2026
fb321a2
fix: import for older versions
JGSweets May 6, 2026
2207920
fix: Tranversable must be done at runtime
JGSweets May 6, 2026
96344db
refactor: keras reqs and others
JGSweets May 6, 2026
0458e73
refactor: losses for keras and tests
JGSweets May 6, 2026
2fe4ddd
fix: remove unneeded global
JGSweets May 6, 2026
c41303e
fix: accidentally duplicated test on rebase
JGSweets May 6, 2026
0615268
fix: rebase duplicates
JGSweets May 6, 2026
f08af16
fix: keras reqs
JGSweets May 6, 2026
e5f4041
refactor: update to be more than 3.4.0 for keras
JGSweets May 6, 2026
052d058
refactor: numpy2 and mypy
JGSweets May 11, 2026
3965667
fix: mypy 3.10
JGSweets May 11, 2026
f1046a9
fix: bugs
JGSweets May 11, 2026
8f1b4e0
fix: float
JGSweets May 11, 2026
fdc671e
refactor: for hist fix too
JGSweets May 11, 2026
34c47fe
fix: issue with none in hist
JGSweets May 11, 2026
57066fb
fix: remove comment
JGSweets May 22, 2026
5de7abe
refactor: to still utilize dict mapping for losses
JGSweets May 22, 2026
e1afcf7
fix: int pre-commit
JGSweets May 22, 2026
0b00aed
fix: train labeling
JGSweets May 22, 2026
8edd1dc
refactor notes, reqs, and change log
JGSweets May 22, 2026
03b4fa1
fix: pre-commit
JGSweets May 22, 2026
ffbac1a
refactor: add unit tests validating usage of the old load format
JGSweets May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 22 additions & 20 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ repos:
# Black: format Python code
# https://github.com/psf/black/blob/master/.pre-commit-hooks.yaml
- repo: https://github.com/psf/black
rev: 22.3.0
rev: 24.3.0
hooks:
- id: black
exclude: (versioneer.py|dataprofiler/_version.py|_docs/)
Expand All @@ -21,7 +21,7 @@ repos:
# Flake8: complexity and style checking
# https://flake8.pycqa.org/en/latest/user/using-hooks.html
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 7.3.0
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
Expand Down Expand Up @@ -50,29 +50,31 @@ repos:
# requirements.txt
h5py>=2.10.0,
wheel>=0.33.1,
numpy<2.0.0,
'numpy>=1.22.0,<3.0.0',
'pandas>=1.1.2,<3.0.0',
python-dateutil>=2.7.5,
pytz>=2020.1,
pyarrow>=1.0.1,
'chardet>=3.0.4,<7.0.0',
fastavro>=1.0.0.post1,
fastavro>=1.1.0,
python-snappy>=0.7.1,
charset-normalizer>=1.3.6,
psutil>=4.0.0,
scipy>=1.4.1,
requests>=2.28.1,
scipy>=1.10.0,
requests>=2.32.4,
networkx>=2.5.1,
typing-extensions>=3.10.0.2,
HLL>=2.0.3,
datasketches>=4.1.0,
boto3>=1.28.61,
packaging>=23.0,
boto3>=1.37.15,
urllib3>=2.5.0,

# requirements-dev.txt
check-manifest>=0.48,
black==22.3.0,
check-manifest>=0.50,
black>=24.3.0,
isort==5.12.0,
pre-commit==2.19.0,
pre-commit==4.3.0,
tox==3.25.1,
types-setuptools==67.7.0.1,
types-python-dateutil==2.8.19.12,
Expand All @@ -82,11 +84,9 @@ repos:

# requirements-ml.txt
scikit-learn>=0.23.2,
'keras>=2.4.3,<=3.4.0',
'keras>3.4.0,<4.0.0',
rapidfuzz>=2.6.1,
"tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
"tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
"tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
"tensorflow>=2.16.0",
tqdm>=4.0.0,

# requirements-reports.txt
Expand All @@ -101,18 +101,20 @@ repos:
pytest-xdist>=2.1.0,
pytest-forked>=1.3.0,
toolz>=0.10.0,
'memray>=1.7.0,<1.12.0',
'memray>=1.18.0',
]
# Check-manifest: ensures required non-Python files are included in MANIFEST.in
# https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
- repo: https://github.com/mgedmin/check-manifest
rev: "0.48"
rev: "0.50"
hooks:
- id: check-manifest
additional_dependencies: ['h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas',
'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3']
additional_dependencies: ['h5py', 'wheel', 'future', 'numpy>=1.22.0,<3.0.0',
'pandas', 'python-dateutil', 'pytz', 'pyarrow', 'chardet',
'fastavro>=1.1.0', 'python-snappy', 'charset-normalizer', 'psutil',
'scipy>=1.10.0', 'requests>=2.32.4', 'networkx', 'typing-extensions',
'HLL', 'datasketches', 'packaging>=23.0', 'boto3>=1.37.15',
'urllib3>=2.5.0']
# Pyupgrade - standardize and modernize Python syntax for newer versions of the language
- repo: https://github.com/asottile/pyupgrade
rev: v3.3.0
Expand Down
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Changelog

## Unreleased

- Added compatibility support for NumPy 2.0 while constraining `numpy` to
`>=1.22.0,<3.0.0` to avoid future breakage from NumPy 3.
- Added compatibility support for Keras versions newer than 3.4.0 while
constraining `keras` to `>3.4.0,<4.0.0` to avoid future breakage from Keras 4.
- Updated the pre-commit configuration to align hook versions and hook
dependencies with the current project requirements.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ global-exclude .DS_Store
global-exclude */__pycache__/*

include *.txt
include CHANGELOG.md
include CODEOWNERS
recursive-include dataprofiler *.avro
recursive-include dataprofiler *.csv
Expand Down
1 change: 1 addition & 0 deletions dataprofiler/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Package for dataprofiler."""

from . import settings
from ._version import get_versions
from .data_readers.data import Data
Expand Down
1 change: 1 addition & 0 deletions dataprofiler/_typing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains typing aliases."""

from typing import Dict, List, NewType, Union

import numpy as np
Expand Down
1 change: 1 addition & 0 deletions dataprofiler/data_readers/avro_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains class for saving and loading spreadsheet data."""

from io import BytesIO, StringIO
from typing import Any, Dict, List, Optional, Union

Expand Down
1 change: 1 addition & 0 deletions dataprofiler/data_readers/base_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains abstract class for data loading and saving."""

import locale
import sys
from collections import OrderedDict
Expand Down
1 change: 1 addition & 0 deletions dataprofiler/data_readers/csv_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains class that saves and loads spreadsheet data."""

import csv
import random
import re
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/data_readers/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains functions for data readers."""

import json
import logging
import os
Expand Down Expand Up @@ -334,7 +335,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list:
except StopIteration:
break
# Append new, replace old with dummy, and keep track of order
remove_index = rng.integers(0, sample_nrows)
remove_index = int(rng.integers(0, sample_nrows))
values[indices[remove_index]] = str(None)
indices[remove_index] = len(values)
values.append(newval)
Expand Down
1 change: 1 addition & 0 deletions dataprofiler/data_readers/filepath_or_buffer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains functions and classes for handling filepaths and buffers."""

from io import BytesIO, StringIO, TextIOWrapper
from typing import IO, Any, Optional, Type, Union, cast

Expand Down
1 change: 1 addition & 0 deletions dataprofiler/data_readers/graph_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains class for identifying, reading, and loading graph data."""

import csv
from typing import Dict, List, Optional, Union, cast

Expand Down
7 changes: 4 additions & 3 deletions dataprofiler/data_readers/json_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains class to save and load json data."""

import json
import re
import warnings
Expand Down Expand Up @@ -71,9 +72,9 @@ def __init__(

self._data_formats["records"] = self._get_data_as_records
self._data_formats["json"] = self._get_data_as_json
self._data_formats[
"flattened_dataframe"
] = self._get_data_as_flattened_dataframe
self._data_formats["flattened_dataframe"] = (
self._get_data_as_flattened_dataframe
)
self._selected_data_format: str = options.get(
"data_format", "flattened_dataframe"
)
Expand Down
1 change: 1 addition & 0 deletions dataprofiler/data_readers/parquet_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains class to save and load parquet data."""

from io import BytesIO, StringIO
from typing import Any, Dict, List, Optional, Union

Expand Down
1 change: 1 addition & 0 deletions dataprofiler/data_readers/structured_mixins.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains mixin data class for loading datasets of tye SpreadSheet."""

from logging import Logger
from typing import Any, Dict, List, Optional, Union, cast

Expand Down
1 change: 1 addition & 0 deletions dataprofiler/labelers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
2. structured_model
3. regex_model
"""

# import data labelers
# import models
from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/labelers/base_model.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains abstract classes for labeling data."""

from __future__ import annotations

import abc
Expand Down Expand Up @@ -78,7 +79,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseModel)
or self._parameters != other._parameters
or self._label_mapping != other._label_mapping
Expand Down
Loading
Loading