Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 38 additions & 8 deletions lamindb/models/_feature_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2231,17 +2231,47 @@ def bulk_set_features_in_records(records: Iterable[Record]) -> None:

assert batch_schema is not None # noqa: S101
schema_features = list(batch_schema.members.all())
dataframe = pd.DataFrame(prepared_rows)
from .feature import convert_to_pandas_dtype
from .record import move_schema_index_column_to_dataframe_index

# Build the validation dataframe column-by-column with the dtype declared by
# each schema feature, instead of calling pd.DataFrame(prepared_rows) and
# letting pandas re-infer dtypes. The chopped per-row dicts drop null cells,
# so a naive pd.DataFrame would backfill NaN and widen partial-null columns
# (bool -> object, int -> float64), failing validation even for valid input.
# Supplying each column's declared pandas dtype up front propagates the type
# correctly through construction and removes the need for per-type casting.
feature_dtype_by_name = {
feature.name: convert_to_pandas_dtype(feature.dtype_as_str)
for feature in schema_features
}
# single pass to discover which columns actually appear across all rows
# (rows may differ once null cells are dropped)
present_columns: set[str] = set()
for row in prepared_rows:
present_columns.update(row)

# schema features first (stable, declared order), then any extra injected
# columns (e.g. the index/name), sorted for determinism
ordered_columns = [f.name for f in schema_features if f.name in present_columns]
ordered_columns += sorted(
column for column in present_columns if column not in feature_dtype_by_name
)

data: dict[str, pd.Series] = {}
for column in ordered_columns:
values = [row.get(column, pd.NA) for row in prepared_rows]
target_dtype = feature_dtype_by_name.get(column)
if target_dtype is not None:
data[column] = pd.Series(values, dtype=target_dtype)
else:
data[column] = pd.Series(values)
if data:
dataframe = pd.DataFrame(data)
else:
dataframe = pd.DataFrame(index=range(len(prepared_rows)))

dataframe = move_schema_index_column_to_dataframe_index(dataframe, batch_schema)
for feature in schema_features:
if (
feature.name in dataframe
and feature.dtype_as_str.startswith("cat")
and not feature.dtype_as_str.startswith("list[cat")
):
dataframe[feature.name] = dataframe[feature.name].astype("category")
# Single-pass dataframe curation:
# validate schema and resolve categoricals once for the entire batch.
#
Expand Down
39 changes: 39 additions & 0 deletions tests/core/test_curator_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,45 @@ def test_nullable():
disease.delete(permanent=True)


def test_curator_partial_null_bool_int():
# Checks that DataFrameCurator.validate() accepts nullable extension dtypes
# (boolean / Int64) for nullable bool/int features, and rejects the
# object / float64 columns that plain Python lists containing None produce.
flag = ln.Feature(name="cur-flag", dtype=bool, nullable=True).save()
count = ln.Feature(name="cur-count", dtype=int, nullable=True).save()
schema = ln.Schema(features=[flag, count]).save()

# Case 1: properly-typed nullable columns -> should PASS
# This is what a well-behaved user supplies, and what bulk_set_features_in_records
# now builds by constructing each column with its feature's declared pandas dtype
# (no per-type cast involved). We are asserting the curator accepts these dtypes.
df_good = pd.DataFrame(
{
"cur-flag": pd.array([True, None, False], dtype="boolean"),

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perfect!

"cur-count": pd.array([1, None, 3], dtype="Int64"),
}
)
assert ln.curators.DataFrameCurator(df_good, schema).validate() is None

# Case 2: degraded dtypes (what a naive pd.DataFrame(prepared_rows) would infer
# once null cells are dropped): bool + null -> object, int + null -> float64.
# Should FAIL — this is correct curator behaviour, not a bug.
df_degraded = pd.DataFrame(
{
"cur-flag": [True, None, False], # object
"cur-count": [1, None, 3], # float64
}
)
with pytest.raises(ln.errors.ValidationError) as excinfo:
ln.curators.DataFrameCurator(df_degraded, schema).validate()

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please add a test that makes evident which error message is raised here?

assert "Column 'cur-flag' failed dtype check for 'bool': got object" in str(
excinfo.value
)
assert "Column 'cur-count' failed dtype check for 'int': got float64" in str(
excinfo.value
)

schema.delete(permanent=True)
flag.delete(permanent=True)
count.delete(permanent=True)


def test_pandera_dataframe_schema(
df,
df_missing_sample_type_column,
Expand Down
60 changes: 60 additions & 0 deletions tests/core/test_record_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,66 @@ def test_record_lazy_features_on_save():
score_feature.delete(permanent=True)


def test_record_from_dataframe_partial_null_bool_int():
    """Round-trip partial-null bool/int columns through ``Record.from_dataframe``.

    The input frame uses the nullable extension dtypes (``boolean`` / ``Int64``)
    with one null cell in each of the two feature columns. After the records
    are saved, the test expects that:

    * non-null cells read back with their original values,
    * the record built from the null row exposes neither feature key, and
    * the sheet exported via ``to_dataframe()`` still reports ``boolean`` /
      ``Int64`` instead of degraded ``object`` / ``float64`` columns.
    """
    feature_flag = ln.Feature(name="from-df-flag", dtype=bool).save()
    feature_count = ln.Feature(name="from-df-count", dtype=int).save()
    feature_label = ln.Feature(name="from-df-label", dtype=str).save()
    sheet_schema = ln.Schema(
        [feature_flag, feature_count, feature_label],
        name="from-df-bool-int-schema",
    ).save()
    sheet = ln.Record(
        name="from-df-bool-int-sheet", is_type=True, schema=sheet_schema
    ).save()

    record_names = ["from-df-a", "from-df-b", "from-df-c"]
    # nullable dtype name expected for each partially-null feature column
    expected_dtypes = {"from-df-flag": "boolean", "from-df-count": "Int64"}

    input_frame = pd.DataFrame(
        {
            "__lamindb_record_name__": record_names,
            "from-df-flag": pd.array([True, None, False], dtype="boolean"),
            "from-df-count": pd.array([1, None, 3], dtype="Int64"),
            "from-df-label": ["a", "b", "c"],
        }
    )

    # sanity check: the frame handed to lamindb really is nullable-typed
    for column, dtype_name in expected_dtypes.items():
        assert input_frame[column].dtype.name == dtype_name

    records = ln.Record.from_dataframe(input_frame, type=sheet)
    assert len(records) == 3
    records.save()

    def stored_values(record_name):
        # re-query on every call so each assertion reads the persisted state
        return ln.Record.get(name=record_name).features.get_values()

    # rows with values keep them
    assert stored_values("from-df-a")["from-df-flag"] is True
    assert stored_values("from-df-a")["from-df-count"] == 1
    # the row whose bool/int cells were null has no entry for either feature
    assert "from-df-flag" not in stored_values("from-df-b")
    assert "from-df-count" not in stored_values("from-df-b")
    assert stored_values("from-df-c")["from-df-flag"] is False
    assert stored_values("from-df-c")["from-df-count"] == 3

    # exporting the sheet must hand back the nullable extension dtypes
    exported = sheet.to_dataframe()
    for column, dtype_name in expected_dtypes.items():
        assert exported[column].dtype.name == dtype_name

    # clean up: records first, then the sheet, then schema and features
    ln.Record.filter(name__in=record_names).delete(permanent=True)
    ln.Record.filter(name="from-df-bool-int-sheet").delete(permanent=True)
    for registry_object in (sheet_schema, feature_flag, feature_count, feature_label):
        registry_object.delete(permanent=True)


def test_record_from_dataframe_bulk_save_paths():
score = ln.Feature(name="from-df-score", dtype=float).save()
schema = ln.Schema([score], name="from-df-schema").save()
Expand Down
Loading