laminlabs · ishitajain9717 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 23, 2026
diff --git a/lamindb/models/_feature_manager.py b/lamindb/models/_feature_manager.py
@@ -2231,17 +2231,47 @@ def bulk_set_features_in_records(records: Iterable[Record]) -> None:
 
     assert batch_schema is not None  # noqa: S101
     schema_features = list(batch_schema.members.all())
-    dataframe = pd.DataFrame(prepared_rows)
+    from .feature import convert_to_pandas_dtype
     from .record import move_schema_index_column_to_dataframe_index
 
+    # Build the validation dataframe column-by-column with the dtype declared by
+    # each schema feature, instead of calling pd.DataFrame(prepared_rows) and
+    # letting pandas re-infer dtypes. The chopped per-row dicts drop null cells,
+    # so a naive pd.DataFrame would backfill NaN and widen partial-null columns
+    # (bool -> object, int -> float64), failing validation even for valid input.
+    # Supplying each column's declared pandas dtype up front propagates the type
+    # correctly through construction and removes the need for per-type casting.
+    feature_dtype_by_name = {
+        feature.name: convert_to_pandas_dtype(feature.dtype_as_str)
+        for feature in schema_features
+    }
+    # single pass to discover which columns actually appear across all rows
+    # (rows may differ once null cells are dropped)
+    present_columns: set[str] = set()
+    for row in prepared_rows:
+        present_columns.update(row)
+
+    # schema features first (stable, declared order), then any extra injected
+    # columns (e.g. the index/name), sorted for determinism
+    ordered_columns = [f.name for f in schema_features if f.name in present_columns]
+    ordered_columns += sorted(
+        column for column in present_columns if column not in feature_dtype_by_name
+    )
+
+    data: dict[str, pd.Series] = {}
+    for column in ordered_columns:
+        values = [row.get(column, pd.NA) for row in prepared_rows]
+        target_dtype = feature_dtype_by_name.get(column)
+        if target_dtype is not None:
+            data[column] = pd.Series(values, dtype=target_dtype)
+        else:
+            data[column] = pd.Series(values)
+    if data:
+        dataframe = pd.DataFrame(data)
+    else:
+        dataframe = pd.DataFrame(index=range(len(prepared_rows)))
+
     dataframe = move_schema_index_column_to_dataframe_index(dataframe, batch_schema)
-    for feature in schema_features:
-        if (
-            feature.name in dataframe
-            and feature.dtype_as_str.startswith("cat")
-            and not feature.dtype_as_str.startswith("list[cat")
-        ):
-            dataframe[feature.name] = dataframe[feature.name].astype("category")
     # Single-pass dataframe curation:
     # validate schema and resolve categoricals once for the entire batch.
     #

diff --git a/tests/core/test_curator_basics.py b/tests/core/test_curator_basics.py
@@ -480,6 +480,45 @@ def test_nullable():
     disease.delete(permanent=True)
 
 
+def test_curator_partial_null_bool_int():
+    """DataFrameCurator accepts nullable bool/int dtypes and rejects inferred ones.
+
+    Both frames hold the same values; only the column dtypes differ.
+    """
+    flag = ln.Feature(name="cur-flag", dtype=bool, nullable=True).save()
+    count = ln.Feature(name="cur-count", dtype=int, nullable=True).save()
+    schema = ln.Schema(features=[flag, count]).save()
+
+    # Case 1: explicitly typed nullable columns (boolean / Int64) must validate.
+    # This is what a well-behaved user supplies, and what the _feature_manager
+    # change is meant to produce by building each column with its declared dtype.
+    typed_frame = pd.DataFrame(
+        {
+            "cur-flag": pd.array([True, None, False], dtype="boolean"),
+            "cur-count": pd.array([1, None, 3], dtype="Int64"),
+        }
+    )
+    assert ln.curators.DataFrameCurator(typed_frame, schema).validate() is None
+
+    # Case 2: dtypes as pandas infers them from plain lists containing None
+    # (bool + null -> object, int + null -> float64), i.e. what
+    # pd.DataFrame(prepared_rows) used to produce. Rejecting them is correct.
+    inferred_frame = pd.DataFrame(
+        {
+            "cur-flag": [True, None, False],  # object
+            "cur-count": [1, None, 3],  # float64
+        }
+    )
+    with pytest.raises(ln.errors.ValidationError) as caught:
+        ln.curators.DataFrameCurator(inferred_frame, schema).validate()
+    message = str(caught.value)
+    assert "Column 'cur-flag' failed dtype check for 'bool': got object" in message
+    assert "Column 'cur-count' failed dtype check for 'int': got float64" in message
+
+    for entity in (schema, flag, count):
+        entity.delete(permanent=True)
+
+
 def test_pandera_dataframe_schema(
     df,
     df_missing_sample_type_column,

diff --git a/tests/core/test_record_basics.py b/tests/core/test_record_basics.py
@@ -129,6 +129,66 @@ def test_record_lazy_features_on_save():
     score_feature.delete(permanent=True)
 
 
+def test_record_from_dataframe_partial_null_bool_int():
+    """Partial-null bool/int features survive the from_dataframe save round-trip.
+
+    The input frame supplies nullable extension columns (boolean / Int64) with
+    one null cell each. Saving must validate them against the schema, non-null
+    values must read back unchanged, null cells must yield no feature value,
+    and the export must keep the nullable dtypes rather than object / float64.
+    """
+    flag = ln.Feature(name="from-df-flag", dtype=bool).save()
+    count = ln.Feature(name="from-df-count", dtype=int).save()
+    label = ln.Feature(name="from-df-label", dtype=str).save()
+    schema = ln.Schema([flag, count, label], name="from-df-bool-int-schema").save()
+    sheet = ln.Record(name="from-df-bool-int-sheet", is_type=True, schema=schema).save()
+
+    record_names = ["from-df-a", "from-df-b", "from-df-c"]
+    df = pd.DataFrame(
+        {
+            "__lamindb_record_name__": record_names,
+            "from-df-flag": pd.array([True, None, False], dtype="boolean"),
+            "from-df-count": pd.array([1, None, 3], dtype="Int64"),
+            "from-df-label": ["a", "b", "c"],
+        }
+    )
+
+    # guard the premise: the input really uses the nullable extension dtypes
+    assert df["from-df-flag"].dtype.name == "boolean"
+    assert df["from-df-count"].dtype.name == "Int64"
+
+    records = ln.Record.from_dataframe(df, type=sheet)
+    assert len(records) == 3
+    records.save()
+
+    # read each record's feature values back once instead of once per assertion
+    values = {
+        name: ln.Record.get(name=name).features.get_values() for name in record_names
+    }
+
+    # non-null values round-trip
+    assert values["from-df-a"]["from-df-flag"] is True
+    assert values["from-df-a"]["from-df-count"] == 1
+    assert values["from-df-c"]["from-df-flag"] is False
+    assert values["from-df-c"]["from-df-count"] == 3
+    # the row with null cells has no value for either nullable feature
+    assert "from-df-flag" not in values["from-df-b"]
+    assert "from-df-count" not in values["from-df-b"]
+
+    # dtypes survive the export round-trip: the columns come back as the nullable
+    # extension dtypes, not degraded object / float64
+    exported = sheet.to_dataframe()
+    assert exported["from-df-flag"].dtype.name == "boolean"
+    assert exported["from-df-count"].dtype.name == "Int64"
+
+    ln.Record.filter(name__in=record_names).delete(permanent=True)
+    ln.Record.filter(name="from-df-bool-int-sheet").delete(permanent=True)
+    schema.delete(permanent=True)
+    flag.delete(permanent=True)
+    count.delete(permanent=True)
+    label.delete(permanent=True)
+
+
 def test_record_from_dataframe_bulk_save_paths():
     score = ln.Feature(name="from-df-score", dtype=float).save()
     schema = ln.Schema([score], name="from-df-schema").save()