From 35e55ddc8a862b676009359edaa80be6497ce506 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Fri, 20 Feb 2026 11:55:48 +0100
Subject: [PATCH 01/11] =?UTF-8?q?add=20Diagonal=20overload=20f=C3=BCr=20ma?=
 =?UTF-8?q?ss=20matrix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl |  5 +++++
 .../test/sparse_algebraic_detection_tests.jl           | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl b/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl
index bee55d5f6b6..668e9745799 100644
--- a/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl
+++ b/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl
@@ -29,6 +29,11 @@ function find_algebraic_vars_eqs(M::SparseMatrixCSC)
     return algebraic_vars, algebraic_eqs
 end
 
+function find_algebraic_vars_eqs(M::LinearAlgebra.Diagonal)
+    _idxs = map(iszero, LinearAlgebra.diag(M))
+    return _idxs, _idxs
+end
+
 # Fallback for non-sparse matrices (original behavior)
 function find_algebraic_vars_eqs(M::AbstractMatrix)
     algebraic_vars = vec(all(iszero, M, dims = 1))
diff --git a/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl b/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl
index 894b87af451..1f5cb9eca01 100644
--- a/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl
+++ b/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl
@@ -79,4 +79,14 @@ using OrdinaryDiffEqNonlinearSolve: find_algebraic_vars_eqs
         @test vars == [false, true]
         @test eqs == [false, true]
     end
+
+    # Test 4: Test Diagonal case
+    @testset "Test Diagonal cast" begin
+        M_diag = Diagonal([1.0, 0.0, 1.0, 1.0, 0.0])
+        vars, eqs = find_algebraic_vars_eqs(M_diag)
+        @test vars == [false, true, false, false, true]
+        @test eqs == [false, true, false, false, true]
+        # compare to dense
+        @test find_algebraic_vars_eqs(M_diag) == find_algebraic_vars_eqs(collect(M_diag))
+    end
 end

From 1ae3b732448b4ac01dd2f20fa32170e7fe0e14e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Fri, 20 Feb 2026 19:07:57 +0100
Subject: [PATCH 02/11] fix dae gpu compat

---
 .../src/derivative_utils.jl                   | 10 ++++-
 test/gpu/simple_dae.jl                        | 40 ++++++++++++-------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/lib/OrdinaryDiffEqDifferentiation/src/derivative_utils.jl b/lib/OrdinaryDiffEqDifferentiation/src/derivative_utils.jl
index 51608029b91..a976e2e28bf 100644
--- a/lib/OrdinaryDiffEqDifferentiation/src/derivative_utils.jl
+++ b/lib/OrdinaryDiffEqDifferentiation/src/derivative_utils.jl
@@ -14,7 +14,7 @@ function calc_tderivative!(integrator, cache, dtd1, repeat_step)
                 tf.p = p
                 alg = unwrap_alg(integrator, true)
 
-                autodiff_alg = ADTypes.dense_ad(gpu_safe_autodiff(alg_autodiff(alg), u))
+                autodiff_alg = gpu_safe_autodiff(ADTypes.dense_ad(alg_autodiff(alg)), u)
 
                 # Convert t to eltype(dT) if using ForwardDiff, to make FunctionWrappers work
                 t = autodiff_alg isa AutoForwardDiff ? convert(eltype(dT), t) : t
@@ -59,7 +59,7 @@ function calc_tderivative(integrator, cache)
         tf.u = uprev
         tf.p = p
 
-        autodiff_alg = ADTypes.dense_ad(gpu_safe_autodiff(alg_autodiff(alg), u))
+        autodiff_alg = gpu_safe_autodiff(ADTypes.dense_ad(alg_autodiff(alg)), u)
 
         if alg_autodiff isa AutoFiniteDiff
             autodiff_alg = SciMLBase.@set autodiff_alg.dir = diffdir(integrator)
@@ -403,6 +403,12 @@ function jacobian2W!(
             else
                 @.. broadcast = false @view(W[idxs]) = muladd(λ, invdtgamma, @view(J[idxs]))
             end
+        elseif is_sparse(W) && !ArrayInterface.fast_scalar_indexing(nonzeros(W))
+            # Sparse GPU arrays (e.g. CuSparseMatrixCSC/CSR) don't support broadcasting.
+            # ArrayInterface.fast_scalar_indexing is not specialized for AbstractGPUSparseArray,
+            # so we detect them by checking if the underlying nonzeros storage is a GPU array.
+            # we then fall back to allocating matrix arithmetic
+            copyto!(W, J - invdtgamma * mass_matrix)
         else
             @.. broadcast = false W = muladd(-mass_matrix, invdtgamma, J)
         end
diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl
index 1b39a42cd1a..c1a346f0108 100644
--- a/test/gpu/simple_dae.jl
+++ b/test/gpu/simple_dae.jl
@@ -5,6 +5,7 @@ using LinearAlgebra
 using Adapt
 using SparseArrays
 using Test
+using CUDSS
 
 #=
 du[1] = -u[1]
@@ -38,30 +39,39 @@ tspan = (0.0, 5.0)
 prob = ODEProblem(odef, u0, tspan, p)
 sol = solve(prob, Rodas5P())
 
-# gpu version
 mass_matrix_d = adapt(CuArray, mass_matrix)
-
-# TODO: jac_prototype fails
-# jac_prototype_d = adapt(CuArray, jac_prototype)
-# jac_prototype_d = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype)
-jac_prototype_d = nothing
+jac_prototype_d_csc = CUDA.CUSPARSE.CuSparseMatrixCSC(jac_prototype)
+jac_prototype_d_csr = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype)
 
 u0_d = adapt(CuArray, u0)
 p_d = adapt(CuArray, p)
-odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d)
-prob_d = ODEProblem(odef_d, u0_d, tspan, p_d)
-sol_d = solve(prob_d, Rodas5P())
+
+odef_d_dns = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = nothing)
+odef_d_csc = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d_csc)
+odef_d_csr = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d_csr)
+
+prob_d_dns = ODEProblem(odef_d_dns, u0_d, tspan, p_d)
+prob_d_csc = ODEProblem(odef_d_csc, u0_d, tspan, p_d)
+prob_d_csr = ODEProblem(odef_d_csr, u0_d, tspan, p_d)
+
+sol_d_dns = solve(prob_d_dns, Rodas5P()) # works
+sol_d_csc = solve(prob_d_csc, Rodas5P()) # uses Krylov
+sol_d_csr = solve(prob_d_csr, Rodas5P()) # uses CUDSS
 
 @testset "Test constraints in GPU sol" begin
-    for t in sol_d.t
-        u = Vector(sol_d(t))
-        @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6)
-        @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6)
+    for sol_d in [sol_d_dns, sol_d_csc, sol_d_csr]
+        for t in sol_d.t
+            u = Vector(sol_d(t))
+            @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6)
+            @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6)
+        end
     end
 end
 
 @testset "Compare GPU to CPU solution" begin
-    for t in tspan[begin]:0.1:tspan[end]
-        @test isapprox(Vector(sol_d(t)), sol(t); rtol = 1.0e-4)
+    for sol_d in [sol_d_dns, sol_d_csc, sol_d_csr]
+        for t in tspan[begin]:0.1:tspan[end]
+            @test Vector(sol_d(t)) ≈ sol(t) atol=1e-5
+        end
     end
 end

From f83c90dc7dfee5d1b354e8cc797c598e3960d415 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Mon, 23 Feb 2026 08:34:34 +0100
Subject: [PATCH 03/11] move find_algebraic_vars_eqs to OrdinayDiffEqCore

---
 .../ext/OrdinaryDiffEqCoreSparseArraysExt.jl  | 26 +++++++++-
 .../src/OrdinaryDiffEqCore.jl                 |  2 +-
 lib/OrdinaryDiffEqCore/src/misc_utils.jl      | 24 ++++++++++
 .../test/algebraic_vars_detection_tests.jl}   |  2 +-
 lib/OrdinaryDiffEqCore/test/runtests.jl       |  1 +
 .../src/OrdinaryDiffEqNonlinearSolve.jl       |  2 +-
 .../src/initialize_dae.jl                     | 48 -------------------
 .../test/runtests.jl                          |  1 -
 8 files changed, 53 insertions(+), 53 deletions(-)
 rename lib/{OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl => OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl} (98%)

diff --git a/lib/OrdinaryDiffEqCore/ext/OrdinaryDiffEqCoreSparseArraysExt.jl b/lib/OrdinaryDiffEqCore/ext/OrdinaryDiffEqCoreSparseArraysExt.jl
index 6b57562971f..6a311b0792b 100644
--- a/lib/OrdinaryDiffEqCore/ext/OrdinaryDiffEqCoreSparseArraysExt.jl
+++ b/lib/OrdinaryDiffEqCore/ext/OrdinaryDiffEqCoreSparseArraysExt.jl
@@ -1,7 +1,7 @@
 module OrdinaryDiffEqCoreSparseArraysExt
 
 using SparseArrays: SparseMatrixCSC
-import OrdinaryDiffEqCore: _isdiag
+import OrdinaryDiffEqCore: _isdiag, find_algebraic_vars_eqs
 
 # Efficient O(nnz) isdiag check for sparse matrices.
 # Standard isdiag is O(n²) which is prohibitively slow for large sparse matrices.
@@ -22,4 +22,28 @@ function _isdiag(A::SparseMatrixCSC)
     return true
 end
 
+"""
+    find_algebraic_vars_eqs(M::SparseMatrixCSC)
+
+O(nnz) detection of algebraic variables (zero columns) and equations (zero rows).
+"""
+function find_algebraic_vars_eqs(M::SparseMatrixCSC)
+    n_cols = size(M, 2)
+    n_rows = size(M, 1)
+
+    algebraic_vars = fill(true, n_cols)
+    algebraic_eqs = fill(true, n_rows)
+
+    @inbounds for j in 1:n_cols
+        for idx in M.colptr[j]:(M.colptr[j + 1] - 1)
+            if !iszero(M.nzval[idx])
+                algebraic_vars[j] = false
+                algebraic_eqs[M.rowval[idx]] = false
+            end
+        end
+    end
+
+    return algebraic_vars, algebraic_eqs
+end
+
 end
diff --git a/lib/OrdinaryDiffEqCore/src/OrdinaryDiffEqCore.jl b/lib/OrdinaryDiffEqCore/src/OrdinaryDiffEqCore.jl
index fbfd8a743b7..d774a540d2d 100644
--- a/lib/OrdinaryDiffEqCore/src/OrdinaryDiffEqCore.jl
+++ b/lib/OrdinaryDiffEqCore/src/OrdinaryDiffEqCore.jl
@@ -15,7 +15,7 @@ import Logging: @logmsg, LogLevel
 
 using MuladdMacro: @muladd
 
-using LinearAlgebra: opnorm, I, UniformScaling, diag, rank, isdiag
+using LinearAlgebra: opnorm, I, UniformScaling, diag, rank, isdiag, Diagonal
 
 import PrecompileTools
 
diff --git a/lib/OrdinaryDiffEqCore/src/misc_utils.jl b/lib/OrdinaryDiffEqCore/src/misc_utils.jl
index a5c672cba95..65a6e56c21b 100644
--- a/lib/OrdinaryDiffEqCore/src/misc_utils.jl
+++ b/lib/OrdinaryDiffEqCore/src/misc_utils.jl
@@ -131,6 +131,30 @@ end
 # Sparse specialization is provided in OrdinaryDiffEqCoreSparseArraysExt
 _isdiag(A::AbstractMatrix) = isdiag(A)
 
+"""
+    find_algebraic_vars_eqs(M)
+
+Find algebraic variables (zero columns) and algebraic equations (zero rows) from mass matrix.
+Returns `(algebraic_vars, algebraic_eqs)` as boolean arrays (true = algebraic).
+
+Works on CPU and GPU arrays. Sparse specialization (O(nnz)) is provided in
+OrdinaryDiffEqCoreSparseArraysExt.
+"""
+function find_algebraic_vars_eqs(M::Diagonal)
+    _idxs = map(iszero, diag(M))
+    return _idxs, _idxs
+end
+
+function find_algebraic_vars_eqs(M::AbstractMatrix)
+    algebraic_vars = vec(all(iszero, M, dims=1))
+    algebraic_eqs = vec(all(iszero, M, dims=2))
+    return algebraic_vars, algebraic_eqs
+end
+
+function find_algebraic_vars_eqs(M::AbstractSciMLOperator)
+    return find_algebraic_vars_eqs(convert(AbstractMatrix, M))
+end
+
 isnewton(::Any) = false
 
 function _bool_to_ADType(::Val{true}, ::Val{CS}, _) where {CS}
diff --git a/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl b/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl
similarity index 98%
rename from lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl
rename to lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl
index 1f5cb9eca01..3b3b3caa1af 100644
--- a/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl
+++ b/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl
@@ -1,6 +1,6 @@
 using Test
 using SparseArrays
-using OrdinaryDiffEqNonlinearSolve: find_algebraic_vars_eqs
+using OrdinaryDiffEqCore: find_algebraic_vars_eqs
 
 @testset "Sparse Algebraic Detection Performance" begin
     # Test 1: Correctness - results should match between sparse and dense methods
diff --git a/lib/OrdinaryDiffEqCore/test/runtests.jl b/lib/OrdinaryDiffEqCore/test/runtests.jl
index 2a9ad4e8615..f5a294c3d20 100644
--- a/lib/OrdinaryDiffEqCore/test/runtests.jl
+++ b/lib/OrdinaryDiffEqCore/test/runtests.jl
@@ -24,4 +24,5 @@ end
 # Functional tests
 if TEST_GROUP == "Core" || TEST_GROUP == "ALL"
     @time @safetestset "Sparse isdiag Performance" include("sparse_isdiag_tests.jl")
+    @time @safetestset "Algebraic Vars Detection" include("algebraic_vars_detection_tests.jl")
 end
diff --git a/lib/OrdinaryDiffEqNonlinearSolve/src/OrdinaryDiffEqNonlinearSolve.jl b/lib/OrdinaryDiffEqNonlinearSolve/src/OrdinaryDiffEqNonlinearSolve.jl
index 60cd480372f..f72e8189de2 100644
--- a/lib/OrdinaryDiffEqNonlinearSolve/src/OrdinaryDiffEqNonlinearSolve.jl
+++ b/lib/OrdinaryDiffEqNonlinearSolve/src/OrdinaryDiffEqNonlinearSolve.jl
@@ -50,7 +50,7 @@ using OrdinaryDiffEqCore: resize_nlsolver!, _initialize_dae!,
     FastConvergence, Convergence,
     SlowConvergence, VerySlowConvergence, Divergence, NLStatus,
     MethodType, alg_order, error_constant,
-    alg_extrapolates, resize_J_W!, has_autodiff
+    alg_extrapolates, resize_J_W!, has_autodiff, find_algebraic_vars_eqs
 
 import OrdinaryDiffEqCore: _initialize_dae!, _default_dae_init!,
     isnewton, get_W, isfirstcall, isfirststage,
diff --git a/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl b/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl
index 668e9745799..b2c4ba9cc5e 100644
--- a/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl
+++ b/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl
@@ -1,51 +1,3 @@
-# Efficient algebraic variable/equation detection for sparse mass matrices.
-# O(nnz) instead of O(n²) for sparse matrices.
-"""
-    find_algebraic_vars_eqs(M::SparseMatrixCSC)
-
-Find algebraic variables (zero columns) and algebraic equations (zero rows) from mass matrix.
-Returns (algebraic_vars::Vector{Bool}, algebraic_eqs::Vector{Bool}).
-
-For sparse matrices, uses O(nnz) traversal of CSC structure instead of O(n²) iteration.
-"""
-function find_algebraic_vars_eqs(M::SparseMatrixCSC)
-    n_cols = size(M, 2)
-    n_rows = size(M, 1)
-
-    # Initialize all as algebraic (true = zero column/row)
-    algebraic_vars = fill(true, n_cols)
-    algebraic_eqs = fill(true, n_rows)
-
-    # Mark columns/rows with non-zero values as differential (false)
-    @inbounds for j in 1:n_cols
-        for idx in M.colptr[j]:(M.colptr[j + 1] - 1)
-            if !iszero(M.nzval[idx])
-                algebraic_vars[j] = false
-                algebraic_eqs[M.rowval[idx]] = false
-            end
-        end
-    end
-
-    return algebraic_vars, algebraic_eqs
-end
-
-function find_algebraic_vars_eqs(M::LinearAlgebra.Diagonal)
-    _idxs = map(iszero, LinearAlgebra.diag(M))
-    return _idxs, _idxs
-end
-
-# Fallback for non-sparse matrices (original behavior)
-function find_algebraic_vars_eqs(M::AbstractMatrix)
-    algebraic_vars = vec(all(iszero, M, dims = 1))
-    algebraic_eqs = vec(all(iszero, M, dims = 2))
-    return algebraic_vars, algebraic_eqs
-end
-
-# Handle SciMLOperators (e.g., MatrixOperator) by converting to matrix
-function find_algebraic_vars_eqs(M::AbstractSciMLOperator)
-    return find_algebraic_vars_eqs(convert(AbstractMatrix, M))
-end
-
 # Optimized tolerance checking that avoids allocations
 @inline function check_dae_tolerance(integrator, err, abstol, t, ::Val{true})
     if abstol isa Number
diff --git a/lib/OrdinaryDiffEqNonlinearSolve/test/runtests.jl b/lib/OrdinaryDiffEqNonlinearSolve/test/runtests.jl
index d2f6725597c..0839c157818 100644
--- a/lib/OrdinaryDiffEqNonlinearSolve/test/runtests.jl
+++ b/lib/OrdinaryDiffEqNonlinearSolve/test/runtests.jl
@@ -20,7 +20,6 @@ end
 # Run functional tests
 if TEST_GROUP ∉ ("QA", "ModelingToolkit")
     @time @safetestset "Newton Tests" include("newton_tests.jl")
-    @time @safetestset "Sparse Algebraic Detection" include("sparse_algebraic_detection_tests.jl")
     @time @safetestset "Sparse DAE Initialization" include("sparse_dae_initialization_tests.jl")
     @time @safetestset "Linear Nonlinear Solver Tests" include("linear_nonlinear_tests.jl")
     @time @safetestset "Linear Solver Tests" include("linear_solver_tests.jl")

From f8a5ff4d6da045a21421bdd664bf454f7fbe548a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Mon, 23 Feb 2026 12:50:48 +0100
Subject: [PATCH 04/11] fix cache build for several solvers

---
 lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl               | 2 +-
 lib/OrdinaryDiffEqBDF/src/bdf_caches.jl                      | 2 +-
 lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl | 3 ++-
 lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl        | 4 ++--
 lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl           | 3 ++-
 lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl                  | 2 +-
 6 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl b/lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl
index e6e9a07d884..e8cb7cf5e1f 100644
--- a/lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl
+++ b/lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl
@@ -23,7 +23,7 @@ import OrdinaryDiffEqCore: alg_order, calculate_residuals!,
     get_fsalfirstlast, generic_solver_docstring, _bool_to_ADType,
     _process_AD_choice,
     _ode_interpolant, _ode_interpolant!, has_stiff_interpolation,
-    _ode_addsteps!, DerivativeOrderNotPossibleError
+    _ode_addsteps!, DerivativeOrderNotPossibleError, find_algebraic_vars_eqs
 using OrdinaryDiffEqSDIRK: ImplicitEulerConstantCache, ImplicitEulerCache
 
 using TruncatedStacktraces: @truncate_stacktrace
diff --git a/lib/OrdinaryDiffEqBDF/src/bdf_caches.jl b/lib/OrdinaryDiffEqBDF/src/bdf_caches.jl
index 80b92a6f70c..5d71d3995ea 100644
--- a/lib/OrdinaryDiffEqBDF/src/bdf_caches.jl
+++ b/lib/OrdinaryDiffEqBDF/src/bdf_caches.jl
@@ -62,7 +62,7 @@ function alg_cache(
     atmp = similar(u, uEltypeNoUnits)
     recursivefill!(atmp, false)
     algebraic_vars = f.mass_matrix === I ? nothing :
-        [all(iszero, x) for x in eachcol(f.mass_matrix)]
+        find_algebraic_vars_eqs(f.mass_matrix)[1]
 
     eulercache = ImplicitEulerCache(
         u, uprev, uprev2, fsalfirst, atmp, nlsolver, algebraic_vars, alg.step_limiter!
diff --git a/lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl b/lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl
index 23913546d00..9b269d923e8 100644
--- a/lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl
+++ b/lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl
@@ -13,7 +13,8 @@ import OrdinaryDiffEqCore: alg_order, alg_adaptive_order, isWmethod, isfsal, _un
     calculate_residuals, has_stiff_interpolation, ODEIntegrator,
     resize_non_user_cache!, _ode_addsteps!, full_cache,
     DerivativeOrderNotPossibleError, _bool_to_ADType,
-    _process_AD_choice, LinearAliasSpecifier, copyat_or_push!
+    _process_AD_choice, LinearAliasSpecifier, copyat_or_push!,
+    find_algebraic_vars_eqs
 using MuladdMacro, FastBroadcast, RecursiveArrayTools
 import MacroTools: namify
 using MacroTools: @capture
diff --git a/lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl b/lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl
index d99fd25496c..7db92ffbb8c 100644
--- a/lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl
+++ b/lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl
@@ -179,7 +179,7 @@ function alg_cache(
     )
 
     algebraic_vars = f.mass_matrix === I ? nothing :
-        [all(iszero, x) for x in eachcol(f.mass_matrix)]
+        find_algebraic_vars_eqs(f.mass_matrix)[1]
 
     return Rosenbrock23Cache(
         u, uprev, k₁, k₂, k₃, du1, du2, f₁,
@@ -239,7 +239,7 @@ function alg_cache(
     )
 
     algebraic_vars = f.mass_matrix === I ? nothing :
-        [all(iszero, x) for x in eachcol(f.mass_matrix)]
+        find_algebraic_vars_eqs(f.mass_matrix)[1]
 
     return Rosenbrock32Cache(
         u, uprev, k₁, k₂, k₃, du1, du2, f₁, fsalfirst, fsallast, dT, J, W,
diff --git a/lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl b/lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl
index 37eb64be2c2..c22a23b8bd3 100644
--- a/lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl
+++ b/lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl
@@ -14,7 +14,8 @@ import OrdinaryDiffEqCore: alg_order, calculate_residuals!,
     trivial_limiter!, _ode_interpolant!,
     isesdirk, issplit,
     ssp_coefficient, get_fsalfirstlast, generic_solver_docstring,
-    _bool_to_ADType, _process_AD_choice, current_extrapolant!
+    _bool_to_ADType, _process_AD_choice, current_extrapolant!,
+    find_algebraic_vars_eqs
 using TruncatedStacktraces: @truncate_stacktrace
 using MuladdMacro, MacroTools, FastBroadcast, RecursiveArrayTools
 using SciMLBase: SplitFunction
diff --git a/lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl b/lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl
index 978120e3b86..70f63fd4708 100644
--- a/lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl
+++ b/lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl
@@ -35,7 +35,7 @@ function alg_cache(
     recursivefill!(atmp, false)
 
     algebraic_vars = f.mass_matrix === I ? nothing :
-        [all(iszero, x) for x in eachcol(f.mass_matrix)]
+        find_algebraic_vars_eqs(f.mass_matrix)[1]
 
     return ImplicitEulerCache(
         u, uprev, uprev2, fsalfirst, atmp, nlsolver, algebraic_vars, alg.step_limiter!

From 6f48f856647a6a8ed6fa0158d09b878e5b88cca7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Mon, 23 Feb 2026 10:49:27 +0100
Subject: [PATCH 05/11] WIP test script

---
 test/gpu/simple_dae.jl | 206 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 175 insertions(+), 31 deletions(-)

diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl
index c1a346f0108..0849fc604af 100644
--- a/test/gpu/simple_dae.jl
+++ b/test/gpu/simple_dae.jl
@@ -1,4 +1,7 @@
 using OrdinaryDiffEqRosenbrock
+using OrdinaryDiffEqSDIRK
+using OrdinaryDiffEqBDF
+using OrdinaryDiffEqFIRK
 using OrdinaryDiffEqNonlinearSolve
 using CUDA
 using LinearAlgebra
@@ -6,6 +9,7 @@ using Adapt
 using SparseArrays
 using Test
 using CUDSS
+using Printf
 
 #=
 du[1] = -u[1]
@@ -25,53 +29,193 @@ p = [
     -1 1 0 -1
 ]
 
-# mass_matrix = [1 0 0 0
-#                0 1 0 0
-#                0 0 0 0
-#                0 0 0 0]
 mass_matrix = Diagonal([1, 1, 0, 0])
 jac_prototype = sparse(map(x -> iszero(x) ? 0.0 : 1.0, p))
 
 u0 = [1.0, 1.0, 0.5, 0.5] # force init
-odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype)
-
 tspan = (0.0, 5.0)
+
+# CPU reference solution
+odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype)
 prob = ODEProblem(odef, u0, tspan, p)
 sol = solve(prob, Rodas5P())
 
-mass_matrix_d = adapt(CuArray, mass_matrix)
-jac_prototype_d_csc = CUDA.CUSPARSE.CuSparseMatrixCSC(jac_prototype)
-jac_prototype_d_csr = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype)
+# GPU data -- we use F64 for higher accuracy for comparision
+u0_d = adapt(CuArray{Float64}, u0)
+p_d = adapt(CuArray{Float64}, p)
 
-u0_d = adapt(CuArray, u0)
-p_d = adapt(CuArray, p)
+# dense or spares mass matrix does not work yet!
+mass_matrix_d = cu(mass_matrix)
+
+function test_gpu_dae(jac_prototype_d, solver)
+    sol_ref = solve(prob, solver)
+    odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d)
+    prob_d = ODEProblem(odef_d, u0_d, tspan, p_d)
+    sol_d = solve(prob_d, solver)
+
+    for t in sol_d.t
+        u = Vector(sol_d(t))
+        @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6)
+        @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6)
+    end
 
-odef_d_dns = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = nothing)
-odef_d_csc = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d_csc)
-odef_d_csr = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d_csr)
+    max_abserr = 0.0
+    max_relerr = 0.0
+    for t in tspan[begin]:0.1:tspan[end]
+        diff = abs.(Vector(sol_d(t)) - sol_ref(t))
+        ref = abs.(sol_ref(t))
+        max_abserr = max(max_abserr, maximum(diff))
+        max_relerr = max(max_relerr, maximum(diff ./ max.(ref, eps())))
+    end
+    cond = (max_abserr < 1e-5 || max_relerr < 1e-5)
+    cond || println("Solver $sol abstol=$max_abserr retlol=$max_relerr")
+    @test (max_abserr < 1e-5 || max_relerr < 1e-5)
+end
 
-prob_d_dns = ODEProblem(odef_d_dns, u0_d, tspan, p_d)
-prob_d_csc = ODEProblem(odef_d_csc, u0_d, tspan, p_d)
-prob_d_csr = ODEProblem(odef_d_csr, u0_d, tspan, p_d)
+# Jacobian prototype options
+jac_prots = [
+    "none" => nothing,
+    "CSC" => CUDA.CUSPARSE.CuSparseMatrixCSC(jac_prototype),
+    "CSR" => CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype),
+]
 
-sol_d_dns = solve(prob_d_dns, Rodas5P()) # works
-sol_d_csc = solve(prob_d_csc, Rodas5P()) # uses Krylov
-sol_d_csr = solve(prob_d_csr, Rodas5P()) # uses CUDSS
+# Solver options
+solvers = [
+    # Rosenbrock (diagonal mass matrix only)
+    "Rosenbrock23" => Rosenbrock23(),           # ✅
+    "Rosenbrock32" => Rosenbrock32(),           # 📉 poor fit for this DAE, 💥 CSC catastrophic
+    # Rosenbrock 2nd order
+    "ROS2" => ROS2(),                           # ✅
+    "ROS2PR" => ROS2PR(),                       # ✅
+    "ROS2S" => ROS2S(),                         # ✅
+    # Rosenbrock 3rd order
+    "ROS3" => ROS3(),                           # ✅
+    "ROS3PR" => ROS3PR(),                       # 📉 poor fit for this DAE
+    "ROS3PRL" => ROS3PRL(),                     # ⚠️ CSC accuracy loss
+    "ROS3PRL2" => ROS3PRL2(),                   # ⚠️ CSC accuracy loss
+    "ROS3P" => ROS3P(),                         # 📉 poor fit for this DAE
+    "Rodas3" => Rodas3(),                       # ⚠️ CSC accuracy loss
+    # "Rodas23W" => Rodas23W(),                   # 🚫 scalar indexing, requires large cahnges to `calculate_interpoldiff!`
+    # "Rodas3P" => Rodas3P(),                     # 🚫 scalar indexing
+    "Scholz4_7" => Scholz4_7(),                 # ✅
+    # Rosenbrock 4th order
+    "ROS34PW1a" => ROS34PW1a(),                 # 📉 poor fit for this DAE
+    "ROS34PW1b" => ROS34PW1b(),                 # 📉 poor fit for this DAE
+    "ROS34PW2" => ROS34PW2(),                   # ⚠️ CSC accuracy loss
+    "ROS34PW3" => ROS34PW3(),                   # ✅
+    "ROS34PRw" => ROS34PRw(),                   # ✅
+    "RosShamp4" => RosShamp4(),                 # ✅
+    "Veldd4" => Veldd4(),                       # ✅
+    "Velds4" => Velds4(),                       # ✅
+    "GRK4T" => GRK4T(),                         # ✅
+    "GRK4A" => GRK4A(),                         # ✅
+    "Ros4LStab" => Ros4LStab(),                 # ✅
+    "Rodas4" => Rodas4(),                       # ✅
+    "Rodas42" => Rodas42(),                     # ✅
+    "Rodas4P" => Rodas4P(),                     # ✅
+    "Rodas4P2" => Rodas4P2(),                   # ✅
+    "ROK4a" => ROK4a(),                         # ✅
+    # Rosenbrock 5th order
+    "Rodas5" => Rodas5(),                       # ✅
+    "Rodas5P" => Rodas5P(),                     # ✅
+    "Rodas5Pe" => Rodas5Pe(),                   # ✅
+    "Rodas5Pr" => Rodas5Pr(),                   # ✅
+    # Rosenbrock 6th order
+    "Rodas6P" => Rodas6P(),                     # ✅
+    # SDIRK (don't include fixed time step which need explicit dt)
+    "ImplicitEuler" => ImplicitEuler(),         # ✅
+    "Trapezoid" => Trapezoid(),                 # ✅
+    "SDIRK2" => SDIRK2(),                       # ✅
+    "Cash4" => Cash4(),                         # ⚠️ CSC accuracy loss
+    "Hairer4" => Hairer4(),                     # ⚠️ CSC accuracy loss
+    "Hairer42" => Hairer42(),                   # ⚠️ CSC accuracy loss
+    # BDF
+    "ABDF2" => ABDF2(),                         # ⚠️ CSC accuracy loss (💥 catastrophic)
+    "QNDF1" => QNDF1(),                        # ✅
+    "QNDF2" => QNDF2(),                        # ⚠️ CSC accuracy loss
+    # "QNDF" => QNDF(),                          # 🔧 DeviceMemory error in LinAlg
+    "QBDF1" => QBDF1(),                        # ✅
+    "QBDF2" => QBDF2(),                        # ⚠️ accuracy loss (all jac_prots)
+    # "QBDF" => QBDF(),                          # 🔧 DeviceMemory error in LinAlg
+    # "FBDF" => FBDF(),                           # 🚫 scalar indexing -> needs extensive work on reinitFBDF!
+    # FIRK -> all need subtential changes to `perform_step!` for FIRK methods
+    # "RadauIIA3" => RadauIIA3(),                 # 🚫 scalar indexing, ComplexF64 sparse unsupported
+    # "RadauIIA5" => RadauIIA5(),                 # 🚫 scalar indexing, ComplexF64 sparse unsupported
+    # "RadauIIA9" => RadauIIA9(),                 # 🚫 scalar indexing, ComplexF64 sparse unsupported
+    # "AdaptiveRadau" => AdaptiveRadau(),         # 🚫 scalar indexing, ComplexF64 sparse unsupported
+]
 
-@testset "Test constraints in GPU sol" begin
-    for sol_d in [sol_d_dns, sol_d_csc, sol_d_csr]
-        for t in sol_d.t
-            u = Vector(sol_d(t))
-            @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6)
-            @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6)
-        end
+function _maxerrs(sol_a, sol_b)
+    max_abs = 0.0
+    max_rel = 0.0
+    for t in tspan[begin]:0.1:tspan[end]
+        a = isa(sol_a(t), CuArray) ? Vector(sol_a(t)) : Vector(sol_a(t))
+        b = isa(sol_b(t), CuArray) ? Vector(sol_b(t)) : Vector(sol_b(t))
+        diff = abs.(a - b)
+        ref = abs.(b)
+        max_abs = max(max_abs, maximum(diff))
+        max_rel = max(max_rel, maximum(diff ./ max.(ref, eps())))
     end
+    return max_abs, max_rel
 end
 
-@testset "Compare GPU to CPU solution" begin
-    for sol_d in [sol_d_dns, sol_d_csc, sol_d_csr]
-        for t in tspan[begin]:0.1:tspan[end]
-            @test Vector(sol_d(t)) ≈ sol(t) atol=1e-5
+function _printval(val; threshold = 1e-3)
+    s = @sprintf("%.2e", val)
+    if val > threshold
+        printstyled(s; color = :red)
+    else
+        print(s)
+    end
+end
+
+function debug_gpu_dae(jac_prototype_d, solver, name)
+    # CPU: this solver vs Rodas5P reference
+    sol_cpu = solve(prob, solver)
+    cpu_abs, cpu_rel = _maxerrs(sol_cpu, sol)
+
+    # GPU solve
+    odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d)
+    prob_d = ODEProblem(odef_d, u0_d, tspan, p_d)
+    sol_d = solve(prob_d, solver)
+
+    # GPU vs CPU same solver
+    gpu_abs, gpu_rel = _maxerrs(sol_d, sol_cpu)
+
+    # Print row
+    print(rpad(name, 30))
+    _printval(cpu_abs); print("  ")
+    _printval(cpu_rel); print("  ")
+    _printval(gpu_abs); print("  ")
+    _printval(gpu_rel)
+    println()
+end
+
+using Printf
+
+println(rpad("", 30), "cpu_abs  cpu_rel  gpu_abs  gpu_rel")
+println("-"^72)
+
+for (sn, sv) in solvers, (jn, jp) in jac_prots
+    label = "$sn / $jn"
+    try
+        debug_gpu_dae(jp, sv, label)
+    catch e
+        printstyled(rpad(label, 30), "ERROR: ", sprint(showerror, e; context=:limit=>true), "\n"; color = :yellow)
+    end
+end
+
+println("-"^72)
+
+#= Uncomment for actual testing
+@testset "End-to-end GPU compat of mass matrix DAE solvers" begin
+    for (sn, sv) in solvers
+        @testset "GPU DAE: $sn" begin
+            for (jn, jp) in jac_prots
+                @testset "Jacboain prototype: $jn" begin
+                    test_gpu_dae(jp, sv)
+                end
+            end
         end
     end
 end
+=#

From 86dcda05dfa5e4628b4a0d1911dea32ffac88019 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Mon, 23 Feb 2026 11:30:31 +0100
Subject: [PATCH 06/11] improve test script

---
 test/gpu/simple_dae.jl | 329 +++++++++++++++++++++++------------------
 1 file changed, 185 insertions(+), 144 deletions(-)

diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl
index 0849fc604af..d43c93b2731 100644
--- a/test/gpu/simple_dae.jl
+++ b/test/gpu/simple_dae.jl
@@ -35,122 +35,159 @@ jac_prototype = sparse(map(x -> iszero(x) ? 0.0 : 1.0, p))
 u0 = [1.0, 1.0, 0.5, 0.5] # force init
 tspan = (0.0, 5.0)
 
-# CPU reference solution
+# CPU reference solution (Rodas5P)
 odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype)
 prob = ODEProblem(odef, u0, tspan, p)
-sol = solve(prob, Rodas5P())
+sol_ref = solve(prob, Rodas5P())
 
-# GPU data -- we use F64 for higher accuracy for comparision
+# GPU data -- we use F64 for higher accuracy for comparison
 u0_d = adapt(CuArray{Float64}, u0)
 p_d = adapt(CuArray{Float64}, p)
 
-# dense or spares mass matrix does not work yet!
+# dense or sparse mass matrix does not work yet!
 mass_matrix_d = cu(mass_matrix)
 
-function test_gpu_dae(jac_prototype_d, solver)
-    sol_ref = solve(prob, solver)
-    odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d)
-    prob_d = ODEProblem(odef_d, u0_d, tspan, p_d)
-    sol_d = solve(prob_d, solver)
-
-    for t in sol_d.t
-        u = Vector(sol_d(t))
-        @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6)
-        @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6)
-    end
-
-    max_abserr = 0.0
-    max_relerr = 0.0
-    for t in tspan[begin]:0.1:tspan[end]
-        diff = abs.(Vector(sol_d(t)) - sol_ref(t))
-        ref = abs.(sol_ref(t))
-        max_abserr = max(max_abserr, maximum(diff))
-        max_relerr = max(max_relerr, maximum(diff ./ max.(ref, eps())))
-    end
-    cond = (max_abserr < 1e-5 || max_relerr < 1e-5)
-    cond || println("Solver $sol abstol=$max_abserr retlol=$max_relerr")
-    @test (max_abserr < 1e-5 || max_relerr < 1e-5)
-end
-
-# Jacobian prototype options
+# ── Jacobian prototype options ────────────────────────────────────────────────
 jac_prots = [
     "none" => nothing,
     "CSC" => CUDA.CUSPARSE.CuSparseMatrixCSC(jac_prototype),
     "CSR" => CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype),
 ]
 
-# Solver options
+# ── Solver definitions ────────────────────────────────────────────────────────
+#
+# Each entry: solver => (; tol overrides...)
+#   method_tol = (; atol, rtol)  – cpu solver vs Rodas5P reference (some methods are poor fits)
+#   gpu_tol    = (; atol, rtol)  – gpu vs cpu (none jac_prot)
+#   csc_tol    = (; atol, rtol)  – gpu vs cpu (CSC jac_prot)
+#   csr_tol    = (; atol, rtol)  – gpu vs cpu (CSR jac_prot)
+# Only specify fields that deviate from the defaults.
+
+const DEFAULT_METHOD_TOL = (; atol = 1e-4, rtol = 1e-4)     # generous: we only care about GPU vs CPU match
+const DEFAULT_GPU_TOL    = (; atol = 1e-4, rtol = 1e-4)
+
 solvers = [
-    # Rosenbrock (diagonal mass matrix only)
-    "Rosenbrock23" => Rosenbrock23(),           # ✅
-    "Rosenbrock32" => Rosenbrock32(),           # 📉 poor fit for this DAE, 💥 CSC catastrophic
-    # Rosenbrock 2nd order
-    "ROS2" => ROS2(),                           # ✅
-    "ROS2PR" => ROS2PR(),                       # ✅
-    "ROS2S" => ROS2S(),                         # ✅
-    # Rosenbrock 3rd order
-    "ROS3" => ROS3(),                           # ✅
-    "ROS3PR" => ROS3PR(),                       # 📉 poor fit for this DAE
-    "ROS3PRL" => ROS3PRL(),                     # ⚠️ CSC accuracy loss
-    "ROS3PRL2" => ROS3PRL2(),                   # ⚠️ CSC accuracy loss
-    "ROS3P" => ROS3P(),                         # 📉 poor fit for this DAE
-    "Rodas3" => Rodas3(),                       # ⚠️ CSC accuracy loss
-    # "Rodas23W" => Rodas23W(),                   # 🚫 scalar indexing, requires large cahnges to `calculate_interpoldiff!`
-    # "Rodas3P" => Rodas3P(),                     # 🚫 scalar indexing
-    "Scholz4_7" => Scholz4_7(),                 # ✅
-    # Rosenbrock 4th order
-    "ROS34PW1a" => ROS34PW1a(),                 # 📉 poor fit for this DAE
-    "ROS34PW1b" => ROS34PW1b(),                 # 📉 poor fit for this DAE
-    "ROS34PW2" => ROS34PW2(),                   # ⚠️ CSC accuracy loss
-    "ROS34PW3" => ROS34PW3(),                   # ✅
-    "ROS34PRw" => ROS34PRw(),                   # ✅
-    "RosShamp4" => RosShamp4(),                 # ✅
-    "Veldd4" => Veldd4(),                       # ✅
-    "Velds4" => Velds4(),                       # ✅
-    "GRK4T" => GRK4T(),                         # ✅
-    "GRK4A" => GRK4A(),                         # ✅
-    "Ros4LStab" => Ros4LStab(),                 # ✅
-    "Rodas4" => Rodas4(),                       # ✅
-    "Rodas42" => Rodas42(),                     # ✅
-    "Rodas4P" => Rodas4P(),                     # ✅
-    "Rodas4P2" => Rodas4P2(),                   # ✅
-    "ROK4a" => ROK4a(),                         # ✅
-    # Rosenbrock 5th order
-    "Rodas5" => Rodas5(),                       # ✅
-    "Rodas5P" => Rodas5P(),                     # ✅
-    "Rodas5Pe" => Rodas5Pe(),                   # ✅
-    "Rodas5Pr" => Rodas5Pr(),                   # ✅
-    # Rosenbrock 6th order
-    "Rodas6P" => Rodas6P(),                     # ✅
-    # SDIRK (don't include fixed time step which need explicit dt)
-    "ImplicitEuler" => ImplicitEuler(),         # ✅
-    "Trapezoid" => Trapezoid(),                 # ✅
-    "SDIRK2" => SDIRK2(),                       # ✅
-    "Cash4" => Cash4(),                         # ⚠️ CSC accuracy loss
-    "Hairer4" => Hairer4(),                     # ⚠️ CSC accuracy loss
-    "Hairer42" => Hairer42(),                   # ⚠️ CSC accuracy loss
-    # BDF
-    "ABDF2" => ABDF2(),                         # ⚠️ CSC accuracy loss (💥 catastrophic)
-    "QNDF1" => QNDF1(),                        # ✅
-    "QNDF2" => QNDF2(),                        # ⚠️ CSC accuracy loss
-    # "QNDF" => QNDF(),                          # 🔧 DeviceMemory error in LinAlg
-    "QBDF1" => QBDF1(),                        # ✅
-    "QBDF2" => QBDF2(),                        # ⚠️ accuracy loss (all jac_prots)
-    # "QBDF" => QBDF(),                          # 🔧 DeviceMemory error in LinAlg
-    # "FBDF" => FBDF(),                           # 🚫 scalar indexing -> needs extensive work on reinitFBDF!
-    # FIRK -> all need subtential changes to `perform_step!` for FIRK methods
-    # "RadauIIA3" => RadauIIA3(),                 # 🚫 scalar indexing, ComplexF64 sparse unsupported
-    # "RadauIIA5" => RadauIIA5(),                 # 🚫 scalar indexing, ComplexF64 sparse unsupported
-    # "RadauIIA9" => RadauIIA9(),                 # 🚫 scalar indexing, ComplexF64 sparse unsupported
-    # "AdaptiveRadau" => AdaptiveRadau(),         # 🚫 scalar indexing, ComplexF64 sparse unsupported
+    # ── Rosenbrock 2nd order ──
+    Rosenbrock23() => (;
+        csc_tol = (; atol = 2e-4, rtol = 5e-4),
+    ),
+    Rosenbrock32() => (;
+        csc_tol = (; atol = Inf, rtol = Inf),
+    ),
+    ROS2()   => (;),
+    ROS2PR() => (;
+        csc_tol = (; atol = 3e-4, rtol = 4e-4),
+    ),
+    ROS2S()  => (;
+        csc_tol = (; atol = 7e-4, rtol = 1.2e-3),
+    ),
+    # ── Rosenbrock 3rd order ──
+    ROS3()     => (;),
+    ROS3PR()   => (;),
+    ROS3PRL()  => (;
+        csc_tol = (; atol = 2e-3, rtol = 4e-3),
+    ),
+    ROS3PRL2() => (;
+        csc_tol = (; atol = 3e-3, rtol = 4e-3),
+    ),
+    ROS3P()    => (;),
+    Rodas3()   => (;
+        csc_tol = (; atol = 2e-3, rtol = 3e-3),
+    ),
+    # Rodas23W()  # 🚫 scalar indexing, requires large changes to `calculate_interpoldiff!`
+    # Rodas3P()   # 🚫 scalar indexing
+    Scholz4_7() => (;),
+    # ── Rosenbrock 4th order ──
+    ROS34PW1a() => (;),
+    ROS34PW1b() => (;),
+    ROS34PW2()  => (;
+        csc_tol = (; atol = 1.2e-3, rtol = 2.2e-3),
+    ),
+    ROS34PW3()  => (;),
+    ROS34PRw()  => (;
+        csc_tol = (; atol = 6e-4, rtol = 1e-3),
+    ),
+    RosShamp4() => (;),
+    Veldd4()    => (;),
+    Velds4()    => (;),
+    GRK4T()     => (;),
+    GRK4A()     => (;),
+    Ros4LStab() => (;),
+    Rodas4()    => (;),
+    Rodas42()   => (;
+        csc_tol = (; atol = 2e-5, rtol = 1.3e-4),
+    ),
+    Rodas4P()   => (;
+        csc_tol = (; atol = 9e-6, rtol = 1.3e-4),
+    ),
+    Rodas4P2()  => (;),
+    ROK4a()     => (;),
+    # ── Rosenbrock 5th order ──
+    Rodas5()   => (;),
+    Rodas5P()  => (;),
+    Rodas5Pe() => (;),
+    Rodas5Pr() => (;),
+    # ── Rosenbrock 6th order ──
+    Rodas6P()  => (;),
+    # ── SDIRK (don't include fixed time step which need explicit dt) ──
+    ImplicitEuler() => (;
+        gpu_tol = (; atol = 1e-4, rtol = 4e-4),
+        csc_tol = (; atol = 6e-5, rtol = 3e-4),
+    ),
+    Trapezoid() => (;),
+    SDIRK2()    => (;
+        csc_tol = (; atol = 1.5e-4, rtol = 2.1e-4),
+    ),
+    Cash4()     => (;
+        csc_tol = (; atol = 5e-3, rtol = 8e-3),
+    ),
+    Hairer4()   => (;
+        csc_tol = (; atol = 8e-3, rtol = 6e-2),
+    ),
+    Hairer42()  => (;
+        csc_tol = (; atol = 7e-3, rtol = 5e-2),
+    ),
+    # ── BDF ──
+    ABDF2() => (;
+        csc_tol = (; atol = Inf, rtol = Inf),                # 💥 CSC catastrophic
+    ),
+    QNDF1() => (;),
+    QNDF2() => (;
+        csc_tol = (; atol = 4e-3, rtol = 5e-3),
+    ),
+    # QNDF()  # 🔧 DeviceMemory error in LinAlg
+    QBDF1() => (;),
+    QBDF2() => (;
+        gpu_tol = (; atol = 2e-3, rtol = 2e-3),
+        csc_tol = (; atol = 3e-3, rtol = 6e-3),
+        csr_tol = (; atol = 2e-3, rtol = 2e-3),
+    ),
+    # QBDF()  # 🔧 DeviceMemory error in LinAlg
+    # FBDF()  # 🚫 scalar indexing -> needs extensive work on reinitFBDF!
+    # ── FIRK -> all need substantial changes to `perform_step!` for FIRK methods ──
+    # RadauIIA3()     # 🚫 scalar indexing, ComplexF64 sparse unsupported
+    # RadauIIA5()     # 🚫 scalar indexing, ComplexF64 sparse unsupported
+    # RadauIIA9()     # 🚫 scalar indexing, ComplexF64 sparse unsupported
+    # AdaptiveRadau() # 🚫 scalar indexing, ComplexF64 sparse unsupported
 ]
 
-function _maxerrs(sol_a, sol_b)
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+function _get_tol(overrides, jac_name)
+    _tol_keys = Dict("none" => :gpu_tol, "CSC" => :csc_tol, "CSR" => :csr_tol)
+    key = _tol_keys[jac_name]
+    hasproperty(overrides, key) && return getproperty(overrides, key)
+    # gpu_tol acts as default for all GPU combos
+    hasproperty(overrides, :gpu_tol) && return overrides.gpu_tol
+    return DEFAULT_GPU_TOL
+end
+
+function maxerrs(sol_a, sol_b)
     max_abs = 0.0
     max_rel = 0.0
     for t in tspan[begin]:0.1:tspan[end]
-        a = isa(sol_a(t), CuArray) ? Vector(sol_a(t)) : Vector(sol_a(t))
-        b = isa(sol_b(t), CuArray) ? Vector(sol_b(t)) : Vector(sol_b(t))
+        a = Vector(sol_a(t))
+        b = Vector(sol_b(t))
         diff = abs.(a - b)
         ref = abs.(b)
         max_abs = max(max_abs, maximum(diff))
@@ -159,63 +196,67 @@ function _maxerrs(sol_a, sol_b)
     return max_abs, max_rel
 end
 
-function _printval(val; threshold = 1e-3)
-    s = @sprintf("%.2e", val)
-    if val > threshold
-        printstyled(s; color = :red)
-    else
-        print(s)
-    end
-end
+function run_dae_tests()
+    results = Any[]
 
-function debug_gpu_dae(jac_prototype_d, solver, name)
-    # CPU: this solver vs Rodas5P reference
-    sol_cpu = solve(prob, solver)
-    cpu_abs, cpu_rel = _maxerrs(sol_cpu, sol)
-
-    # GPU solve
-    odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d)
-    prob_d = ODEProblem(odef_d, u0_d, tspan, p_d)
-    sol_d = solve(prob_d, solver)
-
-    # GPU vs CPU same solver
-    gpu_abs, gpu_rel = _maxerrs(sol_d, sol_cpu)
-
-    # Print row
-    print(rpad(name, 30))
-    _printval(cpu_abs); print("  ")
-    _printval(cpu_rel); print("  ")
-    _printval(gpu_abs); print("  ")
-    _printval(gpu_rel)
-    println()
-end
+    for (sv, overrides) in solvers, (jn, jp) in jac_prots
+        sn = string(nameof(typeof(sv)))
+        (; atol, rtol) = _get_tol(overrides, jn)
 
-using Printf
+        # CPU: this solver vs reference
+        sol_cpu = solve(prob, sv)
+        cpu_abs, cpu_rel = maxerrs(sol_cpu, sol_ref)
 
-println(rpad("", 30), "cpu_abs  cpu_rel  gpu_abs  gpu_rel")
-println("-"^72)
+        # GPU solve
+        odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jp)
+        prob_d = ODEProblem(odef_d, u0_d, tspan, p_d)
+        sol_d = solve(prob_d, sv)
 
-for (sn, sv) in solvers, (jn, jp) in jac_prots
-    label = "$sn / $jn"
-    try
-        debug_gpu_dae(jp, sv, label)
-    catch e
-        printstyled(rpad(label, 30), "ERROR: ", sprint(showerror, e; context=:limit=>true), "\n"; color = :yellow)
+        # GPU vs CPU (same solver)
+        gpu_abs, gpu_rel = maxerrs(sol_d, sol_cpu)
+
+        passed = (gpu_abs < atol) && (gpu_rel < rtol)
+        push!(results, (; solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel,
+                            tol_abs = atol, tol_rel = rtol, passed, error = ""))
+        @test passed
     end
+
+    return results
 end
 
-println("-"^72)
+function show_results(results)
+    function _fmt(val; threshold = 1e-3)
+        isnan(val) && return "   ---  "
+        s = @sprintf("%.2e", val)
+        if val > threshold
+            return "\e[31m$s\e[0m"
+        end
+        return s
+    end
+
+    println(rpad("Solver / Jac", 30), "cpu_abs  cpu_rel  gpu_abs  gpu_rel  status")
+    println("-"^85)
 
-#= Uncomment for actual testing
-@testset "End-to-end GPU compat of mass matrix DAE solvers" begin
-    for (sn, sv) in solvers
-        @testset "GPU DAE: $sn" begin
-            for (jn, jp) in jac_prots
-                @testset "Jacboain prototype: $jn" begin
-                    test_gpu_dae(jp, sv)
-                end
+    for r in results
+        label = rpad("$(r.solver) / $(r.jac)", 30)
+        if r.error != ""
+            printstyled(label, "ERROR: ", r.error, "\n"; color = :yellow)
+        else
+            print(label)
+            print(_fmt(r.cpu_abs), "  ", _fmt(r.cpu_rel), "  ")
+            print(_fmt(r.gpu_abs), "  ", _fmt(r.gpu_rel), "  ")
+            if r.passed
+                printstyled("✓\n"; color = :green)
+            else
+                printstyled("✗\n"; color = :red)
             end
         end
     end
+    println("-"^85)
 end
-=#
+
+@testset "GPU DAE solver compatibility" begin
+    results = run_dae_tests()
+end
+
+show_results(results)

From f00915691c51963e65d3bf3de12223e968973e2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Mon, 23 Feb 2026 12:21:36 +0100
Subject: [PATCH 07/11] improve test script

---
 test/gpu/simple_dae.jl | 138 +++++++++++++++++++++--------------------
 1 file changed, 71 insertions(+), 67 deletions(-)

diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl
index d43c93b2731..83b13c734f0 100644
--- a/test/gpu/simple_dae.jl
+++ b/test/gpu/simple_dae.jl
@@ -10,6 +10,8 @@ using SparseArrays
 using Test
 using CUDSS
 using Printf
+using OrdinaryDiffEqNonlinearSolve.LinearSolve: KrylovJL_GMRES
+
 
 #=
 du[1] = -u[1]
@@ -39,6 +41,7 @@ tspan = (0.0, 5.0)
 odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype)
 prob = ODEProblem(odef, u0, tspan, p)
 sol_ref = solve(prob, Rodas5P())
+sol_ref_krylov = solve(prob, Rodas5P(linsolve=KrylovJL_GMRES()))
 
 # GPU data -- we use F64 for higher accuracy for comparison
 u0_d = adapt(CuArray{Float64}, u0)
@@ -68,107 +71,102 @@ const DEFAULT_GPU_TOL    = (; atol = 1e-4, rtol = 1e-4)
 
 solvers = [
     # ── Rosenbrock 2nd order ──
-    Rosenbrock23() => (;
+    Rosenbrock23 => (;
         csc_tol = (; atol = 2e-4, rtol = 5e-4),
     ),
-    Rosenbrock32() => (;
+    Rosenbrock32 => (;
         csc_tol = (; atol = Inf, rtol = Inf),
     ),
-    ROS2()   => (;),
-    ROS2PR() => (;
+    ROS2 => (;),
+    ROS2PR => (;
         csc_tol = (; atol = 3e-4, rtol = 4e-4),
     ),
-    ROS2S()  => (;
+    ROS2S  => (;
         csc_tol = (; atol = 7e-4, rtol = 1.2e-3),
     ),
     # ── Rosenbrock 3rd order ──
-    ROS3()     => (;),
-    ROS3PR()   => (;),
-    ROS3PRL()  => (;
+    ROS3     => (;),
+    ROS3PR   => (;),
+    ROS3PRL  => (;
         csc_tol = (; atol = 2e-3, rtol = 4e-3),
     ),
-    ROS3PRL2() => (;
+    ROS3PRL2 => (;
         csc_tol = (; atol = 3e-3, rtol = 4e-3),
     ),
-    ROS3P()    => (;),
-    Rodas3()   => (;
+    ROS3P    => (;),
+    Rodas3   => (;
         csc_tol = (; atol = 2e-3, rtol = 3e-3),
     ),
-    # Rodas23W()  # 🚫 scalar indexing, requires large changes to `calculate_interpoldiff!`
-    # Rodas3P()   # 🚫 scalar indexing
-    Scholz4_7() => (;),
+    # Rodas23W()  # scalar indexing, requires large changes to `calculate_interpoldiff!`
+    # Rodas3P()   # scalar indexing
+    Scholz4_7 => (;),
     # ── Rosenbrock 4th order ──
-    ROS34PW1a() => (;),
-    ROS34PW1b() => (;),
-    ROS34PW2()  => (;
+    ROS34PW1a => (;),
+    ROS34PW1b => (;),
+    ROS34PW2  => (;
         csc_tol = (; atol = 1.2e-3, rtol = 2.2e-3),
     ),
-    ROS34PW3()  => (;),
-    ROS34PRw()  => (;
+    ROS34PW3  => (;),
+    ROS34PRw  => (;
         csc_tol = (; atol = 6e-4, rtol = 1e-3),
     ),
-    RosShamp4() => (;),
-    Veldd4()    => (;),
-    Velds4()    => (;),
-    GRK4T()     => (;),
-    GRK4A()     => (;),
-    Ros4LStab() => (;),
-    Rodas4()    => (;),
-    Rodas42()   => (;
-        csc_tol = (; atol = 2e-5, rtol = 1.3e-4),
-    ),
-    Rodas4P()   => (;
-        csc_tol = (; atol = 9e-6, rtol = 1.3e-4),
-    ),
-    Rodas4P2()  => (;),
-    ROK4a()     => (;),
+    RosShamp4 => (;),
+    Veldd4    => (;),
+    Velds4    => (;),
+    GRK4T     => (;),
+    GRK4A     => (;),
+    Ros4LStab => (;),
+    Rodas4    => (;),
+    Rodas42   => (;),
+    Rodas4P   => (;),
+    Rodas4P2  => (;),
+    ROK4a     => (;),
     # ── Rosenbrock 5th order ──
-    Rodas5()   => (;),
-    Rodas5P()  => (;),
-    Rodas5Pe() => (;),
-    Rodas5Pr() => (;),
+    Rodas5   => (;),
+    Rodas5P  => (;),
+    Rodas5Pe => (;),
+    Rodas5Pr => (;),
     # ── Rosenbrock 6th order ──
-    Rodas6P()  => (;),
+    Rodas6P  => (;),
     # ── SDIRK (don't include fixed time step which need explicit dt) ──
-    ImplicitEuler() => (;
-        gpu_tol = (; atol = 1e-4, rtol = 4e-4),
-        csc_tol = (; atol = 6e-5, rtol = 3e-4),
+    ImplicitEuler => (;),
+    Trapezoid => (;
+        csc_tol = (; atol = 8e-3, rtol = 2.5e-2),
     ),
-    Trapezoid() => (;),
-    SDIRK2()    => (;
-        csc_tol = (; atol = 1.5e-4, rtol = 2.1e-4),
+    SDIRK2    => (;
+        csc_tol = (; atol = 1.5e-4, rtol = 3.5e-4),
     ),
-    Cash4()     => (;
+    Cash4     => (;
         csc_tol = (; atol = 5e-3, rtol = 8e-3),
     ),
-    Hairer4()   => (;
+    Hairer4   => (;
         csc_tol = (; atol = 8e-3, rtol = 6e-2),
     ),
-    Hairer42()  => (;
+    Hairer42  => (;
         csc_tol = (; atol = 7e-3, rtol = 5e-2),
     ),
     # ── BDF ──
-    ABDF2() => (;
-        csc_tol = (; atol = Inf, rtol = Inf),                # 💥 CSC catastrophic
+    ABDF2 => (;
+        csc_tol = (; atol = 2e-4, rtol = 7e-4),
     ),
-    QNDF1() => (;),
-    QNDF2() => (;
+    QNDF1 => (;),
+    QNDF2 => (;
         csc_tol = (; atol = 4e-3, rtol = 5e-3),
     ),
     # QNDF()  # 🔧 DeviceMemory error in LinAlg
-    QBDF1() => (;),
-    QBDF2() => (;
+    QBDF1 => (;),
+    QBDF2 => (;
         gpu_tol = (; atol = 2e-3, rtol = 2e-3),
-        csc_tol = (; atol = 3e-3, rtol = 6e-3),
+        csc_tol = (; atol = 4e-3, rtol = 7e-3),
         csr_tol = (; atol = 2e-3, rtol = 2e-3),
     ),
-    # QBDF()  # 🔧 DeviceMemory error in LinAlg
-    # FBDF()  # 🚫 scalar indexing -> needs extensive work on reinitFBDF!
+    # QBDF()  # DeviceMemory error in LinAlg
+    # FBDF()  # scalar indexing -> needs extensive work on reinitFBDF!
     # ── FIRK -> all need substantial changes to `perform_step!` for FIRK methods ──
-    # RadauIIA3()     # 🚫 scalar indexing, ComplexF64 sparse unsupported
-    # RadauIIA5()     # 🚫 scalar indexing, ComplexF64 sparse unsupported
-    # RadauIIA9()     # 🚫 scalar indexing, ComplexF64 sparse unsupported
-    # AdaptiveRadau() # 🚫 scalar indexing, ComplexF64 sparse unsupported
+    # RadauIIA3()     #  scalar indexing, ComplexF64 sparse unsupported
+    # RadauIIA5()     #  scalar indexing, ComplexF64 sparse unsupported
+    # RadauIIA9()     #  scalar indexing, ComplexF64 sparse unsupported
+    # AdaptiveRadau() #  scalar indexing, ComplexF64 sparse unsupported
 ]
 
 # ── Helpers ───────────────────────────────────────────────────────────────────
@@ -200,22 +198,28 @@ function run_dae_tests()
     results = Any[]
 
     for (sv, overrides) in solvers, (jn, jp) in jac_prots
-        sn = string(nameof(typeof(sv)))
+        println("Test $sv with prototype $jn")
+        sn = string(sv)
         (; atol, rtol) = _get_tol(overrides, jn)
 
+        # CSC will allways fall back to krylov so the ref solution should do to
+        krylov = (jn == "CSC")
+        _sol_ref = krylov ? sol_ref_krylov : sol_ref
+
         # CPU: this solver vs reference
-        sol_cpu = solve(prob, sv)
-        cpu_abs, cpu_rel = maxerrs(sol_cpu, sol_ref)
+        cpu_alg = krylov ? sv(linsolve=KrylovJL_GMRES()) : sv()
+        sol_cpu = solve(prob, cpu_alg)
+        cpu_abs, cpu_rel = maxerrs(sol_cpu, _sol_ref)
 
         # GPU solve
         odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jp)
         prob_d = ODEProblem(odef_d, u0_d, tspan, p_d)
-        sol_d = solve(prob_d, sv)
+        sol_d = solve(prob_d, sv())
 
         # GPU vs CPU (same solver)
         gpu_abs, gpu_rel = maxerrs(sol_d, sol_cpu)
 
-        passed = (gpu_abs < atol) && (gpu_rel < rtol)
+        passed = (gpu_abs < atol) || (gpu_rel < rtol)
         push!(results, (; solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel,
                             tol_abs = atol, tol_rel = rtol, passed, error = ""))
         @test passed
@@ -256,7 +260,7 @@ function show_results(results)
 end
 
 @testset "GPU DAE solver compatibility" begin
+    global results
     results = run_dae_tests()
 end
-
 show_results(results)

From fcf7d911de7e4de597cfe6ce2740f177b9b691ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Mon, 23 Feb 2026 12:42:47 +0100
Subject: [PATCH 08/11] improve test script

---
 test/gpu/simple_dae.jl | 62 ++++++++++++++++++++++++++++++------------
 1 file changed, 44 insertions(+), 18 deletions(-)

diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl
index 83b13c734f0..adf2c54d8e6 100644
--- a/test/gpu/simple_dae.jl
+++ b/test/gpu/simple_dae.jl
@@ -59,14 +59,15 @@ jac_prots = [
 
 # ── Solver definitions ────────────────────────────────────────────────────────
 #
-# Each entry: solver => (; tol overrides...)
-#   method_tol = (; atol, rtol)  – cpu solver vs Rodas5P reference (some methods are poor fits)
-#   gpu_tol    = (; atol, rtol)  – gpu vs cpu (none jac_prot)
-#   csc_tol    = (; atol, rtol)  – gpu vs cpu (CSC jac_prot)
-#   csr_tol    = (; atol, rtol)  – gpu vs cpu (CSR jac_prot)
+# Each entry: SolverType => (; tol overrides...)
+#   method_tol     = (; atol, rtol)  – cpu solver vs Rodas5P ref (all jac); some methods are poor fits for DAEs
+#   method_csc_tol = (; atol, rtol)  – cpu solver vs Rodas5P ref (CSC/Krylov path only)
+#   gpu_tol        = (; atol, rtol)  – gpu vs cpu (none/CSR fallback)
+#   csc_tol        = (; atol, rtol)  – gpu vs cpu (CSC jac_prot)
+#   csr_tol        = (; atol, rtol)  – gpu vs cpu (CSR jac_prot)
 # Only specify fields that deviate from the defaults.
 
-const DEFAULT_METHOD_TOL = (; atol = 1e-4, rtol = 1e-4)     # generous: we only care about GPU vs CPU match
+const DEFAULT_METHOD_TOL = (; atol = 1e-2, rtol = 1e-2)
 const DEFAULT_GPU_TOL    = (; atol = 1e-4, rtol = 1e-4)
 
 solvers = [
@@ -75,7 +76,8 @@ solvers = [
         csc_tol = (; atol = 2e-4, rtol = 5e-4),
     ),
     Rosenbrock32 => (;
-        csc_tol = (; atol = Inf, rtol = Inf),
+        method_tol = (; atol = 2e-2, rtol = 1.5),
+        csc_tol    = (; atol = Inf, rtol = Inf),
     ),
     ROS2 => (;),
     ROS2PR => (;
@@ -86,14 +88,18 @@ solvers = [
     ),
     # ── Rosenbrock 3rd order ──
     ROS3     => (;),
-    ROS3PR   => (;),
+    ROS3PR   => (;
+        method_tol = (; atol = 0.25, rtol = 15.0),
+    ),
     ROS3PRL  => (;
         csc_tol = (; atol = 2e-3, rtol = 4e-3),
     ),
     ROS3PRL2 => (;
         csc_tol = (; atol = 3e-3, rtol = 4e-3),
     ),
-    ROS3P    => (;),
+    ROS3P    => (;
+        method_tol = (; atol = 0.25, rtol = 15.0),
+    ),
     Rodas3   => (;
         csc_tol = (; atol = 2e-3, rtol = 3e-3),
     ),
@@ -101,8 +107,12 @@ solvers = [
     # Rodas3P()   # scalar indexing
     Scholz4_7 => (;),
     # ── Rosenbrock 4th order ──
-    ROS34PW1a => (;),
-    ROS34PW1b => (;),
+    ROS34PW1a => (;
+        method_tol = (; atol = 0.25, rtol = 5.0),
+    ),
+    ROS34PW1b => (;
+        method_tol = (; atol = 0.25, rtol = 5.0),
+    ),
     ROS34PW2  => (;
         csc_tol = (; atol = 1.2e-3, rtol = 2.2e-3),
     ),
@@ -147,14 +157,17 @@ solvers = [
     ),
     # ── BDF ──
     ABDF2 => (;
-        csc_tol = (; atol = 2e-4, rtol = 7e-4),
+        method_csc_tol = (; atol = Inf, rtol = Inf),   # ABDF2 + Krylov diverges vs Rodas5P + Krylov
+        csc_tol        = (; atol = 2e-4, rtol = 7e-4),
     ),
     QNDF1 => (;),
     QNDF2 => (;
         csc_tol = (; atol = 4e-3, rtol = 5e-3),
     ),
     # QNDF()  # 🔧 DeviceMemory error in LinAlg
-    QBDF1 => (;),
+    QBDF1 => (;
+        method_tol = (; atol = 2e-2, rtol = 0.2),
+    ),
     QBDF2 => (;
         gpu_tol = (; atol = 2e-3, rtol = 2e-3),
         csc_tol = (; atol = 4e-3, rtol = 7e-3),
@@ -171,6 +184,12 @@ solvers = [
 
 # ── Helpers ───────────────────────────────────────────────────────────────────
 
+function _get_method_tol(overrides, jac_name)
+    jac_name == "CSC" && hasproperty(overrides, :method_csc_tol) && return overrides.method_csc_tol
+    hasproperty(overrides, :method_tol) && return overrides.method_tol
+    return DEFAULT_METHOD_TOL
+end
+
 function _get_tol(overrides, jac_name)
     _tol_keys = Dict("none" => :gpu_tol, "CSC" => :csc_tol, "CSR" => :csr_tol)
     key = _tol_keys[jac_name]
@@ -200,7 +219,8 @@ function run_dae_tests()
     for (sv, overrides) in solvers, (jn, jp) in jac_prots
         println("Test $sv with prototype $jn")
         sn = string(sv)
-        (; atol, rtol) = _get_tol(overrides, jn)
+        gtol = _get_tol(overrides, jn)
+        mtol = _get_method_tol(overrides, jn)
 
         # CSC will allways fall back to krylov so the ref solution should do to
         krylov = (jn == "CSC")
@@ -219,9 +239,11 @@ function run_dae_tests()
         # GPU vs CPU (same solver)
         gpu_abs, gpu_rel = maxerrs(sol_d, sol_cpu)
 
-        passed = (gpu_abs < atol) || (gpu_rel < rtol)
+        method_passed = (cpu_abs < mtol.atol) || (cpu_rel < mtol.rtol)
+        gpu_passed    = (gpu_abs < gtol.atol) || (gpu_rel < gtol.rtol)
+        passed = method_passed && gpu_passed
         push!(results, (; solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel,
-                            tol_abs = atol, tol_rel = rtol, passed, error = ""))
+                            gtol, mtol, method_passed, gpu_passed, passed, error = ""))
         @test passed
     end
 
@@ -247,12 +269,16 @@ function show_results(results)
             printstyled(label, "ERROR: ", r.error, "\n"; color = :yellow)
         else
             print(label)
-            print(_fmt(r.cpu_abs), "  ", _fmt(r.cpu_rel), "  ")
+            print(_fmt(r.cpu_abs, threshold=1e-2), "  ", _fmt(r.cpu_rel, threshold=1e-2), "  ")
             print(_fmt(r.gpu_abs), "  ", _fmt(r.gpu_rel), "  ")
             if r.passed
                 printstyled("✓\n"; color = :green)
+            elseif !r.method_passed && !r.gpu_passed
+                printstyled("✗ method+gpu\n"; color = :red)
+            elseif !r.method_passed
+                printstyled("✗ method\n"; color = :red)
             else
-                printstyled("✗\n"; color = :red)
+                printstyled("✗ gpu\n"; color = :red)
             end
         end
     end

From 7de1820cde5055d661a327a1c2fc1e956f5e2055 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Mon, 23 Feb 2026 12:49:18 +0100
Subject: [PATCH 09/11] adapt project.toml

---
 test/gpu/Project.toml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/gpu/Project.toml b/test/gpu/Project.toml
index aecabf6d1aa..dd879bde104 100644
--- a/test/gpu/Project.toml
+++ b/test/gpu/Project.toml
@@ -3,14 +3,16 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
-FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 OrdinaryDiffEqBDF = "6ad6398a-0878-4a85-9266-38940aa047c8"
+OrdinaryDiffEqFIRK = "5960d6e9-dd7a-4743-88e7-cf307b64f125"
 OrdinaryDiffEqNonlinearSolve = "127b3ac7-2247-4354-8eb6-78cf4e7c58e8"
 OrdinaryDiffEqRKIP = "a4daff8c-1d43-4ff3-8eff-f78720aeecdc"
 OrdinaryDiffEqRosenbrock = "43230ef6-c299-4910-a778-202eb28ce4ce"
+OrdinaryDiffEqSDIRK = "2d112036-d095-4a1e-ab9a-08536f3ecdbf"
 RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 SciMLOperators = "c0aeaf25-5076-4817-a8d5-81caf7dfa961"
@@ -26,8 +28,8 @@ Adapt = "4"
 CUDA = "4, 5"
 ComponentArrays = "0.15"
 DiffEqBase = "6.182"
-FastBroadcast = "0.3"
 FFTW = "1.8"
+FastBroadcast = "0.3"
 FillArrays = "1"
 OrdinaryDiffEq = "6"
 OrdinaryDiffEqBDF = "1"

From 503d2cd832a61f54cce0caf0976d5897b02a8525 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Mon, 23 Feb 2026 13:58:02 +0100
Subject: [PATCH 10/11] apply runic

---
 lib/OrdinaryDiffEqCore/src/misc_utils.jl |   4 +-
 test/gpu/simple_dae.jl                   | 114 ++++++++++++-----------
 2 files changed, 61 insertions(+), 57 deletions(-)

diff --git a/lib/OrdinaryDiffEqCore/src/misc_utils.jl b/lib/OrdinaryDiffEqCore/src/misc_utils.jl
index 65a6e56c21b..75c75f3d636 100644
--- a/lib/OrdinaryDiffEqCore/src/misc_utils.jl
+++ b/lib/OrdinaryDiffEqCore/src/misc_utils.jl
@@ -146,8 +146,8 @@ function find_algebraic_vars_eqs(M::Diagonal)
 end
 
 function find_algebraic_vars_eqs(M::AbstractMatrix)
-    algebraic_vars = vec(all(iszero, M, dims=1))
-    algebraic_eqs = vec(all(iszero, M, dims=2))
+    algebraic_vars = vec(all(iszero, M, dims = 1))
+    algebraic_eqs = vec(all(iszero, M, dims = 2))
     return algebraic_vars, algebraic_eqs
 end
 
diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl
index adf2c54d8e6..44803183385 100644
--- a/test/gpu/simple_dae.jl
+++ b/test/gpu/simple_dae.jl
@@ -41,7 +41,7 @@ tspan = (0.0, 5.0)
 odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype)
 prob = ODEProblem(odef, u0, tspan, p)
 sol_ref = solve(prob, Rodas5P())
-sol_ref_krylov = solve(prob, Rodas5P(linsolve=KrylovJL_GMRES()))
+sol_ref_krylov = solve(prob, Rodas5P(linsolve = KrylovJL_GMRES()))
 
 # GPU data -- we use F64 for higher accuracy for comparison
 u0_d = adapt(CuArray{Float64}, u0)
@@ -67,41 +67,41 @@ jac_prots = [
 #   csr_tol        = (; atol, rtol)  – gpu vs cpu (CSR jac_prot)
 # Only specify fields that deviate from the defaults.
 
-const DEFAULT_METHOD_TOL = (; atol = 1e-2, rtol = 1e-2)
-const DEFAULT_GPU_TOL    = (; atol = 1e-4, rtol = 1e-4)
+const DEFAULT_METHOD_TOL = (; atol = 1.0e-2, rtol = 1.0e-2)
+const DEFAULT_GPU_TOL = (; atol = 1.0e-4, rtol = 1.0e-4)
 
 solvers = [
     # ── Rosenbrock 2nd order ──
     Rosenbrock23 => (;
-        csc_tol = (; atol = 2e-4, rtol = 5e-4),
+        csc_tol = (; atol = 2.0e-4, rtol = 5.0e-4),
     ),
     Rosenbrock32 => (;
-        method_tol = (; atol = 2e-2, rtol = 1.5),
-        csc_tol    = (; atol = Inf, rtol = Inf),
+        method_tol = (; atol = 2.0e-2, rtol = 1.5),
+        csc_tol = (; atol = Inf, rtol = Inf),
     ),
     ROS2 => (;),
     ROS2PR => (;
-        csc_tol = (; atol = 3e-4, rtol = 4e-4),
+        csc_tol = (; atol = 3.0e-4, rtol = 4.0e-4),
     ),
-    ROS2S  => (;
-        csc_tol = (; atol = 7e-4, rtol = 1.2e-3),
+    ROS2S => (;
+        csc_tol = (; atol = 7.0e-4, rtol = 1.2e-3),
     ),
     # ── Rosenbrock 3rd order ──
-    ROS3     => (;),
-    ROS3PR   => (;
+    ROS3 => (;),
+    ROS3PR => (;
         method_tol = (; atol = 0.25, rtol = 15.0),
     ),
-    ROS3PRL  => (;
-        csc_tol = (; atol = 2e-3, rtol = 4e-3),
+    ROS3PRL => (;
+        csc_tol = (; atol = 2.0e-3, rtol = 4.0e-3),
     ),
     ROS3PRL2 => (;
-        csc_tol = (; atol = 3e-3, rtol = 4e-3),
+        csc_tol = (; atol = 3.0e-3, rtol = 4.0e-3),
     ),
-    ROS3P    => (;
+    ROS3P => (;
         method_tol = (; atol = 0.25, rtol = 15.0),
     ),
-    Rodas3   => (;
-        csc_tol = (; atol = 2e-3, rtol = 3e-3),
+    Rodas3 => (;
+        csc_tol = (; atol = 2.0e-3, rtol = 3.0e-3),
     ),
     # Rodas23W()  # scalar indexing, requires large changes to `calculate_interpoldiff!`
     # Rodas3P()   # scalar indexing
@@ -113,65 +113,65 @@ solvers = [
     ROS34PW1b => (;
         method_tol = (; atol = 0.25, rtol = 5.0),
     ),
-    ROS34PW2  => (;
+    ROS34PW2 => (;
         csc_tol = (; atol = 1.2e-3, rtol = 2.2e-3),
     ),
-    ROS34PW3  => (;),
-    ROS34PRw  => (;
-        csc_tol = (; atol = 6e-4, rtol = 1e-3),
+    ROS34PW3 => (;),
+    ROS34PRw => (;
+        csc_tol = (; atol = 6.0e-4, rtol = 1.0e-3),
     ),
     RosShamp4 => (;),
-    Veldd4    => (;),
-    Velds4    => (;),
-    GRK4T     => (;),
-    GRK4A     => (;),
+    Veldd4 => (;),
+    Velds4 => (;),
+    GRK4T => (;),
+    GRK4A => (;),
     Ros4LStab => (;),
-    Rodas4    => (;),
-    Rodas42   => (;),
-    Rodas4P   => (;),
-    Rodas4P2  => (;),
-    ROK4a     => (;),
+    Rodas4 => (;),
+    Rodas42 => (;),
+    Rodas4P => (;),
+    Rodas4P2 => (;),
+    ROK4a => (;),
     # ── Rosenbrock 5th order ──
-    Rodas5   => (;),
-    Rodas5P  => (;),
+    Rodas5 => (;),
+    Rodas5P => (;),
     Rodas5Pe => (;),
     Rodas5Pr => (;),
     # ── Rosenbrock 6th order ──
-    Rodas6P  => (;),
+    Rodas6P => (;),
     # ── SDIRK (don't include fixed time step which need explicit dt) ──
     ImplicitEuler => (;),
     Trapezoid => (;
-        csc_tol = (; atol = 8e-3, rtol = 2.5e-2),
+        csc_tol = (; atol = 8.0e-3, rtol = 2.5e-2),
     ),
-    SDIRK2    => (;
+    SDIRK2 => (;
         csc_tol = (; atol = 1.5e-4, rtol = 3.5e-4),
     ),
-    Cash4     => (;
-        csc_tol = (; atol = 5e-3, rtol = 8e-3),
+    Cash4 => (;
+        csc_tol = (; atol = 5.0e-3, rtol = 8.0e-3),
     ),
-    Hairer4   => (;
-        csc_tol = (; atol = 8e-3, rtol = 6e-2),
+    Hairer4 => (;
+        csc_tol = (; atol = 8.0e-3, rtol = 6.0e-2),
     ),
-    Hairer42  => (;
-        csc_tol = (; atol = 7e-3, rtol = 5e-2),
+    Hairer42 => (;
+        csc_tol = (; atol = 7.0e-3, rtol = 5.0e-2),
     ),
     # ── BDF ──
     ABDF2 => (;
         method_csc_tol = (; atol = Inf, rtol = Inf),   # ABDF2 + Krylov diverges vs Rodas5P + Krylov
-        csc_tol        = (; atol = 2e-4, rtol = 7e-4),
+        csc_tol = (; atol = 2.0e-4, rtol = 7.0e-4),
     ),
     QNDF1 => (;),
     QNDF2 => (;
-        csc_tol = (; atol = 4e-3, rtol = 5e-3),
+        csc_tol = (; atol = 4.0e-3, rtol = 5.0e-3),
     ),
     # QNDF()  # 🔧 DeviceMemory error in LinAlg
     QBDF1 => (;
-        method_tol = (; atol = 2e-2, rtol = 0.2),
+        method_tol = (; atol = 2.0e-2, rtol = 0.2),
     ),
     QBDF2 => (;
-        gpu_tol = (; atol = 2e-3, rtol = 2e-3),
-        csc_tol = (; atol = 4e-3, rtol = 7e-3),
-        csr_tol = (; atol = 2e-3, rtol = 2e-3),
+        gpu_tol = (; atol = 2.0e-3, rtol = 2.0e-3),
+        csc_tol = (; atol = 4.0e-3, rtol = 7.0e-3),
+        csr_tol = (; atol = 2.0e-3, rtol = 2.0e-3),
     ),
     # QBDF()  # DeviceMemory error in LinAlg
     # FBDF()  # scalar indexing -> needs extensive work on reinitFBDF!
@@ -222,12 +222,12 @@ function run_dae_tests()
         gtol = _get_tol(overrides, jn)
         mtol = _get_method_tol(overrides, jn)
 
-        # CSC will allways fall back to krylov so the ref solution should do to
+        # CSC will always fall back to krylov so the ref solution should do to
         krylov = (jn == "CSC")
         _sol_ref = krylov ? sol_ref_krylov : sol_ref
 
         # CPU: this solver vs reference
-        cpu_alg = krylov ? sv(linsolve=KrylovJL_GMRES()) : sv()
+        cpu_alg = krylov ? sv(linsolve = KrylovJL_GMRES()) : sv()
         sol_cpu = solve(prob, cpu_alg)
         cpu_abs, cpu_rel = maxerrs(sol_cpu, _sol_ref)
 
@@ -240,10 +240,14 @@ function run_dae_tests()
         gpu_abs, gpu_rel = maxerrs(sol_d, sol_cpu)
 
         method_passed = (cpu_abs < mtol.atol) || (cpu_rel < mtol.rtol)
-        gpu_passed    = (gpu_abs < gtol.atol) || (gpu_rel < gtol.rtol)
+        gpu_passed = (gpu_abs < gtol.atol) || (gpu_rel < gtol.rtol)
         passed = method_passed && gpu_passed
-        push!(results, (; solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel,
-                            gtol, mtol, method_passed, gpu_passed, passed, error = ""))
+        push!(
+            results, (;
+                solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel,
+                gtol, mtol, method_passed, gpu_passed, passed, error = "",
+            )
+        )
         @test passed
     end
 
@@ -251,7 +255,7 @@ function run_dae_tests()
 end
 
 function show_results(results)
-    function _fmt(val; threshold = 1e-3)
+    function _fmt(val; threshold = 1.0e-3)
         isnan(val) && return "   ---  "
         s = @sprintf("%.2e", val)
         if val > threshold
@@ -269,7 +273,7 @@ function show_results(results)
             printstyled(label, "ERROR: ", r.error, "\n"; color = :yellow)
         else
             print(label)
-            print(_fmt(r.cpu_abs, threshold=1e-2), "  ", _fmt(r.cpu_rel, threshold=1e-2), "  ")
+            print(_fmt(r.cpu_abs, threshold = 1.0e-2), "  ", _fmt(r.cpu_rel, threshold = 1.0e-2), "  ")
             print(_fmt(r.gpu_abs), "  ", _fmt(r.gpu_rel), "  ")
             if r.passed
                 printstyled("✓\n"; color = :green)
@@ -282,7 +286,7 @@ function show_results(results)
             end
         end
     end
-    println("-"^85)
+    return println("-"^85)
 end
 
 @testset "GPU DAE solver compatibility" begin

From 7d1d6a1d6d4980c3f1c8e66074baca42a450be2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hans=20W=C3=BCrfel?= <git@wuerfel.io>
Date: Mon, 23 Feb 2026 14:03:51 +0100
Subject: [PATCH 11/11] fix missing imports/packages

---
 .../test/algebraic_vars_detection_tests.jl                  | 1 +
 test/gpu/Project.toml                                       | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl b/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl
index 3b3b3caa1af..0c88682844c 100644
--- a/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl
+++ b/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl
@@ -1,6 +1,7 @@
 using Test
 using SparseArrays
 using OrdinaryDiffEqCore: find_algebraic_vars_eqs
+using LinearAlgebra
 
 @testset "Sparse Algebraic Detection Performance" begin
     # Test 1: Correctness - results should match between sparse and dense methods
diff --git a/test/gpu/Project.toml b/test/gpu/Project.toml
index dd879bde104..478a27e51ad 100644
--- a/test/gpu/Project.toml
+++ b/test/gpu/Project.toml
@@ -1,6 +1,7 @@
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
@@ -25,7 +26,8 @@ OrdinaryDiffEqRosenbrock = {path = "../../lib/OrdinaryDiffEqRosenbrock"}
 
 [compat]
 Adapt = "4"
-CUDA = "4, 5"
+CUDA = "5"
+CUDSS = "0.6.7"
 ComponentArrays = "0.15"
 DiffEqBase = "6.182"
 FFTW = "1.8"
@@ -33,9 +35,11 @@ FastBroadcast = "0.3"
 FillArrays = "1"
 OrdinaryDiffEq = "6"
 OrdinaryDiffEqBDF = "1"
+OrdinaryDiffEqFIRK = "1"
 OrdinaryDiffEqNonlinearSolve = "1"
 OrdinaryDiffEqRKIP = "1"
 OrdinaryDiffEqRosenbrock = "1"
+OrdinaryDiffEqSDIRK = "1"
 RecursiveArrayTools = "3"
 SciMLBase = "2.99"
 SciMLOperators = "1.3"