diff --git a/examples/custom_potential.jl b/examples/custom_potential.jl
index 43f166df6f..edf765fafc 100644
--- a/examples/custom_potential.jl
+++ b/examples/custom_potential.jl
@@ -21,12 +21,12 @@ CustomPotential() = CustomPotential(1.0, 0.5);
 
 # We extend the two methods providing access to the real and Fourier
 # representation of the potential to DFTK.
-function DFTK.local_potential_real(el::CustomPotential, r::Real)
-    -el.α / (√(2π) * el.L) * exp(- (r / el.L)^2 / 2)
+function DFTK.local_potential_real(el::CustomPotential, rs::Vector)
+    map(r -> -el.α / (√(2π) * el.L) * exp(- (r / el.L)^2 / 2), rs)
 end
-function DFTK.local_potential_fourier(el::CustomPotential, p::Real)
+function DFTK.local_potential_fourier(el::CustomPotential, ps::Vector)
     ## = ∫ V(r) exp(-ix⋅p) dx
-    -el.α * exp(- (p * el.L)^2 / 2)
+    map(p -> -el.α * exp(- (p * el.L)^2 / 2), ps)
 end
 
 # !!! tip "Gaussian potentials and DFTK"
diff --git a/src/density_methods.jl b/src/density_methods.jl
index f1a5523e6c..cb889e3e79 100644
--- a/src/density_methods.jl
+++ b/src/density_methods.jl
@@ -180,17 +180,18 @@ function atomic_density_form_factors(basis::PlaneWaveBasis{T},
         p = norm(G)
         iG2ifnorm_cpu[iG] = get!(norm_indices, p, length(norm_indices) + 1)
     end
+    iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu)
 
-    form_factors_cpu = zeros(T, length(norm_indices), length(basis.model.atom_groups))
-    for (p, ifnorm) in norm_indices
-        for (igroup, group) in enumerate(basis.model.atom_groups)
-            element = basis.model.atoms[first(group)]
-            form_factors_cpu[ifnorm, igroup] = atomic_density(element, p, method)
-        end
+    ni_pairs = collect(pairs(norm_indices))
+    ps = to_device(basis.architecture, first.(ni_pairs))
+    indices = to_device(basis.architecture, last.(ni_pairs))
+
+    form_factors = similar(ps, length(norm_indices), length(basis.model.atom_groups))
+    for (igroup, group) in enumerate(basis.model.atom_groups)
+        element = basis.model.atoms[first(group)]
+        @inbounds form_factors[indices, igroup] .= atomic_density(element, ps, method)
     end
 
-    form_factors = to_device(basis.architecture, form_factors_cpu)
-    iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu)
     (; form_factors, iG2ifnorm)
 end
 
@@ -214,6 +215,7 @@ function atomic_density(element::Element, Gnorm::T,
     has_core_density(element) ? core_charge_density_fourier(element, Gnorm) : zero(T)
 end
 
+#TODO: vectorize and check GPU
 function atomic_density(element::Element, Gnorm::T,
                         ::CoreKineticEnergyDensity)::T where {T <: Real}
     if has_core_kinetic_energy_density(element)
@@ -223,6 +225,25 @@ function atomic_density(element::Element, Gnorm::T,
     end
 end
 
+# Generic vectoriezed version of the above
+function atomic_density(element::Element, Gnorms::AbstractVector{T},
+                        method::AtomicDensity) where {T <: Real}
+    arch = architecture(Gnorms)
+    to_device(arch,
+              map(Gnorm -> atomic_density(element, Gnorm, method),
+                  to_cpu(Gnorms)))
+end
+
+# Vectorized version for CoreDensity, GPU optimized
+function atomic_density(element::Element, Gnorms::AbstractVector{T},
+                        ::CoreDensity) where {T <: Real}
+    if has_core_density(element)
+        core_charge_density_fourier(element, Gnorms)
+    else
+        zeros_like(Gnorms)
+    end
+end
+
 # Get the lengthscale of the valence density for an atom with `n_elec_core` core
 # and `n_elec_valence` valence electrons.
 function atom_decay_length(n_elec_core, n_elec_valence)
diff --git a/src/elements.jl b/src/elements.jl
index 43771c20e4..433f8731e9 100644
--- a/src/elements.jl
+++ b/src/elements.jl
@@ -43,7 +43,7 @@ eval_psp_energy_correction(T, ::Element) = zero(T)
 eval_psp_energy_correction(psp::Element) = eval_psp_energy_correction(Float64, psp)
 
 # Fall back to the Gaussian table for Elements without pseudopotentials
-function valence_charge_density_fourier(el::Element, p::T)::T where {T <: Real}
+function valence_charge_density_fourier(el::Element, p)
     gaussian_valence_charge_density_fourier(el, p)
 end
 
@@ -51,6 +51,10 @@ end
 function gaussian_valence_charge_density_fourier(el::Element, p::T)::T where {T <: Real}
     charge_ionic(el) * exp(-(p * atom_decay_length(el))^2)
 end
+function gaussian_valence_charge_density_fourier(el::Element, ps::AbstractVector{T}) where {T <: Real}
+    arch = architecture(ps)
+    to_device(arch, map(p -> gaussian_valence_charge_density_fourier(el, p), to_cpu(ps)))
+end
 
 function core_charge_density_fourier(::Element, ::T)::T where {T <: Real}
     error("Abstract elements do not necesesarily provide core charge density.")
@@ -60,12 +64,6 @@ function core_kinetic_energy_density_fourier(::Element, ::T)::T where {T <: Real
     error("Abstract elements do not necesesarily provide core kinetic energy density.")
 end
 
-# Generic vectorized version of local_potential_fourier, GPU-safe
-function local_potential_fourier(el::Element, ps::AbstractVector{T}) where {T <: Real}
-    arch = architecture(ps)
-    to_device(arch, map(p -> local_potential_fourier(el, p), to_cpu(ps)))
-end
-
 Base.show(io::IO, el::Element) = print(io, "$(typeof(el))(:$(species(el)))")
 
 #
@@ -175,26 +173,17 @@ has_core_density(el::ElementPsp)  = has_core_density(el.psp)
 has_core_kinetic_energy_density(el::ElementPsp) = has_core_kinetic_energy_density(el.psp)
 eval_psp_energy_correction(T, el::ElementPsp) = eval_psp_energy_correction(T, el.psp)
 
-function local_potential_fourier(el::ElementPsp, p::T) where {T <: Real}
-    eval_psp_local_fourier(el.psp, p)
-end
-# Vectorized version of the above
-function local_potential_fourier(el::ElementPsp, ps::AbstractVector{T}) where {T <: Real}
-    eval_psp_local_fourier(el.psp, ps)
-end
-local_potential_real(el::ElementPsp, r::Real) = eval_psp_local_real(el.psp, r)
+local_potential_fourier(el::ElementPsp, p) = eval_psp_local_fourier(el.psp, p)
+local_potential_real(el::ElementPsp, r) = eval_psp_local_real(el.psp, r)
 
-function valence_charge_density_fourier(el::ElementPsp, p::T) where {T <: Real}
+function valence_charge_density_fourier(el::ElementPsp, p)
     if has_valence_density(el.psp)
         eval_psp_valence_density_fourier(el.psp, p)
     else
         gaussian_valence_charge_density_fourier(el, p)
     end
 end
-function core_charge_density_fourier(el::ElementPsp, p::T) where {T <: Real}
-    eval_psp_core_density_fourier(el.psp, p)
-end
-
+core_charge_density_fourier(el::ElementPsp, p) = eval_psp_core_density_fourier(el.psp, p)
 
 #
 # ElementCohenBergstresser
@@ -264,7 +253,6 @@ function local_potential_fourier(el::ElementCohenBergstresser, p::T) where {T <:
 end
 # TODO Strictly speaking needs a eval_psp_energy_correction
 
-
 #
 # ElementGaussian
 #
@@ -297,6 +285,24 @@ function local_potential_fourier(el::ElementGaussian, p::Real)
 end
 # TODO Strictly speaking needs a eval_psp_energy_correction
 
+# Generic API expectations: all element functions taking a real space scalar |r| or a
+# reciprocal space scalar |p| should have a vectorized version accepting vectors of |r| or |p|.
+# The following loop vectorizes element functions by calling the single-value version
+# elementwise. This is GPU safe and generic. Performance critical functions should have their
+# own GPU-optimized implementation. Note: explicit loop over Element types in order to avoid
+# ambiguities with the specific ElementPsp functions.
+for fn in [:gaussian_valence_charge_density_fourier, :core_charge_density_fourier,
+           :local_potential_fourier, :local_potential_real]
+    for el_type in [ElementCoulomb, ElementCohenBergstresser, ElementGaussian]
+        @eval begin
+            function DFTK.$fn(el::$el_type, arg::AbstractVector{T}) where {T <: Real}
+                arch = architecture(arg)
+                to_device(arch, map(p -> $fn(el, p), to_cpu(arg)))
+            end
+        end
+    end
+end
+
 #
 # Helper functions
 #
diff --git a/src/pseudo/NormConservingPsp.jl b/src/pseudo/NormConservingPsp.jl
index a600f7d77f..6efc580007 100644
--- a/src/pseudo/NormConservingPsp.jl
+++ b/src/pseudo/NormConservingPsp.jl
@@ -20,7 +20,6 @@ abstract type NormConservingPsp end
 # eval_psp_projector_fourier(psp, i, l, ps::AbstrcatArray{<Real})
 # eval_psp_local_real(psp, r::Real)
 # eval_psp_local_fourier(psp, p::Real)
-# eval_psp_local_fourier(psp, ps::AbstractArray{<Real})
 # eval_psp_energy_correction(T::Type, psp)
 
 #### Optional methods:
@@ -36,6 +35,42 @@ abstract type NormConservingPsp end
 # count_n_pswfc_radial(psp, l::Integer)
 # pswfc_label(psp, i::Integer, l::Integer)
 
+#### Vectorization: all methods are expected to accept vectorized inputs, namely:
+# eval_psp_projector_real(psp, i, l, r::AbstractVector{<Real})
+# eval_psp_projector_fourier(psp, i, l, p::AbstractVector{<Real})
+# eval_psp_local_real(psp, r::AbstractVector{<Real})
+# eval_psp_local_fourier(psp, p::AbstractVector{<Real})
+# eval_psp_valence_density_real(psp, r::AbstractVector{<Real})
+# eval_psp_valence_density_fourier(psp, p::AbstractVector{<Real})
+# eval_psp_core_density_real(psp, r::AbstractVector{<Real})
+# eval_psp_core_density_fourier(psp, p::AbstractVector{<Real})
+# eval_psp_core_kinetic_energy_density_real(psp, r::AbstractVector{<Real})
+# eval_psp_core_kinetic_energy_density_fourier(psp, p::AbstractVector{<Real})
+# eval_psp_pswfc_real(psp, i, l, p::AbstractVector{<Real})
+# eval_psp_pswfc_fourier(psp, i, l, p::AbstractVector{<Real})
+
+# Macros that vectorize a given method for a Psp type by calling an existing scalar
+# version elementwise. Safe for GPUArray inputs. Performance critical methods should
+# have their own GPU-optimized implementation instead of relying on this macro. The
+# different norm-conserving pseudopotential types are responsible for the implementation
+# of vectorized functions, whether by using these macros or not.
+macro vectorize_psp_function(PspType, fn)
+    quote
+        function $fn(psp::$PspType, vec::AbstractVector{T}) where {T <: Real}
+            arch = architecture(vec)
+            to_device(arch, map(p -> $fn(psp, p), to_cpu(vec)))
+        end
+    end
+end
+macro vectorize_psp_projector_function(PspType, fn)
+    quote
+        function $fn(psp::$PspType, i, l, vec::AbstractVector{T}) where {T <: Real}
+            arch = architecture(vec)
+            to_device(arch, map(p -> $fn(psp, i, l, p), to_cpu(vec)))
+        end
+    end
+end
+
 """
     eval_psp_projector_real(psp, i, l, r)
 
@@ -97,12 +132,6 @@ V_{\rm loc}(p) &= ∫_{ℝ^3} (V_{\rm loc}(r) - C(r)) e^{-ip·r} dr + F[C(r)] \\
 eval_psp_local_fourier(psp::NormConservingPsp, p::AbstractVector) =
     eval_psp_local_fourier(psp, norm(p))
 
-# Fallback vectorized implementation for non GPU-optimized code.
-function eval_psp_local_fourier(psp::NormConservingPsp, ps::AbstractVector{T}) where {T <: Real}
-    arch = architecture(ps)
-    to_device(arch, map(p -> eval_psp_local_fourier(psp, p), to_cpu(ps)))
-end
-
 @doc raw"""
     eval_psp_energy_correction([T=Float64,] psp::NormConservingPsp)
     eval_psp_energy_correction([T=Float64,] element::Element)
@@ -184,7 +213,6 @@ eval_psp_core_kinetic_energy_density_fourier(::NormConservingPsp, ::T) where {T
 eval_psp_core_kinetic_energy_density_fourier(psp::NormConservingPsp, p::AbstractVector) =
     eval_psp_core_kinetic_energy_density_fourier(psp, norm(p))
 
-
 #### Methods defined on a NormConservingPsp
 import Base.Broadcast.broadcastable
 Base.Broadcast.broadcastable(psp::NormConservingPsp) = Ref(psp)
@@ -256,4 +284,4 @@ function find_pswfc(psp::NormConservingPsp, label::String)
     end
     error("Could not find pseudo atomic orbital with label $label "
           * "in pseudopotential $(psp).")
-end
+end
\ No newline at end of file
diff --git a/src/pseudo/PspHgh.jl b/src/pseudo/PspHgh.jl
index 52079c16c6..ad181e6fca 100644
--- a/src/pseudo/PspHgh.jl
+++ b/src/pseudo/PspHgh.jl
@@ -122,6 +122,7 @@ function eval_psp_local_fourier(psp::PspHgh, p::T) where {T <: Real}
 
     4T(π) * rloc^2 * (-Zion + sqrt(T(π) / 2) * rloc * t^2 * P) * exp(-t^2 / 2) / t^2
 end
+@vectorize_psp_function PspHgh DFTK.eval_psp_local_fourier
 
 # [GTH98] (1)
 function eval_psp_local_real(psp::PspHgh, r::T) where {T <: Real}
@@ -133,6 +134,7 @@ function eval_psp_local_real(psp::PspHgh, r::T) where {T <: Real}
         + exp(-rr^2 / 2) * (cloc[1] + cloc[2] * rr^2 + cloc[3] * rr^4 + cloc[4] * rr^6)
     )
 end
+@vectorize_psp_function PspHgh DFTK.eval_psp_local_real
 
 
 # [HGH98] (7-15) except they do it with plane waves normalized by 1/sqrt(Ω).
@@ -161,7 +163,7 @@ function eval_psp_projector_fourier(psp::PspHgh, i, l, p::T) where {T <: Real}
 
     error("Not implemented for l=$l and i=$i")
 end
-
+@vectorize_psp_projector_function PspHgh DFTK.eval_psp_projector_fourier
 
 # [HGH98] (3)
 function eval_psp_projector_real(psp::PspHgh, i, l, r::T) where {T <: Real}
@@ -169,6 +171,7 @@ function eval_psp_projector_real(psp::PspHgh, i, l, r::T) where {T <: Real}
     ired = (4i - 1) / T(2)
     sqrt(T(2)) * r^(l + 2(i - 1)) * exp(-r^2 / 2rp^2) / rp^(l + ired) / sqrt(gamma(l + ired))
 end
+@vectorize_psp_projector_function PspHgh DFTK.eval_psp_projector_real
 
 function eval_psp_energy_correction(T, psp::PspHgh)
     # By construction we need to compute the DC component of the difference
diff --git a/src/pseudo/PspLinComb.jl b/src/pseudo/PspLinComb.jl
index ac66fcc0b1..f3e3d1b452 100644
--- a/src/pseudo/PspLinComb.jl
+++ b/src/pseudo/PspLinComb.jl
@@ -76,6 +76,15 @@ macro make_psplincomb_call(fn)
         end
     end
 end
+
+macro make_psplincomb_call_vectorized(fn)
+    quote
+        function $fn(psp::PspLinComb, arg::AbstractVector{<:Real})
+            sum(c * $fn(pp, arg) for (c, pp) in zip(psp.coefficients, psp.pseudos))
+        end
+    end
+end
+
 @make_psplincomb_call DFTK.eval_psp_local_real
 @make_psplincomb_call DFTK.eval_psp_local_fourier
 @make_psplincomb_call DFTK.eval_psp_valence_density_real
@@ -84,3 +93,14 @@ end
 @make_psplincomb_call DFTK.eval_psp_core_density_fourier
 @make_psplincomb_call DFTK.eval_psp_core_kinetic_energy_density_real
 @make_psplincomb_call DFTK.eval_psp_core_kinetic_energy_density_fourier
+
+@vectorize_psp_function PspLinComb DFTK.eval_psp_local_real
+@vectorize_psp_function PspLinComb DFTK.eval_psp_local_fourier
+@vectorize_psp_function PspLinComb DFTK.eval_psp_valence_density_real
+@vectorize_psp_function PspLinComb DFTK.eval_psp_valence_density_fourier
+@vectorize_psp_function PspLinComb DFTK.eval_psp_core_density_real
+@vectorize_psp_function PspLinComb DFTK.eval_psp_core_density_fourier
+@vectorize_psp_function PspLinComb DFTK.eval_psp_core_kinetic_energy_density_real
+@vectorize_psp_function PspLinComb DFTK.eval_psp_core_kinetic_energy_density_fourier
+@vectorize_psp_projector_function PspLinComb DFTK.eval_psp_projector_real
+@vectorize_psp_projector_function PspLinComb DFTK.eval_psp_projector_fourier
\ No newline at end of file
diff --git a/src/pseudo/PspUpf.jl b/src/pseudo/PspUpf.jl
index 804e36c24c..8ab775e43c 100644
--- a/src/pseudo/PspUpf.jl
+++ b/src/pseudo/PspUpf.jl
@@ -182,6 +182,7 @@ has_core_kinetic_energy_density(psp::PspUpf) = !all(iszero, psp.r2_τcore)
 function eval_psp_projector_real(psp::PspUpf, i, l, r::T)::T where {T<:Real}
     psp.r2_projs_interp[l+1][i](r) / r^2  # TODO if r is below a threshold, return zero
 end
+@vectorize_psp_projector_function PspUpf DFTK.eval_psp_projector_real
 
 function eval_psp_projector_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real}
     # The projectors may have been cut off before the end of the radial mesh
@@ -214,6 +215,7 @@ pswfc_label(psp::PspUpf, i, l) = psp.pswfc_labels[l+1][i]
 function eval_psp_pswfc_real(psp::PspUpf, i, l, r::T)::T where {T<:Real}
     psp.r2_pswfcs_interp[l+1][i](r) / r^2  # TODO if r is below a threshold, return zero
 end
+@vectorize_psp_projector_function PspUpf DFTK.eval_psp_pswfc_real
 
 function eval_psp_pswfc_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real}
     # Pseudo-atomic wavefunctions are _not_ currently cut off like the other
@@ -222,8 +224,10 @@ function eval_psp_pswfc_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real}
     # If issues arise, try cutting them off too.
     return hankel(psp.rgrid, psp.r2_pswfcs[l+1][i], l, p)
 end
+@vectorize_psp_projector_function PspUpf DFTK.eval_psp_pswfc_fourier
 
 eval_psp_local_real(psp::PspUpf, r::T) where {T<:Real} = psp.vloc_interp(r)
+@vectorize_psp_function PspUpf DFTK.eval_psp_local_real
 
 # Low-level function for the local part of the pseudopotential in reciprocal space
 function _eval_psp_local_fourier(quadrature, rgrid, vloc, Zion, p::T)::T where {T<:Real}
@@ -247,7 +251,7 @@ function eval_psp_local_fourier(psp::PspUpf, p::T) where {T<:Real}
     _eval_psp_local_fourier(quadrature, rgrid, vloc, psp.Zion, p)
 end
 
-# Vectorized version of the above, GPU compatible
+# Vectorized version of the above, GPU optimized
 function eval_psp_local_fourier(psp::PspUpf, ps::AbstractVector{T}) where {T<:Real}
     quadrature = default_psp_quadrature(psp.rgrid)
     arch = architecture(ps)
@@ -264,16 +268,19 @@ end
 function eval_psp_valence_density_real(psp::PspUpf, r::T) where {T<:Real}
     psp.r2_ρion_interp(r) / r^2  # TODO if r is below a threshold, return zero
 end
+@vectorize_psp_function PspUpf DFTK.eval_psp_valence_density_real
 
 function eval_psp_valence_density_fourier(psp::PspUpf, p::T) where {T<:Real}
     rgrid = @view psp.rgrid[1:psp.ircut]
     r2_ρion = @view psp.r2_ρion[1:psp.ircut]
     return hankel(rgrid, r2_ρion, 0, p)
 end
+@vectorize_psp_function PspUpf DFTK.eval_psp_valence_density_fourier
 
 function eval_psp_core_density_real(psp::PspUpf, r::T) where {T<:Real}
     psp.r2_ρcore_interp(r) / r^2  # TODO if r is below a threshold, return zero
 end
+@vectorize_psp_function PspUpf DFTK.eval_psp_core_density_real
 
 function eval_psp_core_density_fourier(psp::PspUpf, p::T) where {T<:Real}
     rgrid = @view psp.rgrid[1:psp.ircut]
@@ -281,15 +288,30 @@ function eval_psp_core_density_fourier(psp::PspUpf, p::T) where {T<:Real}
     return hankel(rgrid, r2_ρcore, 0, p)
 end
 
+# Vectorized version of the above, GPU optimized
+function eval_psp_core_density_fourier(psp::PspUpf, ps::AbstractVector{T}) where {T<:Real}
+    quadrature = default_psp_quadrature(psp.rgrid)
+    arch = architecture(ps)
+    rgrid = to_device(arch, @view psp.rgrid[1:psp.ircut])
+    r2_ρcore = to_device(arch, @view psp.r2_ρcore[1:psp.ircut])
+    map(ps) do p
+        # GPU kernels with dynamic function calls do not compile,
+        # hence the pre-determined explicit integration function
+        hankel(quadrature, rgrid, r2_ρcore, 0, p)
+    end
+end
+
 function eval_psp_core_kinetic_energy_density_real(psp::PspUpf, r::T) where {T<:Real}
     psp.r2_τcore_interp(r) / r^2  # TODO if r is below a threshold, return zero
 end
+@vectorize_psp_function PspUpf DFTK.eval_psp_core_kinetic_energy_density_real
 
 function eval_psp_core_kinetic_energy_density_fourier(psp::PspUpf, p::T) where {T<:Real}
     rgrid = @view psp.rgrid[1:psp.ircut]
     r2_τcore = @view psp.r2_τcore[1:psp.ircut]
     return hankel(rgrid, r2_τcore, 0, p)
 end
+@vectorize_psp_function PspUpf DFTK.eval_psp_core_kinetic_energy_density_fourier
 
 function eval_psp_energy_correction(T, psp::PspUpf)
     rgrid = @view psp.rgrid[1:psp.ircut]
diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index f28fe9cac8..2af87e37dc 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -304,6 +304,13 @@ end
 end
 
 function hankel(r::AbstractVector, r2_f::AbstractVector, l::Integer, p::TT) where {TT <: ForwardDiff.Dual}
+    quadrature = default_psp_quadrature(r)
+    hankel(quadrature, r, r2_f, l, p)
+end
+
+# For GPU kernels to compile, dynamic function calls must be avoided. Therefore, the quadrature
+# must be known and passed ahead of time, and the Dual type explicitly parametrized.
+function hankel(quadrature, r::AbstractVector, r2_f::AbstractVector, l::Integer, p::Dual{Tg,V,N}) where {Tg,V,N}
     # This custom rule uses two properties of the hankel transform:
     #   d H[f] / dp = 4\pi \int_0^∞ r^2 f(r) j_l'(p⋅r)⋅r dr
     # and that
@@ -314,20 +321,16 @@ function hankel(r::AbstractVector, r2_f::AbstractVector, l::Integer, p::TT) wher
     # the tricky bit is to exploit that one needs both the j_l'(p⋅r) and j_l(p⋅r) values
     # but one does not want to precompute and allocate them into arrays
     # TODO Investigate custom rules for bessels and integration
-
-    T  = ForwardDiff.valtype(TT)
     pv = ForwardDiff.value(p)
 
-    jl = sphericalbesselj_fast.(l, pv .* r)
-    value = 4T(π) * simpson((i, r) -> r2_f[i] * jl[i], r)
-
-    if iszero(pv)
-        return TT(value, zero(T) * ForwardDiff.partials(p))
-    end
-    derivative = 4T(π) * simpson(r) do i, r
-        (r2_f[i] * (l * jl[i] / pv - r * sphericalbesselj_fast(l+1, pv * r)))
+    # To reduce allocations, compute value and derivative simultaneously, using a SVector as integrand
+    value, derivative = 4V(π) * quadrature(r) do i, r
+        jl_i = sphericalbesselj_fast(l, pv * r)
+        val = r2_f[i] * jl_i
+        deriv = iszero(pv) ? zero(V) : r2_f[i] * (l * jl_i / pv - r * sphericalbesselj_fast(l+1, pv * r))
+        SVector(val, deriv)
     end
-    TT(value, derivative * ForwardDiff.partials(p))
+    Dual{Tg,V,N}(value, derivative * ForwardDiff.partials(p))
 end
 
 # other workarounds