diff --git a/examples/custom_potential.jl b/examples/custom_potential.jl index 43f166df6f..edf765fafc 100644 --- a/examples/custom_potential.jl +++ b/examples/custom_potential.jl @@ -21,12 +21,12 @@ CustomPotential() = CustomPotential(1.0, 0.5); # We extend the two methods providing access to the real and Fourier # representation of the potential to DFTK. -function DFTK.local_potential_real(el::CustomPotential, r::Real) - -el.α / (√(2π) * el.L) * exp(- (r / el.L)^2 / 2) +function DFTK.local_potential_real(el::CustomPotential, rs::Vector) + map(r -> -el.α / (√(2π) * el.L) * exp(- (r / el.L)^2 / 2), rs) end -function DFTK.local_potential_fourier(el::CustomPotential, p::Real) +function DFTK.local_potential_fourier(el::CustomPotential, ps::Vector) ## = ∫ V(r) exp(-ix⋅p) dx - -el.α * exp(- (p * el.L)^2 / 2) + map(p -> -el.α * exp(- (p * el.L)^2 / 2), ps) end # !!! tip "Gaussian potentials and DFTK" diff --git a/src/density_methods.jl b/src/density_methods.jl index f1a5523e6c..cb889e3e79 100644 --- a/src/density_methods.jl +++ b/src/density_methods.jl @@ -180,17 +180,18 @@ function atomic_density_form_factors(basis::PlaneWaveBasis{T}, p = norm(G) iG2ifnorm_cpu[iG] = get!(norm_indices, p, length(norm_indices) + 1) end + iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu) - form_factors_cpu = zeros(T, length(norm_indices), length(basis.model.atom_groups)) - for (p, ifnorm) in norm_indices - for (igroup, group) in enumerate(basis.model.atom_groups) - element = basis.model.atoms[first(group)] - form_factors_cpu[ifnorm, igroup] = atomic_density(element, p, method) - end + ni_pairs = collect(pairs(norm_indices)) + ps = to_device(basis.architecture, first.(ni_pairs)) + indices = to_device(basis.architecture, last.(ni_pairs)) + + form_factors = similar(ps, length(norm_indices), length(basis.model.atom_groups)) + for (igroup, group) in enumerate(basis.model.atom_groups) + element = basis.model.atoms[first(group)] + @inbounds form_factors[indices, igroup] .= atomic_density(element, ps, method) end - form_factors = to_device(basis.architecture, form_factors_cpu) - iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu) (; form_factors, iG2ifnorm) end @@ -214,6 +215,7 @@ function atomic_density(element::Element, Gnorm::T, has_core_density(element) ? core_charge_density_fourier(element, Gnorm) : zero(T) end +#TODO: vectorize and check GPU function atomic_density(element::Element, Gnorm::T, ::CoreKineticEnergyDensity)::T where {T <: Real} if has_core_kinetic_energy_density(element) @@ -223,6 +225,25 @@ function atomic_density(element::Element, Gnorm::T, end end +# Generic vectoriezed version of the above +function atomic_density(element::Element, Gnorms::AbstractVector{T}, + method::AtomicDensity) where {T <: Real} + arch = architecture(Gnorms) + to_device(arch, + map(Gnorm -> atomic_density(element, Gnorm, method), + to_cpu(Gnorms))) +end + +# Vectorized version for CoreDensity, GPU optimized +function atomic_density(element::Element, Gnorms::AbstractVector{T}, + ::CoreDensity) where {T <: Real} + if has_core_density(element) + core_charge_density_fourier(element, Gnorms) + else + zeros_like(Gnorms) + end +end + # Get the lengthscale of the valence density for an atom with `n_elec_core` core # and `n_elec_valence` valence electrons. function atom_decay_length(n_elec_core, n_elec_valence) diff --git a/src/elements.jl b/src/elements.jl index 43771c20e4..433f8731e9 100644 --- a/src/elements.jl +++ b/src/elements.jl @@ -43,7 +43,7 @@ eval_psp_energy_correction(T, ::Element) = zero(T) eval_psp_energy_correction(psp::Element) = eval_psp_energy_correction(Float64, psp) # Fall back to the Gaussian table for Elements without pseudopotentials -function valence_charge_density_fourier(el::Element, p::T)::T where {T <: Real} +function valence_charge_density_fourier(el::Element, p) gaussian_valence_charge_density_fourier(el, p) end @@ -51,6 +51,10 @@ end function gaussian_valence_charge_density_fourier(el::Element, p::T)::T where {T <: Real} charge_ionic(el) * exp(-(p * atom_decay_length(el))^2) end +function gaussian_valence_charge_density_fourier(el::Element, ps::AbstractVector{T}) where {T <: Real} + arch = architecture(ps) + to_device(arch, map(p -> gaussian_valence_charge_density_fourier(el, p), to_cpu(ps))) +end function core_charge_density_fourier(::Element, ::T)::T where {T <: Real} error("Abstract elements do not necesesarily provide core charge density.") @@ -60,12 +64,6 @@ function core_kinetic_energy_density_fourier(::Element, ::T)::T where {T <: Real error("Abstract elements do not necesesarily provide core kinetic energy density.") end -# Generic vectorized version of local_potential_fourier, GPU-safe -function local_potential_fourier(el::Element, ps::AbstractVector{T}) where {T <: Real} - arch = architecture(ps) - to_device(arch, map(p -> local_potential_fourier(el, p), to_cpu(ps))) -end - Base.show(io::IO, el::Element) = print(io, "$(typeof(el))(:$(species(el)))") # @@ -175,26 +173,17 @@ has_core_density(el::ElementPsp) = has_core_density(el.psp) has_core_kinetic_energy_density(el::ElementPsp) = has_core_kinetic_energy_density(el.psp) eval_psp_energy_correction(T, el::ElementPsp) = eval_psp_energy_correction(T, el.psp) -function local_potential_fourier(el::ElementPsp, p::T) where {T <: Real} - eval_psp_local_fourier(el.psp, p) -end -# Vectorized version of the above -function local_potential_fourier(el::ElementPsp, ps::AbstractVector{T}) where {T <: Real} - eval_psp_local_fourier(el.psp, ps) -end -local_potential_real(el::ElementPsp, r::Real) = eval_psp_local_real(el.psp, r) +local_potential_fourier(el::ElementPsp, p) = eval_psp_local_fourier(el.psp, p) +local_potential_real(el::ElementPsp, r) = eval_psp_local_real(el.psp, r) -function valence_charge_density_fourier(el::ElementPsp, p::T) where {T <: Real} +function valence_charge_density_fourier(el::ElementPsp, p) if has_valence_density(el.psp) eval_psp_valence_density_fourier(el.psp, p) else gaussian_valence_charge_density_fourier(el, p) end end -function core_charge_density_fourier(el::ElementPsp, p::T) where {T <: Real} - eval_psp_core_density_fourier(el.psp, p) -end - +core_charge_density_fourier(el::ElementPsp, p) = eval_psp_core_density_fourier(el.psp, p) # # ElementCohenBergstresser @@ -264,7 +253,6 @@ function local_potential_fourier(el::ElementCohenBergstresser, p::T) where {T <: end # TODO Strictly speaking needs a eval_psp_energy_correction - # # ElementGaussian # @@ -297,6 +285,24 @@ function local_potential_fourier(el::ElementGaussian, p::Real) end # TODO Strictly speaking needs a eval_psp_energy_correction +# Generic API expectations: all element functions taking a real space scalar |r| or a +# reciprocal space scalar |p| should have a vectorized version accepting vectors of |r| or |p|. +# The following loop vectorizes element functions by calling the single-value version +# elementwise. This is GPU safe and generic. Performance critical functions should have their +# own GPU-optimized implementation. Note: explicit loop over Element types in order to avoid +# ambiguities with the specific ElementPsp functions. +for fn in [:gaussian_valence_charge_density_fourier, :core_charge_density_fourier, + :local_potential_fourier, :local_potential_real] + for el_type in [ElementCoulomb, ElementCohenBergstresser, ElementGaussian] + @eval begin + function DFTK.$fn(el::$el_type, arg::AbstractVector{T}) where {T <: Real} + arch = architecture(arg) + to_device(arch, map(p -> $fn(el, p), to_cpu(arg))) + end + end + end +end + # # Helper functions # diff --git a/src/pseudo/NormConservingPsp.jl b/src/pseudo/NormConservingPsp.jl index a600f7d77f..6efc580007 100644 --- a/src/pseudo/NormConservingPsp.jl +++ b/src/pseudo/NormConservingPsp.jl @@ -20,7 +20,6 @@ abstract type NormConservingPsp end # eval_psp_projector_fourier(psp, i, l, ps::AbstrcatArray{ $fn(psp, p), to_cpu(vec))) + end + end +end +macro vectorize_psp_projector_function(PspType, fn) + quote + function $fn(psp::$PspType, i, l, vec::AbstractVector{T}) where {T <: Real} + arch = architecture(vec) + to_device(arch, map(p -> $fn(psp, i, l, p), to_cpu(vec))) + end + end +end + """ eval_psp_projector_real(psp, i, l, r) @@ -97,12 +132,6 @@ V_{\rm loc}(p) &= ∫_{ℝ^3} (V_{\rm loc}(r) - C(r)) e^{-ip·r} dr + F[C(r)] \\ eval_psp_local_fourier(psp::NormConservingPsp, p::AbstractVector) = eval_psp_local_fourier(psp, norm(p)) -# Fallback vectorized implementation for non GPU-optimized code. -function eval_psp_local_fourier(psp::NormConservingPsp, ps::AbstractVector{T}) where {T <: Real} - arch = architecture(ps) - to_device(arch, map(p -> eval_psp_local_fourier(psp, p), to_cpu(ps))) -end - @doc raw""" eval_psp_energy_correction([T=Float64,] psp::NormConservingPsp) eval_psp_energy_correction([T=Float64,] element::Element) @@ -184,7 +213,6 @@ eval_psp_core_kinetic_energy_density_fourier(::NormConservingPsp, ::T) where {T eval_psp_core_kinetic_energy_density_fourier(psp::NormConservingPsp, p::AbstractVector) = eval_psp_core_kinetic_energy_density_fourier(psp, norm(p)) - #### Methods defined on a NormConservingPsp import Base.Broadcast.broadcastable Base.Broadcast.broadcastable(psp::NormConservingPsp) = Ref(psp) @@ -256,4 +284,4 @@ function find_pswfc(psp::NormConservingPsp, label::String) end error("Could not find pseudo atomic orbital with label $label " * "in pseudopotential $(psp).") -end +end \ No newline at end of file diff --git a/src/pseudo/PspHgh.jl b/src/pseudo/PspHgh.jl index 52079c16c6..ad181e6fca 100644 --- a/src/pseudo/PspHgh.jl +++ b/src/pseudo/PspHgh.jl @@ -122,6 +122,7 @@ function eval_psp_local_fourier(psp::PspHgh, p::T) where {T <: Real} 4T(π) * rloc^2 * (-Zion + sqrt(T(π) / 2) * rloc * t^2 * P) * exp(-t^2 / 2) / t^2 end +@vectorize_psp_function PspHgh DFTK.eval_psp_local_fourier # [GTH98] (1) function eval_psp_local_real(psp::PspHgh, r::T) where {T <: Real} @@ -133,6 +134,7 @@ function eval_psp_local_real(psp::PspHgh, r::T) where {T <: Real} + exp(-rr^2 / 2) * (cloc[1] + cloc[2] * rr^2 + cloc[3] * rr^4 + cloc[4] * rr^6) ) end +@vectorize_psp_function PspHgh DFTK.eval_psp_local_real # [HGH98] (7-15) except they do it with plane waves normalized by 1/sqrt(Ω). @@ -161,7 +163,7 @@ function eval_psp_projector_fourier(psp::PspHgh, i, l, p::T) where {T <: Real} error("Not implemented for l=$l and i=$i") end - +@vectorize_psp_projector_function PspHgh DFTK.eval_psp_projector_fourier # [HGH98] (3) function eval_psp_projector_real(psp::PspHgh, i, l, r::T) where {T <: Real} @@ -169,6 +171,7 @@ function eval_psp_projector_real(psp::PspHgh, i, l, r::T) where {T <: Real} ired = (4i - 1) / T(2) sqrt(T(2)) * r^(l + 2(i - 1)) * exp(-r^2 / 2rp^2) / rp^(l + ired) / sqrt(gamma(l + ired)) end +@vectorize_psp_projector_function PspHgh DFTK.eval_psp_projector_real function eval_psp_energy_correction(T, psp::PspHgh) # By construction we need to compute the DC component of the difference diff --git a/src/pseudo/PspLinComb.jl b/src/pseudo/PspLinComb.jl index ac66fcc0b1..f3e3d1b452 100644 --- a/src/pseudo/PspLinComb.jl +++ b/src/pseudo/PspLinComb.jl @@ -76,6 +76,15 @@ macro make_psplincomb_call(fn) end end end + +macro make_psplincomb_call_vectorized(fn) + quote + function $fn(psp::PspLinComb, arg::AbstractVector{<:Real}) + sum(c * $fn(pp, arg) for (c, pp) in zip(psp.coefficients, psp.pseudos)) + end + end +end + @make_psplincomb_call DFTK.eval_psp_local_real @make_psplincomb_call DFTK.eval_psp_local_fourier @make_psplincomb_call DFTK.eval_psp_valence_density_real @@ -84,3 +93,14 @@ end @make_psplincomb_call DFTK.eval_psp_core_density_fourier @make_psplincomb_call DFTK.eval_psp_core_kinetic_energy_density_real @make_psplincomb_call DFTK.eval_psp_core_kinetic_energy_density_fourier + +@vectorize_psp_function PspLinComb DFTK.eval_psp_local_real +@vectorize_psp_function PspLinComb DFTK.eval_psp_local_fourier +@vectorize_psp_function PspLinComb DFTK.eval_psp_valence_density_real +@vectorize_psp_function PspLinComb DFTK.eval_psp_valence_density_fourier +@vectorize_psp_function PspLinComb DFTK.eval_psp_core_density_real +@vectorize_psp_function PspLinComb DFTK.eval_psp_core_density_fourier +@vectorize_psp_function PspLinComb DFTK.eval_psp_core_kinetic_energy_density_real +@vectorize_psp_function PspLinComb DFTK.eval_psp_core_kinetic_energy_density_fourier +@vectorize_psp_projector_function PspLinComb DFTK.eval_psp_projector_real +@vectorize_psp_projector_function PspLinComb DFTK.eval_psp_projector_fourier \ No newline at end of file diff --git a/src/pseudo/PspUpf.jl b/src/pseudo/PspUpf.jl index 804e36c24c..8ab775e43c 100644 --- a/src/pseudo/PspUpf.jl +++ b/src/pseudo/PspUpf.jl @@ -182,6 +182,7 @@ has_core_kinetic_energy_density(psp::PspUpf) = !all(iszero, psp.r2_τcore) function eval_psp_projector_real(psp::PspUpf, i, l, r::T)::T where {T<:Real} psp.r2_projs_interp[l+1][i](r) / r^2 # TODO if r is below a threshold, return zero end +@vectorize_psp_projector_function PspUpf DFTK.eval_psp_projector_real function eval_psp_projector_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real} # The projectors may have been cut off before the end of the radial mesh @@ -214,6 +215,7 @@ pswfc_label(psp::PspUpf, i, l) = psp.pswfc_labels[l+1][i] function eval_psp_pswfc_real(psp::PspUpf, i, l, r::T)::T where {T<:Real} psp.r2_pswfcs_interp[l+1][i](r) / r^2 # TODO if r is below a threshold, return zero end +@vectorize_psp_projector_function PspUpf DFTK.eval_psp_pswfc_real function eval_psp_pswfc_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real} # Pseudo-atomic wavefunctions are _not_ currently cut off like the other @@ -222,8 +224,10 @@ function eval_psp_pswfc_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real} # If issues arise, try cutting them off too. return hankel(psp.rgrid, psp.r2_pswfcs[l+1][i], l, p) end +@vectorize_psp_projector_function PspUpf DFTK.eval_psp_pswfc_fourier eval_psp_local_real(psp::PspUpf, r::T) where {T<:Real} = psp.vloc_interp(r) +@vectorize_psp_function PspUpf DFTK.eval_psp_local_real # Low-level function for the local part of the pseudopotential in reciprocal space function _eval_psp_local_fourier(quadrature, rgrid, vloc, Zion, p::T)::T where {T<:Real} @@ -247,7 +251,7 @@ function eval_psp_local_fourier(psp::PspUpf, p::T) where {T<:Real} _eval_psp_local_fourier(quadrature, rgrid, vloc, psp.Zion, p) end -# Vectorized version of the above, GPU compatible +# Vectorized version of the above, GPU optimized function eval_psp_local_fourier(psp::PspUpf, ps::AbstractVector{T}) where {T<:Real} quadrature = default_psp_quadrature(psp.rgrid) arch = architecture(ps) @@ -264,16 +268,19 @@ end function eval_psp_valence_density_real(psp::PspUpf, r::T) where {T<:Real} psp.r2_ρion_interp(r) / r^2 # TODO if r is below a threshold, return zero end +@vectorize_psp_function PspUpf DFTK.eval_psp_valence_density_real function eval_psp_valence_density_fourier(psp::PspUpf, p::T) where {T<:Real} rgrid = @view psp.rgrid[1:psp.ircut] r2_ρion = @view psp.r2_ρion[1:psp.ircut] return hankel(rgrid, r2_ρion, 0, p) end +@vectorize_psp_function PspUpf DFTK.eval_psp_valence_density_fourier function eval_psp_core_density_real(psp::PspUpf, r::T) where {T<:Real} psp.r2_ρcore_interp(r) / r^2 # TODO if r is below a threshold, return zero end +@vectorize_psp_function PspUpf DFTK.eval_psp_core_density_real function eval_psp_core_density_fourier(psp::PspUpf, p::T) where {T<:Real} rgrid = @view psp.rgrid[1:psp.ircut] @@ -281,15 +288,30 @@ function eval_psp_core_density_fourier(psp::PspUpf, p::T) where {T<:Real} return hankel(rgrid, r2_ρcore, 0, p) end +# Vectorized version of the above, GPU optimized +function eval_psp_core_density_fourier(psp::PspUpf, ps::AbstractVector{T}) where {T<:Real} + quadrature = default_psp_quadrature(psp.rgrid) + arch = architecture(ps) + rgrid = to_device(arch, @view psp.rgrid[1:psp.ircut]) + r2_ρcore = to_device(arch, @view psp.r2_ρcore[1:psp.ircut]) + map(ps) do p + # GPU kernels with dynamic function calls do not compile, + # hence the pre-determined explicit integration function + hankel(quadrature, rgrid, r2_ρcore, 0, p) + end +end + function eval_psp_core_kinetic_energy_density_real(psp::PspUpf, r::T) where {T<:Real} psp.r2_τcore_interp(r) / r^2 # TODO if r is below a threshold, return zero end +@vectorize_psp_function PspUpf DFTK.eval_psp_core_kinetic_energy_density_real function eval_psp_core_kinetic_energy_density_fourier(psp::PspUpf, p::T) where {T<:Real} rgrid = @view psp.rgrid[1:psp.ircut] r2_τcore = @view psp.r2_τcore[1:psp.ircut] return hankel(rgrid, r2_τcore, 0, p) end +@vectorize_psp_function PspUpf DFTK.eval_psp_core_kinetic_energy_density_fourier function eval_psp_energy_correction(T, psp::PspUpf) rgrid = @view psp.rgrid[1:psp.ircut] diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl index f28fe9cac8..2af87e37dc 100644 --- a/src/workarounds/forwarddiff_rules.jl +++ b/src/workarounds/forwarddiff_rules.jl @@ -304,6 +304,13 @@ end end function hankel(r::AbstractVector, r2_f::AbstractVector, l::Integer, p::TT) where {TT <: ForwardDiff.Dual} + quadrature = default_psp_quadrature(r) + hankel(quadrature, r, r2_f, l, p) +end + +# For GPU kernels to compile, dynamic function calls must be avoided. Therefore, the quadrature +# must be known and passed ahead of time, and the Dual type explicitly parametrized. +function hankel(quadrature, r::AbstractVector, r2_f::AbstractVector, l::Integer, p::Dual{Tg,V,N}) where {Tg,V,N} # This custom rule uses two properties of the hankel transform: # d H[f] / dp = 4\pi \int_0^∞ r^2 f(r) j_l'(p⋅r)⋅r dr # and that @@ -314,20 +321,16 @@ function hankel(r::AbstractVector, r2_f::AbstractVector, l::Integer, p::TT) wher # the tricky bit is to exploit that one needs both the j_l'(p⋅r) and j_l(p⋅r) values # but one does not want to precompute and allocate them into arrays # TODO Investigate custom rules for bessels and integration - - T = ForwardDiff.valtype(TT) pv = ForwardDiff.value(p) - jl = sphericalbesselj_fast.(l, pv .* r) - value = 4T(π) * simpson((i, r) -> r2_f[i] * jl[i], r) - - if iszero(pv) - return TT(value, zero(T) * ForwardDiff.partials(p)) - end - derivative = 4T(π) * simpson(r) do i, r - (r2_f[i] * (l * jl[i] / pv - r * sphericalbesselj_fast(l+1, pv * r))) + # To reduce allocations, compute value and derivative simultaneously, using a SVector as integrand + value, derivative = 4V(π) * quadrature(r) do i, r + jl_i = sphericalbesselj_fast(l, pv * r) + val = r2_f[i] * jl_i + deriv = iszero(pv) ? zero(V) : r2_f[i] * (l * jl_i / pv - r * sphericalbesselj_fast(l+1, pv * r)) + SVector(val, deriv) end - TT(value, derivative * ForwardDiff.partials(p)) + Dual{Tg,V,N}(value, derivative * ForwardDiff.partials(p)) end # other workarounds