diff --git a/src/architecture.jl b/src/architecture.jl index 0a3a73cf5b..b607c9fd92 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -46,3 +46,8 @@ Explanaition of key entries: function memory_usage(::CPU) (; max_rss=Sys.maxrss(), gc_bytes=Base.gc_live_bytes()) end + +""" +Returns the architecture of the given array. +""" +architecture(x::AbstractArray) = x isa AbstractGPUArray ? GPU{typeof(x)}() : CPU() diff --git a/src/common/quadrature.jl b/src/common/quadrature.jl index 48655fbff9..9e16c2b536 100644 --- a/src/common/quadrature.jl +++ b/src/common/quadrature.jl @@ -38,9 +38,9 @@ point `i` (not necessarily in order). """ simpson @inbounds function simpson(integrand, x::AbstractVector) - n = length(x) - n <= 4 && return trapezoidal(integrand, x) - if (x[2] - x[1]) ≈ (x[3] - x[2]) + if length(x) <= 4 + trapezoidal(integrand, x) + elseif (x[2] - x[1]) ≈ (x[3] - x[2]) simpson_uniform(integrand, x) else simpson_nonuniform(integrand, x) @@ -114,3 +114,16 @@ end return I end + +""" +Return the approproate integration function given a PSP quadrature +""" +function default_psp_quadrature(x::AbstractArray) + if length(x) <= 4 + trapezoidal + elseif (x[2] - x[1]) ≈ (x[3] - x[2]) + simpson_uniform + else + simpson_nonuniform + end +end \ No newline at end of file diff --git a/src/common/spherical_bessels.jl b/src/common/spherical_bessels.jl index a5b0c82828..c73961c0bd 100644 --- a/src/common/spherical_bessels.jl +++ b/src/common/spherical_bessels.jl @@ -18,5 +18,5 @@ with `SpecialFunctions.sphericalbesselj`. Specialized for integer ``0 ≤ l ≤ l == 3 && return (sin(x) * (15 - 6x^2) + cos(x) * (x^3 - 15x)) / x^4 l == 4 && return (sin(x) * (105 - 45x^2 + x^4) + cos(x) * (10x^3 - 105x)) / x^5 l == 5 && return (sin(x) * (945 - 420x^2 + 15x^4) + cos(x) * (-945x + 105x^3 - x^5)) / x^6 - error("The case l = $l is not implemented") + throw(BoundsError()) # specific l not implemented end diff --git a/src/elements.jl b/src/elements.jl index 0f64dc86d3..968c1d514b 100644 --- a/src/elements.jl +++ b/src/elements.jl @@ -52,6 +52,12 @@ function core_charge_density_fourier(::Element, ::T)::T where {T <: Real} error("Abstract elements do not necesesarily provide core charge density.") end +# Generic vectorized version of local_potential_fourier, GPU-safe +function local_potential_fourier(el::Element, ps::AbstractVector{T}) where {T <: Real} + arch = architecture(ps) + to_device(arch, map(p -> local_potential_fourier(el, p), to_cpu(ps))) +end + Base.show(io::IO, el::Element) = print(io, "$(typeof(el))(:$(species(el)))") # @@ -83,6 +89,7 @@ function local_potential_fourier(el::ElementCoulomb, p::T) where {T <: Real} Z = charge_nuclear(el) return -4T(π) * Z / p^2 end + local_potential_real(el::ElementCoulomb, r::Real) = -charge_nuclear(el) / r @@ -160,9 +167,12 @@ has_core_density(el::ElementPsp) = has_core_density(el.psp) eval_psp_energy_correction(T, el::ElementPsp) = eval_psp_energy_correction(T, el.psp) function local_potential_fourier(el::ElementPsp, p::T) where {T <: Real} - p == 0 && return zero(T) # Compensating charge background eval_psp_local_fourier(el.psp, p) end +# Vectorized version of the above +function local_potential_fourier(el::ElementPsp, ps::AbstractVector{T}) where {T <: Real} + eval_psp_local_fourier(el.psp, ps) +end local_potential_real(el::ElementPsp, r::Real) = eval_psp_local_real(el.psp, r) function valence_charge_density_fourier(el::ElementPsp, p::T) where {T <: Real} diff --git a/src/pseudo/NormConservingPsp.jl b/src/pseudo/NormConservingPsp.jl index ea8e08d999..b5de190216 100644 --- a/src/pseudo/NormConservingPsp.jl +++ b/src/pseudo/NormConservingPsp.jl @@ -18,6 +18,7 @@ abstract type NormConservingPsp end # eval_psp_projector_fourier(psp, i, l, p::Real) # eval_psp_local_real(psp, r::Real) # eval_psp_local_fourier(psp, p::Real) +# eval_psp_local_fourier(psp, ps::AbstractArray{ eval_psp_local_fourier(psp, p), to_cpu(ps))) +end + @doc raw""" eval_psp_energy_correction([T=Float64,] psp::NormConservingPsp) eval_psp_energy_correction([T=Float64,] element::Element) diff --git a/src/pseudo/PspHgh.jl b/src/pseudo/PspHgh.jl index b6e0594243..ff94fc8e42 100644 --- a/src/pseudo/PspHgh.jl +++ b/src/pseudo/PspHgh.jl @@ -129,11 +129,11 @@ end # [GTH98] (6) except they do it with plane waves normalized by 1/sqrt(Ω). function eval_psp_local_fourier(psp::PspHgh, p::T) where {T <: Real} + p == 0 && return zero(T) # Compensating charge background t::T = p * psp.rloc psp_local_polynomial(T, psp, t) * exp(-t^2 / 2) / t^2 end - @doc raw""" Estimate an upper bound for the argument `p` after which `abs(eval_psp_local_fourier(psp, p))` is a strictly decreasing function. diff --git a/src/pseudo/PspUpf.jl b/src/pseudo/PspUpf.jl index 365b39c22e..092b01c95c 100644 --- a/src/pseudo/PspUpf.jl +++ b/src/pseudo/PspUpf.jl @@ -203,19 +203,40 @@ end eval_psp_local_real(psp::PspUpf, r::T) where {T<:Real} = psp.vloc_interp(r) -function eval_psp_local_fourier(psp::PspUpf, p::T)::T where {T<:Real} +# Low-level function for the local part of the pseudopotential in reciprocal space +function _eval_psp_local_fourier(quadrature, rgrid, vloc, Zion, p::T)::T where {T<:Real} # QE style C(r) = -Zerf(r)/r Coulomb tail correction used to ensure # exponential decay of `f` so that the Hankel transform is accurate. # H[Vloc(r)] = H[Vloc(r) - C(r)] + H[C(r)], # where H[-Zerf(r)/r] = -Z/p^2 exp(-p^2 /4) # ABINIT uses a more 'pure' Coulomb term with the same asymptotic behavior # C(r) = -Z/r; H[-Z/r] = -Z/p^2 + p == 0 && return zero(T) # Compensating charge background + I = quadrature(rgrid) do i, r + r * (r * vloc[i] - -Zion * erf(r)) * sphericalbesselj_fast(0, p * r) + end + 4T(π) * (I + -Zion / p^2 * exp(-p^2 / T(4))) +end + +function eval_psp_local_fourier(psp::PspUpf, p::T) where {T<:Real} + quadrature = default_psp_quadrature(psp.rgrid) rgrid = @view psp.rgrid[1:psp.ircut] vloc = @view psp.vloc[1:psp.ircut] - I = simpson(rgrid) do i, r - r * (r * vloc[i] - -psp.Zion * erf(r)) * sphericalbesselj_fast(0, p * r) + _eval_psp_local_fourier(quadrature, rgrid, vloc, psp.Zion, p) +end + +# Vectorized version of the above, GPU compatible +function eval_psp_local_fourier(psp::PspUpf, ps::AbstractVector{T}) where {T<:Real} + quadrature = default_psp_quadrature(psp.rgrid) + arch = architecture(ps) + rgrid = to_device(arch, @view psp.rgrid[1:psp.ircut]) + vloc = to_device(arch, @view psp.vloc[1:psp.ircut]) + Zion = psp.Zion + map(ps) do p + # GPU kernels with dynamic function calls do not compile, + # hence the pre-determined explicit integration function + _eval_psp_local_fourier(quadrature, rgrid, vloc, Zion, p) end - 4T(π) * (I + -psp.Zion / p^2 * exp(-p^2 / T(4))) end function eval_psp_density_valence_real(psp::PspUpf, r::T) where {T<:Real} diff --git a/src/terms/local.jl b/src/terms/local.jl index ebcce36ea7..e65d1c62ed 100644 --- a/src/terms/local.jl +++ b/src/terms/local.jl @@ -81,17 +81,18 @@ function atomic_local_form_factors(basis::PlaneWaveBasis{T}; q=zero(Vec3{T})) wh p = norm(G) iG2ifnorm_cpu[iG] = get!(norm_indices, p, length(norm_indices) + 1) end + iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu) - form_factors_cpu = zeros(T, length(norm_indices), length(basis.model.atom_groups)) - for(p, ifnorm) in norm_indices - for (igroup, group) in enumerate(basis.model.atom_groups) - element = basis.model.atoms[first(group)] - form_factors_cpu[ifnorm, igroup] = local_potential_fourier(element, p) - end + ni_pairs = collect(pairs(norm_indices)) + ps = to_device(basis.architecture, first.(ni_pairs)) + indices = to_device(basis.architecture, last.(ni_pairs)) + + form_factors = similar(ps, length(norm_indices), length(basis.model.atom_groups)) + for (igroup, group) in enumerate(basis.model.atom_groups) + element = basis.model.atoms[first(group)] + @inbounds form_factors[indices, igroup] .= local_potential_fourier(element, ps) end - form_factors = to_device(basis.architecture, form_factors_cpu) - iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu) (; form_factors, iG2ifnorm) end