Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/common/hankel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,13 @@ as input the radial grid `r`, the precomputed quantity r²f(r) `r2_f`, angular
momentum / spherical bessel order `l`, and the Hankel coordinate `p`.
"""
function hankel(r::AbstractVector, r2_f::AbstractVector, l::Integer, p::T)::T where {T<:Real}
quadrature = default_psp_quadrature(r)
hankel(quadrature, r, r2_f, l, p)
end

function hankel(quadrature, r::AbstractVector, r2_f::AbstractVector, l::Integer, p::T)::T where {T<:Real}
@assert length(r) == length(r2_f)
4T(π) * simpson(r) do i, ri
4T(π) * quadrature(r) do i, ri
r2_f[i] * sphericalbesselj_fast(l, p * ri)
end
end
end
2 changes: 1 addition & 1 deletion src/common/spherical_harmonics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ function ylm_real(l::Integer, m::Integer, rvec::AbstractVector{T}) where {T}
(m == 3) && return sqrt( 35 / 32T(π)) * (x^2 - 3y^2) * x / r^3
end

error("The case l = $l and m = $m is not implemented")
throw(BoundsError()) # specific (l,m) pair not implemented
end

"""
Expand Down
12 changes: 10 additions & 2 deletions src/pseudo/NormConservingPsp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ abstract type NormConservingPsp end
# has_core_density(psp)
# eval_psp_projector_real(psp, i, l, r::Real)
# eval_psp_projector_fourier(psp, i, l, p::Real)
# eval_psp_projector_fourier(psp, i, l, ps::AbstrcatArray{<Real})
# eval_psp_local_real(psp, r::Real)
# eval_psp_local_fourier(psp, p::Real)
# eval_psp_local_fourier(psp, ps::AbstractArray{<Real})
Expand Down Expand Up @@ -53,8 +54,15 @@ at the reciprocal vector with modulus `p`:
\end{aligned}
```
"""
eval_psp_projector_fourier(psp::NormConservingPsp, p::AbstractVector) =
eval_psp_projector_fourier(psp, norm(p))
eval_psp_projector_fourier(psp::NormConservingPsp, i, l, p::AbstractVector) =
eval_psp_projector_fourier(psp, i, l, norm(p))

# Fallback vectorized implementation for non GPU-optimized code.
function eval_psp_projector_fourier(psp::NormConservingPsp, i, l,
ps::AbstractVector{T}) where {T <: Real}
arch = architecture(ps)
to_device(arch, map(p -> eval_psp_projector_fourier(psp, i, l, p), to_cpu(ps)))
end

"""
eval_psp_local_real(psp, r)
Expand Down
14 changes: 14 additions & 0 deletions src/pseudo/PspUpf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,20 @@ function eval_psp_projector_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real}
hankel(rgrid, r2_proj, l, p)
end

# Vectorized version of the above, GPU compatible
function eval_psp_projector_fourier(psp::PspUpf, i, l, ps::AbstractVector{T}) where {T<:Real}
quadrature = default_psp_quadrature(psp.rgrid)
arch = architecture(ps)
ircut_proj = min(psp.ircut, length(psp.r2_projs[l+1][i]))
rgrid = to_device(arch, @view psp.rgrid[1:ircut_proj])
r2_proj = to_device(arch, @view psp.r2_projs[l+1][i][1:ircut_proj])
map(ps) do p
# GPU kernels with dynamic function calls do not compile,
# hence the pre-determined explicit integration function
hankel(quadrature, rgrid, r2_proj, l, p)
end
end

count_n_pswfc_radial(psp::PspUpf, l) = length(psp.r2_pswfcs[l+1])

pswfc_label(psp::PspUpf, i, l) = psp.pswfc_labels[l+1][i]
Expand Down
50 changes: 35 additions & 15 deletions src/terms/nonlocal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,9 @@ end
C = to_device(basis.architecture, build_projection_coefficients(T, element.psp))
for (ik, kpt) in enumerate(basis.kpoints)
# We compute the forces from the irreductible BZ; they are symmetrized later.
G_plus_k_cart = to_cpu(Gplusk_vectors_cart(basis, kpt))
G_plus_k_cart = Gplusk_vectors_cart(basis, kpt)
G_plus_k = Gplusk_vectors(basis, kpt)
form_factors = to_device(basis.architecture,
build_projector_form_factors(element.psp, G_plus_k_cart))
form_factors = build_projector_form_factors(element.psp, G_plus_k_cart)

# Pre-allocation of large arrays (Noticable performance improvements on
# CPU and GPU here)
Expand Down Expand Up @@ -170,8 +169,8 @@ function build_projection_vectors(basis::PlaneWaveBasis{T}, kpt::Kpoint,
unit_cell_volume = basis.model.unit_cell_volume
n_proj = count_n_proj(psps, psp_positions)
n_G = length(G_vectors(basis, kpt))
proj_vectors = zeros(Complex{eltype(psp_positions[1][1])}, n_G, n_proj)
G_plus_k = to_cpu(Gplusk_vectors(basis, kpt))
G_plus_k = Gplusk_vectors(basis, kpt)
proj_vectors = zeros_like(G_plus_k, Complex{eltype(psp_positions[1][1])}, n_G, n_proj)

# Compute the columns of proj_vectors = 1/√Ω \hat proj_i(k+G)
# Since the proj_i are translates of each others, \hat proj_i(k+G) decouples as
Expand All @@ -180,7 +179,7 @@ function build_projection_vectors(basis::PlaneWaveBasis{T}, kpt::Kpoint,
offset = 0 # offset into proj_vectors
for (psp, positions) in zip(psps, psp_positions)
# Compute position-independent form factors
G_plus_k_cart = to_cpu(Gplusk_vectors_cart(basis, kpt))
G_plus_k_cart = Gplusk_vectors_cart(basis, kpt)
form_factors = build_projector_form_factors(psp, G_plus_k_cart)

# Combine with structure factors
Expand All @@ -196,30 +195,51 @@ function build_projection_vectors(basis::PlaneWaveBasis{T}, kpt::Kpoint,
end
@assert offset == n_proj

# Offload potential values to a device (like a GPU)
to_device(basis.architecture, proj_vectors)
proj_vectors
end

"""
Build form factors (Fourier transforms of projectors) for all orbitals of an atom centered at 0.
This is a highly optimized, GPU compatible function.
"""
function build_projector_form_factors(psp::NormConservingPsp,
G_plus_k::AbstractVector{Vec3{TT}}) where {TT}
G_plus_ks = [G_plus_k]
G_plus_k::AbstractVector{Vec3{T}}) where {T}
Gpk = to_cpu(G_plus_k)
arch = architecture(G_plus_k)

iG2ifnorm_cpu = zeros(Int, length(Gpk))
norm_indices = IdDict{T, Int}()
for (iG, G) in enumerate(Gpk)
p = norm(G)
iG2ifnorm_cpu[iG] = get!(norm_indices, p, length(norm_indices) + 1)
end
iG2ifnorm = to_device(arch, iG2ifnorm_cpu)

ni_pairs = collect(pairs(norm_indices))
ps = to_device(arch, first.(ni_pairs))
p_indices = to_device(arch, last.(ni_pairs))

n_proj = count_n_proj(psp)
form_factors = zeros(Complex{TT}, length(G_plus_k), n_proj)
form_factors = similar(G_plus_k, Complex{T}, length(G_plus_k), n_proj)
G_indices = to_device(arch, collect(1:length(G_plus_k)))
proj_li = similar(G_indices, Complex{T}, length(G_indices))
for l = 0:psp.lmax,
n_proj_l = count_n_proj_radial(psp, l)
offset = sum(x -> count_n_proj(psp, x), 0:l-1; init=0) .+
n_proj_l .* (collect(1:2l+1) .- 1) # offset about m for a given l
for i = 1:n_proj_l
proj_li(p) = eval_psp_projector_fourier(psp, i, l, p)
form_factors_li = build_form_factors(proj_li, l, G_plus_ks)
@views form_factors[:, offset.+i] = form_factors_li[1]
# Performs same computation as build_form_factors(eval_psp_projector_fourier, l, [G_plus_k]),
# but in a highly optimized, vectorized, and GPU compatible way. Also saves on allocations,
# and many recomputations of unique |G+k|.
proj_li[p_indices] .= eval_psp_projector_fourier(psp, i, l, ps)
for m = -l:l
map!(@view(form_factors[:, offset[m + l + 1] + i]), G_indices) do iG
angular = (-im)^l * ylm_real(l, m, G_plus_k[iG])
proj_li[iG2ifnorm[iG]] * angular
end
end
end
end

form_factors
end

Expand Down
Loading