diff --git a/src/common/hankel.jl b/src/common/hankel.jl
index 8c2bc0cf8c..c0db57b3e9 100644
--- a/src/common/hankel.jl
+++ b/src/common/hankel.jl
@@ -10,8 +10,13 @@ as input the radial grid `r`, the precomputed quantity r²f(r) `r2_f`, angular
 momentum / spherical bessel order `l`, and the Hankel coordinate `p`.
 """
 function hankel(r::AbstractVector, r2_f::AbstractVector, l::Integer, p::T)::T where {T<:Real}
+    quadrature = default_psp_quadrature(r)
+    hankel(quadrature, r, r2_f, l, p)
+end
+
+function hankel(quadrature, r::AbstractVector, r2_f::AbstractVector, l::Integer, p::T)::T where {T<:Real}
     @assert length(r) == length(r2_f)
-    4T(π) * simpson(r) do i, ri
+    4T(π) * quadrature(r) do i, ri
         r2_f[i] * sphericalbesselj_fast(l, p * ri)
     end
-end
+end
\ No newline at end of file
diff --git a/src/common/spherical_harmonics.jl b/src/common/spherical_harmonics.jl
index 287a8521bb..1a45e0965a 100644
--- a/src/common/spherical_harmonics.jl
+++ b/src/common/spherical_harmonics.jl
@@ -45,7 +45,7 @@ function ylm_real(l::Integer, m::Integer, rvec::AbstractVector{T}) where {T}
         (m ==  3) && return sqrt( 35 / 32T(π)) * (x^2 - 3y^2) * x / r^3
     end
 
-    error("The case l = $l and m = $m is not implemented")
+    throw(BoundsError()) # specific (l,m) pair not implemented
 end
 
 """
diff --git a/src/pseudo/NormConservingPsp.jl b/src/pseudo/NormConservingPsp.jl
index b5de190216..a1d2c71e1e 100644
--- a/src/pseudo/NormConservingPsp.jl
+++ b/src/pseudo/NormConservingPsp.jl
@@ -16,6 +16,7 @@ abstract type NormConservingPsp end
 # has_core_density(psp)
 # eval_psp_projector_real(psp, i, l, r::Real)
 # eval_psp_projector_fourier(psp, i, l, p::Real)
+# eval_psp_projector_fourier(psp, i, l, ps::AbstrcatArray{<Real})
 # eval_psp_local_real(psp, r::Real)
 # eval_psp_local_fourier(psp, p::Real)
 # eval_psp_local_fourier(psp, ps::AbstractArray{<Real})
@@ -53,8 +54,15 @@ at the reciprocal vector with modulus `p`:
 \end{aligned}
 ```
 """
-eval_psp_projector_fourier(psp::NormConservingPsp, p::AbstractVector) =
-    eval_psp_projector_fourier(psp, norm(p))
+eval_psp_projector_fourier(psp::NormConservingPsp, i, l, p::AbstractVector) =
+    eval_psp_projector_fourier(psp, i, l, norm(p))
+
+# Fallback vectorized implementation for non GPU-optimized code.
+function eval_psp_projector_fourier(psp::NormConservingPsp, i, l,
+                                    ps::AbstractVector{T}) where {T <: Real}
+    arch = architecture(ps)
+    to_device(arch, map(p -> eval_psp_projector_fourier(psp, i, l, p), to_cpu(ps)))
+end
 
 """
     eval_psp_local_real(psp, r)
diff --git a/src/pseudo/PspUpf.jl b/src/pseudo/PspUpf.jl
index 552614794b..f9dce6cd5b 100644
--- a/src/pseudo/PspUpf.jl
+++ b/src/pseudo/PspUpf.jl
@@ -185,6 +185,20 @@ function eval_psp_projector_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real}
     hankel(rgrid, r2_proj, l, p)
 end
 
+# Vectorized version of the above, GPU compatible
+function eval_psp_projector_fourier(psp::PspUpf, i, l, ps::AbstractVector{T}) where {T<:Real}
+    quadrature = default_psp_quadrature(psp.rgrid)
+    arch = architecture(ps)
+    ircut_proj = min(psp.ircut, length(psp.r2_projs[l+1][i]))
+    rgrid = to_device(arch, @view psp.rgrid[1:ircut_proj])
+    r2_proj = to_device(arch, @view psp.r2_projs[l+1][i][1:ircut_proj])
+    map(ps) do p
+        # GPU kernels with dynamic function calls do not compile,
+        # hence the pre-determined explicit integration function
+        hankel(quadrature, rgrid, r2_proj, l, p)
+    end
+end
+
 count_n_pswfc_radial(psp::PspUpf, l) = length(psp.r2_pswfcs[l+1])
 
 pswfc_label(psp::PspUpf, i, l) = psp.pswfc_labels[l+1][i]
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 5bdbf0e90b..2aadd04abe 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -68,10 +68,9 @@ end
         C = to_device(basis.architecture, build_projection_coefficients(T, element.psp))
         for (ik, kpt) in enumerate(basis.kpoints)
             # We compute the forces from the irreductible BZ; they are symmetrized later.
-            G_plus_k_cart = to_cpu(Gplusk_vectors_cart(basis, kpt))
+            G_plus_k_cart = Gplusk_vectors_cart(basis, kpt)
             G_plus_k = Gplusk_vectors(basis, kpt)
-            form_factors = to_device(basis.architecture,
-                                     build_projector_form_factors(element.psp, G_plus_k_cart))
+            form_factors = build_projector_form_factors(element.psp, G_plus_k_cart)
 
             # Pre-allocation of large arrays (Noticable performance improvements on
             # CPU and GPU here)
@@ -170,8 +169,8 @@ function build_projection_vectors(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     unit_cell_volume = basis.model.unit_cell_volume
     n_proj = count_n_proj(psps, psp_positions)
     n_G    = length(G_vectors(basis, kpt))
-    proj_vectors = zeros(Complex{eltype(psp_positions[1][1])}, n_G, n_proj)
-    G_plus_k = to_cpu(Gplusk_vectors(basis, kpt))
+    G_plus_k = Gplusk_vectors(basis, kpt)
+    proj_vectors = zeros_like(G_plus_k, Complex{eltype(psp_positions[1][1])}, n_G, n_proj)
 
     # Compute the columns of proj_vectors = 1/√Ω \hat proj_i(k+G)
     # Since the proj_i are translates of each others, \hat proj_i(k+G) decouples as
@@ -180,7 +179,7 @@ function build_projection_vectors(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     offset = 0  # offset into proj_vectors
     for (psp, positions) in zip(psps, psp_positions)
         # Compute position-independent form factors
-        G_plus_k_cart = to_cpu(Gplusk_vectors_cart(basis, kpt))
+        G_plus_k_cart = Gplusk_vectors_cart(basis, kpt)
         form_factors = build_projector_form_factors(psp, G_plus_k_cart)
 
         # Combine with structure factors
@@ -196,30 +195,51 @@ function build_projection_vectors(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     end
     @assert offset == n_proj
 
-    # Offload potential values to a device (like a GPU)
-    to_device(basis.architecture, proj_vectors)
+    proj_vectors
 end
 
 """
 Build form factors (Fourier transforms of projectors) for all orbitals of an atom centered at 0.
+This is a highly optimized, GPU compatible function.
 """
 function build_projector_form_factors(psp::NormConservingPsp,
-                                      G_plus_k::AbstractVector{Vec3{TT}}) where {TT}
-    G_plus_ks = [G_plus_k]
+                                      G_plus_k::AbstractVector{Vec3{T}}) where {T}
+    Gpk = to_cpu(G_plus_k)
+    arch = architecture(G_plus_k)
+
+    iG2ifnorm_cpu = zeros(Int, length(Gpk))
+    norm_indices = IdDict{T, Int}()
+    for (iG, G) in enumerate(Gpk)
+        p = norm(G)
+        iG2ifnorm_cpu[iG] = get!(norm_indices, p, length(norm_indices) + 1)
+    end
+    iG2ifnorm = to_device(arch, iG2ifnorm_cpu)
+
+    ni_pairs = collect(pairs(norm_indices))
+    ps = to_device(arch, first.(ni_pairs))
+    p_indices = to_device(arch, last.(ni_pairs))
 
     n_proj = count_n_proj(psp)
-    form_factors = zeros(Complex{TT}, length(G_plus_k), n_proj)
+    form_factors = similar(G_plus_k, Complex{T}, length(G_plus_k), n_proj)
+    G_indices = to_device(arch, collect(1:length(G_plus_k)))
+    proj_li = similar(G_indices, Complex{T}, length(G_indices))
     for l = 0:psp.lmax, 
         n_proj_l = count_n_proj_radial(psp, l)
         offset = sum(x -> count_n_proj(psp, x), 0:l-1; init=0) .+ 
                  n_proj_l .* (collect(1:2l+1) .- 1) # offset about m for a given l 
         for i = 1:n_proj_l
-            proj_li(p) = eval_psp_projector_fourier(psp, i, l, p)
-            form_factors_li = build_form_factors(proj_li, l, G_plus_ks)
-            @views form_factors[:, offset.+i] = form_factors_li[1]
+            # Performs same computation as build_form_factors(eval_psp_projector_fourier, l, [G_plus_k]),
+            # but in a highly optimized, vectorized, and GPU compatible way. Also saves on allocations,
+            # and many recomputations of unique |G+k|.
+            proj_li[p_indices] .= eval_psp_projector_fourier(psp, i, l, ps)
+            for m = -l:l
+                map!(@view(form_factors[:, offset[m + l + 1] + i]), G_indices) do iG
+                    angular = (-im)^l * ylm_real(l, m, G_plus_k[iG])
+                    proj_li[iG2ifnorm[iG]] * angular
+                end
+            end
         end
     end
-
     form_factors
 end