Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 92 additions & 12 deletions library/core/src/slice/ascii.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@
use core::ascii::EscapeDefault;

use crate::fmt::{self, Write};
#[cfg(not(any(
all(target_arch = "x86_64", target_feature = "sse2"),
all(target_arch = "loongarch64", target_feature = "lsx")
)))]
#[cfg(not(all(target_arch = "loongarch64", target_feature = "lsx")))]
use crate::intrinsics::const_eval_select;
use crate::{ascii, iter, ops};

Expand Down Expand Up @@ -463,19 +460,102 @@ const fn is_ascii(s: &[u8]) -> bool {
)
}

/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64` and the
/// `vmskltz.b` instruction on `loongarch64`.
/// Chunk size in bytes for the vectorized ASCII fast path: two 16-byte SSE
/// registers are loaded and combined per iteration (see `is_ascii_sse2`),
/// and `is_ascii` falls back to SWAR for inputs shorter than this.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
const CHUNK_SIZE: usize = 32;

/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
Comment thread
folkertdev marked this conversation as resolved.
///
/// # Safety
/// Requires SSE2 support (guaranteed on x86_64).
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
#[target_feature(enable = "sse2")]
unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shouldn't need target_feature(enable ...) since it's already gated behind sse2, right? Which would allow the function to be safe.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The unsafe is not because of the target feature. In fact I suspect you can remove the unsafe from the function. The only pre-condition is sse2 support, and with the target_feature annotation that will get enforced anyway

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

let mut i = 0;

while i + CHUNK_SIZE <= bytes.len() {
// SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
let ptr = unsafe { bytes.as_ptr().add(i) };

// Load two 16-byte chunks and combine them.
// SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
// `_mm_loadu_si128` allows unaligned loads.
let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
// SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };

// OR them together - if any byte has the high bit set, the result will too
let combined = _mm_or_si128(chunk1, chunk2);

// Create a mask from the MSBs of each byte.
// If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
let mask = _mm_movemask_epi8(combined);

if mask != 0 {
return false;
}

i += CHUNK_SIZE;
}

// Handle remaining bytes with simple loop
while i < bytes.len() {
if !bytes[i].is_ascii() {
return false;
}
i += 1;
}

true
}

/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
///
/// Uses explicit SSE2 intrinsics to prevent LLVM from auto-vectorizing with
/// broken AVX-512 code that extracts mask bits one-by-one.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
#[inline]
#[rustc_allow_const_fn_unstable(const_eval_select)]
const fn is_ascii(bytes: &[u8]) -> bool {
    const WORD_BYTES: usize = size_of::<usize>();
    const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;

    const_eval_select!(
        @capture { bytes: &[u8] } -> bool:
        if const {
            is_ascii_simple(bytes)
        } else {
            // Inputs shorter than one SIMD chunk are checked one usize at a
            // time (SWAR) so they never pay the SSE2 call overhead.
            if bytes.len() < CHUNK_SIZE {
                let mut words = bytes.chunks_exact(WORD_BYTES);
                let tail = words.remainder();
                let words_ok = words.all(|w| {
                    (usize::from_ne_bytes(w.try_into().unwrap()) & NONASCII_MASK) == 0
                });
                return words_ok && tail.iter().all(|b| b.is_ascii());
            }

            // SAFETY: SSE2 is guaranteed available on x86_64.
            unsafe { is_ascii_sse2(bytes) }
        }
    )
}

/// ASCII test optimized to use the `vmskltz.b` instruction on `loongarch64`.
///
/// Other platforms are not likely to benefit from this code structure, so they
/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
#[cfg(any(
all(target_arch = "x86_64", target_feature = "sse2"),
all(target_arch = "loongarch64", target_feature = "lsx")
))]
#[cfg(all(target_arch = "loongarch64", target_feature = "lsx"))]
#[inline]
const fn is_ascii(bytes: &[u8]) -> bool {
// Process chunks of 32 bytes at a time in the fast path to enable
// auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
// auto-vectorization and use of `vmskltz.b`. Two 128-bit vector registers
// can be OR'd together and then the resulting vector can be tested for
// non-ASCII bytes.
const CHUNK_SIZE: usize = 32;
Expand All @@ -485,7 +565,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
while i + CHUNK_SIZE <= bytes.len() {
let chunk_end = i + CHUNK_SIZE;

// Get LLVM to produce a `pmovmskb` instruction on x86-64 which
// Get LLVM to produce a `vmskltz.b` instruction on loongarch64 which
// creates a mask from the most significant bit of each byte.
// ASCII bytes are less than 128 (0x80), so their most significant
// bit is unset.
Expand Down
18 changes: 18 additions & 0 deletions tests/assembly-llvm/slice-is-ascii-avx512.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
//@ only-x86_64
//@ compile-flags: -C opt-level=3 -C target-cpu=znver4
//@ compile-flags: -C llvm-args=-x86-asm-syntax=intel
//@ assembly-output: emit-asm
#![crate_type = "lib"]

// Verify is_ascii uses pmovmskb/vpmovmskb instead of kshiftrd with AVX-512.
// The fix uses explicit SSE2 intrinsics to avoid LLVM's broken auto-vectorization.
//
// See: https://github.com/rust-lang/rust/issues/129293

// CHECK-LABEL: test_is_ascii
#[no_mangle]
pub fn test_is_ascii(s: &[u8]) -> bool {
    // The SSE2 fast path must survive codegen: require a byte-mask extraction
    // ((v)pmovmskb) and forbid AVX-512 mask-register bit shuffling both before
    // and after it, so the whole function body is covered by the NOT checks.
    // CHECK-NOT: kshiftrd
    // CHECK-NOT: kshiftrq
    // CHECK: {{v?}}pmovmskb
    // CHECK-NOT: kshiftrd
    // CHECK-NOT: kshiftrq
    s.is_ascii()
}
9 changes: 6 additions & 3 deletions tests/codegen-llvm/slice-is-ascii.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
//@ only-x86_64
//@ compile-flags: -C opt-level=3 -C target-cpu=x86-64
//@ only-loongarch64
//@ compile-flags: -C opt-level=3
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The two test files can be merged using revisions, which would be nice if we want the same thing for other arches:

//@ revisions: X86_64 LA64
//@ compile-flags: -C opt-level=3
//
//@[X86_64] only-x86_64
//@[X86_64] compile-flags: 
//
//@[LA64] only-loongarch64
...

Then use CHECK as the prefix that applies to all, and X86_64 / LA64 as the specific prefix.

#![crate_type = "lib"]

/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction.
/// Check that the fast-path of `is_ascii` uses a `vmskltz.b` instruction.
/// Platforms lacking an equivalent instruction use other techniques for
/// optimizing `is_ascii`.
///
/// Note: x86_64 uses explicit SSE2 intrinsics instead of relying on
/// auto-vectorization. See `slice-is-ascii-avx512.rs`.
// CHECK-LABEL: @is_ascii_autovectorized
#[no_mangle]
pub fn is_ascii_autovectorized(s: &[u8]) -> bool {
Expand Down
Loading