-
-
Notifications
You must be signed in to change notification settings - Fork 14.9k
Fix is_ascii performance regression on AVX-512 CPUs when compiling with -C target-cpu=native #151259
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix is_ascii performance regression on AVX-512 CPUs when compiling with -C target-cpu=native #151259
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,10 +3,7 @@ | |
| use core::ascii::EscapeDefault; | ||
|
|
||
| use crate::fmt::{self, Write}; | ||
| #[cfg(not(any( | ||
| all(target_arch = "x86_64", target_feature = "sse2"), | ||
| all(target_arch = "loongarch64", target_feature = "lsx") | ||
| )))] | ||
| #[cfg(not(all(target_arch = "loongarch64", target_feature = "lsx")))] | ||
| use crate::intrinsics::const_eval_select; | ||
| use crate::{ascii, iter, ops}; | ||
|
|
||
|
|
@@ -463,19 +460,102 @@ const fn is_ascii(s: &[u8]) -> bool { | |
| ) | ||
| } | ||
|
|
||
| /// ASCII test optimized to use the `pmovmskb` instruction on `x86-64` and the | ||
| /// `vmskltz.b` instruction on `loongarch64`. | ||
| /// Chunk size for vectorized ASCII checking (two 16-byte SSE registers). | ||
| #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] | ||
| const CHUNK_SIZE: usize = 32; | ||
|
|
||
| /// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to | ||
| /// avoid LLVM's broken AVX-512 auto-vectorization of counting loops. | ||
| /// | ||
| /// # Safety | ||
| /// Requires SSE2 support (guaranteed on x86_64). | ||
| #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] | ||
| #[target_feature(enable = "sse2")] | ||
| unsafe fn is_ascii_sse2(bytes: &[u8]) -> bool { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This shouldn't need
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128}; | ||
|
|
||
| let mut i = 0; | ||
|
|
||
| while i + CHUNK_SIZE <= bytes.len() { | ||
| // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`. | ||
| let ptr = unsafe { bytes.as_ptr().add(i) }; | ||
|
|
||
| // Load two 16-byte chunks and combine them. | ||
| // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes. | ||
| // `_mm_loadu_si128` allows unaligned loads. | ||
| let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) }; | ||
| // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range. | ||
| let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) }; | ||
|
|
||
| // OR the two chunks together - if any byte has its high bit set, the combined result will too | ||
| let combined = _mm_or_si128(chunk1, chunk2); | ||
|
|
||
| // Create a mask from the MSBs of each byte. | ||
| // If any byte is >= 128, its MSB is 1, so the mask will be non-zero. | ||
| let mask = _mm_movemask_epi8(combined); | ||
|
|
||
| if mask != 0 { | ||
| return false; | ||
| } | ||
|
|
||
| i += CHUNK_SIZE; | ||
| } | ||
|
|
||
| // Handle remaining bytes with a simple scalar loop | ||
| while i < bytes.len() { | ||
| if !bytes[i].is_ascii() { | ||
| return false; | ||
| } | ||
| i += 1; | ||
| } | ||
|
|
||
| true | ||
| } | ||
|
|
||
| /// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`. | ||
| /// | ||
| /// Uses explicit SSE2 intrinsics to prevent LLVM from auto-vectorizing with | ||
| /// broken AVX-512 code that extracts mask bits one-by-one. | ||
| #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] | ||
| #[inline] | ||
| #[rustc_allow_const_fn_unstable(const_eval_select)] | ||
| const fn is_ascii(bytes: &[u8]) -> bool { | ||
| const USIZE_SIZE: usize = size_of::<usize>(); | ||
| const NONASCII_MASK: usize = usize::MAX / 255 * 0x80; | ||
|
|
||
| const_eval_select!( | ||
| @capture { bytes: &[u8] } -> bool: | ||
| if const { | ||
| is_ascii_simple(bytes) | ||
| } else { | ||
| // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead. | ||
| if bytes.len() < CHUNK_SIZE { | ||
| let chunks = bytes.chunks_exact(USIZE_SIZE); | ||
| let remainder = chunks.remainder(); | ||
| for chunk in chunks { | ||
| let word = usize::from_ne_bytes(chunk.try_into().unwrap()); | ||
| if (word & NONASCII_MASK) != 0 { | ||
| return false; | ||
| } | ||
| } | ||
| return remainder.iter().all(|b| b.is_ascii()); | ||
| } | ||
|
|
||
| // SAFETY: SSE2 is guaranteed to be available on x86_64 | ||
|
folkertdev marked this conversation as resolved.
Outdated
|
||
| unsafe { is_ascii_sse2(bytes) } | ||
| } | ||
| ) | ||
| } | ||
|
|
||
| /// ASCII test optimized to use the `vmskltz.b` instruction on `loongarch64`. | ||
| /// | ||
| /// Other platforms are not likely to benefit from this code structure, so they | ||
| /// use SWAR techniques to test for ASCII in `usize`-sized chunks. | ||
| #[cfg(any( | ||
| all(target_arch = "x86_64", target_feature = "sse2"), | ||
| all(target_arch = "loongarch64", target_feature = "lsx") | ||
| ))] | ||
| #[cfg(all(target_arch = "loongarch64", target_feature = "lsx"))] | ||
| #[inline] | ||
| const fn is_ascii(bytes: &[u8]) -> bool { | ||
| // Process chunks of 32 bytes at a time in the fast path to enable | ||
| // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers | ||
| // auto-vectorization and use of `vmskltz.b`. Two 128-bit vector registers | ||
| // can be OR'd together and then the resulting vector can be tested for | ||
| // non-ASCII bytes. | ||
| const CHUNK_SIZE: usize = 32; | ||
|
|
@@ -485,7 +565,7 @@ const fn is_ascii(bytes: &[u8]) -> bool { | |
| while i + CHUNK_SIZE <= bytes.len() { | ||
| let chunk_end = i + CHUNK_SIZE; | ||
|
|
||
| // Get LLVM to produce a `pmovmskb` instruction on x86-64 which | ||
| // Get LLVM to produce a `vmskltz.b` instruction on loongarch64 which | ||
| // creates a mask from the most significant bit of each byte. | ||
| // ASCII bytes are less than 128 (0x80), so their most significant | ||
| // bit is unset. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| //@ only-x86_64 | ||
| //@ compile-flags: -C opt-level=3 -C target-cpu=znver4 | ||
| //@ compile-flags: -C llvm-args=-x86-asm-syntax=intel | ||
| //@ assembly-output: emit-asm | ||
| #![crate_type = "lib"] | ||
|
|
||
| // Verify is_ascii uses pmovmskb/vpmovmskb instead of kshiftrd with AVX-512. | ||
| // The fix uses explicit SSE2 intrinsics to avoid LLVM's broken auto-vectorization. | ||
| // | ||
| // See: https://github.com/rust-lang/rust/issues/129293 | ||
|
|
||
| // CHECK-LABEL: test_is_ascii | ||
| #[no_mangle] | ||
| pub fn test_is_ascii(s: &[u8]) -> bool { | ||
| // CHECK-NOT: kshiftrd | ||
| // CHECK-NOT: kshiftrq | ||
| s.is_ascii() | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,13 @@ | ||
| //@ only-x86_64 | ||
| //@ compile-flags: -C opt-level=3 -C target-cpu=x86-64 | ||
| //@ only-loongarch64 | ||
| //@ compile-flags: -C opt-level=3 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The two test files can be merged using revisions, which would be nice if we want the same thing for other arches: //@ revisions: X86_64 LA64
//@ compile-flags: -C opt-level=3
//
//@ [X86_64]: only-x86-64
//@ [X86_64] compile-flags:
//
//@ [LA64]: only-loongarch64
...Then use |
||
| #![crate_type = "lib"] | ||
|
|
||
| /// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction. | ||
| /// Check that the fast-path of `is_ascii` uses a `vmskltz.b` instruction. | ||
| /// Platforms lacking an equivalent instruction use other techniques for | ||
| /// optimizing `is_ascii`. | ||
| /// | ||
| /// Note: x86_64 uses explicit SSE2 intrinsics instead of relying on | ||
| /// auto-vectorization. See `slice-is-ascii-avx512.rs`. | ||
| // CHECK-LABEL: @is_ascii_autovectorized | ||
| #[no_mangle] | ||
| pub fn is_ascii_autovectorized(s: &[u8]) -> bool { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.