rust source #1
Source code
#![feature(strict_provenance)]
#![feature(core_intrinsics)]
#![warn(unsafe_op_in_unsafe_fn)]

use std::arch::x86_64::__m128 as f32x4;

pub fn simd_align1(nums: &[f32]) -> *const f32x4 {
    let a = nums.as_ptr().align_offset(core::mem::align_of::<f32x4>());
    assert!(a != usize::MAX, "not relevant here");
    nums.as_ptr().wrapping_add(a).cast()
}

pub fn simd_align2(nums: &[f32]) -> *const f32x4 {
    let a = nums
        .as_ptr()
        .cast::<u8>()
        .align_offset(core::mem::align_of::<f32x4>());
    assert!(a != usize::MAX, "not relevant here");
    nums.as_ptr().cast::<u8>().wrapping_add(a).cast()
}

pub fn simd_align_offset2(nums: &[f32]) -> usize {
    nums.as_ptr().cast::<u8>().align_offset(core::mem::align_of::<f32x4>())
}

pub fn simd_align3(nums: &[f32]) -> *const f32x4 {
    let a = align_offset(nums.as_ptr(), core::mem::align_of::<f32x4>());
    assert!(a != usize::MAX, "not relevant here");
    nums.as_ptr().wrapping_add(a).cast()
}

pub fn simd_align_offset3(nums: &[f32]) -> usize {
    align_offset(nums.as_ptr(), core::mem::align_of::<f32x4>())
}

// -----------------------

fn align_offset<T>(ptr: *const T, align: usize) -> usize
where
    T: Sized,
{
    if !align.is_power_of_two() {
        panic!("align_offset: align is not a power-of-two");
    }
    unsafe { align_offset_impl(ptr, align) }
}

pub(crate) unsafe fn align_offset_impl<T: Sized>(p: *const T, a: usize) -> usize {
    use core::{intrinsics, mem};
    // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at
    // opt-level <= 1, where the method versions of these operations are not inlined.
    use intrinsics::{
        unchecked_shl, unchecked_shr, unchecked_sub, wrapping_add, wrapping_mul, wrapping_sub,
    };

    let addr = p.addr();

    /// Calculate the multiplicative modular inverse of `x` modulo `m`.
    ///
    /// This implementation is tailored for `align_offset` and has the following preconditions:
    ///
    /// * `m` is a power-of-two;
    /// * `x < m`; (if `x ≥ m`, pass in `x % m` instead)
    ///
    /// The implementation of this function shall not panic. Ever.
    #[inline]
    unsafe fn mod_inv(x: usize, m: usize) -> usize {
        /// Multiplicative modular inverse table modulo 2⁴ = 16.
        ///
        /// Note that this table does not contain values where the inverse does not exist
        /// (i.e., for `0⁻¹ mod 16`, `2⁻¹ mod 16`, etc.)
        const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
        /// Modulo for which the `INV_TABLE_MOD_16` is intended.
        const INV_TABLE_MOD: usize = 16;
        /// INV_TABLE_MOD²
        const INV_TABLE_MOD_SQUARED: usize = INV_TABLE_MOD * INV_TABLE_MOD;

        let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
        // SAFETY: `m` is required to be a power-of-two, hence non-zero.
        let m_minus_one = unsafe { unchecked_sub(m, 1) };
        if m <= INV_TABLE_MOD {
            table_inverse & m_minus_one
        } else {
            // We iterate "up" using the following formula:
            //
            // $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
            //
            // until 2²ⁿ ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
            let mut inverse = table_inverse;
            let mut going_mod = INV_TABLE_MOD_SQUARED;
            loop {
                // y = y * (2 - xy) mod n
                //
                // Note that we use wrapping operations here intentionally – the original formula
                // uses e.g., subtraction `mod n`. It is entirely fine to do them `mod usize::MAX`
                // instead, because we take the result `mod n` at the end anyway.
                inverse = wrapping_mul(inverse, wrapping_sub(2usize, wrapping_mul(x, inverse)));
                if going_mod >= m {
                    return inverse & m_minus_one;
                }
                going_mod = wrapping_mul(going_mod, going_mod);
            }
        }
    }

    let stride = mem::size_of::<T>();

    // SAFETY: `a` is a power-of-two, therefore non-zero.
    let a_minus_one = unsafe { unchecked_sub(a, 1) };

    if stride == 1 {
        // The `stride == 1` case can be computed more simply through `-p (mod a)`, but doing so
        // inhibits LLVM's ability to select instructions like `lea`. Instead we compute
        //
        //    round_up_to_next_alignment(p, a) - p
        //
        // which distributes operations around the load-bearing, but pessimizing, `and`
        // sufficiently for LLVM to be able to utilize the various optimizations it knows about.
        return wrapping_sub(wrapping_add(addr, a_minus_one) & wrapping_sub(0, a), addr);
    }

    let pmoda = addr & a_minus_one;
    if pmoda == 0 {
        // Already aligned. Yay!
        return 0;
    } else if stride == 0 {
        // If the pointer is not aligned, and the element is zero-sized, then no amount of
        // elements will ever align the pointer.
        return usize::MAX;
    }

    let smoda = stride & a_minus_one;
    // SAFETY: `a` is a power-of-two, hence non-zero. The `stride == 0` case is handled above.
    let gcdpow = unsafe { intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a)) };
    // SAFETY: `gcdpow` has an upper-bound that's at most the number of bits in a usize.
    let gcd = unsafe { unchecked_shl(1usize, gcdpow) };

    // SAFETY: `gcd` is always greater than or equal to 1.
    if addr & unsafe { unchecked_sub(gcd, 1) } == 0 {
        // This branch solves for the following linear congruence equation:
        //
        // ` p + so = 0 mod a `
        //
        // `p` here is the pointer value, `s` the stride of `T`, `o` the offset in `T`s, and `a`
        // the requested alignment.
        //
        // With `g = gcd(a, s)`, and the above condition asserting that `p` is also divisible by
        // `g`, we can denote `a' = a/g`, `s' = s/g`, `p' = p/g`; then this becomes equivalent to:
        //
        // ` p' + s'o = 0 mod a' `
        // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
        //
        // The first term is "the relative alignment of `p` to `a`" (divided by `g`); the second
        // term is "how does incrementing `p` by `s` bytes change the relative alignment of `p`"
        // (again divided by `g`). Division by `g` is necessary to make the inverse well formed
        // if `a` and `s` are not co-prime.
        //
        // Furthermore, the result produced by this solution is not "minimal", so it is necessary
        // to take the result `o mod lcm(s, a)`. We can replace `lcm(s, a)` with just `a'`.

        // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits
        // in `a`.
        let a2 = unsafe { unchecked_shr(a, gcdpow) };
        // SAFETY: `a2` is non-zero. Shifting `a` by `gcdpow` cannot shift out any of the set
        // bits in `a` (of which it has exactly one).
        let a2minus1 = unsafe { unchecked_sub(a2, 1) };
        // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits
        // in `a`.
        let s2 = unsafe { unchecked_shr(smoda, gcdpow) };
        // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits
        // in `a`. Furthermore, the subtraction cannot overflow, because `a2 = a >> gcdpow` will
        // always be strictly greater than `(p % a) >> gcdpow`.
        let minusp2 = unsafe { unchecked_sub(a2, unchecked_shr(pmoda, gcdpow)) };
        // SAFETY: `a2` is a power-of-two, as proven above. `s2` is strictly less than `a2`
        // because `(s % a) >> gcdpow` is strictly less than `a >> gcdpow`.
        return wrapping_mul(minusp2, unsafe { mod_inv(s2, a2) }) & a2minus1;
    }

    // Cannot be aligned at all.
    usize::MAX
}
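A minimal usage sketch, not part of the original source: appended to the code above and built with a nightly toolchain (the feature gates require one), it checks that the offset returned by `simd_align_offset3` really lands the pointer on an `align_of::<f32x4>()` boundary. The `main` function and its values are illustrative assumptions.

// Illustrative only: a quick sanity check of the public entry points above.
fn main() {
    let nums = [0.0f32; 64];
    let off = simd_align_offset3(&nums);
    // For stride 4 and alignment 16, gcd(4, 16) == 4 divides any f32-aligned
    // address, so the congruence branch is taken and the "cannot align"
    // sentinel should never appear here.
    assert_ne!(off, usize::MAX);
    // Advancing by `off` f32 elements (4 bytes each) must reach a 16-byte
    // boundary, since align_of::<f32x4>() == 16 on x86-64.
    let aligned = nums.as_ptr().wrapping_add(off);
    assert_eq!(aligned as usize % core::mem::align_of::<f32x4>(), 0);
    println!("offset = {off} elements, aligned pointer = {aligned:p}");
}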
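The lifting step quoted in `mod_inv`'s comment follows from writing `xy = 1 + k·2ⁿ`: then `xy(2 - xy) = (1 + k·2ⁿ)(1 - k·2ⁿ) = 1 - k²·2²ⁿ ≡ 1 (mod 2²ⁿ)`, so each iteration doubles the number of correct low bits of the inverse. A standalone sketch (illustrative values, plain `u64` arithmetic instead of the intrinsics) that verifies this numerically:

// Standalone sketch: verify that y <- y * (2 - x*y) doubles the number of
// valid bits of a modular inverse, as the comment in `mod_inv` claims.
fn main() {
    let x: u64 = 11; // odd, hence invertible modulo any power of two
    let mut y: u64 = 3; // seed: 11 * 3 == 33 ≡ 1 (mod 16), cf. INV_TABLE_MOD_16
    let mut modulus: u64 = 16;
    while modulus < (1u64 << 32) {
        // One lifting step; wrapping arithmetic is fine because only the
        // low bits (mod `modulus`) are ever inspected.
        y = y.wrapping_mul(2u64.wrapping_sub(x.wrapping_mul(y)));
        modulus *= modulus;
        assert_eq!(x.wrapping_mul(y) % modulus, 1);
    }
    println!("inverse of {x} mod 2^32 = {}", y % (1u64 << 32));
}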