rust source #1
Source code
#![feature(strict_provenance)]
#![feature(core_intrinsics)]
#![warn(unsafe_op_in_unsafe_fn)]

use std::arch::x86_64::__m128 as f32x4;

pub fn simd_align1(nums: &[f32]) -> *const f32x4 {
    let a = nums.as_ptr().align_offset(core::mem::align_of::<f32x4>());
    assert!(a != usize::MAX, "not relevant here");
    nums.as_ptr().wrapping_add(a).cast()
}

pub fn simd_align2(nums: &[f32]) -> *const f32x4 {
    let a = nums
        .as_ptr()
        .cast::<u8>()
        .align_offset(core::mem::align_of::<f32x4>());
    assert!(a != usize::MAX, "not relevant here");
    nums.as_ptr().cast::<u8>().wrapping_add(a).cast()
}

pub fn simd_align_offset2(nums: &[f32]) -> usize {
    nums.as_ptr().cast::<u8>().align_offset(core::mem::align_of::<f32x4>())
}

pub fn simd_align3(nums: &[f32]) -> *const f32x4 {
    let a = align_offset(nums.as_ptr(), core::mem::align_of::<f32x4>());
    assert!(a != usize::MAX, "not relevant here");
    nums.as_ptr().wrapping_add(a).cast()
}

pub fn simd_align_offset3(nums: &[f32]) -> usize {
    align_offset(nums.as_ptr(), core::mem::align_of::<f32x4>())
}

// -----------------------

fn align_offset<T>(ptr: *const T, align: usize) -> usize
where
    T: Sized,
{
    if !align.is_power_of_two() {
        panic!("align_offset: align is not a power-of-two");
    }
    unsafe { align_offset_impl(ptr, align) }
}

pub(crate) unsafe fn align_offset_impl<T: Sized>(p: *const T, a: usize) -> usize {
    use core::{intrinsics, mem};
    // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at
    // opt-level <= 1, where the method versions of these operations are not inlined.
    use intrinsics::{
        unchecked_shl, unchecked_shr, unchecked_sub, wrapping_add, wrapping_mul, wrapping_sub,
    };

    let addr = p.addr();

    /// Calculate the multiplicative modular inverse of `x` modulo `m`.
    ///
    /// This implementation is tailored for `align_offset` and has the following preconditions:
    ///
    /// * `m` is a power-of-two;
    /// * `x < m`; (if `x ≥ m`, pass in `x % m` instead)
    ///
    /// The implementation of this function shall not panic. Ever.
    #[inline]
    unsafe fn mod_inv(x: usize, m: usize) -> usize {
        /// Multiplicative modular inverse table modulo 2⁴ = 16.
        ///
        /// Note that this table does not contain values where the inverse does not exist
        /// (i.e., for `0⁻¹ mod 16`, `2⁻¹ mod 16`, etc.)
        const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
        /// Modulo for which the `INV_TABLE_MOD_16` is intended.
        const INV_TABLE_MOD: usize = 16;
        /// INV_TABLE_MOD²
        const INV_TABLE_MOD_SQUARED: usize = INV_TABLE_MOD * INV_TABLE_MOD;

        let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
        // SAFETY: `m` is required to be a power-of-two, hence non-zero.
        let m_minus_one = unsafe { unchecked_sub(m, 1) };
        if m <= INV_TABLE_MOD {
            table_inverse & m_minus_one
        } else {
            // We iterate "up" using the following formula:
            //
            // $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
            //
            // until 2²ⁿ ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
            let mut inverse = table_inverse;
            let mut going_mod = INV_TABLE_MOD_SQUARED;
            loop {
                // y = y * (2 - xy) mod n
                //
                // Note that we use wrapping operations here intentionally – the original formula
                // uses e.g., subtraction `mod n`. It is entirely fine to do them `mod usize::MAX`
                // instead, because we take the result `mod n` at the end anyway.
                inverse = wrapping_mul(inverse, wrapping_sub(2usize, wrapping_mul(x, inverse)));
                if going_mod >= m {
                    return inverse & m_minus_one;
                }
                going_mod = wrapping_mul(going_mod, going_mod);
            }
        }
    }

    let stride = mem::size_of::<T>();

    // SAFETY: `a` is a power-of-two, therefore non-zero.
    let a_minus_one = unsafe { unchecked_sub(a, 1) };

    if stride == 1 {
        // The `stride == 1` case can be computed more simply through `-p (mod a)`, but doing so
        // inhibits LLVM's ability to select instructions like `lea`. Instead we compute
        //
        //    round_up_to_next_alignment(p, a) - p
        //
        // which distributes operations around the load-bearing, but pessimizing, `and`
        // sufficiently for LLVM to be able to utilize the various optimizations it knows about.
        return wrapping_sub(wrapping_add(addr, a_minus_one) & wrapping_sub(0, a), addr);
    }

    let pmoda = addr & a_minus_one;
    if pmoda == 0 {
        // Already aligned. Yay!
        return 0;
    } else if stride == 0 {
        // If the pointer is not aligned, and the element is zero-sized, then no amount of
        // elements will ever align the pointer.
        return usize::MAX;
    }

    let smoda = stride & a_minus_one;
    // SAFETY: `a` is a power-of-two, hence non-zero. The `stride == 0` case is handled above.
    let gcdpow = unsafe { intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a)) };
    // SAFETY: `gcdpow` has an upper-bound that's at most the number of bits in a usize.
    let gcd = unsafe { unchecked_shl(1usize, gcdpow) };

    // SAFETY: `gcd` is always greater than or equal to 1.
    if addr & unsafe { unchecked_sub(gcd, 1) } == 0 {
        // This branch solves for the following linear congruence equation:
        //
        // ` p + so = 0 mod a `
        //
        // `p` here is the pointer value, `s` the stride of `T`, `o` the offset in `T`s, and `a`
        // the requested alignment.
        //
        // With `g = gcd(a, s)`, and the above condition asserting that `p` is also divisible by
        // `g`, we can denote `a' = a/g`, `s' = s/g`, `p' = p/g`; then this becomes equivalent to:
        //
        // ` p' + s'o = 0 mod a' `
        // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
        //
        // The first term is "the relative alignment of `p` to `a`" (divided by `g`); the second
        // term is "how does incrementing `p` by `s` bytes change the relative alignment of `p`"
        // (again divided by `g`). Division by `g` is necessary to make the inverse well formed
        // if `a` and `s` are not co-prime.
        //
        // Furthermore, the result produced by this solution is not "minimal", so it is necessary
        // to take the result `o mod lcm(s, a)`. We can replace `lcm(s, a)` with just `a'`.

        // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits
        // in `a`.
        let a2 = unsafe { unchecked_shr(a, gcdpow) };
        // SAFETY: `a2` is non-zero. Shifting `a` by `gcdpow` cannot shift out any of the set
        // bits in `a` (of which it has exactly one).
        let a2minus1 = unsafe { unchecked_sub(a2, 1) };
        // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits
        // in `a`.
        let s2 = unsafe { unchecked_shr(smoda, gcdpow) };
        // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits
        // in `a`. Furthermore, the subtraction cannot overflow, because `a2 = a >> gcdpow` will
        // always be strictly greater than `(p % a) >> gcdpow`.
        let minusp2 = unsafe { unchecked_sub(a2, unchecked_shr(pmoda, gcdpow)) };
        // SAFETY: `a2` is a power-of-two, as proven above. `s2` is strictly less than `a2`
        // because `(s % a) >> gcdpow` is strictly less than `a >> gcdpow`.
        return wrapping_mul(minusp2, unsafe { mod_inv(s2, a2) }) & a2minus1;
    }

    // Cannot be aligned at all.
    usize::MAX
}
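A minimal usage sketch, not part of the original source: appended to the code above and built with a nightly toolchain (the feature gates require one), it checks that the offset returned by `simd_align_offset3` really lands the pointer on an `align_of::<f32x4>()` boundary. The `main` function and its values are illustrative assumptions.

// Illustrative only: a quick sanity check of the public entry points above.
fn main() {
    let nums = [0.0f32; 64];
    let off = simd_align_offset3(&nums);
    // For stride 4 and alignment 16, gcd(4, 16) == 4 divides any f32-aligned
    // address, so the congruence branch is taken and the "cannot align"
    // sentinel should never appear here.
    assert_ne!(off, usize::MAX);
    // Advancing by `off` f32 elements (4 bytes each) must reach a 16-byte
    // boundary, since align_of::<f32x4>() == 16 on x86-64.
    let aligned = nums.as_ptr().wrapping_add(off);
    assert_eq!(aligned as usize % core::mem::align_of::<f32x4>(), 0);
    println!("offset = {off} elements, aligned pointer = {aligned:p}");
}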
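The lifting step quoted in `mod_inv`'s comment follows from writing `xy = 1 + k·2ⁿ`: then `xy(2 - xy) = (1 + k·2ⁿ)(1 - k·2ⁿ) = 1 - k²·2²ⁿ ≡ 1 (mod 2²ⁿ)`, so each iteration doubles the number of correct low bits of the inverse. A standalone sketch (illustrative values, plain `u64` arithmetic instead of the intrinsics) that verifies this numerically:

// Standalone sketch: verify that y <- y * (2 - x*y) doubles the number of
// valid bits of a modular inverse, as the comment in `mod_inv` claims.
fn main() {
    let x: u64 = 11; // odd, hence invertible modulo any power of two
    let mut y: u64 = 3; // seed: 11 * 3 == 33 ≡ 1 (mod 16), cf. INV_TABLE_MOD_16
    let mut modulus: u64 = 16;
    while modulus < (1u64 << 32) {
        // One lifting step; wrapping arithmetic is fine because only the
        // low bits (mod `modulus`) are ever inspected.
        y = y.wrapping_mul(2u64.wrapping_sub(x.wrapping_mul(y)));
        modulus *= modulus;
        assert_eq!(x.wrapping_mul(y) % modulus, 1);
    }
    println!("inverse of {x} mod 2^32 = {}", y % (1u64 << 32));
}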