Thanks for using Compiler Explorer
Sponsors
Jakt
C++
Ada
Analysis
Android Java
Android Kotlin
Assembly
C
C3
Carbon
C++ (Circle)
CIRCT
Clean
CMake
CMakeScript
COBOL
C++ for OpenCL
MLIR
Cppx
Cppx-Blue
Cppx-Gold
Cpp2-cppfront
Crystal
C#
CUDA C++
D
Dart
Elixir
Erlang
Fortran
F#
GLSL
Go
Haskell
HLSL
Hook
Hylo
IL
ispc
Java
Julia
Kotlin
LLVM IR
LLVM MIR
Modula-2
Nim
Objective-C
Objective-C++
OCaml
Odin
OpenCL C
Pascal
Pony
Python
Racket
Ruby
Rust
Snowball
Scala
Slang
Solidity
Spice
SPIR-V
Swift
LLVM TableGen
Toit
TypeScript Native
V
Vala
Visual Basic
Vyper
WASM
Zig
JavaScript
GIMPLE
Ygen
rust source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
mrustc (master)
rustc 1.0.0
rustc 1.1.0
rustc 1.10.0
rustc 1.11.0
rustc 1.12.0
rustc 1.13.0
rustc 1.14.0
rustc 1.15.1
rustc 1.16.0
rustc 1.17.0
rustc 1.18.0
rustc 1.19.0
rustc 1.2.0
rustc 1.20.0
rustc 1.21.0
rustc 1.22.0
rustc 1.23.0
rustc 1.24.0
rustc 1.25.0
rustc 1.26.0
rustc 1.27.0
rustc 1.27.1
rustc 1.28.0
rustc 1.29.0
rustc 1.3.0
rustc 1.30.0
rustc 1.31.0
rustc 1.32.0
rustc 1.33.0
rustc 1.34.0
rustc 1.35.0
rustc 1.36.0
rustc 1.37.0
rustc 1.38.0
rustc 1.39.0
rustc 1.4.0
rustc 1.40.0
rustc 1.41.0
rustc 1.42.0
rustc 1.43.0
rustc 1.44.0
rustc 1.45.0
rustc 1.45.2
rustc 1.46.0
rustc 1.47.0
rustc 1.48.0
rustc 1.49.0
rustc 1.5.0
rustc 1.50.0
rustc 1.51.0
rustc 1.52.0
rustc 1.53.0
rustc 1.54.0
rustc 1.55.0
rustc 1.56.0
rustc 1.57.0
rustc 1.58.0
rustc 1.59.0
rustc 1.6.0
rustc 1.60.0
rustc 1.61.0
rustc 1.62.0
rustc 1.63.0
rustc 1.64.0
rustc 1.65.0
rustc 1.66.0
rustc 1.67.0
rustc 1.68.0
rustc 1.69.0
rustc 1.7.0
rustc 1.70.0
rustc 1.71.0
rustc 1.72.0
rustc 1.73.0
rustc 1.74.0
rustc 1.75.0
rustc 1.76.0
rustc 1.77.0
rustc 1.78.0
rustc 1.79.0
rustc 1.8.0
rustc 1.80.0
rustc 1.81.0
rustc 1.82.0
rustc 1.83.0
rustc 1.9.0
rustc beta
rustc nightly
rustc-cg-gcc (master)
x86-64 GCCRS (GCC master)
x86-64 GCCRS (GCCRS master)
x86-64 GCCRS 14.1 (GCC assertions)
x86-64 GCCRS 14.1 (GCC)
x86-64 GCCRS 14.2 (GCC assertions)
x86-64 GCCRS 14.2 (GCC)
Options
Source code
//! Advent-of-Code day 9, part 2: disk-compaction checksum, heavily optimized with
//! `std::simd`. The file-span table is modelled with fixed-capacity inline
//! `MiniVec`s sized so that exactly two fit in one 256-bit SIMD store.
//!
//! NOTE(review): this is competition-style unsafe code; several `unsafe` blocks
//! rely on properties of the puzzle input (exactly 10,000 files, 64-byte-aligned
//! buffer, length a multiple of 32) that are asserted only in debug builds or not
//! at all — see the hedged notes below.

#![feature(array_chunks, array_windows, portable_simd, pointer_is_aligned_to)]

use std::{
    simd::{cmp::SimdPartialOrd, u16x16, u8x32, u8x64, u8x8},
};

/// 64-byte-sized, 64-byte-aligned unit used purely to force the allocator to hand
/// back 64-byte-aligned storage in `aligned_vec`.
#[repr(C, align(64))]
struct AlignToSixtyFour([u8; 64]);

/// This creates a vector with its data aligned to 64 bytes. This allows us to use faster aligned
/// loads for SIMD operations.
///
/// adapted from: https://stackoverflow.com/a/60180226/3833068
///
/// NOTE(review): `Vec::from_raw_parts` requires the pointer to have been allocated
/// with the same layout (size *and* alignment) as a `Vec<T>` would use. Here the
/// buffer was allocated for `AlignToSixtyFour` (align 64) but is handed to
/// `Vec<T>` (align of `T`), so any reallocation or drop of the returned vec is
/// technically undefined behavior per the `from_raw_parts` contract — it happens
/// to work with the default allocator, but this should be confirmed/fixed.
fn aligned_vec<T>(n_bytes: usize) -> Vec<T> {
    assert_eq!(
        std::mem::size_of::<AlignToSixtyFour>() % std::mem::size_of::<T>(),
        0,
        "64 must be divisible by the size of `T` in bytes"
    );

    // Lazy math to ensure we always have enough.
    let n_units = (n_bytes / std::mem::size_of::<AlignToSixtyFour>()) + 1;

    let mut aligned: Vec<AlignToSixtyFour> = Vec::with_capacity(n_units);

    // Capture the raw parts, then forget the original vec so it does not free the
    // buffer we are about to hand to the new `Vec<T>`.
    let ptr = aligned.as_mut_ptr();
    let len_units = aligned.len();
    let cap_units = aligned.capacity();

    std::mem::forget(aligned);

    unsafe {
        // Length/capacity are re-expressed in units of `T`; the divisibility
        // assert above guarantees these divisions are exact.
        Vec::from_raw_parts(
            ptr as *mut T,
            (len_units * std::mem::size_of::<AlignToSixtyFour>()) / std::mem::size_of::<T>(),
            (cap_units * std::mem::size_of::<AlignToSixtyFour>()) / std::mem::size_of::<T>(),
        )
    }
}

/// Highest file ID in the input (i.e. the input describes MAX_ID + 1 files).
const MAX_ID: usize = 9_999;

/// Parses the dense ASCII disk map into three parallel, 64-byte-aligned tables:
///
/// - `orig_counts[id]`   — original size digit (0-9) of file `id`, plus a zeroed
///   sentinel entry at index `MAX_ID + 1` (see comment below)
/// - `empty_spaces[span]` — free space remaining after each file span
/// - `slots[span]`        — `MiniVec` of file IDs currently stored in each span,
///   initialized so span `i` holds exactly file `i`
///
/// NOTE(review): assumes `input` is 64-byte aligned (debug-asserted below), that
/// its length is a multiple of 32 (tail bytes past the last full 32-byte batch
/// are silently ignored), and that `id_count >= MAX_ID + 1` so the unchecked
/// sentinel write below is in bounds — TODO confirm against the real input.
fn parse_input_p2(input: &[u8]) -> (Vec<u8>, Vec<u8>, Vec<MiniVec>) {
    // Input alternates size,free,size,free,...; an odd length means a trailing
    // size digit with no free digit after it.
    let id_count = if input.len() % 2 == 1 {
        input.len() / 2 + 1
    } else {
        input.len() / 2
    };

    let mut orig_counts: Vec<u8> = aligned_vec(id_count + 1);
    // NOTE(review): `set_len` over uninitialized bytes; entries are expected to be
    // fully overwritten by the SIMD parse loop below — verify for non-multiple-of-32
    // inputs.
    unsafe { orig_counts.set_len(id_count + 1) };
    // this sets a special element at `orig_counts[MAX_ID + 1]` which is used to facilitate
    // efficient `pop_front()` of the minivecs that happens when one of the files is moved down to
    // a different span.
    //
    // The ID of the removed slot is set to `MAX_ID + 1` which we hard-code to zero here. This has
    // a result of causing the computed checksum for that slot to be zero while avoiding the need
    // to do complicated stuff like shift elements down, add state to track whether the first
    // element has been removed, etc.
    unsafe { *orig_counts.get_unchecked_mut(MAX_ID + 1) = 0 };

    let mut empty_spaces: Vec<u8> = aligned_vec(id_count);
    unsafe { empty_spaces.set_len(id_count) };

    let mut slots: Vec<MiniVec> = aligned_vec(id_count * std::mem::size_of::<MiniVec>());
    unsafe { slots.set_len(id_count) };

    // initialize the memory layout for the minivecs manually using SIMD.
    //
    // this sets them all up to have a length of one with a single element corresponding to file
    // index `i` as the first and only element.
    unsafe {
        // Two `MiniVec`s (16 bytes each) viewed as one `u16x16` lane pattern:
        // [len=1, id=0, 0...] [len=1, id=1, 0...]
        let data: [u16; 16] = std::mem::transmute([
            MiniVec {
                len: 1,
                elements: [
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                ],
                padding: 0,
            },
            MiniVec {
                len: 1,
                elements: [
                    Slot { id: 1 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                ],
                padding: 0,
            },
        ]);

        /// how many minivecs are set per SIMD store
        const CHUNK_SIZE: usize = 2;

        let mut data = u16x16::from_array(data);

        // Per-iteration increment: bumps only the first element's `id` lane of each
        // of the two minivecs by CHUNK_SIZE, leaving `len` and the other lanes alone.
        let to_add: [u16; 16] = std::mem::transmute([
            MiniVec {
                len: 0,
                elements: [
                    Slot {
                        id: CHUNK_SIZE as u16,
                    },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                ],
                padding: 0,
            },
            MiniVec {
                len: 0,
                elements: [
                    Slot {
                        id: CHUNK_SIZE as u16,
                    },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                ],
                padding: 0,
            },
        ]);

        // Layout invariants the transmutes and the vector stores below depend on.
        assert_eq!(std::mem::size_of::<MiniVec>(), 16);
        assert_eq!(
            std::mem::size_of::<MiniVec>() * CHUNK_SIZE,
            std::mem::size_of::<u16x16>()
        );
        debug_assert_eq!(slots.len() % CHUNK_SIZE, 0);

        let to_add = u16x16::from_array(to_add);
        let start = slots.as_mut_ptr() as *mut u16x16;
        for chunk_ix in 0..(slots.len() / CHUNK_SIZE) {
            let out_ptr = start.add(chunk_ix);
            out_ptr.write(data);
            data += to_add;

            // Debug-only check that the SIMD store produced exactly the layout the
            // commented-out scalar loop below would have.
            debug_assert_eq!(
                &slots[chunk_ix * CHUNK_SIZE..chunk_ix * CHUNK_SIZE + CHUNK_SIZE],
                &[
                    MiniVec {
                        len: 1,
                        elements: [
                            Slot {
                                id: (chunk_ix * CHUNK_SIZE) as u16
                            },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                        ],
                        padding: 0,
                    },
                    MiniVec {
                        len: 1,
                        elements: [
                            Slot {
                                id: (chunk_ix * CHUNK_SIZE + 1) as u16
                            },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                        ],
                        padding: 0,
                    }
                ]
            )
        }

        // scalar equivalent of the SIMD initialization above, kept for reference:
        // for i in 0..id_count {
        //     slots.get_unchecked_mut(i).len = 1;
        //     slots.get_unchecked_mut(i).elements[0].id = i as _;
        // }
    }

    debug_assert!(input.as_ptr().is_aligned_to(std::mem::align_of::<u8x64>()));

    const VECTOR_LEN: usize = 32;
    const STORE_VECTOR_LEN: usize = VECTOR_LEN / 2;
    let batch_count = input.len() / VECTOR_LEN;
    for batch_ix in 0..batch_count {
        let vec: u8x32 =
            unsafe { std::ptr::read(input.as_ptr().add(batch_ix * VECTOR_LEN) as *const _) };
        // convert from ascii digits to bytes representing the digit ('0' -> 0)
        let converted = vec - u8x32::splat(48);
        // split out from size,free,size,free to ([size,size], [free,free])
        let (sizes, frees) = converted.deinterleave(converted);
        // the de-interleave duplicates the results, so keeping only the first half is correct
        let sizes = sizes.resize::<STORE_VECTOR_LEN>(STORE_VECTOR_LEN as u8);
        let frees = frees.resize::<STORE_VECTOR_LEN>(STORE_VECTOR_LEN as u8);
        unsafe {
            // 16 sizes and 16 frees per batch, stored as whole vectors.
            let frees_ptr = empty_spaces.as_mut_ptr().add(batch_ix * STORE_VECTOR_LEN) as *mut _;
            *frees_ptr = frees;
            let orig_counts_ptr =
                orig_counts.as_mut_ptr().add(batch_ix * STORE_VECTOR_LEN) as *mut _;
            *orig_counts_ptr = sizes;
        }
    }

    (orig_counts, empty_spaces, slots)
}

/// A file ID occupying part of a span. The file's size is not stored inline; it
/// is looked up through `orig_counts` via `count()`.
#[repr(align(2))]
#[derive(Debug, Clone, Copy, PartialEq)]
struct Slot {
    pub id: u16,
}

impl Slot {
    /// Original size digit of this file.
    ///
    /// NOTE(review): unchecked index — sound only while every live `id` is at most
    /// `MAX_ID + 1` (the sentinel) and `orig_counts` covers that range.
    pub fn count(&self, orig_counts: &[u8]) -> u8 {
        unsafe { *orig_counts.get_unchecked(self.id as usize) }
    }
}

/// Fixed-capacity inline vector of up to 6 `Slot`s. With the explicit padding it
/// is exactly 16 bytes and 16-byte aligned, so two of them tile one `u16x16`
/// store (see the SIMD initialization in `parse_input_p2`).
#[repr(align(16))]
#[derive(Clone, Debug, PartialEq)]
struct MiniVec {
    pub len: u16,
    pub elements: [Slot; 6],
    pub padding: u16,
}

impl MiniVec {
    /// Appends a slot without a capacity check (debug-asserted after the fact);
    /// callers must never push more than 6 elements total.
    fn push(&mut self, item: Slot) {
        unsafe {
            *self.elements.get_unchecked_mut(self.len as usize) = item;
        }
        self.len += 1;
        debug_assert!(self.len as usize <= self.elements.len());
    }

    /// Logically removes the first element without shifting: its `id` is redirected
    /// to the sentinel `MAX_ID + 1`, whose `orig_counts` entry is hard-coded to 0,
    /// so the slot contributes nothing to the final checksum.
    fn pop_front(&mut self) {
        // straightforward shifting implementation, kept for reference (slower):
        // let out = self.elements[0];
        // for i in 1..self.len {
        //     unsafe {
        //         *self.elements.get_unchecked_mut(i as usize - 1) = self.elements[i as usize];
        //     }
        // }
        // self.len -= 1;

        // \/ does not help
        // if self.len == 1 {
        //     self.len = 0;
        //     return;
        // }

        // we should only ever mutate the vector once
        debug_assert!(self.elements[0].id != MAX_ID as u16 + 1);
        // this is a nice trick I came up with to accomplish the equivalent
        self.elements[0].id = MAX_ID as u16 + 1;
    }

    /// View of the live elements. Unchecked slice; relies on the `push` invariant
    /// that `len <= 6`.
    fn as_slice(&self) -> &[Slot] {
        unsafe { self.elements.get_unchecked(..self.len as usize) }
    }
}

/// `ADD_FACTORIAL_LUT[n] == 0 + 1 + ... + (n - 1)` for n in 0..=10, i.e. the
/// triangular numbers, used by `Slot::checksum` below. A size digit is at most 9,
/// hence 11 entries suffice.
const ADD_FACTORIAL_LUT: [usize; 11] = [
    0,
    0,
    1,
    2 + 1,
    3 + 2 + 1,
    4 + 3 + 2 + 1,
    5 + 4 + 3 + 2 + 1,
    6 + 5 + 4 + 3 + 2 + 1,
    7 + 6 + 5 + 4 + 3 + 2 + 1,
    8 + 7 + 6 + 5 + 4 + 3 + 2 + 1,
    9 + 8 + 7 + 6 + 5 + 4 + 3 + 2 + 1,
];

impl Slot {
    /// Closed-form checksum contribution of this file, given that `*total_prev`
    /// blocks precede it on disk. Also advances `*total_prev` past this file.
    fn checksum(&self, total_prev: &mut usize, orig_counts: &[u8]) -> usize {
        // naive impl:
        // (0..self.count)
        //     .map(|i| (total_prev + i as usize) * self.id as usize)
        //     .sum::<usize>()

        // So, this condenses down to a sum of the following:
        //
        // (total_prev + 0) * id
        // (total_prev + 1) * id
        // (total_prev + 2) * id
        // ...
        // (total_prev + (count - 1)) * id
        //
        // the `total_prev` part can be split out:
        // total_prev * self.count * id
        //
        // leaving that base plus a sum of the following:
        //
        // 0 * id
        // 1 * id
        // 2 * id
        // ...
        // (count - 1) * id
        //
        // this reduces to (0 + 1 + 2 + ... + (count - 1)) * id
        //
        // and since count is always [0,9], we can use a tiny LUT for this which makes this whole
        // checksum essentially constant time
        let count = self.count(orig_counts) as usize;
        let checksum = *total_prev * count * self.id as usize
            + unsafe { *ADD_FACTORIAL_LUT.get_unchecked(count) } * self.id as usize;
        *total_prev += count;
        checksum
    }
}

/// Solves part 2: moves each file (highest ID first) into the leftmost span that
/// can hold it whole, then returns the resulting filesystem checksum.
pub fn part2(raw_input: &[u8]) -> usize {
    let (counts, mut empty_spaces, mut slots) = parse_input_p2(raw_input);

    /// Checksum of one span: sum over its live slots, then advance the running
    /// block offset past the span's trailing free space.
    fn checksum(
        slots: &[Slot],
        empty_space: u8,
        total_prev: &mut usize,
        orig_counts: &[u8],
    ) -> usize {
        let mut sum = 0usize;
        for slot in slots {
            sum += slot.checksum(total_prev, orig_counts);
        }
        *total_prev += empty_space as usize;
        sum
    }

    // For each needed size 0-9, the span index from which to start searching for
    // free space; `u16::MAX` marks a size as finished (no fit can ever be found).
    let mut start_span_ix_by_needed_size: [u16; 10] = [0; 10];
    let mut finished_digit_count = 0usize;

    // we keep track of the highest span that still has a value in it.
    //
    // this allows us to skip iterating over fully empty spans at the end when computing the
    // checksum
    let mut max_unmoved_src_id = 0;

    'outer: for src_id in (0..=MAX_ID).rev() {
        let src_count = unsafe { *counts.get_unchecked(src_id) };
        let start_ix =
            unsafe { *start_span_ix_by_needed_size.get_unchecked(src_count as usize) } as usize;
        // we can only move elements to the left
        if start_ix >= src_id {
            if start_ix != u16::MAX as usize {
                max_unmoved_src_id = max_unmoved_src_id.max(src_id);
                debug_assert!(slots[max_unmoved_src_id + 1..]
                    .iter()
                    .all(|s| s.as_slice().is_empty()
                        || s.as_slice()
                            .iter()
                            .all(|s| s.id == 0 || s.count(&counts) == 0)));
                finished_digit_count += 1;
                if finished_digit_count == 9 {
                    debug_assert_eq!(
                        start_span_ix_by_needed_size[0], 0,
                        "there are never zero-size files in the inputs apparently"
                    );
                    break;
                }

                // mark this finished and all bigger digits too
                // unsafe {
                //     start_span_ix_by_needed_size
                //         .get_unchecked_mut(src_count as usize..10)
                //         .fill(u16::MAX);
                // }

                // this is faster than above and creates huge diff in the resulting assembly
                // across the whole binary, although the callsite looks almost identical with a
                // `memset` still getting generated.
                //
                // also, `get_unchecked_mut` generates identical code; bounds checks are elided
                // automatically somehow
                for i in src_count as usize..10 {
                    start_span_ix_by_needed_size[i] = u16::MAX;
                }
            }
            continue;
        }

        let src_id = src_id as u16;
        // SIMD scan for the first span with at least `src_count` free bytes,
        // 8 spans per iteration starting from this size's cached start index.
        let start_ptr = unsafe { empty_spaces.as_ptr().add(start_ix) };
        let mut cur_ptr = start_ptr;
        let mut cur_offset = 0usize;
        let mut dst_span_ix = loop {
            const VEC_SIZE: usize = 8usize;

            // let end_ix = start_ix + cur_offset + VEC_SIZE;

            // same caveat as before. For a 100% correct implementation for all possible inputs,
            // we'd need to handle manually checking the tail here but I'm leaving that out
            //
            // I could leave this off if I wanted to and it wouldn't be an issue...
            // if end_ix > input.len() - VEC_SIZE {
            //     start_span_ix_by_needed_size[src_count as usize] = usize::MAX;
            //     finished_digit_count += 1;
            //     max_unmoved_src_id = max_unmoved_src_id.max(src_id as usize);
            //     continue 'outer;
            // }

            let empty_spaces_v: u8x8 = unsafe { std::ptr::read_unaligned(cur_ptr as *const _) };
            debug_assert_eq!(empty_spaces_v.len(), VEC_SIZE);
            let mask = empty_spaces_v.simd_ge(u8x8::splat(src_count));
            match mask.first_set() {
                Some(i) => {
                    let dst_span_ix = start_ix + cur_offset + i;
                    // only fits at or after the file's own span -> this size (and this
                    // file) can never move; mark the size finished
                    if dst_span_ix >= src_id as usize {
                        unsafe {
                            *start_span_ix_by_needed_size.get_unchecked_mut(src_count as usize) =
                                u16::MAX
                        };
                        finished_digit_count += 1;
                        max_unmoved_src_id = max_unmoved_src_id.max(src_id as usize);
                        continue 'outer;
                    }
                    debug_assert!(empty_spaces[dst_span_ix] >= src_count);
                    break dst_span_ix;
                },
                None => {
                    cur_ptr = unsafe { cur_ptr.add(VEC_SIZE) };
                    cur_offset += VEC_SIZE
                },
            }
        };

        // Move the file: append its ID to the destination span and shrink that
        // span's free space.
        let dst_slots: &mut MiniVec = unsafe { slots.get_unchecked_mut(dst_span_ix) };
        max_unmoved_src_id = max_unmoved_src_id.max(dst_span_ix);
        dst_slots.push(Slot { id: src_id });
        let new_empty_space = unsafe { *empty_spaces.get_unchecked(dst_span_ix) - src_count };
        unsafe { *empty_spaces.get_unchecked_mut(dst_span_ix) = new_empty_space };

        // no way we could fit more of this size into this span, so might as well move on to the
        // next one before continuing to loop
        if new_empty_space < src_count {
            dst_span_ix += 1;
        }
        unsafe {
            *start_span_ix_by_needed_size.get_unchecked_mut(src_count as usize) =
                dst_span_ix as u16
        };

        // \/ this code uses the fact that if a span of size `src_count` can't fit before
        // `dst_span_ix`, then no bigger span could either.
        //
        // However, it turns out to make things slower - especially when compiling with
        // `target-cpu=native`. That causes some fancy SIMD that performs this operation using
        // masks and whatnot to be emitted, but that turns out to be way slower than the scalar
        // version.
        //
        // Anyway, just skipping all this work here seems to be the fastest method of them all,
        // probably because our SIMD free slot search is fast enough to make up for the savings of
        // doing the more fancy accounting after the bookkeeping overhead.
        //
        // for i in src_count as usize..10 {
        //     start_span_ix_by_needed_size[i] = start_span_ix_by_needed_size[i].max(dst_span_ix);
        // }

        // the element we're removing is at the first index of the array since any others added to
        // this span will have been put after it
        let src_slots = unsafe { slots.get_unchecked_mut(src_id as usize) };
        // debug_assert_eq!(src_slots.elements[0].id, src_id);
        // NOTE(review): unchecked `src_id - 1` index — assumes file 0 is never moved
        // (its start_ix >= src_id check always routes it to `continue`), so
        // `src_id >= 1` here; confirm.
        unsafe { *empty_spaces.get_unchecked_mut(src_id as usize - 1) += src_count };
        src_slots.pop_front();
    }

    // Final pass: accumulate each span's checksum, skipping the all-empty tail
    // above `max_unmoved_src_id`.
    let mut out = 0usize;
    let mut total_prev = 0usize;
    for (slot, &empty_count) in unsafe { slots.get_unchecked_mut(..=max_unmoved_src_id) }
        .iter()
        .zip(unsafe { empty_spaces.get_unchecked_mut(..=max_unmoved_src_id) }.iter())
    {
        out += checksum(slot.as_slice(), empty_count, &mut total_prev, &counts);
    }
    out
}
Become a Patron
Sponsor on GitHub
Donate via PayPal
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
About the author
Statistics
Changelog
Version tree