Thanks for using Compiler Explorer
Sponsors
Jakt
C++
Ada
Analysis
Android Java
Android Kotlin
Assembly
C
C3
Carbon
C++ (Circle)
CIRCT
Clean
CMake
CMakeScript
COBOL
C++ for OpenCL
MLIR
Cppx
Cppx-Blue
Cppx-Gold
Cpp2-cppfront
Crystal
C#
CUDA C++
D
Dart
Elixir
Erlang
Fortran
F#
GLSL
Go
Haskell
HLSL
Hook
Hylo
IL
ispc
Java
Julia
Kotlin
LLVM IR
LLVM MIR
Modula-2
Nim
Objective-C
Objective-C++
OCaml
Odin
OpenCL C
Pascal
Pony
Python
Racket
Ruby
Rust
Snowball
Scala
Slang
Solidity
Spice
SPIR-V
Swift
LLVM TableGen
Toit
TypeScript Native
V
Vala
Visual Basic
Vyper
WASM
Zig
JavaScript
GIMPLE
Ygen
rust source #1
Output
Compile to binary object
Link to binary
Execute the code
Intel asm syntax
Demangle identifiers
Verbose demangling
Filters
Unused labels
Library functions
Directives
Comments
Horizontal whitespace
Debug intrinsics
Compiler
mrustc (master)
rustc 1.0.0
rustc 1.1.0
rustc 1.10.0
rustc 1.11.0
rustc 1.12.0
rustc 1.13.0
rustc 1.14.0
rustc 1.15.1
rustc 1.16.0
rustc 1.17.0
rustc 1.18.0
rustc 1.19.0
rustc 1.2.0
rustc 1.20.0
rustc 1.21.0
rustc 1.22.0
rustc 1.23.0
rustc 1.24.0
rustc 1.25.0
rustc 1.26.0
rustc 1.27.0
rustc 1.27.1
rustc 1.28.0
rustc 1.29.0
rustc 1.3.0
rustc 1.30.0
rustc 1.31.0
rustc 1.32.0
rustc 1.33.0
rustc 1.34.0
rustc 1.35.0
rustc 1.36.0
rustc 1.37.0
rustc 1.38.0
rustc 1.39.0
rustc 1.4.0
rustc 1.40.0
rustc 1.41.0
rustc 1.42.0
rustc 1.43.0
rustc 1.44.0
rustc 1.45.0
rustc 1.45.2
rustc 1.46.0
rustc 1.47.0
rustc 1.48.0
rustc 1.49.0
rustc 1.5.0
rustc 1.50.0
rustc 1.51.0
rustc 1.52.0
rustc 1.53.0
rustc 1.54.0
rustc 1.55.0
rustc 1.56.0
rustc 1.57.0
rustc 1.58.0
rustc 1.59.0
rustc 1.6.0
rustc 1.60.0
rustc 1.61.0
rustc 1.62.0
rustc 1.63.0
rustc 1.64.0
rustc 1.65.0
rustc 1.66.0
rustc 1.67.0
rustc 1.68.0
rustc 1.69.0
rustc 1.7.0
rustc 1.70.0
rustc 1.71.0
rustc 1.72.0
rustc 1.73.0
rustc 1.74.0
rustc 1.75.0
rustc 1.76.0
rustc 1.77.0
rustc 1.78.0
rustc 1.79.0
rustc 1.8.0
rustc 1.80.0
rustc 1.81.0
rustc 1.82.0
rustc 1.83.0
rustc 1.9.0
rustc beta
rustc nightly
rustc-cg-gcc (master)
x86-64 GCCRS (GCC master)
x86-64 GCCRS (GCCRS master)
x86-64 GCCRS 14.1 (GCC assertions)
x86-64 GCCRS 14.1 (GCC)
x86-64 GCCRS 14.2 (GCC assertions)
x86-64 GCCRS 14.2 (GCC)
Options
Source code
//! Advent-of-Code day 9, part 2: disk-compaction checksum, heavily optimized with
//! `std::simd`. The file-span table is modelled with fixed-capacity inline
//! `MiniVec`s sized so that exactly two fit in one 256-bit SIMD store.
//!
//! NOTE(review): this is competition-style unsafe code; several `unsafe` blocks
//! rely on properties of the puzzle input (exactly 10,000 files, 64-byte-aligned
//! buffer, length a multiple of 32) that are asserted only in debug builds or not
//! at all — see the hedged notes below.

#![feature(array_chunks, array_windows, portable_simd, pointer_is_aligned_to)]

use std::{
    simd::{cmp::SimdPartialOrd, u16x16, u8x32, u8x64, u8x8},
};

/// 64-byte-sized, 64-byte-aligned unit used purely to force the allocator to hand
/// back 64-byte-aligned storage in `aligned_vec`.
#[repr(C, align(64))]
struct AlignToSixtyFour([u8; 64]);

/// This creates a vector with its data aligned to 64 bytes. This allows us to use faster aligned
/// loads for SIMD operations.
///
/// adapted from: https://stackoverflow.com/a/60180226/3833068
///
/// NOTE(review): `Vec::from_raw_parts` requires the pointer to have been allocated
/// with the same layout (size *and* alignment) as a `Vec<T>` would use. Here the
/// buffer was allocated for `AlignToSixtyFour` (align 64) but is handed to
/// `Vec<T>` (align of `T`), so any reallocation or drop of the returned vec is
/// technically undefined behavior per the `from_raw_parts` contract — it happens
/// to work with the default allocator, but this should be confirmed/fixed.
fn aligned_vec<T>(n_bytes: usize) -> Vec<T> {
    assert_eq!(
        std::mem::size_of::<AlignToSixtyFour>() % std::mem::size_of::<T>(),
        0,
        "64 must be divisible by the size of `T` in bytes"
    );

    // Lazy math to ensure we always have enough.
    let n_units = (n_bytes / std::mem::size_of::<AlignToSixtyFour>()) + 1;

    let mut aligned: Vec<AlignToSixtyFour> = Vec::with_capacity(n_units);

    // Capture the raw parts, then forget the original vec so it does not free the
    // buffer we are about to hand to the new `Vec<T>`.
    let ptr = aligned.as_mut_ptr();
    let len_units = aligned.len();
    let cap_units = aligned.capacity();

    std::mem::forget(aligned);

    unsafe {
        // Length/capacity are re-expressed in units of `T`; the divisibility
        // assert above guarantees these divisions are exact.
        Vec::from_raw_parts(
            ptr as *mut T,
            (len_units * std::mem::size_of::<AlignToSixtyFour>()) / std::mem::size_of::<T>(),
            (cap_units * std::mem::size_of::<AlignToSixtyFour>()) / std::mem::size_of::<T>(),
        )
    }
}

/// Highest file ID in the input (i.e. the input describes MAX_ID + 1 files).
const MAX_ID: usize = 9_999;

/// Parses the dense ASCII disk map into three parallel, 64-byte-aligned tables:
///
/// - `orig_counts[id]`   — original size digit (0-9) of file `id`, plus a zeroed
///   sentinel entry at index `MAX_ID + 1` (see comment below)
/// - `empty_spaces[span]` — free space remaining after each file span
/// - `slots[span]`        — `MiniVec` of file IDs currently stored in each span,
///   initialized so span `i` holds exactly file `i`
///
/// NOTE(review): assumes `input` is 64-byte aligned (debug-asserted below), that
/// its length is a multiple of 32 (tail bytes past the last full 32-byte batch
/// are silently ignored), and that `id_count >= MAX_ID + 1` so the unchecked
/// sentinel write below is in bounds — TODO confirm against the real input.
fn parse_input_p2(input: &[u8]) -> (Vec<u8>, Vec<u8>, Vec<MiniVec>) {
    // Input alternates size,free,size,free,...; an odd length means a trailing
    // size digit with no free digit after it.
    let id_count = if input.len() % 2 == 1 {
        input.len() / 2 + 1
    } else {
        input.len() / 2
    };

    let mut orig_counts: Vec<u8> = aligned_vec(id_count + 1);
    // NOTE(review): `set_len` over uninitialized bytes; entries are expected to be
    // fully overwritten by the SIMD parse loop below — verify for non-multiple-of-32
    // inputs.
    unsafe { orig_counts.set_len(id_count + 1) };
    // this sets a special element at `orig_counts[MAX_ID + 1]` which is used to facilitate
    // efficient `pop_front()` of the minivecs that happens when one of the files is moved down to
    // a different span.
    //
    // The ID of the removed slot is set to `MAX_ID + 1` which we hard-code to zero here. This has
    // a result of causing the computed checksum for that slot to be zero while avoiding the need
    // to do complicated stuff like shift elements down, add state to track whether the first
    // element has been removed, etc.
    unsafe { *orig_counts.get_unchecked_mut(MAX_ID + 1) = 0 };

    let mut empty_spaces: Vec<u8> = aligned_vec(id_count);
    unsafe { empty_spaces.set_len(id_count) };

    let mut slots: Vec<MiniVec> = aligned_vec(id_count * std::mem::size_of::<MiniVec>());
    unsafe { slots.set_len(id_count) };

    // initialize the memory layout for the minivecs manually using SIMD.
    //
    // this sets them all up to have a length of one with a single element corresponding to file
    // index `i` as the first and only element.
    unsafe {
        // Two `MiniVec`s (16 bytes each) viewed as one `u16x16` lane pattern:
        // [len=1, id=0, 0...] [len=1, id=1, 0...]
        let data: [u16; 16] = std::mem::transmute([
            MiniVec {
                len: 1,
                elements: [
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                ],
                padding: 0,
            },
            MiniVec {
                len: 1,
                elements: [
                    Slot { id: 1 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                ],
                padding: 0,
            },
        ]);

        /// how many minivecs are set per SIMD store
        const CHUNK_SIZE: usize = 2;

        let mut data = u16x16::from_array(data);

        // Per-iteration increment: bumps only the first element's `id` lane of each
        // of the two minivecs by CHUNK_SIZE, leaving `len` and the other lanes alone.
        let to_add: [u16; 16] = std::mem::transmute([
            MiniVec {
                len: 0,
                elements: [
                    Slot {
                        id: CHUNK_SIZE as u16,
                    },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                ],
                padding: 0,
            },
            MiniVec {
                len: 0,
                elements: [
                    Slot {
                        id: CHUNK_SIZE as u16,
                    },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                    Slot { id: 0 },
                ],
                padding: 0,
            },
        ]);

        // Layout invariants the transmutes and the vector stores below depend on.
        assert_eq!(std::mem::size_of::<MiniVec>(), 16);
        assert_eq!(
            std::mem::size_of::<MiniVec>() * CHUNK_SIZE,
            std::mem::size_of::<u16x16>()
        );
        debug_assert_eq!(slots.len() % CHUNK_SIZE, 0);

        let to_add = u16x16::from_array(to_add);
        let start = slots.as_mut_ptr() as *mut u16x16;
        for chunk_ix in 0..(slots.len() / CHUNK_SIZE) {
            let out_ptr = start.add(chunk_ix);
            out_ptr.write(data);
            data += to_add;

            // Debug-only check that the SIMD store produced exactly the layout the
            // commented-out scalar loop below would have.
            debug_assert_eq!(
                &slots[chunk_ix * CHUNK_SIZE..chunk_ix * CHUNK_SIZE + CHUNK_SIZE],
                &[
                    MiniVec {
                        len: 1,
                        elements: [
                            Slot {
                                id: (chunk_ix * CHUNK_SIZE) as u16
                            },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                        ],
                        padding: 0,
                    },
                    MiniVec {
                        len: 1,
                        elements: [
                            Slot {
                                id: (chunk_ix * CHUNK_SIZE + 1) as u16
                            },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                            Slot { id: 0 },
                        ],
                        padding: 0,
                    }
                ]
            )
        }

        // scalar equivalent of the SIMD initialization above, kept for reference:
        // for i in 0..id_count {
        //     slots.get_unchecked_mut(i).len = 1;
        //     slots.get_unchecked_mut(i).elements[0].id = i as _;
        // }
    }

    debug_assert!(input.as_ptr().is_aligned_to(std::mem::align_of::<u8x64>()));

    const VECTOR_LEN: usize = 32;
    const STORE_VECTOR_LEN: usize = VECTOR_LEN / 2;
    let batch_count = input.len() / VECTOR_LEN;
    for batch_ix in 0..batch_count {
        let vec: u8x32 =
            unsafe { std::ptr::read(input.as_ptr().add(batch_ix * VECTOR_LEN) as *const _) };
        // convert from ascii digits to bytes representing the digit ('0' -> 0)
        let converted = vec - u8x32::splat(48);
        // split out from size,free,size,free to ([size,size], [free,free])
        let (sizes, frees) = converted.deinterleave(converted);
        // the de-interleave duplicates the results, so keeping only the first half is correct
        let sizes = sizes.resize::<STORE_VECTOR_LEN>(STORE_VECTOR_LEN as u8);
        let frees = frees.resize::<STORE_VECTOR_LEN>(STORE_VECTOR_LEN as u8);
        unsafe {
            // 16 sizes and 16 frees per batch, stored as whole vectors.
            let frees_ptr = empty_spaces.as_mut_ptr().add(batch_ix * STORE_VECTOR_LEN) as *mut _;
            *frees_ptr = frees;
            let orig_counts_ptr =
                orig_counts.as_mut_ptr().add(batch_ix * STORE_VECTOR_LEN) as *mut _;
            *orig_counts_ptr = sizes;
        }
    }

    (orig_counts, empty_spaces, slots)
}

/// A file ID occupying part of a span. The file's size is not stored inline; it
/// is looked up through `orig_counts` via `count()`.
#[repr(align(2))]
#[derive(Debug, Clone, Copy, PartialEq)]
struct Slot {
    pub id: u16,
}

impl Slot {
    /// Original size digit of this file.
    ///
    /// NOTE(review): unchecked index — sound only while every live `id` is at most
    /// `MAX_ID + 1` (the sentinel) and `orig_counts` covers that range.
    pub fn count(&self, orig_counts: &[u8]) -> u8 {
        unsafe { *orig_counts.get_unchecked(self.id as usize) }
    }
}

/// Fixed-capacity inline vector of up to 6 `Slot`s. With the explicit padding it
/// is exactly 16 bytes and 16-byte aligned, so two of them tile one `u16x16`
/// store (see the SIMD initialization in `parse_input_p2`).
#[repr(align(16))]
#[derive(Clone, Debug, PartialEq)]
struct MiniVec {
    pub len: u16,
    pub elements: [Slot; 6],
    pub padding: u16,
}

impl MiniVec {
    /// Appends a slot without a capacity check (debug-asserted after the fact);
    /// callers must never push more than 6 elements total.
    fn push(&mut self, item: Slot) {
        unsafe {
            *self.elements.get_unchecked_mut(self.len as usize) = item;
        }
        self.len += 1;
        debug_assert!(self.len as usize <= self.elements.len());
    }

    /// Logically removes the first element without shifting: its `id` is redirected
    /// to the sentinel `MAX_ID + 1`, whose `orig_counts` entry is hard-coded to 0,
    /// so the slot contributes nothing to the final checksum.
    fn pop_front(&mut self) {
        // straightforward shifting implementation, kept for reference (slower):
        // let out = self.elements[0];
        // for i in 1..self.len {
        //     unsafe {
        //         *self.elements.get_unchecked_mut(i as usize - 1) = self.elements[i as usize];
        //     }
        // }
        // self.len -= 1;

        // \/ does not help
        // if self.len == 1 {
        //     self.len = 0;
        //     return;
        // }

        // we should only ever mutate the vector once
        debug_assert!(self.elements[0].id != MAX_ID as u16 + 1);
        // this is a nice trick I came up with to accomplish the equivalent
        self.elements[0].id = MAX_ID as u16 + 1;
    }

    /// View of the live elements. Unchecked slice; relies on the `push` invariant
    /// that `len <= 6`.
    fn as_slice(&self) -> &[Slot] {
        unsafe { self.elements.get_unchecked(..self.len as usize) }
    }
}

/// `ADD_FACTORIAL_LUT[n] == 0 + 1 + ... + (n - 1)` for n in 0..=10, i.e. the
/// triangular numbers, used by `Slot::checksum` below. A size digit is at most 9,
/// hence 11 entries suffice.
const ADD_FACTORIAL_LUT: [usize; 11] = [
    0,
    0,
    1,
    2 + 1,
    3 + 2 + 1,
    4 + 3 + 2 + 1,
    5 + 4 + 3 + 2 + 1,
    6 + 5 + 4 + 3 + 2 + 1,
    7 + 6 + 5 + 4 + 3 + 2 + 1,
    8 + 7 + 6 + 5 + 4 + 3 + 2 + 1,
    9 + 8 + 7 + 6 + 5 + 4 + 3 + 2 + 1,
];

impl Slot {
    /// Closed-form checksum contribution of this file, given that `*total_prev`
    /// blocks precede it on disk. Also advances `*total_prev` past this file.
    fn checksum(&self, total_prev: &mut usize, orig_counts: &[u8]) -> usize {
        // naive impl:
        // (0..self.count)
        //     .map(|i| (total_prev + i as usize) * self.id as usize)
        //     .sum::<usize>()

        // So, this condenses down to a sum of the following:
        //
        // (total_prev + 0) * id
        // (total_prev + 1) * id
        // (total_prev + 2) * id
        // ...
        // (total_prev + (count - 1)) * id
        //
        // the `total_prev` part can be split out:
        // total_prev * self.count * id
        //
        // leaving that base plus a sum of the following:
        //
        // 0 * id
        // 1 * id
        // 2 * id
        // ...
        // (count - 1) * id
        //
        // this reduces to (0 + 1 + 2 + ... + (count - 1)) * id
        //
        // and since count is always [0,9], we can use a tiny LUT for this which makes this whole
        // checksum essentially constant time
        let count = self.count(orig_counts) as usize;
        let checksum = *total_prev * count * self.id as usize
            + unsafe { *ADD_FACTORIAL_LUT.get_unchecked(count) } * self.id as usize;
        *total_prev += count;
        checksum
    }
}

/// Solves part 2: moves each file (highest ID first) into the leftmost span that
/// can hold it whole, then returns the resulting filesystem checksum.
pub fn part2(raw_input: &[u8]) -> usize {
    let (counts, mut empty_spaces, mut slots) = parse_input_p2(raw_input);

    /// Checksum of one span: sum over its live slots, then advance the running
    /// block offset past the span's trailing free space.
    fn checksum(
        slots: &[Slot],
        empty_space: u8,
        total_prev: &mut usize,
        orig_counts: &[u8],
    ) -> usize {
        let mut sum = 0usize;
        for slot in slots {
            sum += slot.checksum(total_prev, orig_counts);
        }
        *total_prev += empty_space as usize;
        sum
    }

    // For each needed size 0-9, the span index from which to start searching for
    // free space; `u16::MAX` marks a size as finished (no fit can ever be found).
    let mut start_span_ix_by_needed_size: [u16; 10] = [0; 10];
    let mut finished_digit_count = 0usize;

    // we keep track of the highest span that still has a value in it.
    //
    // this allows us to skip iterating over fully empty spans at the end when computing the
    // checksum
    let mut max_unmoved_src_id = 0;

    'outer: for src_id in (0..=MAX_ID).rev() {
        let src_count = unsafe { *counts.get_unchecked(src_id) };
        let start_ix =
            unsafe { *start_span_ix_by_needed_size.get_unchecked(src_count as usize) } as usize;
        // we can only move elements to the left
        if start_ix >= src_id {
            if start_ix != u16::MAX as usize {
                max_unmoved_src_id = max_unmoved_src_id.max(src_id);
                debug_assert!(slots[max_unmoved_src_id + 1..]
                    .iter()
                    .all(|s| s.as_slice().is_empty()
                        || s.as_slice()
                            .iter()
                            .all(|s| s.id == 0 || s.count(&counts) == 0)));
                finished_digit_count += 1;
                if finished_digit_count == 9 {
                    debug_assert_eq!(
                        start_span_ix_by_needed_size[0], 0,
                        "there are never zero-size files in the inputs apparently"
                    );
                    break;
                }

                // mark this finished and all bigger digits too
                // unsafe {
                //     start_span_ix_by_needed_size
                //         .get_unchecked_mut(src_count as usize..10)
                //         .fill(u16::MAX);
                // }

                // this is faster than above and creates huge diff in the resulting assembly
                // across the whole binary, although the callsite looks almost identical with a
                // `memset` still getting generated.
                //
                // also, `get_unchecked_mut` generates identical code; bounds checks are elided
                // automatically somehow
                for i in src_count as usize..10 {
                    start_span_ix_by_needed_size[i] = u16::MAX;
                }
            }
            continue;
        }

        let src_id = src_id as u16;
        // SIMD scan for the first span with at least `src_count` free bytes,
        // 8 spans per iteration starting from this size's cached start index.
        let start_ptr = unsafe { empty_spaces.as_ptr().add(start_ix) };
        let mut cur_ptr = start_ptr;
        let mut cur_offset = 0usize;
        let mut dst_span_ix = loop {
            const VEC_SIZE: usize = 8usize;

            // let end_ix = start_ix + cur_offset + VEC_SIZE;

            // same caveat as before. For a 100% correct implementation for all possible inputs,
            // we'd need to handle manually checking the tail here but I'm leaving that out
            //
            // I could leave this off if I wanted to and it wouldn't be an issue...
            // if end_ix > input.len() - VEC_SIZE {
            //     start_span_ix_by_needed_size[src_count as usize] = usize::MAX;
            //     finished_digit_count += 1;
            //     max_unmoved_src_id = max_unmoved_src_id.max(src_id as usize);
            //     continue 'outer;
            // }

            let empty_spaces_v: u8x8 = unsafe { std::ptr::read_unaligned(cur_ptr as *const _) };
            debug_assert_eq!(empty_spaces_v.len(), VEC_SIZE);
            let mask = empty_spaces_v.simd_ge(u8x8::splat(src_count));
            match mask.first_set() {
                Some(i) => {
                    let dst_span_ix = start_ix + cur_offset + i;
                    // only fits at or after the file's own span -> this size (and this
                    // file) can never move; mark the size finished
                    if dst_span_ix >= src_id as usize {
                        unsafe {
                            *start_span_ix_by_needed_size.get_unchecked_mut(src_count as usize) =
                                u16::MAX
                        };
                        finished_digit_count += 1;
                        max_unmoved_src_id = max_unmoved_src_id.max(src_id as usize);
                        continue 'outer;
                    }
                    debug_assert!(empty_spaces[dst_span_ix] >= src_count);
                    break dst_span_ix;
                },
                None => {
                    cur_ptr = unsafe { cur_ptr.add(VEC_SIZE) };
                    cur_offset += VEC_SIZE
                },
            }
        };

        // Move the file: append its ID to the destination span and shrink that
        // span's free space.
        let dst_slots: &mut MiniVec = unsafe { slots.get_unchecked_mut(dst_span_ix) };
        max_unmoved_src_id = max_unmoved_src_id.max(dst_span_ix);
        dst_slots.push(Slot { id: src_id });
        let new_empty_space = unsafe { *empty_spaces.get_unchecked(dst_span_ix) - src_count };
        unsafe { *empty_spaces.get_unchecked_mut(dst_span_ix) = new_empty_space };

        // no way we could fit more of this size into this span, so might as well move on to the
        // next one before continuing to loop
        if new_empty_space < src_count {
            dst_span_ix += 1;
        }
        unsafe {
            *start_span_ix_by_needed_size.get_unchecked_mut(src_count as usize) =
                dst_span_ix as u16
        };

        // \/ this code uses the fact that if a span of size `src_count` can't fit before
        // `dst_span_ix`, then no bigger span could either.
        //
        // However, it turns out to make things slower - especially when compiling with
        // `target-cpu=native`. That causes some fancy SIMD that performs this operation using
        // masks and whatnot to be emitted, but that turns out to be way slower than the scalar
        // version.
        //
        // Anyway, just skipping all this work here seems to be the fastest method of them all,
        // probably because our SIMD free slot search is fast enough to make up for the savings of
        // doing the more fancy accounting after the bookkeeping overhead.
        //
        // for i in src_count as usize..10 {
        //     start_span_ix_by_needed_size[i] = start_span_ix_by_needed_size[i].max(dst_span_ix);
        // }

        // the element we're removing is at the first index of the array since any others added to
        // this span will have been put after it
        let src_slots = unsafe { slots.get_unchecked_mut(src_id as usize) };
        // debug_assert_eq!(src_slots.elements[0].id, src_id);
        // NOTE(review): unchecked `src_id - 1` index — assumes file 0 is never moved
        // (its start_ix >= src_id check always routes it to `continue`), so
        // `src_id >= 1` here; confirm.
        unsafe { *empty_spaces.get_unchecked_mut(src_id as usize - 1) += src_count };
        src_slots.pop_front();
    }

    // Final pass: accumulate each span's checksum, skipping the all-empty tail
    // above `max_unmoved_src_id`.
    let mut out = 0usize;
    let mut total_prev = 0usize;
    for (slot, &empty_count) in unsafe { slots.get_unchecked_mut(..=max_unmoved_src_id) }
        .iter()
        .zip(unsafe { empty_spaces.get_unchecked_mut(..=max_unmoved_src_id) }.iter())
    {
        out += checksum(slot.as_slice(), empty_count, &mut total_prev, &counts);
    }
    out
}
Become a Patron
Sponsor on GitHub
Donate via PayPal
Source on GitHub
Mailing list
Installed libraries
Wiki
Report an issue
How it works
Contact the author
CE on Mastodon
CE on Bluesky
About the author
Statistics
Changelog
Version tree