// Compiler Explorer

// Source code

use core::mem;

/// Appends `ch` to the end of `s`, encoding it as UTF-8 straight into the
/// string's spare capacity.
pub fn push(s: &mut String, ch: char) {
    let old_len = s.len();
    let width = ch.len_utf8();
    // Grow first so the spare capacity can hold every byte of `ch`.
    s.reserve(width);

    // SAFETY: `reserve` guarantees at least `width` spare bytes, which is what the
    // encoder needs for `ch`. Those bytes are filled with the UTF-8 encoding of a
    // `char` before the length is extended to cover them.
    unsafe {
        let bytes = s.as_mut_vec();
        encode_utf8_raw_unchecked(u32::from(ch), bytes.spare_capacity_mut());
        bytes.set_len(old_len + width);
    }
}

/// Tag bits (`10xx_xxxx`) OR-ed into every continuation byte.
const TAG_CONT: u8 = 0x80;
/// Tag bits (`110x_xxxx`) OR-ed into the lead byte of a two-byte sequence.
const TAG_TWO_B: u8 = 0xC0;
/// Tag bits (`1110_xxxx`) OR-ed into the lead byte of a three-byte sequence.
const TAG_THREE_B: u8 = 0xE0;
/// Tag bits (`1111_0xxx`) OR-ed into the lead byte of a four-byte sequence.
const TAG_FOUR_B: u8 = 0xF0;
/// Exclusive upper bound of the values encoded in one byte.
const MAX_ONE_B: u32 = 1 << 7;
/// Exclusive upper bound of the values encoded in two bytes.
const MAX_TWO_B: u32 = 1 << 11;
/// Exclusive upper bound of the values encoded in three bytes.
const MAX_THREE_B: u32 = 1 << 16;

/// Returns how many bytes (1 to 4) are used to encode `code`.
///
/// `code` is a raw `u32`: it is only compared against the three width
/// thresholds, so every value at or above `MAX_THREE_B` reports four bytes.
#[inline]
const fn len_utf8(code: u32) -> usize {
    // Check from the widest encoding down; the first threshold reached decides.
    if code >= MAX_THREE_B {
        return 4;
    }
    if code >= MAX_TWO_B {
        return 3;
    }
    if code >= MAX_ONE_B {
        return 2;
    }
    1
}

/// Writes the UTF-8 encoding of the raw value `code` to the start of `dst` and
/// returns the sub-slice of `dst` holding exactly the encoded bytes.
///
/// In contrast to `char::encode_utf8`, surrogate code points are accepted.
/// (A `char` can never hold a surrogate; creating one is UB.)
/// For such inputs the output is valid [generalized UTF-8] but not valid UTF-8.
///
/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
///
/// # Panics
///
/// Panics if `dst` is shorter than the encoding of `code`.
/// Four bytes are always enough to encode any `char`.
#[inline]
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
    let needed = len_utf8(code);
    let available = dst.len();
    assert!(
        available >= needed,
        "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
        needed,
        code,
        available,
    );
    // SAFETY: `MaybeUninit<u8>` has the same layout as `u8`, and every initialized
    // byte is a valid `MaybeUninit<u8>`. The callee only stores initialized bytes
    // through this view, so `dst` never ends up holding uninitialized memory.
    let uninit = unsafe { &mut *(dst as *mut [u8] as *mut [mem::MaybeUninit<u8>]) };
    // SAFETY: the assertion above established that the buffer holds at least
    // `needed` bytes, which is what the unchecked encoder requires.
    unsafe { encode_utf8_raw_unchecked(code, uninit) }
}

/// Writes the UTF-8 encoding of the raw value `code` to the start of the
/// possibly uninitialized buffer `dst`, and returns the now-initialized
/// sub-slice holding exactly the encoded bytes.
///
/// In contrast to `char::encode_utf8`, surrogate code points are accepted.
/// (A `char` can never hold a surrogate; creating one is UB.)
/// For such inputs the output is valid [generalized UTF-8] but not valid UTF-8.
///
/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
///
/// # Safety
///
/// `dst` must be long enough to hold the encoded code point; otherwise the
/// behavior is undefined. Four bytes are always enough to encode any `char`.
///
/// See [`encode_utf8_raw`] for a safe, length-checked variant.
#[inline]
pub unsafe fn encode_utf8_raw_unchecked(code: u32, dst: &mut [mem::MaybeUninit<u8>]) -> &mut [u8] {
    let len = len_utf8(code);

    // Six payload bits of `code` starting at bit `shift`, tagged as a continuation byte.
    let cont = |shift: u32| (code >> shift & 0x3F) as u8 | TAG_CONT;
    // Stores `byte` at index `at` without a bounds check.
    let mut put = |at: usize, byte: u8| {
        // SAFETY: `put` is only invoked below with `at < len`, and the caller of
        // the enclosing function guarantees `dst` is at least `len` bytes long.
        unsafe {
            dst.get_unchecked_mut(at).write(byte);
        }
    };

    // One lead byte (plain ASCII when `len == 1`) followed by `len - 1`
    // continuation bytes, most significant payload bits first.
    match len {
        1 => put(0, code as u8),
        2 => {
            put(0, (code >> 6 & 0x1F) as u8 | TAG_TWO_B);
            put(1, cont(0));
        }
        3 => {
            put(0, (code >> 12 & 0x0F) as u8 | TAG_THREE_B);
            put(1, cont(6));
            put(2, cont(0));
        }
        4 => {
            put(0, (code >> 18 & 0x07) as u8 | TAG_FOUR_B);
            put(1, cont(12));
            put(2, cont(6));
            put(3, cont(0));
        }
        _ => unreachable!(),
    }

    // SAFETY: the caller guarantees `len <= dst.len()`, and each of the first
    // `len` bytes was initialized by the match above, so viewing that prefix
    // as `[u8]` is sound.
    unsafe { &mut *(dst.get_unchecked_mut(..len) as *mut [mem::MaybeUninit<u8>] as *mut [u8]) }
}