From c11576e16a8bd305d653a77c6bc4edfe2607b0eb Mon Sep 17 00:00:00 2001 From: Yaroslav Bolyukin Date: Thu, 26 May 2022 18:05:31 +0000 Subject: [PATCH] perf: specialized Rc for interner --- --- a/crates/jrsonnet-interner/Cargo.toml +++ b/crates/jrsonnet-interner/Cargo.toml @@ -6,7 +6,12 @@ license = "MIT" edition = "2021" +[features] +default = ["serde"] +serde = ["dep:serde"] + [dependencies] -serde = { version = "1.0" } +serde = { version = "1.0", optional = true } rustc-hash = "1.1" gcmodule = { git = "https://github.com/CertainLach/gcmodule", branch = "jrsonnet" } +hashbrown = { version = "0.12.1", features = ["inline-more"] } --- /dev/null +++ b/crates/jrsonnet-interner/src/inner.rs @@ -0,0 +1,234 @@ +use std::{ + alloc::{self, Layout}, + borrow::Borrow, + cmp, + hash::{Hash, Hasher}, + mem, + ptr::{self, NonNull}, + slice, str, +}; + +const UTF8_MASK: u32 = 1 << 31; +const REFCNT_MASK: u32 = !UTF8_MASK; + +#[repr(C)] +struct InnerHeader { + size: u32, + // MSB is checked utf8 flag, rest - refcnt + utf8_refcnt: u32, +} +impl InnerHeader { + const fn new(size: u32, is_utf8: bool) -> Self { + Self { + size, + utf8_refcnt: 1 | (if is_utf8 { UTF8_MASK } else { 0 }), + } + } + + const fn refcnt(&self) -> u32 { + self.utf8_refcnt & REFCNT_MASK + } + const fn is_utf8(&self) -> bool { + self.utf8_refcnt & UTF8_MASK != 0 + } + + fn set_refcnt(&mut self, cnt: u32) { + assert_eq!(cnt & UTF8_MASK, 0); + // Reset all bits expect last + self.utf8_refcnt &= UTF8_MASK; + // Store refcnt + self.utf8_refcnt |= cnt; + } + fn set_is_utf8(&mut self) { + self.utf8_refcnt |= UTF8_MASK; + } +} + +/// Similar to Rc<[u8]>, but stores all data (refcnt, size) inline, instead of being DST +pub struct Inner(NonNull); +impl Inner { + /// # Safety + /// `is_utf8` should only be set if data is really checked to be utf8 + /// # Panics + /// If data is larger than 4GB + // we allocate with correct alignment + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_raw(bytes: &[u8], is_utf8: bool) -> Self { + // SAFETY: + // - layout has non-zero size, and correct align + // - data is written right after allocation + // - new allocation can't overlap with passed slice + unsafe { + let data = alloc::alloc(Layout::from_size_align_unchecked( + mem::size_of::() + bytes.len(), + mem::align_of::(), + )); + assert!(!data.is_null()); + *data.cast::() = + InnerHeader::new(bytes.len().try_into().expect("bytes > 4GB"), is_utf8); + ptr::copy_nonoverlapping( + bytes.as_ptr(), + data.add(mem::size_of::()), + bytes.len(), + ); + Self(NonNull::new_unchecked(data)) + } + } + pub fn new_bytes(bytes: &[u8]) -> Self { + // SAFETY: is_utf8 is not set + unsafe { Self::new_raw(bytes, false) } + } + #[allow(dead_code)] + pub fn new_str(str: &str) -> Self { + // SAFETY: strings always utf8 + unsafe { Self::new_raw(str.as_bytes(), true) } + } + + pub fn as_slice(&self) -> &[u8] { + let header = Self::header(self); + // SAFETY: data is not null, and it is correctly initialized + let size = unsafe { (*header).size }; + // SAFETY: bytes after data is allocated to be exactly data.size in length + unsafe { + slice::from_raw_parts( + self.0.as_ptr().add(mem::size_of::()), + size as usize, + ) + } + } + + /// # Safety + /// Data should be checked to be utf8 via [`check_utf8`] first + pub unsafe fn as_str_unchecked(&self) -> &str { + // SAFETY: data is checked + unsafe { str::from_utf8_unchecked(self.as_slice()) } + } + + /// Check data to be utf-8 + /// + /// Positive results are cached + pub fn check_utf8(this: &Self) -> bool { + let header = Self::header_mut(this); + // SAFETY: header is initialized + if unsafe { (*header).is_utf8() } { + return true; + } + + if str::from_utf8(this.as_slice()).is_ok() { + // SAFETY: header is initialized + unsafe { (*header).set_is_utf8() }; + true + } else { + false + } + } + + /// Marks data as utf-8 + /// + /// # Safety + /// data should be really utf-8 + pub unsafe fn assume_utf8(this: &Self) { + let header = Self::header_mut(this); + // SAFETY: header is correct + unsafe { (*header).set_is_utf8() } + } + + const fn header(this: &Self) -> *const InnerHeader { + // in `new`, we allocate with correct alignment + #![allow(clippy::cast_ptr_alignment)] + this.0.as_ptr() as *const InnerHeader + } + const fn header_mut(this: &Self) -> *mut InnerHeader { + // in `new`, we allocate with correct alignment + #![allow(clippy::cast_ptr_alignment)] + this.0.as_ptr().cast::() + } + + fn clone(this: &Self) -> Self { + let header = Self::header_mut(this); + // SAFETY: header is initialized + unsafe { + let refcnt = (*header).refcnt() + 1; + (*header).set_refcnt(refcnt); + } + Self(this.0) + } + + pub fn ptr_eq(a: &Self, b: &Self) -> bool { + a.0 == b.0 + } + pub fn as_ptr(this: &Self) -> *const u8 { + // SAFETY: data is initialized + unsafe { this.0.as_ptr().add(mem::size_of::()) } + } + + pub const fn strong_count(this: &Self) -> u32 { + let header = Self::header(this); + // SAFETY: header is initialized + unsafe { (*header).refcnt() } + } +} + +impl Clone for Inner { + fn clone(&self) -> Self { + Self::clone(self) + } +} + +impl Drop for Inner { + fn drop(&mut self) { + #[cold] + #[inline(never)] + fn dealloc(val: &Inner) { + let header = Inner::header_mut(val); + // SAFETY: size is correct, layout is valid + unsafe { + alloc::dealloc( + val.0.as_ptr(), + Layout::from_size_align_unchecked( + mem::size_of::() + (*header).size as usize, + mem::align_of::(), + ), + ); + } + } + let header = Self::header_mut(self); + // SAFETY: header is initialized + let refcnt = unsafe { + let refcnt = (*header).refcnt() - 1; + (*header).set_refcnt(refcnt); + refcnt + }; + if refcnt == 0 { + dealloc(self); + } + } +} + +impl PartialEq for Inner { + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 || self.as_slice().eq(other.as_slice()) + } +} +impl Hash for Inner { + fn hash(&self, state: &mut H) { + self.as_slice().hash(state); + } +} +impl Eq for Inner {} +impl PartialOrd for Inner { + fn partial_cmp(&self, other: &Self) -> Option { + self.as_slice().partial_cmp(other.as_slice()) + } +} +impl Ord for Inner { + fn cmp(&self, other: &Self) -> cmp::Ordering { + self.as_slice().cmp(other.as_slice()) + } +} + +impl Borrow<[u8]> for Inner { + fn borrow(&self) -> &[u8] { + self.as_slice() + } +} --- a/crates/jrsonnet-interner/src/lib.rs +++ b/crates/jrsonnet-interner/src/lib.rs @@ -1,129 +1,243 @@ +#![deny( + unsafe_op_in_unsafe_fn, + clippy::missing_safety_doc, + clippy::undocumented_unsafe_blocks +)] +#![warn(clippy::pedantic, clippy::nursery)] use std::{ borrow::Cow, cell::RefCell, - convert::TryFrom, fmt::{self, Display}, hash::{BuildHasherDefault, Hash, Hasher}, ops::Deref, - rc::Rc, - str::Utf8Error, + str, }; use gcmodule::Trace; -use rustc_hash::FxHashMap; -use serde::{Deserialize, Serialize}; +use hashbrown::HashMap; +use rustc_hash::FxHasher; + +mod inner; +use inner::Inner; +/// Interned string +/// +/// Provides O(1) comparsions and hashing, cheap copy, and cheap conversion to [`IBytes`] #[derive(Clone, PartialOrd, Ord, Eq)] -pub struct IStr(Rc); +pub struct IStr(Inner); impl Trace for IStr { fn is_type_tracked() -> bool { false } } +impl IStr { + #[must_use] + pub fn as_str(&self) -> &str { + self as &str + } + + #[must_use] + pub fn cast_bytes(self) -> IBytes { + IBytes(self.0.clone()) + } +} + impl Deref for IStr { type Target = str; fn deref(&self) -> &Self::Target { - &self.0 + // SAFETY: Inner::check_utf8 is called on IStr construction, data is utf-8 + unsafe { self.0.as_str_unchecked() } } } impl PartialEq for IStr { fn eq(&self, other: &Self) -> bool { - // It is ok, since all IStr should be inlined into same pool - Rc::ptr_eq(&self.0, &other.0) + // all IStr should be inlined into same pool + Inner::ptr_eq(&self.0, &other.0) } } impl PartialEq for IStr { fn eq(&self, other: &str) -> bool { - &self.0 as &str == other + self as &str == other } } impl Hash for IStr { fn hash(&self, state: &mut H) { - state.write_usize(Rc::as_ptr(&self.0) as *const () as usize) + // IStr is always obtained from pool, where no string have duplicate, thus every unique string has unique address + state.write_usize(Inner::as_ptr(&self.0).cast::<()>() as usize); } } impl Drop for IStr { fn drop(&mut self) { + #[cold] + #[inline(never)] + fn unpool(inner: &Inner) { + // May fail on program termination + let res = POOL.try_with(|pool| pool.borrow_mut().remove(inner)); + if res.is_ok() { + debug_assert_eq!(Inner::strong_count(inner), 1); + } + } // First reference - current object, second - POOL - if Rc::strong_count(&self.0) <= 2 { - let _result = STR_POOL.try_with(|pool| pool.borrow_mut().remove(&self.0)); + if Inner::strong_count(&self.0) <= 2 { + unpool(&self.0); } } } impl fmt::Debug for IStr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{:?}", &self.0) + fmt::Debug::fmt(self as &str, f) } } impl Display for IStr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&self.0) + fmt::Display::fmt(self as &str, f) + } +} + +/// Interned byte array +#[derive(Clone, PartialOrd, Ord, Eq)] +pub struct IBytes(Inner); +impl Trace for IBytes { + fn is_type_tracked() -> bool { + false + } +} + +impl IBytes { + #[must_use] + pub fn cast_str(self) -> Option { + if Inner::check_utf8(&self.0) { + Some(IStr(self.0.clone())) + } else { + None + } + } + /// # Safety + /// data should be valid utf8 + unsafe fn cast_str_unchecked(self) -> IStr { + // SAFETY: data is utf8 + unsafe { Inner::assume_utf8(&self.0) }; + IStr(self.0.clone()) + } +} + +impl Deref for IBytes { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.0.as_slice() + } +} + +impl PartialEq for IBytes { + fn eq(&self, other: &Self) -> bool { + // all IStr should be inlined into same pool + Inner::ptr_eq(&self.0, &other.0) } } -thread_local! { - static STR_POOL: RefCell, ()>> = RefCell::new(FxHashMap::with_capacity_and_hasher(200, BuildHasherDefault::default())); +impl Hash for IBytes { + fn hash(&self, state: &mut H) { + // IBytes is always obtained from pool, where no string have duplicate, thus every unique string has unique address + state.write_usize(Inner::as_ptr(&self.0).cast::<()>() as usize); + } } -impl From<&str> for IStr { - fn from(str: &str) -> Self { - IStr(STR_POOL.with(|pool| { - let mut pool = pool.borrow_mut(); - if let Some((k, _)) = pool.get_key_value(str) { - k.clone() - } else { - let rc: Rc = str.into(); - pool.insert(rc.clone(), ()); - rc +impl Drop for IBytes { + fn drop(&mut self) { + #[cold] + #[inline(never)] + fn unpool(inner: &Inner) { + // May fail on program termination + let res = POOL.try_with(|pool| pool.borrow_mut().remove(inner)); + if res.is_ok() { + debug_assert_eq!(Inner::strong_count(inner), 1); } - })) + } + // First reference - current object, second - POOL + if Inner::strong_count(&self.0) <= 2 { + unpool(&self.0); + } } } -impl TryFrom<&[u8]> for IStr { - type Error = Utf8Error; +impl fmt::Debug for IBytes { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(self as &[u8], f) + } +} - fn try_from(value: &[u8]) -> Result { - let str = std::str::from_utf8(value)?; - Ok(str.into()) +impl<'c> From> for IStr { + fn from(v: Cow<'c, str>) -> Self { + intern_str(&v) + } +} +impl From<&str> for IStr { + fn from(v: &str) -> Self { + intern_str(v) } } - impl From for IStr { - fn from(str: String) -> Self { - (&str as &str).into() + fn from(s: String) -> Self { + s.as_str().into() } } - -impl<'i> From> for IStr { - fn from(c: Cow<'i, str>) -> Self { - (&c as &str).into() +impl From<&[u8]> for IBytes { + fn from(v: &[u8]) -> Self { + intern_bytes(v) } } -impl Serialize for IStr { +impl serde::Serialize for IStr { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - (&self.0 as &str).serialize(serializer) + self.as_str().serialize(serializer) } } -impl<'de> Deserialize<'de> for IStr { +impl<'de> serde::Deserialize<'de> for IStr { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { - let s = <&str>::deserialize(deserializer)?; - Ok(s.into()) + let str = <&str>::deserialize(deserializer)?; + Ok(intern_str(str)) } } + +thread_local! { + static POOL: RefCell>> = RefCell::new(HashMap::with_capacity_and_hasher(200, BuildHasherDefault::default())); +} + +#[must_use] +pub fn intern_bytes(bytes: &[u8]) -> IBytes { + POOL.with(|pool| { + let mut pool = pool.borrow_mut(); + let entry = pool.raw_entry_mut().from_key(bytes); + match entry { + hashbrown::hash_map::RawEntryMut::Occupied(mut i) => { + IBytes(i.get_key_value().0.clone()) + } + hashbrown::hash_map::RawEntryMut::Vacant(e) => { + let (k, _) = e.insert(Inner::new_bytes(bytes), ()); + IBytes(k.clone()) + } + } + }) +} + +#[must_use] +pub fn intern_str(str: &str) -> IStr { + // SAFETY: Rust strings always utf8 + unsafe { intern_bytes(str.as_bytes()).cast_str_unchecked() } +} -- gitstuff