git.delta.rocks / jrsonnet / refs/commits / c11576e16a8b

difftreelog

perf specialized Rc for interner

Yaroslav Bolyukin2022-05-26parent: #32f6ee5.patch.diff
in: master

3 files changed

modifiedcrates/jrsonnet-interner/Cargo.tomldiffbeforeafterboth
--- a/crates/jrsonnet-interner/Cargo.toml
+++ b/crates/jrsonnet-interner/Cargo.toml
@@ -6,7 +6,12 @@
 license = "MIT"
 edition = "2021"
 
+[features]
+default = ["serde"]
+serde = ["dep:serde"]
+
 [dependencies]
-serde = { version = "1.0" }
+serde = { version = "1.0", optional = true }
 rustc-hash = "1.1"
 gcmodule = { git = "https://github.com/CertainLach/gcmodule", branch = "jrsonnet" }
+hashbrown = { version = "0.12.1", features = ["inline-more"] }
addedcrates/jrsonnet-interner/src/inner.rsdiffbeforeafterboth
--- /dev/null
+++ b/crates/jrsonnet-interner/src/inner.rs
@@ -0,0 +1,234 @@
+use std::{
+	alloc::{self, Layout},
+	borrow::Borrow,
+	cmp,
+	hash::{Hash, Hasher},
+	mem,
+	ptr::{self, NonNull},
+	slice, str,
+};
+
+const UTF8_MASK: u32 = 1 << 31;
+const REFCNT_MASK: u32 = !UTF8_MASK;
+
+#[repr(C)]
+struct InnerHeader {
+	size: u32,
+	// MSB is checked utf8 flag, rest - refcnt
+	utf8_refcnt: u32,
+}
+impl InnerHeader {
+	const fn new(size: u32, is_utf8: bool) -> Self {
+		Self {
+			size,
+			utf8_refcnt: 1 | (if is_utf8 { UTF8_MASK } else { 0 }),
+		}
+	}
+
+	const fn refcnt(&self) -> u32 {
+		self.utf8_refcnt & REFCNT_MASK
+	}
+	const fn is_utf8(&self) -> bool {
+		self.utf8_refcnt & UTF8_MASK != 0
+	}
+
+	fn set_refcnt(&mut self, cnt: u32) {
+		assert_eq!(cnt & UTF8_MASK, 0);
+		// Reset all bits expect last
+		self.utf8_refcnt &= UTF8_MASK;
+		// Store refcnt
+		self.utf8_refcnt |= cnt;
+	}
+	fn set_is_utf8(&mut self) {
+		self.utf8_refcnt |= UTF8_MASK;
+	}
+}
+
+/// Similar to Rc<[u8]>, but stores all data (refcnt, size) inline, instead of being DST
+pub struct Inner(NonNull<u8>);
+impl Inner {
+	/// # Safety
+	/// `is_utf8` should only be set if data is really checked to be utf8
+	/// # Panics
+	/// If data is larger than 4GB
+	// we allocate with correct alignment
+	#[allow(clippy::cast_ptr_alignment)]
+	unsafe fn new_raw(bytes: &[u8], is_utf8: bool) -> Self {
+		// SAFETY:
+		// - layout has non-zero size, and correct align
+		// - data is written right after allocation
+		// - new allocation can't overlap with passed slice
+		unsafe {
+			let data = alloc::alloc(Layout::from_size_align_unchecked(
+				mem::size_of::<InnerHeader>() + bytes.len(),
+				mem::align_of::<InnerHeader>(),
+			));
+			assert!(!data.is_null());
+			*data.cast::<InnerHeader>() =
+				InnerHeader::new(bytes.len().try_into().expect("bytes > 4GB"), is_utf8);
+			ptr::copy_nonoverlapping(
+				bytes.as_ptr(),
+				data.add(mem::size_of::<InnerHeader>()),
+				bytes.len(),
+			);
+			Self(NonNull::new_unchecked(data))
+		}
+	}
+	pub fn new_bytes(bytes: &[u8]) -> Self {
+		// SAFETY: is_utf8 is not set
+		unsafe { Self::new_raw(bytes, false) }
+	}
+	#[allow(dead_code)]
+	pub fn new_str(str: &str) -> Self {
+		// SAFETY: strings always utf8
+		unsafe { Self::new_raw(str.as_bytes(), true) }
+	}
+
+	pub fn as_slice(&self) -> &[u8] {
+		let header = Self::header(self);
+		// SAFETY: data is not null, and it is correctly initialized
+		let size = unsafe { (*header).size };
+		// SAFETY: bytes after data is allocated to be exactly data.size in length
+		unsafe {
+			slice::from_raw_parts(
+				self.0.as_ptr().add(mem::size_of::<InnerHeader>()),
+				size as usize,
+			)
+		}
+	}
+
+	/// # Safety
+	/// Data should be checked to be utf8 via [`check_utf8`] first
+	pub unsafe fn as_str_unchecked(&self) -> &str {
+		// SAFETY: data is checked
+		unsafe { str::from_utf8_unchecked(self.as_slice()) }
+	}
+
+	/// Check data to be utf-8
+	///
+	/// Positive results are cached
+	pub fn check_utf8(this: &Self) -> bool {
+		let header = Self::header_mut(this);
+		// SAFETY: header is initialized
+		if unsafe { (*header).is_utf8() } {
+			return true;
+		}
+
+		if str::from_utf8(this.as_slice()).is_ok() {
+			// SAFETY: header is initialized
+			unsafe { (*header).set_is_utf8() };
+			true
+		} else {
+			false
+		}
+	}
+
+	/// Marks data as utf-8
+	///
+	/// # Safety
+	/// data should be really utf-8
+	pub unsafe fn assume_utf8(this: &Self) {
+		let header = Self::header_mut(this);
+		// SAFETY: header is correct
+		unsafe { (*header).set_is_utf8() }
+	}
+
+	const fn header(this: &Self) -> *const InnerHeader {
+		// in `new`, we allocate with correct alignment
+		#![allow(clippy::cast_ptr_alignment)]
+		this.0.as_ptr() as *const InnerHeader
+	}
+	const fn header_mut(this: &Self) -> *mut InnerHeader {
+		// in `new`, we allocate with correct alignment
+		#![allow(clippy::cast_ptr_alignment)]
+		this.0.as_ptr().cast::<InnerHeader>()
+	}
+
+	fn clone(this: &Self) -> Self {
+		let header = Self::header_mut(this);
+		// SAFETY: header is initialized
+		unsafe {
+			let refcnt = (*header).refcnt() + 1;
+			(*header).set_refcnt(refcnt);
+		}
+		Self(this.0)
+	}
+
+	pub fn ptr_eq(a: &Self, b: &Self) -> bool {
+		a.0 == b.0
+	}
+	pub fn as_ptr(this: &Self) -> *const u8 {
+		// SAFETY: data is initialized
+		unsafe { this.0.as_ptr().add(mem::size_of::<InnerHeader>()) }
+	}
+
+	pub const fn strong_count(this: &Self) -> u32 {
+		let header = Self::header(this);
+		// SAFETY: header is initialized
+		unsafe { (*header).refcnt() }
+	}
+}
+
+impl Clone for Inner {
+	fn clone(&self) -> Self {
+		Self::clone(self)
+	}
+}
+
+impl Drop for Inner {
+	fn drop(&mut self) {
+		#[cold]
+		#[inline(never)]
+		fn dealloc(val: &Inner) {
+			let header = Inner::header_mut(val);
+			// SAFETY: size is correct, layout is valid
+			unsafe {
+				alloc::dealloc(
+					val.0.as_ptr(),
+					Layout::from_size_align_unchecked(
+						mem::size_of::<InnerHeader>() + (*header).size as usize,
+						mem::align_of::<InnerHeader>(),
+					),
+				);
+			}
+		}
+		let header = Self::header_mut(self);
+		// SAFETY: header is initialized
+		let refcnt = unsafe {
+			let refcnt = (*header).refcnt() - 1;
+			(*header).set_refcnt(refcnt);
+			refcnt
+		};
+		if refcnt == 0 {
+			dealloc(self);
+		}
+	}
+}
+
+impl PartialEq for Inner {
+	fn eq(&self, other: &Self) -> bool {
+		self.0 == other.0 || self.as_slice().eq(other.as_slice())
+	}
+}
+impl Hash for Inner {
+	fn hash<H: Hasher>(&self, state: &mut H) {
+		self.as_slice().hash(state);
+	}
+}
+impl Eq for Inner {}
+impl PartialOrd for Inner {
+	fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
+		self.as_slice().partial_cmp(other.as_slice())
+	}
+}
+impl Ord for Inner {
+	fn cmp(&self, other: &Self) -> cmp::Ordering {
+		self.as_slice().cmp(other.as_slice())
+	}
+}
+
+impl Borrow<[u8]> for Inner {
+	fn borrow(&self) -> &[u8] {
+		self.as_slice()
+	}
+}
modifiedcrates/jrsonnet-interner/src/lib.rsdiffbeforeafterboth
before · crates/jrsonnet-interner/src/lib.rs
1use std::{2	borrow::Cow,3	cell::RefCell,4	convert::TryFrom,5	fmt::{self, Display},6	hash::{BuildHasherDefault, Hash, Hasher},7	ops::Deref,8	rc::Rc,9	str::Utf8Error,10};1112use gcmodule::Trace;13use rustc_hash::FxHashMap;14use serde::{Deserialize, Serialize};1516#[derive(Clone, PartialOrd, Ord, Eq)]17pub struct IStr(Rc<str>);18impl Trace for IStr {19	fn is_type_tracked() -> bool {20		false21	}22}2324impl Deref for IStr {25	type Target = str;2627	fn deref(&self) -> &Self::Target {28		&self.029	}30}3132impl PartialEq for IStr {33	fn eq(&self, other: &Self) -> bool {34		// It is ok, since all IStr should be inlined into same pool35		Rc::ptr_eq(&self.0, &other.0)36	}37}3839impl PartialEq<str> for IStr {40	fn eq(&self, other: &str) -> bool {41		&self.0 as &str == other42	}43}4445impl Hash for IStr {46	fn hash<H: Hasher>(&self, state: &mut H) {47		state.write_usize(Rc::as_ptr(&self.0) as *const () as usize)48	}49}5051impl Drop for IStr {52	fn drop(&mut self) {53		// First reference - current object, second - POOL54		if Rc::strong_count(&self.0) <= 2 {55			let _result = STR_POOL.try_with(|pool| pool.borrow_mut().remove(&self.0));56		}57	}58}5960impl fmt::Debug for IStr {61	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {62		write!(f, "{:?}", &self.0)63	}64}6566impl Display for IStr {67	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {68		f.write_str(&self.0)69	}70}7172thread_local! {73	static STR_POOL: RefCell<FxHashMap<Rc<str>, ()>> = RefCell::new(FxHashMap::with_capacity_and_hasher(200, BuildHasherDefault::default()));74}7576impl From<&str> for IStr {77	fn from(str: &str) -> Self {78		IStr(STR_POOL.with(|pool| {79			let mut pool = pool.borrow_mut();80			if let Some((k, _)) = pool.get_key_value(str) {81				k.clone()82			} else {83				let rc: Rc<str> = str.into();84				pool.insert(rc.clone(), ());85				rc86			}87		}))88	}89}9091impl TryFrom<&[u8]> for IStr {92	type Error = Utf8Error;9394	fn try_from(value: &[u8]) -> Result<Self, Self::Error> {95		let str = std::str::from_utf8(value)?;96		Ok(str.into())97	}98}99100impl From<String> for IStr {101	fn from(str: String) -> Self {102		(&str as &str).into()103	}104}105106impl<'i> From<Cow<'i, str>> for IStr {107	fn from(c: Cow<'i, str>) -> Self {108		(&c as &str).into()109	}110}111112impl Serialize for IStr {113	fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>114	where115		S: serde::Serializer,116	{117		(&self.0 as &str).serialize(serializer)118	}119}120121impl<'de> Deserialize<'de> for IStr {122	fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>123	where124		D: serde::Deserializer<'de>,125	{126		let s = <&str>::deserialize(deserializer)?;127		Ok(s.into())128	}129}
after · crates/jrsonnet-interner/src/lib.rs
1#![deny(2	unsafe_op_in_unsafe_fn,3	clippy::missing_safety_doc,4	clippy::undocumented_unsafe_blocks5)]6#![warn(clippy::pedantic, clippy::nursery)]7use std::{8	borrow::Cow,9	cell::RefCell,10	fmt::{self, Display},11	hash::{BuildHasherDefault, Hash, Hasher},12	ops::Deref,13	str,14};1516use gcmodule::Trace;17use hashbrown::HashMap;18use rustc_hash::FxHasher;1920mod inner;21use inner::Inner;2223/// Interned string24///25/// Provides O(1) comparsions and hashing, cheap copy, and cheap conversion to [`IBytes`]26#[derive(Clone, PartialOrd, Ord, Eq)]27pub struct IStr(Inner);28impl Trace for IStr {29	fn is_type_tracked() -> bool {30		false31	}32}3334impl IStr {35	#[must_use]36	pub fn as_str(&self) -> &str {37		self as &str38	}3940	#[must_use]41	pub fn cast_bytes(self) -> IBytes {42		IBytes(self.0.clone())43	}44}4546impl Deref for IStr {47	type Target = str;4849	fn deref(&self) -> &Self::Target {50		// SAFETY: Inner::check_utf8 is called on IStr construction, data is utf-851		unsafe { self.0.as_str_unchecked() }52	}53}5455impl PartialEq for IStr {56	fn eq(&self, other: &Self) -> bool {57		// all IStr should be inlined into same pool58		Inner::ptr_eq(&self.0, &other.0)59	}60}6162impl PartialEq<str> for IStr {63	fn eq(&self, other: &str) -> bool {64		self as &str == other65	}66}6768impl Hash for IStr {69	fn hash<H: Hasher>(&self, state: &mut H) {70		// IStr is always obtained from pool, where no string have duplicate, thus every unique string has unique address71		state.write_usize(Inner::as_ptr(&self.0).cast::<()>() as usize);72	}73}7475impl Drop for IStr {76	fn drop(&mut self) {77		#[cold]78		#[inline(never)]79		fn unpool(inner: &Inner) {80			// May fail on program termination81			let res = POOL.try_with(|pool| pool.borrow_mut().remove(inner));82			if res.is_ok() {83				debug_assert_eq!(Inner::strong_count(inner), 1);84			}85		}86		// First reference - current object, second - POOL87		if Inner::strong_count(&self.0) <= 2 {88			unpool(&self.0);89		}90	}91}9293impl fmt::Debug for IStr {94	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {95		fmt::Debug::fmt(self as &str, f)96	}97}9899impl Display for IStr {100	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {101		fmt::Display::fmt(self as &str, f)102	}103}104105/// Interned byte array106#[derive(Clone, PartialOrd, Ord, Eq)]107pub struct IBytes(Inner);108impl Trace for IBytes {109	fn is_type_tracked() -> bool {110		false111	}112}113114impl IBytes {115	#[must_use]116	pub fn cast_str(self) -> Option<IStr> {117		if Inner::check_utf8(&self.0) {118			Some(IStr(self.0.clone()))119		} else {120			None121		}122	}123	/// # Safety124	/// data should be valid utf8125	unsafe fn cast_str_unchecked(self) -> IStr {126		// SAFETY: data is utf8127		unsafe { Inner::assume_utf8(&self.0) };128		IStr(self.0.clone())129	}130}131132impl Deref for IBytes {133	type Target = [u8];134135	fn deref(&self) -> &Self::Target {136		self.0.as_slice()137	}138}139140impl PartialEq for IBytes {141	fn eq(&self, other: &Self) -> bool {142		// all IStr should be inlined into same pool143		Inner::ptr_eq(&self.0, &other.0)144	}145}146147impl Hash for IBytes {148	fn hash<H: Hasher>(&self, state: &mut H) {149		// IBytes is always obtained from pool, where no string have duplicate, thus every unique string has unique address150		state.write_usize(Inner::as_ptr(&self.0).cast::<()>() as usize);151	}152}153154impl Drop for IBytes {155	fn drop(&mut self) {156		#[cold]157		#[inline(never)]158		fn unpool(inner: &Inner) {159			// May fail on program termination160			let res = POOL.try_with(|pool| pool.borrow_mut().remove(inner));161			if res.is_ok() {162				debug_assert_eq!(Inner::strong_count(inner), 1);163			}164		}165		// First reference - current object, second - POOL166		if Inner::strong_count(&self.0) <= 2 {167			unpool(&self.0);168		}169	}170}171172impl fmt::Debug for IBytes {173	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {174		fmt::Debug::fmt(self as &[u8], f)175	}176}177178impl<'c> From<Cow<'c, str>> for IStr {179	fn from(v: Cow<'c, str>) -> Self {180		intern_str(&v)181	}182}183impl From<&str> for IStr {184	fn from(v: &str) -> Self {185		intern_str(v)186	}187}188impl From<String> for IStr {189	fn from(s: String) -> Self {190		s.as_str().into()191	}192}193impl From<&[u8]> for IBytes {194	fn from(v: &[u8]) -> Self {195		intern_bytes(v)196	}197}198199impl serde::Serialize for IStr {200	fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>201	where202		S: serde::Serializer,203	{204		self.as_str().serialize(serializer)205	}206}207208impl<'de> serde::Deserialize<'de> for IStr {209	fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>210	where211		D: serde::Deserializer<'de>,212	{213		let str = <&str>::deserialize(deserializer)?;214		Ok(intern_str(str))215	}216}217218thread_local! {219	static POOL: RefCell<HashMap<Inner, (), BuildHasherDefault<FxHasher>>> = RefCell::new(HashMap::with_capacity_and_hasher(200, BuildHasherDefault::default()));220}221222#[must_use]223pub fn intern_bytes(bytes: &[u8]) -> IBytes {224	POOL.with(|pool| {225		let mut pool = pool.borrow_mut();226		let entry = pool.raw_entry_mut().from_key(bytes);227		match entry {228			hashbrown::hash_map::RawEntryMut::Occupied(mut i) => {229				IBytes(i.get_key_value().0.clone())230			}231			hashbrown::hash_map::RawEntryMut::Vacant(e) => {232				let (k, _) = e.insert(Inner::new_bytes(bytes), ());233				IBytes(k.clone())234			}235		}236	})237}238239#[must_use]240pub fn intern_str(str: &str) -> IStr {241	// SAFETY: Rust strings always utf8242	unsafe { intern_bytes(str.as_bytes()).cast_str_unchecked() }243}