diff --git a/Cargo.lock b/Cargo.lock index fe647084ac96d..e2f6a79162b5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8385,6 +8385,7 @@ version = "0.1.0" name = "turbo-rcstr" version = "0.1.0" dependencies = [ + "new_debug_unreachable", "serde", "triomphe 0.1.12", "turbo-tasks-hash", diff --git a/turbopack/crates/turbo-rcstr/Cargo.toml b/turbopack/crates/turbo-rcstr/Cargo.toml index a217496d7f4da..443d3a6e2dd0b 100644 --- a/turbopack/crates/turbo-rcstr/Cargo.toml +++ b/turbopack/crates/turbo-rcstr/Cargo.toml @@ -4,10 +4,15 @@ version = "0.1.0" edition = "2021" license = "MIT" +[features] +atom_size_64=[] +atom_size_128=[] + [dependencies] triomphe = { workspace = true } turbo-tasks-hash = { workspace = true } serde = { workspace = true } +new_debug_unreachable = "1.0.6" [lints] workspace = true diff --git a/turbopack/crates/turbo-rcstr/src/dynamic.rs b/turbopack/crates/turbo-rcstr/src/dynamic.rs new file mode 100644 index 0000000000000..044e88573eb1d --- /dev/null +++ b/turbopack/crates/turbo-rcstr/src/dynamic.rs @@ -0,0 +1,50 @@ +use std::ptr::NonNull; + +use triomphe::Arc; + +use crate::{ + tagged_value::{TaggedValue, MAX_INLINE_LEN}, + RcStr, INLINE_TAG_INIT, LEN_OFFSET, TAG_MASK, +}; + +pub unsafe fn cast(ptr: TaggedValue) -> *const String { + ptr.get_ptr().cast() +} + +pub unsafe fn deref_from<'i>(ptr: TaggedValue) -> &'i String { + &*cast(ptr) +} + +/// Caller should call `forget` (or `clone`) on the returned `Arc` +pub unsafe fn restore_arc(v: TaggedValue) -> Arc { + let ptr = v.get_ptr() as *const String; + Arc::from_raw(ptr) +} + +/// This can create any kind of [Atom], although this lives in the `dynamic` +/// module. +pub(crate) fn new_atom + Into>(text: T) -> RcStr { + let len = text.as_ref().len(); + + if len < MAX_INLINE_LEN { + // INLINE_TAG ensures this is never zero + let tag = INLINE_TAG_INIT | ((len as u8) << LEN_OFFSET); + let mut unsafe_data = TaggedValue::new_tag(tag); + unsafe { + unsafe_data.data_mut()[..len].copy_from_slice(text.as_ref().as_bytes()); + } + return RcStr { unsafe_data }; + } + + let entry = Arc::new(text.into()); + let entry = Arc::into_raw(entry); + + let ptr: NonNull = unsafe { + // Safety: Arc::into_raw returns a non-null pointer + NonNull::new_unchecked(entry as *mut String) + }; + debug_assert!(0 == ptr.as_ptr() as u8 & TAG_MASK); + RcStr { + unsafe_data: TaggedValue::new_ptr(ptr), + } +} diff --git a/turbopack/crates/turbo-rcstr/src/lib.rs b/turbopack/crates/turbo-rcstr/src/lib.rs index e351900a337ba..5255a5aee0751 100644 --- a/turbopack/crates/turbo-rcstr/src/lib.rs +++ b/turbopack/crates/turbo-rcstr/src/lib.rs @@ -2,14 +2,23 @@ use std::{ borrow::{Borrow, Cow}, ffi::OsStr, fmt::{Debug, Display}, + hash::{Hash, Hasher}, + mem::forget, + num::NonZeroU8, ops::Deref, path::{Path, PathBuf}, }; -use serde::{Deserialize, Serialize}; +use debug_unreachable::debug_unreachable; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; use triomphe::Arc; use turbo_tasks_hash::{DeterministicHash, DeterministicHasher}; +use crate::{dynamic::new_atom, tagged_value::TaggedValue}; + +mod dynamic; +mod tagged_value; + /// An immutable reference counted [`String`], similar to [`Arc`][std::sync::Arc]. /// /// This is the preferred immutable string type for [`turbo_task::function`][macro@crate::function] @@ -44,13 +53,37 @@ use turbo_tasks_hash::{DeterministicHash, DeterministicHasher}; // If you want to change the underlying string type to `Arc`, please ensure that you profile // performance. The current implementation offers very cheap `String -> RcStr -> String`, meaning we // only pay for the allocation for `Arc` when we pass `format!("").into()` to a function. -#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] -#[serde(transparent)] -pub struct RcStr(Arc); +pub struct RcStr { + unsafe_data: TaggedValue, +} + +unsafe impl Send for RcStr {} +unsafe impl Sync for RcStr {} + +const DYNAMIC_TAG: u8 = 0b_00; +const INLINE_TAG: u8 = 0b_01; // len in upper nybble +const INLINE_TAG_INIT: NonZeroU8 = unsafe { NonZeroU8::new_unchecked(INLINE_TAG) }; +const TAG_MASK: u8 = 0b_11; +const LEN_OFFSET: usize = 4; +const LEN_MASK: u8 = 0xf0; impl RcStr { + #[inline(always)] + fn tag(&self) -> u8 { + self.unsafe_data.tag() & TAG_MASK + } + + #[inline(never)] pub fn as_str(&self) -> &str { - self.0.as_str() + match self.tag() { + DYNAMIC_TAG => unsafe { dynamic::deref_from(self.unsafe_data) }, + INLINE_TAG => { + let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET; + let src = self.unsafe_data.data(); + unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) } + } + _ => unsafe { debug_unreachable!() }, + } } /// Returns an owned mutable [`String`]. @@ -61,14 +94,39 @@ impl RcStr { /// underlying string without cloning in `O(1)` time. /// - This avoids some of the potential overhead of the `Display` trait. pub fn into_owned(self) -> String { - match Arc::try_unwrap(self.0) { - Ok(v) => v, - Err(arc) => (*arc).clone(), + match self.tag() { + DYNAMIC_TAG => { + let arc = unsafe { dynamic::restore_arc(self.unsafe_data) }; + + match Arc::try_unwrap(arc.clone()) { + Ok(v) => v, + Err(arc) => { + let s = arc.to_string(); + forget(arc); + s + } + } + } + INLINE_TAG => self.as_str().to_string(), + _ => unsafe { debug_unreachable!() }, } } pub fn map(self, f: impl FnOnce(String) -> String) -> Self { - RcStr(Arc::new(f(self.into_owned()))) + RcStr::from(Cow::Owned(f(self.into_owned()))) + } + + #[inline] + pub(crate) fn from_alias(alias: TaggedValue) -> Self { + if alias.tag() & TAG_MASK == DYNAMIC_TAG { + unsafe { + let arc = dynamic::restore_arc(alias); + forget(arc.clone()); + forget(arc); + } + } + + Self { unsafe_data: alias } } } @@ -83,70 +141,73 @@ impl Deref for RcStr { type Target = str; fn deref(&self) -> &Self::Target { - self.0.as_str() + self.as_str() } } impl Borrow for RcStr { fn borrow(&self) -> &str { - self.0.as_str() + self.as_str() } } impl From> for RcStr { fn from(s: Arc) -> Self { - RcStr(s) + match Arc::try_unwrap(s) { + Ok(v) => new_atom(Cow::Owned(v)), + Err(arc) => new_atom(Cow::Borrowed(&**arc)), + } } } impl From for RcStr { fn from(s: String) -> Self { - RcStr(Arc::new(s)) + new_atom(Cow::Owned(s)) } } impl From<&'_ str> for RcStr { fn from(s: &str) -> Self { - RcStr(Arc::new(s.to_string())) + new_atom(Cow::Borrowed(s)) } } impl From> for RcStr { fn from(s: Cow) -> Self { - RcStr(Arc::new(s.into_owned())) + new_atom(s) } } /// Mimic `&str` impl AsRef for RcStr { fn as_ref(&self) -> &Path { - (*self.0).as_ref() + self.as_str().as_ref() } } /// Mimic `&str` impl AsRef for RcStr { fn as_ref(&self) -> &OsStr { - (*self.0).as_ref() + self.as_str().as_ref() } } /// Mimic `&str` impl AsRef<[u8]> for RcStr { fn as_ref(&self) -> &[u8] { - (*self.0).as_ref() + self.as_str().as_ref() } } impl PartialEq for RcStr { fn eq(&self, other: &str) -> bool { - self.0.as_str() == other + self.as_str() == other } } impl PartialEq<&'_ str> for RcStr { fn eq(&self, other: &&str) -> bool { - self.0.as_str() == *other + self.as_str() == *other } } @@ -158,13 +219,13 @@ impl PartialEq for RcStr { impl Debug for RcStr { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - Debug::fmt(&self.0, f) + Debug::fmt(&self.as_str(), f) } } impl Display for RcStr { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - Display::fmt(&self.0, f) + Display::fmt(&self.as_str(), f) } } @@ -179,3 +240,63 @@ impl From for PathBuf { String::from(s).into() } } + +impl Clone for RcStr { + #[inline(always)] + fn clone(&self) -> Self { + Self::from_alias(self.unsafe_data) + } +} + +impl Default for RcStr { + fn default() -> Self { + RcStr::from("") + } +} + +impl PartialEq for RcStr { + fn eq(&self, other: &Self) -> bool { + self.as_str() == other.as_str() + } +} + +impl Eq for RcStr {} + +impl PartialOrd for RcStr { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for RcStr { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.as_str().cmp(other.as_str()) + } +} + +impl Hash for RcStr { + fn hash(&self, state: &mut H) { + self.as_str().hash(state); + } +} + +impl Serialize for RcStr { + fn serialize(&self, serializer: S) -> Result { + serializer.serialize_str(self.as_str()) + } +} + +impl<'de> Deserialize<'de> for RcStr { + fn deserialize>(deserializer: D) -> Result { + let s = String::deserialize(deserializer)?; + Ok(RcStr::from(s)) + } +} + +impl Drop for RcStr { + fn drop(&mut self) { + if self.tag() == DYNAMIC_TAG { + unsafe { drop(dynamic::restore_arc(self.unsafe_data)) } + } + } +} diff --git a/turbopack/crates/turbo-rcstr/src/tagged_value.rs b/turbopack/crates/turbo-rcstr/src/tagged_value.rs new file mode 100644 index 0000000000000..b69432412dcb7 --- /dev/null +++ b/turbopack/crates/turbo-rcstr/src/tagged_value.rs @@ -0,0 +1,143 @@ +#![allow(clippy::missing_transmute_annotations)] + +use std::{num::NonZeroU8, os::raw::c_void, ptr::NonNull, slice}; + +#[cfg(feature = "atom_size_128")] +type RawTaggedValue = u128; +#[cfg(any( + target_pointer_width = "32", + target_pointer_width = "16", + feature = "atom_size_64" +))] +type RawTaggedValue = u64; +#[cfg(not(any( + target_pointer_width = "32", + target_pointer_width = "16", + feature = "atom_size_64", + feature = "atom_size_128" +)))] +type RawTaggedValue = usize; + +#[cfg(feature = "atom_size_128")] +type RawTaggedNonZeroValue = std::num::NonZeroU128; +#[cfg(any( + target_pointer_width = "32", + target_pointer_width = "16", + feature = "atom_size_64" +))] +type RawTaggedNonZeroValue = std::num::NonZeroU64; +#[cfg(not(any( + target_pointer_width = "32", + target_pointer_width = "16", + feature = "atom_size_64", + feature = "atom_size_128" +)))] +type RawTaggedNonZeroValue = std::ptr::NonNull<()>; + +pub(crate) const MAX_INLINE_LEN: usize = std::mem::size_of::() - 1; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[repr(transparent)] +pub(crate) struct TaggedValue { + value: RawTaggedNonZeroValue, +} + +impl TaggedValue { + #[inline(always)] + pub fn new_ptr(value: NonNull) -> Self { + #[cfg(any( + target_pointer_width = "32", + target_pointer_width = "16", + feature = "atom_size_64", + feature = "atom_size_128" + ))] + unsafe { + let value: std::num::NonZeroUsize = std::mem::transmute(value); + Self { + value: RawTaggedNonZeroValue::new_unchecked(value.get() as _), + } + } + + #[cfg(not(any( + target_pointer_width = "32", + target_pointer_width = "16", + feature = "atom_size_64", + feature = "atom_size_128" + )))] + { + Self { + value: value.cast(), + } + } + } + + #[inline(always)] + pub fn new_tag(value: NonZeroU8) -> Self { + let value = value.get() as RawTaggedValue; + Self { + value: unsafe { std::mem::transmute(value) }, + } + } + + #[inline(always)] + pub fn get_ptr(&self) -> *const c_void { + #[cfg(any( + target_pointer_width = "32", + target_pointer_width = "16", + feature = "atom_size_64", + feature = "atom_size_128" + ))] + { + self.value.get() as usize as _ + } + #[cfg(not(any( + target_pointer_width = "32", + target_pointer_width = "16", + feature = "atom_size_64", + feature = "atom_size_128" + )))] + unsafe { + std::mem::transmute(Some(self.value)) + } + } + + #[inline(always)] + fn get_value(&self) -> RawTaggedValue { + unsafe { std::mem::transmute(Some(self.value)) } + } + + #[inline(always)] + pub fn tag(&self) -> u8 { + (self.get_value() & 0xff) as u8 + } + + pub fn data(&self) -> &[u8] { + let x: *const _ = &self.value; + let mut data = x as *const u8; + // All except the lowest byte, which is first in little-endian, last in + // big-endian. + if cfg!(target_endian = "little") { + unsafe { + data = data.offset(1); + } + } + let len = std::mem::size_of::() - 1; + unsafe { slice::from_raw_parts(data, len) } + } + + /// The `TaggedValue` is a non-zero number or pointer, so caution must be + /// used when setting the untagged slice part of this value. If tag is + /// zero and the slice is zeroed out, using this `TaggedValue` will be + /// UB! + pub unsafe fn data_mut(&mut self) -> &mut [u8] { + let x: *mut _ = &mut self.value; + let mut data = x as *mut u8; + // All except the lowest byte, which is first in little-endian, last in + // big-endian. + if cfg!(target_endian = "little") { + data = data.offset(1); + } + let len = std::mem::size_of::() - 1; + slice::from_raw_parts_mut(data, len) + } +}