inturn/interner/bytes.rs

use crate::{InternerSymbol, Symbol};
use boxcar::Vec as LFVec;
use bumpalo::Bump;
use dashmap::DashMap;
use hashbrown::hash_table;
use std::{collections::hash_map::RandomState, hash::BuildHasher};
use thread_local::ThreadLocal;

/// `[u8] -> Symbol` interner.
/// The hash is also stored to avoid double hashing.
///
/// This uses `NoHasher` because we want to store the `hash_builder`
/// outside of the lock, and to avoid hashing twice on insertion.
pub(crate) type Map<S> = DashMap<MapKey, S, NoHasherBuilder>;
pub(crate) type MapKey = (u64, &'static [u8]);
pub(crate) type RawMapKey<S> = (MapKey, S);

// TODO: Use a lock-free arena.
type Arena = ThreadLocal<Bump>;

/// Byte string interner.
///
/// See the [crate-level docs][crate] for more details.
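///
/// # Examples
///
/// A minimal usage sketch; it assumes the crate is named `inturn` and re-exports
/// `BytesInterner` (and a `PartialEq + Debug` `Symbol`) at its root.
///
/// ```
/// use inturn::BytesInterner;
///
/// let interner = BytesInterner::new();
/// let hello = interner.intern(b"hello");
/// // Interning the same bytes again yields the same symbol.
/// assert_eq!(hello, interner.intern(b"hello"));
/// assert_eq!(interner.resolve(hello), b"hello");
/// ```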
pub struct BytesInterner<S = Symbol, H = RandomState> {
    pub(crate) map: Map<S>,
    hash_builder: H,
    strs: LFVec<&'static [u8]>,
    arena: Arena,
}

impl Default for BytesInterner {
    #[inline]
    fn default() -> Self {
        Self::new()
    }
}

impl BytesInterner<Symbol, RandomState> {
    /// Creates a new, empty `Interner` with the default symbol and hasher.
    #[inline]
    pub fn new() -> Self {
        Self::with_capacity(0)
    }

    /// Creates a new `Interner` with the given capacity and default symbol and hasher.
    #[inline]
    pub fn with_capacity(capacity: usize) -> Self {
        Self::with_capacity_and_hasher(capacity, Default::default())
    }
}

impl<S: InternerSymbol, H: BuildHasher> BytesInterner<S, H> {
    /// Creates a new `Interner` with the given custom hasher.
    #[inline]
    pub fn with_hasher(hash_builder: H) -> Self {
        Self::with_capacity_and_hasher(0, hash_builder)
    }

    /// Creates a new `Interner` with the given capacity and custom hasher.
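    ///
    /// # Examples
    ///
    /// A sketch assuming a crate-root re-export of `BytesInterner`:
    ///
    /// ```
    /// use std::collections::hash_map::RandomState;
    ///
    /// let interner: inturn::BytesInterner =
    ///     inturn::BytesInterner::with_capacity_and_hasher(16, RandomState::new());
    /// assert!(interner.is_empty());
    /// ```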
    pub fn with_capacity_and_hasher(capacity: usize, hash_builder: H) -> Self {
        let map = Map::with_capacity_and_hasher(capacity, Default::default());
        let strs = LFVec::with_capacity(capacity);
        Self { map, strs, arena: Default::default(), hash_builder }
    }

    /// Returns the number of unique strings in the interner.
    #[inline]
    pub fn len(&self) -> usize {
        self.strs.count()
    }

    /// Returns `true` if the interner is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns an iterator over the interned strings and their corresponding `Symbol`s.
    ///
    /// The iterator is not guaranteed to include symbols interned after it was created.
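    ///
    /// # Examples
    ///
    /// A sketch assuming a crate-root re-export of `BytesInterner`:
    ///
    /// ```
    /// let interner = inturn::BytesInterner::new();
    /// interner.intern(b"a");
    /// interner.intern(b"b");
    /// for (sym, bytes) in interner.iter() {
    ///     // Every yielded symbol resolves back to the bytes it was paired with.
    ///     assert_eq!(interner.resolve(sym), bytes);
    /// }
    /// ```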
    #[inline]
    pub fn iter(&self) -> impl ExactSizeIterator<Item = (S, &[u8])> + Clone {
        self.all_symbols().map(|s| (s, self.resolve(s)))
    }

    /// Returns an iterator over all symbols in the interner.
    #[inline]
    pub fn all_symbols(&self) -> impl ExactSizeIterator<Item = S> + Send + Sync + Clone {
        (0..self.len()).map(S::from_usize)
    }

    /// Interns a string, returning its unique `Symbol`.
    ///
    /// Allocates the string internally if it is not already interned.
    ///
    /// If `s` outlives `self`, like `&'static [u8]`, prefer using
    /// [`intern_static`](Self::intern_static), as it will not allocate the string on the heap.
    pub fn intern(&self, s: &[u8]) -> S {
        self.do_intern(s, alloc)
    }

    /// Interns a string, returning its unique `Symbol`.
    ///
    /// Allocates the string internally if it is not already interned.
    ///
    /// If `s` outlives `self`, like `&'static [u8]`, prefer using
    /// [`intern_mut_static`](Self::intern_mut_static), as it will not allocate the string on the
    /// heap.
    ///
    /// By taking `&mut self`, this never acquires any locks.
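    ///
    /// # Examples
    ///
    /// A sketch assuming a crate-root re-export of `BytesInterner`:
    ///
    /// ```
    /// let mut interner = inturn::BytesInterner::new();
    /// // With exclusive access there is no contention, so no locking is needed.
    /// let sym = interner.intern_mut(b"exclusive");
    /// assert_eq!(interner.resolve(sym), b"exclusive");
    /// ```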
    pub fn intern_mut(&mut self, s: &[u8]) -> S {
        self.do_intern_mut(s, alloc)
    }

    /// Interns a static string, returning its unique `Symbol`.
    ///
    /// Note that this only requires that `s` outlives `self`, which means we can avoid allocating
    /// the string.
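    ///
    /// # Examples
    ///
    /// A sketch assuming a crate-root re-export of `BytesInterner`:
    ///
    /// ```
    /// static BYTES: &[u8] = b"static bytes";
    ///
    /// let interner = inturn::BytesInterner::new();
    /// // `BYTES` outlives `interner`, so the bytes are not copied into the arena.
    /// let sym = interner.intern_static(BYTES);
    /// assert_eq!(interner.resolve(sym), BYTES);
    /// ```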
    pub fn intern_static<'a, 'b: 'a>(&'a self, s: &'b [u8]) -> S {
        self.do_intern(s, no_alloc)
    }

    /// Interns a static string, returning its unique `Symbol`.
    ///
    /// Note that this only requires that `s` outlives `self`, which means we can avoid allocating
    /// the string.
    ///
    /// By taking `&mut self`, this never acquires any locks.
    pub fn intern_mut_static<'a, 'b: 'a>(&'a mut self, s: &'b [u8]) -> S {
        self.do_intern_mut(s, no_alloc)
    }

    /// Interns multiple strings.
    ///
    /// Allocates the strings internally if they are not already interned.
    ///
    /// If the strings outlive `self`, like `&'static [u8]`, prefer using
    /// [`intern_many_static`](Self::intern_many_static), as it will not allocate the strings on the
    /// heap.
    pub fn intern_many<'a>(&self, strings: impl IntoIterator<Item = &'a [u8]>) {
        for s in strings {
            self.intern(s);
        }
    }

    /// Interns multiple strings.
    ///
    /// Allocates the strings internally if they are not already interned.
    ///
    /// If the strings outlive `self`, like `&'static [u8]`, prefer using
    /// [`intern_many_mut_static`](Self::intern_many_mut_static), as it will not allocate the
    /// strings on the heap.
    ///
    /// By taking `&mut self`, this never acquires any locks.
    pub fn intern_many_mut<'a>(&mut self, strings: impl IntoIterator<Item = &'a [u8]>) {
        for s in strings {
            self.intern_mut(s);
        }
    }

    /// Interns multiple static strings.
    ///
    /// Note that this only requires that the strings outlive `self`, which means we can avoid
    /// allocating the strings.
    pub fn intern_many_static<'a, 'b: 'a>(&'a self, strings: impl IntoIterator<Item = &'b [u8]>) {
        for s in strings {
            self.intern_static(s);
        }
    }

    /// Interns multiple static strings.
    ///
    /// Note that this only requires that the strings outlive `self`, which means we can avoid
    /// allocating the strings.
    ///
    /// By taking `&mut self`, this never acquires any locks.
    pub fn intern_many_mut_static<'a, 'b: 'a>(
        &'a mut self,
        strings: impl IntoIterator<Item = &'b [u8]>,
    ) {
        for s in strings {
            self.intern_mut_static(s);
        }
    }

    /// Maps a `Symbol` to its string. This is a cheap, lock-free operation.
    ///
    /// # Panics
    ///
    /// Panics if `Symbol` is out of bounds of this `Interner`. You should only use `Symbol`s
    /// created by this `Interner`.
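    ///
    /// # Examples
    ///
    /// A sketch assuming a crate-root re-export of `BytesInterner`:
    ///
    /// ```
    /// let interner = inturn::BytesInterner::new();
    /// let sym = interner.intern(b"key");
    /// assert_eq!(interner.resolve(sym), b"key");
    /// ```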
    #[inline]
    #[must_use]
    #[cfg_attr(debug_assertions, track_caller)]
    pub fn resolve(&self, sym: S) -> &[u8] {
        if cfg!(debug_assertions) {
            self.strs.get(sym.to_usize()).expect("symbol out of bounds")
        } else {
            unsafe { self.strs.get_unchecked(sym.to_usize()) }
        }
    }

    #[inline]
    fn do_intern(&self, s: &[u8], alloc: impl FnOnce(&Arena, &[u8]) -> &'static [u8]) -> S {
        let hash = self.hash(s);
        let shard_idx = self.map.determine_shard(hash as usize);
        let shard = &*self.map.shards()[shard_idx];

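        // Fast path: look up the pre-hashed key while holding only the shard's read lock.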
        if let Some((_, v)) = cvt(&shard.read()).find(hash, mk_eq(s)) {
            return *v.get();
        }

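        // Slow path: take the write lock; `get_or_insert` re-checks the entry before inserting.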
        get_or_insert(&self.strs, &self.arena, s, hash, cvt_mut(&mut shard.write()), alloc)
    }

    #[inline]
    fn do_intern_mut(&mut self, s: &[u8], alloc: impl FnOnce(&Arena, &[u8]) -> &'static [u8]) -> S {
        let hash = self.hash(s);
        let shard_idx = self.map.determine_shard(hash as usize);
        let shard = &mut *self.map.shards_mut()[shard_idx];

        get_or_insert(&self.strs, &self.arena, s, hash, cvt_mut(shard.get_mut()), alloc)
    }

    #[inline]
    fn hash(&self, s: &[u8]) -> u64 {
        // We don't use `self.hash_builder.hash_one(s)` because we want to avoid hashing the length.
        use std::hash::Hasher;
        let mut h = self.hash_builder.build_hasher();
        h.write(s);
        h.finish()
    }
}

pub(crate) type NoHasherBuilder = std::hash::BuildHasherDefault<NoHasher>;

pub(crate) enum NoHasher {}
impl Default for NoHasher {
    #[inline]
    fn default() -> Self {
        unreachable!()
    }
}
impl std::hash::Hasher for NoHasher {
    #[inline]
    fn finish(&self) -> u64 {
        match *self {}
    }
    #[inline]
    fn write(&mut self, _bytes: &[u8]) {
        match *self {}
    }
}

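/// Looks up the pre-hashed bytes `s` in `shard`, allocating the bytes and recording a new symbol
/// on a miss.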
#[inline]
fn get_or_insert<S: InternerSymbol>(
    strs: &LFVec<&'static [u8]>,
    arena: &Arena,
    s: &[u8],
    hash: u64,
    shard: &mut hash_table::HashTable<RawMapKey<dashmap::SharedValue<S>>>,
    alloc: impl FnOnce(&Arena, &[u8]) -> &'static [u8],
) -> S {
    match shard.entry(hash, mk_eq(s), hasher) {
        hash_table::Entry::Occupied(e) => *e.get().1.get(),
        hash_table::Entry::Vacant(e) => {
            let s = alloc(arena, s);
            let i = strs.push(s);
            let new_sym = S::from_usize(i);
            e.insert(((hash, s), dashmap::SharedValue::new(new_sym)));
            new_sym
        }
    }
}

#[inline]
fn hasher<S>(((hash, _), _): &RawMapKey<S>) -> u64 {
    *hash
}

#[inline]
fn mk_eq<S>(s: &[u8]) -> impl Fn(&RawMapKey<S>) -> bool + Copy + '_ {
    move |((_, ss), _): &RawMapKey<S>| s == *ss
}

#[inline]
fn alloc(arena: &Arena, s: &[u8]) -> &'static [u8] {
    // SAFETY: extends the lifetime of `&Arena` to `'static`. This is never exposed so it's ok.
    unsafe {
        std::mem::transmute::<&[u8], &'static [u8]>(arena.get_or_default().alloc_slice_copy(s))
    }
}

#[inline]
fn no_alloc(_: &Arena, s: &[u8]) -> &'static [u8] {
    // SAFETY: `s` outlives `arena`, so we don't need to allocate it. See above.
    unsafe { std::mem::transmute::<&[u8], &'static [u8]>(s) }
}

// SAFETY: `HashTable` is a thin wrapper around `RawTable`. This layout is not guaranteed by
// hashbrown, but we rely on it here.
#[inline]
fn cvt<T>(old: &hashbrown::raw::RawTable<T>) -> &hash_table::HashTable<T> {
    unsafe { std::mem::transmute(old) }
}

#[inline]
fn cvt_mut<T>(old: &mut hashbrown::raw::RawTable<T>) -> &mut hash_table::HashTable<T> {
    unsafe { std::mem::transmute(old) }
}
309}