tinystr/
int_ops.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::asciibyte::AsciiByte;
6
7/// Internal helper struct that performs operations on aligned integers.
8/// Supports strings up to 4 bytes long.
9#[repr(transparent)]
10pub struct Aligned4(u32);
11
12impl Aligned4 {
13    /// # Panics
14    /// Panics if N is greater than 4
15    #[inline]
16    pub const fn from_bytes<const N: usize>(src: &[u8; N]) -> Self {
17        let mut bytes = [0; 4];
18        let mut i = 0;
19        // The function documentation defines when panics may occur
20        #[allow(clippy::indexing_slicing)]
21        while i < N {
22            bytes[i] = src[i];
23            i += 1;
24        }
25        Self(u32::from_ne_bytes(bytes))
26    }
27
28    #[inline]
29    pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
30        Self::from_bytes::<N>(unsafe { core::mem::transmute::<&[AsciiByte; N], &[u8; N]>(src) })
31    }
32
33    #[inline]
34    pub const fn to_bytes(&self) -> [u8; 4] {
35        self.0.to_ne_bytes()
36    }
37
38    #[inline]
39    pub const fn to_ascii_bytes(&self) -> [AsciiByte; 4] {
40        unsafe { core::mem::transmute(self.to_bytes()) }
41    }
42
43    pub const fn len(&self) -> usize {
44        let word = self.0;
45        #[cfg(target_endian = "little")]
46        let len = (4 - word.leading_zeros() / 8) as usize;
47        #[cfg(target_endian = "big")]
48        let len = (4 - word.trailing_zeros() / 8) as usize;
49        len
50    }
51
52    pub const fn is_ascii_alphabetic(&self) -> bool {
53        let word = self.0;
54        // Each of the following bitmasks set *the high bit* (0x8) to 0 for valid and 1 for invalid.
55        // `mask` sets all NUL bytes to 0.
56        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
57        // `lower` converts the string to lowercase. It may also change the value of non-alpha
58        // characters, but this does not matter for the alphabetic test that follows.
59        let lower = word | 0x2020_2020;
60        // `alpha` sets all alphabetic bytes to 0. We only need check for lowercase characters.
61        let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
62        // The overall string is valid if every character passes at least one test.
63        // We performed two tests here: non-NUL (`mask`) and alphabetic (`alpha`).
64        (alpha & mask) == 0
65    }
66
67    pub const fn is_ascii_alphanumeric(&self) -> bool {
68        let word = self.0;
69        // See explanatory comments in is_ascii_alphabetic
70        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
71        let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
72        let lower = word | 0x2020_2020;
73        let alpha = !(lower + 0x1f1f_1f1f) | (lower + 0x0505_0505);
74        (alpha & numeric & mask) == 0
75    }
76
77    pub const fn is_ascii_numeric(&self) -> bool {
78        let word = self.0;
79        // See explanatory comments in is_ascii_alphabetic
80        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
81        let numeric = !(word + 0x5050_5050) | (word + 0x4646_4646);
82        (numeric & mask) == 0
83    }
84
85    pub const fn is_ascii_lowercase(&self) -> bool {
86        let word = self.0;
87        // For efficiency, this function tests for an invalid string rather than a valid string.
88        // A string is ASCII lowercase iff it contains no uppercase ASCII characters.
89        // `invalid_case` sets all uppercase ASCII characters to 0 and all others to 1.
90        let invalid_case = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
91        // The string is valid if it contains no invalid characters (if all high bits are 1).
92        (invalid_case & 0x8080_8080) == 0x8080_8080
93    }
94
95    pub const fn is_ascii_titlecase(&self) -> bool {
96        let word = self.0;
97        // See explanatory comments in is_ascii_lowercase
98        let invalid_case = if cfg!(target_endian = "little") {
99            !(word + 0x3f3f_3f1f) | (word + 0x2525_2505)
100        } else {
101            !(word + 0x1f3f_3f3f) | (word + 0x0525_2525)
102        };
103        (invalid_case & 0x8080_8080) == 0x8080_8080
104    }
105
106    pub const fn is_ascii_uppercase(&self) -> bool {
107        let word = self.0;
108        // See explanatory comments in is_ascii_lowercase
109        let invalid_case = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
110        (invalid_case & 0x8080_8080) == 0x8080_8080
111    }
112
113    pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
114        let word = self.0;
115        // `mask` sets all NUL bytes to 0.
116        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
117        // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
118        let lower_alpha = !(word + 0x1f1f_1f1f) | (word + 0x0505_0505);
119        // The overall string is valid if every character passes at least one test.
120        // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
121        (lower_alpha & mask) == 0
122    }
123
124    pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
125        let word = self.0;
126        // See explanatory comments in is_ascii_alphabetic_lowercase
127        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
128        let title_case = if cfg!(target_endian = "little") {
129            !(word + 0x1f1f_1f3f) | (word + 0x0505_0525)
130        } else {
131            !(word + 0x3f1f_1f1f) | (word + 0x2505_0505)
132        };
133        (title_case & mask) == 0
134    }
135
136    pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
137        let word = self.0;
138        // See explanatory comments in is_ascii_alphabetic_lowercase
139        let mask = (word + 0x7f7f_7f7f) & 0x8080_8080;
140        let upper_alpha = !(word + 0x3f3f_3f3f) | (word + 0x2525_2525);
141        (upper_alpha & mask) == 0
142    }
143
144    pub const fn to_ascii_lowercase(&self) -> Self {
145        let word = self.0;
146        let result = word | (((word + 0x3f3f_3f3f) & !(word + 0x2525_2525) & 0x8080_8080) >> 2);
147        Self(result)
148    }
149
150    pub const fn to_ascii_titlecase(&self) -> Self {
151        let word = self.0.to_le();
152        let mask = ((word + 0x3f3f_3f1f) & !(word + 0x2525_2505) & 0x8080_8080) >> 2;
153        let result = (word | mask) & !(0x20 & mask);
154        Self(u32::from_le(result))
155    }
156
157    pub const fn to_ascii_uppercase(&self) -> Self {
158        let word = self.0;
159        let result = word & !(((word + 0x1f1f_1f1f) & !(word + 0x0505_0505) & 0x8080_8080) >> 2);
160        Self(result)
161    }
162}
163
164/// Internal helper struct that performs operations on aligned integers.
165/// Supports strings up to 8 bytes long.
166#[repr(transparent)]
167pub struct Aligned8(u64);
168
169impl Aligned8 {
170    /// # Panics
171    /// Panics if N is greater than 8
172    #[inline]
173    pub const fn from_bytes<const N: usize>(src: &[u8; N]) -> Self {
174        let mut bytes = [0; 8];
175        let mut i = 0;
176        // The function documentation defines when panics may occur
177        #[allow(clippy::indexing_slicing)]
178        while i < N {
179            bytes[i] = src[i];
180            i += 1;
181        }
182        Self(u64::from_ne_bytes(bytes))
183    }
184
185    #[inline]
186    pub const fn from_ascii_bytes<const N: usize>(src: &[AsciiByte; N]) -> Self {
187        Self::from_bytes::<N>(unsafe { core::mem::transmute::<&[AsciiByte; N], &[u8; N]>(src) })
188    }
189
190    #[inline]
191    pub const fn to_bytes(&self) -> [u8; 8] {
192        self.0.to_ne_bytes()
193    }
194
195    #[inline]
196    pub const fn to_ascii_bytes(&self) -> [AsciiByte; 8] {
197        unsafe { core::mem::transmute(self.to_bytes()) }
198    }
199
200    pub const fn len(&self) -> usize {
201        let word = self.0;
202        #[cfg(target_endian = "little")]
203        let len = (8 - word.leading_zeros() / 8) as usize;
204        #[cfg(target_endian = "big")]
205        let len = (8 - word.trailing_zeros() / 8) as usize;
206        len
207    }
208
209    pub const fn is_ascii_alphabetic(&self) -> bool {
210        let word = self.0;
211        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
212        let lower = word | 0x2020_2020_2020_2020;
213        let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
214        (alpha & mask) == 0
215    }
216
217    pub const fn is_ascii_alphanumeric(&self) -> bool {
218        let word = self.0;
219        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
220        let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
221        let lower = word | 0x2020_2020_2020_2020;
222        let alpha = !(lower + 0x1f1f_1f1f_1f1f_1f1f) | (lower + 0x0505_0505_0505_0505);
223        (alpha & numeric & mask) == 0
224    }
225
226    pub const fn is_ascii_numeric(&self) -> bool {
227        let word = self.0;
228        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
229        let numeric = !(word + 0x5050_5050_5050_5050) | (word + 0x4646_4646_4646_4646);
230        (numeric & mask) == 0
231    }
232
233    pub const fn is_ascii_lowercase(&self) -> bool {
234        let word = self.0;
235        let invalid_case = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
236        (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
237    }
238
239    pub const fn is_ascii_titlecase(&self) -> bool {
240        let word = self.0;
241        let invalid_case = if cfg!(target_endian = "little") {
242            !(word + 0x3f3f_3f3f_3f3f_3f1f) | (word + 0x2525_2525_2525_2505)
243        } else {
244            !(word + 0x1f3f_3f3f_3f3f_3f3f) | (word + 0x0525_2525_2525_2525)
245        };
246        (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
247    }
248
249    pub const fn is_ascii_uppercase(&self) -> bool {
250        let word = self.0;
251        let invalid_case = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
252        (invalid_case & 0x8080_8080_8080_8080) == 0x8080_8080_8080_8080
253    }
254
255    pub const fn is_ascii_alphabetic_lowercase(&self) -> bool {
256        let word = self.0;
257        // `mask` sets all NUL bytes to 0.
258        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
259        // `lower_alpha` sets all lowercase ASCII characters to 0 and all others to 1.
260        let lower_alpha = !(word + 0x1f1f_1f1f_1f1f_1f1f) | (word + 0x0505_0505_0505_0505);
261        // The overall string is valid if every character passes at least one test.
262        // We performed two tests here: non-NUL (`mask`) and lowercase ASCII character (`alpha`).
263        (lower_alpha & mask) == 0
264    }
265
266    pub const fn is_ascii_alphabetic_titlecase(&self) -> bool {
267        let word = self.0;
268        // See explanatory comments in is_ascii_alphabetic_lowercase
269        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
270        let title_case = if cfg!(target_endian = "little") {
271            !(word + 0x1f1f_1f1f_1f1f_1f3f) | (word + 0x0505_0505_0505_0525)
272        } else {
273            !(word + 0x3f1f_1f1f_1f1f_1f1f) | (word + 0x2505_0505_0505_0505)
274        };
275        (title_case & mask) == 0
276    }
277
278    pub const fn is_ascii_alphabetic_uppercase(&self) -> bool {
279        let word = self.0;
280        // See explanatory comments in is_ascii_alphabetic_lowercase
281        let mask = (word + 0x7f7f_7f7f_7f7f_7f7f) & 0x8080_8080_8080_8080;
282        let upper_alpha = !(word + 0x3f3f_3f3f_3f3f_3f3f) | (word + 0x2525_2525_2525_2525);
283        (upper_alpha & mask) == 0
284    }
285
286    pub const fn to_ascii_lowercase(&self) -> Self {
287        let word = self.0;
288        let result = word
289            | (((word + 0x3f3f_3f3f_3f3f_3f3f)
290                & !(word + 0x2525_2525_2525_2525)
291                & 0x8080_8080_8080_8080)
292                >> 2);
293        Self(result)
294    }
295
296    pub const fn to_ascii_titlecase(&self) -> Self {
297        let word = self.0.to_le();
298        let mask = ((word + 0x3f3f_3f3f_3f3f_3f1f)
299            & !(word + 0x2525_2525_2525_2505)
300            & 0x8080_8080_8080_8080)
301            >> 2;
302        let result = (word | mask) & !(0x20 & mask);
303        Self(u64::from_le(result))
304    }
305
306    pub const fn to_ascii_uppercase(&self) -> Self {
307        let word = self.0;
308        let result = word
309            & !(((word + 0x1f1f_1f1f_1f1f_1f1f)
310                & !(word + 0x0505_0505_0505_0505)
311                & 0x8080_8080_8080_8080)
312                >> 2);
313        Self(result)
314    }
315}
tinystr/int_ops.rs

tinystr/
int_ops.rs