1#![no_std]
18
19mod indices;
37mod report;
38
39pub use crate::indices::Utf8CharIndices;
40pub use crate::report::ErrorReportingUtf8Chars;
41pub use crate::report::Utf8CharsError;
42use core::iter::FusedIterator;
43
/// Wrapper that gives the UTF-8 validation table a fixed 64-byte alignment.
///
/// NOTE(review): 64 bytes presumably matches the cache-line size of the
/// targeted CPUs, so that lookups touch as few lines as possible — confirm.
#[repr(align(64))]
struct Utf8Data {
    /// 384 entries: 256 addressed by a sequence's second byte followed by
    /// 128 addressed by a non-ASCII lead byte (`lead + 0x80`).
    pub table: [u8; 384],
}
48
/// Validation table consulted by the fast path of `Utf8Chars::next` for
/// three- and four-byte sequences.
///
/// Layout, as indexed by that fast path:
///
/// * entries `0..=255` are looked up by the *second* byte of a sequence;
/// * entries `256..=383` are looked up by `lead + 0x80`, where `lead` is a
///   non-ASCII first byte (`0x80..=0xFF`).
///
/// A lead byte and a second byte form an acceptable pair exactly when the
/// bitwise AND of their two entries is zero. Every entry has its two lowest
/// bits clear, which lets the caller OR the top bits of the following
/// continuation byte(s) into the same value and validate the whole sequence
/// with a single comparison.
//
// NOTE(review): the values look machine-generated; presumably they should be
// regenerated rather than edited by hand — confirm the generator before
// changing any entry.
static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    ],
};
76
/// Returns `true` when `start <= i <= end`, using a single comparison.
///
/// The value is rebased so that `start` maps to zero; anything below `start`
/// wraps around to a large number and therefore fails the `<=` test against
/// the width of the range. Callers are expected to pass `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    width >= offset
}
83
/// Iterator yielding `char`s decoded from a byte slice that may contain
/// ill-formed UTF-8; ill-formed input is reported as U+FFFD instead of
/// stopping the iteration.
#[derive(Clone, Debug)]
pub struct Utf8Chars<'a> {
    /// The bytes that have not been consumed from either end yet.
    remaining: &'a [u8],
}
90
91impl<'a> Utf8Chars<'a> {
92    #[inline(always)]
93    pub fn new(bytes: &'a [u8]) -> Self {
95        Utf8Chars::<'a> { remaining: bytes }
96    }
97
98    #[inline(always)]
101    pub fn as_slice(&self) -> &'a [u8] {
102        self.remaining
103    }
104
105    #[inline(never)]
106    fn next_fallback(&mut self) -> Option<char> {
107        if self.remaining.is_empty() {
108            return None;
109        }
110        let first = self.remaining[0];
111        if first < 0x80 {
112            self.remaining = &self.remaining[1..];
113            return Some(char::from(first));
114        }
115        if !in_inclusive_range8(first, 0xC2, 0xF4) || self.remaining.len() == 1 {
116            self.remaining = &self.remaining[1..];
117            return Some('\u{FFFD}');
118        }
119        let second = self.remaining[1];
120        let (lower_bound, upper_bound) = match first {
121            0xE0 => (0xA0, 0xBF),
122            0xED => (0x80, 0x9F),
123            0xF0 => (0x90, 0xBF),
124            0xF4 => (0x80, 0x8F),
125            _ => (0x80, 0xBF),
126        };
127        if !in_inclusive_range8(second, lower_bound, upper_bound) {
128            self.remaining = &self.remaining[1..];
129            return Some('\u{FFFD}');
130        }
131        if first < 0xE0 {
132            self.remaining = &self.remaining[2..];
133            let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
134            return Some(unsafe { char::from_u32_unchecked(point) });
135        }
136        if self.remaining.len() == 2 {
137            self.remaining = &self.remaining[2..];
138            return Some('\u{FFFD}');
139        }
140        let third = self.remaining[2];
141        if !in_inclusive_range8(third, 0x80, 0xBF) {
142            self.remaining = &self.remaining[2..];
143            return Some('\u{FFFD}');
144        }
145        if first < 0xF0 {
146            self.remaining = &self.remaining[3..];
147            let point = ((u32::from(first) & 0xF) << 12)
148                | ((u32::from(second) & 0x3F) << 6)
149                | (u32::from(third) & 0x3F);
150            return Some(unsafe { char::from_u32_unchecked(point) });
151        }
152        self.remaining = &self.remaining[3..];
156        Some('\u{FFFD}')
157    }
158}
159
160impl<'a> Iterator for Utf8Chars<'a> {
161    type Item = char;
162
163    #[inline]
164    fn next(&mut self) -> Option<char> {
165        #[allow(clippy::never_loop)]
173        loop {
174            if self.remaining.len() < 4 {
175                break;
176            }
177            let first = self.remaining[0];
178            if first < 0x80 {
179                self.remaining = &self.remaining[1..];
180                return Some(char::from(first));
181            }
182            let second = self.remaining[1];
183            if in_inclusive_range8(first, 0xC2, 0xDF) {
184                if !in_inclusive_range8(second, 0x80, 0xBF) {
185                    break;
186                }
187                let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
188                self.remaining = &self.remaining[2..];
189                return Some(unsafe { char::from_u32_unchecked(point) });
190            }
191            let third = self.remaining[2];
194            if first < 0xF0 {
195                if ((UTF8_DATA.table[usize::from(second)]
196                    & UTF8_DATA.table[usize::from(first) + 0x80])
197                    | (third >> 6))
198                    != 2
199                {
200                    break;
201                }
202                let point = ((u32::from(first) & 0xF) << 12)
203                    | ((u32::from(second) & 0x3F) << 6)
204                    | (u32::from(third) & 0x3F);
205                self.remaining = &self.remaining[3..];
206                return Some(unsafe { char::from_u32_unchecked(point) });
207            }
208            let fourth = self.remaining[3];
209            if (u16::from(
210                UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],
211            ) | u16::from(third >> 6)
212                | (u16::from(fourth & 0xC0) << 2))
213                != 0x202
214            {
215                break;
216            }
217            let point = ((u32::from(first) & 0x7) << 18)
218                | ((u32::from(second) & 0x3F) << 12)
219                | ((u32::from(third) & 0x3F) << 6)
220                | (u32::from(fourth) & 0x3F);
221            self.remaining = &self.remaining[4..];
222            return Some(unsafe { char::from_u32_unchecked(point) });
223        }
224        self.next_fallback()
225    }
226}
227
228impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
229    #[inline]
230    fn next_back(&mut self) -> Option<char> {
231        if self.remaining.is_empty() {
232            return None;
233        }
234        let mut attempt = 1;
235        for b in self.remaining.iter().rev() {
236            if b & 0xC0 != 0x80 {
237                let (head, tail) = self.remaining.split_at(self.remaining.len() - attempt);
238                let mut inner = Utf8Chars::new(tail);
239                let candidate = inner.next();
240                if inner.as_slice().is_empty() {
241                    self.remaining = head;
242                    return candidate;
243                }
244                break;
245            }
246            if attempt == 4 {
247                break;
248            }
249            attempt += 1;
250        }
251
252        self.remaining = &self.remaining[..self.remaining.len() - 1];
253        Some('\u{FFFD}')
254    }
255}
256
257impl FusedIterator for Utf8Chars<'_> {}
258
/// Extension trait that adds `chars()` and `char_indices()` iterator
/// constructors to types holding potentially ill-formed UTF-8 bytes.
pub trait Utf8CharsEx {
    /// Returns an iterator over the `char`s of `self`, borrowing from it.
    fn chars(&self) -> Utf8Chars<'_>;
    /// Returns a [`Utf8CharIndices`] iterator borrowing from `self`.
    ///
    /// NOTE(review): presumably yields each `char` paired with its byte
    /// offset, like `str::char_indices` — confirm against the `indices`
    /// module.
    fn char_indices(&self) -> Utf8CharIndices<'_>;
}
265
/// Makes `chars()` and `char_indices()` available directly on byte slices.
impl Utf8CharsEx for [u8] {
    /// Wraps the slice in a [`Utf8Chars`] iterator starting at its first byte.
    #[inline]
    fn chars(&self) -> Utf8Chars<'_> {
        Utf8Chars::new(self)
    }
    /// Wraps the slice in a [`Utf8CharIndices`] iterator via its `new`
    /// constructor.
    #[inline]
    fn char_indices(&self) -> Utf8CharIndices<'_> {
        Utf8CharIndices::new(self)
    }
}
280
281