futf/
lib.rs

1// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
4// option. This file may not be copied, modified, or distributed
5// except according to those terms.
6
7#![cfg_attr(test, feature(test))]
8
9#[macro_use]
10extern crate debug_unreachable;
11
12#[macro_use]
13extern crate mac;
14
15#[cfg(test)]
16extern crate test as std_test;
17
18use std::{slice, char};
19
/// Interpretation of a byte sequence that is either a full UTF-8 codepoint
/// or only a fragment of one.
///
/// Validation is lazy in places: a fragment reported as `Prefix` or `Suffix`
/// may turn out to have no valid completion at all.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Meaning {
    /// A complete, valid codepoint.
    Whole(char),

    /// Not a valid Unicode codepoint, but the encoding of what *would* be a
    /// UTF-16 leading surrogate code unit (`U+D800` through `U+DBFF`).
    ///
    /// The payload is the code unit's 10-bit offset from the start of that
    /// range.
    ///
    /// Sequences like this occur in UTF-8 variants such as CESU-8 and WTF-8.
    LeadSurrogate(u16),

    /// Not a valid Unicode codepoint, but the encoding of what *would* be a
    /// UTF-16 trailing surrogate code unit (`U+DC00` through `U+DFFF`).
    ///
    /// The payload is the code unit's 10-bit offset from the start of that
    /// range.
    ///
    /// Sequences like this occur in UTF-8 variants such as CESU-8 and WTF-8.
    TrailSurrogate(u16),

    /// The buffer ended after only the beginning of a codepoint.
    ///
    /// The payload is the number of further bytes required to complete it.
    Prefix(usize),

    /// Only the tail of a codepoint was found: the backwards scan reached
    /// the start of the buffer without meeting a start byte.
    ///
    /// As many as 3 more bytes may be needed.
    Suffix,
}
58
59/// Represents a complete or partial UTF-8 codepoint.
60#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
61pub struct Codepoint<'a> {
62    /// The bytes that make up the partial or full codepoint.
63    ///
64    /// For a `Suffix` this depends on `idx`. We don't scan forward
65    /// for additional continuation bytes after the reverse scan
66    /// failed to locate a multibyte sequence start.
67    pub bytes: &'a [u8],
68
69    /// Start of the codepoint in the buffer, expressed as an offset
70    /// back from `idx`.
71    pub rewind: usize,
72
73    /// Meaning of the partial or full codepoint.
74    pub meaning: Meaning,
75}
76
/// Role a single byte plays in UTF-8, judged from its leading bits alone.
#[derive(Debug, PartialEq, Eq)]
enum Byte {
    /// `0xxxxxxx`: a one-byte (ASCII) codepoint.
    Ascii,
    /// Leading byte of a multi-byte sequence; the payload is the total
    /// length of that sequence in bytes (2, 3 or 4).
    Start(usize),
    /// `10xxxxxx`: a continuation byte.
    Cont,
}

impl Byte {
    /// Classifies `x` by its leading bits.
    ///
    /// Returns `None` for `0xF8` and above, which match none of the
    /// recognised leading-bit patterns. No overlong or out-of-range
    /// detection happens here, so bytes such as `0xC0` or `0xF7` are still
    /// reported as `Start`.
    #[inline(always)]
    fn classify(x: u8) -> Option<Byte> {
        // The leading-bit patterns split the byte range into consecutive
        // bands, so an ordered chain of upper bounds identifies the band.
        if x < 0x80 {
            Some(Byte::Ascii)
        } else if x < 0xC0 {
            Some(Byte::Cont)
        } else if x < 0xE0 {
            Some(Byte::Start(2))
        } else if x < 0xF0 {
            Some(Byte::Start(3))
        } else if x < 0xF8 {
            Some(Byte::Start(4))
        } else {
            None
        }
    }
}
99
100#[inline(always)]
101fn all_cont(buf: &[u8]) -> bool {
102    buf.iter().all(|&b| matches!(Byte::classify(b), Some(Byte::Cont)))
103}
104
105// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence:
106// a starting byte followed by the correct number of continuation bytes.
107#[inline(always)]
108unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
109    debug_assert!(buf.len() >= 2);
110    debug_assert!(buf.len() <= 4);
111    let n;
112    match buf.len() {
113        2 => {
114            n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6
115                | ((*buf.get_unchecked(1) & 0x3F) as u32);
116            if n < 0x80 { return None }  // Overlong
117        }
118        3 => {
119            n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12
120                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6
121                | ((*buf.get_unchecked(2) & 0x3F) as u32);
122            match n {
123                0x0000 ... 0x07FF => return None,  // Overlong
124                0xD800 ... 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)),
125                0xDC00 ... 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)),
126                _ => {}
127            }
128        }
129        4 => {
130            n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18
131                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12
132                | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6
133                | ((*buf.get_unchecked(3) & 0x3F) as u32);
134            if n < 0x1_0000 { return None }  // Overlong
135        }
136        _ => debug_unreachable!(),
137    }
138
139    char::from_u32(n).map(Meaning::Whole)
140}
141
/// Returns the `new_len` bytes of `buf` beginning at offset `start`,
/// without bounds checking in release builds.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`. Only debug assertions check this.
#[inline(always)]
unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    let first = buf.as_ptr().add(start);
    slice::from_raw_parts(first, new_len)
}
148
// Shorthand for `unwrap_or_return!(.., None)`: used in functions returning
// `Option` to bail out with `None` when the given expression has no value.
macro_rules! otry {
    ($opt:expr) => (
        unwrap_or_return!($opt, None)
    );
}
152
153/// Describes the UTF-8 codepoint containing the byte at index `idx` within
154/// `buf`.
155///
156/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
157/// in the vicinity of `idx`.
158#[inline]
159pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
160    if idx >= buf.len() {
161        return None;
162    }
163
164    unsafe {
165        let x = *buf.get_unchecked(idx);
166        match otry!(Byte::classify(x)) {
167            Byte::Ascii => Some(Codepoint {
168                bytes: unsafe_slice(buf, idx, 1),
169                rewind: 0,
170                meaning: Meaning::Whole(x as char),
171            }),
172            Byte::Start(n) => {
173                let avail = buf.len() - idx;
174                if avail >= n {
175                    let bytes = unsafe_slice(buf, idx, n);
176                    if !all_cont(unsafe_slice(bytes, 1, n-1)) {
177                        return None;
178                    }
179                    let meaning = otry!(decode(bytes));
180                    Some(Codepoint {
181                        bytes: bytes,
182                        rewind: 0,
183                        meaning: meaning,
184                    })
185                } else {
186                    Some(Codepoint {
187                        bytes: unsafe_slice(buf, idx, avail),
188                        rewind: 0,
189                        meaning: Meaning::Prefix(n - avail),
190                    })
191                }
192            },
193            Byte::Cont => {
194                let mut start = idx;
195                let mut checked = 0;
196                loop {
197                    if start == 0 {
198                        // Whoops, fell off the beginning.
199                        return Some(Codepoint {
200                            bytes: unsafe_slice(buf, 0, idx + 1),
201                            rewind: idx,
202                            meaning: Meaning::Suffix,
203                        });
204                    }
205
206                    start -= 1;
207                    checked += 1;
208                    match otry!(Byte::classify(*buf.get_unchecked(start))) {
209                        Byte::Cont => (),
210                        Byte::Start(n) => {
211                            let avail = buf.len() - start;
212                            if avail >= n {
213                                let bytes = unsafe_slice(buf, start, n);
214                                if checked < n {
215                                    if !all_cont(unsafe_slice(bytes, checked, n-checked)) {
216                                        return None;
217                                    }
218                                }
219                                let meaning = otry!(decode(bytes));
220                                return Some(Codepoint {
221                                    bytes: bytes,
222                                    rewind: idx - start,
223                                    meaning: meaning,
224                                });
225                            } else {
226                                return Some(Codepoint {
227                                    bytes: unsafe_slice(buf, start, avail),
228                                    rewind: idx - start,
229                                    meaning: Meaning::Prefix(n - avail),
230                                });
231                            }
232                        }
233                        _ => return None,
234                    }
235
236                    if idx - start >= 3 {
237                        // We looked at 3 bytes before a continuation byte
238                        // and didn't find a start byte.
239                        return None;
240                    }
241                }
242            }
243        }
244    }
245}
246
247#[cfg(test)]
248mod test;