regex_automata/util/utf8.rs
1/*!
2Utilities for dealing with UTF-8.
3
4This module provides some UTF-8 related helper routines, including an
5incremental decoder.
6*/
7
8/// Returns true if and only if the given byte is considered a word character.
9/// This only applies to ASCII.
10///
11/// This was copied from regex-syntax so that we can use it to determine the
12/// starting DFA state while searching without depending on regex-syntax. The
13/// definition is never going to change, so there's no maintenance/bit-rot
14/// hazard here.
15#[cfg_attr(feature = "perf-inline", inline(always))]
16pub(crate) fn is_word_byte(b: u8) -> bool {
17 const fn mkwordset() -> [bool; 256] {
18 // FIXME: Use as_usize() once const functions in traits are stable.
19 let mut set = [false; 256];
20 set[b'_' as usize] = true;
21
22 let mut byte = b'0';
23 while byte <= b'9' {
24 set[byte as usize] = true;
25 byte += 1;
26 }
27 byte = b'A';
28 while byte <= b'Z' {
29 set[byte as usize] = true;
30 byte += 1;
31 }
32 byte = b'a';
33 while byte <= b'z' {
34 set[byte as usize] = true;
35 byte += 1;
36 }
37 set
38 }
39 const WORD: [bool; 256] = mkwordset();
40 WORD[b as usize]
41}
42
43/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
44///
45/// If no valid encoding of a codepoint exists at the beginning of the given
46/// byte slice, then the first byte is returned instead.
47///
48/// This returns `None` if and only if `bytes` is empty.
49///
50/// This never panics.
51///
52/// *WARNING*: This is not designed for performance. If you're looking for a
53/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
54/// crate, then please file an issue and discuss your use case.
55#[cfg_attr(feature = "perf-inline", inline(always))]
56pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
57 if bytes.is_empty() {
58 return None;
59 }
60 let len = match len(bytes[0]) {
61 None => return Some(Err(bytes[0])),
62 Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
63 Some(1) => return Some(Ok(char::from(bytes[0]))),
64 Some(len) => len,
65 };
66 match core::str::from_utf8(&bytes[..len]) {
67 Ok(s) => Some(Ok(s.chars().next().unwrap())),
68 Err(_) => Some(Err(bytes[0])),
69 }
70}
71
72/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
73///
74/// If no valid encoding of a codepoint exists at the end of the given byte
75/// slice, then the last byte is returned instead.
76///
77/// This returns `None` if and only if `bytes` is empty.
78#[cfg_attr(feature = "perf-inline", inline(always))]
79pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
80 if bytes.is_empty() {
81 return None;
82 }
83 let mut start = bytes.len() - 1;
84 let limit = bytes.len().saturating_sub(4);
85 while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
86 start -= 1;
87 }
88 match decode(&bytes[start..]) {
89 None => None,
90 Some(Ok(ch)) => Some(Ok(ch)),
91 Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
92 }
93}
94
95/// Given a UTF-8 leading byte, this returns the total number of code units
96/// in the following encoded codepoint.
97///
98/// If the given byte is not a valid UTF-8 leading byte, then this returns
99/// `None`.
100#[cfg_attr(feature = "perf-inline", inline(always))]
101fn len(byte: u8) -> Option<usize> {
102 match byte {
103 0b0000_0000..=0b0111_1111 => Some(1),
104 0b1000_0000..=0b1011_1111 => None,
105 0b1100_0000..=0b1101_1111 => Some(2),
106 0b1110_0000..=0b1110_1111 => Some(3),
107 0b1111_0000..=0b1111_0111 => Some(4),
108 _ => None,
109 }
110}
111
112/// Returns true if and only if the given offset in the given bytes falls on a
113/// valid UTF-8 encoded codepoint boundary.
114///
115/// If `bytes` is not valid UTF-8, then the behavior of this routine is
116/// unspecified.
117#[cfg_attr(feature = "perf-inline", inline(always))]
118pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
119 match bytes.get(i) {
120 // The position at the end of the bytes always represents an empty
121 // string, which is a valid boundary. But anything after that doesn't
122 // make much sense to call valid a boundary.
123 None => i == bytes.len(),
124 // Other than ASCII (where the most significant bit is never set),
125 // valid starting bytes always have their most significant two bits
126 // set, where as continuation bytes never have their second most
127 // significant bit set. Therefore, this only returns true when bytes[i]
128 // corresponds to a byte that begins a valid UTF-8 encoding of a
129 // Unicode scalar value.
130 Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
131 }
132}
133
134/// Returns true if and only if the given byte is either a valid leading UTF-8
135/// byte, or is otherwise an invalid byte that can never appear anywhere in a
136/// valid UTF-8 sequence.
137#[cfg_attr(feature = "perf-inline", inline(always))]
138fn is_leading_or_invalid_byte(b: u8) -> bool {
139 // In the ASCII case, the most significant bit is never set. The leading
140 // byte of a 2/3/4-byte sequence always has the top two most significant
141 // bits set. For bytes that can never appear anywhere in valid UTF-8, this
142 // also returns true, since every such byte has its two most significant
143 // bits set:
144 //
145 // \xC0 :: 11000000
146 // \xC1 :: 11000001
147 // \xF5 :: 11110101
148 // \xF6 :: 11110110
149 // \xF7 :: 11110111
150 // \xF8 :: 11111000
151 // \xF9 :: 11111001
152 // \xFA :: 11111010
153 // \xFB :: 11111011
154 // \xFC :: 11111100
155 // \xFD :: 11111101
156 // \xFE :: 11111110
157 // \xFF :: 11111111
158 (b & 0b1100_0000) != 0b1000_0000
159}
160
161/*
162/// Returns the smallest possible index of the next valid UTF-8 sequence
163/// starting after `i`.
164///
165/// For all inputs, including invalid UTF-8 and any value of `i`, the return
166/// value is guaranteed to be greater than `i`. (If there is no value greater
167/// than `i` that fits in `usize`, then this panics.)
168///
169/// Generally speaking, this should only be called on `text` when it is
170/// permitted to assume that it is valid UTF-8 and where either `i >=
171/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
172///
173/// NOTE: This method was used in a previous conception of iterators where we
174/// specifically tried to skip over empty matches that split a codepoint by
175/// simply requiring that our next search begin at the beginning of codepoint.
176/// But we ended up changing that technique to always advance by 1 byte and
177/// then filter out matches that split a codepoint after-the-fact. Thus, we no
178/// longer use this method. But I've kept it around in case we want to switch
179/// back to this approach. Its guarantees are a little subtle, so I'd prefer
180/// not to rebuild it from whole cloth.
181pub(crate) fn next(text: &[u8], i: usize) -> usize {
182 let b = match text.get(i) {
183 None => return i.checked_add(1).unwrap(),
184 Some(&b) => b,
185 };
186 // For cases where we see an invalid UTF-8 byte, there isn't much we can do
187 // other than just start at the next byte.
188 let inc = len(b).unwrap_or(1);
189 i.checked_add(inc).unwrap()
190}
191*/