jiff/shared/util/utf8.rs
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// Only the first four bytes of `bytes` (fewer if `bytes` is shorter) are
/// ever examined, since that is the longest possible UTF-8 encoding of a
/// single Unicode scalar value.
///
/// # Returns
///
/// * `None` if and only if `bytes` is empty.
/// * `Some(Ok(ch))` when `bytes` begins with a valid UTF-8 encoding of the
///   Unicode scalar value `ch`.
/// * `Some(Err(prefix))` when no valid encoding of a codepoint exists at the
///   beginning of `bytes`. In that case `prefix` is a 1-3 byte slice that is
///   guaranteed to be a prefix of `bytes`. It corresponds either to a single
///   invalid byte, or to a prefix of a valid UTF-8 encoding of a Unicode
///   scalar value (but which ultimately did not lead to a valid encoding).
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for
/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
/// this crate, then please file an issue and discuss your use case.
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, &[u8]>> {
    if bytes.is_empty() {
        return None;
    }
    // A single scalar value occupies at most 4 bytes in UTF-8, so validating
    // a window of that size is enough to classify the front of `bytes`.
    let window = &bytes[..bytes.len().min(4)];
    let string = match core::str::from_utf8(window) {
        Ok(s) => s,
        // The window starts with at least one complete codepoint and only
        // goes wrong later. Keep just the valid part; re-validating it
        // cannot fail because `valid_up_to` marks a verified boundary.
        Err(ref err) if err.valid_up_to() > 0 => {
            core::str::from_utf8(&window[..err.valid_up_to()]).unwrap()
        }
        // In this case, we want to return 1-3 bytes that make up a prefix of
        // a potentially valid codepoint.
        //
        // `error_len` is `None` only when the input ended in the middle of
        // a sequence. That cannot happen with a full 4-byte window, so the
        // fallback length is at most 3. Using `window.len()` (rather than
        // `bytes.len()`) keeps the result bounded by the window regardless.
        Err(err) => {
            let bad = err.error_len().unwrap_or(window.len());
            return Some(Err(&bytes[..bad]));
        }
    };
    // OK because we guaranteed above that `string` must be non-empty: the
    // input is non-empty in the `Ok` arm, and `valid_up_to() > 0` in the
    // other. Thus, `str::chars` must yield at least one Unicode scalar value.
    Some(Ok(string.chars().next().unwrap()))
}