jiff/util/
utf8.rs

1use core::cmp::Ordering;
2
/// An invalid UTF-8 sequence encountered while decoding.
///
/// `decode` hands this back when the front of its input is not a valid
/// encoding of a Unicode scalar value. The sequence stored here is always
/// 1, 2 or 3 bytes long.
pub(crate) struct Utf8Error {
    /// Backing storage for the offending bytes. Only the first `len` entries
    /// are meaningful; the remainder is zero padding.
    bytes: [u8; 3],
    /// How many entries of `bytes` are in use.
    len: u8,
}

impl Utf8Error {
    /// Builds an error from the input that failed to decode and the error
    /// that `core::str::from_utf8` reported for it.
    ///
    /// The invalid sequence is taken from the start of `original_bytes`, so
    /// callers are expected to pass an error whose invalid sequence begins at
    /// offset zero.
    #[cold]
    #[inline(never)]
    fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error {
        // `error_len` is `None` when the input ended in the middle of a
        // sequence. In that case everything we were given is the invalid
        // prefix.
        let len = match err.error_len() {
            Some(n) => n,
            None => original_bytes.len(),
        };
        // An invalid UTF-8 sequence is never longer than 3 bytes.
        debug_assert!((1..=3).contains(&len));
        let mut bytes = [0u8; 3];
        bytes[..len].copy_from_slice(&original_bytes[..len]);
        // Cannot fail: the copy above already required `len <= 3`.
        let len = u8::try_from(len).unwrap();
        Utf8Error { bytes, len }
    }

    /// Returns the invalid UTF-8 bytes as a slice.
    ///
    /// The length of the returned slice always equals `Utf8Error::len`.
    pub(crate) fn as_slice(&self) -> &[u8] {
        let end = self.len();
        &self.bytes[..end]
    }

    /// Returns how many bytes make up the invalid UTF-8 sequence.
    ///
    /// The result is always 1, 2 or 3.
    pub(crate) fn len(&self) -> usize {
        self.len.into()
    }
}
45
46impl core::fmt::Display for Utf8Error {
47    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
48        write!(
49            f,
50            "found invalid UTF-8 byte {errant_bytes:?} in format \
51             string (format strings must be valid UTF-8)",
52            errant_bytes = crate::util::escape::Bytes(self.as_slice()),
53        )
54    }
55}
56
57/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
58///
59/// If no valid encoding of a codepoint exists at the beginning of the
60/// given byte slice, then a 1-3 byte slice is returned (which is guaranteed
61/// to be a prefix of `bytes`). That byte slice corresponds either to a single
62/// invalid byte, or to a prefix of a valid UTF-8 encoding of a Unicode scalar
63/// value (but which ultimately did not lead to a valid encoding).
64///
65/// This returns `None` if and only if `bytes` is empty.
66///
67/// This never panics.
68///
69/// *WARNING*: This is not designed for performance. If you're looking for
70/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
71/// this crate, then please file an issue and discuss your use case.
72pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
73    if bytes.is_empty() {
74        return None;
75    }
76    let string = match core::str::from_utf8(&bytes[..bytes.len().min(4)]) {
77        Ok(s) => s,
78        Err(ref err) if err.valid_up_to() > 0 => {
79            // OK because we just verified we have at least some
80            // valid UTF-8.
81            core::str::from_utf8(&bytes[..err.valid_up_to()]).unwrap()
82        }
83        // In this case, we want to return 1-3 bytes that make up a prefix of
84        // a potentially valid codepoint.
85        Err(err) => return Some(Err(Utf8Error::new(bytes, err))),
86    };
87    // OK because we guaranteed above that `string`
88    // must be non-empty. And thus, `str::chars` must
89    // yield at least one Unicode scalar value.
90    Some(Ok(string.chars().next().unwrap()))
91}
92
93/// Like std's `eq_ignore_ascii_case`, but returns a full `Ordering`.
94#[inline]
95pub(crate) fn cmp_ignore_ascii_case(s1: &str, s2: &str) -> Ordering {
96    cmp_ignore_ascii_case_bytes(s1.as_bytes(), s2.as_bytes())
97}
98
/// Orders two byte slices while ignoring ASCII case.
///
/// This is the `Ordering` returning counterpart of std's
/// `eq_ignore_ascii_case`, operating on `&[u8]`. Each byte is passed through
/// `to_ascii_lowercase` before being compared. When one slice is a prefix of
/// the other under that comparison, the shorter slice sorts first.
#[inline]
pub(crate) fn cmp_ignore_ascii_case_bytes(s1: &[u8], s2: &[u8]) -> Ordering {
    // An earlier version of this function was written as:
    //
    //     let it1 = s1.iter().map(|&b| b.to_ascii_lowercase());
    //     let it2 = s2.iter().map(|&b| b.to_ascii_lowercase());
    //     it1.cmp(it2)
    //
    // An explicit loop was reported to do better in microbenchmarks, so the
    // comparison is spelled out by hand. Re-measure before switching back.
    for (&lhs, &rhs) in s1.iter().zip(s2) {
        let lhs = lhs.to_ascii_lowercase();
        let rhs = rhs.to_ascii_lowercase();
        if lhs != rhs {
            return lhs.cmp(&rhs);
        }
    }
    // The shared prefix matched, so the lengths decide the result.
    s1.len().cmp(&s2.len())
}