jiff/util/utf8.rs
1use core::cmp::Ordering;
2
/// An invalid UTF-8 sequence, as reported by `decode`.
///
/// The offending bytes are stored inline. The error is guaranteed to
/// hold 1, 2 or 3 bytes.
pub(crate) struct Utf8Error {
    /// Inline storage. Only the first `len` bytes are meaningful; the
    /// remainder is zero padding.
    bytes: [u8; 3],
    /// The number of meaningful bytes at the start of `bytes`.
    len: u8,
}

impl Utf8Error {
    /// Builds an error from the bytes that failed validation and the
    /// error std reported for them.
    ///
    /// The invalid sequence is taken from the start of `original_bytes`.
    /// Its length is `err.error_len()` when std provides one, and the
    /// full length of `original_bytes` otherwise (i.e., when the input
    /// ended in the middle of a sequence).
    #[cold]
    #[inline(never)]
    fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error {
        let invalid_len = match err.error_len() {
            Some(n) => n,
            None => original_bytes.len(),
        };
        // An invalid UTF-8 sequence is never longer than 3 bytes, so
        // the inline buffer below is always big enough.
        debug_assert!((1..=3).contains(&invalid_len));

        let mut bytes = [0u8; 3];
        bytes[..invalid_len].copy_from_slice(&original_bytes[..invalid_len]);

        // The conversion cannot fail for the same reason: the length
        // is at most 3.
        let len = u8::try_from(invalid_len).unwrap();
        Utf8Error { bytes, len }
    }

    /// Returns the invalid UTF-8 bytes.
    ///
    /// The length of the returned slice is always equal to
    /// `Utf8Error::len`.
    pub(crate) fn as_slice(&self) -> &[u8] {
        let end = self.len();
        &self.bytes[..end]
    }

    /// Returns how many bytes make up the invalid UTF-8 sequence.
    ///
    /// This is guaranteed to be 1, 2 or 3.
    pub(crate) fn len(&self) -> usize {
        usize::from(self.len)
    }
}
45
46impl core::fmt::Display for Utf8Error {
47 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
48 write!(
49 f,
50 "found invalid UTF-8 byte {errant_bytes:?} in format \
51 string (format strings must be valid UTF-8)",
52 errant_bytes = crate::util::escape::Bytes(self.as_slice()),
53 )
54 }
55}
56
57/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
58///
59/// If no valid encoding of a codepoint exists at the beginning of the
60/// given byte slice, then a 1-3 byte slice is returned (which is guaranteed
61/// to be a prefix of `bytes`). That byte slice corresponds either to a single
62/// invalid byte, or to a prefix of a valid UTF-8 encoding of a Unicode scalar
63/// value (but which ultimately did not lead to a valid encoding).
64///
65/// This returns `None` if and only if `bytes` is empty.
66///
67/// This never panics.
68///
69/// *WARNING*: This is not designed for performance. If you're looking for
70/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
71/// this crate, then please file an issue and discuss your use case.
72pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
73 if bytes.is_empty() {
74 return None;
75 }
76 let string = match core::str::from_utf8(&bytes[..bytes.len().min(4)]) {
77 Ok(s) => s,
78 Err(ref err) if err.valid_up_to() > 0 => {
79 // OK because we just verified we have at least some
80 // valid UTF-8.
81 core::str::from_utf8(&bytes[..err.valid_up_to()]).unwrap()
82 }
83 // In this case, we want to return 1-3 bytes that make up a prefix of
84 // a potentially valid codepoint.
85 Err(err) => return Some(Err(Utf8Error::new(bytes, err))),
86 };
87 // OK because we guaranteed above that `string`
88 // must be non-empty. And thus, `str::chars` must
89 // yield at least one Unicode scalar value.
90 Some(Ok(string.chars().next().unwrap()))
91}
92
93/// Like std's `eq_ignore_ascii_case`, but returns a full `Ordering`.
94#[inline]
95pub(crate) fn cmp_ignore_ascii_case(s1: &str, s2: &str) -> Ordering {
96 cmp_ignore_ascii_case_bytes(s1.as_bytes(), s2.as_bytes())
97}
98
/// Compares two byte slices while ignoring ASCII case, like std's
/// `eq_ignore_ascii_case`, except that a full `Ordering` is returned.
///
/// Bytes are compared position by position after ASCII lowercasing. The
/// first position that differs decides the result. If one slice is a
/// (case insensitive) prefix of the other, the shorter slice orders
/// first.
#[inline]
pub(crate) fn cmp_ignore_ascii_case_bytes(s1: &[u8], s2: &[u8]) -> Ordering {
    // An earlier version of this function was written as:
    //
    //     let it1 = s1.iter().map(|&b| b.to_ascii_lowercase());
    //     let it2 = s2.iter().map(|&b| b.to_ascii_lowercase());
    //     it1.cmp(it2)
    //
    // But an explicit indexed loop seemed to do better in
    // microbenchmarks, so that shape is kept here.
    let mut pos = 0;
    loop {
        let (lhs, rhs) = match (s1.get(pos), s2.get(pos)) {
            (Some(&lhs), Some(&rhs)) => {
                (lhs.to_ascii_lowercase(), rhs.to_ascii_lowercase())
            }
            // At least one side is exhausted and everything before `pos`
            // matched, so the lengths alone decide the ordering.
            _ => return s1.len().cmp(&s2.len()),
        };
        if lhs != rhs {
            return lhs.cmp(&rhs);
        }
        pos += 1;
    }
}