icu_capi/
utf.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::borrow::Cow;
6
7use core::fmt::Write;
8use writeable::{LengthHint, Part, TryWriteable, Writeable};
9
10#[allow(dead_code)]
11pub(crate) struct LossyWrap<T>(pub T);
12
13impl<T: TryWriteable> Writeable for LossyWrap<T> {
14    fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
15        let _ = self.0.try_write_to(sink)?;
16        Ok(())
17    }
18
19    fn writeable_length_hint(&self) -> LengthHint {
20        self.0.writeable_length_hint()
21    }
22}
23
24use core::{char::DecodeUtf16Error, fmt, str::Utf8Error};
25
/// Implements [`TryWriteable`] for `&[u8]`, decoding the bytes as UTF-8 according to the
/// [WHATWG Encoding Standard](https://encoding.spec.whatwg.org/#utf-8-decoder).
///
/// Well-formed input is written unchanged. Every ill-formed sequence is written as a single
/// U+FFFD REPLACEMENT CHARACTER, and the first decoding error is reported as the
/// [`TryWriteable::Error`].
#[derive(Debug)]
#[allow(clippy::exhaustive_structs)] // newtype
pub struct PotentiallyInvalidUtf8<'a>(pub &'a [u8]);
31
32impl TryWriteable for PotentiallyInvalidUtf8<'_> {
33    type Error = Utf8Error;
34
35    fn try_write_to_parts<S: writeable::PartsWrite + ?Sized>(
36        &self,
37        sink: &mut S,
38    ) -> Result<Result<(), Self::Error>, fmt::Error> {
39        let mut remaining = self.0;
40        let mut r = Ok(());
41        loop {
42            match core::str::from_utf8(remaining) {
43                Ok(valid) => {
44                    sink.write_str(valid)?;
45                    return Ok(r);
46                }
47                Err(e) => {
48                    // SAFETY: By Utf8Error invariants
49                    let valid = unsafe {
50                        core::str::from_utf8_unchecked(remaining.get_unchecked(..e.valid_up_to()))
51                    };
52                    sink.write_str(valid)?;
53                    sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?;
54                    if r.is_ok() {
55                        r = Err(e);
56                    }
57                    let Some(error_len) = e.error_len() else {
58                        return Ok(r); // end of string
59                    };
60                    // SAFETY: By Utf8Error invariants
61                    remaining = unsafe { remaining.get_unchecked(e.valid_up_to() + error_len..) }
62                }
63            }
64        }
65    }
66
67    fn writeable_length_hint(&self) -> writeable::LengthHint {
68        // Lower bound is all valid UTF-8, upper bound is all bytes with the high bit, which become replacement characters.
69        LengthHint::between(self.0.len(), self.0.len() * 3)
70    }
71
72    fn try_write_to_string(&self) -> Result<Cow<str>, (Self::Error, Cow<str>)> {
73        match core::str::from_utf8(self.0) {
74            Ok(valid) => Ok(Cow::Borrowed(valid)),
75            Err(e) => {
76                // SAFETY: By Utf8Error invariants
77                let valid = unsafe {
78                    core::str::from_utf8_unchecked(self.0.get_unchecked(..e.valid_up_to()))
79                };
80
81                // Let's assume this is the only error
82                let mut out = alloc::string::String::with_capacity(
83                    self.0.len() + char::REPLACEMENT_CHARACTER.len_utf8()
84                        - e.error_len().unwrap_or(0),
85                );
86
87                out.push_str(valid);
88                out.push(char::REPLACEMENT_CHARACTER);
89
90                // If there's more, we can use `try_write_to`
91                if let Some(error_len) = e.error_len() {
92                    // SAFETY: By Utf8Error invariants
93                    let remaining = unsafe { self.0.get_unchecked(e.valid_up_to() + error_len..) };
94                    let _discard = Self(remaining).try_write_to(&mut out);
95                }
96
97                Err((e, Cow::Owned(out)))
98            }
99        }
100    }
101}
102
/// Implements [`TryWriteable`] for `&[u16]`, decoding the code units as UTF-16 according to the
/// [WHATWG Encoding Standard](https://encoding.spec.whatwg.org/#shared-utf-16-decoder).
///
/// Well-formed input is written as the decoded characters. Every code unit that fails to
/// decode is written as a single U+FFFD REPLACEMENT CHARACTER, and the first decoding error
/// is reported as the [`TryWriteable::Error`].
#[derive(Debug)]
#[allow(clippy::exhaustive_structs)] // newtype
pub struct PotentiallyInvalidUtf16<'a>(pub &'a [u16]);
108
109impl TryWriteable for PotentiallyInvalidUtf16<'_> {
110    type Error = DecodeUtf16Error;
111
112    fn try_write_to_parts<S: writeable::PartsWrite + ?Sized>(
113        &self,
114        sink: &mut S,
115    ) -> Result<Result<(), Self::Error>, fmt::Error> {
116        let mut r = Ok(());
117        for c in core::char::decode_utf16(self.0.iter().copied()) {
118            match c {
119                Ok(c) => sink.write_char(c)?,
120                Err(e) => {
121                    if r.is_ok() {
122                        r = Err(e);
123                    }
124                    sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?;
125                }
126            }
127        }
128        Ok(r)
129    }
130
131    fn writeable_length_hint(&self) -> LengthHint {
132        // Lower bound is all ASCII, upper bound is all 3-byte code points (including replacement character)
133        LengthHint::between(self.0.len(), self.0.len() * 3)
134    }
135}
136
#[cfg(test)]
mod test {
    #![allow(invalid_from_utf8)] // only way to construct the error
    use super::*;
    use writeable::assert_try_writeable_parts_eq;

    /// Checks UTF-8 output, reported error and `Part::ERROR` ranges for empty, valid,
    /// invalid-in-the-middle and truncated-at-the-end inputs.
    #[test]
    fn test_utf8() {
        // Empty input writes nothing and reports no error.
        assert_try_writeable_parts_eq!(PotentiallyInvalidUtf8(b""), "", Ok(()), []);
        assert_try_writeable_parts_eq!(PotentiallyInvalidUtf8(b"Foo Bar"), "Foo Bar", Ok(()), []);
        assert_try_writeable_parts_eq!(
            PotentiallyInvalidUtf8(b"Foo\xFDBar"),
            "Foo�Bar",
            Err(core::str::from_utf8(b"Foo\xFDBar").unwrap_err()),
            [(3, 6, Part::ERROR)]
        );
        assert_try_writeable_parts_eq!(
            PotentiallyInvalidUtf8(b"Foo\xFDBar\xff"),
            "Foo�Bar�",
            Err(core::str::from_utf8(b"Foo\xFDBar\xff").unwrap_err()),
            [(3, 6, Part::ERROR), (9, 12, Part::ERROR)],
        );
        // A multi-byte sequence cut off by the end of input (`error_len()` is `None`)
        // becomes a single replacement character.
        assert_try_writeable_parts_eq!(
            PotentiallyInvalidUtf8(b"Foo\xE2\x82"),
            "Foo\u{FFFD}",
            Err(core::str::from_utf8(b"Foo\xE2\x82").unwrap_err()),
            [(3, 6, Part::ERROR)],
        );
    }

    /// Checks UTF-16 output, reported error and `Part::ERROR` ranges for a valid surrogate
    /// pair and for unpaired surrogates in the middle and at the end of the input.
    #[test]
    fn test_utf16() {
        assert_try_writeable_parts_eq!(
            PotentiallyInvalidUtf16(&[0xD83E, 0xDD73]),
            "🥳",
            Ok(()),
            []
        );
        assert_try_writeable_parts_eq!(
            PotentiallyInvalidUtf16(&[0xD83E, 0x20, 0xDD73]),
            "� �",
            Err(core::char::decode_utf16([0xD83E].into_iter())
                .next()
                .unwrap()
                .unwrap_err()),
            [(0, 3, Part::ERROR), (4, 7, Part::ERROR)]
        );
        // A high surrogate at the very end of the input has nothing to pair with.
        assert_try_writeable_parts_eq!(
            PotentiallyInvalidUtf16(&[0x41, 0xD83E]),
            "A\u{FFFD}",
            Err(core::char::decode_utf16([0xD83E].into_iter())
                .next()
                .unwrap()
                .unwrap_err()),
            [(1, 4, Part::ERROR)]
        );
    }
}