tendril/
fmt.rs

// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
6
//! Marker types for formats.
//!
//! This module defines the types and traits used to mark a `Tendril`
//! with the format of data it contains. It includes those formats
//! for which `Tendril` supports at least some operations without
//! conversion.
//!
//! To convert a string tendril to/from a byte tendril in an arbitrary
//! character encoding, see the `encode` and `decode` methods on
//! `Tendril`.
//!
//! `Tendril` operations may become memory-unsafe if data invalid for
//! the format sneaks in. For that reason, these traits require
//! `unsafe impl`.
21
22use std::default::Default;
23use std::{char, mem, str};
24
25use crate::futf::{self, Codepoint, Meaning};
26
/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// We can drop characters on either side of the splice,
    /// and insert up to 4 bytes in the middle.
    ///
    /// The `Default` value is all zeroes: drop nothing, insert nothing.
    #[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
    pub struct Fixup {
        /// How much to drop from the end of the left-hand buffer.
        pub drop_left: u32,
        /// How much to drop from the start of the right-hand buffer.
        pub drop_right: u32,
        /// Number of leading bytes of `insert_bytes` that are meaningful.
        pub insert_len: u32,
        /// Bytes to insert at the splice point; only the first
        /// `insert_len` of them are used.
        pub insert_bytes: [u8; 4],
    }

    /// Iterator over a buffer in a single-byte encoding.
    ///
    /// Yields each byte's index together with the `char` whose code point
    /// equals the byte's value.
    #[derive(Clone, Debug)]
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // Every `u8` value is a valid Unicode scalar value, so the
            // infallible `From<u8>` conversion is all that is needed here;
            // no `unsafe` is required.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }

        /// Forwards the exact bounds of the underlying slice iterator so
        /// that consumers such as `collect` can pre-allocate.
        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Create an iterator over the bytes of `buf`.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
82
83/// Trait for format marker types.
84///
85/// The type implementing this trait is usually not instantiated.
86/// It's used with a phantom type parameter of `Tendril`.
87pub unsafe trait Format {
88    /// Check whether the buffer is valid for this format.
89    fn validate(buf: &[u8]) -> bool;
90
91    /// Check whether the buffer is valid for this format.
92    ///
93    /// You may assume the buffer is a prefix of a valid buffer.
94    #[inline]
95    fn validate_prefix(buf: &[u8]) -> bool {
96        <Self as Format>::validate(buf)
97    }
98
99    /// Check whether the buffer is valid for this format.
100    ///
101    /// You may assume the buffer is a suffix of a valid buffer.
102    #[inline]
103    fn validate_suffix(buf: &[u8]) -> bool {
104        <Self as Format>::validate(buf)
105    }
106
107    /// Check whether the buffer is valid for this format.
108    ///
109    /// You may assume the buffer is a contiguous subsequence
110    /// of a valid buffer, but not necessarily a prefix or
111    /// a suffix.
112    #[inline]
113    fn validate_subseq(buf: &[u8]) -> bool {
114        <Self as Format>::validate(buf)
115    }
116
117    /// Compute any fixup needed when concatenating buffers.
118    ///
119    /// The default is to do nothing.
120    ///
121    /// The function is `unsafe` because it may assume the input
122    /// buffers are already valid for the format. Also, no
123    /// bounds-checking is performed on the return value!
124    #[inline(always)]
125    unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
126        Default::default()
127    }
128}
129
130/// Indicates that one format is a subset of another.
131///
132/// The subset format can be converted to the superset format
133/// for free.
134pub unsafe trait SubsetOf<Super>: Format
135where
136    Super: Format,
137{
138    /// Validate the *other* direction of conversion; check if
139    /// this buffer from the superset format conforms to the
140    /// subset format.
141    ///
142    /// The default calls `Self::validate`, but some conversions
143    /// may implement a check which is cheaper than validating
144    /// from scratch.
145    fn revalidate_subset(x: &[u8]) -> bool {
146        Self::validate(x)
147    }
148}
149
150/// Indicates a format which corresponds to a Rust slice type,
151/// representing exactly the same invariants.
152pub unsafe trait SliceFormat: Format + Sized {
153    type Slice: ?Sized + Slice;
154}
155
156/// Indicates a format which contains characters from Unicode
157/// (all of it, or some proper subset).
158pub unsafe trait CharFormat<'a>: Format {
159    /// Iterator for characters and their byte indices.
160    type Iter: Iterator<Item = (usize, char)>;
161
162    /// Iterate over the characters of the string and their byte
163    /// indices.
164    ///
165    /// You may assume the buffer is *already validated* for `Format`.
166    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;
167
168    /// Encode the character as bytes and pass them to a continuation.
169    ///
170    /// Returns `Err(())` iff the character cannot be represented.
171    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
172    where
173        F: FnOnce(&[u8]);
174}
175
/// Marks a Rust slice type whose in-memory representation is plain bytes.
pub unsafe trait Slice {
    /// Borrow the raw bytes backing this slice.
    fn as_bytes(&self) -> &[u8];

    /// Reinterpret a shared byte slice as this kind of slice.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Reinterpret a mutable byte slice as this kind of slice.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
193
194/// Marker type for uninterpreted bytes.
195///
196/// Validation will never fail for this format.
197#[derive(Copy, Clone, Default, Debug)]
198pub struct Bytes;
199
200unsafe impl Format for Bytes {
201    #[inline(always)]
202    fn validate(_: &[u8]) -> bool {
203        true
204    }
205}
206
207unsafe impl SliceFormat for Bytes {
208    type Slice = [u8];
209}
210
211unsafe impl Slice for [u8] {
212    #[inline(always)]
213    fn as_bytes(&self) -> &[u8] {
214        self
215    }
216
217    #[inline(always)]
218    unsafe fn from_bytes(x: &[u8]) -> &[u8] {
219        x
220    }
221
222    #[inline(always)]
223    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] {
224        x
225    }
226}
227
228/// Marker type for ASCII text.
229#[derive(Copy, Clone, Default, Debug)]
230pub struct ASCII;
231
232unsafe impl Format for ASCII {
233    #[inline]
234    fn validate(buf: &[u8]) -> bool {
235        buf.iter().all(|&n| n <= 127)
236    }
237
238    #[inline(always)]
239    fn validate_prefix(_: &[u8]) -> bool {
240        true
241    }
242
243    #[inline(always)]
244    fn validate_suffix(_: &[u8]) -> bool {
245        true
246    }
247
248    #[inline(always)]
249    fn validate_subseq(_: &[u8]) -> bool {
250        true
251    }
252}
253
254unsafe impl SubsetOf<UTF8> for ASCII {}
255unsafe impl SubsetOf<Latin1> for ASCII {}
256
257unsafe impl<'a> CharFormat<'a> for ASCII {
258    type Iter = imp::SingleByteCharIndices<'a>;
259
260    #[inline]
261    unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
262        imp::SingleByteCharIndices::new(buf)
263    }
264
265    #[inline]
266    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
267    where
268        F: FnOnce(&[u8]),
269    {
270        let n = ch as u32;
271        if n > 0x7F {
272            return Err(());
273        }
274        cont(&[n as u8]);
275        Ok(())
276    }
277}
278
279/// Marker type for UTF-8 text.
280#[derive(Copy, Clone, Default, Debug)]
281pub struct UTF8;
282
283unsafe impl Format for UTF8 {
284    #[inline]
285    fn validate(buf: &[u8]) -> bool {
286        str::from_utf8(buf).is_ok()
287    }
288
289    #[inline]
290    fn validate_prefix(buf: &[u8]) -> bool {
291        if buf.is_empty() {
292            return true;
293        }
294        matches!(
295            futf::classify(buf, buf.len() - 1),
296            Some(Codepoint {
297                meaning: Meaning::Whole(_),
298                ..
299            })
300        )
301    }
302
303    #[inline]
304    fn validate_suffix(buf: &[u8]) -> bool {
305        if buf.is_empty() {
306            return true;
307        }
308        matches!(
309            futf::classify(buf, 0),
310            Some(Codepoint {
311                meaning: Meaning::Whole(_),
312                ..
313            })
314        )
315    }
316
317    #[inline]
318    fn validate_subseq(buf: &[u8]) -> bool {
319        <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
320    }
321}
322
323unsafe impl SubsetOf<WTF8> for UTF8 {}
324
325unsafe impl SliceFormat for UTF8 {
326    type Slice = str;
327}
328
329unsafe impl Slice for str {
330    #[inline(always)]
331    fn as_bytes(&self) -> &[u8] {
332        str::as_bytes(self)
333    }
334
335    #[inline(always)]
336    unsafe fn from_bytes(x: &[u8]) -> &str {
337        str::from_utf8_unchecked(x)
338    }
339
340    #[inline(always)]
341    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
342        mem::transmute(x)
343    }
344}
345
346unsafe impl<'a> CharFormat<'a> for UTF8 {
347    type Iter = str::CharIndices<'a>;
348
349    #[inline]
350    unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
351        str::from_utf8_unchecked(buf).char_indices()
352    }
353
354    #[inline]
355    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
356    where
357        F: FnOnce(&[u8]),
358    {
359        cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes());
360        Ok(())
361    }
362}
363
364/// Marker type for WTF-8 text.
365///
366/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
367#[derive(Copy, Clone, Default, Debug)]
368pub struct WTF8;
369
370#[inline]
371fn wtf8_meaningful(m: Meaning) -> bool {
372    matches!(
373        m,
374        Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_)
375    )
376}
377
378unsafe impl Format for WTF8 {
379    #[inline]
380    fn validate(buf: &[u8]) -> bool {
381        let mut i = 0;
382        let mut prev_lead = false;
383        while i < buf.len() {
384            let Some(codept) = futf::classify(buf, i) else {
385                return false;
386            };
387            if !wtf8_meaningful(codept.meaning) {
388                return false;
389            }
390            i += codept.bytes.len();
391            prev_lead = match codept.meaning {
392                Meaning::TrailSurrogate(_) if prev_lead => return false,
393                Meaning::LeadSurrogate(_) => true,
394                _ => false,
395            };
396        }
397
398        true
399    }
400
401    #[inline]
402    fn validate_prefix(buf: &[u8]) -> bool {
403        if buf.is_empty() {
404            return true;
405        }
406        match futf::classify(buf, buf.len() - 1) {
407            Some(c) => wtf8_meaningful(c.meaning),
408            _ => false,
409        }
410    }
411
412    #[inline]
413    fn validate_suffix(buf: &[u8]) -> bool {
414        if buf.is_empty() {
415            return true;
416        }
417        match futf::classify(buf, 0) {
418            Some(c) => wtf8_meaningful(c.meaning),
419            _ => false,
420        }
421    }
422
423    #[inline]
424    fn validate_subseq(buf: &[u8]) -> bool {
425        <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
426    }
427
428    #[inline]
429    unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
430        const ERR: &str = "WTF8: internal error";
431
432        if lhs.len() >= 3 && rhs.len() >= 3 {
433            if let (
434                Some(Codepoint {
435                    meaning: Meaning::LeadSurrogate(hi),
436                    ..
437                }),
438                Some(Codepoint {
439                    meaning: Meaning::TrailSurrogate(lo),
440                    ..
441                }),
442            ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0))
443            {
444                let mut fixup = imp::Fixup {
445                    drop_left: 3,
446                    drop_right: 3,
447                    insert_len: 0,
448                    insert_bytes: [0_u8; 4],
449                };
450
451                let n = 0x10000 + ((hi as u32) << 10) + (lo as u32);
452
453                let ch = char::from_u32(n).expect(ERR);
454                fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32;
455
456                return fixup;
457            }
458        }
459
460        Default::default()
461    }
462}
463
464/// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
465///
466/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
467/// C0 and C1 control characters from ECMA-48 / ISO 6429.
468///
469/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
470/// many other aliases), which actually stand for Windows-1252.
471#[derive(Copy, Clone, Default, Debug)]
472pub struct Latin1;
473
474unsafe impl Format for Latin1 {
475    #[inline(always)]
476    fn validate(_: &[u8]) -> bool {
477        true
478    }
479
480    #[inline(always)]
481    fn validate_prefix(_: &[u8]) -> bool {
482        true
483    }
484
485    #[inline(always)]
486    fn validate_suffix(_: &[u8]) -> bool {
487        true
488    }
489
490    #[inline(always)]
491    fn validate_subseq(_: &[u8]) -> bool {
492        true
493    }
494}
495
496unsafe impl<'a> CharFormat<'a> for Latin1 {
497    type Iter = imp::SingleByteCharIndices<'a>;
498
499    #[inline]
500    unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
501        imp::SingleByteCharIndices::new(buf)
502    }
503
504    #[inline]
505    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
506    where
507        F: FnOnce(&[u8]),
508    {
509        let n = ch as u32;
510        if n > 0xFF {
511            return Err(());
512        }
513        cont(&[n as u8]);
514        Ok(())
515    }
516}