data_url/
lib.rs

1//! Processing of `data:` URLs according to the Fetch Standard:
2//! <https://fetch.spec.whatwg.org/#data-urls>
3//! but starting from a string rather than a parsed URL to avoid extra copies.
4//!
5//! ```rust
6//! use data_url::{DataUrl, mime};
7//!
8//! let url = DataUrl::process("data:,Hello%20World!").unwrap();
9//! let (body, fragment) = url.decode_to_vec().unwrap();
10//!
11//! assert!(url.mime_type().matches("text", "plain"));
12//! assert_eq!(url.mime_type().get_parameter("charset"), Some("US-ASCII"));
13//! assert_eq!(body, b"Hello World!");
14//! assert!(fragment.is_none());
15//! ```
16#![no_std]
17
18// For forwards compatibility
19#[cfg(feature = "std")]
20extern crate std;
21
22#[macro_use]
23extern crate alloc;
24
25#[cfg(not(feature = "alloc"))]
26compile_error!("the `alloc` feature must be enabled");
27
28use alloc::{string::String, vec::Vec};
29use core::fmt;
30
/// Guard for functions returning `Option`: evaluates `$predicate` and makes the
/// enclosing function return `None` when it is false.
macro_rules! require {
    ($predicate:expr) => {
        if !$predicate {
            return None;
        }
    };
}
38
39pub mod forgiving_base64;
40pub mod mime;
41
42pub struct DataUrl<'a> {
43    mime_type: mime::Mime,
44    base64: bool,
45    encoded_body_plus_fragment: &'a str,
46}
47
/// Reasons why an input string is rejected as a `data:` URL.
///
/// The values carry no payload, so they can be cloned and compared freely,
/// which lets callers write `assert_eq!(err, DataUrlError::NoComma)`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataUrlError {
    /// The input does not start with the `data:` scheme.
    NotADataUrl,
    /// No comma separating the header from the body was found before any `#`.
    NoComma,
}
53
54impl fmt::Display for DataUrlError {
55    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
56        match self {
57            Self::NotADataUrl => write!(f, "not a valid data url"),
58            Self::NoComma => write!(
59                f,
60                "data url is missing comma delimiting attributes and body"
61            ),
62        }
63    }
64}
65
// The crate is `no_std`; `std` is only linked when the `std` feature is enabled,
// so the `std::error::Error` impl has to be gated on that same feature.
#[cfg(feature = "std")]
impl std::error::Error for DataUrlError {}
68
69impl<'a> DataUrl<'a> {
70    /// <https://fetch.spec.whatwg.org/#data-url-processor>
71    /// but starting from a string rather than a parsed `Url`, to avoid extra string copies.
72    pub fn process(input: &'a str) -> Result<Self, DataUrlError> {
73        use crate::DataUrlError::*;
74
75        let after_colon = pretend_parse_data_url(input).ok_or(NotADataUrl)?;
76
77        let (from_colon_to_comma, encoded_body_plus_fragment) =
78            find_comma_before_fragment(after_colon).ok_or(NoComma)?;
79
80        let (mime_type, base64) = parse_header(from_colon_to_comma);
81
82        Ok(DataUrl {
83            mime_type,
84            base64,
85            encoded_body_plus_fragment,
86        })
87    }
88
89    pub fn mime_type(&self) -> &mime::Mime {
90        &self.mime_type
91    }
92
93    /// Streaming-decode the data URL’s body to `write_body_bytes`,
94    /// and return the URL’s fragment identifier if it has one.
95    pub fn decode<F, E>(
96        &self,
97        write_body_bytes: F,
98    ) -> Result<Option<FragmentIdentifier<'a>>, forgiving_base64::DecodeError<E>>
99    where
100        F: FnMut(&[u8]) -> Result<(), E>,
101    {
102        if self.base64 {
103            decode_with_base64(self.encoded_body_plus_fragment, write_body_bytes)
104        } else {
105            decode_without_base64(self.encoded_body_plus_fragment, write_body_bytes)
106                .map_err(forgiving_base64::DecodeError::WriteError)
107        }
108    }
109
110    /// Return the decoded body, and the URL’s fragment identifier if it has one.
111    pub fn decode_to_vec(
112        &self,
113    ) -> Result<(Vec<u8>, Option<FragmentIdentifier<'a>>), forgiving_base64::InvalidBase64> {
114        let mut body = Vec::new();
115        let fragment = self.decode(|bytes| {
116            body.extend_from_slice(bytes);
117            Ok(())
118        })?;
119        Ok((body, fragment))
120    }
121}
122
123/// The URL’s fragment identifier (after `#`)
124pub struct FragmentIdentifier<'a>(&'a str);
125
126impl FragmentIdentifier<'_> {
127    /// Like in a parsed URL
128    pub fn to_percent_encoded(&self) -> String {
129        let mut string = String::new();
130        for byte in self.0.bytes() {
131            match byte {
132                // Ignore ASCII tabs or newlines like the URL parser would
133                b'\t' | b'\n' | b'\r' => continue,
134                // https://url.spec.whatwg.org/#fragment-percent-encode-set
135                b'\0'..=b' ' | b'"' | b'<' | b'>' | b'`' | b'\x7F'..=b'\xFF' => {
136                    percent_encode(byte, &mut string)
137                }
138                // Printable ASCII
139                _ => string.push(byte as char),
140            }
141        }
142        string
143    }
144}
145
/// Similar to <https://url.spec.whatwg.org/#concept-basic-url-parser>
/// followed by <https://url.spec.whatwg.org/#concept-url-serializer>
///
/// Returns `None` when the input is not a data URL. Otherwise returns
/// `Some(s)`, which is sort of the result of serialization, except:
///
/// - the `data:` prefix is removed
/// - the fragment is included
/// - other components are **not** UTF-8 percent-encoded
/// - ASCII tabs and newlines in the middle are **not** removed
fn pretend_parse_data_url(input: &str) -> Option<&str> {
    // "C0 control or space" is every char up to and including U+0020.
    let is_c0_control_or_space = |ch: char| ch <= ' ';

    let candidate = input.trim_start_matches(is_c0_control_or_space);

    let mut remaining = candidate.bytes();
    {
        // Ignore ASCII tabs or newlines like the URL parser would, even when
        // they are interleaved with the scheme's letters.
        let mut significant = remaining
            .by_ref()
            .filter(|&byte| !matches!(byte, b'\t' | b'\n' | b'\r'));

        // The scheme is matched ASCII case-insensitively...
        for expected in b"data" {
            if !significant.next()?.eq_ignore_ascii_case(expected) {
                return None;
            }
        }
        // ...and must be followed by a literal colon.
        if significant.next()? != b':' {
            return None;
        }
    }

    // Whatever the iterator has not yielded yet is the text after the colon.
    let scheme_len = candidate.len() - remaining.len();
    let after_colon = &candidate[scheme_len..];

    Some(after_colon.trim_end_matches(is_c0_control_or_space))
}
179
/// Split `after_colon` at its first comma into `(header, body_plus_fragment)`.
///
/// Returns `None` when there is no comma, or when a `#` (start of the URL's
/// fragment) comes before the first comma.
fn find_comma_before_fragment(after_colon: &str) -> Option<(&str, &str)> {
    // Whichever of ',' and '#' comes first decides the outcome.
    let delimiter = after_colon.find(|ch: char| ch == ',' || ch == '#')?;
    let (header, delimiter_and_rest) = after_colon.split_at(delimiter);
    // `strip_prefix` yields `None` when the delimiter turned out to be '#'.
    delimiter_and_rest
        .strip_prefix(',')
        .map(|body_plus_fragment| (header, body_plus_fragment))
}
191
192fn parse_header(from_colon_to_comma: &str) -> (mime::Mime, bool) {
193    // "Strip leading and trailing ASCII whitespace"
194    //     \t, \n, and \r would have been filtered by the URL parser
195    //     \f percent-encoded by the URL parser
196    //     space is the only remaining ASCII whitespace
197    let trimmed = from_colon_to_comma.trim_matches(|c| matches!(c, ' ' | '\t' | '\n' | '\r'));
198
199    let without_base64_suffix = remove_base64_suffix(trimmed);
200    let base64 = without_base64_suffix.is_some();
201    let mime_type = without_base64_suffix.unwrap_or(trimmed);
202
203    let mut string = String::new();
204    if mime_type.starts_with(';') {
205        string.push_str("text/plain")
206    }
207    let mut in_query = false;
208    for byte in mime_type.bytes() {
209        match byte {
210            // Ignore ASCII tabs or newlines like the URL parser would
211            b'\t' | b'\n' | b'\r' => continue,
212
213            // https://url.spec.whatwg.org/#c0-control-percent-encode-set
214            b'\0'..=b'\x1F' | b'\x7F'..=b'\xFF' => percent_encode(byte, &mut string),
215
216            // Bytes other than the C0 percent-encode set that are percent-encoded
217            // by the URL parser in the query state.
218            // '#' is also in that list but cannot occur here
219            // since it indicates the start of the URL’s fragment.
220            b' ' | b'"' | b'<' | b'>' if in_query => percent_encode(byte, &mut string),
221
222            b'?' => {
223                in_query = true;
224                string.push('?')
225            }
226
227            // Printable ASCII
228            _ => string.push(byte as char),
229        }
230    }
231
232    // FIXME: does Mime::from_str match the MIME Sniffing Standard’s parsing algorithm?
233    // <https://mimesniff.spec.whatwg.org/#parse-a-mime-type>
234    let mime_type = string.parse().unwrap_or_else(|_| mime::Mime {
235        type_: String::from("text"),
236        subtype: String::from("plain"),
237        parameters: vec![(String::from("charset"), String::from("US-ASCII"))],
238    });
239
240    (mime_type, base64)
241}
242
243/// None: no base64 suffix
244#[allow(clippy::skip_while_next)]
245fn remove_base64_suffix(s: &str) -> Option<&str> {
246    let mut bytes = s.bytes();
247    {
248        // Ignore ASCII tabs or newlines like the URL parser would
249        let iter = bytes
250            .by_ref()
251            .filter(|&byte| !matches!(byte, b'\t' | b'\n' | b'\r'));
252
253        // Search from the end
254        let mut iter = iter.rev();
255
256        require!(iter.next()? == b'4');
257        require!(iter.next()? == b'6');
258        require!(iter.next()?.eq_ignore_ascii_case(&b'e'));
259        require!(iter.next()?.eq_ignore_ascii_case(&b's'));
260        require!(iter.next()?.eq_ignore_ascii_case(&b'a'));
261        require!(iter.next()?.eq_ignore_ascii_case(&b'b'));
262        require!(iter.skip_while(|&byte| byte == b' ').next()? == b';');
263    }
264    Some(&s[..bytes.len()])
265}
266
/// Append the percent-encoded form of `byte` to `string`:
/// a `%` followed by two upper-case hexadecimal digits.
fn percent_encode(byte: u8, string: &mut String) {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    string.push('%');
    // High nibble first, then low nibble.
    for &nibble in &[byte >> 4, byte & 0x0F] {
        string.push(char::from(HEX_DIGITS[usize::from(nibble)]));
    }
}
273
274/// This is <https://url.spec.whatwg.org/#string-percent-decode> while also:
275///
276/// * Ignoring ASCII tab or newlines
277/// * Stopping at the first '#' (which indicates the start of the fragment)
278///
279/// Anything that would have been UTF-8 percent-encoded by the URL parser
280/// would be percent-decoded here.
281/// We skip that round-trip and pass it through unchanged.
282fn decode_without_base64<F, E>(
283    encoded_body_plus_fragment: &str,
284    mut write_bytes: F,
285) -> Result<Option<FragmentIdentifier<'_>>, E>
286where
287    F: FnMut(&[u8]) -> Result<(), E>,
288{
289    let bytes = encoded_body_plus_fragment.as_bytes();
290    let mut slice_start = 0;
291    for (i, &byte) in bytes.iter().enumerate() {
292        // We only need to look for 5 different "special" byte values.
293        // For everything else we make slices as large as possible, borrowing the input,
294        // in order to make fewer write_all() calls.
295        if matches!(byte, b'%' | b'#' | b'\t' | b'\n' | b'\r') {
296            // Write everything (if anything) "non-special" we’ve accumulated
297            // before this special byte
298            if i > slice_start {
299                write_bytes(&bytes[slice_start..i])?;
300                slice_start = i;
301            }
302            // Then deal with the special byte.
303            match byte {
304                b'%' => {
305                    let l = bytes.get(i + 2).and_then(|&b| (b as char).to_digit(16));
306                    let h = bytes.get(i + 1).and_then(|&b| (b as char).to_digit(16));
307                    if let (Some(h), Some(l)) = (h, l) {
308                        // '%' followed by two ASCII hex digits
309                        let one_byte = h as u8 * 0x10 + l as u8;
310                        write_bytes(&[one_byte])?;
311                        slice_start = i + 3;
312                    } else {
313                        // Do nothing. Leave slice_start unchanged.
314                        // The % sign will be part of the next slice.
315                    }
316                }
317
318                b'#' => {
319                    let fragment_start = i + 1;
320                    let fragment = &encoded_body_plus_fragment[fragment_start..];
321                    return Ok(Some(FragmentIdentifier(fragment)));
322                }
323
324                // Ignore over '\t' | '\n' | '\r'
325                _ => slice_start = i + 1,
326            }
327        }
328    }
329    write_bytes(&bytes[slice_start..])?;
330    Ok(None)
331}
332
333/// `decode_without_base64()` composed with
334/// <https://infra.spec.whatwg.org/#isomorphic-decode> composed with
335/// <https://infra.spec.whatwg.org/#forgiving-base64-decode>.
336fn decode_with_base64<F, E>(
337    encoded_body_plus_fragment: &str,
338    write_bytes: F,
339) -> Result<Option<FragmentIdentifier<'_>>, forgiving_base64::DecodeError<E>>
340where
341    F: FnMut(&[u8]) -> Result<(), E>,
342{
343    let mut decoder = forgiving_base64::Decoder::new(write_bytes);
344    let fragment = decode_without_base64(encoded_body_plus_fragment, |bytes| decoder.feed(bytes))?;
345    decoder.finish()?;
346    Ok(fragment)
347}