script/dom/servoparser/
encoding.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5use std::borrow::Cow;
6use std::mem;
7use std::time::{Duration, Instant};
8
9use encoding_rs::{Encoding, UTF_8, UTF_16BE, UTF_16LE, WINDOWS_1252, X_USER_DEFINED};
10use tendril::fmt::UTF8;
11use tendril::stream::LossyDecoder;
12use tendril::{ByteTendril, StrTendril, TendrilSink};
13
14use crate::dom::document::Document;
15
16#[derive(JSTraceable, MallocSizeOf)]
17pub(super) struct DetectingState {
18    /// The `charset` that was specified in the `Content-Type` header, if any.
19    #[no_trace]
20    encoding_hint_from_content_type: Option<&'static Encoding>,
21    /// The encoding of a same-origin container document, if this document is in an
22    /// `<iframe>`.
23    #[no_trace]
24    encoding_of_container_document: Option<&'static Encoding>,
25    start_timestamp: Instant,
26    attempted_bom_sniffing: bool,
27    buffered_bytes: Vec<u8>,
28}
29
30#[derive(JSTraceable, MallocSizeOf)]
31pub(super) struct DecodingState {
32    /// The actual decoder.
33    ///
34    /// This field is `None` after we've finished parsing, because `LossyDecoder::finish`
35    /// takes ownership of the decoder.
36    #[ignore_malloc_size_of = "Defined in tendril"]
37    #[no_trace]
38    decoder: Option<LossyDecoder<NetworkSink>>,
39    #[no_trace]
40    pub(super) encoding: &'static Encoding,
41}
42
43#[derive(JSTraceable, MallocSizeOf)]
44pub(super) enum NetworkDecoderState {
45    /// In this stage the decoder is buffering bytes until it has enough to determine the encoding.
46    Detecting(DetectingState),
47    Decoding(DecodingState),
48}
49
50impl DetectingState {
51    /// The maximum amount of bytes to buffer before attempting to determine the encoding
52    const BUFFER_THRESHOLD: usize = 1024;
53
54    /// The time threshold after which we will attempt to determine the encoding and start decoding,
55    /// even if there are less than [BUFFER_THRESHOLD] bytes in the buffer.
56    const MAX_TIME_TO_BUFFER: Duration = Duration::from_secs(1);
57
58    /// Appends some data to the internal buffer and attempts to [determine the character encoding].
59    ///
60    /// If an encoding was detected then it is returned. A return value of `None` indicates that
61    /// more bytes are required.
62    ///
63    /// [determine the character encoding]: https://html.spec.whatwg.org/multipage/#determining-the-character-encoding
64    fn buffer(
65        &mut self,
66        data: &[u8],
67        document: &Document,
68        is_at_end_of_file: AtEndOfFile,
69    ) -> Option<&'static Encoding> {
70        self.buffered_bytes.extend_from_slice(data);
71        let can_wait_longer = self.start_timestamp.elapsed() < Self::MAX_TIME_TO_BUFFER;
72        self.determine_the_character_encoding(document, can_wait_longer, is_at_end_of_file)
73    }
74
75    /// <https://html.spec.whatwg.org/multipage/#determining-the-character-encoding>
76    fn determine_the_character_encoding(
77        &mut self,
78        document: &Document,
79        potentially_wait_for_more_data: bool,
80        is_at_end_of_file: AtEndOfFile,
81    ) -> Option<&'static Encoding> {
82        // Step 1. If the result of BOM sniffing is an encoding, return that encoding with confidence certain.
83        if !self.attempted_bom_sniffing && self.buffered_bytes.len() > 2 {
84            self.attempted_bom_sniffing = true;
85
86            if let Some((encoding, _)) = Encoding::for_bom(self.buffered_bytes.as_slice()) {
87                log::debug!(
88                    "Determined that the document is {} via BOM-sniffing",
89                    encoding.name()
90                );
91                return Some(encoding);
92            }
93        }
94
95        // Step 2. If the user has explicitly instructed the user agent to override the document's character
96        // encoding with a specific encoding, optionally return that encoding with the confidence certain.
97        // NOTE: Our users have no way to do that.
98
99        // Step 3. The user agent may wait for more bytes of the resource to be available, either in this
100        // step or at any later step in this algorithm.
101        if potentially_wait_for_more_data && self.buffered_bytes.len() < Self::BUFFER_THRESHOLD {
102            return None;
103        }
104
105        // TODO: Step 4. If the transport layer specifies a character encoding, and it is supported, return that
106        // encoding with the confidence certain.
107        if let Some(encoding_hint_from_content_type) = self.encoding_hint_from_content_type {
108            log::debug!(
109                "Inferred encoding to be {} from the Content-Type header",
110                encoding_hint_from_content_type.name()
111            );
112            return Some(encoding_hint_from_content_type);
113        }
114
115        // Step 5. Optionally, prescan the byte stream to determine its encoding, with the end condition
116        // being when the user agent decides that scanning further bytes would not be efficient.
117        // NOTE: According to the spec, we should always try to get an xml encoding right after failing
118        // to prescan the byte stream
119        let bytes_to_prescan =
120            &self.buffered_bytes[..Self::BUFFER_THRESHOLD.min(self.buffered_bytes.len())];
121        let sniffed_encoding = if document.is_html_document() {
122            prescan_the_byte_stream_to_determine_the_encoding(bytes_to_prescan)
123                .or_else(|| get_xml_encoding(bytes_to_prescan))
124        } else {
125            get_xml_encoding(bytes_to_prescan)
126        };
127        if let Some(encoding) = sniffed_encoding {
128            log::debug!(
129                "Prescanning the byte stream determined that the encoding is {}",
130                encoding.name()
131            );
132            return Some(encoding);
133        }
134
135        if document.is_html_document() {
136            // Step 6. If the HTML parser for which this algorithm is being run is associated with a Document d
137            // whose container document is non-null, then:
138            // Step 6.1 Let parentDocument be d's container document.
139            // Step 6.2 If parentDocument's origin is same origin with d's origin and parentDocument's character encoding
140            // is not UTF-16BE/LE, then return parentDocument's character encoding, with the confidence tentative.
141            // NOTE: This should not happen for XML documents
142            if let Some(encoding) = self.encoding_of_container_document {
143                if encoding != UTF_16LE && encoding != UTF_16BE {
144                    log::debug!(
145                        "Inferred encoding to be that of the container document, which is {}",
146                        encoding.name()
147                    );
148                    return Some(encoding);
149                }
150            }
151
152            // Step 7. Otherwise, if the user agent has information on the likely encoding for this page, e.g.
153            // based on the encoding of the page when it was last visited, then return that encoding,
154            // with the confidence tentative.
155            // NOTE: We have no such information.
156
157            // Step 8. The user agent may attempt to autodetect the character encoding from applying frequency analysis
158            // or other algorithms to the data stream.
159            let mut encoding_detector = chardetng::EncodingDetector::new();
160            encoding_detector.feed(&self.buffered_bytes, is_at_end_of_file == AtEndOfFile::Yes);
161            let url = document.url();
162            let tld = url
163                .as_url()
164                .domain()
165                .and_then(|domain| domain.rsplit('.').next())
166                .map(|tld| tld.as_bytes());
167            let (guessed_encoding, is_probably_right) = encoding_detector.guess_assess(tld, true);
168            if is_probably_right {
169                log::debug!(
170                    "chardetng determined that the document encoding is {}",
171                    guessed_encoding.name()
172                );
173                return Some(guessed_encoding);
174            }
175        }
176
177        // Step 9. Otherwise, return an implementation-defined or user-specified default character encoding,
178        // with the confidence tentative.
179        // TODO: The spec has a cool table here for determining an appropriate fallback encoding based on the
180        // user locale. Use it!
181        log::debug!("Failed to determine encoding of byte stream, falling back to UTF-8");
182        Some(UTF_8)
183    }
184
185    fn finish(&mut self, document: &Document) -> &'static Encoding {
186        self.determine_the_character_encoding(document, false, AtEndOfFile::Yes)
187            .expect("Should always return character encoding when we're not allowed to wait")
188    }
189}
190
191impl NetworkDecoderState {
192    pub(super) fn new(
193        encoding_hint_from_content_type: Option<&'static Encoding>,
194        encoding_of_container_document: Option<&'static Encoding>,
195    ) -> Self {
196        Self::Detecting(DetectingState {
197            encoding_hint_from_content_type,
198            encoding_of_container_document,
199            start_timestamp: Instant::now(),
200            attempted_bom_sniffing: false,
201            buffered_bytes: vec![],
202        })
203    }
204
205    /// Feeds the network decoder a chunk of bytes.
206    ///
207    /// If a new encoding is detected, then the encoding of `document` is updated appropriately.
208    ///
209    /// The decoded bytes are returned to the caller. Note that there is not necessarily a 1:1
210    /// relation between `chunk` and the return value. In the beginning, the decoder will buffer
211    /// bytes and return `None`, then later it will flush them and return a large `StrTendril` all
212    /// at once.
213    pub(super) fn push(&mut self, chunk: &[u8], document: &Document) -> Option<StrTendril> {
214        match self {
215            Self::Detecting(encoding_detector) => {
216                if let Some(encoding) = encoding_detector.buffer(chunk, document, AtEndOfFile::No) {
217                    document.set_encoding(encoding);
218                    let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes);
219                    *self = Self::Decoding(DecodingState {
220                        decoder: Some(LossyDecoder::new_from_encoding_rs_decoder(
221                            encoding.new_decoder_without_bom_handling(),
222                            NetworkSink::default(),
223                        )),
224                        encoding,
225                    });
226                    return self.push(&buffered_bytes, document);
227                }
228
229                None
230            },
231            Self::Decoding(network_decoder) => {
232                let decoder = network_decoder
233                    .decoder
234                    .as_mut()
235                    .expect("Can't push after call to finish()");
236                decoder.process(ByteTendril::from(chunk));
237                Some(std::mem::take(&mut decoder.inner_sink_mut().output))
238            },
239        }
240    }
241
242    pub(super) fn finish(&mut self, document: &Document) -> StrTendril {
243        match self {
244            Self::Detecting(encoding_detector) => {
245                let encoding = encoding_detector.finish(document);
246                document.set_encoding(encoding);
247                let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes);
248                let mut decoder = LossyDecoder::new_from_encoding_rs_decoder(
249                    encoding.new_decoder_without_bom_handling(),
250                    NetworkSink::default(),
251                );
252                decoder.process(ByteTendril::from(&*buffered_bytes));
253                *self = Self::Decoding(DecodingState {
254                    // Important to set `None` here to indicate that we're done decoding
255                    decoder: None,
256                    encoding,
257                });
258                let mut chunk = std::mem::take(&mut decoder.inner_sink_mut().output);
259                chunk.push_tendril(&decoder.finish());
260                chunk
261            },
262            Self::Decoding(network_decoder) => network_decoder
263                .decoder
264                .take()
265                .map(|decoder| decoder.finish())
266                .unwrap_or_default(),
267        }
268    }
269
270    pub(super) fn is_finished(&self) -> bool {
271        match self {
272            Self::Detecting(_) => false,
273            Self::Decoding(network_decoder) => network_decoder.decoder.is_none(),
274        }
275    }
276
277    pub(super) fn decoder(&mut self) -> &mut DecodingState {
278        match self {
279            Self::Detecting(_) => unreachable!("Cannot access decoder before decoding"),
280            Self::Decoding(decoder) => decoder,
281        }
282    }
283}
284
285/// An implementor of `TendrilSink` with the sole purpose of buffering decoded data
286/// so we can take it later.
287#[derive(Default, JSTraceable)]
288pub(crate) struct NetworkSink {
289    #[no_trace]
290    pub(crate) output: StrTendril,
291}
292
293impl TendrilSink<UTF8> for NetworkSink {
294    type Output = StrTendril;
295
296    fn process(&mut self, tendril: StrTendril) {
297        if self.output.is_empty() {
298            self.output = tendril;
299        } else {
300            self.output.push_tendril(&tendril);
301        }
302    }
303
304    fn error(&mut self, _desc: Cow<'static, str>) {}
305
306    fn finish(self) -> Self::Output {
307        self.output
308    }
309}
310
/// An attribute found while sniffing the byte stream: raw name and value bytes.
///
/// Both start out empty.
#[derive(Default)]
struct Attribute {
    /// The bytes of the attribute's name.
    name: Vec<u8>,
    /// The bytes of the attribute's value. Empty if the attribute has no value.
    value: Vec<u8>,
}
316
317/// <https://html.spec.whatwg.org/multipage/#prescan-a-byte-stream-to-determine-its-encoding>
318pub fn prescan_the_byte_stream_to_determine_the_encoding(
319    byte_stream: &[u8],
320) -> Option<&'static Encoding> {
321    // Step 1. Let position be a pointer to a byte in the input byte stream,
322    // initially pointing at the first byte.
323    let mut position = 0;
324
325    // Step 2. Prescan for UTF-16 XML declarations: If position points to:
326    match byte_stream {
327        // A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0
328        // (case-sensitive UTF-16 little-endian '<?x')
329        [0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0, ..] => {
330            // Return UTF-16LE.
331            return Some(UTF_16LE);
332        },
333
334        // A sequence of bytes starting with: 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78
335        // (case-sensitive UTF-16 big-endian '<?x')
336        [0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78, ..] => {
337            // Return UTF-16BE.
338            return Some(UTF_16BE);
339        },
340        _ => {},
341    }
342
343    loop {
344        // Step 3. Loop: If position points to:
345        let remaining_byte_stream = byte_stream.get(position..)?;
346
347        // A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
348        if remaining_byte_stream.starts_with(b"<!--") {
349            // Advance the position pointer so that it points at the first 0x3E byte which is preceded by two 0x2D bytes
350            // (i.e. at the end of an ASCII '-->' sequence) and comes after the 0x3C byte that was found.
351            // (The two 0x2D bytes can be the same as those in the '<!--' sequence.)
352            // NOTE: This is not very efficient, but likely not an issue...
353            position += remaining_byte_stream
354                .windows(3)
355                .position(|window| window == b"-->")?;
356        }
357        // A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61,
358        // and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII '<meta' followed by a space or slash)
359        else if remaining_byte_stream
360            .get(..b"<meta ".len())
361            .is_some_and(|candidate| {
362                candidate[..b"<meta".len()].eq_ignore_ascii_case(b"<meta") &&
363                    candidate.last().is_some_and(|byte| {
364                        matches!(byte, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F)
365                    })
366            })
367        {
368            // Step 1. Advance the position pointer so that it points at the next 0x09, 0x0A, 0x0C, 0x0D, 0x20,
369            // or 0x2F byte (the one in sequence of characters matched above).
370            position += b"<meta".len();
371
372            // Step 2. Let attribute list be an empty list of strings.
373            // NOTE: This is used to track which attributes we have already seen. As there are only
374            // three attributes that we care about, we instead use three booleans.
375            let mut have_seen_http_equiv_attribute = false;
376            let mut have_seen_content_attribute = false;
377            let mut have_seen_charset_attribute = false;
378
379            // Step 3. Let got pragma be false.
380            let mut got_pragma = false;
381
382            // Step 4. Let need pragma be null.
383            let mut need_pragma = None;
384
385            // Step 5. Let charset be the null value (which, for the purposes of this algorithm,
386            // is distinct from an unrecognized encoding or the empty string).
387            let mut charset = None;
388
389            // Step 6. Attributes: Get an attribute and its value. If no attribute was sniffed,
390            // then jump to the processing step below.
391            while let Some(attribute) = get_an_attribute(byte_stream, &mut position) {
392                // Step 7 If the attribute's name is already in attribute list,
393                // then return to the step labeled attributes.
394                // Step 8. Add the attribute's name to attribute list.
395                // NOTE: This happens in the match arms below
396                // Step 9. Run the appropriate step from the following list, if one applies:
397                match attribute.name.as_slice() {
398                    // If the attribute's name is "http-equiv"
399                    b"http-equiv" if !have_seen_http_equiv_attribute => {
400                        have_seen_http_equiv_attribute = true;
401
402                        // If the attribute's value is "content-type", then set got pragma to true.
403                        if attribute.value == b"content-type" {
404                            got_pragma = true;
405                        }
406                    },
407                    // If the attribute's name is "content"
408                    b"content" if !have_seen_content_attribute => {
409                        have_seen_content_attribute = true;
410
411                        // Apply the algorithm for extracting a character encoding from a meta element,
412                        // giving the attribute's value as the string to parse. If a character encoding
413                        // is returned, and if charset is still set to null, let charset be the encoding
414                        // returned, and set need pragma to true.
415                        if charset.is_none() {
416                            if let Some(extracted_charset) =
417                                extract_a_character_encoding_from_a_meta_element(&attribute.value)
418                            {
419                                need_pragma = Some(true);
420                                charset = Some(extracted_charset);
421                            }
422                        }
423                    },
424                    // If the attribute's name is "charset"
425                    b"charset" if !have_seen_charset_attribute => {
426                        have_seen_charset_attribute = true;
427
428                        // Let charset be the result of getting an encoding from the attribute's value,
429                        // and set need pragma to false.
430                        if let Some(extracted_charset) = Encoding::for_label(&attribute.value) {
431                            charset = Some(extracted_charset);
432                        }
433
434                        need_pragma = Some(false);
435                    },
436                    _ => {},
437                }
438
439                // Step 10. Return to the step labeled attributes.
440            }
441
442            // Step 11. Processing: If need pragma is null, then jump to the step below labeled next byte.
443            if let Some(need_pragma) = need_pragma {
444                // Step 12. If need pragma is true but got pragma is false,
445                // then jump to the step below labeled next byte.
446                if !need_pragma || got_pragma {
447                    // Step 13. If charset is UTF-16BE/LE, then set charset to UTF-8.
448                    if charset.is_some_and(|charset| charset == UTF_16BE || charset == UTF_16LE) {
449                        charset = Some(UTF_8);
450                    }
451                    // Step 14. If charset is x-user-defined, then set charset to windows-1252.
452                    else if charset.is_some_and(|charset| charset == X_USER_DEFINED) {
453                        charset = Some(WINDOWS_1252);
454                    }
455
456                    // Step 15. Return charset.
457                    return charset;
458                }
459            }
460        }
461        // A sequence of bytes starting with a 0x3C byte (<), optionally a 0x2F byte (/),
462        // and finally a byte in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z)
463        else if *remaining_byte_stream.first()? == b'<' &&
464            remaining_byte_stream
465                .get(1)
466                .filter(|byte| **byte != b'=')
467                .or(remaining_byte_stream.get(2))?
468                .is_ascii_alphabetic()
469        {
470            // Step 1. Advance the position pointer so that it points at the next 0x09 (HT),
471            // 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte.
472            position += remaining_byte_stream
473                .iter()
474                .position(|byte| byte.is_ascii_whitespace() || *byte == b'>')?;
475
476            // Step 2. Repeatedly get an attribute until no further attributes can be found,
477            // then jump to the step below labeled next byte.
478            while get_an_attribute(byte_stream, &mut position).is_some() {}
479        }
480        // A sequence of bytes starting with: 0x3C 0x21 (`<!`)
481        // A sequence of bytes starting with: 0x3C 0x2F (`</`)
482        // A sequence of bytes starting with: 0x3C 0x3F (`<?`)
483        else if remaining_byte_stream.starts_with(b"<!") ||
484            remaining_byte_stream.starts_with(b"</") ||
485            remaining_byte_stream.starts_with(b"<?")
486        {
487            // Advance the position pointer so that it points at the first 0x3E byte (>) that comes after the 0x3C byte that was found.
488            position += remaining_byte_stream
489                .iter()
490                .position(|byte| *byte == b'>')?;
491        }
492        // Any other byte
493        else {
494            // Do nothing with that byte.
495        }
496
497        // Next byte: Move position so it points at the next byte in the input byte stream,
498        // and return to the step above labeled loop.
499        position += 1;
500    }
501}
502
503/// <https://html.spec.whatwg.org/multipage/#concept-get-attributes-when-sniffing>
504fn get_an_attribute(input: &[u8], position: &mut usize) -> Option<Attribute> {
505    // NOTE: If we reach the end of the input during parsing then we return "None"
506    // (because there obviously is no attribute). The caller will then also run
507    // out of bytes and invoke "get an xml encoding" as mandated by the spec.
508
509    // Step 1. If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),
510    // 0x20 (SP), or 0x2F (/), then advance position to the next byte and redo this step.
511    *position += &input[*position..]
512        .iter()
513        .position(|b| !matches!(b, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F))?;
514
515    // Step 2. If the byte at position is 0x3E (>), then abort the get an attribute algorithm.
516    // There isn't one.
517    if input[*position] == 0x3E {
518        return None;
519    }
520
521    // Step 3. Otherwise, the byte at position is the start of the attribute name.
522    // Let attribute name and attribute value be the empty string.
523    let mut attribute = Attribute::default();
524    let mut have_spaces = false;
525    loop {
526        // Step 4. Process the byte at position as follows:
527        match *input.get(*position)? {
528            // If it is 0x3D (=), and the attribute name is longer than the empty string
529            b'=' if !attribute.name.is_empty() => {
530                // Advance position to the next byte and jump to the step below labeled value.
531                *position += 1;
532                break;
533            },
534
535            // If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
536            0x09 | 0x0A | 0x0C | 0x0D | 0x20 => {
537                // Jump to the step below labeled spaces.
538                have_spaces = true;
539                break;
540            },
541
542            // If it is 0x2F (/) or 0x3E (>)
543            b'/' | b'>' => {
544                // Abort the get an attribute algorithm.
545                // The attribute's name is the value of attribute name, its value is the empty string.
546                return Some(attribute);
547            },
548
549            // If it is in the range 0x41 (A) to 0x5A (Z)
550            byte @ (b'A'..=b'Z') => {
551                // Append the code point b+0x20 to attribute name (where b is the value of the byte at position).
552                // (This converts the input to lowercase.)
553                attribute.name.push(byte + 0x20);
554            },
555
556            // Anything else
557            byte => {
558                // Append the code point with the same value as the byte at position to attribute name.
559                // (It doesn't actually matter how bytes outside the ASCII range are handled here, since only
560                // ASCII bytes can contribute to the detection of a character encoding.)
561                attribute.name.push(byte);
562            },
563        }
564
565        // Step 5. Advance position to the next byte and return to the previous step.
566        *position += 1;
567    }
568
569    if have_spaces {
570        // Step 6. Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),
571        // or 0x20 (SP), then advance position to the next byte, then, repeat this step.
572        *position += &input[*position..]
573            .iter()
574            .position(|b| !b.is_ascii_whitespace())?;
575
576        // Step 7. If the byte at position is not 0x3D (=), abort the get an attribute algorithm.
577        // The attribute's name is the value of attribute name, its value is the empty string.
578        if input[*position] != b'=' {
579            return Some(attribute);
580        }
581
582        // Step 8. Advance position past the 0x3D (=) byte.
583        *position += 1;
584    }
585
586    // Step 9. Value: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP),
587    // then advance position to the next byte, then, repeat this step.
588    *position += &input[*position..]
589        .iter()
590        .position(|b| !b.is_ascii_whitespace())?;
591
592    // Step 10. Process the byte at position as follows:
593    match input[*position] {
594        // If it is 0x22 (") or 0x27 (')
595        b @ (b'"' | b'\'') => {
596            // Step 1. Let b be the value of the byte at position.
597            // NOTE: We already have b.
598            loop {
599                // Step 2. Quote loop: Advance position to the next byte.
600                *position += 1;
601
602                // Step 3. If the value of the byte at position is the value of b, then advance position to the next byte
603                // and abort the "get an attribute" algorithm. The attribute's name is the value of attribute name, and
604                // its value is the value of attribute value.
605                let byte_at_position = *input.get(*position)?;
606                if byte_at_position == b {
607                    *position += 1;
608                    return Some(attribute);
609                }
610                // Step 4. Otherwise, if the value of the byte at position is in the range 0x41 (A) to 0x5A (Z),
611                // then append a code point to attribute value whose value is 0x20 more than the value of the byte
612                // at position.
613                else if byte_at_position.is_ascii_uppercase() {
614                    attribute.value.push(byte_at_position + 0x20);
615                }
616                // Step 5. Otherwise, append a code point to attribute value whose value is the same
617                // as the value of the byte at position.
618                else {
619                    attribute.value.push(byte_at_position);
620                }
621
622                // Step 6. Return to the step above labeled quote loop.
623            }
624        },
625
626        // If it is 0x3E (>)
627        b'>' => {
628            // Abort the get an attribute algorithm. The attribute's name is the value of attribute name,
629            // its value is the empty string.
630            return Some(attribute);
631        },
632
633        // If it is in the range 0x41 (A) to 0x5A (Z)
634        b @ (b'A'..=b'Z') => {
635            // Append a code point b+0x20 to attribute value (where b is the value of the byte at position).
636            // Advance position to the next byte.
637            attribute.value.push(b + 0x20);
638            *position += 1;
639        },
640
641        // Anything else
642        b => {
643            // Append a code point with the same value as the byte at position to attribute value.
644            // Advance position to the next byte.
645            attribute.value.push(b);
646            *position += 1
647        },
648    }
649
650    loop {
651        // Step 11. Process the byte at position as follows:
652        match *input.get(*position)? {
653            // If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
654            0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E => {
655                // Abort the get an attribute algorithm. The attribute's name is the value of attribute name and
656                // its value is the value of attribute value.
657                return Some(attribute);
658            },
659
660            // If it is in the range 0x41 (A) to 0x5A (Z)
661            byte if byte.is_ascii_uppercase() => {
662                // Append a code point b+0x20 to attribute value (where b is the value of the byte at position).
663                attribute.value.push(byte + 0x20);
664            },
665
666            // Anything else
667            byte => {
668                // Append a code point with the same value as the byte at position to attribute value.
669                attribute.value.push(byte);
670            },
671        }
672
673        // Step 12. Advance position to the next byte and return to the previous step.
674        *position += 1;
675    }
676}
677
678/// <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
679fn extract_a_character_encoding_from_a_meta_element(input: &[u8]) -> Option<&'static Encoding> {
680    // Step 1. Let position be a pointer into s, initially pointing at the start of the string.
681    let mut position = 0;
682
683    loop {
684        // Step 2. Loop: Find the first seven characters in s after position that are an ASCII case-insensitive
685        // match for the word "charset". If no such match is found, return nothing.
686        // NOTE: In our case, the attribute value always comes from "get_an_attribute" and is already lowercased.
687        position += input[position..]
688            .windows(7)
689            .position(|window| window == b"charset")? +
690            b"charset".len();
691
692        // Step 3. Skip any ASCII whitespace that immediately follow the word "charset" (there might not be any).
693        position += &input[position..]
694            .iter()
695            .position(|byte| !byte.is_ascii_whitespace())?;
696
697        // Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point just before
698        // that next character, and jump back to the step labeled loop.
699        // NOTE: This is phrased very oddly, because position is already pointing to that character.
700        if *input.get(position)? == b'=' {
701            position += 1;
702            break;
703        }
704    }
705
706    // Step 5. Skip any ASCII whitespace that immediately follow the equals sign (there might not be any).
707    position += &input[position..]
708        .iter()
709        .position(|byte| !byte.is_ascii_whitespace())?;
710
711    // Step 6. Process the next character as follows:
712    let next_character = input.get(position)?;
713
714    // If it is a U+0022 QUOTATION MARK character (") and there is a later U+0022 QUOTATION MARK character (") in s
715    // If it is a U+0027 APOSTROPHE character (') and there is a later U+0027 APOSTROPHE character (') in s
716    if matches!(*next_character, b'"' | b'\'') {
717        // Return the result of getting an encoding from the substring that is between
718        // this character and the next earliest occurrence of this character.
719        let remaining = input.get(position + 1..)?;
720        let end = remaining.iter().position(|byte| byte == next_character)?;
721        Encoding::for_label(&remaining[..end])
722    }
723    // If it is an unmatched U+0022 QUOTATION MARK character (")
724    // If it is an unmatched U+0027 APOSTROPHE character (')
725    // If there is no next character
726    // NOTE: All of these cases are already covered above
727
728    // Otherwise
729    else {
730        // Return the result of getting an encoding from the substring that consists of this character up
731        // to but not including the first ASCII whitespace or U+003B SEMICOLON character (;), or the end of s,
732        // whichever comes first.
733        let remaining = input.get(position..)?;
734        let end = remaining
735            .iter()
736            .position(|byte| byte.is_ascii_whitespace() || *byte == b';')
737            .unwrap_or(remaining.len());
738
739        Encoding::for_label(&remaining[..end])
740    }
741}
742
743/// <https://html.spec.whatwg.org/multipage/#concept-get-xml-encoding-when-sniffing>
744pub fn get_xml_encoding(input: &[u8]) -> Option<&'static Encoding> {
745    // Step 1. Let encodingPosition be a pointer to the start of the stream.
746    // NOTE: We don't need this variable yet.
747    // Step 2. If encodingPosition does not point to the start of a byte sequence 0x3C, 0x3F, 0x78,
748    // 0x6D, 0x6C (`<?xml`), then return failure.
749    if !input.starts_with(b"<?xml") {
750        return None;
751    }
752
753    // Step 3. Let xmlDeclarationEnd be a pointer to the next byte in the input byte stream which is 0x3E (>).
754    // If there is no such byte, then return failure.
755    // NOTE: The spec does not use this variable but the intention is clear.
756    let xml_declaration_end = input.iter().position(|byte| *byte == b'>')?;
757    let input = &input[..xml_declaration_end];
758
759    // Step 4. Set encodingPosition to the position of the first occurrence of the subsequence of bytes 0x65, 0x6E,
760    // 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67 (`encoding`) at or after the current encodingPosition. If there is no
761    // such sequence, then return failure.
762    let mut encoding_position = input
763        .windows(b"encoding".len())
764        .position(|window| window == b"encoding")?;
765
766    // Step 5. Advance encodingPosition past the 0x67 (g) byte.
767    encoding_position += b"encoding".len();
768
769    // Step 6. While the byte at encodingPosition is less than or equal to 0x20 (i.e., it is either an
770    // ASCII space or control character), advance encodingPosition to the next byte.
771    while *input.get(encoding_position)? <= 0x20 {
772        encoding_position += 1;
773    }
774
775    // Step 7. If the byte at encodingPosition is not 0x3D (=), then return failure.
776    if *input.get(encoding_position)? != b'=' {
777        return None;
778    }
779
780    // Step 8. Advance encodingPosition to the next byte.
781    encoding_position += 1;
782
783    // Step 9. While the byte at encodingPosition is less than or equal to 0x20 (i.e., it is either an
784    // ASCII space or control character), advance encodingPosition to the next byte.
785    while *input.get(encoding_position)? <= 0x20 {
786        encoding_position += 1;
787    }
788
789    // Step 10. Let quoteMark be the byte at encodingPosition.
790    let quote_mark = *input.get(encoding_position)?;
791
792    // Step 11. If quoteMark is not either 0x22 (") or 0x27 ('), then return failure.
793    if !matches!(quote_mark, b'"' | b'\'') {
794        return None;
795    }
796
797    // Step 12. Advance encodingPosition to the next byte.
798    encoding_position += 1;
799
800    // Step 13. Let encodingEndPosition be the position of the next occurrence of quoteMark at or after
801    // encodingPosition. If quoteMark does not occur again, then return failure.
802    let encoding_end_position = input[encoding_position..]
803        .iter()
804        .position(|byte| *byte == quote_mark)?;
805
806    // Step 14. Let potentialEncoding be the sequence of the bytes between encodingPosition
807    // (inclusive) and encodingEndPosition (exclusive).
808    let potential_encoding = &input[encoding_position..][..encoding_end_position];
809
810    // Step 15. If potentialEncoding contains one or more bytes whose byte value is 0x20 or below,
811    // then return failure.
812    if potential_encoding.iter().any(|byte| *byte <= 0x20) {
813        return None;
814    }
815
816    // Step 16. Let encoding be the result of getting an encoding given potentialEncoding isomorphic decoded.
817    let encoding = Encoding::for_label(potential_encoding)?;
818
819    // Step 17. If the encoding is UTF-16BE/LE, then change it to UTF-8.
820    // Step 18. Return encoding.
821    if encoding == UTF_16BE || encoding == UTF_16LE {
822        Some(UTF_8)
823    } else {
824        Some(encoding)
825    }
826}
827
/// Two-state flag used instead of a bare `bool`.
///
/// NOTE(review): judging by the name, this presumably tells a consumer whether the bytes it
/// was handed are the end of the input; the users of this type are not visible here, so
/// confirm against the call sites.
#[derive(PartialEq)]
enum AtEndOfFile {
    /// Presumably: the end of the input has been reached (TODO confirm).
    Yes,
    /// Presumably: more input may still arrive (TODO confirm).
    No,
}