script/dom/servoparser/encoding.rs
1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5use std::borrow::Cow;
6use std::mem;
7use std::time::{Duration, Instant};
8
9use encoding_rs::{Encoding, UTF_8, UTF_16BE, UTF_16LE, WINDOWS_1252, X_USER_DEFINED};
10use tendril::fmt::UTF8;
11use tendril::stream::LossyDecoder;
12use tendril::{ByteTendril, StrTendril, TendrilSink};
13
14use crate::dom::document::Document;
15
16#[derive(JSTraceable, MallocSizeOf)]
17pub(super) struct DetectingState {
18 /// The `charset` that was specified in the `Content-Type` header, if any.
19 #[no_trace]
20 encoding_hint_from_content_type: Option<&'static Encoding>,
21 /// The encoding of a same-origin container document, if this document is in an
22 /// `<iframe>`.
23 #[no_trace]
24 encoding_of_container_document: Option<&'static Encoding>,
25 start_timestamp: Instant,
26 attempted_bom_sniffing: bool,
27 buffered_bytes: Vec<u8>,
28}
29
30#[derive(JSTraceable, MallocSizeOf)]
31pub(super) struct DecodingState {
32 /// The actual decoder.
33 ///
34 /// This field is `None` after we've finished parsing, because `LossyDecoder::finish`
35 /// takes ownership of the decoder.
36 #[ignore_malloc_size_of = "Defined in tendril"]
37 #[no_trace]
38 decoder: Option<LossyDecoder<NetworkSink>>,
39 #[no_trace]
40 pub(super) encoding: &'static Encoding,
41}
42
43#[derive(JSTraceable, MallocSizeOf)]
44pub(super) enum NetworkDecoderState {
45 /// In this stage the decoder is buffering bytes until it has enough to determine the encoding.
46 Detecting(DetectingState),
47 Decoding(DecodingState),
48}
49
50impl DetectingState {
51 /// The maximum amount of bytes to buffer before attempting to determine the encoding
52 const BUFFER_THRESHOLD: usize = 1024;
53
54 /// The time threshold after which we will attempt to determine the encoding and start decoding,
55 /// even if there are less than [BUFFER_THRESHOLD] bytes in the buffer.
56 const MAX_TIME_TO_BUFFER: Duration = Duration::from_secs(1);
57
58 /// Appends some data to the internal buffer and attempts to [determine the character encoding].
59 ///
60 /// If an encoding was detected then it is returned. A return value of `None` indicates that
61 /// more bytes are required.
62 ///
63 /// [determine the character encoding]: https://html.spec.whatwg.org/multipage/#determining-the-character-encoding
64 fn buffer(
65 &mut self,
66 data: &[u8],
67 document: &Document,
68 is_at_end_of_file: AtEndOfFile,
69 ) -> Option<&'static Encoding> {
70 self.buffered_bytes.extend_from_slice(data);
71 let can_wait_longer = self.start_timestamp.elapsed() < Self::MAX_TIME_TO_BUFFER;
72 self.determine_the_character_encoding(document, can_wait_longer, is_at_end_of_file)
73 }
74
75 /// <https://html.spec.whatwg.org/multipage/#determining-the-character-encoding>
76 fn determine_the_character_encoding(
77 &mut self,
78 document: &Document,
79 potentially_wait_for_more_data: bool,
80 is_at_end_of_file: AtEndOfFile,
81 ) -> Option<&'static Encoding> {
82 // Step 1. If the result of BOM sniffing is an encoding, return that encoding with confidence certain.
83 if !self.attempted_bom_sniffing && self.buffered_bytes.len() > 2 {
84 self.attempted_bom_sniffing = true;
85
86 if let Some((encoding, _)) = Encoding::for_bom(self.buffered_bytes.as_slice()) {
87 log::debug!(
88 "Determined that the document is {} via BOM-sniffing",
89 encoding.name()
90 );
91 return Some(encoding);
92 }
93 }
94
95 // Step 2. If the user has explicitly instructed the user agent to override the document's character
96 // encoding with a specific encoding, optionally return that encoding with the confidence certain.
97 // NOTE: Our users have no way to do that.
98
99 // Step 3. The user agent may wait for more bytes of the resource to be available, either in this
100 // step or at any later step in this algorithm.
101 if potentially_wait_for_more_data && self.buffered_bytes.len() < Self::BUFFER_THRESHOLD {
102 return None;
103 }
104
105 // TODO: Step 4. If the transport layer specifies a character encoding, and it is supported, return that
106 // encoding with the confidence certain.
107 if let Some(encoding_hint_from_content_type) = self.encoding_hint_from_content_type {
108 log::debug!(
109 "Inferred encoding to be {} from the Content-Type header",
110 encoding_hint_from_content_type.name()
111 );
112 return Some(encoding_hint_from_content_type);
113 }
114
115 // Step 5. Optionally, prescan the byte stream to determine its encoding, with the end condition
116 // being when the user agent decides that scanning further bytes would not be efficient.
117 // NOTE: According to the spec, we should always try to get an xml encoding right after failing
118 // to prescan the byte stream
119 let bytes_to_prescan =
120 &self.buffered_bytes[..Self::BUFFER_THRESHOLD.min(self.buffered_bytes.len())];
121 let sniffed_encoding = if document.is_html_document() {
122 prescan_the_byte_stream_to_determine_the_encoding(bytes_to_prescan)
123 .or_else(|| get_xml_encoding(bytes_to_prescan))
124 } else {
125 get_xml_encoding(bytes_to_prescan)
126 };
127 if let Some(encoding) = sniffed_encoding {
128 log::debug!(
129 "Prescanning the byte stream determined that the encoding is {}",
130 encoding.name()
131 );
132 return Some(encoding);
133 }
134
135 if document.is_html_document() {
136 // Step 6. If the HTML parser for which this algorithm is being run is associated with a Document d
137 // whose container document is non-null, then:
138 // Step 6.1 Let parentDocument be d's container document.
139 // Step 6.2 If parentDocument's origin is same origin with d's origin and parentDocument's character encoding
140 // is not UTF-16BE/LE, then return parentDocument's character encoding, with the confidence tentative.
141 // NOTE: This should not happen for XML documents
142 if let Some(encoding) = self.encoding_of_container_document &&
143 encoding != UTF_16LE &&
144 encoding != UTF_16BE
145 {
146 log::debug!(
147 "Inferred encoding to be that of the container document, which is {}",
148 encoding.name()
149 );
150 return Some(encoding);
151 }
152
153 // Step 7. Otherwise, if the user agent has information on the likely encoding for this page, e.g.
154 // based on the encoding of the page when it was last visited, then return that encoding,
155 // with the confidence tentative.
156 // NOTE: We have no such information.
157
158 // Step 8. The user agent may attempt to autodetect the character encoding from applying frequency analysis
159 // or other algorithms to the data stream.
160 let mut encoding_detector = chardetng::EncodingDetector::new();
161 encoding_detector.feed(&self.buffered_bytes, is_at_end_of_file == AtEndOfFile::Yes);
162 let url = document.url();
163 let tld = url
164 .as_url()
165 .domain()
166 .and_then(|domain| domain.rsplit('.').next())
167 .map(|tld| tld.as_bytes());
168 let (guessed_encoding, is_probably_right) = encoding_detector.guess_assess(tld, true);
169 if is_probably_right {
170 log::debug!(
171 "chardetng determined that the document encoding is {}",
172 guessed_encoding.name()
173 );
174 return Some(guessed_encoding);
175 }
176 }
177
178 // Step 9. Otherwise, return an implementation-defined or user-specified default character encoding,
179 // with the confidence tentative.
180 // TODO: The spec has a cool table here for determining an appropriate fallback encoding based on the
181 // user locale. Use it!
182 log::debug!("Failed to determine encoding of byte stream, falling back to UTF-8");
183 Some(UTF_8)
184 }
185
186 fn finish(&mut self, document: &Document) -> &'static Encoding {
187 self.determine_the_character_encoding(document, false, AtEndOfFile::Yes)
188 .expect("Should always return character encoding when we're not allowed to wait")
189 }
190}
191
192impl NetworkDecoderState {
193 pub(super) fn new(
194 encoding_hint_from_content_type: Option<&'static Encoding>,
195 encoding_of_container_document: Option<&'static Encoding>,
196 ) -> Self {
197 Self::Detecting(DetectingState {
198 encoding_hint_from_content_type,
199 encoding_of_container_document,
200 start_timestamp: Instant::now(),
201 attempted_bom_sniffing: false,
202 buffered_bytes: vec![],
203 })
204 }
205
206 /// Feeds the network decoder a chunk of bytes.
207 ///
208 /// If a new encoding is detected, then the encoding of `document` is updated appropriately.
209 ///
210 /// The decoded bytes are returned to the caller. Note that there is not necessarily a 1:1
211 /// relation between `chunk` and the return value. In the beginning, the decoder will buffer
212 /// bytes and return `None`, then later it will flush them and return a large `StrTendril` all
213 /// at once.
214 pub(super) fn push(&mut self, chunk: &[u8], document: &Document) -> Option<StrTendril> {
215 match self {
216 Self::Detecting(encoding_detector) => {
217 if let Some(encoding) = encoding_detector.buffer(chunk, document, AtEndOfFile::No) {
218 document.set_encoding(encoding);
219 let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes);
220 *self = Self::Decoding(DecodingState {
221 decoder: Some(LossyDecoder::new_from_encoding_rs_decoder(
222 encoding.new_decoder_without_bom_handling(),
223 NetworkSink::default(),
224 )),
225 encoding,
226 });
227 return self.push(&buffered_bytes, document);
228 }
229
230 None
231 },
232 Self::Decoding(network_decoder) => {
233 let decoder = network_decoder
234 .decoder
235 .as_mut()
236 .expect("Can't push after call to finish()");
237 decoder.process(ByteTendril::from(chunk));
238 Some(std::mem::take(&mut decoder.inner_sink_mut().output))
239 },
240 }
241 }
242
243 pub(super) fn finish(&mut self, document: &Document) -> StrTendril {
244 match self {
245 Self::Detecting(encoding_detector) => {
246 let encoding = encoding_detector.finish(document);
247 document.set_encoding(encoding);
248 let buffered_bytes = mem::take(&mut encoding_detector.buffered_bytes);
249 let mut decoder = LossyDecoder::new_from_encoding_rs_decoder(
250 encoding.new_decoder_without_bom_handling(),
251 NetworkSink::default(),
252 );
253 decoder.process(ByteTendril::from(&*buffered_bytes));
254 *self = Self::Decoding(DecodingState {
255 // Important to set `None` here to indicate that we're done decoding
256 decoder: None,
257 encoding,
258 });
259 let mut chunk = std::mem::take(&mut decoder.inner_sink_mut().output);
260 chunk.push_tendril(&decoder.finish());
261 chunk
262 },
263 Self::Decoding(network_decoder) => network_decoder
264 .decoder
265 .take()
266 .map(|decoder| decoder.finish())
267 .unwrap_or_default(),
268 }
269 }
270
271 pub(super) fn is_finished(&self) -> bool {
272 match self {
273 Self::Detecting(_) => false,
274 Self::Decoding(network_decoder) => network_decoder.decoder.is_none(),
275 }
276 }
277
278 pub(super) fn decoder(&mut self) -> &mut DecodingState {
279 match self {
280 Self::Detecting(_) => unreachable!("Cannot access decoder before decoding"),
281 Self::Decoding(decoder) => decoder,
282 }
283 }
284}
285
286/// An implementor of `TendrilSink` with the sole purpose of buffering decoded data
287/// so we can take it later.
288#[derive(Default, JSTraceable)]
289pub(crate) struct NetworkSink {
290 #[no_trace]
291 pub(crate) output: StrTendril,
292}
293
294impl TendrilSink<UTF8> for NetworkSink {
295 type Output = StrTendril;
296
297 fn process(&mut self, tendril: StrTendril) {
298 if self.output.is_empty() {
299 self.output = tendril;
300 } else {
301 self.output.push_tendril(&tendril);
302 }
303 }
304
305 fn error(&mut self, _desc: Cow<'static, str>) {}
306
307 fn finish(self) -> Self::Output {
308 self.output
309 }
310}
311
/// An attribute found by `get_an_attribute` while prescanning a byte stream.
///
/// Both fields hold raw bytes; `get_an_attribute` lowercases ASCII `A`-`Z` before
/// storing them.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
struct Attribute {
    /// The attribute's name.
    name: Vec<u8>,
    /// The attribute's value; empty if the attribute has none.
    value: Vec<u8>,
}
317
318/// <https://html.spec.whatwg.org/multipage/#prescan-a-byte-stream-to-determine-its-encoding>
319pub fn prescan_the_byte_stream_to_determine_the_encoding(
320 byte_stream: &[u8],
321) -> Option<&'static Encoding> {
322 // Step 1. Let position be a pointer to a byte in the input byte stream,
323 // initially pointing at the first byte.
324 let mut position = 0;
325
326 // Step 2. Prescan for UTF-16 XML declarations: If position points to:
327 match byte_stream {
328 // A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0
329 // (case-sensitive UTF-16 little-endian '<?x')
330 [0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0, ..] => {
331 // Return UTF-16LE.
332 return Some(UTF_16LE);
333 },
334
335 // A sequence of bytes starting with: 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78
336 // (case-sensitive UTF-16 big-endian '<?x')
337 [0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78, ..] => {
338 // Return UTF-16BE.
339 return Some(UTF_16BE);
340 },
341 _ => {},
342 }
343
344 loop {
345 // Step 3. Loop: If position points to:
346 let remaining_byte_stream = byte_stream.get(position..)?;
347
348 // A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
349 if remaining_byte_stream.starts_with(b"<!--") {
350 // Advance the position pointer so that it points at the first 0x3E byte which is preceded by two 0x2D bytes
351 // (i.e. at the end of an ASCII '-->' sequence) and comes after the 0x3C byte that was found.
352 // (The two 0x2D bytes can be the same as those in the '<!--' sequence.)
353 // NOTE: This is not very efficient, but likely not an issue...
354 position += remaining_byte_stream
355 .windows(3)
356 .position(|window| window == b"-->")?;
357 }
358 // A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61,
359 // and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive ASCII '<meta' followed by a space or slash)
360 else if remaining_byte_stream
361 .get(..b"<meta ".len())
362 .is_some_and(|candidate| {
363 candidate[..b"<meta".len()].eq_ignore_ascii_case(b"<meta") &&
364 candidate.last().is_some_and(|byte| {
365 matches!(byte, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F)
366 })
367 })
368 {
369 // Step 1. Advance the position pointer so that it points at the next 0x09, 0x0A, 0x0C, 0x0D, 0x20,
370 // or 0x2F byte (the one in sequence of characters matched above).
371 position += b"<meta".len();
372
373 // Step 2. Let attribute list be an empty list of strings.
374 // NOTE: This is used to track which attributes we have already seen. As there are only
375 // three attributes that we care about, we instead use three booleans.
376 let mut have_seen_http_equiv_attribute = false;
377 let mut have_seen_content_attribute = false;
378 let mut have_seen_charset_attribute = false;
379
380 // Step 3. Let got pragma be false.
381 let mut got_pragma = false;
382
383 // Step 4. Let need pragma be null.
384 let mut need_pragma = None;
385
386 // Step 5. Let charset be the null value (which, for the purposes of this algorithm,
387 // is distinct from an unrecognized encoding or the empty string).
388 let mut charset = None;
389
390 // Step 6. Attributes: Get an attribute and its value. If no attribute was sniffed,
391 // then jump to the processing step below.
392 while let Some(attribute) = get_an_attribute(byte_stream, &mut position) {
393 // Step 7 If the attribute's name is already in attribute list,
394 // then return to the step labeled attributes.
395 // Step 8. Add the attribute's name to attribute list.
396 // NOTE: This happens in the match arms below
397 // Step 9. Run the appropriate step from the following list, if one applies:
398 match attribute.name.as_slice() {
399 // If the attribute's name is "http-equiv"
400 b"http-equiv" if !have_seen_http_equiv_attribute => {
401 have_seen_http_equiv_attribute = true;
402
403 // If the attribute's value is "content-type", then set got pragma to true.
404 if attribute.value == b"content-type" {
405 got_pragma = true;
406 }
407 },
408 // If the attribute's name is "content"
409 b"content" if !have_seen_content_attribute => {
410 have_seen_content_attribute = true;
411
412 // Apply the algorithm for extracting a character encoding from a meta element,
413 // giving the attribute's value as the string to parse. If a character encoding
414 // is returned, and if charset is still set to null, let charset be the encoding
415 // returned, and set need pragma to true.
416 if charset.is_none() &&
417 let Some(extracted_charset) =
418 extract_a_character_encoding_from_a_meta_element(
419 &attribute.value,
420 )
421 {
422 need_pragma = Some(true);
423 charset = Some(extracted_charset);
424 }
425 },
426 // If the attribute's name is "charset"
427 b"charset" if !have_seen_charset_attribute => {
428 have_seen_charset_attribute = true;
429
430 // Let charset be the result of getting an encoding from the attribute's value,
431 // and set need pragma to false.
432 if let Some(extracted_charset) = Encoding::for_label(&attribute.value) {
433 charset = Some(extracted_charset);
434 }
435
436 need_pragma = Some(false);
437 },
438 _ => {},
439 }
440
441 // Step 10. Return to the step labeled attributes.
442 }
443
444 // Step 11. Processing: If need pragma is null, then jump to the step below labeled next byte.
445 if let Some(need_pragma) = need_pragma {
446 // Step 12. If need pragma is true but got pragma is false,
447 // then jump to the step below labeled next byte.
448 if !need_pragma || got_pragma {
449 // Step 13. If charset is UTF-16BE/LE, then set charset to UTF-8.
450 if charset.is_some_and(|charset| charset == UTF_16BE || charset == UTF_16LE) {
451 charset = Some(UTF_8);
452 }
453 // Step 14. If charset is x-user-defined, then set charset to windows-1252.
454 else if charset.is_some_and(|charset| charset == X_USER_DEFINED) {
455 charset = Some(WINDOWS_1252);
456 }
457
458 // Step 15. Return charset.
459 return charset;
460 }
461 }
462 }
463 // A sequence of bytes starting with a 0x3C byte (<), optionally a 0x2F byte (/),
464 // and finally a byte in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z)
465 else if *remaining_byte_stream.first()? == b'<' &&
466 remaining_byte_stream
467 .get(1)
468 .filter(|byte| **byte != b'=')
469 .or(remaining_byte_stream.get(2))?
470 .is_ascii_alphabetic()
471 {
472 // Step 1. Advance the position pointer so that it points at the next 0x09 (HT),
473 // 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte.
474 position += remaining_byte_stream
475 .iter()
476 .position(|byte| byte.is_ascii_whitespace() || *byte == b'>')?;
477
478 // Step 2. Repeatedly get an attribute until no further attributes can be found,
479 // then jump to the step below labeled next byte.
480 while get_an_attribute(byte_stream, &mut position).is_some() {}
481 }
482 // A sequence of bytes starting with: 0x3C 0x21 (`<!`)
483 // A sequence of bytes starting with: 0x3C 0x2F (`</`)
484 // A sequence of bytes starting with: 0x3C 0x3F (`<?`)
485 else if remaining_byte_stream.starts_with(b"<!") ||
486 remaining_byte_stream.starts_with(b"</") ||
487 remaining_byte_stream.starts_with(b"<?")
488 {
489 // Advance the position pointer so that it points at the first 0x3E byte (>) that comes after the 0x3C byte that was found.
490 position += remaining_byte_stream
491 .iter()
492 .position(|byte| *byte == b'>')?;
493 }
494 // Any other byte
495 else {
496 // Do nothing with that byte.
497 }
498
499 // Next byte: Move position so it points at the next byte in the input byte stream,
500 // and return to the step above labeled loop.
501 position += 1;
502 }
503}
504
505/// <https://html.spec.whatwg.org/multipage/#concept-get-attributes-when-sniffing>
506fn get_an_attribute(input: &[u8], position: &mut usize) -> Option<Attribute> {
507 // NOTE: If we reach the end of the input during parsing then we return "None"
508 // (because there obviously is no attribute). The caller will then also run
509 // out of bytes and invoke "get an xml encoding" as mandated by the spec.
510
511 // Step 1. If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),
512 // 0x20 (SP), or 0x2F (/), then advance position to the next byte and redo this step.
513 *position += &input[*position..]
514 .iter()
515 .position(|b| !matches!(b, 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x2F))?;
516
517 // Step 2. If the byte at position is 0x3E (>), then abort the get an attribute algorithm.
518 // There isn't one.
519 if input[*position] == 0x3E {
520 return None;
521 }
522
523 // Step 3. Otherwise, the byte at position is the start of the attribute name.
524 // Let attribute name and attribute value be the empty string.
525 let mut attribute = Attribute::default();
526 let mut have_spaces = false;
527 loop {
528 // Step 4. Process the byte at position as follows:
529 match *input.get(*position)? {
530 // If it is 0x3D (=), and the attribute name is longer than the empty string
531 b'=' if !attribute.name.is_empty() => {
532 // Advance position to the next byte and jump to the step below labeled value.
533 *position += 1;
534 break;
535 },
536
537 // If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
538 0x09 | 0x0A | 0x0C | 0x0D | 0x20 => {
539 // Jump to the step below labeled spaces.
540 have_spaces = true;
541 break;
542 },
543
544 // If it is 0x2F (/) or 0x3E (>)
545 b'/' | b'>' => {
546 // Abort the get an attribute algorithm.
547 // The attribute's name is the value of attribute name, its value is the empty string.
548 return Some(attribute);
549 },
550
551 // If it is in the range 0x41 (A) to 0x5A (Z)
552 byte @ (b'A'..=b'Z') => {
553 // Append the code point b+0x20 to attribute name (where b is the value of the byte at position).
554 // (This converts the input to lowercase.)
555 attribute.name.push(byte + 0x20);
556 },
557
558 // Anything else
559 byte => {
560 // Append the code point with the same value as the byte at position to attribute name.
561 // (It doesn't actually matter how bytes outside the ASCII range are handled here, since only
562 // ASCII bytes can contribute to the detection of a character encoding.)
563 attribute.name.push(byte);
564 },
565 }
566
567 // Step 5. Advance position to the next byte and return to the previous step.
568 *position += 1;
569 }
570
571 if have_spaces {
572 // Step 6. Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),
573 // or 0x20 (SP), then advance position to the next byte, then, repeat this step.
574 *position += &input[*position..]
575 .iter()
576 .position(|b| !b.is_ascii_whitespace())?;
577
578 // Step 7. If the byte at position is not 0x3D (=), abort the get an attribute algorithm.
579 // The attribute's name is the value of attribute name, its value is the empty string.
580 if input[*position] != b'=' {
581 return Some(attribute);
582 }
583
584 // Step 8. Advance position past the 0x3D (=) byte.
585 *position += 1;
586 }
587
588 // Step 9. Value: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP),
589 // then advance position to the next byte, then, repeat this step.
590 *position += &input[*position..]
591 .iter()
592 .position(|b| !b.is_ascii_whitespace())?;
593
594 // Step 10. Process the byte at position as follows:
595 match input[*position] {
596 // If it is 0x22 (") or 0x27 (')
597 b @ (b'"' | b'\'') => {
598 // Step 1. Let b be the value of the byte at position.
599 // NOTE: We already have b.
600 loop {
601 // Step 2. Quote loop: Advance position to the next byte.
602 *position += 1;
603
604 // Step 3. If the value of the byte at position is the value of b, then advance position to the next byte
605 // and abort the "get an attribute" algorithm. The attribute's name is the value of attribute name, and
606 // its value is the value of attribute value.
607 let byte_at_position = *input.get(*position)?;
608 if byte_at_position == b {
609 *position += 1;
610 return Some(attribute);
611 }
612 // Step 4. Otherwise, if the value of the byte at position is in the range 0x41 (A) to 0x5A (Z),
613 // then append a code point to attribute value whose value is 0x20 more than the value of the byte
614 // at position.
615 else if byte_at_position.is_ascii_uppercase() {
616 attribute.value.push(byte_at_position + 0x20);
617 }
618 // Step 5. Otherwise, append a code point to attribute value whose value is the same
619 // as the value of the byte at position.
620 else {
621 attribute.value.push(byte_at_position);
622 }
623
624 // Step 6. Return to the step above labeled quote loop.
625 }
626 },
627
628 // If it is 0x3E (>)
629 b'>' => {
630 // Abort the get an attribute algorithm. The attribute's name is the value of attribute name,
631 // its value is the empty string.
632 return Some(attribute);
633 },
634
635 // If it is in the range 0x41 (A) to 0x5A (Z)
636 b @ (b'A'..=b'Z') => {
637 // Append a code point b+0x20 to attribute value (where b is the value of the byte at position).
638 // Advance position to the next byte.
639 attribute.value.push(b + 0x20);
640 *position += 1;
641 },
642
643 // Anything else
644 b => {
645 // Append a code point with the same value as the byte at position to attribute value.
646 // Advance position to the next byte.
647 attribute.value.push(b);
648 *position += 1
649 },
650 }
651
652 loop {
653 // Step 11. Process the byte at position as follows:
654 match *input.get(*position)? {
655 // If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
656 0x09 | 0x0A | 0x0C | 0x0D | 0x20 | 0x3E => {
657 // Abort the get an attribute algorithm. The attribute's name is the value of attribute name and
658 // its value is the value of attribute value.
659 return Some(attribute);
660 },
661
662 // If it is in the range 0x41 (A) to 0x5A (Z)
663 byte if byte.is_ascii_uppercase() => {
664 // Append a code point b+0x20 to attribute value (where b is the value of the byte at position).
665 attribute.value.push(byte + 0x20);
666 },
667
668 // Anything else
669 byte => {
670 // Append a code point with the same value as the byte at position to attribute value.
671 attribute.value.push(byte);
672 },
673 }
674
675 // Step 12. Advance position to the next byte and return to the previous step.
676 *position += 1;
677 }
678}
679
680/// <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
681fn extract_a_character_encoding_from_a_meta_element(input: &[u8]) -> Option<&'static Encoding> {
682 // Step 1. Let position be a pointer into s, initially pointing at the start of the string.
683 let mut position = 0;
684
685 loop {
686 // Step 2. Loop: Find the first seven characters in s after position that are an ASCII case-insensitive
687 // match for the word "charset". If no such match is found, return nothing.
688 // NOTE: In our case, the attribute value always comes from "get_an_attribute" and is already lowercased.
689 position += input[position..]
690 .windows(7)
691 .position(|window| window == b"charset")? +
692 b"charset".len();
693
694 // Step 3. Skip any ASCII whitespace that immediately follow the word "charset" (there might not be any).
695 position += &input[position..]
696 .iter()
697 .position(|byte| !byte.is_ascii_whitespace())?;
698
699 // Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point just before
700 // that next character, and jump back to the step labeled loop.
701 // NOTE: This is phrased very oddly, because position is already pointing to that character.
702 if *input.get(position)? == b'=' {
703 position += 1;
704 break;
705 }
706 }
707
708 // Step 5. Skip any ASCII whitespace that immediately follow the equals sign (there might not be any).
709 position += &input[position..]
710 .iter()
711 .position(|byte| !byte.is_ascii_whitespace())?;
712
713 // Step 6. Process the next character as follows:
714 let next_character = input.get(position)?;
715
716 // If it is a U+0022 QUOTATION MARK character (") and there is a later U+0022 QUOTATION MARK character (") in s
717 // If it is a U+0027 APOSTROPHE character (') and there is a later U+0027 APOSTROPHE character (') in s
718 if matches!(*next_character, b'"' | b'\'') {
719 // Return the result of getting an encoding from the substring that is between
720 // this character and the next earliest occurrence of this character.
721 let remaining = input.get(position + 1..)?;
722 let end = remaining.iter().position(|byte| byte == next_character)?;
723 Encoding::for_label(&remaining[..end])
724 }
725 // If it is an unmatched U+0022 QUOTATION MARK character (")
726 // If it is an unmatched U+0027 APOSTROPHE character (')
727 // If there is no next character
728 // NOTE: All of these cases are already covered above
729
730 // Otherwise
731 else {
732 // Return the result of getting an encoding from the substring that consists of this character up
733 // to but not including the first ASCII whitespace or U+003B SEMICOLON character (;), or the end of s,
734 // whichever comes first.
735 let remaining = input.get(position..)?;
736 let end = remaining
737 .iter()
738 .position(|byte| byte.is_ascii_whitespace() || *byte == b';')
739 .unwrap_or(remaining.len());
740
741 Encoding::for_label(&remaining[..end])
742 }
743}
744
745/// <https://html.spec.whatwg.org/multipage/#concept-get-xml-encoding-when-sniffing>
746pub fn get_xml_encoding(input: &[u8]) -> Option<&'static Encoding> {
747 // Step 1. Let encodingPosition be a pointer to the start of the stream.
748 // NOTE: We don't need this variable yet.
749 // Step 2. If encodingPosition does not point to the start of a byte sequence 0x3C, 0x3F, 0x78,
750 // 0x6D, 0x6C (`<?xml`), then return failure.
751 if !input.starts_with(b"<?xml") {
752 return None;
753 }
754
755 // Step 3. Let xmlDeclarationEnd be a pointer to the next byte in the input byte stream which is 0x3E (>).
756 // If there is no such byte, then return failure.
757 // NOTE: The spec does not use this variable but the intention is clear.
758 let xml_declaration_end = input.iter().position(|byte| *byte == b'>')?;
759 let input = &input[..xml_declaration_end];
760
761 // Step 4. Set encodingPosition to the position of the first occurrence of the subsequence of bytes 0x65, 0x6E,
762 // 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67 (`encoding`) at or after the current encodingPosition. If there is no
763 // such sequence, then return failure.
764 let mut encoding_position = input
765 .windows(b"encoding".len())
766 .position(|window| window == b"encoding")?;
767
768 // Step 5. Advance encodingPosition past the 0x67 (g) byte.
769 encoding_position += b"encoding".len();
770
771 // Step 6. While the byte at encodingPosition is less than or equal to 0x20 (i.e., it is either an
772 // ASCII space or control character), advance encodingPosition to the next byte.
773 while *input.get(encoding_position)? <= 0x20 {
774 encoding_position += 1;
775 }
776
777 // Step 7. If the byte at encodingPosition is not 0x3D (=), then return failure.
778 if *input.get(encoding_position)? != b'=' {
779 return None;
780 }
781
782 // Step 8. Advance encodingPosition to the next byte.
783 encoding_position += 1;
784
785 // Step 9. While the byte at encodingPosition is less than or equal to 0x20 (i.e., it is either an
786 // ASCII space or control character), advance encodingPosition to the next byte.
787 while *input.get(encoding_position)? <= 0x20 {
788 encoding_position += 1;
789 }
790
791 // Step 10. Let quoteMark be the byte at encodingPosition.
792 let quote_mark = *input.get(encoding_position)?;
793
794 // Step 11. If quoteMark is not either 0x22 (") or 0x27 ('), then return failure.
795 if !matches!(quote_mark, b'"' | b'\'') {
796 return None;
797 }
798
799 // Step 12. Advance encodingPosition to the next byte.
800 encoding_position += 1;
801
802 // Step 13. Let encodingEndPosition be the position of the next occurrence of quoteMark at or after
803 // encodingPosition. If quoteMark does not occur again, then return failure.
804 let encoding_end_position = input[encoding_position..]
805 .iter()
806 .position(|byte| *byte == quote_mark)?;
807
808 // Step 14. Let potentialEncoding be the sequence of the bytes between encodingPosition
809 // (inclusive) and encodingEndPosition (exclusive).
810 let potential_encoding = &input[encoding_position..][..encoding_end_position];
811
812 // Step 15. If potentialEncoding contains one or more bytes whose byte value is 0x20 or below,
813 // then return failure.
814 if potential_encoding.iter().any(|byte| *byte <= 0x20) {
815 return None;
816 }
817
818 // Step 16. Let encoding be the result of getting an encoding given potentialEncoding isomorphic decoded.
819 let encoding = Encoding::for_label(potential_encoding)?;
820
821 // Step 17. If the encoding is UTF-16BE/LE, then change it to UTF-8.
822 // Step 18. Return encoding.
823 if encoding == UTF_16BE || encoding == UTF_16LE {
824 Some(UTF_8)
825 } else {
826 Some(encoding)
827 }
828}
829
/// Tells the encoding detection whether the bytes it is looking at are the complete
/// byte stream.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum AtEndOfFile {
    /// The end of the byte stream has been reached.
    Yes,
    /// More bytes may still arrive.
    No,
}