html5ever/
encoding.rs

1// Copyright 2014-2025 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use crate::tendril::StrTendril;
11
12/// <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
13pub(crate) fn extract_a_character_encoding_from_a_meta_element(
14    input: StrTendril,
15) -> Option<StrTendril> {
16    // Step 1. Let position be a pointer into s, initially pointing at the start of the string.
17    let mut position = 0;
18    loop {
19        // Step 2. Loop: Find the first seven characters in s after position that are an ASCII
20        // case-insensitive match for the word "charset". If no such match is found, return nothing.
21        loop {
22            let candidate = input.as_bytes().get(position..position + "charset".len())?;
23            if candidate.eq_ignore_ascii_case(b"charset") {
24                break;
25            }
26
27            position += 1;
28        }
29        position += "charset".len();
30
31        // Step 3. Skip any ASCII whitespace that immediately follow the word "charset" (there might not be any).
32        position += input.as_bytes()[position..]
33            .iter()
34            .take_while(|byte| byte.is_ascii_whitespace())
35            .count();
36
37        // Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point just before
38        // that next character, and jump back to the step labeled loop.
39        if input.as_bytes()[position] == b'=' {
40            break;
41        }
42    }
43    // Skip the "="
44    position += 1;
45
46    // Step 5. Skip any ASCII whitespace that immediately follow the equals sign (there might not be any).
47    position += input.as_bytes()[position..]
48        .iter()
49        .take_while(|byte| byte.is_ascii_whitespace())
50        .count();
51
52    // Step 6. Process the next character as follows:
53    match input.as_bytes().get(position)? {
54        quote @ (b'"' | b'\'') => {
55            // Return the result of getting an encoding from the substring that is between this character
56            // and the next earliest occurrence of this character.
57            let length = input.as_bytes()[position + 1..]
58                .iter()
59                .position(|byte| byte == quote)?;
60            Some(input.subtendril(position as u32 + 1, length as u32))
61        },
62        _ => {
63            // Return the result of getting an encoding from the substring that consists of this character
64            // up to but not including the first ASCII whitespace or U+003B SEMICOLON character (;),
65            // or the end of s, whichever comes first.
66            let length = input.as_bytes()[position..]
67                .iter()
68                .position(|byte| byte.is_ascii_whitespace() || *byte == b';');
69            if let Some(length) = length {
70                Some(input.subtendril(position as u32, length as u32))
71            } else {
72                Some(input.subtendril(position as u32, (input.len() - position) as u32))
73            }
74        },
75    }
76}
77
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extraction algorithm on `content` and asserts that it yields `expected`,
    /// where `None` means the algorithm is expected to return nothing.
    #[track_caller]
    fn assert_extracts(content: &str, expected: Option<&str>) {
        let extracted =
            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(content));
        assert_eq!(extracted, expected.map(StrTendril::from_slice));
    }

    /// Input that never mentions "charset" yields nothing.
    #[test]
    fn meta_element_without_charset() {
        assert_extracts("foobar", None);
    }

    /// The word "charset" is matched regardless of ASCII case.
    #[test]
    fn meta_element_with_capitalized_charset() {
        assert_extracts("cHarSet=utf8", Some("utf8"));
    }

    /// A "charset" that is not followed by "=" does not count as a match.
    #[test]
    fn meta_element_with_no_equals_after_charset() {
        assert_extracts("charset utf8", None);
    }

    /// ASCII whitespace on either side of the "=" is skipped.
    #[test]
    fn meta_element_with_whitespace_around_equals() {
        assert_extracts("charset \t=\tutf8", Some("utf8"));
    }

    /// Quoted values need a matching closing quote; unmatched quotes yield nothing.
    #[test]
    fn meta_element_with_quoted_value() {
        // Properly closed single and double quotes.
        assert_extracts("charset='utf8'", Some("utf8"));
        assert_extracts("charset=\"utf8\"", Some("utf8"));

        // Opening quote without a closing counterpart.
        assert_extracts("charset='utf8", None);
        assert_extracts("charset=\"utf8", None);
    }

    /// Unquoted values stop at the first ASCII whitespace or semicolon.
    #[test]
    fn meta_element_with_implicit_terminator() {
        assert_extracts("charset=utf8 foo", Some("utf8"));
        assert_extracts("charset=utf8;foo", Some("utf8"));
    }

    /// The charset parameter is found inside a full content-type value.
    #[test]
    fn meta_element_with_content_type() {
        assert_extracts("text/html; charset=utf8", Some("utf8"));
    }
}
173}