html5ever/tokenizer/
interface.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use markup5ever::ns;
11
12use crate::interface::Attribute;
13use crate::tendril::StrTendril;
14use crate::tokenizer::states;
15use crate::LocalName;
16use std::borrow::Cow;
17
18pub use self::TagKind::{EndTag, StartTag};
19pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
20pub use self::Token::{EOFToken, NullCharacterToken, ParseError};
21
/// A `DOCTYPE` token.
///
/// The three string fields are optional; `None` means no value is present
/// for that field. The derived `Default` yields a token with every string
/// field set to `None` and `force_quirks` set to `false`.
#[derive(PartialEq, Eq, Clone, Debug, Default)]
pub struct Doctype {
    /// The name of the DOCTYPE, if one is present.
    pub name: Option<StrTendril>,
    /// The public identifier of the DOCTYPE, if one is present.
    pub public_id: Option<StrTendril>,
    /// The system identifier of the DOCTYPE, if one is present.
    pub system_id: Option<StrTendril>,
    /// Indicates if this DOCTYPE token should put the document in [quirks mode].
    ///
    /// [quirks mode]: https://dom.spec.whatwg.org/#concept-document-quirks
    pub force_quirks: bool,
}
33
/// Whether the tag is a start or an end tag.
///
/// Both variants are re-exported from this module, so they can be named
/// without the `TagKind::` prefix.
#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
pub enum TagKind {
    /// A start (opening) tag, like `<foo>`.
    StartTag,
    /// An end (closing) tag, like `</bar>`.
    EndTag,
}
40
/// A tag token.
///
/// The derived `PartialEq` compares every field, including the attribute
/// list in order. Use [`Tag::equiv_modulo_attr_order`] for a comparison that
/// disregards attribute order.
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct Tag {
    /// Whether the tag is a start or an end tag.
    pub kind: TagKind,
    /// The name of the tag.
    pub name: LocalName,
    /// Whether the tag closes itself.
    ///
    /// An example of a self closing tag is `<foo />`.
    pub self_closing: bool,
    /// The attributes attached to the tag.
    // NOTE(review): presumably stored in source order — confirm against the tokenizer.
    pub attrs: Vec<Attribute>,
    /// Whether duplicate attributes were encountered during tokenization.
    /// This is used for CSP nonce validation - elements with duplicate
    /// attributes are not nonceable per the CSP spec.
    pub had_duplicate_attributes: bool,
}
57
58impl Tag {
59    /// Are the tags equivalent when we don't care about attribute order?
60    /// Also ignores the self-closing flag.
61    pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
62        if (self.kind != other.kind) || (self.name != other.name) {
63            return false;
64        }
65
66        let mut self_attrs = self.attrs.clone();
67        let mut other_attrs = other.attrs.clone();
68        self_attrs.sort();
69        other_attrs.sort();
70
71        self_attrs == other_attrs
72    }
73
74    pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
75        self.attrs
76            .iter()
77            .find(|attribute| attribute.name.ns == *ns!() && attribute.name.local == *name)
78            .map(|attribute| attribute.value.clone())
79    }
80}
81
/// A token passed to a [TokenSink] via `process_token`.
///
/// All variants are re-exported from this module, so they can be named
/// without the `Token::` prefix.
#[derive(PartialEq, Eq, Debug)]
pub enum Token {
    /// A DOCTYPE declaration like `<!DOCTYPE html>`
    DoctypeToken(Doctype),
    /// An opening or closing tag, like `<foo>` or `</bar>`
    TagToken(Tag),
    /// A comment like `<!-- foo -->`.
    CommentToken(StrTendril),
    /// A sequence of characters.
    CharacterTokens(StrTendril),
    /// A `U+0000 NULL` character in the input.
    NullCharacterToken,
    /// The end-of-file token.
    // NOTE(review): presumably emitted once when the input is exhausted — confirm against the tokenizer.
    EOFToken,
    /// A parse error, carrying a message that is either a `'static` string
    /// or an owned `String`.
    ParseError(Cow<'static, str>),
}
97
/// The result of a [TokenSink] consuming a single token.
///
/// `Handle` is the payload type of the [`Script`](TokenSinkResult::Script)
/// variant; `TokenSink::process_token` returns this enum with `Handle` set to
/// the sink's associated `Handle` type.
///
/// The type is marked `#[must_use]`, so discarding a returned value produces
/// a compiler warning.
#[derive(Debug, PartialEq)]
#[must_use]
pub enum TokenSinkResult<Handle> {
    /// The tokenizer can continue parsing the input as usual.
    Continue,
    /// The token sink has completed parsing a `<script>` tag, blocking the tokenizer
    /// until the script is executed.
    Script(Handle),
    /// The tokenizer should set its state to the [PLAINTEXT state](https://html.spec.whatwg.org/#plaintext-state).
    Plaintext,
    /// The tokenizer should set its state to the given rawdata state.
    RawData(states::RawKind),
    /// The document indicated that the given encoding should be used to parse it.
    ///
    /// HTML5-compatible implementations should parse the encoding label using the algorithm
    /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
    /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
    ///
    /// If the decoder is confident that the current encoding is correct then this message
    /// can safely be ignored.
    EncodingIndicator(StrTendril),
}
121
/// Types which can receive tokens from the tokenizer.
///
/// Only `process_token` is required; the other methods have default
/// implementations.
pub trait TokenSink {
    /// The type of a DOM node.
    type Handle;

    /// Process a token.
    ///
    /// `token` is passed by value, so the sink takes ownership of it.
    /// `line_number` is supplied alongside the token.
    // NOTE(review): presumably the input line the token was produced on — confirm against the tokenizer.
    ///
    /// The returned [TokenSinkResult] tells the tokenizer how to proceed.
    fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle>;

    /// Signal that tokenization reached the end of the document.
    ///
    /// The default implementation does nothing.
    fn end(&self) {}

    /// Used in the [markup declaration open state]. By default, this always
    /// returns false and thus all CDATA sections are tokenized as bogus
    /// comments.
    ///
    /// [markup declaration open state]: https://html.spec.whatwg.org/multipage/#markup-declaration-open-state
    fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
        false
    }
}