html5ever/tokenizer/
interface.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use markup5ever::ns;
11
12use crate::interface::Attribute;
13use crate::tendril::StrTendril;
14use crate::tokenizer::states;
15use crate::LocalName;
16use std::borrow::Cow;
17
18pub use self::TagKind::{EndTag, StartTag};
19pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
20pub use self::Token::{EOFToken, NullCharacterToken, ParseError};
21
22/// A `DOCTYPE` token.
23#[derive(PartialEq, Eq, Clone, Debug, Default)]
24pub struct Doctype {
25    pub name: Option<StrTendril>,
26    pub public_id: Option<StrTendril>,
27    pub system_id: Option<StrTendril>,
28    /// Indicates if this DOCTYPE token should put the document in [quirks mode].
29    ///
30    /// [quirks mode]: https://dom.spec.whatwg.org/#concept-document-quirks
31    pub force_quirks: bool,
32}
33
/// Distinguishes a start tag from an end tag.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TagKind {
    /// A start (opening) tag.
    StartTag,
    /// An end (closing) tag.
    EndTag,
}
40
41/// A tag token.
42#[derive(PartialEq, Eq, Clone, Debug)]
43pub struct Tag {
44    /// Whether the tag is a start or an end tag.
45    pub kind: TagKind,
46    pub name: LocalName,
47    /// Whether the tag closes itself.
48    ///
49    /// An example of a self closing tag is `<foo />`.
50    pub self_closing: bool,
51    pub attrs: Vec<Attribute>,
52}
53
54impl Tag {
55    /// Are the tags equivalent when we don't care about attribute order?
56    /// Also ignores the self-closing flag.
57    pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
58        if (self.kind != other.kind) || (self.name != other.name) {
59            return false;
60        }
61
62        let mut self_attrs = self.attrs.clone();
63        let mut other_attrs = other.attrs.clone();
64        self_attrs.sort();
65        other_attrs.sort();
66
67        self_attrs == other_attrs
68    }
69
70    pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
71        self.attrs
72            .iter()
73            .find(|attribute| attribute.name.ns == *ns!() && attribute.name.local == *name)
74            .map(|attribute| attribute.value.clone())
75    }
76}
77
78#[derive(PartialEq, Eq, Debug)]
79pub enum Token {
80    /// A DOCTYPE declaration like `<!DOCTYPE html>`
81    DoctypeToken(Doctype),
82    /// A opening or closing tag, like `<foo>` or `</bar>`
83    TagToken(Tag),
84    /// A comment like `<!-- foo -->`.
85    CommentToken(StrTendril),
86    /// A sequence of characters.
87    CharacterTokens(StrTendril),
88    /// A `U+0000 NULL` character in the input.
89    NullCharacterToken,
90    EOFToken,
91    ParseError(Cow<'static, str>),
92}
93
94/// The result of a [TokenSink] consuming a single token.
95#[derive(Debug, PartialEq)]
96#[must_use]
97pub enum TokenSinkResult<Handle> {
98    /// The tokenizer can continue parsing the input as usual.
99    Continue,
100    /// The token sink has completed parsing a `<script>` tag, blocking the tokenizer
101    /// until the script is executed.
102    Script(Handle),
103    /// The tokenizer should set its state to the [PLAINTEXT state](https://html.spec.whatwg.org/#plaintext-state).
104    Plaintext,
105    /// The tokenizer should set its state to the given rawdata state.
106    RawData(states::RawKind),
107    /// The document indicated that the given encoding should be used to parse it.
108    ///
109    /// HTML5-compatible implementations should parse the encoding label using the algorithm
110    /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
111    /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
112    ///
113    /// If the decoder is confident that the current encoding is correct then this message
114    /// can safely be ignored.
115    EncodingIndicator(StrTendril),
116}
117
118/// Types which can receive tokens from the tokenizer.
119pub trait TokenSink {
120    /// The type of a DOM node.
121    type Handle;
122
123    /// Process a token.
124    fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle>;
125
126    /// Signal that tokenization reached the end of the document.
127    fn end(&self) {}
128
129    /// Used in the [markup declaration open state]. By default, this always
130    /// returns false and thus all CDATA sections are tokenized as bogus
131    /// comments.
132    ///
133    /// [markup declaration open state]: https://html.spec.whatwg.org/multipage/#markup-declaration-open-state
134    fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
135        false
136    }
137}