html5ever/tokenizer/interface.rs
1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use markup5ever::ns;
11
12use crate::interface::Attribute;
13use crate::tendril::StrTendril;
14use crate::tokenizer::states;
15use crate::LocalName;
16use std::borrow::Cow;
17
18pub use self::TagKind::{EndTag, StartTag};
19pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
20pub use self::Token::{EOFToken, NullCharacterToken, ParseError};
21
22/// A `DOCTYPE` token.
23#[derive(PartialEq, Eq, Clone, Debug, Default)]
24pub struct Doctype {
25 pub name: Option<StrTendril>,
26 pub public_id: Option<StrTendril>,
27 pub system_id: Option<StrTendril>,
28 /// Indicates if this DOCTYPE token should put the document in [quirks mode].
29 ///
30 /// [quirks mode]: https://dom.spec.whatwg.org/#concept-document-quirks
31 pub force_quirks: bool,
32}
33
/// Distinguishes start tags from end tags.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum TagKind {
    /// A start tag.
    StartTag,
    /// An end tag.
    EndTag,
}
40
41/// A tag token.
42#[derive(PartialEq, Eq, Clone, Debug)]
43pub struct Tag {
44 /// Whether the tag is a start or an end tag.
45 pub kind: TagKind,
46 pub name: LocalName,
47 /// Whether the tag closes itself.
48 ///
49 /// An example of a self closing tag is `<foo />`.
50 pub self_closing: bool,
51 pub attrs: Vec<Attribute>,
52 /// Whether duplicate attributes were encountered during tokenization.
53 /// This is used for CSP nonce validation - elements with duplicate
54 /// attributes are not nonceable per the CSP spec.
55 pub had_duplicate_attributes: bool,
56}
57
58impl Tag {
59 /// Are the tags equivalent when we don't care about attribute order?
60 /// Also ignores the self-closing flag.
61 pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
62 if (self.kind != other.kind) || (self.name != other.name) {
63 return false;
64 }
65
66 let mut self_attrs = self.attrs.clone();
67 let mut other_attrs = other.attrs.clone();
68 self_attrs.sort();
69 other_attrs.sort();
70
71 self_attrs == other_attrs
72 }
73
74 pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
75 self.attrs
76 .iter()
77 .find(|attribute| attribute.name.ns == *ns!() && attribute.name.local == *name)
78 .map(|attribute| attribute.value.clone())
79 }
80}
81
82#[derive(PartialEq, Eq, Debug)]
83pub enum Token {
84 /// A DOCTYPE declaration like `<!DOCTYPE html>`
85 DoctypeToken(Doctype),
86 /// A opening or closing tag, like `<foo>` or `</bar>`
87 TagToken(Tag),
88 /// A comment like `<!-- foo -->`.
89 CommentToken(StrTendril),
90 /// A sequence of characters.
91 CharacterTokens(StrTendril),
92 /// A `U+0000 NULL` character in the input.
93 NullCharacterToken,
94 EOFToken,
95 ParseError(Cow<'static, str>),
96}
97
98/// The result of a [TokenSink] consuming a single token.
99#[derive(Debug, PartialEq)]
100#[must_use]
101pub enum TokenSinkResult<Handle> {
102 /// The tokenizer can continue parsing the input as usual.
103 Continue,
104 /// The token sink has completed parsing a `<script>` tag, blocking the tokenizer
105 /// until the script is executed.
106 Script(Handle),
107 /// The tokenizer should set its state to the [PLAINTEXT state](https://html.spec.whatwg.org/#plaintext-state).
108 Plaintext,
109 /// The tokenizer should set its state to the given rawdata state.
110 RawData(states::RawKind),
111 /// The document indicated that the given encoding should be used to parse it.
112 ///
113 /// HTML5-compatible implementations should parse the encoding label using the algorithm
114 /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
115 /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
116 ///
117 /// If the decoder is confident that the current encoding is correct then this message
118 /// can safely be ignored.
119 EncodingIndicator(StrTendril),
120}
121
122/// Types which can receive tokens from the tokenizer.
123pub trait TokenSink {
124 /// The type of a DOM node.
125 type Handle;
126
127 /// Process a token.
128 fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle>;
129
130 /// Signal that tokenization reached the end of the document.
131 fn end(&self) {}
132
133 /// Used in the [markup declaration open state]. By default, this always
134 /// returns false and thus all CDATA sections are tokenized as bogus
135 /// comments.
136 ///
137 /// [markup declaration open state]: https://html.spec.whatwg.org/multipage/#markup-declaration-open-state
138 fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
139 false
140 }
141}