html5ever/tokenizer/interface.rs
1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use markup5ever::ns;
11
12use crate::interface::Attribute;
13use crate::tendril::StrTendril;
14use crate::tokenizer::states;
15use crate::LocalName;
16use std::borrow::Cow;
17
18pub use self::TagKind::{EndTag, StartTag};
19pub use self::Token::{CharacterTokens, CommentToken, DoctypeToken, TagToken};
20pub use self::Token::{EOFToken, NullCharacterToken, ParseError};
21
22/// A `DOCTYPE` token.
23#[derive(PartialEq, Eq, Clone, Debug, Default)]
24pub struct Doctype {
25 pub name: Option<StrTendril>,
26 pub public_id: Option<StrTendril>,
27 pub system_id: Option<StrTendril>,
28 /// Indicates if this DOCTYPE token should put the document in [quirks mode].
29 ///
30 /// [quirks mode]: https://dom.spec.whatwg.org/#concept-document-quirks
31 pub force_quirks: bool,
32}
33
/// Distinguishes a start tag from an end tag.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TagKind {
    /// A start tag.
    StartTag,
    /// An end tag.
    EndTag,
}
40
41/// A tag token.
42#[derive(PartialEq, Eq, Clone, Debug)]
43pub struct Tag {
44 /// Whether the tag is a start or an end tag.
45 pub kind: TagKind,
46 pub name: LocalName,
47 /// Whether the tag closes itself.
48 ///
49 /// An example of a self closing tag is `<foo />`.
50 pub self_closing: bool,
51 pub attrs: Vec<Attribute>,
52}
53
54impl Tag {
55 /// Are the tags equivalent when we don't care about attribute order?
56 /// Also ignores the self-closing flag.
57 pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
58 if (self.kind != other.kind) || (self.name != other.name) {
59 return false;
60 }
61
62 let mut self_attrs = self.attrs.clone();
63 let mut other_attrs = other.attrs.clone();
64 self_attrs.sort();
65 other_attrs.sort();
66
67 self_attrs == other_attrs
68 }
69
70 pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
71 self.attrs
72 .iter()
73 .find(|attribute| attribute.name.ns == *ns!() && attribute.name.local == *name)
74 .map(|attribute| attribute.value.clone())
75 }
76}
77
78#[derive(PartialEq, Eq, Debug)]
79pub enum Token {
80 /// A DOCTYPE declaration like `<!DOCTYPE html>`
81 DoctypeToken(Doctype),
82 /// A opening or closing tag, like `<foo>` or `</bar>`
83 TagToken(Tag),
84 /// A comment like `<!-- foo -->`.
85 CommentToken(StrTendril),
86 /// A sequence of characters.
87 CharacterTokens(StrTendril),
88 /// A `U+0000 NULL` character in the input.
89 NullCharacterToken,
90 EOFToken,
91 ParseError(Cow<'static, str>),
92}
93
94/// The result of a [TokenSink] consuming a single token.
95#[derive(Debug, PartialEq)]
96#[must_use]
97pub enum TokenSinkResult<Handle> {
98 /// The tokenizer can continue parsing the input as usual.
99 Continue,
100 /// The token sink has completed parsing a `<script>` tag, blocking the tokenizer
101 /// until the script is executed.
102 Script(Handle),
103 /// The tokenizer should set its state to the [PLAINTEXT state](https://html.spec.whatwg.org/#plaintext-state).
104 Plaintext,
105 /// The tokenizer should set its state to the given rawdata state.
106 RawData(states::RawKind),
107 /// The document indicated that the given encoding should be used to parse it.
108 ///
109 /// HTML5-compatible implementations should parse the encoding label using the algorithm
110 /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
111 /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
112 ///
113 /// If the decoder is confident that the current encoding is correct then this message
114 /// can safely be ignored.
115 EncodingIndicator(StrTendril),
116}
117
118/// Types which can receive tokens from the tokenizer.
119pub trait TokenSink {
120 /// The type of a DOM node.
121 type Handle;
122
123 /// Process a token.
124 fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle>;
125
126 /// Signal that tokenization reached the end of the document.
127 fn end(&self) {}
128
129 /// Used in the [markup declaration open state]. By default, this always
130 /// returns false and thus all CDATA sections are tokenized as bogus
131 /// comments.
132 ///
133 /// [markup declaration open state]: https://html.spec.whatwg.org/multipage/#markup-declaration-open-state
134 fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool {
135 false
136 }
137}