1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::borrow::Cow;

use crate::tendril::StrTendril;
use crate::{Attribute, QualName};

pub use self::TagKind::{EmptyTag, EndTag, ShortTag, StartTag};
pub use self::Token::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
pub use self::Token::{CommentToken, DoctypeToken, PIToken, TagToken};

use super::states;

/// The syntactic form of a tag seen by the tokenizer.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TagKind {
    /// An opening tag, such as `<a>`.
    StartTag,
    /// A closing tag, such as `</a>`.
    EndTag,
    /// A self-closing (empty) tag, such as `<a/>`.
    EmptyTag,
    /// A short closing tag that carries no name, such as `</>`.
    ShortTag,
}

/// A tag token as produced by the XML5 tokenizer.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Tag {
    /// Which form of tag this is.
    /// For instance, parsing `</a>` yields a tag whose kind is `EndTag`.
    pub kind: TagKind,
    /// The tag's qualified name.
    pub name: QualName,
    /// Attributes attached to the tag.
    /// Only meaningful for start and empty tags.
    pub attrs: Vec<Attribute>,
}

impl Tag {
    /// Checks whether two tags are equal when attribute order is ignored.
    ///
    /// Returns `true` only if both tags have the same `kind`, the same `name`,
    /// and the same attributes once each attribute list has been sorted.
    /// Because the sorted lists are compared element by element, repeated
    /// attributes must occur the same number of times on both sides.
    ///
    /// Neither tag is modified; sorting is done on temporary copies.
    pub fn equiv_modulo_attr_order(&self, other: &Tag) -> bool {
        if (self.kind != other.kind) || (self.name != other.name) {
            return false;
        }

        // Lists of different lengths can never compare equal after sorting,
        // so bail out before paying for two clones and two sorts.
        if self.attrs.len() != other.attrs.len() {
            return false;
        }

        let mut self_attrs = self.attrs.clone();
        let mut other_attrs = other.attrs.clone();
        self_attrs.sort();
        other_attrs.sort();

        self_attrs == other_attrs
    }
}

/// A `DOCTYPE` token.
///
/// XML5 keeps the doctype deliberately limited, for reasons such as
/// security and simplicity: a DTD can only be declared through a name,
/// a public identifier and a system identifier.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Doctype {
    /// The declared DOCTYPE name, if any.
    pub name: Option<StrTendril>,
    /// The public identifier of this DOCTYPE, if any.
    pub public_id: Option<StrTendril>,
    /// The system identifier of this DOCTYPE, if any.
    pub system_id: Option<StrTendril>,
}

impl Doctype {
    /// Constructs an empty DOCTYPE, with all fields set to None.
    pub fn new() -> Doctype {
        Doctype {
            name: None,
            public_id: None,
            system_id: None,
        }
    }
}

/// A processing-instruction token.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Pi {
    /// The name (target) of the processing instruction.
    pub target: StrTendril,

    /// The text content of the processing instruction.
    pub data: StrTendril,
}

/// The tokens that can be encountered while parsing input.
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
    /// A `DOCTYPE` token.
    DoctypeToken(Doctype),
    /// A tag was found. This variant covers every kind of tag
    /// (start, end, empty tag, etc.).
    TagToken(Tag),
    /// A processing-instruction token.
    PIToken(Pi),
    /// A comment token.
    CommentToken(StrTendril),
    /// A token representing a run of characters.
    CharacterTokens(StrTendril),
    /// The end of the file was reached.
    EOFToken,
    /// A null character was encountered.
    NullCharacterToken,
    /// An error happened during parsing.
    ParseError(Cow<'static, str>),
}

/// Implemented by types that consume the tokens emitted by the tokenizer.
pub trait TokenSink {
    /// Handles a single token.
    fn process_token(&mut self, token: Token);

    /// Notifies the sink that parsing has finished.
    ///
    /// The default implementation does nothing.
    fn end(&mut self) {
        // Nothing to do unless an implementor overrides this.
    }

    /// Called by the tokenizer after it emits any start tag, giving the
    /// tree builder a chance to switch the tokenizer into another state.
    ///
    /// The default implementation returns `None`, i.e. it requests no
    /// state change.
    fn query_state_change(&mut self) -> Option<states::XmlState> {
        None
    }
}