xml5ever/tokenizer/
mod.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10mod char_ref;
11mod interface;
12mod qname;
13pub mod states;
14
15pub use self::interface::{
16    Doctype, EmptyTag, EndTag, Pi, ShortTag, StartTag, Tag, TagKind, Token, TokenSink,
17};
18pub use crate::{LocalName, Namespace, Prefix};
19
20use crate::macros::time;
21use crate::tendril::StrTendril;
22use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
23use log::debug;
24use markup5ever::{local_name, namespace_prefix, ns, small_char_set, TokenizerResult};
25use std::borrow::Cow::{self, Borrowed};
26use std::cell::{Cell, RefCell, RefMut};
27use std::collections::BTreeMap;
28use std::mem::replace;
29
30use buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
31use char_ref::{CharRef, CharRefTokenizer};
32use qname::QualNameTokenizer;
33use states::{AttrValueKind::*, DoctypeKind, DoctypeKind::*, XmlState};
34
35/// Copy of Tokenizer options, with an impl for `Default`.
36#[derive(Copy, Clone)]
37pub struct XmlTokenizerOpts {
38    /// Report all parse errors described in the spec, at some
39    /// performance penalty?  Default: false
40    pub exact_errors: bool,
41
42    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
43    /// of the stream?  Default: true
44    pub discard_bom: bool,
45
46    /// Keep a record of how long we spent in each state?  Printed
47    /// when `end()` is called.  Default: false
48    pub profile: bool,
49
50    /// Initial state override.  Only the test runner should use
51    /// a non-`None` value!
52    pub initial_state: Option<XmlState>,
53}
54
55fn process_qname(tag_name: StrTendril) -> QualName {
56    // If tag name can't possibly contain full namespace, skip qualified name
57    // parsing altogether. For a tag to have namespace it must look like:
58    //     a:b
59    // Since StrTendril are UTF-8, we know that minimal size in bytes must be
60    // three bytes minimum.
61    let split = if (*tag_name).len() < 3 {
62        None
63    } else {
64        QualNameTokenizer::new((*tag_name).as_bytes()).run()
65    };
66
67    match split {
68        None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
69        Some(col) => {
70            let len = (*tag_name).len() as u32;
71            let prefix = tag_name.subtendril(0, col);
72            let local = tag_name.subtendril(col + 1, len - col - 1);
73            let ns = ns!(); // Actual namespace URL set in XmlTreeBuilder::bind_qname
74            QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
75        },
76    }
77}
78
79fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
80    match *opt_str {
81        Some(ref mut s) => s.push_char(c),
82        None => *opt_str = Some(StrTendril::from_char(c)),
83    }
84}
85
86impl Default for XmlTokenizerOpts {
87    fn default() -> XmlTokenizerOpts {
88        XmlTokenizerOpts {
89            exact_errors: false,
90            discard_bom: true,
91            profile: false,
92            initial_state: None,
93        }
94    }
95}
96/// The Xml tokenizer.
97pub struct XmlTokenizer<Sink> {
98    /// Options controlling the behavior of the tokenizer.
99    opts: XmlTokenizerOpts,
100
101    /// Destination for tokens we emit.
102    pub sink: Sink,
103
104    /// The abstract machine state as described in the spec.
105    state: Cell<XmlState>,
106
107    /// Are we at the end of the file, once buffers have been processed
108    /// completely? This affects whether we will wait for lookahead or not.
109    at_eof: Cell<bool>,
110
111    /// Tokenizer for character references, if we're tokenizing
112    /// one at the moment.
113    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,
114
115    /// Current input character.  Just consumed, may reconsume.
116    current_char: Cell<char>,
117
118    /// Should we reconsume the current input character?
119    reconsume: Cell<bool>,
120
121    /// Did we just consume \r, translating it to \n?  In that case we need
122    /// to ignore the next character if it's \n.
123    ignore_lf: Cell<bool>,
124
125    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
126    /// beginning of the stream.
127    discard_bom: Cell<bool>,
128
129    /// Temporary buffer
130    temp_buf: RefCell<StrTendril>,
131
132    /// Current tag kind.
133    current_tag_kind: Cell<TagKind>,
134
135    /// Current tag name.
136    current_tag_name: RefCell<StrTendril>,
137
138    /// Current tag attributes.
139    current_tag_attrs: RefCell<Vec<Attribute>>,
140
141    /// Current attribute name.
142    current_attr_name: RefCell<StrTendril>,
143
144    /// Current attribute value.
145    current_attr_value: RefCell<StrTendril>,
146
147    current_doctype: RefCell<Doctype>,
148
149    /// Current comment.
150    current_comment: RefCell<StrTendril>,
151
152    /// Current processing instruction target.
153    current_pi_target: RefCell<StrTendril>,
154
155    /// Current processing instruction value.
156    current_pi_data: RefCell<StrTendril>,
157
158    /// Record of how many ns we spent in each state, if profiling is enabled.
159    state_profile: RefCell<BTreeMap<XmlState, u64>>,
160
161    /// Record of how many ns we spent in the token sink.
162    time_in_sink: Cell<u64>,
163}
164
165impl<Sink: TokenSink> XmlTokenizer<Sink> {
166    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
167    pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink> {
168        if opts.profile && cfg!(for_c) {
169            panic!("Can't profile tokenizer when built as a C library");
170        }
171
172        let state = *opts.initial_state.as_ref().unwrap_or(&XmlState::Data);
173        let discard_bom = opts.discard_bom;
174        XmlTokenizer {
175            opts,
176            sink,
177            state: Cell::new(state),
178            char_ref_tokenizer: RefCell::new(None),
179            at_eof: Cell::new(false),
180            current_char: Cell::new('\0'),
181            reconsume: Cell::new(false),
182            ignore_lf: Cell::new(false),
183            temp_buf: RefCell::new(StrTendril::new()),
184            discard_bom: Cell::new(discard_bom),
185            current_tag_kind: Cell::new(StartTag),
186            current_tag_name: RefCell::new(StrTendril::new()),
187            current_tag_attrs: RefCell::new(vec![]),
188            current_attr_name: RefCell::new(StrTendril::new()),
189            current_attr_value: RefCell::new(StrTendril::new()),
190            current_comment: RefCell::new(StrTendril::new()),
191            current_pi_data: RefCell::new(StrTendril::new()),
192            current_pi_target: RefCell::new(StrTendril::new()),
193            current_doctype: RefCell::new(Doctype::default()),
194            state_profile: RefCell::new(BTreeMap::new()),
195            time_in_sink: Cell::new(0),
196        }
197    }
198
199    /// Feed an input string into the tokenizer.
200    pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
201        if input.is_empty() {
202            return TokenizerResult::Done;
203        }
204
205        if self.discard_bom.get() {
206            if let Some(c) = input.peek() {
207                if c == '\u{feff}' {
208                    input.next();
209                }
210            } else {
211                return TokenizerResult::Done;
212            }
213        };
214
215        self.run(input)
216    }
217
218    fn process_token(&self, token: Token) -> ProcessResult<Sink::Handle> {
219        if self.opts.profile {
220            let (result, dt) = time!(self.sink.process_token(token));
221            self.time_in_sink.set(self.time_in_sink.get() + dt);
222            result
223        } else {
224            self.sink.process_token(token)
225        }
226    }
227
228    // Get the next input character, which might be the character
229    // 'c' that we already consumed from the buffers.
230    fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
231        if self.ignore_lf.get() {
232            self.ignore_lf.set(false);
233            if c == '\n' {
234                c = input.next()?;
235            }
236        }
237
238        if c == '\r' {
239            self.ignore_lf.set(true);
240            c = '\n';
241        }
242
243        // Normalize \x00 into \uFFFD
244        if c == '\x00' {
245            c = '\u{FFFD}'
246        }
247
248        // Exclude forbidden Unicode characters
249        if self.opts.exact_errors
250            && match c as u32 {
251                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
252                n if (n & 0xFFFE) == 0xFFFE => true,
253                _ => false,
254            }
255        {
256            let msg = format!("Bad character {c}");
257            self.emit_error(Cow::Owned(msg));
258        }
259
260        debug!("got character {c}");
261        self.current_char.set(c);
262        Some(c)
263    }
264
265    fn bad_eof_error(&self) {
266        let msg = if self.opts.exact_errors {
267            Cow::from(format!("Saw EOF in state {:?}", self.state))
268        } else {
269            Cow::from("Unexpected EOF")
270        };
271        self.emit_error(msg);
272    }
273
274    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
275        // Bail to the slow path for various corner cases.
276        // This means that `FromSet` can contain characters not in the set!
277        // It shouldn't matter because the fallback `FromSet` case should
278        // always do the same thing as the `NotFromSet` case.
279        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
280            return self.get_char(input).map(FromSet);
281        }
282
283        let d = input.pop_except_from(set);
284        debug!("got characters {d:?}");
285        match d {
286            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
287
288            // NB: We don't set self.current_char for a run of characters not
289            // in the set.  It shouldn't matter for the codepaths that use
290            // this.
291            _ => d,
292        }
293    }
294
295    // Check if the next characters are an ASCII case-insensitive match.  See
296    // BufferQueue::eat.
297    //
298    // NB: this doesn't do input stream preprocessing or set the current input
299    // character.
300    fn eat(&self, input: &BufferQueue, pat: &str) -> Option<bool> {
301        input.push_front(replace(&mut *self.temp_buf.borrow_mut(), StrTendril::new()));
302        match input.eat(pat, u8::eq_ignore_ascii_case) {
303            None if self.at_eof.get() => Some(false),
304            None => {
305                let mut temp_buf = self.temp_buf.borrow_mut();
306                while let Some(data) = input.next() {
307                    temp_buf.push_char(data);
308                }
309                None
310            },
311            Some(matched) => Some(matched),
312        }
313    }
314
315    /// Run the state machine for as long as we can.
316    pub fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
317        if self.opts.profile {
318            loop {
319                let state = self.state.get();
320                let old_sink = self.time_in_sink.get();
321                let (run, mut dt) = time!(self.step(input));
322                dt -= self.time_in_sink.get() - old_sink;
323                let new = match self.state_profile.borrow_mut().get_mut(&state) {
324                    Some(x) => {
325                        *x += dt;
326                        false
327                    },
328                    None => true,
329                };
330                if new {
331                    // do this here because of borrow shenanigans
332                    self.state_profile.borrow_mut().insert(state, dt);
333                }
334                match run {
335                    ProcessResult::Continue => continue,
336                    ProcessResult::Done => return TokenizerResult::Done,
337                    ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
338                }
339            }
340        } else {
341            loop {
342                match self.step(input) {
343                    ProcessResult::Continue => continue,
344                    ProcessResult::Done => return TokenizerResult::Done,
345                    ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
346                }
347            }
348        }
349    }
350
351    //§ tokenization
352    // Get the next input character, if one is available.
353    fn get_char(&self, input: &BufferQueue) -> Option<char> {
354        if self.reconsume.get() {
355            self.reconsume.set(false);
356            Some(self.current_char.get())
357        } else {
358            input
359                .next()
360                .and_then(|c| self.get_preprocessed_char(c, input))
361        }
362    }
363
364    fn bad_char_error(&self) {
365        let msg = if self.opts.exact_errors {
366            let c = self.current_char.get();
367            let state = self.state.get();
368            Cow::from(format!("Saw {c} in state {state:?}"))
369        } else {
370            Cow::from("Bad character")
371        };
372        self.emit_error(msg);
373    }
374
375    fn discard_tag(&self) {
376        *self.current_tag_name.borrow_mut() = StrTendril::new();
377        *self.current_tag_attrs.borrow_mut() = Vec::new();
378    }
379
380    fn create_tag(&self, kind: TagKind, c: char) {
381        self.discard_tag();
382        self.current_tag_name.borrow_mut().push_char(c);
383        self.current_tag_kind.set(kind);
384    }
385
386    // This method creates a PI token and
387    // sets its target to given char
388    fn create_pi(&self, c: char) {
389        *self.current_pi_target.borrow_mut() = StrTendril::new();
390        *self.current_pi_data.borrow_mut() = StrTendril::new();
391        self.current_pi_target.borrow_mut().push_char(c);
392    }
393
394    fn emit_char(&self, c: char) {
395        self.process_token(Token::Characters(StrTendril::from_char(match c {
396            '\0' => '\u{FFFD}',
397            c => c,
398        })));
399    }
400
401    fn emit_short_tag(&self) -> ProcessResult<Sink::Handle> {
402        self.current_tag_kind.set(ShortTag);
403        *self.current_tag_name.borrow_mut() = StrTendril::new();
404        self.emit_current_tag()
405    }
406
407    fn emit_empty_tag(&self) -> ProcessResult<Sink::Handle> {
408        self.current_tag_kind.set(EmptyTag);
409        self.emit_current_tag()
410    }
411
412    fn set_empty_tag(&self) {
413        self.current_tag_kind.set(EmptyTag);
414    }
415
416    fn emit_start_tag(&self) -> ProcessResult<Sink::Handle> {
417        self.current_tag_kind.set(StartTag);
418        self.emit_current_tag()
419    }
420
421    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
422        self.finish_attribute();
423
424        let qname = process_qname(replace(
425            &mut *self.current_tag_name.borrow_mut(),
426            StrTendril::new(),
427        ));
428
429        match self.current_tag_kind.get() {
430            StartTag | EmptyTag => {},
431            EndTag => {
432                if !self.current_tag_attrs.borrow().is_empty() {
433                    self.emit_error(Borrowed("Attributes on an end tag"));
434                }
435            },
436            ShortTag => {
437                if !self.current_tag_attrs.borrow().is_empty() {
438                    self.emit_error(Borrowed("Attributes on a short tag"));
439                }
440            },
441        }
442
443        let token = Token::Tag(Tag {
444            kind: self.current_tag_kind.get(),
445            name: qname,
446            attrs: self.current_tag_attrs.take(),
447        });
448
449        self.process_token(token)
450    }
451
452    // The string must not contain '\0'!
453    fn emit_chars(&self, b: StrTendril) {
454        self.process_token(Token::Characters(b));
455    }
456
457    // Emits the current Processing Instruction
458    fn emit_pi(&self) -> ProcessResult<<Sink as TokenSink>::Handle> {
459        let token = Token::ProcessingInstruction(Pi {
460            target: replace(&mut *self.current_pi_target.borrow_mut(), StrTendril::new()),
461            data: replace(&mut *self.current_pi_data.borrow_mut(), StrTendril::new()),
462        });
463        self.process_token(token)
464    }
465
466    fn consume_char_ref(&self, addnl_allowed: Option<char>) {
467        // NB: The char ref tokenizer assumes we have an additional allowed
468        // character iff we're tokenizing in an attribute value.
469        *self.char_ref_tokenizer.borrow_mut() =
470            Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
471    }
472
473    fn emit_eof(&self) {
474        self.process_token(Token::EndOfFile);
475    }
476
477    fn emit_error(&self, error: Cow<'static, str>) {
478        self.process_token(Token::ParseError(error));
479    }
480
481    fn emit_current_comment(&self) {
482        let comment = self.current_comment.take();
483        self.process_token(Token::Comment(comment));
484    }
485
486    fn emit_current_doctype(&self) {
487        let doctype = self.current_doctype.take();
488        self.process_token(Token::Doctype(doctype));
489    }
490
491    fn doctype_id(&self, kind: DoctypeKind) -> RefMut<'_, Option<StrTendril>> {
492        let current_doctype = self.current_doctype.borrow_mut();
493        match kind {
494            DoctypeKind::Public => RefMut::map(current_doctype, |d| &mut d.public_id),
495            DoctypeKind::System => RefMut::map(current_doctype, |d| &mut d.system_id),
496        }
497    }
498
499    fn clear_doctype_id(&self, kind: DoctypeKind) {
500        let mut id = self.doctype_id(kind);
501        match *id {
502            Some(ref mut s) => s.clear(),
503            None => *id = Some(StrTendril::new()),
504        }
505    }
506
507    fn peek(&self, input: &BufferQueue) -> Option<char> {
508        if self.reconsume.get() {
509            Some(self.current_char.get())
510        } else {
511            input.peek()
512        }
513    }
514
515    fn discard_char(&self, input: &BufferQueue) {
516        let c = self.get_char(input);
517        assert!(c.is_some());
518    }
519
520    fn unconsume(&self, input: &BufferQueue, buf: StrTendril) {
521        input.push_front(buf);
522    }
523}
524
// Shorthand for common state machine behaviors.
//
// Each arm maps a short command (as used inside `go!`) onto the matching
// tokenizer method or field update on `$me`. `discard_tag` takes no argument
// and `discard_char` takes the input queue, matching the signatures of
// `XmlTokenizer::discard_tag` and `XmlTokenizer::discard_char`.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c)                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c)                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.borrow_mut().push_char($c)     );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag()                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input)                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.borrow_mut().push_char($c)             );
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf()                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf()                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c)                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.borrow_mut().push_char($c)    );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.borrow_mut().push_char($c)   );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.borrow_mut().push_char($c)      );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.borrow_mut().push_slice($c)     );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment()                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.borrow_mut().clear()            );
    ( $me:ident : create_doctype                   ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c)            );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k)                            );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype()                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error()                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error()                                 );
    ( $me:ident : create_pi $c:expr                ) => ( $me.create_pi($c)                                   );
    ( $me:ident : push_pi_target $c:expr           ) => ( $me.current_pi_target.borrow_mut().push_char($c)    );
    ( $me:ident : push_pi_data $c:expr             ) => ( $me.current_pi_data.borrow_mut().push_char($c)      );
    ( $me:ident : set_empty_tag                    ) => ( $me.set_empty_tag()                                 );
);
555
// Optional tracing of tokenizer actions. Logging every shorthand command
// adds significant bloat and compile time, so the logging variant is only
// compiled in when the `trace_tokenizer` feature is enabled.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace {
    ($me:ident : $($cmds:tt)*) => {{
        debug!("  {:?}", stringify!($($cmds)*));
        shorthand!($me : $($cmds)*);
    }};
}

// Without tracing, a command is forwarded to `shorthand!` unchanged.
#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace {
    ($me:ident : $($cmds:tt)*) => {
        shorthand!($me: $($cmds)*)
    };
}
566
// A small DSL for sequencing shorthand actions. Commands are separated by
// `;`; the final command may be a terminal action (state change, token
// emission, ...) that returns from the enclosing function.
macro_rules! go {
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need, so there is
    // one arm per command length (one to four tokens before the `;`).

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => {{ sh_trace!($me: $a);          go!($me: $($rest)*); }};
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => {{ sh_trace!($me: $a $b);       go!($me: $($rest)*); }};
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => {{ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); }};
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => {{ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); }};

    // The arms below can only come at the end of a sequence.

    // Switch to another state and keep the state machine running.
    ( $me:ident : to $s:ident                    ) => {{ $me.state.set(XmlState::$s); return ProcessResult::Continue;           }};
    ( $me:ident : to $s:ident $k1:expr           ) => {{ $me.state.set(XmlState::$s($k1)); return ProcessResult::Continue;      }};
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => {{ $me.state.set(XmlState::$s($k1($k2))); return ProcessResult::Continue; }};

    // Like `to`, but the current character is processed again in the new state.
    ( $me:ident : reconsume $s:ident                    ) => {{ $me.reconsume.set(true); go!($me: to $s);         }};
    ( $me:ident : reconsume $s:ident $k1:expr           ) => {{ $me.reconsume.set(true); go!($me: to $s $k1);     }};
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => {{ $me.reconsume.set(true); go!($me: to $s $k1 $k2); }};

    // Start tokenizing a character reference, optionally with an
    // additional allowed character.
    ( $me:ident : consume_char_ref             ) => {{ $me.consume_char_ref(None); return ProcessResult::Continue;         }};
    ( $me:ident : consume_char_ref $addnl:expr ) => {{ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; }};

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => {{
        $me.state.set(XmlState::$s);
        return $me.emit_current_tag();
    }};

    // We have special cases when dealing with empty and short tags in Xml
    ( $me:ident : emit_short_tag $s:ident ) => {{
        $me.state.set(XmlState::$s);
        return $me.emit_short_tag();
    }};

    ( $me:ident : emit_empty_tag $s:ident ) => {{
        $me.state.set(XmlState::$s);
        return $me.emit_empty_tag();
    }};

    ( $me:ident : emit_start_tag $s:ident ) => {{
        $me.state.set(XmlState::$s);
        return $me.emit_start_tag();
    }};

    ( $me:ident : emit_pi $s:ident ) => {{
        $me.state.set(XmlState::$s);
        return $me.emit_pi();
    }};

    // Emit the end-of-file token and stop.
    ( $me:ident : eof ) => {{ $me.emit_eof(); return ProcessResult::Done; }};

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => { sh_trace!($me: $($cmd)+) };

    // or nothing.
    ( $me:ident : ) => { () };
}
625
// Fetch the next character from `$me.get_char($input)`.
// This has to be a macro because, when no character is available, it
// returns `ProcessResult::Done` from the function where it is used.
macro_rules! get_char {
    ($me:expr, $input:expr) => {
        match $me.get_char($input) {
            Some(character) => character,
            None => return ProcessResult::Done,
        }
    };
}
634
// Evaluate `$me.pop_except_from($input, $set)`, returning
// `ProcessResult::Done` from the enclosing function when it yields nothing.
macro_rules! pop_except_from {
    ($me:expr, $input:expr, $set:expr) => {
        match $me.pop_except_from($input, $set) {
            Some(popped_element) => popped_element,
            None => return ProcessResult::Done,
        }
    };
}
641
// Evaluate `$me.eat($input, $pat)`, returning `ProcessResult::Done` from
// the enclosing function when the result is `None`.
macro_rules! eat {
    ($me:expr, $input:expr, $pat:expr) => {
        match $me.eat($input, $pat) {
            Some(value) => value,
            None => return ProcessResult::Done,
        }
    };
}
648
/// Outcome of one tokenization step.
pub enum ProcessResult<Handle> {
    /// Tokenization cannot proceed until more input is supplied.
    Done,
    /// Tokenization may be resumed right away.
    Continue,
    /// A script element was encountered; it has to be executed before
    /// tokenization can go on.
    Script(Handle),
}
659
660impl<Sink: TokenSink> XmlTokenizer<Sink> {
661    // Run the state machine for a while.
662    #[allow(clippy::never_loop)]
663    fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
664        if self.char_ref_tokenizer.borrow().is_some() {
665            return self.step_char_ref_tokenizer(input);
666        }
667
668        debug!("processing in state {:?}", self.state);
669        match self.state.get() {
670            //§ data-state
671            XmlState::Data => loop {
672                match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
673                    FromSet('&') => go!(self: consume_char_ref),
674                    FromSet('<') => go!(self: to TagState),
675                    FromSet(c) => go!(self: emit c),
676                    NotFromSet(b) => self.emit_chars(b),
677                }
678            },
679            //§ tag-state
680            XmlState::TagState => loop {
681                match get_char!(self, input) {
682                    '!' => go!(self: to MarkupDecl),
683                    '/' => go!(self: to EndTagState),
684                    '?' => go!(self: to Pi),
685                    '\t' | '\n' | ' ' | ':' | '<' | '>' => {
686                        go!(self: error; emit '<'; reconsume Data)
687                    },
688                    cl => go!(self: create_tag StartTag cl; to TagName),
689                }
690            },
691            //§ end-tag-state
692            XmlState::EndTagState => loop {
693                match get_char!(self, input) {
694                    '>' => go!(self:  emit_short_tag Data),
695                    '\t' | '\n' | ' ' | '<' | ':' => {
696                        go!(self: error; emit '<'; emit '/'; reconsume Data)
697                    },
698                    cl => go!(self: create_tag EndTag cl; to EndTagName),
699                }
700            },
701            //§ end-tag-name-state
702            XmlState::EndTagName => loop {
703                match get_char!(self, input) {
704                    '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
705                    '/' => go!(self: error; to EndTagNameAfter),
706                    '>' => go!(self: emit_tag Data),
707                    cl => go!(self: push_tag cl),
708                }
709            },
710            //§ end-tag-name-after-state
711            XmlState::EndTagNameAfter => loop {
712                match get_char!(self, input) {
713                    '>' => go!(self: emit_tag Data),
714                    '\t' | '\n' | ' ' => (),
715                    _ => self.emit_error(Borrowed("Unexpected element in tag name")),
716                }
717            },
718            //§ pi-state
719            XmlState::Pi => loop {
720                match get_char!(self, input) {
721                    '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
722                    cl => go!(self: create_pi cl; to PiTarget),
723                }
724            },
725            //§ pi-target-state
726            XmlState::PiTarget => loop {
727                match get_char!(self, input) {
728                    '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
729                    '?' => go!(self: to PiAfter),
730                    cl => go!(self: push_pi_target cl),
731                }
732            },
733            //§ pi-target-after-state
734            XmlState::PiTargetAfter => loop {
735                match get_char!(self, input) {
736                    '\t' | '\n' | ' ' => (),
737                    _ => go!(self: reconsume PiData),
738                }
739            },
740            //§ pi-data-state
741            XmlState::PiData => loop {
742                match get_char!(self, input) {
743                    '?' => go!(self: to PiAfter),
744                    cl => go!(self: push_pi_data cl),
745                }
746            },
747            //§ pi-after-state
748            XmlState::PiAfter => loop {
749                match get_char!(self, input) {
750                    '>' => go!(self: emit_pi Data),
751                    '?' => go!(self: to PiAfter),
752                    cl => go!(self: push_pi_data cl),
753                }
754            },
755            //§ markup-declaration-state
756            XmlState::MarkupDecl => loop {
757                if eat!(self, input, "--") {
758                    go!(self: clear_comment; to CommentStart);
759                } else if eat!(self, input, "[CDATA[") {
760                    go!(self: to Cdata);
761                } else if eat!(self, input, "DOCTYPE") {
762                    go!(self: to Doctype);
763                } else {
764                    // FIXME: 'error' gives wrong message
765                    go!(self: error; to BogusComment);
766                }
767            },
768            //§ comment-start-state
769            XmlState::CommentStart => loop {
770                match get_char!(self, input) {
771                    '-' => go!(self: to CommentStartDash),
772                    '>' => go!(self: error; emit_comment; to Data),
773                    _ => go!(self: reconsume Comment),
774                }
775            },
776            //§ comment-start-dash-state
777            XmlState::CommentStartDash => loop {
778                match get_char!(self, input) {
779                    '-' => go!(self: to CommentEnd),
780                    '>' => go!(self: error; emit_comment; to Data),
781                    _ => go!(self: push_comment '-'; reconsume Comment),
782                }
783            },
784            //§ comment-state
785            XmlState::Comment => loop {
786                match get_char!(self, input) {
787                    '<' => go!(self: push_comment '<'; to CommentLessThan),
788                    '-' => go!(self: to CommentEndDash),
789                    c => go!(self: push_comment c),
790                }
791            },
792            //§ comment-less-than-sign-state
793            XmlState::CommentLessThan => loop {
794                match get_char!(self, input) {
795                    '!' => go!(self: push_comment '!';to CommentLessThanBang),
796                    '<' => go!(self: push_comment '<'),
797                    _ => go!(self: reconsume Comment),
798                }
799            },
800            //§ comment-less-than-sign-bang-state
801            XmlState::CommentLessThanBang => loop {
802                match get_char!(self, input) {
803                    '-' => go!(self: to CommentLessThanBangDash),
804                    _ => go!(self: reconsume Comment),
805                }
806            },
807            //§ comment-less-than-sign-bang-dash-state
808            XmlState::CommentLessThanBangDash => loop {
809                match get_char!(self, input) {
810                    '-' => go!(self: to CommentLessThanBangDashDash),
811                    _ => go!(self: reconsume CommentEndDash),
812                }
813            },
814            //§ comment-less-than-sign-bang-dash-dash-state
815            XmlState::CommentLessThanBangDashDash => loop {
816                match get_char!(self, input) {
817                    '>' => go!(self: reconsume CommentEnd),
818                    _ => go!(self: error; reconsume CommentEnd),
819                }
820            },
821            //§ comment-end-dash-state
822            XmlState::CommentEndDash => loop {
823                match get_char!(self, input) {
824                    '-' => go!(self: to CommentEnd),
825                    _ => go!(self: push_comment '-'; reconsume Comment),
826                }
827            },
828            //§ comment-end-state
829            XmlState::CommentEnd => loop {
830                match get_char!(self, input) {
831                    '>' => go!(self: emit_comment; to Data),
832                    '!' => go!(self: to CommentEndBang),
833                    '-' => go!(self: push_comment '-'),
834                    _ => go!(self: append_comment "--"; reconsume Comment),
835                }
836            },
837            //§ comment-end-bang-state
838            XmlState::CommentEndBang => loop {
839                match get_char!(self, input) {
840                    '-' => go!(self: append_comment "--!"; to CommentEndDash),
841                    '>' => go!(self: error; emit_comment; to Data),
842                    _ => go!(self: append_comment "--!"; reconsume Comment),
843                }
844            },
845            //§ bogus-comment-state
846            XmlState::BogusComment => loop {
847                match get_char!(self, input) {
848                    '>' => go!(self: emit_comment; to Data),
849                    c => go!(self: push_comment c),
850                }
851            },
852            //§ cdata-state
853            XmlState::Cdata => loop {
854                match get_char!(self, input) {
855                    ']' => go!(self: to CdataBracket),
856                    cl => go!(self: emit cl),
857                }
858            },
859            //§ cdata-bracket-state
860            XmlState::CdataBracket => loop {
861                match get_char!(self, input) {
862                    ']' => go!(self: to CdataEnd),
863                    cl => go!(self: emit ']'; emit cl; to Cdata),
864                }
865            },
866            //§ cdata-end-state
867            XmlState::CdataEnd => loop {
868                match get_char!(self, input) {
869                    '>' => go!(self: to Data),
870                    ']' => go!(self: emit ']'),
871                    cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
872                }
873            },
874            //§ tag-name-state
875            XmlState::TagName => loop {
876                match get_char!(self, input) {
877                    '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
878                    '>' => go!(self: emit_tag Data),
879                    '/' => go!(self: set_empty_tag; to TagEmpty),
880                    cl => go!(self: push_tag cl),
881                }
882            },
883            //§ empty-tag-state
884            XmlState::TagEmpty => loop {
885                match get_char!(self, input) {
886                    '>' => go!(self: emit_empty_tag Data),
887                    _ => go!(self: reconsume TagAttrValueBefore),
888                }
889            },
890            //§ tag-attribute-name-before-state
891            XmlState::TagAttrNameBefore => loop {
892                match get_char!(self, input) {
893                    '\t' | '\n' | ' ' => (),
894                    '>' => go!(self: emit_tag Data),
895                    '/' => go!(self: set_empty_tag; to TagEmpty),
896                    ':' => go!(self: error),
897                    cl => go!(self: create_attr cl; to TagAttrName),
898                }
899            },
900            //§ tag-attribute-name-state
901            XmlState::TagAttrName => loop {
902                match get_char!(self, input) {
903                    '=' => go!(self: to TagAttrValueBefore),
904                    '>' => go!(self: emit_tag Data),
905                    '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
906                    '/' => go!(self: set_empty_tag; to TagEmpty),
907                    cl => go!(self: push_name cl),
908                }
909            },
910            //§ tag-attribute-name-after-state
911            XmlState::TagAttrNameAfter => loop {
912                match get_char!(self, input) {
913                    '\t' | '\n' | ' ' => (),
914                    '=' => go!(self: to TagAttrValueBefore),
915                    '>' => go!(self: emit_tag Data),
916                    '/' => go!(self: set_empty_tag; to TagEmpty),
917                    cl => go!(self: create_attr cl; to TagAttrName),
918                }
919            },
920            //§ tag-attribute-value-before-state
921            XmlState::TagAttrValueBefore => loop {
922                match get_char!(self, input) {
923                    '\t' | '\n' | ' ' => (),
924                    '"' => go!(self: to TagAttrValue DoubleQuoted),
925                    '\'' => go!(self: to TagAttrValue SingleQuoted),
926                    '&' => go!(self: reconsume TagAttrValue(Unquoted)),
927                    '>' => go!(self: emit_tag Data),
928                    cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
929                }
930            },
931            //§ tag-attribute-value-double-quoted-state
932            XmlState::TagAttrValue(DoubleQuoted) => loop {
933                match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
934                    FromSet('"') => go!(self: to TagAttrNameBefore),
935                    FromSet('&') => go!(self: consume_char_ref '"' ),
936                    FromSet(c) => go!(self: push_value c),
937                    NotFromSet(ref b) => go!(self: append_value b),
938                }
939            },
940            //§ tag-attribute-value-single-quoted-state
941            XmlState::TagAttrValue(SingleQuoted) => loop {
942                match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
943                    FromSet('\'') => go!(self: to TagAttrNameBefore),
944                    FromSet('&') => go!(self: consume_char_ref '\''),
945                    FromSet(c) => go!(self: push_value c),
946                    NotFromSet(ref b) => go!(self: append_value b),
947                }
948            },
            //§ tag-attribute-value-unquoted-state
950            XmlState::TagAttrValue(Unquoted) => loop {
951                match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
952                    FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
953                    FromSet('&') => go!(self: consume_char_ref),
954                    FromSet('>') => go!(self: emit_tag Data),
955                    FromSet(c) => go!(self: push_value c),
956                    NotFromSet(ref b) => go!(self: append_value b),
957                }
958            },
959
960            //§ doctype-state
961            XmlState::Doctype => loop {
962                match get_char!(self, input) {
963                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
964                    _ => go!(self: error; reconsume BeforeDoctypeName),
965                }
966            },
967            //§ before-doctype-name-state
968            XmlState::BeforeDoctypeName => loop {
969                match get_char!(self, input) {
970                    '\t' | '\n' | '\x0C' | ' ' => (),
971                    '>' => go!(self: error; emit_doctype; to Data),
972                    c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
973                                  to DoctypeName),
974                }
975            },
976            //§ doctype-name-state
977            XmlState::DoctypeName => loop {
978                match get_char!(self, input) {
979                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
980                    '>' => go!(self: emit_doctype; to Data),
981                    c => go!(self: push_doctype_name (c.to_ascii_lowercase());
982                                  to DoctypeName),
983                }
984            },
985            //§ after-doctype-name-state
986            XmlState::AfterDoctypeName => loop {
987                if eat!(self, input, "public") {
988                    go!(self: to AfterDoctypeKeyword Public);
989                } else if eat!(self, input, "system") {
990                    go!(self: to AfterDoctypeKeyword System);
991                } else {
992                    match get_char!(self, input) {
993                        '\t' | '\n' | '\x0C' | ' ' => (),
994                        '>' => go!(self: emit_doctype; to Data),
995                        _ => go!(self: error; to BogusDoctype),
996                    }
997                }
998            },
999            //§ after-doctype-public-keyword-state
1000            XmlState::AfterDoctypeKeyword(Public) => loop {
1001                match get_char!(self, input) {
1002                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
1003                    '"' => {
1004                        go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
1005                    },
1006                    '\'' => {
1007                        go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
1008                    },
1009                    '>' => go!(self: error; emit_doctype; to Data),
1010                    _ => go!(self: error; to BogusDoctype),
1011                }
1012            },
1013            //§ after-doctype-system-keyword-state
1014            XmlState::AfterDoctypeKeyword(System) => loop {
1015                match get_char!(self, input) {
1016                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
1017                    '"' => {
1018                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1019                    },
1020                    '\'' => {
1021                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1022                    },
1023                    '>' => go!(self: error; emit_doctype; to Data),
1024                    _ => go!(self: error; to BogusDoctype),
1025                }
1026            },
1027            //§ before_doctype_public_identifier_state before_doctype_system_identifier_state
1028            XmlState::BeforeDoctypeIdentifier(kind) => loop {
1029                match get_char!(self, input) {
1030                    '\t' | '\n' | '\x0C' | ' ' => (),
1031                    '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1032                    '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1033                    '>' => go!(self: error; emit_doctype; to Data),
1034                    _ => go!(self: error; to BogusDoctype),
1035                }
1036            },
1037            //§ doctype_public_identifier_double_quoted_state doctype_system_identifier_double_quoted_state
1038            XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
1039                match get_char!(self, input) {
1040                    '"' => go!(self: to AfterDoctypeIdentifier kind),
1041                    '>' => go!(self: error; emit_doctype; to Data),
1042                    c => go!(self: push_doctype_id kind c),
1043                }
1044            },
1045            //§ doctype_public_identifier_single_quoted_state doctype_system_identifier_single_quoted_state
1046            XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
1047                match get_char!(self, input) {
1048                    '\'' => go!(self: to AfterDoctypeIdentifier kind),
1049                    '>' => go!(self: error; emit_doctype; to Data),
1050                    c => go!(self: push_doctype_id kind c),
1051                }
1052            },
            //§ after_doctype_public_identifier_state
1054            XmlState::AfterDoctypeIdentifier(Public) => loop {
1055                match get_char!(self, input) {
1056                    '\t' | '\n' | '\x0C' | ' ' => {
1057                        go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1058                    },
1059                    '\'' => {
1060                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
1061                    },
1062                    '"' => {
1063                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
1064                    },
1065                    '>' => go!(self: emit_doctype; to Data),
1066                    _ => go!(self: error; to BogusDoctype),
1067                }
1068            },
            //§ after_doctype_system_identifier_state
1070            XmlState::AfterDoctypeIdentifier(System) => loop {
1071                match get_char!(self, input) {
1072                    '\t' | '\n' | '\x0C' | ' ' => (),
1073                    '>' => go!(self: emit_doctype; to Data),
1074                    _ => go!(self: error; to BogusDoctype),
1075                }
1076            },
1077            //§ between_doctype_public_and_system_identifier_state
1078            XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
1079                match get_char!(self, input) {
1080                    '\t' | '\n' | '\x0C' | ' ' => (),
1081                    '>' => go!(self: emit_doctype; to Data),
1082                    '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
1083                    '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
1084                    _ => go!(self: error; to BogusDoctype),
1085                }
1086            },
1087            //§ bogus_doctype_state
1088            XmlState::BogusDoctype => loop {
1089                if get_char!(self, input) == '>' {
1090                    go!(self: emit_doctype; to Data);
1091                }
1092            },
1093        }
1094    }
1095
1096    /// Indicate that we have reached the end of the input.
1097    pub fn end(&self) {
1098        // Handle EOF in the char ref sub-tokenizer, if there is one.
1099        // Do this first because it might un-consume stuff.
1100        let input = BufferQueue::default();
1101        match self.char_ref_tokenizer.take() {
1102            None => (),
1103            Some(mut tok) => {
1104                tok.end_of_file(self, &input);
1105                self.process_char_ref(tok.get_result());
1106            },
1107        }
1108
1109        // Process all remaining buffered input.
1110        // If we're waiting for lookahead, we're not gonna get it.
1111        self.at_eof.set(true);
1112        let _ = self.run(&input);
1113
1114        loop {
1115            if !matches!(self.eof_step(), ProcessResult::Continue) {
1116                break;
1117            }
1118        }
1119
1120        self.sink.end();
1121
1122        if self.opts.profile {
1123            self.dump_profile();
1124        }
1125    }
1126
1127    #[cfg(for_c)]
1128    fn dump_profile(&self) {
1129        unreachable!();
1130    }
1131
1132    #[cfg(not(for_c))]
1133    fn dump_profile(&self) {
1134        let mut results: Vec<(XmlState, u64)> = self
1135            .state_profile
1136            .borrow()
1137            .iter()
1138            .map(|(s, t)| (*s, *t))
1139            .collect();
1140        results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1141
1142        let total: u64 = results
1143            .iter()
1144            .map(|&(_, t)| t)
1145            .fold(0, ::std::ops::Add::add);
1146        debug!("\nTokenizer profile, in nanoseconds");
1147        debug!(
1148            "\n{:12}         total in token sink",
1149            self.time_in_sink.get()
1150        );
1151        debug!("\n{total:12}         total in tokenizer");
1152
1153        for (k, v) in results.into_iter() {
1154            let pct = 100.0 * (v as f64) / (total as f64);
1155            debug!("{v:12}  {pct:4.1}%  {k:?}");
1156        }
1157    }
1158
1159    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1160        debug!("processing EOF in state {:?}", self.state.get());
1161        match self.state.get() {
1162            XmlState::Data => go!(self: eof),
1163            XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
1164                go!(self: reconsume Comment)
1165            },
1166            XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
1167            XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
1168            XmlState::CommentStartDash
1169            | XmlState::Comment
1170            | XmlState::CommentEndDash
1171            | XmlState::CommentEnd
1172            | XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
1173            XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
1174            XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
1175            XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
1176            XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
1177                go!(self: error_eof; to Data)
1178            },
1179            XmlState::Pi => go!(self: error_eof; to BogusComment),
1180            XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
1181            XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
1182            XmlState::TagName
1183            | XmlState::TagAttrNameBefore
1184            | XmlState::EndTagName
1185            | XmlState::TagAttrNameAfter
1186            | XmlState::EndTagNameAfter
1187            | XmlState::TagAttrValueBefore
1188            | XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
1189            XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
1190            XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
1191            XmlState::BeforeDoctypeName
1192            | XmlState::Doctype
1193            | XmlState::DoctypeName
1194            | XmlState::AfterDoctypeName
1195            | XmlState::AfterDoctypeKeyword(_)
1196            | XmlState::BeforeDoctypeIdentifier(_)
1197            | XmlState::AfterDoctypeIdentifier(_)
1198            | XmlState::DoctypeIdentifierSingleQuoted(_)
1199            | XmlState::DoctypeIdentifierDoubleQuoted(_)
1200            | XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
1201                go!(self: error_eof; emit_doctype; to Data)
1202            },
1203            XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
1204            XmlState::BogusComment => go!(self: emit_comment; to Data),
1205        }
1206    }
1207
1208    fn process_char_ref(&self, char_ref: CharRef) {
1209        let CharRef {
1210            mut chars,
1211            mut num_chars,
1212        } = char_ref;
1213
1214        if num_chars == 0 {
1215            chars[0] = '&';
1216            num_chars = 1;
1217        }
1218
1219        for i in 0..num_chars {
1220            let c = chars[i as usize];
1221            match self.state.get() {
1222                XmlState::Data | XmlState::Cdata => go!(self: emit c),
1223
1224                XmlState::TagAttrValue(_) => go!(self: push_value c),
1225
1226                _ => panic!(
1227                    "state {:?} should not be reachable in process_char_ref",
1228                    self.state.get()
1229                ),
1230            }
1231        }
1232    }
1233
1234    fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1235        let mut tok = self.char_ref_tokenizer.take().unwrap();
1236        let outcome = tok.step(self, input);
1237
1238        let progress = match outcome {
1239            char_ref::Done => {
1240                self.process_char_ref(tok.get_result());
1241                return ProcessResult::Continue;
1242            },
1243
1244            char_ref::Stuck => ProcessResult::Done,
1245            char_ref::Progress => ProcessResult::Continue,
1246        };
1247
1248        *self.char_ref_tokenizer.borrow_mut() = Some(tok);
1249        progress
1250    }
1251
1252    fn finish_attribute(&self) {
1253        if self.current_attr_name.borrow().is_empty() {
1254            return;
1255        }
1256
1257        // Check for a duplicate attribute.
1258        // FIXME: the spec says we should error as soon as the name is finished.
1259        // FIXME: linear time search, do we care?
1260        let dup = {
1261            let current_attr_name = self.current_attr_name.borrow();
1262            let name = &current_attr_name[..];
1263            self.current_tag_attrs
1264                .borrow()
1265                .iter()
1266                .any(|a| &*a.name.local == name)
1267        };
1268
1269        if dup {
1270            self.emit_error(Borrowed("Duplicate attribute"));
1271            self.current_attr_name.borrow_mut().clear();
1272            self.current_attr_value.borrow_mut().clear();
1273        } else {
1274            let qname = process_qname(replace(
1275                &mut self.current_attr_name.borrow_mut(),
1276                StrTendril::new(),
1277            ));
1278            let attr = Attribute {
1279                name: qname.clone(),
1280                value: replace(&mut self.current_attr_value.borrow_mut(), StrTendril::new()),
1281            };
1282
1283            if qname.local == local_name!("xmlns")
1284                || qname.prefix == Some(namespace_prefix!("xmlns"))
1285            {
1286                self.current_tag_attrs.borrow_mut().insert(0, attr);
1287            } else {
1288                self.current_tag_attrs.borrow_mut().push(attr);
1289            }
1290        }
1291    }
1292
1293    fn create_attribute(&self, c: char) {
1294        self.finish_attribute();
1295
1296        self.current_attr_name.borrow_mut().push_char(c);
1297    }
1298}
1299
#[cfg(test)]
mod test {

    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    /// A name with exactly one inner colon is split into prefix and local part.
    #[test]
    fn simple_namespace() {
        let cases = [("prefix:local", "prefix", "local"), ("a:b", "a", "b")];

        for (raw, prefix, local) in cases {
            let qname = process_qname(raw.to_tendril());
            assert_eq!(qname.prefix, Some(Prefix::from(prefix)));
            assert_eq!(qname.local, LocalName::from(local));
        }
    }

    /// Malformed names get no prefix and keep the whole input as local name.
    #[test]
    fn wrong_namespaces() {
        let cases = [":local", "::local", "a::local", "fake::", ":::", ":a:b:"];

        for raw in cases {
            let qname = process_qname(raw.to_tendril());
            assert_eq!(qname.prefix, None);
            assert_eq!(qname.local, LocalName::from(raw));
        }
    }
}