xml5ever/tokenizer/
mod.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10mod char_ref;
11mod interface;
12mod qname;
13pub mod states;
14
15pub use self::interface::{
16    Doctype, EmptyTag, EndTag, Pi, ShortTag, StartTag, Tag, TagKind, Token, TokenSink,
17};
18pub use crate::{LocalName, Namespace, Prefix};
19
20use crate::macros::{time, unwrap_or_return};
21use crate::tendril::StrTendril;
22use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
23use log::debug;
24use markup5ever::{local_name, namespace_prefix, ns, small_char_set, TokenizerResult};
25use std::borrow::Cow::{self, Borrowed};
26use std::cell::{Cell, RefCell, RefMut};
27use std::collections::BTreeMap;
28use std::mem::replace;
29
30use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
31use self::char_ref::{CharRef, CharRefTokenizer};
32use self::qname::QualNameTokenizer;
33use self::states::XmlState;
34use self::states::{DoctypeKind, Public, System};
35use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
36
37/// Copy of Tokenizer options, with an impl for `Default`.
38#[derive(Copy, Clone)]
39pub struct XmlTokenizerOpts {
40    /// Report all parse errors described in the spec, at some
41    /// performance penalty?  Default: false
42    pub exact_errors: bool,
43
44    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
45    /// of the stream?  Default: true
46    pub discard_bom: bool,
47
48    /// Keep a record of how long we spent in each state?  Printed
49    /// when `end()` is called.  Default: false
50    pub profile: bool,
51
52    /// Initial state override.  Only the test runner should use
53    /// a non-`None` value!
54    pub initial_state: Option<states::XmlState>,
55}
56
57fn process_qname(tag_name: StrTendril) -> QualName {
58    // If tag name can't possibly contain full namespace, skip qualified name
59    // parsing altogether. For a tag to have namespace it must look like:
60    //     a:b
61    // Since StrTendril are UTF-8, we know that minimal size in bytes must be
62    // three bytes minimum.
63    let split = if (*tag_name).len() < 3 {
64        None
65    } else {
66        QualNameTokenizer::new((*tag_name).as_bytes()).run()
67    };
68
69    match split {
70        None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
71        Some(col) => {
72            let len = (*tag_name).len() as u32;
73            let prefix = tag_name.subtendril(0, col);
74            let local = tag_name.subtendril(col + 1, len - col - 1);
75            let ns = ns!(); // Actual namespace URL set in XmlTreeBuilder::bind_qname
76            QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
77        },
78    }
79}
80
81fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
82    match *opt_str {
83        Some(ref mut s) => s.push_char(c),
84        None => *opt_str = Some(StrTendril::from_char(c)),
85    }
86}
87
88impl Default for XmlTokenizerOpts {
89    fn default() -> XmlTokenizerOpts {
90        XmlTokenizerOpts {
91            exact_errors: false,
92            discard_bom: true,
93            profile: false,
94            initial_state: None,
95        }
96    }
97}
/// The Xml tokenizer.
///
/// All mutable state lives in `Cell`/`RefCell` fields, so the tokenizer is
/// driven through `&self` methods.
pub struct XmlTokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: XmlTokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: Cell<states::XmlState>,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Temporary buffer.  `eat` parks input here while it waits for more
    /// lookahead and pushes it back onto the queue on its next call.
    temp_buf: RefCell<StrTendril>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current processing instruction target.
    current_pi_target: RefCell<StrTendril>,

    /// Current processing instruction value.
    current_pi_data: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: RefCell<BTreeMap<states::XmlState, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,
}
166
167impl<Sink: TokenSink> XmlTokenizer<Sink> {
168    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
169    pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink> {
170        if opts.profile && cfg!(for_c) {
171            panic!("Can't profile tokenizer when built as a C library");
172        }
173
174        let state = *opts.initial_state.as_ref().unwrap_or(&states::Data);
175        let discard_bom = opts.discard_bom;
176        XmlTokenizer {
177            opts,
178            sink,
179            state: Cell::new(state),
180            char_ref_tokenizer: RefCell::new(None),
181            at_eof: Cell::new(false),
182            current_char: Cell::new('\0'),
183            reconsume: Cell::new(false),
184            ignore_lf: Cell::new(false),
185            temp_buf: RefCell::new(StrTendril::new()),
186            discard_bom: Cell::new(discard_bom),
187            current_tag_kind: Cell::new(StartTag),
188            current_tag_name: RefCell::new(StrTendril::new()),
189            current_tag_attrs: RefCell::new(vec![]),
190            current_attr_name: RefCell::new(StrTendril::new()),
191            current_attr_value: RefCell::new(StrTendril::new()),
192            current_comment: RefCell::new(StrTendril::new()),
193            current_pi_data: RefCell::new(StrTendril::new()),
194            current_pi_target: RefCell::new(StrTendril::new()),
195            current_doctype: RefCell::new(Doctype::default()),
196            state_profile: RefCell::new(BTreeMap::new()),
197            time_in_sink: Cell::new(0),
198        }
199    }
200
201    /// Feed an input string into the tokenizer.
202    pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
203        if input.is_empty() {
204            return TokenizerResult::Done;
205        }
206
207        if self.discard_bom.get() {
208            if let Some(c) = input.peek() {
209                if c == '\u{feff}' {
210                    input.next();
211                }
212            } else {
213                return TokenizerResult::Done;
214            }
215        };
216
217        self.run(input)
218    }
219
220    fn process_token(&self, token: Token) -> ProcessResult<Sink::Handle> {
221        if self.opts.profile {
222            let (result, dt) = time!(self.sink.process_token(token));
223            self.time_in_sink.set(self.time_in_sink.get() + dt);
224            result
225        } else {
226            self.sink.process_token(token)
227        }
228    }
229
230    // Get the next input character, which might be the character
231    // 'c' that we already consumed from the buffers.
232    fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
233        if self.ignore_lf.get() {
234            self.ignore_lf.set(false);
235            if c == '\n' {
236                c = input.next()?;
237            }
238        }
239
240        if c == '\r' {
241            self.ignore_lf.set(true);
242            c = '\n';
243        }
244
245        // Normalize \x00 into \uFFFD
246        if c == '\x00' {
247            c = '\u{FFFD}'
248        }
249
250        // Exclude forbidden Unicode characters
251        if self.opts.exact_errors
252            && match c as u32 {
253                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
254                n if (n & 0xFFFE) == 0xFFFE => true,
255                _ => false,
256            }
257        {
258            let msg = format!("Bad character {c}");
259            self.emit_error(Cow::Owned(msg));
260        }
261
262        debug!("got character {c}");
263        self.current_char.set(c);
264        Some(c)
265    }
266
267    fn bad_eof_error(&self) {
268        let msg = if self.opts.exact_errors {
269            Cow::from(format!("Saw EOF in state {:?}", self.state))
270        } else {
271            Cow::from("Unexpected EOF")
272        };
273        self.emit_error(msg);
274    }
275
276    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
277        // Bail to the slow path for various corner cases.
278        // This means that `FromSet` can contain characters not in the set!
279        // It shouldn't matter because the fallback `FromSet` case should
280        // always do the same thing as the `NotFromSet` case.
281        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
282            return self.get_char(input).map(FromSet);
283        }
284
285        let d = input.pop_except_from(set);
286        debug!("got characters {d:?}");
287        match d {
288            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
289
290            // NB: We don't set self.current_char for a run of characters not
291            // in the set.  It shouldn't matter for the codepaths that use
292            // this.
293            _ => d,
294        }
295    }
296
297    // Check if the next characters are an ASCII case-insensitive match.  See
298    // BufferQueue::eat.
299    //
300    // NB: this doesn't do input stream preprocessing or set the current input
301    // character.
302    fn eat(&self, input: &BufferQueue, pat: &str) -> Option<bool> {
303        input.push_front(replace(&mut *self.temp_buf.borrow_mut(), StrTendril::new()));
304        match input.eat(pat, u8::eq_ignore_ascii_case) {
305            None if self.at_eof.get() => Some(false),
306            None => {
307                let mut temp_buf = self.temp_buf.borrow_mut();
308                while let Some(data) = input.next() {
309                    temp_buf.push_char(data);
310                }
311                None
312            },
313            Some(matched) => Some(matched),
314        }
315    }
316
317    /// Run the state machine for as long as we can.
318    pub fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
319        if self.opts.profile {
320            loop {
321                let state = self.state.get();
322                let old_sink = self.time_in_sink.get();
323                let (run, mut dt) = time!(self.step(input));
324                dt -= self.time_in_sink.get() - old_sink;
325                let new = match self.state_profile.borrow_mut().get_mut(&state) {
326                    Some(x) => {
327                        *x += dt;
328                        false
329                    },
330                    None => true,
331                };
332                if new {
333                    // do this here because of borrow shenanigans
334                    self.state_profile.borrow_mut().insert(state, dt);
335                }
336                match run {
337                    ProcessResult::Continue => continue,
338                    ProcessResult::Done => return TokenizerResult::Done,
339                    ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
340                }
341            }
342        } else {
343            loop {
344                match self.step(input) {
345                    ProcessResult::Continue => continue,
346                    ProcessResult::Done => return TokenizerResult::Done,
347                    ProcessResult::Script(handle) => return TokenizerResult::Script(handle),
348                }
349            }
350        }
351    }
352
353    //§ tokenization
354    // Get the next input character, if one is available.
355    fn get_char(&self, input: &BufferQueue) -> Option<char> {
356        if self.reconsume.get() {
357            self.reconsume.set(false);
358            Some(self.current_char.get())
359        } else {
360            input
361                .next()
362                .and_then(|c| self.get_preprocessed_char(c, input))
363        }
364    }
365
366    fn bad_char_error(&self) {
367        let msg = if self.opts.exact_errors {
368            let c = self.current_char.get();
369            let state = self.state.get();
370            Cow::from(format!("Saw {c} in state {state:?}"))
371        } else {
372            Cow::from("Bad character")
373        };
374        self.emit_error(msg);
375    }
376
377    fn discard_tag(&self) {
378        *self.current_tag_name.borrow_mut() = StrTendril::new();
379        *self.current_tag_attrs.borrow_mut() = Vec::new();
380    }
381
382    fn create_tag(&self, kind: TagKind, c: char) {
383        self.discard_tag();
384        self.current_tag_name.borrow_mut().push_char(c);
385        self.current_tag_kind.set(kind);
386    }
387
388    // This method creates a PI token and
389    // sets its target to given char
390    fn create_pi(&self, c: char) {
391        *self.current_pi_target.borrow_mut() = StrTendril::new();
392        *self.current_pi_data.borrow_mut() = StrTendril::new();
393        self.current_pi_target.borrow_mut().push_char(c);
394    }
395
396    fn emit_char(&self, c: char) {
397        self.process_token(Token::Characters(StrTendril::from_char(match c {
398            '\0' => '\u{FFFD}',
399            c => c,
400        })));
401    }
402
403    fn emit_short_tag(&self) -> ProcessResult<Sink::Handle> {
404        self.current_tag_kind.set(ShortTag);
405        *self.current_tag_name.borrow_mut() = StrTendril::new();
406        self.emit_current_tag()
407    }
408
409    fn emit_empty_tag(&self) -> ProcessResult<Sink::Handle> {
410        self.current_tag_kind.set(EmptyTag);
411        self.emit_current_tag()
412    }
413
414    fn set_empty_tag(&self) {
415        self.current_tag_kind.set(EmptyTag);
416    }
417
418    fn emit_start_tag(&self) -> ProcessResult<Sink::Handle> {
419        self.current_tag_kind.set(StartTag);
420        self.emit_current_tag()
421    }
422
423    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
424        self.finish_attribute();
425
426        let qname = process_qname(replace(
427            &mut *self.current_tag_name.borrow_mut(),
428            StrTendril::new(),
429        ));
430
431        match self.current_tag_kind.get() {
432            StartTag | EmptyTag => {},
433            EndTag => {
434                if !self.current_tag_attrs.borrow().is_empty() {
435                    self.emit_error(Borrowed("Attributes on an end tag"));
436                }
437            },
438            ShortTag => {
439                if !self.current_tag_attrs.borrow().is_empty() {
440                    self.emit_error(Borrowed("Attributes on a short tag"));
441                }
442            },
443        }
444
445        let token = Token::Tag(Tag {
446            kind: self.current_tag_kind.get(),
447            name: qname,
448            attrs: self.current_tag_attrs.take(),
449        });
450
451        self.process_token(token)
452    }
453
454    // The string must not contain '\0'!
455    fn emit_chars(&self, b: StrTendril) {
456        self.process_token(Token::Characters(b));
457    }
458
459    // Emits the current Processing Instruction
460    fn emit_pi(&self) -> ProcessResult<<Sink as TokenSink>::Handle> {
461        let token = Token::ProcessingInstruction(Pi {
462            target: replace(&mut *self.current_pi_target.borrow_mut(), StrTendril::new()),
463            data: replace(&mut *self.current_pi_data.borrow_mut(), StrTendril::new()),
464        });
465        self.process_token(token)
466    }
467
468    fn consume_char_ref(&self, addnl_allowed: Option<char>) {
469        // NB: The char ref tokenizer assumes we have an additional allowed
470        // character iff we're tokenizing in an attribute value.
471        *self.char_ref_tokenizer.borrow_mut() =
472            Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
473    }
474
475    fn emit_eof(&self) {
476        self.process_token(Token::EndOfFile);
477    }
478
479    fn emit_error(&self, error: Cow<'static, str>) {
480        self.process_token(Token::ParseError(error));
481    }
482
483    fn emit_current_comment(&self) {
484        let comment = self.current_comment.take();
485        self.process_token(Token::Comment(comment));
486    }
487
488    fn emit_current_doctype(&self) {
489        let doctype = self.current_doctype.take();
490        self.process_token(Token::Doctype(doctype));
491    }
492
493    fn doctype_id(&self, kind: DoctypeKind) -> RefMut<'_, Option<StrTendril>> {
494        let current_doctype = self.current_doctype.borrow_mut();
495        match kind {
496            Public => RefMut::map(current_doctype, |d| &mut d.public_id),
497            System => RefMut::map(current_doctype, |d| &mut d.system_id),
498        }
499    }
500
501    fn clear_doctype_id(&self, kind: DoctypeKind) {
502        let mut id = self.doctype_id(kind);
503        match *id {
504            Some(ref mut s) => s.clear(),
505            None => *id = Some(StrTendril::new()),
506        }
507    }
508
509    fn peek(&self, input: &BufferQueue) -> Option<char> {
510        if self.reconsume.get() {
511            Some(self.current_char.get())
512        } else {
513            input.peek()
514        }
515    }
516
    /// Consumes one character via `get_char` and throws it away.
    ///
    /// # Panics
    ///
    /// Panics if no character is available.
    fn discard_char(&self, input: &BufferQueue) {
        let c = self.get_char(input);
        assert!(c.is_some());
    }
521
    /// Puts `buf` back at the front of `input` so that it is read again.
    fn unconsume(&self, input: &BufferQueue, buf: StrTendril) {
        input.push_front(buf);
    }
525}
526
// Shorthand for common state machine behaviors.
//
// Each rule maps one action word (plus its arguments) onto a call on the
// tokenizer `$me`. Rules are only type-checked when they are expanded.
//
// NOTE(review): `emit_temp`, `clear_temp` and `create_attr` expand to methods
// that are not defined in the part of the file containing this macro --
// confirm they exist before using those actions.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c)                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c)                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.borrow_mut().push_char($c)     );
    // `discard_tag` takes no argument, `discard_char` needs the input queue.
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag()                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input)                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.borrow_mut().push_char($c)             );
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf()                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf()                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c)                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.borrow_mut().push_char($c)    );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.borrow_mut().push_char($c)   );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.borrow_mut().push_char($c)      );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.borrow_mut().push_slice($c)     );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment()                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.borrow_mut().clear()            );
    ( $me:ident : create_doctype                   ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c)            );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k)                            );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype()                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error()                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error()                                 );
    ( $me:ident : create_pi $c:expr                ) => ( $me.create_pi($c)                                   );
    ( $me:ident : push_pi_target $c:expr           ) => ( $me.current_pi_target.borrow_mut().push_char($c)    );
    ( $me:ident : push_pi_data $c:expr             ) => ( $me.current_pi_data.borrow_mut().push_char($c)      );
    ( $me:ident : set_empty_tag                    ) => ( $me.set_empty_tag()                                 );
);
557
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With the `trace_tokenizer` feature enabled, every shorthand action is
// logged through `debug!` (as its stringified tokens) before it is executed.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!("  {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

// Without the feature, `sh_trace!` forwards straight to `shorthand!`.
#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
568
// A little DSL for sequencing shorthand actions.
//
// `go!(me: a; b; c)` runs the actions left to right. Every action that is not
// one of the terminal forms below is forwarded to `shorthand!` (through
// `sh_trace!`). The terminal forms `return` from the enclosing function, so
// they must be the last action in a sequence. Rule order matters: the
// sequencing rules are tried first and the catch-all rules last.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    // Hence one rule per leading action length (one to four token trees).

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // Switch to state `$s` (optionally with arguments) and keep stepping.
    ( $me:ident : to $s:ident                    ) => ({ $me.state.set(states::$s); return ProcessResult::Continue;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    // Like `to`, but the current character is processed again in the new state.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume.set(true); go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume.set(true); go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    // Start tokenizing a character reference, optionally with an additional
    // allowed character.
    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue;         });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // Set the follow-up state `$s`, emit the current tag, and return
    // whatever the sink answered for it.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    // XML has dedicated emit actions for short, empty and start tags.
    ( $me:ident : emit_short_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_short_tag();
    });

    ( $me:ident : emit_empty_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_empty_tag();
    });

    ( $me:ident : emit_start_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_start_tag();
    });

    // Set the follow-up state `$s` and emit the current processing instruction.
    ( $me:ident : emit_pi $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_pi();
    });

    // Emit the end-of-file token and stop stepping.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Done; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
627
// Fetches the next character through `$me.get_char($input)`.
//
// This has to be a macro rather than a function: when no character is
// available it returns `ProcessResult::Done` from the function it is used in.
macro_rules! get_char {
    ($me:expr, $input:expr) => {
        unwrap_or_return!($me.get_char($input), ProcessResult::Done)
    };
}
633
// Macro form of `pop_except_from`: when no input is available it returns
// `ProcessResult::Done` from the function it is used in.
macro_rules! pop_except_from {
    ($me:expr, $input:expr, $set:expr) => {
        unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Done)
    };
}
637
// Macro form of `eat`: when the match cannot be decided yet it returns
// `ProcessResult::Done` from the function it is used in.
macro_rules! eat {
    ($me:expr, $input:expr, $pat:expr) => {
        unwrap_or_return!($me.eat($input, $pat), ProcessResult::Done)
    };
}
641
/// The result of a single tokenization operation.
///
/// `run` keeps stepping while it receives `Continue` and returns to its
/// caller on the other two variants.
pub enum ProcessResult<Handle> {
    /// The tokenizer needs more input before it can continue.
    /// `run` reports this to its caller as `TokenizerResult::Done`.
    Done,
    /// The tokenizer can be invoked again immediately
    Continue,
    /// The tokenizer encountered a script element that must be executed
    /// before tokenization can continue.  `run` passes the handle on as
    /// `TokenizerResult::Script`.
    Script(Handle),
}
652
653impl<Sink: TokenSink> XmlTokenizer<Sink> {
654    // Run the state machine for a while.
655    #[allow(clippy::never_loop)]
656    fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
657        if self.char_ref_tokenizer.borrow().is_some() {
658            return self.step_char_ref_tokenizer(input);
659        }
660
661        debug!("processing in state {:?}", self.state);
662        match self.state.get() {
663            XmlState::Quiescent => {
664                self.state.set(XmlState::Data);
665                ProcessResult::Done
666            },
667            //§ data-state
668            XmlState::Data => loop {
669                match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
670                    FromSet('&') => go!(self: consume_char_ref),
671                    FromSet('<') => go!(self: to TagState),
672                    FromSet(c) => go!(self: emit c),
673                    NotFromSet(b) => self.emit_chars(b),
674                }
675            },
676            //§ tag-state
677            XmlState::TagState => loop {
678                match get_char!(self, input) {
679                    '!' => go!(self: to MarkupDecl),
680                    '/' => go!(self: to EndTagState),
681                    '?' => go!(self: to Pi),
682                    '\t' | '\n' | ' ' | ':' | '<' | '>' => {
683                        go!(self: error; emit '<'; reconsume Data)
684                    },
685                    cl => go!(self: create_tag StartTag cl; to TagName),
686                }
687            },
688            //§ end-tag-state
689            XmlState::EndTagState => loop {
690                match get_char!(self, input) {
691                    '>' => go!(self:  emit_short_tag Data),
692                    '\t' | '\n' | ' ' | '<' | ':' => {
693                        go!(self: error; emit '<'; emit '/'; reconsume Data)
694                    },
695                    cl => go!(self: create_tag EndTag cl; to EndTagName),
696                }
697            },
698            //§ end-tag-name-state
699            XmlState::EndTagName => loop {
700                match get_char!(self, input) {
701                    '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
702                    '/' => go!(self: error; to EndTagNameAfter),
703                    '>' => go!(self: emit_tag Data),
704                    cl => go!(self: push_tag cl),
705                }
706            },
707            //§ end-tag-name-after-state
708            XmlState::EndTagNameAfter => loop {
709                match get_char!(self, input) {
710                    '>' => go!(self: emit_tag Data),
711                    '\t' | '\n' | ' ' => (),
712                    _ => self.emit_error(Borrowed("Unexpected element in tag name")),
713                }
714            },
715            //§ pi-state
716            XmlState::Pi => loop {
717                match get_char!(self, input) {
718                    '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
719                    cl => go!(self: create_pi cl; to PiTarget),
720                }
721            },
722            //§ pi-target-state
723            XmlState::PiTarget => loop {
724                match get_char!(self, input) {
725                    '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
726                    '?' => go!(self: to PiAfter),
727                    cl => go!(self: push_pi_target cl),
728                }
729            },
730            //§ pi-target-after-state
731            XmlState::PiTargetAfter => loop {
732                match get_char!(self, input) {
733                    '\t' | '\n' | ' ' => (),
734                    _ => go!(self: reconsume PiData),
735                }
736            },
737            //§ pi-data-state
738            XmlState::PiData => loop {
739                match get_char!(self, input) {
740                    '?' => go!(self: to PiAfter),
741                    cl => go!(self: push_pi_data cl),
742                }
743            },
744            //§ pi-after-state
745            XmlState::PiAfter => loop {
746                match get_char!(self, input) {
747                    '>' => go!(self: emit_pi Data),
748                    '?' => go!(self: to PiAfter),
749                    cl => go!(self: push_pi_data cl),
750                }
751            },
752            //§ markup-declaration-state
753            XmlState::MarkupDecl => loop {
754                if eat!(self, input, "--") {
755                    go!(self: clear_comment; to CommentStart);
756                } else if eat!(self, input, "[CDATA[") {
757                    go!(self: to Cdata);
758                } else if eat!(self, input, "DOCTYPE") {
759                    go!(self: to Doctype);
760                } else {
761                    // FIXME: 'error' gives wrong message
762                    go!(self: error; to BogusComment);
763                }
764            },
765            //§ comment-start-state
766            XmlState::CommentStart => loop {
767                match get_char!(self, input) {
768                    '-' => go!(self: to CommentStartDash),
769                    '>' => go!(self: error; emit_comment; to Data),
770                    _ => go!(self: reconsume Comment),
771                }
772            },
773            //§ comment-start-dash-state
774            XmlState::CommentStartDash => loop {
775                match get_char!(self, input) {
776                    '-' => go!(self: to CommentEnd),
777                    '>' => go!(self: error; emit_comment; to Data),
778                    _ => go!(self: push_comment '-'; reconsume Comment),
779                }
780            },
781            //§ comment-state
782            XmlState::Comment => loop {
783                match get_char!(self, input) {
784                    '<' => go!(self: push_comment '<'; to CommentLessThan),
785                    '-' => go!(self: to CommentEndDash),
786                    c => go!(self: push_comment c),
787                }
788            },
789            //§ comment-less-than-sign-state
790            XmlState::CommentLessThan => loop {
791                match get_char!(self, input) {
792                    '!' => go!(self: push_comment '!';to CommentLessThanBang),
793                    '<' => go!(self: push_comment '<'),
794                    _ => go!(self: reconsume Comment),
795                }
796            },
797            //§ comment-less-than-sign-bang-state
798            XmlState::CommentLessThanBang => loop {
799                match get_char!(self, input) {
800                    '-' => go!(self: to CommentLessThanBangDash),
801                    _ => go!(self: reconsume Comment),
802                }
803            },
804            //§ comment-less-than-sign-bang-dash-state
805            XmlState::CommentLessThanBangDash => loop {
806                match get_char!(self, input) {
807                    '-' => go!(self: to CommentLessThanBangDashDash),
808                    _ => go!(self: reconsume CommentEndDash),
809                }
810            },
811            //§ comment-less-than-sign-bang-dash-dash-state
812            XmlState::CommentLessThanBangDashDash => loop {
813                match get_char!(self, input) {
814                    '>' => go!(self: reconsume CommentEnd),
815                    _ => go!(self: error; reconsume CommentEnd),
816                }
817            },
818            //§ comment-end-dash-state
819            XmlState::CommentEndDash => loop {
820                match get_char!(self, input) {
821                    '-' => go!(self: to CommentEnd),
822                    _ => go!(self: push_comment '-'; reconsume Comment),
823                }
824            },
825            //§ comment-end-state
826            XmlState::CommentEnd => loop {
827                match get_char!(self, input) {
828                    '>' => go!(self: emit_comment; to Data),
829                    '!' => go!(self: to CommentEndBang),
830                    '-' => go!(self: push_comment '-'),
831                    _ => go!(self: append_comment "--"; reconsume Comment),
832                }
833            },
834            //§ comment-end-bang-state
835            XmlState::CommentEndBang => loop {
836                match get_char!(self, input) {
837                    '-' => go!(self: append_comment "--!"; to CommentEndDash),
838                    '>' => go!(self: error; emit_comment; to Data),
839                    _ => go!(self: append_comment "--!"; reconsume Comment),
840                }
841            },
842            //§ bogus-comment-state
843            XmlState::BogusComment => loop {
844                match get_char!(self, input) {
845                    '>' => go!(self: emit_comment; to Data),
846                    c => go!(self: push_comment c),
847                }
848            },
849            //§ cdata-state
850            XmlState::Cdata => loop {
851                match get_char!(self, input) {
852                    ']' => go!(self: to CdataBracket),
853                    cl => go!(self: emit cl),
854                }
855            },
856            //§ cdata-bracket-state
857            XmlState::CdataBracket => loop {
858                match get_char!(self, input) {
859                    ']' => go!(self: to CdataEnd),
860                    cl => go!(self: emit ']'; emit cl; to Cdata),
861                }
862            },
863            //§ cdata-end-state
864            XmlState::CdataEnd => loop {
865                match get_char!(self, input) {
866                    '>' => go!(self: to Data),
867                    ']' => go!(self: emit ']'),
868                    cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
869                }
870            },
871            //§ tag-name-state
872            XmlState::TagName => loop {
873                match get_char!(self, input) {
874                    '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
875                    '>' => go!(self: emit_tag Data),
876                    '/' => go!(self: set_empty_tag; to TagEmpty),
877                    cl => go!(self: push_tag cl),
878                }
879            },
880            //§ empty-tag-state
881            XmlState::TagEmpty => loop {
882                match get_char!(self, input) {
883                    '>' => go!(self: emit_empty_tag Data),
884                    _ => go!(self: reconsume TagAttrValueBefore),
885                }
886            },
887            //§ tag-attribute-name-before-state
888            XmlState::TagAttrNameBefore => loop {
889                match get_char!(self, input) {
890                    '\t' | '\n' | ' ' => (),
891                    '>' => go!(self: emit_tag Data),
892                    '/' => go!(self: set_empty_tag; to TagEmpty),
893                    ':' => go!(self: error),
894                    cl => go!(self: create_attr cl; to TagAttrName),
895                }
896            },
897            //§ tag-attribute-name-state
898            XmlState::TagAttrName => loop {
899                match get_char!(self, input) {
900                    '=' => go!(self: to TagAttrValueBefore),
901                    '>' => go!(self: emit_tag Data),
902                    '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
903                    '/' => go!(self: set_empty_tag; to TagEmpty),
904                    cl => go!(self: push_name cl),
905                }
906            },
907            //§ tag-attribute-name-after-state
908            XmlState::TagAttrNameAfter => loop {
909                match get_char!(self, input) {
910                    '\t' | '\n' | ' ' => (),
911                    '=' => go!(self: to TagAttrValueBefore),
912                    '>' => go!(self: emit_tag Data),
913                    '/' => go!(self: set_empty_tag; to TagEmpty),
914                    cl => go!(self: create_attr cl; to TagAttrName),
915                }
916            },
917            //§ tag-attribute-value-before-state
918            XmlState::TagAttrValueBefore => loop {
919                match get_char!(self, input) {
920                    '\t' | '\n' | ' ' => (),
921                    '"' => go!(self: to TagAttrValue DoubleQuoted),
922                    '\'' => go!(self: to TagAttrValue SingleQuoted),
923                    '&' => go!(self: reconsume TagAttrValue(Unquoted)),
924                    '>' => go!(self: emit_tag Data),
925                    cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
926                }
927            },
928            //§ tag-attribute-value-double-quoted-state
929            XmlState::TagAttrValue(DoubleQuoted) => loop {
930                match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
931                    FromSet('"') => go!(self: to TagAttrNameBefore),
932                    FromSet('&') => go!(self: consume_char_ref '"' ),
933                    FromSet(c) => go!(self: push_value c),
934                    NotFromSet(ref b) => go!(self: append_value b),
935                }
936            },
937            //§ tag-attribute-value-single-quoted-state
938            XmlState::TagAttrValue(SingleQuoted) => loop {
939                match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
940                    FromSet('\'') => go!(self: to TagAttrNameBefore),
941                    FromSet('&') => go!(self: consume_char_ref '\''),
942                    FromSet(c) => go!(self: push_value c),
943                    NotFromSet(ref b) => go!(self: append_value b),
944                }
945            },
946            //§ tag-attribute-value-unquoted-state
947            XmlState::TagAttrValue(Unquoted) => loop {
948                match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
949                    FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
950                    FromSet('&') => go!(self: consume_char_ref),
951                    FromSet('>') => go!(self: emit_tag Data),
952                    FromSet(c) => go!(self: push_value c),
953                    NotFromSet(ref b) => go!(self: append_value b),
954                }
955            },
956
957            //§ doctype-state
958            XmlState::Doctype => loop {
959                match get_char!(self, input) {
960                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
961                    _ => go!(self: error; reconsume BeforeDoctypeName),
962                }
963            },
964            //§ before-doctype-name-state
965            XmlState::BeforeDoctypeName => loop {
966                match get_char!(self, input) {
967                    '\t' | '\n' | '\x0C' | ' ' => (),
968                    '>' => go!(self: error; emit_doctype; to Data),
969                    c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
970                                  to DoctypeName),
971                }
972            },
973            //§ doctype-name-state
974            XmlState::DoctypeName => loop {
975                match get_char!(self, input) {
976                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
977                    '>' => go!(self: emit_doctype; to Data),
978                    c => go!(self: push_doctype_name (c.to_ascii_lowercase());
979                                  to DoctypeName),
980                }
981            },
982            //§ after-doctype-name-state
983            XmlState::AfterDoctypeName => loop {
984                if eat!(self, input, "public") {
985                    go!(self: to AfterDoctypeKeyword Public);
986                } else if eat!(self, input, "system") {
987                    go!(self: to AfterDoctypeKeyword System);
988                } else {
989                    match get_char!(self, input) {
990                        '\t' | '\n' | '\x0C' | ' ' => (),
991                        '>' => go!(self: emit_doctype; to Data),
992                        _ => go!(self: error; to BogusDoctype),
993                    }
994                }
995            },
996            //§ after-doctype-public-keyword-state
997            XmlState::AfterDoctypeKeyword(Public) => loop {
998                match get_char!(self, input) {
999                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
1000                    '"' => {
1001                        go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
1002                    },
1003                    '\'' => {
1004                        go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
1005                    },
1006                    '>' => go!(self: error; emit_doctype; to Data),
1007                    _ => go!(self: error; to BogusDoctype),
1008                }
1009            },
1010            //§ after-doctype-system-keyword-state
1011            XmlState::AfterDoctypeKeyword(System) => loop {
1012                match get_char!(self, input) {
1013                    '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
1014                    '"' => {
1015                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1016                    },
1017                    '\'' => {
1018                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1019                    },
1020                    '>' => go!(self: error; emit_doctype; to Data),
1021                    _ => go!(self: error; to BogusDoctype),
1022                }
1023            },
1024            //§ before_doctype_public_identifier_state before_doctype_system_identifier_state
1025            XmlState::BeforeDoctypeIdentifier(kind) => loop {
1026                match get_char!(self, input) {
1027                    '\t' | '\n' | '\x0C' | ' ' => (),
1028                    '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1029                    '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1030                    '>' => go!(self: error; emit_doctype; to Data),
1031                    _ => go!(self: error; to BogusDoctype),
1032                }
1033            },
1034            //§ doctype_public_identifier_double_quoted_state doctype_system_identifier_double_quoted_state
1035            XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
1036                match get_char!(self, input) {
1037                    '"' => go!(self: to AfterDoctypeIdentifier kind),
1038                    '>' => go!(self: error; emit_doctype; to Data),
1039                    c => go!(self: push_doctype_id kind c),
1040                }
1041            },
1042            //§ doctype_public_identifier_single_quoted_state doctype_system_identifier_single_quoted_state
1043            XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
1044                match get_char!(self, input) {
1045                    '\'' => go!(self: to AfterDoctypeIdentifier kind),
1046                    '>' => go!(self: error; emit_doctype; to Data),
1047                    c => go!(self: push_doctype_id kind c),
1048                }
1049            },
1050            //§ after_doctype_public_identifier_state
1051            XmlState::AfterDoctypeIdentifier(Public) => loop {
1052                match get_char!(self, input) {
1053                    '\t' | '\n' | '\x0C' | ' ' => {
1054                        go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1055                    },
1056                    '\'' => {
1057                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
1058                    },
1059                    '"' => {
1060                        go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
1061                    },
1062                    '>' => go!(self: emit_doctype; to Data),
1063                    _ => go!(self: error; to BogusDoctype),
1064                }
1065            },
1066            //§ after_doctype_system_identifier_state
1067            XmlState::AfterDoctypeIdentifier(System) => loop {
1068                match get_char!(self, input) {
1069                    '\t' | '\n' | '\x0C' | ' ' => (),
1070                    '>' => go!(self: emit_doctype; to Data),
1071                    _ => go!(self: error; to BogusDoctype),
1072                }
1073            },
1074            //§ between_doctype_public_and_system_identifier_state
1075            XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
1076                match get_char!(self, input) {
1077                    '\t' | '\n' | '\x0C' | ' ' => (),
1078                    '>' => go!(self: emit_doctype; to Data),
1079                    '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
1080                    '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
1081                    _ => go!(self: error; to BogusDoctype),
1082                }
1083            },
1084            //§ bogus_doctype_state
1085            XmlState::BogusDoctype => loop {
1086                if get_char!(self, input) == '>' {
1087                    go!(self: emit_doctype; to Data);
1088                }
1089            },
1090        }
1091    }
1092
1093    /// Indicate that we have reached the end of the input.
1094    pub fn end(&self) {
1095        // Handle EOF in the char ref sub-tokenizer, if there is one.
1096        // Do this first because it might un-consume stuff.
1097        let input = BufferQueue::default();
1098        match self.char_ref_tokenizer.take() {
1099            None => (),
1100            Some(mut tok) => {
1101                tok.end_of_file(self, &input);
1102                self.process_char_ref(tok.get_result());
1103            },
1104        }
1105
1106        // Process all remaining buffered input.
1107        // If we're waiting for lookahead, we're not gonna get it.
1108        self.at_eof.set(true);
1109        let _ = self.run(&input);
1110
1111        loop {
1112            if !matches!(self.eof_step(), ProcessResult::Continue) {
1113                break;
1114            }
1115        }
1116
1117        self.sink.end();
1118
1119        if self.opts.profile {
1120            self.dump_profile();
1121        }
1122    }
1123
1124    #[cfg(for_c)]
1125    fn dump_profile(&self) {
1126        unreachable!();
1127    }
1128
1129    #[cfg(not(for_c))]
1130    fn dump_profile(&self) {
1131        let mut results: Vec<(states::XmlState, u64)> = self
1132            .state_profile
1133            .borrow()
1134            .iter()
1135            .map(|(s, t)| (*s, *t))
1136            .collect();
1137        results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1138
1139        let total: u64 = results
1140            .iter()
1141            .map(|&(_, t)| t)
1142            .fold(0, ::std::ops::Add::add);
1143        debug!("\nTokenizer profile, in nanoseconds");
1144        debug!(
1145            "\n{:12}         total in token sink",
1146            self.time_in_sink.get()
1147        );
1148        debug!("\n{total:12}         total in tokenizer");
1149
1150        for (k, v) in results.into_iter() {
1151            let pct = 100.0 * (v as f64) / (total as f64);
1152            debug!("{v:12}  {pct:4.1}%  {k:?}");
1153        }
1154    }
1155
1156    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
1157        debug!("processing EOF in state {:?}", self.state.get());
1158        match self.state.get() {
1159            XmlState::Data | XmlState::Quiescent => go!(self: eof),
1160            XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
1161                go!(self: reconsume Comment)
1162            },
1163            XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
1164            XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
1165            XmlState::CommentStartDash
1166            | XmlState::Comment
1167            | XmlState::CommentEndDash
1168            | XmlState::CommentEnd
1169            | XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
1170            XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
1171            XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
1172            XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
1173            XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
1174                go!(self: error_eof; to Data)
1175            },
1176            XmlState::Pi => go!(self: error_eof; to BogusComment),
1177            XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
1178            XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
1179            XmlState::TagName
1180            | XmlState::TagAttrNameBefore
1181            | XmlState::EndTagName
1182            | XmlState::TagAttrNameAfter
1183            | XmlState::EndTagNameAfter
1184            | XmlState::TagAttrValueBefore
1185            | XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
1186            XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
1187            XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
1188            XmlState::BeforeDoctypeName
1189            | XmlState::Doctype
1190            | XmlState::DoctypeName
1191            | XmlState::AfterDoctypeName
1192            | XmlState::AfterDoctypeKeyword(_)
1193            | XmlState::BeforeDoctypeIdentifier(_)
1194            | XmlState::AfterDoctypeIdentifier(_)
1195            | XmlState::DoctypeIdentifierSingleQuoted(_)
1196            | XmlState::DoctypeIdentifierDoubleQuoted(_)
1197            | XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
1198                go!(self: error_eof; emit_doctype; to Data)
1199            },
1200            XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
1201            XmlState::BogusComment => go!(self: emit_comment; to Data),
1202        }
1203    }
1204
1205    fn process_char_ref(&self, char_ref: CharRef) {
1206        let CharRef {
1207            mut chars,
1208            mut num_chars,
1209        } = char_ref;
1210
1211        if num_chars == 0 {
1212            chars[0] = '&';
1213            num_chars = 1;
1214        }
1215
1216        for i in 0..num_chars {
1217            let c = chars[i as usize];
1218            match self.state.get() {
1219                states::Data | states::Cdata => go!(self: emit c),
1220
1221                states::TagAttrValue(_) => go!(self: push_value c),
1222
1223                _ => panic!(
1224                    "state {:?} should not be reachable in process_char_ref",
1225                    self.state.get()
1226                ),
1227            }
1228        }
1229    }
1230
1231    fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1232        let mut tok = self.char_ref_tokenizer.take().unwrap();
1233        let outcome = tok.step(self, input);
1234
1235        let progress = match outcome {
1236            char_ref::Done => {
1237                self.process_char_ref(tok.get_result());
1238                return ProcessResult::Continue;
1239            },
1240
1241            char_ref::Stuck => ProcessResult::Done,
1242            char_ref::Progress => ProcessResult::Continue,
1243        };
1244
1245        *self.char_ref_tokenizer.borrow_mut() = Some(tok);
1246        progress
1247    }
1248
1249    fn finish_attribute(&self) {
1250        if self.current_attr_name.borrow().is_empty() {
1251            return;
1252        }
1253
1254        // Check for a duplicate attribute.
1255        // FIXME: the spec says we should error as soon as the name is finished.
1256        // FIXME: linear time search, do we care?
1257        let dup = {
1258            let current_attr_name = self.current_attr_name.borrow();
1259            let name = &current_attr_name[..];
1260            self.current_tag_attrs
1261                .borrow()
1262                .iter()
1263                .any(|a| &*a.name.local == name)
1264        };
1265
1266        if dup {
1267            self.emit_error(Borrowed("Duplicate attribute"));
1268            self.current_attr_name.borrow_mut().clear();
1269            self.current_attr_value.borrow_mut().clear();
1270        } else {
1271            let qname = process_qname(replace(
1272                &mut self.current_attr_name.borrow_mut(),
1273                StrTendril::new(),
1274            ));
1275            let attr = Attribute {
1276                name: qname.clone(),
1277                value: replace(&mut self.current_attr_value.borrow_mut(), StrTendril::new()),
1278            };
1279
1280            if qname.local == local_name!("xmlns")
1281                || qname.prefix == Some(namespace_prefix!("xmlns"))
1282            {
1283                self.current_tag_attrs.borrow_mut().insert(0, attr);
1284            } else {
1285                self.current_tag_attrs.borrow_mut().push(attr);
1286            }
1287        }
1288    }
1289
1290    fn create_attribute(&self, c: char) {
1291        self.finish_attribute();
1292
1293        self.current_attr_name.borrow_mut().push_char(c);
1294    }
1295}
1296
#[cfg(test)]
mod test {

    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    /// Run `process_qname` on `raw` and check the prefix and local name it
    /// produces.
    fn assert_qname(raw: &str, prefix: Option<&str>, local: &str) {
        let qname = process_qname(raw.to_tendril());
        assert_eq!(qname.prefix, prefix.map(Prefix::from));
        assert_eq!(qname.local, LocalName::from(local));
    }

    /// A single colon between two non-empty parts splits into prefix and
    /// local name.
    #[test]
    fn simple_namespace() {
        let cases = [("prefix:local", "prefix", "local"), ("a:b", "a", "b")];
        for &(raw, prefix, local) in &cases {
            assert_qname(raw, Some(prefix), local);
        }
    }

    /// Names with misplaced or repeated colons get no prefix, and the whole
    /// input becomes the local name.
    #[test]
    fn wrong_namespaces() {
        let cases = [":local", "::local", "a::local", "fake::", ":::", ":a:b:"];
        for &raw in &cases {
            assert_qname(raw, None, raw);
        }
    }
}