xml5ever/
driver.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use crate::tokenizer::{XmlTokenizer, XmlTokenizerOpts};
11use crate::tree_builder::{TreeSink, XmlTreeBuilder, XmlTreeBuilderOpts};
12
13use std::borrow::Cow;
14
15use crate::tendril;
16use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
17use crate::tendril::StrTendril;
18use markup5ever::buffer_queue::BufferQueue;
19use markup5ever::TokenizerResult;
20
21/// All-encompasing parser setting structure.
22#[derive(Clone, Default)]
23pub struct XmlParseOpts {
24    /// Xml tokenizer options.
25    pub tokenizer: XmlTokenizerOpts,
26    /// Xml tree builder .
27    pub tree_builder: XmlTreeBuilderOpts,
28}
29
30/// Parse and send results to a `TreeSink`.
31///
32/// ## Example
33///
34/// ```ignore
35/// let mut sink = MySink;
36/// parse_document(&mut sink, iter::once(my_str), Default::default());
37/// ```
38pub fn parse_document<Sink>(sink: Sink, opts: XmlParseOpts) -> XmlParser<Sink>
39where
40    Sink: TreeSink,
41{
42    let tb = XmlTreeBuilder::new(sink, opts.tree_builder);
43    let tok = XmlTokenizer::new(tb, opts.tokenizer);
44    XmlParser {
45        tokenizer: tok,
46        input_buffer: BufferQueue::default(),
47    }
48}
49
50/// An XML parser,
51/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
52pub struct XmlParser<Sink>
53where
54    Sink: TreeSink,
55{
56    /// Tokenizer used by XmlParser.
57    pub tokenizer: XmlTokenizer<XmlTreeBuilder<Sink::Handle, Sink>>,
58    /// Input used by XmlParser.
59    pub input_buffer: BufferQueue,
60}
61
62impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for XmlParser<Sink> {
63    type Output = Sink::Output;
64
65    fn process(&mut self, t: StrTendril) {
66        self.input_buffer.push_back(t);
67        // FIXME: Properly support </script> somehow.
68        while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {}
69    }
70
71    // FIXME: Is it too noisy to report every character decoding error?
72    fn error(&mut self, desc: Cow<'static, str>) {
73        self.tokenizer.sink.sink.parse_error(desc)
74    }
75
76    fn finish(self) -> Self::Output {
77        self.tokenizer.end();
78        self.tokenizer.sink.sink.finish()
79    }
80}
81
82impl<Sink: TreeSink> XmlParser<Sink> {
83    /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
84    ///
85    /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
86    /// Decoding is lossy, like `String::from_utf8_lossy`.
87    #[allow(clippy::wrong_self_convention)]
88    pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
89        Utf8LossyDecoder::new(self)
90    }
91}