Skip to main content

quick_xml/reader/
state.rs

1use std::fmt::Debug;
2
3#[cfg(feature = "encoding")]
4use encoding_rs::UTF_8;
5
6use crate::encoding::Decoder;
7use crate::errors::{Error, IllFormedError, Result};
8use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
9use crate::parser::{Parser, PiParser};
10#[cfg(feature = "encoding")]
11use crate::reader::EncodingRef;
12use crate::reader::{BangType, Config, DtdParser, ParseState};
13use crate::utils::{is_whitespace, name_len, Bytes};
14
15/// A struct that holds a current reader state and a parser configuration.
16/// It is independent on a way of reading data: the reader feed data into it and
17/// get back produced [`Event`]s.
18#[derive(Clone)]
19pub(super) struct ReaderState {
20    /// Number of bytes read from the source of data since the reader was created
21    pub offset: u64,
22    /// A snapshot of an `offset` of the last error returned. It can be less than
23    /// `offset`, because some errors conveniently report at earlier position,
24    /// and changing `offset` is not possible, because `Error::IllFormed` errors
25    /// are recoverable.
26    pub last_error_offset: u64,
27    /// Defines how to process next byte
28    pub state: ParseState,
29    /// User-defined settings that affect parsing
30    pub config: Config,
31    /// All currently Started elements which didn't have a matching
32    /// End element yet.
33    ///
34    /// For an XML
35    ///
36    /// ```xml
37    /// <root><one/><inner attr="value">|<tag></inner></root>
38    /// ```
39    /// when cursor at the `|` position buffer contains:
40    ///
41    /// ```text
42    /// rootinner
43    /// ^   ^
44    /// ```
45    ///
46    /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
47    /// (0 and 4 in that case).
48    opened_buffer: Vec<u8>,
49    /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
50    /// for that field for details
51    opened_starts: Vec<usize>,
52
53    #[cfg(feature = "encoding")]
54    /// Reference to the encoding used to read an XML
55    pub encoding: EncodingRef,
56}
57
58impl ReaderState {
59    /// Trims end whitespaces from `bytes`, if required, and returns a text event.
60    ///
61    /// # Parameters
62    /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
63    pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> {
64        let mut content = bytes;
65
66        if self.config.trim_text_end {
67            // Skip the ending '<'
68            let len = bytes
69                .iter()
70                .rposition(|&b| !is_whitespace(b))
71                .map_or(0, |p| p + 1);
72            content = &bytes[..len];
73        }
74        BytesText::wrap(content, self.decoder())
75    }
76
77    /// Returns `Comment`, `CData` or `DocType` event.
78    ///
79    /// `buf` contains data between `<` and `>`:
80    /// - CDATA: `<![CDATA[...]]`
81    /// - Comment: `<!--...--`
82    /// - Doctype (uppercase): `<!D...`
83    /// - Doctype (lowercase): `<!d...`
84    pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
85        debug_assert!(
86            buf.starts_with(b"<!"),
87            "CDATA, comment or DOCTYPE must start from '<!':\n{:?}",
88            crate::utils::Bytes(buf)
89        );
90        debug_assert!(
91            buf.ends_with(b">"),
92            "CDATA, comment or DOCTYPE must end with '>':\n{:?}",
93            crate::utils::Bytes(buf)
94        );
95
96        let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
97            string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
98        };
99
100        let len = buf.len();
101        match bang_type {
102            BangType::Comment if buf.starts_with(b"<!--") => {
103                debug_assert!(
104                    buf.ends_with(b"-->"),
105                    "comment must end with '-->':\n{:?}",
106                    crate::utils::Bytes(buf)
107                );
108                if self.config.check_comments {
109                    // search if '--' not in comments
110                    let mut haystack = &buf[4..len - 3];
111                    let mut off = 0;
112                    while let Some(p) = memchr::memchr(b'-', haystack) {
113                        off += p + 1;
114                        // if next byte after `-` is also `-`, return an error
115                        if buf[4 + off] == b'-' {
116                            // Explanation of the magic:
117                            //
118                            // - `self.offset` just after `>`,
119                            // - `buf` contains `<!-- con--tent --`
120                            // - `p` is counted from byte after `<!--`
121                            //
122                            // <!-- con--tent -->:
123                            // ~~~~~~~~~~~~~~~~~~: - buf
124                            //  :  ===========   : - zone of search (possible values of `p`)
125                            //  :  |---p         : - p is counted from | (| is 0)
126                            //  :  :   :         ^ - self.offset
127                            //  ^  :   :           - self.offset - len
128                            //     ^   :           - self.offset - len + 4
129                            //         ^           - self.offset - len + 4 + p
130                            self.last_error_offset = self.offset - len as u64 + 4 + p as u64;
131                            return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
132                        }
133                        // Continue search after single `-` (+1 to skip it)
134                        haystack = &haystack[p + 1..];
135                    }
136                }
137                Ok(Event::Comment(BytesText::wrap(
138                    // Cut of `<!--` and `-->` from start and end
139                    &buf[4..len - 3],
140                    self.decoder(),
141                )))
142            }
143            // XML requires uppercase only:
144            // https://www.w3.org/TR/xml11/#sec-cdata-sect
145            // Even HTML5 required uppercase only:
146            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
147            BangType::CData if buf.starts_with(b"<![CDATA[") => {
148                debug_assert!(
149                    buf.ends_with(b"]]>"),
150                    "CDATA must end with ']]>':\n{:?}",
151                    crate::utils::Bytes(buf)
152                );
153                Ok(Event::CData(BytesCData::wrap(
154                    // Cut of `<![CDATA[` and `]]>` from start and end
155                    &buf[9..len - 3],
156                    self.decoder(),
157                )))
158            }
159            // XML requires uppercase only, but we will check that on validation stage:
160            // https://www.w3.org/TR/xml11/#sec-prolog-dtd
161            // HTML5 allows mixed case for doctype declarations:
162            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
163            BangType::DocType(DtdParser::Finished) if uncased_starts_with(buf, b"<!DOCTYPE") => {
164                match buf[9..len - 1].iter().position(|&b| !is_whitespace(b)) {
165                    Some(start) => Ok(Event::DocType(BytesText::wrap(
166                        // Cut of `<!DOCTYPE` and any number of spaces from start and `>` from the end
167                        &buf[9 + start..len - 1],
168                        self.decoder(),
169                    ))),
170                    None => {
171                        // Because we here, we at least read `<!DOCTYPE>` and offset after `>`.
172                        // We want report error at place where name is expected - this is just
173                        // before `>`
174                        self.last_error_offset = self.offset - 1;
175                        Err(Error::IllFormed(IllFormedError::MissingDoctypeName))
176                    }
177                }
178            }
179            _ => {
180                // <!....>
181                // ~~~~~~~- `buf` contains that and `self.offset` is after `>`.
182                // ^------- We report error at that position, so we need to subtract buf len
183                self.last_error_offset = self.offset - len as u64;
184                Err(Error::Syntax(bang_type.to_err()))
185            }
186        }
187    }
188
189    /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
190    /// end name matches the last opened start name if `self.config.check_end_names` is set.
191    ///
192    /// `buf` contains data between `<` and up to, including, `>`, for example `</tag>`.
193    pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
194        debug_assert!(
195            buf.starts_with(b"</"),
196            "end tag must start from '</':\n{:?}",
197            crate::utils::Bytes(buf)
198        );
199        debug_assert!(
200            buf.ends_with(b">"),
201            "end tag must end with '>':\n{:?}",
202            crate::utils::Bytes(buf)
203        );
204
205        // Strip the `</` and `>` characters. `content` contains data between `</` and `>`
206        let content = &buf[2..buf.len() - 1];
207        // XML standard permits whitespaces after the markup name in closing tags.
208        // Let's strip them from the buffer before comparing tag names.
209        let name = if self.config.trim_markup_names_in_closing_tags {
210            if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
211                &content[..pos_end_name + 1]
212            } else {
213                content
214            }
215        } else {
216            content
217        };
218
219        let decoder = self.decoder();
220
221        // Get the index in self.opened_buffer of the name of the last opened tag
222        match self.opened_starts.pop() {
223            Some(start) => {
224                if self.config.check_end_names {
225                    let expected = &self.opened_buffer[start..];
226                    if name != expected {
227                        let expected = decoder.decode(expected).unwrap_or_default().into_owned();
228                        // #513: In order to allow error recovery we should drop content of the buffer
229                        self.opened_buffer.truncate(start);
230
231                        // Report error at start of the end tag at `<` character
232                        self.last_error_offset = self.offset - buf.len() as u64;
233                        return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
234                            expected,
235                            found: decoder.decode(name).unwrap_or_default().into_owned(),
236                        }));
237                    }
238                }
239
240                self.opened_buffer.truncate(start);
241            }
242            None => {
243                if !self.config.allow_unmatched_ends {
244                    // Report error at start of the end tag at `<` character
245                    self.last_error_offset = self.offset - buf.len() as u64;
246                    return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
247                        decoder.decode(name).unwrap_or_default().into_owned(),
248                    )));
249                }
250            }
251        }
252
253        Ok(Event::End(BytesEnd::wrap(name.into())))
254    }
255
256    /// `buf` contains data between `<` and `>` and the first byte is `?`.
257    /// `self.offset` already after the `>`
258    ///
259    /// Returns `Decl` or `PI` event
260    pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
261        debug_assert!(
262            buf.starts_with(b"<?"),
263            "processing instruction or XML declaration must start from '<?':\n{:?}",
264            crate::utils::Bytes(buf)
265        );
266        debug_assert!(
267            buf.ends_with(b"?>"),
268            "processing instruction or XML declaration must end with '?>':\n{:?}",
269            crate::utils::Bytes(buf)
270        );
271
272        let len = buf.len();
273        // We accept at least <??>
274        //                    ~~~~ - len = 4
275        if len > 3 {
276            // Cut of `<?` and `?>` from start and end
277            let content = &buf[2..len - 2];
278            let len = content.len();
279
280            if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {
281                let event = BytesDecl::from_start(BytesStart::wrap(content, 3, self.decoder()));
282
283                // Try getting encoding from the declaration event
284                #[cfg(feature = "encoding")]
285                if self.encoding.can_be_refined() {
286                    if let Some(encoding) = event.encoder() {
287                        self.encoding = EncodingRef::XmlDetected(encoding);
288                    }
289                }
290
291                Ok(Event::Decl(event))
292            } else {
293                Ok(Event::PI(BytesPI::wrap(
294                    content,
295                    name_len(content),
296                    self.decoder(),
297                )))
298            }
299        } else {
300            // <?...?>
301            // ~~~~~~~- `buf` contains that and `self.offset` is after `>`.
302            // ^------- We report error at that position, so we need to subtract buf len
303            self.last_error_offset = self.offset - len as u64;
304            Err(Error::Syntax(PiParser(false).eof_error(buf)))
305        }
306    }
307
308    /// Converts content of a tag to a `Start` or an `Empty` event
309    ///
310    /// # Parameters
311    /// - `content`: Content of a tag between `<` and `>`
312    pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> {
313        debug_assert!(
314            content.starts_with(b"<"),
315            "start or empty tag must start from '<':\n{:?}",
316            crate::utils::Bytes(content)
317        );
318        debug_assert!(
319            content.ends_with(b">"),
320            "start or empty tag must end with '>':\n{:?}",
321            crate::utils::Bytes(content)
322        );
323
324        // strip `<`
325        let content = &content[1..];
326        if let Some(content) = content.strip_suffix(b"/>") {
327            // This is self-closed tag `<something/>`
328            let event = BytesStart::wrap(content, name_len(content), self.decoder());
329
330            if self.config.expand_empty_elements {
331                self.state = ParseState::InsideEmpty;
332                self.opened_starts.push(self.opened_buffer.len());
333                self.opened_buffer.extend(event.name().as_ref());
334                Event::Start(event)
335            } else {
336                Event::Empty(event)
337            }
338        } else {
339            // strip `>`
340            let content = &content[..content.len() - 1];
341            let event = BytesStart::wrap(content, name_len(content), self.decoder());
342
343            // #514: Always store names event when .check_end_names == false,
344            // because checks can be temporary disabled and when they would be
345            // enabled, we should have that information
346            self.opened_starts.push(self.opened_buffer.len());
347            self.opened_buffer.extend(event.name().as_ref());
348            Event::Start(event)
349        }
350    }
351
352    #[inline]
353    pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
354        self.state = ParseState::InsideText;
355        let name = self
356            .opened_buffer
357            .split_off(self.opened_starts.pop().unwrap());
358        BytesEnd::wrap(name.into())
359    }
360
361    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
362    ///
363    /// If [`encoding`] feature is enabled, the used encoding may change after
364    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
365    ///
366    /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
367    /// defaults to UTF-8.
368    ///
369    /// [`encoding`]: ../../index.html#encoding
370    pub const fn decoder(&self) -> Decoder {
371        Decoder {
372            #[cfg(feature = "encoding")]
373            encoding: self.encoding.encoding(),
374        }
375    }
376}
377
378impl Default for ReaderState {
379    fn default() -> Self {
380        Self {
381            offset: 0,
382            last_error_offset: 0,
383            state: ParseState::Init,
384            config: Config::default(),
385            opened_buffer: Vec::new(),
386            opened_starts: Vec::new(),
387
388            #[cfg(feature = "encoding")]
389            encoding: EncodingRef::Implicit(UTF_8),
390        }
391    }
392}
393
394impl Debug for ReaderState {
395    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
396        let mut d = f.debug_struct("ReaderState");
397
398        d.field("offset", &self.offset);
399        d.field("last_error_offset", &self.last_error_offset);
400        d.field("state", &self.state);
401        d.field("config", &self.config);
402        d.field("opened_buffer", &Bytes(&self.opened_buffer));
403        d.field("opened_starts", &self.opened_starts);
404
405        #[cfg(feature = "encoding")]
406        d.field("encoding", &self.encoding);
407
408        d.finish()
409    }
410}