quick_xml/reader/state.rs
1use std::fmt::Debug;
2
3#[cfg(feature = "encoding")]
4use encoding_rs::UTF_8;
5
6use crate::encoding::Decoder;
7use crate::errors::{Error, IllFormedError, Result};
8use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
9use crate::parser::{Parser, PiParser};
10#[cfg(feature = "encoding")]
11use crate::reader::EncodingRef;
12use crate::reader::{BangType, Config, DtdParser, ParseState};
13use crate::utils::{is_whitespace, name_len, Bytes};
14
/// Holds the current state of a reader together with the parser configuration.
///
/// It is independent of the way the data is read: the reader feeds data into it
/// and gets back the produced [`Event`]s.
#[derive(Clone)]
pub(super) struct ReaderState {
    /// Number of bytes read from the source of data since the reader was created
    pub offset: u64,
    /// A snapshot of the position of the last error returned. It can be less than
    /// `offset`, because some errors are more conveniently reported at an earlier
    /// position, and changing `offset` itself is not possible, because
    /// `Error::IllFormed` errors are recoverable.
    pub last_error_offset: u64,
    /// Defines how to process the next byte
    pub state: ParseState,
    /// User-defined settings that affect parsing
    pub config: Config,
    /// Names of all currently started elements which do not have a matching
    /// end element yet, stored one after another without separators.
    ///
    /// For an XML
    ///
    /// ```xml
    /// <root><one/><inner attr="value">|<tag></inner></root>
    /// ```
    /// when the cursor is at the `|` position, the buffer contains:
    ///
    /// ```text
    /// rootinner
    /// ^   ^
    /// ```
    ///
    /// The `^` symbols show which positions are stored in [`Self::opened_starts`]
    /// (0 and 4 in that case).
    opened_buffer: Vec<u8>,
    /// Start indexes of the opened names inside [`Self::opened_buffer`]. See the
    /// documentation of that field for details
    opened_starts: Vec<usize>,

    #[cfg(feature = "encoding")]
    /// Reference to the encoding used to read an XML
    pub encoding: EncodingRef,
}
57
58impl ReaderState {
59 /// Trims end whitespaces from `bytes`, if required, and returns a text event.
60 ///
61 /// # Parameters
62 /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
63 pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> {
64 let mut content = bytes;
65
66 if self.config.trim_text_end {
67 // Skip the ending '<'
68 let len = bytes
69 .iter()
70 .rposition(|&b| !is_whitespace(b))
71 .map_or(0, |p| p + 1);
72 content = &bytes[..len];
73 }
74 BytesText::wrap(content, self.decoder())
75 }
76
77 /// Returns `Comment`, `CData` or `DocType` event.
78 ///
79 /// `buf` contains data between `<` and `>`:
80 /// - CDATA: `<![CDATA[...]]`
81 /// - Comment: `<!--...--`
82 /// - Doctype (uppercase): `<!D...`
83 /// - Doctype (lowercase): `<!d...`
84 pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
85 debug_assert!(
86 buf.starts_with(b"<!"),
87 "CDATA, comment or DOCTYPE must start from '<!':\n{:?}",
88 crate::utils::Bytes(buf)
89 );
90 debug_assert!(
91 buf.ends_with(b">"),
92 "CDATA, comment or DOCTYPE must end with '>':\n{:?}",
93 crate::utils::Bytes(buf)
94 );
95
96 let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
97 string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
98 };
99
100 let len = buf.len();
101 match bang_type {
102 BangType::Comment if buf.starts_with(b"<!--") => {
103 debug_assert!(
104 buf.ends_with(b"-->"),
105 "comment must end with '-->':\n{:?}",
106 crate::utils::Bytes(buf)
107 );
108 if self.config.check_comments {
109 // search if '--' not in comments
110 let mut haystack = &buf[4..len - 3];
111 let mut off = 0;
112 while let Some(p) = memchr::memchr(b'-', haystack) {
113 off += p + 1;
114 // if next byte after `-` is also `-`, return an error
115 if buf[4 + off] == b'-' {
116 // Explanation of the magic:
117 //
118 // - `self.offset` just after `>`,
119 // - `buf` contains `<!-- con--tent --`
120 // - `p` is counted from byte after `<!--`
121 //
122 // <!-- con--tent -->:
123 // ~~~~~~~~~~~~~~~~~~: - buf
124 // : =========== : - zone of search (possible values of `p`)
125 // : |---p : - p is counted from | (| is 0)
126 // : : : ^ - self.offset
127 // ^ : : - self.offset - len
128 // ^ : - self.offset - len + 4
129 // ^ - self.offset - len + 4 + p
130 self.last_error_offset = self.offset - len as u64 + 4 + p as u64;
131 return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
132 }
133 // Continue search after single `-` (+1 to skip it)
134 haystack = &haystack[p + 1..];
135 }
136 }
137 Ok(Event::Comment(BytesText::wrap(
138 // Cut of `<!--` and `-->` from start and end
139 &buf[4..len - 3],
140 self.decoder(),
141 )))
142 }
143 // XML requires uppercase only:
144 // https://www.w3.org/TR/xml11/#sec-cdata-sect
145 // Even HTML5 required uppercase only:
146 // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
147 BangType::CData if buf.starts_with(b"<![CDATA[") => {
148 debug_assert!(
149 buf.ends_with(b"]]>"),
150 "CDATA must end with ']]>':\n{:?}",
151 crate::utils::Bytes(buf)
152 );
153 Ok(Event::CData(BytesCData::wrap(
154 // Cut of `<![CDATA[` and `]]>` from start and end
155 &buf[9..len - 3],
156 self.decoder(),
157 )))
158 }
159 // XML requires uppercase only, but we will check that on validation stage:
160 // https://www.w3.org/TR/xml11/#sec-prolog-dtd
161 // HTML5 allows mixed case for doctype declarations:
162 // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
163 BangType::DocType(DtdParser::Finished) if uncased_starts_with(buf, b"<!DOCTYPE") => {
164 match buf[9..len - 1].iter().position(|&b| !is_whitespace(b)) {
165 Some(start) => Ok(Event::DocType(BytesText::wrap(
166 // Cut of `<!DOCTYPE` and any number of spaces from start and `>` from the end
167 &buf[9 + start..len - 1],
168 self.decoder(),
169 ))),
170 None => {
171 // Because we here, we at least read `<!DOCTYPE>` and offset after `>`.
172 // We want report error at place where name is expected - this is just
173 // before `>`
174 self.last_error_offset = self.offset - 1;
175 Err(Error::IllFormed(IllFormedError::MissingDoctypeName))
176 }
177 }
178 }
179 _ => {
180 // <!....>
181 // ~~~~~~~- `buf` contains that and `self.offset` is after `>`.
182 // ^------- We report error at that position, so we need to subtract buf len
183 self.last_error_offset = self.offset - len as u64;
184 Err(Error::Syntax(bang_type.to_err()))
185 }
186 }
187 }
188
189 /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
190 /// end name matches the last opened start name if `self.config.check_end_names` is set.
191 ///
192 /// `buf` contains data between `<` and up to, including, `>`, for example `</tag>`.
193 pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
194 debug_assert!(
195 buf.starts_with(b"</"),
196 "end tag must start from '</':\n{:?}",
197 crate::utils::Bytes(buf)
198 );
199 debug_assert!(
200 buf.ends_with(b">"),
201 "end tag must end with '>':\n{:?}",
202 crate::utils::Bytes(buf)
203 );
204
205 // Strip the `</` and `>` characters. `content` contains data between `</` and `>`
206 let content = &buf[2..buf.len() - 1];
207 // XML standard permits whitespaces after the markup name in closing tags.
208 // Let's strip them from the buffer before comparing tag names.
209 let name = if self.config.trim_markup_names_in_closing_tags {
210 if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
211 &content[..pos_end_name + 1]
212 } else {
213 content
214 }
215 } else {
216 content
217 };
218
219 let decoder = self.decoder();
220
221 // Get the index in self.opened_buffer of the name of the last opened tag
222 match self.opened_starts.pop() {
223 Some(start) => {
224 if self.config.check_end_names {
225 let expected = &self.opened_buffer[start..];
226 if name != expected {
227 let expected = decoder.decode(expected).unwrap_or_default().into_owned();
228 // #513: In order to allow error recovery we should drop content of the buffer
229 self.opened_buffer.truncate(start);
230
231 // Report error at start of the end tag at `<` character
232 self.last_error_offset = self.offset - buf.len() as u64;
233 return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
234 expected,
235 found: decoder.decode(name).unwrap_or_default().into_owned(),
236 }));
237 }
238 }
239
240 self.opened_buffer.truncate(start);
241 }
242 None => {
243 if !self.config.allow_unmatched_ends {
244 // Report error at start of the end tag at `<` character
245 self.last_error_offset = self.offset - buf.len() as u64;
246 return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
247 decoder.decode(name).unwrap_or_default().into_owned(),
248 )));
249 }
250 }
251 }
252
253 Ok(Event::End(BytesEnd::wrap(name.into())))
254 }
255
256 /// `buf` contains data between `<` and `>` and the first byte is `?`.
257 /// `self.offset` already after the `>`
258 ///
259 /// Returns `Decl` or `PI` event
260 pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
261 debug_assert!(
262 buf.starts_with(b"<?"),
263 "processing instruction or XML declaration must start from '<?':\n{:?}",
264 crate::utils::Bytes(buf)
265 );
266 debug_assert!(
267 buf.ends_with(b"?>"),
268 "processing instruction or XML declaration must end with '?>':\n{:?}",
269 crate::utils::Bytes(buf)
270 );
271
272 let len = buf.len();
273 // We accept at least <??>
274 // ~~~~ - len = 4
275 if len > 3 {
276 // Cut of `<?` and `?>` from start and end
277 let content = &buf[2..len - 2];
278 let len = content.len();
279
280 if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {
281 let event = BytesDecl::from_start(BytesStart::wrap(content, 3, self.decoder()));
282
283 // Try getting encoding from the declaration event
284 #[cfg(feature = "encoding")]
285 if self.encoding.can_be_refined() {
286 if let Some(encoding) = event.encoder() {
287 self.encoding = EncodingRef::XmlDetected(encoding);
288 }
289 }
290
291 Ok(Event::Decl(event))
292 } else {
293 Ok(Event::PI(BytesPI::wrap(
294 content,
295 name_len(content),
296 self.decoder(),
297 )))
298 }
299 } else {
300 // <?...?>
301 // ~~~~~~~- `buf` contains that and `self.offset` is after `>`.
302 // ^------- We report error at that position, so we need to subtract buf len
303 self.last_error_offset = self.offset - len as u64;
304 Err(Error::Syntax(PiParser(false).eof_error(buf)))
305 }
306 }
307
308 /// Converts content of a tag to a `Start` or an `Empty` event
309 ///
310 /// # Parameters
311 /// - `content`: Content of a tag between `<` and `>`
312 pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> {
313 debug_assert!(
314 content.starts_with(b"<"),
315 "start or empty tag must start from '<':\n{:?}",
316 crate::utils::Bytes(content)
317 );
318 debug_assert!(
319 content.ends_with(b">"),
320 "start or empty tag must end with '>':\n{:?}",
321 crate::utils::Bytes(content)
322 );
323
324 // strip `<`
325 let content = &content[1..];
326 if let Some(content) = content.strip_suffix(b"/>") {
327 // This is self-closed tag `<something/>`
328 let event = BytesStart::wrap(content, name_len(content), self.decoder());
329
330 if self.config.expand_empty_elements {
331 self.state = ParseState::InsideEmpty;
332 self.opened_starts.push(self.opened_buffer.len());
333 self.opened_buffer.extend(event.name().as_ref());
334 Event::Start(event)
335 } else {
336 Event::Empty(event)
337 }
338 } else {
339 // strip `>`
340 let content = &content[..content.len() - 1];
341 let event = BytesStart::wrap(content, name_len(content), self.decoder());
342
343 // #514: Always store names event when .check_end_names == false,
344 // because checks can be temporary disabled and when they would be
345 // enabled, we should have that information
346 self.opened_starts.push(self.opened_buffer.len());
347 self.opened_buffer.extend(event.name().as_ref());
348 Event::Start(event)
349 }
350 }
351
352 #[inline]
353 pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
354 self.state = ParseState::InsideText;
355 let name = self
356 .opened_buffer
357 .split_off(self.opened_starts.pop().unwrap());
358 BytesEnd::wrap(name.into())
359 }
360
361 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
362 ///
363 /// If [`encoding`] feature is enabled, the used encoding may change after
364 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
365 ///
366 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
367 /// defaults to UTF-8.
368 ///
369 /// [`encoding`]: ../../index.html#encoding
370 pub const fn decoder(&self) -> Decoder {
371 Decoder {
372 #[cfg(feature = "encoding")]
373 encoding: self.encoding.encoding(),
374 }
375 }
376}
377
378impl Default for ReaderState {
379 fn default() -> Self {
380 Self {
381 offset: 0,
382 last_error_offset: 0,
383 state: ParseState::Init,
384 config: Config::default(),
385 opened_buffer: Vec::new(),
386 opened_starts: Vec::new(),
387
388 #[cfg(feature = "encoding")]
389 encoding: EncodingRef::Implicit(UTF_8),
390 }
391 }
392}
393
394impl Debug for ReaderState {
395 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
396 let mut d = f.debug_struct("ReaderState");
397
398 d.field("offset", &self.offset);
399 d.field("last_error_offset", &self.last_error_offset);
400 d.field("state", &self.state);
401 d.field("config", &self.config);
402 d.field("opened_buffer", &Bytes(&self.opened_buffer));
403 d.field("opened_starts", &self.opened_starts);
404
405 #[cfg(feature = "encoding")]
406 d.field("encoding", &self.encoding);
407
408 d.finish()
409 }
410}