// quick_xml/parser/dtd.rs

use crate::parser::{CommentParser, ElementParser, Parser, PiParser};

3#[derive(Debug, Clone, Copy, PartialEq)]
4pub enum DtdParser {
5    /// If inside a PubidLiteral or SystemLiteral, it holds the quote type (either `'` or `"`).
6    /// Otherwise, it holds `0` (this is an initial state).
7    ///
8    /// ```text
9    /// [28]    doctypedecl     ::=   '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
10    /// ```
11    BeforeInternalSubset(u8),
12    /// Inside of the `intSubset` rule.
13    ///
14    /// ```text
15    /// [28a]   DeclSep         ::=   PEReference | S
16    /// [28b]   intSubset       ::=   (markupdecl | DeclSep)*
17    /// [29]    markupdecl      ::=   elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
18    /// ```
19    InsideOfInternalSubset,
20    /// After `]` but before `>`.
21    AfterInternalSubset,
22    InComment(CommentParser),
23    InPi(PiParser),
24    /// ```text
25    /// [45]    elementdecl     ::=   '<!ELEMENT' S Name S contentspec S? '>'
26    /// ```
27    InElementDecl,
28    /// This state handles ATTLIST, ENTITY and NOTATION elements, i.e. all elements that can have
29    /// quotes strings (`'...'` or `"..."`) inside their markup, in which `>` should not be threated
30    /// as the end of the markup.
31    ///
32    /// This state handles the following productions from XML grammar:
33    ///
34    /// ### ATTLIST
35    ///
36    /// ```text
37    /// [52]    AttlistDecl     ::=   '<!ATTLIST' S Name AttDef* S? '>'
38    /// [53]    AttDef          ::=   S Name S AttType S DefaultDecl
39    /// [60]    DefaultDecl     ::=   '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
40    /// ```
41    ///
42    /// ### ENTITY
43    ///
44    /// ```text
45    /// [70]    EntityDecl      ::=   GEDecl | PEDecl
46    /// [71]    GEDecl          ::=   '<!ENTITY' S Name S EntityDef S? '>'
47    /// [72]    PEDecl          ::=   '<!ENTITY' S '%' S Name S PEDef S? '>'
48    /// [73]    EntityDef       ::=   EntityValue | (ExternalID NDataDecl?)
49    /// [74]    PEDef           ::=   EntityValue | ExternalID
50    /// [75]    ExternalID      ::=   'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
51    /// [76]    NDataDecl       ::=   S 'NDATA' S Name
52    /// ```
53    ///
54    /// ### NOTATION
55    ///
56    /// ```text
57    /// [82]    NotationDecl    ::=   '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
58    /// ```
59    InQuoteSensitive(ElementParser),
60    /// The state where it was not possible to determine which markup it was during the previous iteration.  \
61    /// It holds the number of bytes read since the start of the markup.
62    UndecidedMarkup(usize),
63    Finished,
64}
65
66impl DtdParser {
67    /// Skip DTD contents.
68    ///
69    /// # Parameters (as same as `reader::BangType::parse`)
70    /// - `buf`: buffer with data consumed on previous iterations
71    /// - `chunk`: data read on current iteration and not yet consumed from reader
72    pub fn feed(&mut self, buf: &[u8], chunk: &[u8]) -> Option<usize> {
73        // This method assumes the DTD is well-formed.
74        // Since this crate does not support parsing DTDs, the inability to read non-well-formed DTDs
75        // is not particularly problematic; the only point of interest is reporting well-formed DTDs
76        // to the user without errors.
77
78        let mut cur = chunk;
79        while !cur.is_empty() {
80            match *self {
81                Self::BeforeInternalSubset(0) => {
82                    // Find the
83                    // - start of quoted string ('...' or "...")
84                    // - start of internal subset ([...])
85                    // - end of DOCTYPE declaration (>)
86                    if let Some(i) = cur
87                        .iter()
88                        .position(|&b| matches!(b, b'\'' | b'"' | b'[' | b'>'))
89                    {
90                        let b = cur[i];
91                        match b {
92                            b'\'' | b'"' => {
93                                // SystemLiteral or PubidLiteral
94                                *self = Self::BeforeInternalSubset(b);
95                                cur = &cur[i + 1..]; // +1 to skip `'` or `"`
96                                continue;
97                            }
98                            b'[' => {
99                                *self = Self::InsideOfInternalSubset;
100                                cur = &cur[i + 1..]; // +1 to skip `[`
101                                continue;
102                            }
103                            b'>' => {
104                                *self = Self::Finished;
105                                return Some(chunk.len() - cur.len() + i);
106                            }
107                            _ => {}
108                        }
109                        continue;
110                    }
111                    break;
112                }
113                // Inside the quoted string (this is PubidLiteral or SystemLiteral) we do not want to
114                // recognize other special characters (namely [ and >). Find only the closing quote
115                Self::BeforeInternalSubset(quote) => {
116                    // ExternalID handling
117                    if let Some(i) = memchr::memchr(quote, cur) {
118                        *self = Self::BeforeInternalSubset(0);
119                        cur = &cur[i + 1..];
120                        continue;
121                    }
122                    break;
123                }
124                Self::InsideOfInternalSubset => {
125                    // Find the end of internal subset (]) or the start of the markup inside (<)
126                    if let Some(i) = memchr::memchr2(b']', b'<', cur) {
127                        if cur[i] == b']' {
128                            *self = Self::AfterInternalSubset;
129                            cur = &cur[i + 1..]; // +1 to skip `]`
130                            continue;
131                        }
132                        // +1 to start after `<`
133                        if let Some(skip) = self.switch(&cur[i + 1..]) {
134                            cur = &cur[i + 1 + skip..]; // +1 to skip `<`
135                            continue;
136                        }
137                        // Keep the number of already looked bytes (started from byte after `<`, so -1),
138                        // try to decide after feeding the new chunk.
139                        let skipped = cur.len() - i - 1;
140                        // The 9-byte work buffer in `UndecidedMarkup` is sized
141                        // for `!NOTATION` (the longest keyword). If the chunk
142                        // already gave us 9+ bytes after `<` and `switch()`
143                        // returned `None`, the markup is definitively not one
144                        // of `<!--`, `<![CDATA[`, `<!ELEMENT`, `<!ATTLIST`,
145                        // `<!ENTITY`, `<!NOTATION`, so skip until `>` rather
146                        // than staging more bytes than the buffer can hold
147                        // (which would panic on the slice copy in
148                        // `UndecidedMarkup`).
149                        if skipped >= 9 {
150                            cur = &cur[i + 1..];
151                            *self = Self::InElementDecl;
152                            continue;
153                        }
154                        *self = Self::UndecidedMarkup(skipped);
155                    }
156                    break;
157                }
158                Self::AfterInternalSubset => {
159                    if let Some(i) = memchr::memchr(b'>', cur) {
160                        *self = Self::Finished;
161                        return Some(chunk.len() - cur.len() + i);
162                    }
163                    break;
164                }
165                Self::InComment(ref mut parser) => {
166                    // If comment is ended, return to the main state, otherwise keep in the current state
167                    if let Some(i) = parser.feed(cur) {
168                        *self = Self::InsideOfInternalSubset;
169                        cur = &cur[i..];
170                        continue;
171                    }
172                    break;
173                }
174                Self::InPi(ref mut parser) => {
175                    // If processing instruction is ended, return to the main state,
176                    // otherwise keep in the current state
177                    if let Some(i) = parser.feed(cur) {
178                        *self = Self::InsideOfInternalSubset;
179                        cur = &cur[i..];
180                        continue;
181                    }
182                    break;
183                }
184                Self::InElementDecl => {
185                    // `<!ELEMENT >` does not have places where `>` could be escaped
186                    // so the first occurrence ends that state
187                    if let Some(i) = memchr::memchr(b'>', cur) {
188                        *self = Self::InsideOfInternalSubset;
189                        cur = &cur[i + 1..]; // +1 for `>`
190                        continue;
191                    }
192                    break;
193                }
194                Self::InQuoteSensitive(ref mut parser) => {
195                    // If ATTLIST, ENTITY or NOTATION is ended, return to the main state,
196                    // otherwise keep in the current state
197                    if let Some(i) = parser.feed(cur) {
198                        *self = Self::InsideOfInternalSubset;
199                        cur = &cur[i..];
200                        continue;
201                    }
202                    break;
203                }
204                Self::UndecidedMarkup(skipped) => {
205                    // Buffer is long enough to store the longest possible keyword `!NOTATION`
206                    let mut bytes = [0u8; 9];
207
208                    // Copy the last `skipped` bytes from the previous iteration into buffer,
209                    // for example, "!NOT" (skipped = 4 in that case)...
210                    bytes[..skipped].copy_from_slice(&buf[buf.len() - skipped..]);
211
212                    // ...add new bytes to the buffer from current iteration,
213                    // for example, "ATION"...
214                    let end = bytes.len().min(skipped + cur.len());
215                    bytes[skipped..end].copy_from_slice(&cur[..end - skipped]);
216
217                    // ...and try to match over it.
218                    // For example, "!NOTATION" will return 9, and we skip 9-4=5 bytes of "ATION"
219                    if let Some(skip) = self.switch(&bytes[..end]) {
220                        cur = &cur[skip - skipped..];
221                        continue;
222                    }
223                    // No keyword matched. If we have a full 9-byte window the
224                    // markup is definitively not one of `<!--`, `<![CDATA[`,
225                    // `<!ELEMENT`, `<!ATTLIST`, `<!ENTITY`, `<!NOTATION`, so
226                    // fall back to skipping until the closing `>` instead of
227                    // accumulating `skipped` past `bytes.len()` (which would
228                    // panic on the slice-copy above on a later iteration).
229                    if end == bytes.len() {
230                        cur = &cur[end - skipped..];
231                        *self = Self::InElementDecl;
232                        continue;
233                    }
234                    *self = Self::UndecidedMarkup(skipped + cur.len());
235                    break;
236                }
237                Self::Finished => break,
238            }
239        }
240
241        None
242    }
243
244    #[inline]
245    fn switch(&mut self, markup: &[u8]) -> Option<usize> {
246        match markup {
247            [b'?', ..] => {
248                // <?
249                *self = Self::InPi(PiParser(false));
250                Some(1)
251            }
252            [b'!', b'-', b'-', ..] => {
253                // <!--
254                *self = Self::InComment(CommentParser::Seen0);
255                Some(3)
256            }
257            [b'!', b'E', b'L', b'E', b'M', b'E', b'N', b'T', ..] => {
258                // <!ELEMENT
259                *self = Self::InElementDecl;
260                Some(8)
261            }
262            [b'!', b'E', b'N', b'T', b'I', b'T', b'Y', ..] => {
263                // <!ENTITY
264                *self = Self::InQuoteSensitive(ElementParser::Outside);
265                Some(7)
266            }
267            [b'!', b'A', b'T', b'T', b'L', b'I', b'S', b'T', ..] => {
268                // <!ATTLIST
269                *self = Self::InQuoteSensitive(ElementParser::Outside);
270                Some(8)
271            }
272            [b'!', b'N', b'O', b'T', b'A', b'T', b'I', b'O', b'N', ..] => {
273                // <!NOTATION
274                *self = Self::InQuoteSensitive(ElementParser::Outside);
275                Some(9)
276            }
277            // <... - `markup` does not have enough data to determine markup
278            // or markup is not known.
279            // Undecided markup bytes will be written to `buf` to be available on
280            // next iteration.
281            _ => {
282                // FIXME: to correctly report error position in DTD we need to provide
283                // DTD events. For now our task just to skip (correct) DTD, so we postpone
284                // error reporting and go with ending the unknown markup with `>`.
285                if let Some(i) = memchr::memchr(b'>', markup) {
286                    *self = Self::InsideOfInternalSubset;
287                    Some(i + 1) // +1 to skip `>`
288                } else {
289                    None
290                }
291            }
292        }
293    }
294}