quick_xml/parser/dtd.rs
1use crate::parser::{CommentParser, ElementParser, Parser, PiParser};
2
/// State of the parser that skips the contents of a DOCTYPE declaration
/// (see [`DtdParser::feed`]).
///
/// Each variant describes in which part of the declaration the previously fed
/// data has ended, so that skipping can be resumed when the next chunk arrives.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DtdParser {
    /// If inside a PubidLiteral or SystemLiteral, it holds the quote type (either `'` or `"`).
    /// Otherwise, it holds `0` (this is an initial state).
    ///
    /// ```text
    /// [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
    /// ```
    BeforeInternalSubset(u8),
    /// Inside of the `intSubset` rule.
    ///
    /// ```text
    /// [28a] DeclSep    ::= PEReference | S
    /// [28b] intSubset  ::= (markupdecl | DeclSep)*
    /// [29]  markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
    /// ```
    InsideOfInternalSubset,
    /// After `]` but before `>`.
    AfterInternalSubset,
    /// Inside of a comment (started with `<!--`) in the internal subset.
    /// Holds the state of the parser that is used to find the end of the comment.
    InComment(CommentParser),
    /// Inside of a processing instruction (started with `<?`) in the internal subset.
    /// Holds the state of the parser that is used to find the end of the instruction.
    InPi(PiParser),
    /// Inside of an `<!ELEMENT` declaration. There are no places in it where `>`
    /// could be escaped, so the first `>` ends this state.
    ///
    /// This state is also used as a fallback for markup that was not recognized
    /// after enough bytes were seen: such markup is skipped until the first `>`.
    ///
    /// ```text
    /// [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
    /// ```
    InElementDecl,
    /// This state handles ATTLIST, ENTITY and NOTATION elements, i.e. all elements that can have
    /// quoted strings (`'...'` or `"..."`) inside their markup, in which `>` should not be treated
    /// as the end of the markup.
    ///
    /// This state handles the following productions from XML grammar:
    ///
    /// ### ATTLIST
    ///
    /// ```text
    /// [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
    /// [53] AttDef      ::= S Name S AttType S DefaultDecl
    /// [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
    /// ```
    ///
    /// ### ENTITY
    ///
    /// ```text
    /// [70] EntityDecl ::= GEDecl | PEDecl
    /// [71] GEDecl     ::= '<!ENTITY' S Name S EntityDef S? '>'
    /// [72] PEDecl     ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
    /// [73] EntityDef  ::= EntityValue | (ExternalID NDataDecl?)
    /// [74] PEDef      ::= EntityValue | ExternalID
    /// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
    /// [76] NDataDecl  ::= S 'NDATA' S Name
    /// ```
    ///
    /// ### NOTATION
    ///
    /// ```text
    /// [82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
    /// ```
    InQuoteSensitive(ElementParser),
    /// The state where it was not possible to determine which markup it was during the previous iteration. \
    /// It holds the number of bytes read since the start of the markup (the bytes after `<`).
    UndecidedMarkup(usize),
    /// The `>` that closes the DOCTYPE declaration has been found.
    Finished,
}
65
66impl DtdParser {
67 /// Skip DTD contents.
68 ///
69 /// # Parameters (as same as `reader::BangType::parse`)
70 /// - `buf`: buffer with data consumed on previous iterations
71 /// - `chunk`: data read on current iteration and not yet consumed from reader
72 pub fn feed(&mut self, buf: &[u8], chunk: &[u8]) -> Option<usize> {
73 // This method assumes the DTD is well-formed.
74 // Since this crate does not support parsing DTDs, the inability to read non-well-formed DTDs
75 // is not particularly problematic; the only point of interest is reporting well-formed DTDs
76 // to the user without errors.
77
78 let mut cur = chunk;
79 while !cur.is_empty() {
80 match *self {
81 Self::BeforeInternalSubset(0) => {
82 // Find the
83 // - start of quoted string ('...' or "...")
84 // - start of internal subset ([...])
85 // - end of DOCTYPE declaration (>)
86 if let Some(i) = cur
87 .iter()
88 .position(|&b| matches!(b, b'\'' | b'"' | b'[' | b'>'))
89 {
90 let b = cur[i];
91 match b {
92 b'\'' | b'"' => {
93 // SystemLiteral or PubidLiteral
94 *self = Self::BeforeInternalSubset(b);
95 cur = &cur[i + 1..]; // +1 to skip `'` or `"`
96 continue;
97 }
98 b'[' => {
99 *self = Self::InsideOfInternalSubset;
100 cur = &cur[i + 1..]; // +1 to skip `[`
101 continue;
102 }
103 b'>' => {
104 *self = Self::Finished;
105 return Some(chunk.len() - cur.len() + i);
106 }
107 _ => {}
108 }
109 continue;
110 }
111 break;
112 }
113 // Inside the quoted string (this is PubidLiteral or SystemLiteral) we do not want to
114 // recognize other special characters (namely [ and >). Find only the closing quote
115 Self::BeforeInternalSubset(quote) => {
116 // ExternalID handling
117 if let Some(i) = memchr::memchr(quote, cur) {
118 *self = Self::BeforeInternalSubset(0);
119 cur = &cur[i + 1..];
120 continue;
121 }
122 break;
123 }
124 Self::InsideOfInternalSubset => {
125 // Find the end of internal subset (]) or the start of the markup inside (<)
126 if let Some(i) = memchr::memchr2(b']', b'<', cur) {
127 if cur[i] == b']' {
128 *self = Self::AfterInternalSubset;
129 cur = &cur[i + 1..]; // +1 to skip `]`
130 continue;
131 }
132 // +1 to start after `<`
133 if let Some(skip) = self.switch(&cur[i + 1..]) {
134 cur = &cur[i + 1 + skip..]; // +1 to skip `<`
135 continue;
136 }
137 // Keep the number of already looked bytes (started from byte after `<`, so -1),
138 // try to decide after feeding the new chunk.
139 let skipped = cur.len() - i - 1;
140 // The 9-byte work buffer in `UndecidedMarkup` is sized
141 // for `!NOTATION` (the longest keyword). If the chunk
142 // already gave us 9+ bytes after `<` and `switch()`
143 // returned `None`, the markup is definitively not one
144 // of `<!--`, `<![CDATA[`, `<!ELEMENT`, `<!ATTLIST`,
145 // `<!ENTITY`, `<!NOTATION`, so skip until `>` rather
146 // than staging more bytes than the buffer can hold
147 // (which would panic on the slice copy in
148 // `UndecidedMarkup`).
149 if skipped >= 9 {
150 cur = &cur[i + 1..];
151 *self = Self::InElementDecl;
152 continue;
153 }
154 *self = Self::UndecidedMarkup(skipped);
155 }
156 break;
157 }
158 Self::AfterInternalSubset => {
159 if let Some(i) = memchr::memchr(b'>', cur) {
160 *self = Self::Finished;
161 return Some(chunk.len() - cur.len() + i);
162 }
163 break;
164 }
165 Self::InComment(ref mut parser) => {
166 // If comment is ended, return to the main state, otherwise keep in the current state
167 if let Some(i) = parser.feed(cur) {
168 *self = Self::InsideOfInternalSubset;
169 cur = &cur[i..];
170 continue;
171 }
172 break;
173 }
174 Self::InPi(ref mut parser) => {
175 // If processing instruction is ended, return to the main state,
176 // otherwise keep in the current state
177 if let Some(i) = parser.feed(cur) {
178 *self = Self::InsideOfInternalSubset;
179 cur = &cur[i..];
180 continue;
181 }
182 break;
183 }
184 Self::InElementDecl => {
185 // `<!ELEMENT >` does not have places where `>` could be escaped
186 // so the first occurrence ends that state
187 if let Some(i) = memchr::memchr(b'>', cur) {
188 *self = Self::InsideOfInternalSubset;
189 cur = &cur[i + 1..]; // +1 for `>`
190 continue;
191 }
192 break;
193 }
194 Self::InQuoteSensitive(ref mut parser) => {
195 // If ATTLIST, ENTITY or NOTATION is ended, return to the main state,
196 // otherwise keep in the current state
197 if let Some(i) = parser.feed(cur) {
198 *self = Self::InsideOfInternalSubset;
199 cur = &cur[i..];
200 continue;
201 }
202 break;
203 }
204 Self::UndecidedMarkup(skipped) => {
205 // Buffer is long enough to store the longest possible keyword `!NOTATION`
206 let mut bytes = [0u8; 9];
207
208 // Copy the last `skipped` bytes from the previous iteration into buffer,
209 // for example, "!NOT" (skipped = 4 in that case)...
210 bytes[..skipped].copy_from_slice(&buf[buf.len() - skipped..]);
211
212 // ...add new bytes to the buffer from current iteration,
213 // for example, "ATION"...
214 let end = bytes.len().min(skipped + cur.len());
215 bytes[skipped..end].copy_from_slice(&cur[..end - skipped]);
216
217 // ...and try to match over it.
218 // For example, "!NOTATION" will return 9, and we skip 9-4=5 bytes of "ATION"
219 if let Some(skip) = self.switch(&bytes[..end]) {
220 cur = &cur[skip - skipped..];
221 continue;
222 }
223 // No keyword matched. If we have a full 9-byte window the
224 // markup is definitively not one of `<!--`, `<![CDATA[`,
225 // `<!ELEMENT`, `<!ATTLIST`, `<!ENTITY`, `<!NOTATION`, so
226 // fall back to skipping until the closing `>` instead of
227 // accumulating `skipped` past `bytes.len()` (which would
228 // panic on the slice-copy above on a later iteration).
229 if end == bytes.len() {
230 cur = &cur[end - skipped..];
231 *self = Self::InElementDecl;
232 continue;
233 }
234 *self = Self::UndecidedMarkup(skipped + cur.len());
235 break;
236 }
237 Self::Finished => break,
238 }
239 }
240
241 None
242 }
243
244 #[inline]
245 fn switch(&mut self, markup: &[u8]) -> Option<usize> {
246 match markup {
247 [b'?', ..] => {
248 // <?
249 *self = Self::InPi(PiParser(false));
250 Some(1)
251 }
252 [b'!', b'-', b'-', ..] => {
253 // <!--
254 *self = Self::InComment(CommentParser::Seen0);
255 Some(3)
256 }
257 [b'!', b'E', b'L', b'E', b'M', b'E', b'N', b'T', ..] => {
258 // <!ELEMENT
259 *self = Self::InElementDecl;
260 Some(8)
261 }
262 [b'!', b'E', b'N', b'T', b'I', b'T', b'Y', ..] => {
263 // <!ENTITY
264 *self = Self::InQuoteSensitive(ElementParser::Outside);
265 Some(7)
266 }
267 [b'!', b'A', b'T', b'T', b'L', b'I', b'S', b'T', ..] => {
268 // <!ATTLIST
269 *self = Self::InQuoteSensitive(ElementParser::Outside);
270 Some(8)
271 }
272 [b'!', b'N', b'O', b'T', b'A', b'T', b'I', b'O', b'N', ..] => {
273 // <!NOTATION
274 *self = Self::InQuoteSensitive(ElementParser::Outside);
275 Some(9)
276 }
277 // <... - `markup` does not have enough data to determine markup
278 // or markup is not known.
279 // Undecided markup bytes will be written to `buf` to be available on
280 // next iteration.
281 _ => {
282 // FIXME: to correctly report error position in DTD we need to provide
283 // DTD events. For now our task just to skip (correct) DTD, so we postpone
284 // error reporting and go with ending the unknown markup with `>`.
285 if let Some(i) = memchr::memchr(b'>', markup) {
286 *self = Self::InsideOfInternalSubset;
287 Some(i + 1) // +1 to skip `>`
288 } else {
289 None
290 }
291 }
292 }
293 }
294}