// xml5ever/tokenizer/char_ref/mod.rs

// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
9
use super::{TokenSink, XmlTokenizer};
use crate::data;
use crate::macros::unwrap_or_return;
use crate::tendril::StrTendril;
use log::debug;
use markup5ever::buffer_queue::BufferQueue;
use std::borrow::Cow::{self, Borrowed};
use std::char::from_u32;

use self::State::*;
pub use self::Status::*;
21
//ยง tokenizing-character-references
/// The decoded result of tokenizing one character reference.
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}
30
/// Outcome of one `step` of the character-reference tokenizer.
pub enum Status {
    /// More input is needed before any progress can be made.
    Stuck,
    /// Some input was consumed or state advanced; call `step` again.
    Progress,
    /// Tokenization finished; the result is available via `get_result`.
    Done,
}
36
/// Internal state of the character-reference state machine.
#[derive(Debug)]
enum State {
    /// Just after the `&`; nothing consumed yet.
    Begin,
    /// Consumed `&#`; deciding between decimal and hexadecimal.
    Octothorpe,
    /// Consuming digits of a numeric reference; payload is the base (10 or 16).
    Numeric(u32), // base
    /// Digits done; expecting the terminating `;`.
    NumericSemicolon,
    /// Consuming a named reference, matching against the entity table.
    Named,
    /// A name that cannot match any entity; scanning ahead only to decide
    /// whether to report a parse error.
    BogusName,
}
46
/// Sub-tokenizer for a single character reference, driven by repeated
/// calls to `step` and finished with `get_result`.
pub struct CharRefTokenizer {
    /// Current state of the state machine.
    state: State,
    /// Extra terminating character, present iff tokenizing inside an
    /// attribute value (see the NB comment on `new`).
    addnl_allowed: Option<char>,
    /// The finished reference, once tokenization is done.
    result: Option<CharRef>,

    /// Accumulated value of a numeric reference. May wrap on overflow;
    /// `num_too_big` records that the value became invalid.
    num: u32,
    /// Set once the accumulated value exceeds U+10FFFF; the digits are
    /// still parsed but the result is replaced.
    num_too_big: bool,
    /// Whether at least one digit has been consumed.
    seen_digit: bool,
    /// The `x`/`X` marker of a hex reference, kept so it can be unconsumed
    /// if no digits follow.
    hex_marker: Option<char>,

    /// Characters consumed so far for a named reference.
    name_buf_opt: Option<StrTendril>,
    /// Longest full entity match so far, as up to two code points
    /// (second is 0 when the entity expands to a single character).
    name_match: Option<(u32, u32)>,
    /// Length of `name_buf` at the time of the longest full match.
    name_len: usize,
}
61
62impl CharRefTokenizer {
63    // NB: We assume that we have an additional allowed character iff we're
64    // tokenizing in an attribute value.
65    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
66        CharRefTokenizer {
67            state: Begin,
68            addnl_allowed,
69            result: None,
70            num: 0,
71            num_too_big: false,
72            seen_digit: false,
73            hex_marker: None,
74            name_buf_opt: None,
75            name_match: None,
76            name_len: 0,
77        }
78    }
79
80    // A CharRefTokenizer can only tokenize one character reference,
81    // so this method consumes the tokenizer.
82    pub fn get_result(self) -> CharRef {
83        self.result.expect("get_result called before done")
84    }
85
86    fn name_buf(&self) -> &StrTendril {
87        self.name_buf_opt
88            .as_ref()
89            .expect("name_buf missing in named character reference")
90    }
91
92    fn name_buf_mut(&mut self) -> &mut StrTendril {
93        self.name_buf_opt
94            .as_mut()
95            .expect("name_buf missing in named character reference")
96    }
97
98    fn finish_none(&mut self) -> Status {
99        self.result = Some(CharRef {
100            chars: ['\0', '\0'],
101            num_chars: 0,
102        });
103        Done
104    }
105
106    fn finish_one(&mut self, c: char) -> Status {
107        self.result = Some(CharRef {
108            chars: [c, '\0'],
109            num_chars: 1,
110        });
111        Done
112    }
113}
114
impl CharRefTokenizer {
    /// Advance the state machine by (at most) one step.
    ///
    /// Returns `Done` immediately if a result is already available,
    /// otherwise dispatches to the handler for the current state.
    pub fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, base, input),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    /// Look at the first character after `&` and decide which kind of
    /// reference (if any) is being tokenized.
    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            // Whitespace, '<', '&', or the additional allowed character:
            // not a character reference; produce nothing. None of these
            // are consumed here.
            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
            c if Some(c) == self.addnl_allowed => self.finish_none(),

            '#' => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },

            // Anything else begins a named reference. The character is not
            // consumed here; do_named will read it.
            _ => {
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },
        }
    }

    /// After `&#`: an `x`/`X` selects base 16, anything else base 10.
    /// The hex marker is remembered so it can be unconsumed if no digits
    /// follow.
    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c {
            'x' | 'X' => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },

            _ => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
        }
        Progress
    }

    /// Accumulate digits of a numeric reference in the given base.
    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        base: u32,
        input: &BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    // The flag is sticky, so a later wraparound to a small value
                    // cannot reinstate the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            // Non-digit before any digit was seen: `&#`/`&#x` was bogus;
            // put those characters back and give up.
            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            // Non-digit after at least one digit: go look for the semicolon.
            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    /// Consume the terminating `;` of a numeric reference (or report it
    /// missing), then produce the final character.
    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            ';' => tokenizer.discard_char(input),
            _ => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
        };
        self.finish_numeric(tokenizer)
    }

    /// Push `#` (and the hex marker, if any) back onto the input, emit a
    /// parse error, and finish with no result.
    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c);
        }

        tokenizer.unconsume(input, unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    /// Map the accumulated numeric value to a character, substituting
    /// replacements for invalid or legacy values, and emit a parse error
    /// for any value flagged as invalid.
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) -> Status {
        // All values reaching conv() were filtered by the arms below, so
        // from_u32 cannot fail.
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of Unicode range (including overflow during accumulation).
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and surrogate code points: replaced outright.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 controls: substituted via the legacy C1_REPLACEMENTS table
            // where an entry exists; always an error either way.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Other control characters and the U+FDD0..U+FDEF noncharacters:
            // kept as-is, but still an error.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // Noncharacters U+xxFFFE / U+xxFFFF in every plane
            // (low 16 bits are FFFE or FFFF).
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = if tokenizer.opts.exact_errors {
                Cow::from(format!(
                    "Invalid numeric character reference value 0x{:06X}",
                    self.num
                ))
            } else {
                Cow::from("Invalid numeric character reference")
            };
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    /// Consume one character of a named reference and match the buffer
    /// against the entity table, tracking the longest full match seen.
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, Some(c), input),
        }
    }

    /// Emit the "invalid character reference" parse error, including the
    /// offending name when exact errors are requested.
    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) {
        let msg = if tokenizer.opts.exact_errors {
            Cow::from(format!("Invalid character reference &{}", self.name_buf()))
        } else {
            Cow::from("Invalid character reference")
        };
        tokenizer.emit_error(msg);
    }

    /// Push the entire name buffer back onto the input, consuming the buffer.
    fn unconsume_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) {
        tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
    }

    /// Resolve a named reference once matching can no longer continue.
    ///
    /// `end_char` is the character that ended matching (`None` at EOF).
    /// Either emits the matched entity (unconsuming any trailing
    /// over-consumed characters) or unconsumes everything and finishes
    /// with no result.
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        end_char: Option<char>,
        input: &BufferQueue,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(tokenizer, input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf.  Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // "If the character reference is being consumed as part of an
                // attribute, and the last character matched is not a U+003B
                // SEMICOLON character (;), and the next character is either a
                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
                // character, then, for historical reasons, all the characters
                // that were matched after the U+0026 AMPERSAND character (&)
                // must be unconsumed, and nothing is returned. However, if
                // this next character is in fact a U+003D EQUALS SIGN
                // character (=), then this is a parse error"

                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                    (_, ';', _) => false,
                    (Some(_), _, Some('=')) => {
                        tokenizer.emit_error(Borrowed(
                            "Equals sign after character reference in attribute",
                        ));
                        true
                    },
                    (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none()
                } else {
                    // Give back only the characters consumed past the match.
                    tokenizer
                        .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

    /// Scan a name that cannot match any entity. Only decides whether the
    /// bogus reference ends with `;` (a parse error); all consumed
    /// characters are unconsumed and nothing is produced.
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(tokenizer, input);
        self.finish_none()
    }

    /// Drive the state machine to completion at end of input, emitting
    /// errors and unconsuming buffered characters as appropriate for the
    /// state we stopped in.
    pub fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &XmlTokenizer<Sink>,
        input: &BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, None, input)),

                BogusName => {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none();
                },

                Octothorpe => {
                    tokenizer.unconsume(input, StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}