1use super::{TokenSink, XmlTokenizer};
11use crate::data;
12use crate::macros::unwrap_or_return;
13use crate::tendril::StrTendril;
14use log::debug;
15use markup5ever::buffer_queue::BufferQueue;
16use std::borrow::Cow::{self, Borrowed};
17use std::char::from_u32;
18
19use self::State::*;
20pub use self::Status::*;
21
/// The outcome of tokenizing a character reference: zero, one, or two characters.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharRef {
    /// The resulting character(s). Slots that are not in use hold `'\0'`.
    pub chars: [char; 2],

    /// How many leading slots of `chars` are valid (0, 1, or 2).
    pub num_chars: u8,
}
30
/// What a single call to `CharRefTokenizer::step` accomplished.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Status {
    /// The tokenizer could not supply a character, so nothing changed.
    Stuck,
    /// The state machine advanced, but no result is available yet.
    Progress,
    /// A result has been stored and can be retrieved.
    Done,
}
36
/// Internal states of the character-reference state machine.
#[derive(Debug)]
enum State {
    /// No character of the reference has been examined yet.
    Begin,
    /// A `#` has been seen and discarded; the next character decides the base.
    Octothorpe,
    /// Reading the digits of a numeric reference. The payload is the numeric
    /// base (10 or 16).
    Numeric(u32),
    /// The digits are finished; a `;` is expected next.
    NumericSemicolon,
    /// Accumulating a name and looking it up in the named-entity table.
    Named,
    /// The name cannot match any entity; alphanumerics are still consumed to
    /// find out whether the bogus name ends in `;`.
    BogusName,
}
46
47pub struct CharRefTokenizer {
48 state: State,
49 addnl_allowed: Option<char>,
50 result: Option<CharRef>,
51
52 num: u32,
53 num_too_big: bool,
54 seen_digit: bool,
55 hex_marker: Option<char>,
56
57 name_buf_opt: Option<StrTendril>,
58 name_match: Option<(u32, u32)>,
59 name_len: usize,
60}
61
62impl CharRefTokenizer {
63 pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
66 CharRefTokenizer {
67 state: Begin,
68 addnl_allowed,
69 result: None,
70 num: 0,
71 num_too_big: false,
72 seen_digit: false,
73 hex_marker: None,
74 name_buf_opt: None,
75 name_match: None,
76 name_len: 0,
77 }
78 }
79
80 pub fn get_result(self) -> CharRef {
83 self.result.expect("get_result called before done")
84 }
85
86 fn name_buf(&self) -> &StrTendril {
87 self.name_buf_opt
88 .as_ref()
89 .expect("name_buf missing in named character reference")
90 }
91
92 fn name_buf_mut(&mut self) -> &mut StrTendril {
93 self.name_buf_opt
94 .as_mut()
95 .expect("name_buf missing in named character reference")
96 }
97
98 fn finish_none(&mut self) -> Status {
99 self.result = Some(CharRef {
100 chars: ['\0', '\0'],
101 num_chars: 0,
102 });
103 Done
104 }
105
106 fn finish_one(&mut self, c: char) -> Status {
107 self.result = Some(CharRef {
108 chars: [c, '\0'],
109 num_chars: 1,
110 });
111 Done
112 }
113}
114
115impl CharRefTokenizer {
116 pub fn step<Sink: TokenSink>(
117 &mut self,
118 tokenizer: &XmlTokenizer<Sink>,
119 input: &BufferQueue,
120 ) -> Status {
121 if self.result.is_some() {
122 return Done;
123 }
124
125 debug!("char ref tokenizer stepping in state {:?}", self.state);
126 match self.state {
127 Begin => self.do_begin(tokenizer, input),
128 Octothorpe => self.do_octothorpe(tokenizer, input),
129 Numeric(base) => self.do_numeric(tokenizer, base, input),
130 NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
131 Named => self.do_named(tokenizer, input),
132 BogusName => self.do_bogus_name(tokenizer, input),
133 }
134 }
135
136 fn do_begin<Sink: TokenSink>(
137 &mut self,
138 tokenizer: &XmlTokenizer<Sink>,
139 input: &BufferQueue,
140 ) -> Status {
141 match unwrap_or_return!(tokenizer.peek(input), Stuck) {
142 '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
143 c if Some(c) == self.addnl_allowed => self.finish_none(),
144
145 '#' => {
146 tokenizer.discard_char(input);
147 self.state = Octothorpe;
148 Progress
149 },
150
151 _ => {
152 self.state = Named;
153 self.name_buf_opt = Some(StrTendril::new());
154 Progress
155 },
156 }
157 }
158
159 fn do_octothorpe<Sink: TokenSink>(
160 &mut self,
161 tokenizer: &XmlTokenizer<Sink>,
162 input: &BufferQueue,
163 ) -> Status {
164 let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
165 match c {
166 'x' | 'X' => {
167 tokenizer.discard_char(input);
168 self.hex_marker = Some(c);
169 self.state = Numeric(16);
170 },
171
172 _ => {
173 self.hex_marker = None;
174 self.state = Numeric(10);
175 },
176 }
177 Progress
178 }
179
180 fn do_numeric<Sink: TokenSink>(
181 &mut self,
182 tokenizer: &XmlTokenizer<Sink>,
183 base: u32,
184 input: &BufferQueue,
185 ) -> Status {
186 let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
187 match c.to_digit(base) {
188 Some(n) => {
189 tokenizer.discard_char(input);
190 self.num = self.num.wrapping_mul(base);
191 if self.num > 0x10FFFF {
192 self.num_too_big = true;
195 }
196 self.num = self.num.wrapping_add(n);
197 self.seen_digit = true;
198 Progress
199 },
200
201 None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
202
203 None => {
204 self.state = NumericSemicolon;
205 Progress
206 },
207 }
208 }
209
210 fn do_numeric_semicolon<Sink: TokenSink>(
211 &mut self,
212 tokenizer: &XmlTokenizer<Sink>,
213 input: &BufferQueue,
214 ) -> Status {
215 match unwrap_or_return!(tokenizer.peek(input), Stuck) {
216 ';' => tokenizer.discard_char(input),
217 _ => tokenizer.emit_error(Borrowed(
218 "Semicolon missing after numeric character reference",
219 )),
220 };
221 self.finish_numeric(tokenizer)
222 }
223
224 fn unconsume_numeric<Sink: TokenSink>(
225 &mut self,
226 tokenizer: &XmlTokenizer<Sink>,
227 input: &BufferQueue,
228 ) -> Status {
229 let mut unconsume = StrTendril::from_char('#');
230 if let Some(c) = self.hex_marker {
231 unconsume.push_char(c);
232 }
233
234 tokenizer.unconsume(input, unconsume);
235 tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
236 self.finish_none()
237 }
238
239 fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) -> Status {
240 fn conv(n: u32) -> char {
241 from_u32(n).expect("invalid char missed by error handling cases")
242 }
243
244 let (c, error) = match self.num {
245 n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
246 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
247
248 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
249 Some(c) => (c, true),
250 None => (conv(self.num), true),
251 },
252
253 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
254
255 n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
256
257 n => (conv(n), false),
258 };
259
260 if error {
261 let msg = if tokenizer.opts.exact_errors {
262 Cow::from(format!(
263 "Invalid numeric character reference value 0x{:06X}",
264 self.num
265 ))
266 } else {
267 Cow::from("Invalid numeric character reference")
268 };
269 tokenizer.emit_error(msg);
270 }
271
272 self.finish_one(c)
273 }
274
275 fn do_named<Sink: TokenSink>(
276 &mut self,
277 tokenizer: &XmlTokenizer<Sink>,
278 input: &BufferQueue,
279 ) -> Status {
280 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
281 self.name_buf_mut().push_char(c);
282 match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
283 Some(&m) => {
285 if m.0 != 0 {
286 self.name_match = Some(m);
288 self.name_len = self.name_buf().len();
289 }
290 Progress
292 },
293
294 None => self.finish_named(tokenizer, Some(c), input),
296 }
297 }
298
299 fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) {
300 let msg = if tokenizer.opts.exact_errors {
301 Cow::from(format!("Invalid character reference &{}", self.name_buf()))
302 } else {
303 Cow::from("Invalid character reference")
304 };
305 tokenizer.emit_error(msg);
306 }
307
308 fn unconsume_name<Sink: TokenSink>(
309 &mut self,
310 tokenizer: &XmlTokenizer<Sink>,
311 input: &BufferQueue,
312 ) {
313 tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
314 }
315
316 fn finish_named<Sink: TokenSink>(
317 &mut self,
318 tokenizer: &XmlTokenizer<Sink>,
319 end_char: Option<char>,
320 input: &BufferQueue,
321 ) -> Status {
322 match self.name_match {
323 None => {
324 match end_char {
325 Some(c) if c.is_ascii_alphanumeric() => {
326 self.state = BogusName;
329 return Progress;
330 },
331
332 Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
334
335 _ => (),
336 }
337 self.unconsume_name(tokenizer, input);
338 self.finish_none()
339 },
340
341 Some((c1, c2)) => {
342 let name_len = self.name_len;
351 assert!(name_len > 0);
352 let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
353
354 let next_after = if name_len == self.name_buf().len() {
357 None
358 } else {
359 Some(self.name_buf()[name_len..].chars().next().unwrap())
360 };
361
362 let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
373 (_, ';', _) => false,
374 (Some(_), _, Some('=')) => {
375 tokenizer.emit_error(Borrowed(
376 "Equals sign after character reference in attribute",
377 ));
378 true
379 },
380 (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
381 _ => {
382 tokenizer.emit_error(Borrowed(
383 "Character reference does not end with semicolon",
384 ));
385 false
386 },
387 };
388
389 if unconsume_all {
390 self.unconsume_name(tokenizer, input);
391 self.finish_none()
392 } else {
393 tokenizer
394 .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
395 self.result = Some(CharRef {
396 chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
397 num_chars: if c2 == 0 { 1 } else { 2 },
398 });
399 Done
400 }
401 },
402 }
403 }
404
405 fn do_bogus_name<Sink: TokenSink>(
406 &mut self,
407 tokenizer: &XmlTokenizer<Sink>,
408 input: &BufferQueue,
409 ) -> Status {
410 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
411 self.name_buf_mut().push_char(c);
412 match c {
413 _ if c.is_ascii_alphanumeric() => return Progress,
414 ';' => self.emit_name_error(tokenizer),
415 _ => (),
416 }
417 self.unconsume_name(tokenizer, input);
418 self.finish_none()
419 }
420
421 pub fn end_of_file<Sink: TokenSink>(
422 &mut self,
423 tokenizer: &XmlTokenizer<Sink>,
424 input: &BufferQueue,
425 ) {
426 while self.result.is_none() {
427 match self.state {
428 Begin => drop(self.finish_none()),
429
430 Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
431
432 Numeric(_) | NumericSemicolon => {
433 tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
434 self.finish_numeric(tokenizer);
435 },
436
437 Named => drop(self.finish_named(tokenizer, None, input)),
438
439 BogusName => {
440 self.unconsume_name(tokenizer, input);
441 self.finish_none();
442 },
443
444 Octothorpe => {
445 tokenizer.unconsume(input, StrTendril::from_slice("#"));
446 tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
447 self.finish_none();
448 },
449 }
450 }
451 }
452}