1use super::{TokenSink, XmlTokenizer};
11use crate::data;
12use crate::tendril::StrTendril;
13use log::debug;
14use markup5ever::buffer_queue::BufferQueue;
15use std::borrow::Cow::{self, Borrowed};
16use std::char::from_u32;
17
18use self::State::*;
19pub use self::Status::*;
20
/// The outcome of tokenizing one character reference: zero, one, or two
/// characters.
///
/// The common traits are derived so results can be copied, compared, and
/// printed by callers and tests.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CharRef {
    /// The resulting character(s). The tokenizer in this module fills the
    /// slots past `num_chars` with `'\0'`.
    pub chars: [char; 2],

    /// How many leading slots of `chars` are valid (0, 1, or 2).
    pub num_chars: u8,
}
29
/// Progress report returned by the character reference tokenizer after each
/// step.
///
/// The common traits are derived so a status can be copied, compared with
/// `==`, and logged.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Status {
    /// No character was available from the input; the step could not proceed.
    Stuck,
    /// The state machine advanced but has not produced a result yet.
    Progress,
    /// A result has been stored and can be retrieved.
    Done,
}
35
/// Internal states of the character reference state machine.
#[derive(Debug)]
enum State {
    /// Nothing has been examined yet.
    Begin,
    /// A `#` has been consumed; the next character selects the numeric base.
    Octothorpe,
    /// Reading the digits of a numeric reference; the payload is the base
    /// (10 or 16).
    Numeric(u32),
    /// The digits are finished; an optional `;` is expected next.
    NumericSemicolon,
    /// Accumulating the characters of a named reference.
    Named,
    /// The name cannot match any entity; skipping the remaining alphanumerics.
    BogusName,
}
45
46pub struct CharRefTokenizer {
47 state: State,
48 addnl_allowed: Option<char>,
49 result: Option<CharRef>,
50
51 num: u32,
52 num_too_big: bool,
53 seen_digit: bool,
54 hex_marker: Option<char>,
55
56 name_buf_opt: Option<StrTendril>,
57 name_match: Option<(u32, u32)>,
58 name_len: usize,
59}
60
61impl CharRefTokenizer {
62 pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
65 CharRefTokenizer {
66 state: Begin,
67 addnl_allowed,
68 result: None,
69 num: 0,
70 num_too_big: false,
71 seen_digit: false,
72 hex_marker: None,
73 name_buf_opt: None,
74 name_match: None,
75 name_len: 0,
76 }
77 }
78
79 pub fn get_result(self) -> CharRef {
82 self.result.expect("get_result called before done")
83 }
84
85 fn name_buf(&self) -> &StrTendril {
86 self.name_buf_opt
87 .as_ref()
88 .expect("name_buf missing in named character reference")
89 }
90
91 fn name_buf_mut(&mut self) -> &mut StrTendril {
92 self.name_buf_opt
93 .as_mut()
94 .expect("name_buf missing in named character reference")
95 }
96
97 fn finish_none(&mut self) -> Status {
98 self.result = Some(CharRef {
99 chars: ['\0', '\0'],
100 num_chars: 0,
101 });
102 Done
103 }
104
105 fn finish_one(&mut self, c: char) -> Status {
106 self.result = Some(CharRef {
107 chars: [c, '\0'],
108 num_chars: 1,
109 });
110 Done
111 }
112}
113
114impl CharRefTokenizer {
115 pub fn step<Sink: TokenSink>(
116 &mut self,
117 tokenizer: &XmlTokenizer<Sink>,
118 input: &BufferQueue,
119 ) -> Status {
120 if self.result.is_some() {
121 return Done;
122 }
123
124 debug!("char ref tokenizer stepping in state {:?}", self.state);
125 match self.state {
126 Begin => self.do_begin(tokenizer, input),
127 Octothorpe => self.do_octothorpe(tokenizer, input),
128 Numeric(base) => self.do_numeric(tokenizer, base, input),
129 NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
130 Named => self.do_named(tokenizer, input),
131 BogusName => self.do_bogus_name(tokenizer, input),
132 }
133 }
134
135 fn do_begin<Sink: TokenSink>(
136 &mut self,
137 tokenizer: &XmlTokenizer<Sink>,
138 input: &BufferQueue,
139 ) -> Status {
140 match tokenizer.peek(input) {
141 Some('\t' | '\n' | '\x0C' | ' ' | '<' | '&') => self.finish_none(),
142 Some(c) if Some(c) == self.addnl_allowed => self.finish_none(),
143 Some('#') => {
144 tokenizer.discard_char(input);
145 self.state = Octothorpe;
146 Progress
147 },
148 Some(_) => {
149 self.state = Named;
150 self.name_buf_opt = Some(StrTendril::new());
151 Progress
152 },
153 None => Stuck,
154 }
155 }
156
157 fn do_octothorpe<Sink: TokenSink>(
158 &mut self,
159 tokenizer: &XmlTokenizer<Sink>,
160 input: &BufferQueue,
161 ) -> Status {
162 match tokenizer.peek(input) {
163 Some(c @ ('x' | 'X')) => {
164 tokenizer.discard_char(input);
165 self.hex_marker = Some(c);
166 self.state = Numeric(16);
167 },
168 Some(_) => {
169 self.hex_marker = None;
170 self.state = Numeric(10);
171 },
172 None => return Stuck,
173 }
174 Progress
175 }
176
177 fn do_numeric<Sink: TokenSink>(
178 &mut self,
179 tokenizer: &XmlTokenizer<Sink>,
180 base: u32,
181 input: &BufferQueue,
182 ) -> Status {
183 let Some(c) = tokenizer.peek(input) else {
184 return Stuck;
185 };
186 match c.to_digit(base) {
187 Some(n) => {
188 tokenizer.discard_char(input);
189 self.num = self.num.wrapping_mul(base);
190 if self.num > 0x10FFFF {
191 self.num_too_big = true;
194 }
195 self.num = self.num.wrapping_add(n);
196 self.seen_digit = true;
197 Progress
198 },
199
200 None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
201
202 None => {
203 self.state = NumericSemicolon;
204 Progress
205 },
206 }
207 }
208
209 fn do_numeric_semicolon<Sink: TokenSink>(
210 &mut self,
211 tokenizer: &XmlTokenizer<Sink>,
212 input: &BufferQueue,
213 ) -> Status {
214 match tokenizer.peek(input) {
215 Some(';') => tokenizer.discard_char(input),
216 Some(_) => tokenizer.emit_error(Borrowed(
217 "Semicolon missing after numeric character reference",
218 )),
219 None => return Stuck,
220 };
221 self.finish_numeric(tokenizer)
222 }
223
224 fn unconsume_numeric<Sink: TokenSink>(
225 &mut self,
226 tokenizer: &XmlTokenizer<Sink>,
227 input: &BufferQueue,
228 ) -> Status {
229 let mut unconsume = StrTendril::from_char('#');
230 if let Some(c) = self.hex_marker {
231 unconsume.push_char(c);
232 }
233
234 tokenizer.unconsume(input, unconsume);
235 tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
236 self.finish_none()
237 }
238
239 fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) -> Status {
240 fn conv(n: u32) -> char {
241 from_u32(n).expect("invalid char missed by error handling cases")
242 }
243
244 let (c, error) = match self.num {
245 n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
246 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
247
248 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
249 Some(c) => (c, true),
250 None => (conv(self.num), true),
251 },
252
253 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
254
255 n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
256
257 n => (conv(n), false),
258 };
259
260 if error {
261 let msg = if tokenizer.opts.exact_errors {
262 Cow::from(format!(
263 "Invalid numeric character reference value 0x{:06X}",
264 self.num
265 ))
266 } else {
267 Cow::from("Invalid numeric character reference")
268 };
269 tokenizer.emit_error(msg);
270 }
271
272 self.finish_one(c)
273 }
274
275 fn do_named<Sink: TokenSink>(
276 &mut self,
277 tokenizer: &XmlTokenizer<Sink>,
278 input: &BufferQueue,
279 ) -> Status {
280 let Some(c) = tokenizer.get_char(input) else {
281 return Stuck;
282 };
283 self.name_buf_mut().push_char(c);
284 match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
285 Some(&m) => {
287 if m.0 != 0 {
288 self.name_match = Some(m);
290 self.name_len = self.name_buf().len();
291 }
292 Progress
294 },
295
296 None => self.finish_named(tokenizer, Some(c), input),
298 }
299 }
300
301 fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) {
302 let msg = if tokenizer.opts.exact_errors {
303 Cow::from(format!("Invalid character reference &{}", self.name_buf()))
304 } else {
305 Cow::from("Invalid character reference")
306 };
307 tokenizer.emit_error(msg);
308 }
309
310 fn unconsume_name<Sink: TokenSink>(
311 &mut self,
312 tokenizer: &XmlTokenizer<Sink>,
313 input: &BufferQueue,
314 ) {
315 tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
316 }
317
318 fn finish_named<Sink: TokenSink>(
319 &mut self,
320 tokenizer: &XmlTokenizer<Sink>,
321 end_char: Option<char>,
322 input: &BufferQueue,
323 ) -> Status {
324 match self.name_match {
325 None => {
326 match end_char {
327 Some(c) if c.is_ascii_alphanumeric() => {
328 self.state = BogusName;
331 return Progress;
332 },
333
334 Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
336
337 _ => (),
338 }
339 self.unconsume_name(tokenizer, input);
340 self.finish_none()
341 },
342
343 Some((c1, c2)) => {
344 let name_len = self.name_len;
353 assert!(name_len > 0);
354 let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
355
356 let next_after = if name_len == self.name_buf().len() {
359 None
360 } else {
361 Some(self.name_buf()[name_len..].chars().next().unwrap())
362 };
363
364 let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
375 (_, ';', _) => false,
376 (Some(_), _, Some('=')) => {
377 tokenizer.emit_error(Borrowed(
378 "Equals sign after character reference in attribute",
379 ));
380 true
381 },
382 (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
383 _ => {
384 tokenizer.emit_error(Borrowed(
385 "Character reference does not end with semicolon",
386 ));
387 false
388 },
389 };
390
391 if unconsume_all {
392 self.unconsume_name(tokenizer, input);
393 self.finish_none()
394 } else {
395 tokenizer
396 .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
397 self.result = Some(CharRef {
398 chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
399 num_chars: if c2 == 0 { 1 } else { 2 },
400 });
401 Done
402 }
403 },
404 }
405 }
406
407 fn do_bogus_name<Sink: TokenSink>(
408 &mut self,
409 tokenizer: &XmlTokenizer<Sink>,
410 input: &BufferQueue,
411 ) -> Status {
412 let Some(c) = tokenizer.get_char(input) else {
413 return Stuck;
414 };
415 self.name_buf_mut().push_char(c);
416 match c {
417 _ if c.is_ascii_alphanumeric() => return Progress,
418 ';' => self.emit_name_error(tokenizer),
419 _ => (),
420 }
421 self.unconsume_name(tokenizer, input);
422 self.finish_none()
423 }
424
425 pub fn end_of_file<Sink: TokenSink>(
426 &mut self,
427 tokenizer: &XmlTokenizer<Sink>,
428 input: &BufferQueue,
429 ) {
430 while self.result.is_none() {
431 match self.state {
432 Begin => drop(self.finish_none()),
433
434 Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
435
436 Numeric(_) | NumericSemicolon => {
437 tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
438 self.finish_numeric(tokenizer);
439 },
440
441 Named => drop(self.finish_named(tokenizer, None, input)),
442
443 BogusName => {
444 self.unconsume_name(tokenizer, input);
445 self.finish_none();
446 },
447
448 Octothorpe => {
449 tokenizer.unconsume(input, StrTendril::from_slice("#"));
450 tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
451 self.finish_none();
452 },
453 }
454 }
455 }
456}