use super::{TokenSink, XmlTokenizer};
use crate::data;
use crate::tendril::StrTendril;
use log::debug;
use mac::{format_if, unwrap_or_return};
use markup5ever::buffer_queue::BufferQueue;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;
use self::State::*;
pub use self::Status::*;
/// The outcome of tokenizing one character reference.
///
/// At most two characters are produced. Only the first `num_chars` entries of
/// `chars` are meaningful; unused slots hold `'\0'`. A `num_chars` of zero
/// means the input did not form a character reference.
///
/// The common value-type traits are derived so results can be logged with
/// `{:?}`, copied freely and compared in assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharRef {
    /// The resolved characters, padded with `'\0'`.
    pub chars: [char; 2],
    /// Number of valid entries at the front of `chars` (0, 1 or 2).
    pub num_chars: u8,
}
/// Status reported by the character reference tokenizer after a step.
///
/// The common value-type traits are derived so a status can be logged with
/// `{:?}`, copied and compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Status {
    /// No input character was available, so no progress could be made.
    Stuck,
    /// The state machine advanced but has not produced a result yet.
    Progress,
    /// A result has been stored and can be retrieved.
    Done,
}
/// Internal states of the character reference state machine.
#[derive(Debug)]
enum State {
    /// Initial state: decides whether there is a reference at all and, if so,
    /// whether it is numeric or named.
    Begin,
    /// A `#` has been consumed; the next character selects hex or decimal.
    Octothorpe,
    /// Reading digits of a numeric reference in the given base (10 or 16).
    Numeric(u32),
    /// The digits have ended; a terminating `;` is expected next.
    NumericSemicolon,
    /// Accumulating a name and matching it against the named entity table.
    Named,
    /// The name cannot match any entity; alphanumerics are still consumed
    /// until a terminator shows whether an error must be reported.
    BogusName,
}
/// Incremental tokenizer for a single character reference.
///
/// It is driven by repeated calls to `step` (and `end_of_file` once input is
/// exhausted) until a result is available through `get_result`.
pub struct CharRefTokenizer {
    /// Current state of the state machine.
    state: State,
    /// Optional extra character that, when seen first, means "not a
    /// reference". Whether it is set also changes how an unterminated named
    /// match is treated (see `finish_named`).
    addnl_allowed: Option<char>,
    /// The final result; `None` until tokenization has finished.
    result: Option<CharRef>,

    /// Value accumulated so far for a numeric reference.
    num: u32,
    /// Set once the accumulated value has exceeded `0x10FFFF`.
    num_too_big: bool,
    /// Whether at least one digit of a numeric reference has been consumed.
    seen_digit: bool,
    /// The `x` / `X` consumed after `#`, kept so it can be put back.
    hex_marker: Option<char>,

    /// Characters consumed so far for a named reference; `None` outside of
    /// named processing and after the name has been handed back to the input.
    name_buf_opt: Option<StrTendril>,
    /// Code points of the most recent complete entity match, if any.
    name_match: Option<(u32, u32)>,
    /// Byte length of the name buffer when `name_match` was recorded.
    name_len: usize,
}
impl CharRefTokenizer {
    /// Creates a tokenizer in its initial state.
    ///
    /// `addnl_allowed` is an optional extra character which, if it is the
    /// first character seen, makes the tokenizer finish without a result.
    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
        Self {
            addnl_allowed,
            state: Begin,
            result: None,
            // Numeric reference bookkeeping.
            num: 0,
            num_too_big: false,
            seen_digit: false,
            hex_marker: None,
            // Named reference bookkeeping.
            name_buf_opt: None,
            name_match: None,
            name_len: 0,
        }
    }

    /// Consumes the tokenizer and returns its result.
    ///
    /// # Panics
    ///
    /// Panics if no result has been stored yet.
    pub fn get_result(self) -> CharRef {
        let CharRefTokenizer { result, .. } = self;
        result.expect("get_result called before done")
    }

    /// Shared access to the name buffer.
    ///
    /// # Panics
    ///
    /// Panics if the buffer is absent.
    fn name_buf(&self) -> &StrTendril {
        let buf = self.name_buf_opt.as_ref();
        buf.expect("name_buf missing in named character reference")
    }

    /// Mutable access to the name buffer.
    ///
    /// # Panics
    ///
    /// Panics if the buffer is absent.
    fn name_buf_mut(&mut self) -> &mut StrTendril {
        let buf = self.name_buf_opt.as_mut();
        buf.expect("name_buf missing in named character reference")
    }

    /// Stores an empty result (zero characters) and reports `Done`.
    fn finish_none(&mut self) -> Status {
        let empty = CharRef {
            num_chars: 0,
            chars: ['\0'; 2],
        };
        self.result = Some(empty);
        Done
    }

    /// Stores a single-character result and reports `Done`.
    fn finish_one(&mut self, c: char) -> Status {
        let single = CharRef {
            num_chars: 1,
            chars: [c, '\0'],
        };
        self.result = Some(single);
        Done
    }
}
impl CharRefTokenizer {
pub fn step<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
input: &BufferQueue,
) -> Status {
if self.result.is_some() {
return Done;
}
debug!("char ref tokenizer stepping in state {:?}", self.state);
match self.state {
Begin => self.do_begin(tokenizer, input),
Octothorpe => self.do_octothorpe(tokenizer, input),
Numeric(base) => self.do_numeric(tokenizer, base, input),
NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
Named => self.do_named(tokenizer, input),
BogusName => self.do_bogus_name(tokenizer, input),
}
}
fn do_begin<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
input: &BufferQueue,
) -> Status {
match unwrap_or_return!(tokenizer.peek(input), Stuck) {
'\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
c if Some(c) == self.addnl_allowed => self.finish_none(),
'#' => {
tokenizer.discard_char(input);
self.state = Octothorpe;
Progress
},
_ => {
self.state = Named;
self.name_buf_opt = Some(StrTendril::new());
Progress
},
}
}
fn do_octothorpe<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
input: &BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
match c {
'x' | 'X' => {
tokenizer.discard_char(input);
self.hex_marker = Some(c);
self.state = Numeric(16);
},
_ => {
self.hex_marker = None;
self.state = Numeric(10);
},
}
Progress
}
fn do_numeric<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
base: u32,
input: &BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
match c.to_digit(base) {
Some(n) => {
tokenizer.discard_char(input);
self.num = self.num.wrapping_mul(base);
if self.num > 0x10FFFF {
self.num_too_big = true;
}
self.num = self.num.wrapping_add(n);
self.seen_digit = true;
Progress
},
None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
None => {
self.state = NumericSemicolon;
Progress
},
}
}
fn do_numeric_semicolon<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
input: &BufferQueue,
) -> Status {
match unwrap_or_return!(tokenizer.peek(input), Stuck) {
';' => tokenizer.discard_char(input),
_ => tokenizer.emit_error(Borrowed(
"Semicolon missing after numeric character reference",
)),
};
self.finish_numeric(tokenizer)
}
fn unconsume_numeric<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
input: &BufferQueue,
) -> Status {
let mut unconsume = StrTendril::from_char('#');
if let Some(c) = self.hex_marker {
unconsume.push_char(c);
}
tokenizer.unconsume(input, unconsume);
tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
self.finish_none()
}
fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) -> Status {
fn conv(n: u32) -> char {
from_u32(n).expect("invalid char missed by error handling cases")
}
let (c, error) = match self.num {
n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
Some(c) => (c, true),
None => (conv(self.num), true),
},
0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
n => (conv(n), false),
};
if error {
let msg = format_if!(
tokenizer.opts.exact_errors,
"Invalid numeric character reference",
"Invalid numeric character reference value 0x{:06X}",
self.num
);
tokenizer.emit_error(msg);
}
self.finish_one(c)
}
fn do_named<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
input: &BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
self.name_buf_mut().push_char(c);
match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
Some(&m) => {
if m.0 != 0 {
self.name_match = Some(m);
self.name_len = self.name_buf().len();
}
Progress
},
None => self.finish_named(tokenizer, Some(c), input),
}
}
fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &XmlTokenizer<Sink>) {
let msg = format_if!(
tokenizer.opts.exact_errors,
"Invalid character reference",
"Invalid character reference &{}",
self.name_buf()
);
tokenizer.emit_error(msg);
}
fn unconsume_name<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
input: &BufferQueue,
) {
tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
}
fn finish_named<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
end_char: Option<char>,
input: &BufferQueue,
) -> Status {
match self.name_match {
None => {
match end_char {
Some(c) if c.is_ascii_alphanumeric() => {
self.state = BogusName;
return Progress;
},
Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
_ => (),
}
self.unconsume_name(tokenizer, input);
self.finish_none()
},
Some((c1, c2)) => {
let name_len = self.name_len;
assert!(name_len > 0);
let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
let next_after = if name_len == self.name_buf().len() {
None
} else {
Some(self.name_buf()[name_len..].chars().next().unwrap())
};
let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
(_, ';', _) => false,
(Some(_), _, Some('=')) => {
tokenizer.emit_error(Borrowed(
"Equals sign after character reference in attribute",
));
true
},
(Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
_ => {
tokenizer.emit_error(Borrowed(
"Character reference does not end with semicolon",
));
false
},
};
if unconsume_all {
self.unconsume_name(tokenizer, input);
self.finish_none()
} else {
tokenizer
.unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
self.result = Some(CharRef {
chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
num_chars: if c2 == 0 { 1 } else { 2 },
});
Done
}
},
}
}
fn do_bogus_name<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
input: &BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
self.name_buf_mut().push_char(c);
match c {
_ if c.is_ascii_alphanumeric() => return Progress,
';' => self.emit_name_error(tokenizer),
_ => (),
}
self.unconsume_name(tokenizer, input);
self.finish_none()
}
pub fn end_of_file<Sink: TokenSink>(
&mut self,
tokenizer: &XmlTokenizer<Sink>,
input: &BufferQueue,
) {
while self.result.is_none() {
match self.state {
Begin => drop(self.finish_none()),
Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
Numeric(_) | NumericSemicolon => {
tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
self.finish_numeric(tokenizer);
},
Named => drop(self.finish_named(tokenizer, None, input)),
BogusName => {
self.unconsume_name(tokenizer, input);
self.finish_none();
},
Octothorpe => {
tokenizer.unconsume(input, StrTendril::from_slice("#"));
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
self.finish_none();
},
}
}
}
}