#![no_std]
mod indices;
mod report;
pub use crate::indices::Utf8CharIndices;
pub use crate::report::ErrorReportingUtf8Chars;
pub use crate::report::Utf8CharsError;
use core::iter::FusedIterator;
/// Storage for the byte-classification table used while validating
/// multi-byte UTF-8 sequences.
///
/// The type is over-aligned to 64 bytes (presumably so the table starts on a
/// cache-line boundary on common hardware; confirm against benchmarks).
#[repr(align(64))]
struct Utf8Data {
    /// 384 classification bytes, indexed directly by byte value (plus an
    /// offset for lead bytes) by the decoder.
    pub table: [u8; 384],
}
// Validation table consulted by the fast path of `Utf8Chars::next` for
// three-byte and four-byte sequences.
//
// Layout (384 entries):
//
// * `table[b]` for `b` in 0x00..=0xFF classifies `b` as a *second* byte:
//     0x00..=0x7F -> 252    0x80..=0x8F -> 84     0x90..=0x9F -> 148
//     0xA0..=0xBF -> 164    0xC0..=0xFF -> 252
// * `table[0x80 + b]` for `b` in 0x80..=0xFF classifies `b` as a *lead* byte:
//     0x80..=0xC1 -> 4      0xC2..=0xDF -> 8      0xE0        -> 16
//     0xE1..=0xEC -> 8      0xED        -> 32     0xEE..=0xEF -> 8
//     0xF0        -> 64     0xF1..=0xF3 -> 8      0xF4        -> 128
//     0xF5..=0xFF -> 4
//
// `next` ANDs the lead entry with the second-byte entry. Every entry is a
// multiple of 4, so the pair is accepted only when the two entries share no
// bit. Read as bit flags:
//
// * 4   is set in every second-byte entry, so a lead whose entry is 4 can
//       never be accepted (continuation bytes, 0xC0, 0xC1, 0xF5..=0xFF).
// * 8   is set only for second bytes outside 0x80..=0xBF.
// * 16  is set for second bytes 0x80..=0x9F (rejected after lead 0xE0).
// * 32  is set for second bytes 0xA0..=0xBF (rejected after lead 0xED).
// * 64  is set for second bytes 0x80..=0x8F (rejected after lead 0xF0).
// * 128 is set for second bytes 0x90..=0xBF (rejected after lead 0xF4).
//
// The values must not be reordered or reflowed semantically: the decoder
// indexes this array directly with raw byte values.
static UTF8_DATA: Utf8Data = Utf8Data {
table: [
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
],
};
/// Reports whether `i` lies within the inclusive byte range `start..=end`.
///
/// The test is done with a single comparison: `i` is rebased onto `start`
/// with wrapping arithmetic, so any value below `start` wraps around to a
/// large offset and fails the comparison against the range width.
///
/// `end` must not be smaller than `start`; the width computation
/// `end - start` would overflow otherwise.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let width = end - start;
    let offset = i.wrapping_sub(start);
    offset <= width
}
/// Iterator that decodes a byte slice as UTF-8 and yields `char`s.
///
/// Ill-formed input does not stop iteration: the decoder yields U+FFFD
/// REPLACEMENT CHARACTER for it and carries on with the following bytes.
#[derive(Clone, Debug)]
pub struct Utf8Chars<'a> {
    /// Bytes that have not been consumed from either end yet.
    remaining: &'a [u8],
}
impl<'a> Utf8Chars<'a> {
    /// Creates an iterator over the characters decoded from `bytes`.
    #[inline(always)]
    pub fn new(bytes: &'a [u8]) -> Self {
        Self { remaining: bytes }
    }

    /// Returns the bytes that have not been consumed yet.
    #[inline(always)]
    pub fn as_slice(&self) -> &'a [u8] {
        self.remaining
    }

    /// Slow-path decoder used by `next` whenever its inline fast path cannot
    /// produce a character (fewer than four bytes left, or ill-formed input).
    ///
    /// Returns `None` only when no bytes remain. Otherwise it consumes one to
    /// three bytes and yields either the decoded character or U+FFFD. On an
    /// error, the bytes consumed are the longest prefix that could still have
    /// started a well-formed sequence (at least one byte).
    #[inline(never)]
    fn next_fallback(&mut self) -> Option<char> {
        const REPLACEMENT: char = '\u{FFFD}';

        let input = self.remaining;
        let lead = *input.first()?;
        if lead < 0x80 {
            // ASCII maps straight to the same scalar value.
            self.remaining = &input[1..];
            return Some(char::from(lead));
        }

        // A lead byte outside 0xC2..=0xF4 can never start a sequence, and a
        // valid lead with nothing after it is truncated; both cost one byte.
        let second = match input.get(1) {
            Some(&byte) if in_inclusive_range8(lead, 0xC2, 0xF4) => byte,
            _ => {
                self.remaining = &input[1..];
                return Some(REPLACEMENT);
            }
        };

        // The permitted range of the second byte depends on the lead byte:
        // the narrowed ranges exclude overlong forms (0xE0, 0xF0), surrogates
        // (0xED) and values above U+10FFFF (0xF4).
        let second_is_valid = match lead {
            0xE0 => in_inclusive_range8(second, 0xA0, 0xBF),
            0xED => in_inclusive_range8(second, 0x80, 0x9F),
            0xF0 => in_inclusive_range8(second, 0x90, 0xBF),
            0xF4 => in_inclusive_range8(second, 0x80, 0x8F),
            _ => in_inclusive_range8(second, 0x80, 0xBF),
        };
        if !second_is_valid {
            self.remaining = &input[1..];
            return Some(REPLACEMENT);
        }

        if lead < 0xE0 {
            self.remaining = &input[2..];
            let scalar = (u32::from(lead & 0x1F) << 6) | u32::from(second & 0x3F);
            // SAFETY: `lead` is in 0xC2..=0xDF and `second` in 0x80..=0xBF,
            // so `scalar` is in 0x80..=0x7FF, which holds no surrogates.
            return Some(unsafe { char::from_u32_unchecked(scalar) });
        }

        // Missing or non-continuation third byte: the two-byte prefix is
        // reported as a single error.
        let third = match input.get(2) {
            Some(&byte) if in_inclusive_range8(byte, 0x80, 0xBF) => byte,
            _ => {
                self.remaining = &input[2..];
                return Some(REPLACEMENT);
            }
        };

        // Both remaining outcomes consume exactly three bytes.
        self.remaining = &input[3..];
        if lead < 0xF0 {
            let scalar = (u32::from(lead & 0xF) << 12)
                | (u32::from(second & 0x3F) << 6)
                | u32::from(third & 0x3F);
            // SAFETY: `lead` is in 0xE0..=0xEF, and the lead-specific range
            // check on `second` rules out overlong forms and surrogates, so
            // `scalar` is in 0x800..=0xFFFF outside 0xD800..=0xDFFF.
            return Some(unsafe { char::from_u32_unchecked(scalar) });
        }

        // A valid three-byte prefix of a four-byte sequence. A complete valid
        // four-byte sequence would have been decoded by `next`, so the fourth
        // byte is missing or invalid.
        Some(REPLACEMENT)
    }
}
impl<'a> Iterator for Utf8Chars<'a> {
    type Item = char;

    /// Decodes and returns the next character from the front of the input,
    /// or `None` once every byte has been consumed.
    ///
    /// Well-formed sequences are decoded inline when at least four bytes
    /// remain. Short tails and anything that fails validation are delegated
    /// to `next_fallback`, which also produces U+FFFD for ill-formed input.
    #[inline]
    fn next(&mut self) -> Option<char> {
        // The loop never iterates: it exists so that `break` can act as a
        // forward jump to the fallback call at the bottom.
        #[allow(clippy::never_loop)]
        loop {
            // With fewer than four bytes the unconditional reads below would
            // be out of bounds, so the fallback handles short tails.
            if self.remaining.len() < 4 {
                break;
            }
            let first = self.remaining[0];
            if first < 0x80 {
                self.remaining = &self.remaining[1..];
                return Some(char::from(first));
            }
            let second = self.remaining[1];
            if in_inclusive_range8(first, 0xC2, 0xDF) {
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break;
                }
                let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
                self.remaining = &self.remaining[2..];
                // SAFETY: lead in 0xC2..=0xDF and a continuation byte give a
                // value in 0x80..=0x7FF, which contains no surrogates.
                return Some(unsafe { char::from_u32_unchecked(point) });
            }
            let third = self.remaining[2];
            if first < 0xF0 {
                // All table entries are multiples of 4, so the expression
                // equals 2 only when the lead and second-byte entries share
                // no bit (the pair is allowed) and `third` is 0b10xx_xxxx.
                if ((UTF8_DATA.table[usize::from(second)]
                    & UTF8_DATA.table[usize::from(first) + 0x80])
                    | (third >> 6))
                    != 2
                {
                    break;
                }
                let point = ((u32::from(first) & 0xF) << 12)
                    | ((u32::from(second) & 0x3F) << 6)
                    | (u32::from(third) & 0x3F);
                self.remaining = &self.remaining[3..];
                // SAFETY: the table check only passes for leads
                // 0xE0..=0xEF with a second byte that excludes overlong
                // forms and surrogates, so `point` is a scalar value in
                // 0x800..=0xFFFF outside 0xD800..=0xDFFF.
                return Some(unsafe { char::from_u32_unchecked(point) });
            }
            let fourth = self.remaining[3];
            // Same table check widened to 16 bits: the low byte must be 2 as
            // above, and `fourth`'s top two bits (shifted into bits 8..=9)
            // must be 0b10, which together give 0x202.
            if (u16::from(
                UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],
            ) | u16::from(third >> 6)
                | (u16::from(fourth & 0xC0) << 2))
                != 0x202
            {
                break;
            }
            let point = ((u32::from(first) & 0x7) << 18)
                | ((u32::from(second) & 0x3F) << 12)
                | ((u32::from(third) & 0x3F) << 6)
                | (u32::from(fourth) & 0x3F);
            self.remaining = &self.remaining[4..];
            // SAFETY: the table check only passes for leads 0xF0..=0xF4 with
            // a second byte restricted so that `point` is in
            // 0x10000..=0x10FFFF.
            return Some(unsafe { char::from_u32_unchecked(point) });
        }
        self.next_fallback()
    }

    /// Bounds the number of characters still to be yielded.
    ///
    /// Every item, whether a decoded character or a U+FFFD replacement,
    /// consumes at least one and at most four bytes, so the count lies
    /// between `ceil(len / 4)` and `len`.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let len = self.remaining.len();
        let lower = len / 4 + usize::from(len % 4 != 0);
        (lower, Some(len))
    }
}
impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
    /// Decodes and returns the last character of the remaining input, or
    /// `None` when no bytes are left.
    ///
    /// The nearest non-continuation byte within the last four bytes is taken
    /// as a candidate lead. The tail starting there is decoded forwards and
    /// accepted only if that single decode uses the tail up completely. In
    /// every other case exactly one trailing byte is dropped and U+FFFD is
    /// returned.
    #[inline]
    fn next_back(&mut self) -> Option<char> {
        let bytes = self.remaining;
        let (_, without_last) = bytes.split_last()?;

        // Distance (0-based, counted from the end) of the closest byte that
        // is not of the form 0b10xx_xxxx, looking at no more than four bytes.
        let lead_offset = bytes
            .iter()
            .rev()
            .take(4)
            .position(|&byte| byte & 0xC0 != 0x80);

        if let Some(offset) = lead_offset {
            let (head, tail) = bytes.split_at(bytes.len() - (offset + 1));
            let mut forward = Utf8Chars::new(tail);
            let decoded = forward.next();
            if forward.as_slice().is_empty() {
                self.remaining = head;
                return decoded;
            }
        }

        self.remaining = without_last;
        Some('\u{FFFD}')
    }
}
// Once `remaining` is empty, both `next` and `next_back` return `None`
// without changing any state, so the iterator keeps returning `None`
// afterwards, which is the guarantee `FusedIterator` advertises.
impl FusedIterator for Utf8Chars<'_> {}
/// Extension trait that adds UTF-8 decoding iterators to a byte container.
pub trait Utf8CharsEx {
/// Returns a [`Utf8Chars`] iterator over the characters decoded from `self`.
fn chars(&self) -> Utf8Chars<'_>;
/// Returns a [`Utf8CharIndices`] iterator over `self`.
///
/// NOTE(review): presumably this pairs each decoded character with its
/// byte offset; confirm against the `indices` module.
fn char_indices(&self) -> Utf8CharIndices<'_>;
}
// Makes the extension methods available directly on byte slices.
impl Utf8CharsEx for [u8] {
/// Wraps this slice in a `Utf8Chars` iterator via `Utf8Chars::new`.
#[inline]
fn chars(&self) -> Utf8Chars<'_> {
Utf8Chars::new(self)
}
/// Wraps this slice in a `Utf8CharIndices` iterator via
/// `Utf8CharIndices::new`.
#[inline]
fn char_indices(&self) -> Utf8CharIndices<'_> {
Utf8CharIndices::new(self)
}
}