1use std::default::Default;
23use std::{char, mem, str};
24
25use crate::futf::{self, Codepoint, Meaning};
26
/// Implementation details shared by the format marker types.
///
/// Nothing in here is specific to one format: `Fixup` is the value returned
/// by the `fixup` hook, and `SingleByteCharIndices` is the character iterator
/// used by the single-byte formats.
pub mod imp {
    use std::{iter, slice};

    /// Describes an adjustment to apply where two buffers are joined.
    ///
    /// The all-zero value (also the `Default`) means "no adjustment".
    ///
    /// NOTE(review): the field meanings below are inferred from the WTF-8
    /// `fixup` implementation, which replaces a trailing 3-byte lead surrogate
    /// and a leading 3-byte trail surrogate with one encoded character —
    /// confirm against the code that consumes this struct.
    #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
    pub struct Fixup {
        /// Number of bytes to drop from the end of the left-hand buffer.
        pub drop_left: u32,
        /// Number of bytes to drop from the start of the right-hand buffer.
        pub drop_right: u32,
        /// Number of meaningful bytes at the front of `insert_bytes`.
        pub insert_len: u32,
        /// Storage for the bytes to insert between the two buffers.
        pub insert_bytes: [u8; 4],
    }

    /// Iterator over `(byte index, char)` pairs of a buffer in which every
    /// byte stands for the code point with the same numeric value.
    #[derive(Debug, Clone)]
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // Every `u8` maps to a valid `char` (U+0000..=U+00FF), so the safe
            // conversion is exact and no unchecked construction is needed.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }

        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            // One item per remaining byte; forwarding lets `collect` preallocate.
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Creates an iterator over the bytes of `buf`, starting at index 0.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
82
83pub unsafe trait Format {
88 fn validate(buf: &[u8]) -> bool;
90
91 #[inline]
95 fn validate_prefix(buf: &[u8]) -> bool {
96 <Self as Format>::validate(buf)
97 }
98
99 #[inline]
103 fn validate_suffix(buf: &[u8]) -> bool {
104 <Self as Format>::validate(buf)
105 }
106
107 #[inline]
113 fn validate_subseq(buf: &[u8]) -> bool {
114 <Self as Format>::validate(buf)
115 }
116
117 #[inline(always)]
125 unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
126 Default::default()
127 }
128}
129
130pub unsafe trait SubsetOf<Super>: Format
135where
136 Super: Format,
137{
138 fn revalidate_subset(x: &[u8]) -> bool {
146 Self::validate(x)
147 }
148}
149
/// A `Format` whose data can be viewed as a borrowed slice type.
///
/// In this file `Bytes` maps to `[u8]` and `UTF8` maps to `str`.
pub unsafe trait SliceFormat: Format + Sized {
    /// The (possibly unsized) slice type that views data in this format.
    type Slice: ?Sized + Slice;
}
155
/// A `Format` whose data can be decoded into and encoded from `char`s.
pub unsafe trait CharFormat<'a>: Format {
    /// Iterator yielding `(byte index, char)` pairs.
    type Iter: Iterator<Item = (usize, char)>;

    /// Iterates over the characters of `buf` together with their byte indices.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably `buf` must already be valid for the format —
    /// the UTF-8 implementation in this file passes it to
    /// `str::from_utf8_unchecked` without checking.
    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;

    /// Encodes `ch` and hands the resulting bytes to `cont`.
    ///
    /// Returns `Err(())` when `ch` cannot be represented; the implementations
    /// in this file do not call `cont` in that case.
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]);
}
175
/// A slice type that is a view over raw bytes.
pub unsafe trait Slice {
    /// Returns the bytes underlying this slice.
    fn as_bytes(&self) -> &[u8];

    /// Reinterprets `x` as a slice of this type without validating it.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably `x` must hold data valid for this slice type —
    /// the `str` implementation in this file performs no UTF-8 check.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Mutable counterpart of `from_bytes`; also performs no validation.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably the same requirement as `from_bytes` applies —
    /// confirm against the callers.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
193
/// Marker type for uninterpreted bytes.
///
/// Its `Format` implementation accepts every buffer.
#[derive(Copy, Clone, Default, Debug)]
pub struct Bytes;
199
200unsafe impl Format for Bytes {
201 #[inline(always)]
202 fn validate(_: &[u8]) -> bool {
203 true
204 }
205}
206
/// Data in the `Bytes` format is viewed as a plain byte slice.
unsafe impl SliceFormat for Bytes {
    type Slice = [u8];
}
210
211unsafe impl Slice for [u8] {
212 #[inline(always)]
213 fn as_bytes(&self) -> &[u8] {
214 self
215 }
216
217 #[inline(always)]
218 unsafe fn from_bytes(x: &[u8]) -> &[u8] {
219 x
220 }
221
222 #[inline(always)]
223 unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] {
224 x
225 }
226}
227
/// Marker type for ASCII text: buffers in which every byte is at most 127.
#[derive(Copy, Clone, Default, Debug)]
pub struct ASCII;
231
232unsafe impl Format for ASCII {
233 #[inline]
234 fn validate(buf: &[u8]) -> bool {
235 buf.iter().all(|&n| n <= 127)
236 }
237
238 #[inline(always)]
239 fn validate_prefix(_: &[u8]) -> bool {
240 true
241 }
242
243 #[inline(always)]
244 fn validate_suffix(_: &[u8]) -> bool {
245 true
246 }
247
248 #[inline(always)]
249 fn validate_subseq(_: &[u8]) -> bool {
250 true
251 }
252}
253
// ASCII is declared a subset of both UTF-8 and Latin-1. The empty bodies keep
// the default `revalidate_subset`, which runs `ASCII::validate`.
unsafe impl SubsetOf<UTF8> for ASCII {}
unsafe impl SubsetOf<Latin1> for ASCII {}
256
257unsafe impl<'a> CharFormat<'a> for ASCII {
258 type Iter = imp::SingleByteCharIndices<'a>;
259
260 #[inline]
261 unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
262 imp::SingleByteCharIndices::new(buf)
263 }
264
265 #[inline]
266 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
267 where
268 F: FnOnce(&[u8]),
269 {
270 let n = ch as u32;
271 if n > 0x7F {
272 return Err(());
273 }
274 cont(&[n as u8]);
275 Ok(())
276 }
277}
278
/// Marker type for UTF-8 text, validated with `str::from_utf8`.
#[derive(Copy, Clone, Default, Debug)]
pub struct UTF8;
282
283unsafe impl Format for UTF8 {
284 #[inline]
285 fn validate(buf: &[u8]) -> bool {
286 str::from_utf8(buf).is_ok()
287 }
288
289 #[inline]
290 fn validate_prefix(buf: &[u8]) -> bool {
291 if buf.is_empty() {
292 return true;
293 }
294 matches!(
295 futf::classify(buf, buf.len() - 1),
296 Some(Codepoint {
297 meaning: Meaning::Whole(_),
298 ..
299 })
300 )
301 }
302
303 #[inline]
304 fn validate_suffix(buf: &[u8]) -> bool {
305 if buf.is_empty() {
306 return true;
307 }
308 matches!(
309 futf::classify(buf, 0),
310 Some(Codepoint {
311 meaning: Meaning::Whole(_),
312 ..
313 })
314 )
315 }
316
317 #[inline]
318 fn validate_subseq(buf: &[u8]) -> bool {
319 <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
320 }
321}
322
// UTF-8 is declared a subset of WTF-8. The empty body keeps the default
// `revalidate_subset`, which runs `UTF8::validate`.
unsafe impl SubsetOf<WTF8> for UTF8 {}
324
/// Data in the `UTF8` format is viewed as a string slice.
unsafe impl SliceFormat for UTF8 {
    type Slice = str;
}
328
329unsafe impl Slice for str {
330 #[inline(always)]
331 fn as_bytes(&self) -> &[u8] {
332 str::as_bytes(self)
333 }
334
335 #[inline(always)]
336 unsafe fn from_bytes(x: &[u8]) -> &str {
337 str::from_utf8_unchecked(x)
338 }
339
340 #[inline(always)]
341 unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
342 mem::transmute(x)
343 }
344}
345
346unsafe impl<'a> CharFormat<'a> for UTF8 {
347 type Iter = str::CharIndices<'a>;
348
349 #[inline]
350 unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
351 str::from_utf8_unchecked(buf).char_indices()
352 }
353
354 #[inline]
355 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
356 where
357 F: FnOnce(&[u8]),
358 {
359 cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes());
360 Ok(())
361 }
362}
363
/// Marker type for WTF-8 text.
///
/// Unlike `UTF8`, its validation also accepts lead and trail surrogates, but
/// rejects a lead surrogate immediately followed by a trail surrogate.
#[derive(Copy, Clone, Default, Debug)]
pub struct WTF8;
369
370#[inline]
371fn wtf8_meaningful(m: Meaning) -> bool {
372 matches!(
373 m,
374 Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_)
375 )
376}
377
378unsafe impl Format for WTF8 {
379 #[inline]
380 fn validate(buf: &[u8]) -> bool {
381 let mut i = 0;
382 let mut prev_lead = false;
383 while i < buf.len() {
384 let Some(codept) = futf::classify(buf, i) else {
385 return false;
386 };
387 if !wtf8_meaningful(codept.meaning) {
388 return false;
389 }
390 i += codept.bytes.len();
391 prev_lead = match codept.meaning {
392 Meaning::TrailSurrogate(_) if prev_lead => return false,
393 Meaning::LeadSurrogate(_) => true,
394 _ => false,
395 };
396 }
397
398 true
399 }
400
401 #[inline]
402 fn validate_prefix(buf: &[u8]) -> bool {
403 if buf.is_empty() {
404 return true;
405 }
406 match futf::classify(buf, buf.len() - 1) {
407 Some(c) => wtf8_meaningful(c.meaning),
408 _ => false,
409 }
410 }
411
412 #[inline]
413 fn validate_suffix(buf: &[u8]) -> bool {
414 if buf.is_empty() {
415 return true;
416 }
417 match futf::classify(buf, 0) {
418 Some(c) => wtf8_meaningful(c.meaning),
419 _ => false,
420 }
421 }
422
423 #[inline]
424 fn validate_subseq(buf: &[u8]) -> bool {
425 <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
426 }
427
428 #[inline]
429 unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
430 const ERR: &str = "WTF8: internal error";
431
432 if lhs.len() >= 3 && rhs.len() >= 3 {
433 if let (
434 Some(Codepoint {
435 meaning: Meaning::LeadSurrogate(hi),
436 ..
437 }),
438 Some(Codepoint {
439 meaning: Meaning::TrailSurrogate(lo),
440 ..
441 }),
442 ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0))
443 {
444 let mut fixup = imp::Fixup {
445 drop_left: 3,
446 drop_right: 3,
447 insert_len: 0,
448 insert_bytes: [0_u8; 4],
449 };
450
451 let n = 0x10000 + ((hi as u32) << 10) + (lo as u32);
452
453 let ch = char::from_u32(n).expect(ERR);
454 fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32;
455
456 return fixup;
457 }
458 }
459
460 Default::default()
461 }
462}
463
/// Marker type for a single-byte encoding in which each byte stands for the
/// code point with the same numeric value (U+0000 through U+00FF).
///
/// Every buffer is valid in this format.
#[derive(Copy, Clone, Default, Debug)]
pub struct Latin1;
473
474unsafe impl Format for Latin1 {
475 #[inline(always)]
476 fn validate(_: &[u8]) -> bool {
477 true
478 }
479
480 #[inline(always)]
481 fn validate_prefix(_: &[u8]) -> bool {
482 true
483 }
484
485 #[inline(always)]
486 fn validate_suffix(_: &[u8]) -> bool {
487 true
488 }
489
490 #[inline(always)]
491 fn validate_subseq(_: &[u8]) -> bool {
492 true
493 }
494}
495
496unsafe impl<'a> CharFormat<'a> for Latin1 {
497 type Iter = imp::SingleByteCharIndices<'a>;
498
499 #[inline]
500 unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
501 imp::SingleByteCharIndices::new(buf)
502 }
503
504 #[inline]
505 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
506 where
507 F: FnOnce(&[u8]),
508 {
509 let n = ch as u32;
510 if n > 0xFF {
511 return Err(());
512 }
513 cont(&[n as u8]);
514 Ok(())
515 }
516}