1#![no_std]
16
17#[cfg(feature = "multithreading")]
18use rayon::prelude::*;
19
20#[cfg(feature = "multithreading")]
21use arrayvec::ArrayVec;
22
23#[cfg(all(target_arch = "x86", target_feature = "sse2"))]
24use core::arch::x86::__m128i;
25#[cfg(all(target_arch = "x86", target_feature = "sse2"))]
26use core::arch::x86::_mm_movemask_epi8;
27
28#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
29use core::arch::x86_64::__m128i;
30#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
31use core::arch::x86_64::_mm_movemask_epi8;
32
33use encoding_rs::Decoder;
34use encoding_rs::DecoderResult;
35use encoding_rs::Encoding;
36use encoding_rs::BIG5;
37use encoding_rs::EUC_JP;
38use encoding_rs::EUC_KR;
39use encoding_rs::GBK;
40use encoding_rs::ISO_2022_JP;
41use encoding_rs::ISO_8859_8;
42use encoding_rs::SHIFT_JIS;
43use encoding_rs::UTF_8;
44use encoding_rs::WINDOWS_1255;
45
46mod data;
47mod tld;
48use data::*;
49use tld::classify_tld;
50use tld::Tld;
51
// ---------------------------------------------------------------------------
// Single-byte candidate scoring
// ---------------------------------------------------------------------------

/// Added to a single-byte candidate's score when a byte of class
/// `LATIN_LETTER` is directly adjacent to a non-Latin alphabetic byte.
const LATIN_ADJACENCY_PENALTY: i64 = -50;

/// General penalty for an implausible construct. The windows-1252 ordinal
/// logic subtracts it again (i.e. undoes it) when an ordinal indicator turns
/// out to be plausible, and the Shift_JIS candidate applies it to misplaced
/// half-width voicing marks and to U+0080.
const IMPLAUSIBILITY_PENALTY: i64 = -220;

/// Added for windows-1252 when an ordinal indicator byte (0xAA / 0xBA) is
/// found in a plausible position.
const ORDINAL_BONUS: i64 = 300;

/// Added for windows-1252 when byte 0xA9 stands alone between class-0 bytes.
const COPYRIGHT_BONUS: i64 = 222;

/// Applied when a Latin lower-case letter follows an all-caps run, or an
/// upper-case letter follows a lower-case one, unless both bytes are ASCII.
const IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY: i64 = -180;

/// Added at the end of a non-Latin word that consisted of one upper-case
/// letter followed by lower-case letters.
const NON_LATIN_CAPITALIZATION_BONUS: i64 = 40;

/// Added at the end of an all-caps non-Latin word (currently only applied
/// when the candidate is KOI8-U).
const NON_LATIN_ALL_CAPS_PENALTY: i64 = -40;

/// Per-letter penalty applied at the end of a non-Latin word with mixed case.
const NON_LATIN_MIXED_CASE_PENALTY: i64 = -20;

// ---------------------------------------------------------------------------
// CJK base scores
// ---------------------------------------------------------------------------

/// Base score from which most of the CJK scores below are derived.
const CJK_BASE_SCORE: i64 = 41;

/// Secondary (lower) base score for less common CJK characters.
const CJK_SECONDARY_BASE_SCORE: i64 = 20;

// ---------------------------------------------------------------------------
// Shift_JIS
// ---------------------------------------------------------------------------

/// Score per full-width kana decoded by the Shift_JIS candidate.
const SHIFT_JIS_SCORE_PER_KANA: i64 = 20;

/// Score per level 1 kanji (same as the CJK base score).
const SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

/// Score per level 2 kanji (same as the secondary CJK base score).
const SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

/// One-time penalty applied the first time half-width katakana is seen.
const SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY: i64 = -75;

/// Score added for every half-width katakana character.
const HALF_WIDTH_KATAKANA_SCORE: i64 = 1;

/// Score added when a half-width voicing mark follows a character that
/// permits it.
const HALF_WIDTH_KATAKANA_VOICING_SCORE: i64 = 10;

/// Penalty per Private Use Area character: ten times the CJK base score.
const SHIFT_JIS_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10);

/// Twice the Shift_JIS PUA penalty.
const SHIFT_JIS_EXTENSION_PENALTY: i64 = SHIFT_JIS_PUA_PENALTY * 2;

/// Same magnitude as `SHIFT_JIS_EXTENSION_PENALTY`.
const SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY: i64 = SHIFT_JIS_EXTENSION_PENALTY;

// ---------------------------------------------------------------------------
// EUC-JP
// ---------------------------------------------------------------------------

/// CJK base score plus one third of it.
const EUC_JP_SCORE_PER_KANA: i64 = CJK_BASE_SCORE + (CJK_BASE_SCORE / 3);

/// One less than the CJK base score.
const EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA: i64 = CJK_BASE_SCORE - 1;

/// Same as the CJK base score.
const EUC_JP_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

/// Same as the secondary CJK base score.
const EUC_JP_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

/// A quarter of the secondary CJK base score.
const EUC_JP_SCORE_PER_OTHER_KANJI: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Negated (one third of the CJK base score, plus one).
const EUC_JP_INITIAL_KANA_PENALTY: i64 = -((CJK_BASE_SCORE / 3) + 1);

/// Fifty times the CJK base score, negated.
const EUC_JP_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 50);

// ---------------------------------------------------------------------------
// Big5
// ---------------------------------------------------------------------------

/// Same as the CJK base score.
const BIG5_SCORE_PER_LEVEL_1_HANZI: i64 = CJK_BASE_SCORE;

/// Same as the secondary CJK base score.
const BIG5_SCORE_PER_OTHER_HANZI: i64 = CJK_SECONDARY_BASE_SCORE;

/// Thirty times the CJK base score, negated.
const BIG5_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 30);

/// Forty times the CJK base score, negated.
const BIG5_SINGLE_BYTE_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 40);

// ---------------------------------------------------------------------------
// EUC-KR
// ---------------------------------------------------------------------------

/// One more than the CJK base score.
const EUC_KR_SCORE_PER_EUC_HANGUL: i64 = CJK_BASE_SCORE + 1;

/// A fifth of the secondary CJK base score.
const EUC_KR_SCORE_PER_NON_EUC_HANGUL: i64 = CJK_SECONDARY_BASE_SCORE / 5;

/// Half of the secondary CJK base score.
const EUC_KR_SCORE_PER_HANJA: i64 = CJK_SECONDARY_BASE_SCORE / 2;

/// Ten times the CJK base score, negated.
const EUC_KR_HANJA_AFTER_HANGUL_PENALTY: i64 = -(CJK_BASE_SCORE * 10);

/// Small fixed penalty for long words.
const EUC_KR_LONG_WORD_PENALTY: i64 = -6;

/// One point harsher than the GBK PUA penalty.
const EUC_KR_PUA_PENALTY: i64 = GBK_PUA_PENALTY - 1;

/// Twice the EUC-KR PUA penalty.
const EUC_KR_MAC_KOREAN_PENALTY: i64 = EUC_KR_PUA_PENALTY * 2;

/// Same magnitude as `EUC_KR_MAC_KOREAN_PENALTY`.
const EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY: i64 = EUC_KR_MAC_KOREAN_PENALTY;

// ---------------------------------------------------------------------------
// GBK
// ---------------------------------------------------------------------------

/// Score per hanzi whose lead byte is in 0xA1..=0xD7.
const GBK_SCORE_PER_LEVEL_1: i64 = CJK_BASE_SCORE;

/// Score per hanzi whose lead byte is in 0xD8..=0xFE.
const GBK_SCORE_PER_LEVEL_2: i64 = CJK_SECONDARY_BASE_SCORE;

/// Score per hanzi encoded outside the EUC byte range.
const GBK_SCORE_PER_NON_EUC: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Penalty per Private Use Area character: ten times the CJK base score.
const GBK_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10);

/// Penalty for tolerated single-byte extension bytes: four PUA penalties.
const GBK_SINGLE_BYTE_EXTENSION_PENALTY: i64 = GBK_PUA_PENALTY * 4;

// ---------------------------------------------------------------------------
// Shared by the CJK candidates
// ---------------------------------------------------------------------------

/// Applied when an ASCII letter and a CJ(K) character are adjacent.
const CJK_LATIN_ADJACENCY_PENALTY: i64 = -CJK_BASE_SCORE;

/// Score for a selected set of CJK punctuation code points.
const CJ_PUNCTUATION: i64 = CJK_BASE_SCORE / 2;

/// Fallback score for other non-ASCII characters decoded by a CJK candidate.
const CJK_OTHER: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Caseless class value that the single-byte candidates treat as
/// "Latin letter".
const LATIN_LETTER: u8 = 1;
156
/// Reports whether `label` contains at least one byte that is an ASCII
/// period, an ASCII upper-case letter, or non-ASCII (`>= 0x80`).
///
/// An empty label yields `false`.
fn contains_upper_case_period_or_non_ascii(label: &[u8]) -> bool {
    label.iter().any(|&byte| match byte {
        b'.' | b'A'..=b'Z' | 0x80..=0xFF => true,
        _ => false,
    })
}
171
/// Case pattern of the Latin-letter run that ends at the previous byte.
#[derive(PartialEq)]
enum LatinCaseState {
    /// The previous byte was not a Latin letter.
    Space,
    /// The previous byte was an upper-case letter that was not preceded by
    /// another upper-case letter.
    Upper,
    /// The previous byte was a lower-case letter.
    Lower,
    /// The previous two or more letters were all upper-case.
    AllCaps,
}
183
/// Case pattern of the non-Latin word currently being read.
#[derive(PartialEq)]
enum NonLatinCaseState {
    /// Not inside a word.
    Space,
    /// The word so far is a single upper-case letter.
    Upper,
    /// The word started with a lower-case letter and has stayed lower-case.
    Lower,
    /// One upper-case letter followed only by lower-case letters.
    UpperLower,
    /// Two or more letters, all upper-case.
    AllCaps,
    /// Any other case pattern; also entered whenever a Latin letter is seen.
    Mix,
}
195
/// Scoring state for a single-byte candidate whose non-Latin alphabet has
/// upper and lower case. In addition to byte-pair scoring it tracks the case
/// pattern and length of each non-Latin word.
struct NonLatinCasedCandidate {
    /// Classification and scoring tables of the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    /// Case pattern of the word that ends at the previous byte.
    case_state: NonLatinCaseState,
    /// Whether the previously fed byte was ASCII (`true` before any input).
    prev_ascii: bool,
    /// Length of the run of non-Latin alphabetic bytes ending at the
    /// previous byte.
    current_word_len: u64,
    /// Longest completed run of non-Latin alphabetic bytes seen so far.
    longest_word: u64,
    /// `true` when `data` is the IBM866 entry of `SINGLE_BYTE_DATA`.
    ibm866: bool,
    /// Whether the previously fed byte was 0xA0.
    prev_was_a0: bool,
}

impl NonLatinCasedCandidate {
    /// Creates a candidate in its initial state for the given tables.
    fn new(data: &'static SingleByteData) -> Self {
        NonLatinCasedCandidate {
            data: data,
            prev: 0,
            case_state: NonLatinCaseState::Space,
            prev_ascii: true,
            current_word_len: 0,
            longest_word: 0,
            ibm866: data == &SINGLE_BYTE_DATA[IBM866_INDEX],
            prev_was_a0: false,
        }
    }

    /// Feeds `buffer` to the candidate and returns the score accumulated for
    /// these bytes.
    ///
    /// Returns `None` as soon as `classify` reports class 255 for a byte
    /// (presumably a byte that is unmapped in this encoding); state updates
    /// made for earlier bytes of the buffer are kept in that case.
    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
        let mut score = 0i64;
        for &b in buffer {
            let class = self.data.classify(b);
            if class == 255 {
                return None;
            }
            // The high bit of the class carries the case; the low seven bits
            // identify the caseless class.
            let caseless_class = class & 0x7F;

            let ascii = b < 0x80;
            let ascii_pair = self.prev_ascii && ascii;

            let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);

            // Advance the per-word case state machine.
            if caseless_class == LATIN_LETTER {
                // Any Latin letter forces the word into the mixed state.
                self.case_state = NonLatinCaseState::Mix;
            } else if !non_ascii_alphabetic {
                // Word boundary: settle the bonus/penalty for the word that
                // just ended, then reset.
                match self.case_state {
                    NonLatinCaseState::Space
                    | NonLatinCaseState::Upper
                    | NonLatinCaseState::Lower => {}
                    NonLatinCaseState::UpperLower => {
                        score += NON_LATIN_CAPITALIZATION_BONUS;
                    }
                    NonLatinCaseState::AllCaps => {
                        // The all-caps penalty is only applied to KOI8-U.
                        if self.data == &SINGLE_BYTE_DATA[KOI8_U_INDEX] {
                            score += NON_LATIN_ALL_CAPS_PENALTY;
                        }
                    }
                    NonLatinCaseState::Mix => {
                        // `current_word_len` still holds the length of the
                        // word that just ended; it is reset further below.
                        score += NON_LATIN_MIXED_CASE_PENALTY * (self.current_word_len as i64);
                    }
                }
                self.case_state = NonLatinCaseState::Space;
            } else if (class >> 7) == 0 {
                // Lower-case non-Latin letter.
                match self.case_state {
                    NonLatinCaseState::Space => {
                        self.case_state = NonLatinCaseState::Lower;
                    }
                    NonLatinCaseState::Upper => {
                        self.case_state = NonLatinCaseState::UpperLower;
                    }
                    NonLatinCaseState::Lower
                    | NonLatinCaseState::UpperLower
                    | NonLatinCaseState::Mix => {}
                    NonLatinCaseState::AllCaps => {
                        self.case_state = NonLatinCaseState::Mix;
                    }
                }
            } else {
                // Upper-case non-Latin letter.
                match self.case_state {
                    NonLatinCaseState::Space => {
                        self.case_state = NonLatinCaseState::Upper;
                    }
                    NonLatinCaseState::Upper => {
                        self.case_state = NonLatinCaseState::AllCaps;
                    }
                    NonLatinCaseState::Lower | NonLatinCaseState::UpperLower => {
                        self.case_state = NonLatinCaseState::Mix;
                    }
                    NonLatinCaseState::AllCaps | NonLatinCaseState::Mix => {}
                }
            }

            // Track the current and the longest non-Latin word length.
            if non_ascii_alphabetic {
                self.current_word_len += 1;
            } else {
                if self.current_word_len > self.longest_word {
                    self.longest_word = self.current_word_len;
                }
                self.current_word_len = 0;
            }

            let is_a0 = b == 0xA0;
            // Pairs of two ASCII bytes contribute nothing to the score.
            if !ascii_pair {
                // For IBM866 the pair score is skipped when 0xA0 follows
                // another 0xA0 or a class-0 byte, and when a class-0 byte
                // follows 0xA0.
                if !(self.ibm866
                    && ((is_a0 && (self.prev_was_a0 || self.prev == 0))
                        || caseless_class == 0 && self.prev_was_a0))
                {
                    score += self.data.score(caseless_class, self.prev, false);
                }

                // Penalize a Latin letter touching a non-Latin letter, in
                // either order.
                if self.prev == LATIN_LETTER && non_ascii_alphabetic {
                    score += LATIN_ADJACENCY_PENALTY;
                } else if caseless_class == LATIN_LETTER
                    && self.data.is_non_latin_alphabetic(self.prev, false)
                {
                    score += LATIN_ADJACENCY_PENALTY;
                }
            }

            self.prev_ascii = ascii;
            self.prev = caseless_class;
            self.prev_was_a0 = is_a0;
        }
        Some(score)
    }
}
350
/// State of the windows-1252 recognizer for ordinal indicators (bytes 0xAA
/// and 0xBA) and the copyright sign (byte 0xA9).
enum OrdinalState {
    /// Inside a token that cannot lead to an ordinal; waiting for a class-0
    /// byte.
    Other,
    /// The previous byte had class 0 (or nothing has been fed yet).
    Space,
    /// Saw `N` or `n` followed by a period.
    PeriodAfterN,
    /// Saw a plausible ordinal indicator; a class-0 byte must follow.
    OrdinalExpectingSpace,
    /// Like `OrdinalExpectingSpace`, but success also adds back the
    /// implausibility penalty.
    OrdinalExpectingSpaceUndoImplausibility,
    /// Saw a plausible ordinal indicator; a class-0 byte or a digit must
    /// follow.
    OrdinalExpectingSpaceOrDigit,
    /// Like `OrdinalExpectingSpaceOrDigit`, but success also adds back the
    /// implausibility penalty.
    OrdinalExpectingSpaceOrDigitUndoImplausibily,
    /// Saw an upper-case `N` at the start of a token.
    UpperN,
    /// Saw a lower-case `n` at the start of a token.
    LowerN,
    /// Saw `M`, `D` or `S` at the start of a token.
    FeminineAbbreviationStartLetter,
    /// Inside a run of ASCII digits.
    Digit,
    /// Inside a run of bytes of class 9, 22 or 24.
    Roman,
    /// Saw byte 0xA9 at the start of a token.
    Copyright,
}
366
367struct LatinCandidate {
368 data: &'static SingleByteData,
369 prev: u8,
370 case_state: LatinCaseState,
371 prev_non_ascii: u32,
372 ordinal_state: OrdinalState, windows1252: bool,
374}
375
376impl LatinCandidate {
377 fn new(data: &'static SingleByteData) -> Self {
378 LatinCandidate {
379 data: data,
380 prev: 0,
381 case_state: LatinCaseState::Space,
382 prev_non_ascii: 0,
383 ordinal_state: OrdinalState::Space,
384 windows1252: data == &SINGLE_BYTE_DATA[WINDOWS_1252_INDEX],
385 }
386 }
387
388 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
389 let mut score = 0i64;
390 for &b in buffer {
391 let class = self.data.classify(b);
392 if class == 255 {
393 return None;
394 }
395 let caseless_class = class & 0x7F;
396
397 let ascii = b < 0x80;
398 let ascii_pair = self.prev_non_ascii == 0 && ascii;
399
400 let non_ascii_penalty = match self.prev_non_ascii {
401 0 | 1 | 2 => 0,
402 3 => -5,
403 4 => -20,
404 _ => -200,
405 };
406 score += non_ascii_penalty;
407 if !self.data.is_latin_alphabetic(caseless_class) {
411 self.case_state = LatinCaseState::Space;
412 } else if (class >> 7) == 0 {
413 if self.case_state == LatinCaseState::AllCaps && !ascii_pair {
417 score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
418 }
419 self.case_state = LatinCaseState::Lower;
420 } else {
421 match self.case_state {
422 LatinCaseState::Space => {
423 self.case_state = LatinCaseState::Upper;
424 }
425 LatinCaseState::Upper | LatinCaseState::AllCaps => {
426 self.case_state = LatinCaseState::AllCaps;
427 }
428 LatinCaseState::Lower => {
429 if !ascii_pair {
430 score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
432 }
433 self.case_state = LatinCaseState::Upper;
434 }
435 }
436 }
437
438 let ascii_ish_pair = ascii_pair
443 || (ascii && self.prev == 0)
444 || (caseless_class == 0 && self.prev_non_ascii == 0);
445
446 if !ascii_ish_pair {
447 score += self.data.score(caseless_class, self.prev, false);
448 }
449
450 if self.windows1252 {
451 match self.ordinal_state {
472 OrdinalState::Other => {
473 if caseless_class == 0 {
474 self.ordinal_state = OrdinalState::Space;
475 }
476 }
477 OrdinalState::Space => {
478 if caseless_class == 0 {
479 } else if b == 0xAA || b == 0xBA {
481 self.ordinal_state = OrdinalState::OrdinalExpectingSpace;
482 } else if b == b'M' || b == b'D' || b == b'S' {
483 self.ordinal_state = OrdinalState::FeminineAbbreviationStartLetter;
484 } else if b == b'N' {
485 self.ordinal_state = OrdinalState::UpperN;
487 } else if b == b'n' {
488 self.ordinal_state = OrdinalState::LowerN;
490 } else if caseless_class == (ASCII_DIGIT as u8) {
491 self.ordinal_state = OrdinalState::Digit;
492 } else if caseless_class == 9 || caseless_class == 22 || caseless_class == 24
493 {
495 self.ordinal_state = OrdinalState::Roman;
496 } else if b == 0xA9 {
497 self.ordinal_state = OrdinalState::Copyright;
498 } else {
499 self.ordinal_state = OrdinalState::Other;
500 }
501 }
502 OrdinalState::OrdinalExpectingSpace => {
503 if caseless_class == 0 {
504 score += ORDINAL_BONUS;
505 self.ordinal_state = OrdinalState::Space;
506 } else {
507 self.ordinal_state = OrdinalState::Other;
508 }
509 }
510 OrdinalState::OrdinalExpectingSpaceUndoImplausibility => {
511 if caseless_class == 0 {
512 score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
513 self.ordinal_state = OrdinalState::Space;
514 } else {
515 self.ordinal_state = OrdinalState::Other;
516 }
517 }
518 OrdinalState::OrdinalExpectingSpaceOrDigit => {
519 if caseless_class == 0 {
520 score += ORDINAL_BONUS;
521 self.ordinal_state = OrdinalState::Space;
522 } else if caseless_class == (ASCII_DIGIT as u8) {
523 score += ORDINAL_BONUS;
524 self.ordinal_state = OrdinalState::Other;
526 } else {
527 self.ordinal_state = OrdinalState::Other;
528 }
529 }
530 OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily => {
531 if caseless_class == 0 {
532 score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
533 self.ordinal_state = OrdinalState::Space;
534 } else if caseless_class == (ASCII_DIGIT as u8) {
535 score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
536 self.ordinal_state = OrdinalState::Other;
538 } else {
539 self.ordinal_state = OrdinalState::Other;
540 }
541 }
542 OrdinalState::UpperN => {
543 if b == 0xAA {
544 self.ordinal_state =
545 OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
546 } else if b == 0xBA {
547 self.ordinal_state =
548 OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
549 } else if b == b'.' {
550 self.ordinal_state = OrdinalState::PeriodAfterN;
551 } else if caseless_class == 0 {
552 self.ordinal_state = OrdinalState::Space;
553 } else {
554 self.ordinal_state = OrdinalState::Other;
555 }
556 }
557 OrdinalState::LowerN => {
558 if b == 0xBA {
559 self.ordinal_state =
560 OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
561 } else if b == b'.' {
562 self.ordinal_state = OrdinalState::PeriodAfterN;
563 } else if caseless_class == 0 {
564 self.ordinal_state = OrdinalState::Space;
565 } else {
566 self.ordinal_state = OrdinalState::Other;
567 }
568 }
569 OrdinalState::FeminineAbbreviationStartLetter => {
570 if b == 0xAA {
571 self.ordinal_state =
572 OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
573 } else if caseless_class == 0 {
574 self.ordinal_state = OrdinalState::Space;
575 } else {
576 self.ordinal_state = OrdinalState::Other;
577 }
578 }
579 OrdinalState::Digit => {
580 if b == 0xAA || b == 0xBA {
581 self.ordinal_state = OrdinalState::OrdinalExpectingSpace;
582 } else if caseless_class == 0 {
583 self.ordinal_state = OrdinalState::Space;
584 } else if caseless_class == (ASCII_DIGIT as u8) {
585 } else {
587 self.ordinal_state = OrdinalState::Other;
588 }
589 }
590 OrdinalState::Roman => {
591 if b == 0xAA || b == 0xBA {
592 self.ordinal_state =
593 OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
594 } else if caseless_class == 0 {
595 self.ordinal_state = OrdinalState::Space;
596 } else if caseless_class == 9 || caseless_class == 22 || caseless_class == 24
597 {
599 } else {
601 self.ordinal_state = OrdinalState::Other;
602 }
603 }
604 OrdinalState::PeriodAfterN => {
605 if b == 0xBA {
606 self.ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit;
607 } else if caseless_class == 0 {
608 self.ordinal_state = OrdinalState::Space;
609 } else {
610 self.ordinal_state = OrdinalState::Other;
611 }
612 }
613 OrdinalState::Copyright => {
614 if caseless_class == 0 {
615 score += COPYRIGHT_BONUS;
616 self.ordinal_state = OrdinalState::Space;
617 } else {
618 self.ordinal_state = OrdinalState::Other;
619 }
620 }
621 }
622 }
623
624 if ascii {
625 self.prev_non_ascii = 0;
626 } else {
627 self.prev_non_ascii += 1;
628 }
629 self.prev = caseless_class;
630 }
631 Some(score)
632 }
633}
634
635struct ArabicFrenchCandidate {
636 data: &'static SingleByteData,
637 prev: u8,
638 case_state: LatinCaseState,
639 prev_ascii: bool,
640 current_word_len: u64,
641 longest_word: u64,
642}
643
644impl ArabicFrenchCandidate {
645 fn new(data: &'static SingleByteData) -> Self {
646 ArabicFrenchCandidate {
647 data: data,
648 prev: 0,
649 case_state: LatinCaseState::Space,
650 prev_ascii: true,
651 current_word_len: 0,
652 longest_word: 0,
653 }
654 }
655
656 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
657 let mut score = 0i64;
658 for &b in buffer {
659 let class = self.data.classify(b);
660 if class == 255 {
661 return None;
662 }
663 let caseless_class = class & 0x7F;
664
665 let ascii = b < 0x80;
666 let ascii_pair = self.prev_ascii && ascii;
667
668 if caseless_class != LATIN_LETTER {
669 self.case_state = LatinCaseState::Space;
671 } else if (class >> 7) == 0 {
672 if self.case_state == LatinCaseState::AllCaps && !ascii_pair {
673 score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
674 }
675 self.case_state = LatinCaseState::Lower;
676 } else {
677 match self.case_state {
678 LatinCaseState::Space => {
679 self.case_state = LatinCaseState::Upper;
680 }
681 LatinCaseState::Upper | LatinCaseState::AllCaps => {
682 self.case_state = LatinCaseState::AllCaps;
683 }
684 LatinCaseState::Lower => {
685 if !ascii_pair {
686 score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
687 }
688 self.case_state = LatinCaseState::Upper;
689 }
690 }
691 }
692
693 let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, true);
695 if non_ascii_alphabetic {
697 self.current_word_len += 1;
698 } else {
699 if self.current_word_len > self.longest_word {
700 self.longest_word = self.current_word_len;
701 }
702 self.current_word_len = 0;
703 }
704
705 if !ascii_pair {
706 score += self.data.score(caseless_class, self.prev, true);
707
708 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
709 score += LATIN_ADJACENCY_PENALTY;
710 } else if caseless_class == LATIN_LETTER
711 && self.data.is_non_latin_alphabetic(self.prev, true)
712 {
713 score += LATIN_ADJACENCY_PENALTY;
714 }
715 }
716
717 self.prev_ascii = ascii;
718 self.prev = caseless_class;
719 }
720 Some(score)
721 }
722}
723
724struct CaselessCandidate {
725 data: &'static SingleByteData,
726 prev: u8,
727 prev_ascii: bool,
728 current_word_len: u64,
729 longest_word: u64,
730}
731
732impl CaselessCandidate {
733 fn new(data: &'static SingleByteData) -> Self {
734 CaselessCandidate {
735 data: data,
736 prev: 0,
737 prev_ascii: true,
738 current_word_len: 0,
739 longest_word: 0,
740 }
741 }
742
743 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
744 let mut score = 0i64;
745 for &b in buffer {
746 let class = self.data.classify(b);
747 if class == 255 {
748 return None;
749 }
750 let caseless_class = class & 0x7F;
751
752 let ascii = b < 0x80;
753 let ascii_pair = self.prev_ascii && ascii;
754
755 let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
756 if non_ascii_alphabetic {
758 self.current_word_len += 1;
759 } else {
760 if self.current_word_len > self.longest_word {
761 self.longest_word = self.current_word_len;
762 }
763 self.current_word_len = 0;
764 }
765
766 if !ascii_pair {
767 score += self.data.score(caseless_class, self.prev, false);
768
769 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
770 score += LATIN_ADJACENCY_PENALTY;
771 } else if caseless_class == LATIN_LETTER
772 && self.data.is_non_latin_alphabetic(self.prev, false)
773 {
774 score += LATIN_ADJACENCY_PENALTY;
775 }
776 }
777
778 self.prev_ascii = ascii;
779 self.prev = caseless_class;
780 }
781 Some(score)
782 }
783}
784
/// Returns `true` for the six ASCII punctuation bytes `.`, `,`, `:`, `;`,
/// `?` and `!`; every other byte yields `false`.
fn is_ascii_punctuation(byte: u8) -> bool {
    b".,:;?!".contains(&byte)
}
791
792struct LogicalCandidate {
793 data: &'static SingleByteData,
794 prev: u8,
795 prev_ascii: bool,
796 plausible_punctuation: u64,
797 current_word_len: u64,
798 longest_word: u64,
799}
800
801impl LogicalCandidate {
802 fn new(data: &'static SingleByteData) -> Self {
803 LogicalCandidate {
804 data: data,
805 prev: 0,
806 prev_ascii: true,
807 plausible_punctuation: 0,
808 current_word_len: 0,
809 longest_word: 0,
810 }
811 }
812
813 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
814 let mut score = 0i64;
815 for &b in buffer {
816 let class = self.data.classify(b);
817 if class == 255 {
818 return None;
819 }
820 let caseless_class = class & 0x7F;
821
822 let ascii = b < 0x80;
823 let ascii_pair = self.prev_ascii && ascii;
824
825 let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
826 if non_ascii_alphabetic {
828 self.current_word_len += 1;
829 } else {
830 if self.current_word_len > self.longest_word {
831 self.longest_word = self.current_word_len;
832 }
833 self.current_word_len = 0;
834 }
835
836 if !ascii_pair {
837 score += self.data.score(caseless_class, self.prev, false);
838
839 let prev_non_ascii_alphabetic = self.data.is_non_latin_alphabetic(self.prev, false);
840 if caseless_class == 0 && prev_non_ascii_alphabetic && is_ascii_punctuation(b) {
841 self.plausible_punctuation += 1;
842 }
843
844 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
845 score += LATIN_ADJACENCY_PENALTY;
846 } else if caseless_class == LATIN_LETTER && prev_non_ascii_alphabetic {
847 score += LATIN_ADJACENCY_PENALTY;
848 }
849 }
850
851 self.prev_ascii = ascii;
852 self.prev = caseless_class;
853 }
854 Some(score)
855 }
856}
857
858struct VisualCandidate {
859 data: &'static SingleByteData,
860 prev: u8,
861 prev_ascii: bool,
862 prev_punctuation: bool,
863 plausible_punctuation: u64,
864 current_word_len: u64,
865 longest_word: u64,
866}
867
868impl VisualCandidate {
869 fn new(data: &'static SingleByteData) -> Self {
870 VisualCandidate {
871 data: data,
872 prev: 0,
873 prev_ascii: true,
874 prev_punctuation: false,
875 plausible_punctuation: 0,
876 current_word_len: 0,
877 longest_word: 0,
878 }
879 }
880
881 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
882 let mut score = 0i64;
883 for &b in buffer {
884 let class = self.data.classify(b);
885 if class == 255 {
886 return None;
887 }
888 let caseless_class = class & 0x7F;
889
890 let ascii = b < 0x80;
891 let ascii_pair = self.prev_ascii && ascii;
892
893 let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
894 if non_ascii_alphabetic {
896 self.current_word_len += 1;
897 } else {
898 if self.current_word_len > self.longest_word {
899 self.longest_word = self.current_word_len;
900 }
901 self.current_word_len = 0;
902 }
903
904 if !ascii_pair {
905 score += self.data.score(caseless_class, self.prev, false);
906
907 if non_ascii_alphabetic && self.prev_punctuation {
908 self.plausible_punctuation += 1;
909 }
910
911 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
912 score += LATIN_ADJACENCY_PENALTY;
913 } else if caseless_class == LATIN_LETTER
914 && self.data.is_non_latin_alphabetic(self.prev, false)
915 {
916 score += LATIN_ADJACENCY_PENALTY;
917 }
918 }
919
920 self.prev_ascii = ascii;
921 self.prev = caseless_class;
922 self.prev_punctuation = caseless_class == 0 && is_ascii_punctuation(b);
923 }
924 Some(score)
925 }
926}
927
928struct Utf8Candidate {
929 decoder: Decoder,
930}
931
932impl Utf8Candidate {
933 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
934 let mut dst = [0u8; 1024];
935 let mut total_read = 0;
936 loop {
937 let (result, read, _) = self.decoder.decode_to_utf8_without_replacement(
938 &buffer[total_read..],
939 &mut dst,
940 last,
941 );
942 total_read += read;
943 match result {
944 DecoderResult::InputEmpty => {
945 return Some(0);
946 }
947 DecoderResult::Malformed(_, _) => {
948 return None;
949 }
950 DecoderResult::OutputFull => {
951 continue;
952 }
953 }
954 }
955 }
956}
957
958struct Iso2022Candidate {
959 decoder: Decoder,
960}
961
962impl Iso2022Candidate {
963 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
964 let mut dst = [0u16; 1024];
965 let mut total_read = 0;
966 loop {
967 let (result, read, _) = self.decoder.decode_to_utf16_without_replacement(
968 &buffer[total_read..],
969 &mut dst,
970 last,
971 );
972 total_read += read;
973 match result {
974 DecoderResult::InputEmpty => {
975 return Some(0);
976 }
977 DecoderResult::Malformed(_, _) => {
978 return None;
979 }
980 DecoderResult::OutputFull => {
981 continue;
982 }
983 }
984 }
985 }
986}
987
/// Coarse category of the previously decoded character, used to detect an
/// ASCII letter directly next to a CJ character.
#[derive(PartialEq)]
enum LatinCj {
    /// An ASCII letter (`a`-`z` or `A`-`Z`).
    AsciiLetter,
    /// A character counted as Chinese/Japanese text.
    Cj,
    /// Anything else.
    Other,
}
994
/// Which half-width voicing marks may plausibly follow the previously
/// decoded character.
#[derive(PartialEq, Copy, Clone)]
enum HalfWidthKatakana {
    /// Neither U+FF9E nor U+FF9F is plausible next.
    DakutenForbidden,
    /// U+FF9E is plausible next; U+FF9F is not.
    DakutenAllowed,
    /// Both U+FF9E and U+FF9F are plausible next.
    DakutenOrHandakutenAllowed,
}
1001
/// Coarse category of a decoded character for Korean text: ASCII letter,
/// Hangul, Hanja, or anything else.
#[derive(PartialEq)]
enum LatinKorean {
    /// An ASCII letter.
    AsciiLetter,
    /// A Hangul character.
    Hangul,
    /// A Hanja character.
    Hanja,
    /// Anything else.
    Other,
}
1009
/// Returns an extra score for `u` based on where it first appears in
/// `table`: `(128 - index) / 16`, i.e. 8 for index 0 down to 0 for the last
/// entries. Code units not present in the table score 0.
fn cjk_extra_score(u: u16, table: &'static [u16; 128]) -> i64 {
    for (index, &entry) in table.iter().enumerate() {
        if entry == u {
            return ((128 - index) / 16) as i64;
        }
    }
    0
}
1017
/// Scoring state for the GBK candidate. Input is decoded one byte at a time
/// and every decoded character contributes a score based on its category.
struct GbkCandidate {
    /// Streaming GBK decoder; replaced with a fresh one after a tolerated
    /// malformed sequence.
    decoder: Decoder,
    /// The previously fed byte.
    prev_byte: u8,
    /// Category of the previously decoded character.
    prev: LatinCj,
    /// Score of the most recent hanzi that is being withheld until the next
    /// character shows whether to keep or drop it.
    pending_score: Option<i64>,
}

impl GbkCandidate {
    /// Returns `s` immediately when the previous character was CJ or the
    /// previous byte is not a "more problematic" lead; otherwise stores `s`
    /// as the pending score and returns 0.
    ///
    /// Panics if a pending score is already stored.
    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
        assert!(self.pending_score.is_none());
        if self.prev == LatinCj::Cj || !more_problematic_lead(self.prev_byte) {
            s
        } else {
            self.pending_score = Some(s);
            0
        }
    }

    /// Feeds `buffer` to the candidate and returns the score accumulated for
    /// these bytes, or `None` if the input cannot be GBK.
    ///
    /// When `last` is true the decoder is additionally flushed, so a
    /// truncated trailing sequence also yields `None`.
    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
        let mut score = 0i64;
        let mut src = [0u8];
        let mut dst = [0u16; 2];
        for &b in buffer {
            // Decode exactly one input byte at a time so that `b` and
            // `self.prev_byte` are known for every decoded character.
            src[0] = b;
            let (result, read, written) = self
                .decoder
                .decode_to_utf16_without_replacement(&src, &mut dst, false);
            if written == 1 {
                // A single UTF-16 code unit was produced.
                let u = dst[0];
                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
                {
                    // ASCII letter: a withheld score is dropped.
                    self.pending_score = None;
                    if self.prev == LatinCj::Cj {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::AsciiLetter;
                } else if u == 0x20AC {
                    // U+20AC: a withheld score is dropped, nothing is added.
                    self.pending_score = None;
                    self.prev = LatinCj::Other;
                } else if u >= 0x4E00 && u <= 0x9FA5 {
                    // A following hanzi confirms the withheld score.
                    if let Some(pending) = self.pending_score {
                        score += pending;
                        self.pending_score = None;
                    }
                    if b >= 0xA1 && b <= 0xFE {
                        // Trail byte in the EUC range: grade by lead byte.
                        match self.prev_byte {
                            0xA1..=0xD7 => {
                                score += GBK_SCORE_PER_LEVEL_1;
                                score +=
                                    cjk_extra_score(u, &data::DETECTOR_DATA.frequent_simplified);
                            }
                            0xD8..=0xFE => score += GBK_SCORE_PER_LEVEL_2,
                            _ => {
                                score += GBK_SCORE_PER_NON_EUC;
                            }
                        }
                    } else {
                        // Non-EUC trail byte: the score may be withheld.
                        score += self.maybe_set_as_pending(GBK_SCORE_PER_NON_EUC);
                    }
                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
                    // Other ideograph ranges: counted as CJ but add no score
                    // of their own.
                    if let Some(pending) = self.pending_score {
                        score += pending;
                        self.pending_score = None;
                    }
                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                } else if u >= 0xE000 && u < 0xF900 {
                    // Private Use Area.
                    if let Some(pending) = self.pending_score {
                        score += pending;
                        self.pending_score = None;
                    }
                    match u {
                        // These specific PUA code points are scored like
                        // non-EUC hanzi instead of being penalized.
                        0xE78D..=0xE796
                        | 0xE816..=0xE818
                        | 0xE81E
                        | 0xE826
                        | 0xE82B
                        | 0xE82C
                        | 0xE831
                        | 0xE832
                        | 0xE83B
                        | 0xE843
                        | 0xE854
                        | 0xE855
                        | 0xE864 => {
                            score += GBK_SCORE_PER_NON_EUC;
                            if self.prev == LatinCj::AsciiLetter {
                                score += CJK_LATIN_ADJACENCY_PENALTY;
                            }
                            self.prev = LatinCj::Cj;
                        }
                        _ => {
                            score += GBK_PUA_PENALTY;
                            self.prev = LatinCj::Other;
                        }
                    }
                } else {
                    match u {
                        // Selected CJK punctuation code points.
                        0x3000 | 0x3001 | 0x3002 | 0xFF08 | 0xFF09 | 0xFF01 | 0xFF0C | 0xFF1B | 0xFF1F => {
                            if let Some(pending) = self.pending_score {
                                score += pending;
                                self.pending_score = None;
                            }
                            score += CJ_PUNCTUATION;
                        }
                        // Remaining ASCII: a withheld score is dropped.
                        0..=0x7F => {
                            self.pending_score = None;
                        }
                        _ => {
                            if let Some(pending) = self.pending_score {
                                score += pending;
                                self.pending_score = None;
                            }
                            score += CJK_OTHER;
                        }
                    }
                    self.prev = LatinCj::Other;
                }
            } else if written == 2 {
                // Two code units were produced (a surrogate pair); classify
                // by the first unit.
                if let Some(pending) = self.pending_score {
                    score += pending;
                    self.pending_score = None;
                }
                let u = dst[0];
                if u >= 0xDB80 && u <= 0xDBFF {
                    score += GBK_PUA_PENALTY;
                    self.prev = LatinCj::Other;
                } else if u >= 0xD480 && u < 0xD880 {
                    score += GBK_SCORE_PER_NON_EUC;
                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                } else {
                    score += CJK_OTHER;
                    self.prev = LatinCj::Other;
                }
            }
            match result {
                DecoderResult::InputEmpty => {
                    assert_eq!(read, 1);
                }
                DecoderResult::Malformed(malformed_len, _) => {
                    // Two kinds of malformed input are tolerated with a
                    // penalty and a decoder restart instead of rejecting the
                    // candidate; anything else disqualifies it.
                    if (self.prev_byte == 0xA0 || self.prev_byte == 0xFE || self.prev_byte == 0xFD)
                        && (b < 0x80 || b == 0xFF)
                    {
                        // 0xA0, 0xFD or 0xFE followed by ASCII or 0xFF.
                        self.pending_score = None;
                        score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
                        if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
                            self.prev = LatinCj::AsciiLetter;
                        } else if b == 0xFF {
                            // 0xFF is itself penalized as well.
                            score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
                            self.prev = LatinCj::Other;
                        } else {
                            self.prev = LatinCj::Other;
                        }
                        self.decoder = GBK.new_decoder_without_bom_handling();
                    } else if malformed_len == 1 && b == 0xFF {
                        // A lone 0xFF.
                        self.pending_score = None;
                        score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
                        self.prev = LatinCj::Other;
                        self.decoder = GBK.new_decoder_without_bom_handling();
                    } else {
                        return None;
                    }
                }
                DecoderResult::OutputFull => {
                    // One input byte never needs more than the two code
                    // units available in `dst`.
                    unreachable!();
                }
            }
            self.prev_byte = b;
        }
        if last {
            // Flush the decoder to detect a truncated final sequence.
            let (result, _, _) = self
                .decoder
                .decode_to_utf16_without_replacement(b"", &mut dst, true);
            match result {
                DecoderResult::InputEmpty => {}
                DecoderResult::Malformed(_, _) => {
                    return None;
                }
                DecoderResult::OutputFull => {
                    unreachable!();
                }
            }
        }
        Some(score)
    }
}
1234
/// Returns `true` if `b` is one of the lead bytes singled out as
/// problematic: 0x91..=0x97, or one of 0x8A, 0x8B, 0x8E, 0x9A, 0x9B, 0x9E
/// and 0xB0.
fn problematic_lead(b: u8) -> bool {
    const SINGLES: [u8; 7] = [0x8A, 0x8B, 0x8E, 0x9A, 0x9B, 0x9E, 0xB0];
    (b >= 0x91 && b <= 0x97) || SINGLES.contains(&b)
}
1242
1243fn more_problematic_lead(b: u8) -> bool {
1245 problematic_lead(b) || b == 0x82 || b == 0x84 || b == 0x85 || b == 0xA0
1246}
1247
1248struct ShiftJisCandidate {
1249 decoder: Decoder,
1250 half_width_katakana_seen: bool,
1251 half_width_katakana_state: HalfWidthKatakana,
1252 prev: LatinCj,
1253 prev_byte: u8,
1254 pending_score: Option<i64>,
1255}
1256
1257impl ShiftJisCandidate {
1258 fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1259 assert!(self.pending_score.is_none());
1260 if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
1261 s
1262 } else {
1263 self.pending_score = Some(s);
1264 0
1265 }
1266 }
1267
1268 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1269 let mut score = 0i64;
1270 let mut src = [0u8];
1271 let mut dst = [0u16; 2];
1272 for &b in buffer {
1273 src[0] = b;
1274 let (result, read, written) = self
1275 .decoder
1276 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1277 if written > 0 {
1278 let half_width_katakana_state = self.half_width_katakana_state;
1279 self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
1280 let u = dst[0];
1281 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1282 || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1283 {
1284 self.pending_score = None; if self.prev == LatinCj::Cj {
1286 score += CJK_LATIN_ADJACENCY_PENALTY;
1287 }
1288 self.prev = LatinCj::AsciiLetter;
1289 } else if u >= 0xFF61 && u <= 0xFF9F {
1290 if !self.half_width_katakana_seen {
1291 self.half_width_katakana_seen = true;
1292 score += SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY;
1294 }
1295 self.pending_score = None; score += HALF_WIDTH_KATAKANA_SCORE;
1297
1298 if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
1299 self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
1300 } else if u >= 0xFF8A && u <= 0xFF8E {
1301 self.half_width_katakana_state =
1302 HalfWidthKatakana::DakutenOrHandakutenAllowed;
1303 } else if u == 0xFF9E {
1304 if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
1305 score += IMPLAUSIBILITY_PENALTY;
1306 } else {
1307 score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1308 }
1309 } else if u == 0xFF9F {
1310 if half_width_katakana_state
1311 != HalfWidthKatakana::DakutenOrHandakutenAllowed
1312 {
1313 score += IMPLAUSIBILITY_PENALTY;
1314 } else {
1315 score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1316 }
1317 }
1318
1319 if self.prev == LatinCj::AsciiLetter {
1320 score += CJK_LATIN_ADJACENCY_PENALTY;
1321 }
1322 self.prev = LatinCj::Cj;
1323 } else if u >= 0x3040 && u < 0x3100 {
1324 if let Some(pending) = self.pending_score {
1325 score += pending;
1326 self.pending_score = None;
1327 }
1328 score += SHIFT_JIS_SCORE_PER_KANA;
1329 if self.prev == LatinCj::AsciiLetter {
1330 score += CJK_LATIN_ADJACENCY_PENALTY;
1331 }
1332 self.prev = LatinCj::Cj;
1333 } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1334 if let Some(pending) = self.pending_score {
1335 score += pending;
1336 self.pending_score = None;
1337 }
1338 if self.prev_byte < 0x98 || (self.prev_byte == 0x98 && b < 0x73) {
1339 score += self.maybe_set_as_pending(
1340 SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI
1341 + cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji),
1342 );
1343 } else {
1344 score += self.maybe_set_as_pending(SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI);
1345 }
1346 if self.prev == LatinCj::AsciiLetter {
1347 score += CJK_LATIN_ADJACENCY_PENALTY;
1348 }
1349 self.prev = LatinCj::Cj;
1350 } else if u >= 0xE000 && u < 0xF900 {
1351 if let Some(pending) = self.pending_score {
1352 score += pending;
1353 self.pending_score = None;
1354 }
1355 score += SHIFT_JIS_PUA_PENALTY;
1356 self.prev = LatinCj::Other;
1357 } else {
1358 match u {
1359 0x3000 | 0x3001 | 0x3002 | 0xFF08 | 0xFF09 => {
1365 if let Some(pending) = self.pending_score {
1366 score += pending;
1367 self.pending_score = None;
1368 }
1369 score += CJ_PUNCTUATION;
1373 }
1374 0..=0x7F => {
1375 self.pending_score = None; }
1377 0x80 => {
1378 self.pending_score = None; score += IMPLAUSIBILITY_PENALTY;
1383 }
1384 _ => {
1385 if let Some(pending) = self.pending_score {
1386 score += pending;
1387 self.pending_score = None;
1388 }
1389 score += CJK_OTHER;
1390 }
1391 }
1392 self.prev = LatinCj::Other;
1393 }
1394 }
1395 match result {
1396 DecoderResult::InputEmpty => {
1397 assert_eq!(read, 1);
1398 }
1399 DecoderResult::Malformed(malformed_len, _) => {
1400 if (((self.prev_byte >= 0x81 && self.prev_byte <= 0x9F)
1401 || (self.prev_byte >= 0xE0 && self.prev_byte <= 0xFC))
1402 && ((b >= 0x40 && b <= 0x7E) || (b >= 0x80 && b <= 0xFC)))
1403 && !((self.prev_byte == 0x82 && b >= 0xFA)
1404 || (self.prev_byte == 0x84 && ((b >= 0xDD && b <= 0xE4) || b >= 0xFB))
1405 || (self.prev_byte == 0x86 && b >= 0xF2 && b <= 0xFA)
1406 || (self.prev_byte == 0x87 && b >= 0x77 && b <= 0x7D)
1407 || (self.prev_byte == 0xFC && b >= 0xF5))
1408 {
1409 if let Some(pending) = self.pending_score {
1411 score += pending;
1412 self.pending_score = None;
1413 }
1414 score += SHIFT_JIS_EXTENSION_PENALTY;
1415 if self.prev_byte < 0x87 {
1417 self.prev = LatinCj::Other;
1418 } else {
1419 if self.prev == LatinCj::AsciiLetter {
1420 score += CJK_LATIN_ADJACENCY_PENALTY;
1421 }
1422 self.prev = LatinCj::Cj;
1423 }
1424 } else if malformed_len == 1 && (b == 0xA0 || b >= 0xFD) {
1425 self.pending_score = None; score += SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY;
1427 self.prev = LatinCj::Other;
1428 } else {
1429 return None;
1430 }
1431 }
1432 DecoderResult::OutputFull => {
1433 unreachable!();
1434 }
1435 }
1436 self.prev_byte = b;
1437 }
1438 if last {
1439 let (result, _, _) = self
1440 .decoder
1441 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1442 match result {
1443 DecoderResult::InputEmpty => {}
1444 DecoderResult::Malformed(_, _) => {
1445 return None;
1446 }
1447 DecoderResult::OutputFull => {
1448 unreachable!();
1449 }
1450 }
1451 }
1452 Some(score)
1453 }
1454}
1455
/// Scoring state for the EUC-JP candidate.
///
/// `feed` pushes input through `decoder` one byte at a time; the remaining
/// fields carry context from one byte (and one `feed` call) to the next.
struct EucJpCandidate {
    /// Decoder every input byte is run through.
    decoder: Decoder,
    /// Becomes `true` once the first decoded code unit at or above U+0080
    /// has been seen.
    non_ascii_seen: bool,
    /// Whether the previously decoded character may be followed by a
    /// half-width dakuten (U+FF9E) or handakuten (U+FF9F).
    half_width_katakana_state: HalfWidthKatakana,
    /// Class of the previously scored character; drives
    /// `CJK_LATIN_ADJACENCY_PENALTY`.
    prev: LatinCj,
    /// The byte fed immediately before the current one.
    prev_byte: u8,
    /// The byte fed two positions before the current one.
    prev_prev_byte: u8,
}
1464
1465impl EucJpCandidate {
1466 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1467 let mut score = 0i64;
1468 let mut src = [0u8];
1469 let mut dst = [0u16; 2];
1470 for &b in buffer {
1471 src[0] = b;
1472 let (result, read, written) = self
1473 .decoder
1474 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1475 if written > 0 {
1476 let half_width_katakana_state = self.half_width_katakana_state;
1477 self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
1478 let u = dst[0];
1479 if !self.non_ascii_seen && u >= 0x80 {
1480 self.non_ascii_seen = true;
1481 if u >= 0xFF61 && u <= 0xFF9F {
1482 }
1484 if u >= 0x3040 && u < 0x3100 {
1485 score += EUC_JP_INITIAL_KANA_PENALTY;
1488 }
1489 }
1490 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1491 || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1492 {
1493 if self.prev == LatinCj::Cj {
1494 score += CJK_LATIN_ADJACENCY_PENALTY;
1495 }
1496 self.prev = LatinCj::AsciiLetter;
1497 } else if u >= 0xFF61 && u <= 0xFF9F {
1498 score += HALF_WIDTH_KATAKANA_SCORE;
1499
1500 if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
1501 self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
1502 } else if u >= 0xFF8A && u <= 0xFF8E {
1503 self.half_width_katakana_state =
1504 HalfWidthKatakana::DakutenOrHandakutenAllowed;
1505 } else if u == 0xFF9E {
1506 if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
1507 score += IMPLAUSIBILITY_PENALTY;
1508 } else {
1509 score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1510 }
1511 } else if u == 0xFF9F {
1512 if half_width_katakana_state
1513 != HalfWidthKatakana::DakutenOrHandakutenAllowed
1514 {
1515 score += IMPLAUSIBILITY_PENALTY;
1516 } else {
1517 score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1518 }
1519 }
1520
1521 if self.prev == LatinCj::AsciiLetter {
1522 score += CJK_LATIN_ADJACENCY_PENALTY;
1523 }
1524 self.prev = LatinCj::Other;
1525 } else if (u >= 0x3041 && u <= 0x3093) || (u >= 0x30A1 && u <= 0x30F6) {
1526 match u {
1527 0x3090 | 0x3091 | 0x30F0 | 0x30F1 => {
1532 score += EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA;
1534 }
1535 _ => {
1536 score += EUC_JP_SCORE_PER_KANA;
1537 }
1538 }
1539 if self.prev == LatinCj::AsciiLetter {
1540 score += CJK_LATIN_ADJACENCY_PENALTY;
1541 }
1542 self.prev = LatinCj::Cj;
1543 } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1544 if self.prev_prev_byte == 0x8F {
1545 score += EUC_JP_SCORE_PER_OTHER_KANJI;
1546 } else if self.prev_byte < 0xD0 {
1547 score += EUC_JP_SCORE_PER_LEVEL_1_KANJI;
1548 score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji);
1549 } else {
1550 score += EUC_JP_SCORE_PER_LEVEL_2_KANJI;
1551 }
1552 if self.prev == LatinCj::AsciiLetter {
1553 score += CJK_LATIN_ADJACENCY_PENALTY;
1554 }
1555 self.prev = LatinCj::Cj;
1556 } else {
1557 match u {
1558 0x3000 | 0x3001 | 0x3002 | 0xFF08 | 0xFF09 => {
1564 score += CJ_PUNCTUATION;
1565 }
1566 0..=0x7F => {}
1567 _ => {
1568 score += CJK_OTHER;
1569 }
1570 }
1571 self.prev = LatinCj::Other;
1572 }
1573 }
1574 match result {
1575 DecoderResult::InputEmpty => {
1576 assert_eq!(read, 1);
1577 }
1578 DecoderResult::Malformed(_, _) => {
1579 if b >= 0xA1
1580 && b <= 0xFE
1581 && self.prev_byte >= 0xA1
1582 && self.prev_byte <= 0xFE
1583 && ((self.prev_prev_byte != 0x8F
1584 && !(self.prev_byte == 0xA8 && b >= 0xDF && b <= 0xE6)
1585 && !(self.prev_byte == 0xAC && b >= 0xF4 && b <= 0xFC)
1586 && !(self.prev_byte == 0xAD && b >= 0xD8 && b <= 0xDE))
1587 || (self.prev_prev_byte == 0x8F
1588 && self.prev_byte != 0xA2
1589 && self.prev_byte != 0xA6
1590 && self.prev_byte != 0xA7
1591 && self.prev_byte != 0xA9
1592 && self.prev_byte != 0xAA
1593 && self.prev_byte != 0xAB
1594 && self.prev_byte != 0xED
1595 && !(self.prev_byte == 0xFE && b >= 0xF7)))
1596 {
1597 score += EUC_JP_EXTENSION_PENALTY;
1598 if self.prev == LatinCj::AsciiLetter {
1599 score += CJK_LATIN_ADJACENCY_PENALTY;
1600 }
1601 self.prev = LatinCj::Cj;
1602 } else {
1603 return None;
1604 }
1605 }
1606 DecoderResult::OutputFull => {
1607 unreachable!();
1608 }
1609 }
1610 self.prev_prev_byte = self.prev_byte;
1611 self.prev_byte = b;
1612 }
1613 if last {
1614 let (result, _, _) = self
1615 .decoder
1616 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1617 match result {
1618 DecoderResult::InputEmpty => {}
1619 DecoderResult::Malformed(_, _) => {
1620 return None;
1621 }
1622 DecoderResult::OutputFull => {
1623 unreachable!();
1624 }
1625 }
1626 }
1627 Some(score)
1628 }
1629}
1630
/// Scoring state for the Big5 candidate.
///
/// `feed` pushes input through `decoder` one byte at a time; the remaining
/// fields carry context from one byte (and one `feed` call) to the next.
struct Big5Candidate {
    /// Decoder every input byte is run through.
    decoder: Decoder,
    /// Class of the previously scored character; drives
    /// `CJK_LATIN_ADJACENCY_PENALTY` and the pending-score decision.
    prev: LatinCj,
    /// The byte fed immediately before the current one.
    prev_byte: u8,
    /// Score withheld by `maybe_set_as_pending`. It is added to the total
    /// when the next character is non-ASCII and discarded when it is ASCII.
    pending_score: Option<i64>,
}
1637
1638impl Big5Candidate {
1639 fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1640 assert!(self.pending_score.is_none());
1641 if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
1642 s
1643 } else {
1644 self.pending_score = Some(s);
1645 0
1646 }
1647 }
1648
1649 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1650 let mut score = 0i64;
1651 let mut src = [0u8];
1652 let mut dst = [0u16; 2];
1653 for &b in buffer {
1654 src[0] = b;
1655 let (result, read, written) = self
1656 .decoder
1657 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1658 if written == 1 {
1659 let u = dst[0];
1660 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1661 || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1662 {
1663 self.pending_score = None; if self.prev == LatinCj::Cj {
1665 score += CJK_LATIN_ADJACENCY_PENALTY;
1666 }
1667 self.prev = LatinCj::AsciiLetter;
1668 } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1669 if let Some(pending) = self.pending_score {
1670 score += pending;
1671 self.pending_score = None;
1672 }
1673 match self.prev_byte {
1674 0xA4..=0xC6 => {
1675 score += self.maybe_set_as_pending(BIG5_SCORE_PER_LEVEL_1_HANZI);
1676 }
1678 _ => {
1679 score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
1680 }
1681 }
1682 if self.prev == LatinCj::AsciiLetter {
1683 score += CJK_LATIN_ADJACENCY_PENALTY;
1684 }
1685 self.prev = LatinCj::Cj;
1686 } else {
1687 match u {
1688 0x3000 | 0x3001 | 0x3002 | 0xFF08 | 0xFF09 | 0xFF01 | 0xFF0C | 0xFF1B | 0xFF1F => {
1698 if let Some(pending) = self.pending_score {
1699 score += pending;
1700 self.pending_score = None;
1701 }
1702 score += CJ_PUNCTUATION;
1706 }
1707 0..=0x7F => {
1708 self.pending_score = None; }
1710 _ => {
1711 if let Some(pending) = self.pending_score {
1712 score += pending;
1713 self.pending_score = None;
1714 }
1715 score += CJK_OTHER;
1716 }
1717 }
1718 self.prev = LatinCj::Other;
1719 }
1720 } else if written == 2 {
1721 if let Some(pending) = self.pending_score {
1722 score += pending;
1723 self.pending_score = None;
1724 }
1725 if dst[0] == 0xCA || dst[0] == 0xEA {
1726 score += CJK_OTHER;
1727 self.prev = LatinCj::Other;
1728 } else {
1729 debug_assert!(dst[0] >= 0xD480 && dst[0] < 0xD880);
1730 score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
1731 if self.prev == LatinCj::AsciiLetter {
1732 score += CJK_LATIN_ADJACENCY_PENALTY;
1733 }
1734 self.prev = LatinCj::Cj;
1735 }
1736 }
1737 match result {
1738 DecoderResult::InputEmpty => {
1739 assert_eq!(read, 1);
1740 }
1741 DecoderResult::Malformed(malformed_len, _) => {
1742 if self.prev_byte >= 0x81
1743 && self.prev_byte <= 0xFE
1744 && ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE))
1745 {
1746 if let Some(pending) = self.pending_score {
1751 score += pending;
1752 self.pending_score = None;
1753 }
1754 score += BIG5_PUA_PENALTY;
1755 if self.prev == LatinCj::AsciiLetter {
1757 score += CJK_LATIN_ADJACENCY_PENALTY;
1758 }
1759 self.prev = LatinCj::Cj;
1760 } else if (self.prev_byte == 0xA0
1761 || self.prev_byte == 0xFD
1762 || self.prev_byte == 0xFE)
1763 && (b < 0x80 || b == 0xFF)
1764 {
1765 self.pending_score = None; score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1769 if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1770 self.prev = LatinCj::AsciiLetter;
1771 } else if b == 0xFF {
1772 score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1773 self.prev = LatinCj::Other;
1774 } else {
1775 self.prev = LatinCj::Other;
1776 }
1777 } else if malformed_len == 1 && b == 0xFF {
1778 self.pending_score = None; score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1781 self.prev = LatinCj::Other;
1782 } else {
1783 return None;
1784 }
1785 }
1786 DecoderResult::OutputFull => {
1787 unreachable!();
1788 }
1789 }
1790 self.prev_byte = b;
1791 }
1792 if last {
1793 let (result, _, _) = self
1794 .decoder
1795 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1796 match result {
1797 DecoderResult::InputEmpty => {}
1798 DecoderResult::Malformed(_, _) => {
1799 return None;
1800 }
1801 DecoderResult::OutputFull => {
1802 unreachable!();
1803 }
1804 }
1805 }
1806 Some(score)
1807 }
1808}
1809
/// Scoring state for the EUC-KR candidate.
///
/// `feed` pushes input through `decoder` one byte at a time; the remaining
/// fields carry context from one byte (and one `feed` call) to the next.
struct EucKrCandidate {
    /// Decoder every input byte is run through.
    decoder: Decoder,
    /// The byte fed immediately before the current one.
    prev_byte: u8,
    /// Whether `prev_byte` was in `0xA1..=0xFE`.
    prev_was_euc_range: bool,
    /// Class of the previously scored character; drives the adjacency and
    /// hanja-after-hangul penalties.
    prev: LatinKorean,
    /// Number of consecutive hangul/hanja characters seen so far; reset by
    /// any other character. Runs longer than 5 earn
    /// `EUC_KR_LONG_WORD_PENALTY` per extra character.
    current_word_len: u64,
    /// Score withheld by `maybe_set_as_pending`. It is added to the total
    /// when the next character is non-ASCII and discarded when it is ASCII.
    pending_score: Option<i64>,
}
1818
1819impl EucKrCandidate {
1820 fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1821 assert!(self.pending_score.is_none());
1822 if self.prev == LatinKorean::Hangul || !more_problematic_lead(self.prev_byte) {
1823 s
1824 } else {
1825 self.pending_score = Some(s);
1826 0
1827 }
1828 }
1829
1830 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1831 let mut score = 0i64;
1832 let mut src = [0u8];
1833 let mut dst = [0u16; 2];
1834 for &b in buffer {
1835 let in_euc_range = b >= 0xA1 && b <= 0xFE;
1836 src[0] = b;
1837 let (result, read, written) = self
1838 .decoder
1839 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1840 if written > 0 {
1841 let u = dst[0];
1842 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1843 || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1844 {
1845 self.pending_score = None; match self.prev {
1847 LatinKorean::Hangul | LatinKorean::Hanja => {
1848 score += CJK_LATIN_ADJACENCY_PENALTY;
1849 }
1850 _ => {}
1851 }
1852 self.prev = LatinKorean::AsciiLetter;
1853 self.current_word_len = 0;
1854 } else if u >= 0xAC00 && u <= 0xD7A3 {
1855 if let Some(pending) = self.pending_score {
1856 score += pending;
1857 self.pending_score = None;
1858 }
1859 if self.prev_was_euc_range && in_euc_range {
1860 score += EUC_KR_SCORE_PER_EUC_HANGUL;
1861 score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_hangul);
1862 } else {
1863 score += self.maybe_set_as_pending(EUC_KR_SCORE_PER_NON_EUC_HANGUL);
1864 }
1865 if self.prev == LatinKorean::AsciiLetter {
1866 score += CJK_LATIN_ADJACENCY_PENALTY;
1867 }
1868 self.prev = LatinKorean::Hangul;
1869 self.current_word_len += 1;
1870 if self.current_word_len > 5 {
1871 score += EUC_KR_LONG_WORD_PENALTY;
1872 }
1873 } else if (u >= 0x4E00 && u < 0xAC00) || (u >= 0xF900 && u <= 0xFA0B) {
1874 if let Some(pending) = self.pending_score {
1875 score += pending;
1876 self.pending_score = None;
1877 }
1878 score += EUC_KR_SCORE_PER_HANJA;
1879 match self.prev {
1880 LatinKorean::AsciiLetter => {
1881 score += CJK_LATIN_ADJACENCY_PENALTY;
1882 }
1883 LatinKorean::Hangul => {
1884 score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY;
1885 }
1886 _ => {}
1887 }
1888 self.prev = LatinKorean::Hanja;
1889 self.current_word_len += 1;
1890 if self.current_word_len > 5 {
1891 score += EUC_KR_LONG_WORD_PENALTY;
1892 }
1893 } else {
1894 if u >= 0x80 {
1895 if let Some(pending) = self.pending_score {
1896 score += pending;
1897 self.pending_score = None;
1898 }
1899 score += CJK_OTHER;
1900 } else {
1901 self.pending_score = None; }
1903 self.prev = LatinKorean::Other;
1904 self.current_word_len = 0;
1905 }
1906 }
1907 match result {
1908 DecoderResult::InputEmpty => {
1909 assert_eq!(read, 1);
1910 }
1911 DecoderResult::Malformed(malformed_len, _) => {
1912 if (self.prev_byte == 0xC9 || self.prev_byte == 0xFE) && b >= 0xA1 && b <= 0xFE
1913 {
1914 if let Some(pending) = self.pending_score {
1915 score += pending;
1916 self.pending_score = None;
1917 }
1918 score += EUC_KR_PUA_PENALTY;
1920 match self.prev {
1922 LatinKorean::AsciiLetter => {
1923 score += CJK_LATIN_ADJACENCY_PENALTY;
1924 }
1925 LatinKorean::Hangul => {
1926 score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY;
1927 }
1928 _ => {}
1929 }
1930 self.prev = LatinKorean::Hanja;
1931 self.current_word_len += 1;
1932 if self.current_word_len > 5 {
1933 score += EUC_KR_LONG_WORD_PENALTY;
1934 }
1935 } else if (self.prev_byte == 0xA1
1936 || (self.prev_byte >= 0xA3 && self.prev_byte <= 0xA8)
1937 || (self.prev_byte >= 0xAA && self.prev_byte <= 0xAD))
1938 && (b >= 0x7B && b <= 0x7D)
1939 {
1940 if let Some(pending) = self.pending_score {
1941 score += pending;
1942 self.pending_score = None;
1943 }
1944 score += EUC_KR_MAC_KOREAN_PENALTY;
1946 self.prev = LatinKorean::Other;
1947 self.current_word_len = 0;
1948 } else if (self.prev_byte >= 0x81 && self.prev_byte <= 0x84)
1949 && (b <= 0x80 || b == 0xFF)
1950 {
1951 self.pending_score = None; score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
1955 if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1956 self.prev = LatinKorean::AsciiLetter;
1957 } else if b == 0x80 || b == 0xFF {
1958 score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
1959 self.prev = LatinKorean::Other;
1960 } else {
1961 self.prev = LatinKorean::Other;
1962 }
1963 self.current_word_len = 0;
1964 } else if malformed_len == 1 && (b == 0x80 || b == 0xFF) {
1965 self.pending_score = None; score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
1968 self.prev = LatinKorean::Other;
1969 self.current_word_len = 0;
1970 } else {
1971 return None;
1972 }
1973 }
1974 DecoderResult::OutputFull => {
1975 unreachable!();
1976 }
1977 }
1978 self.prev_was_euc_range = in_euc_range;
1979 self.prev_byte = b;
1980 }
1981 if last {
1982 let (result, _, _) = self
1983 .decoder
1984 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1985 match result {
1986 DecoderResult::InputEmpty => {}
1987 DecoderResult::Malformed(_, _) => {
1988 return None;
1989 }
1990 DecoderResult::OutputFull => {
1991 unreachable!();
1992 }
1993 }
1994 }
1995 Some(score)
1996 }
1997}
1998
/// The concrete scorer behind a `Candidate`, one variant per scorer type.
///
/// `InnerCandidate::feed` dispatches to the wrapped scorer. The first six
/// variants wrap scorers whose `feed` takes only the buffer; at end of
/// stream they are additionally fed a single space. The remaining variants
/// wrap scorers whose `feed` receives the `last` flag directly.
enum InnerCandidate {
    /// Wraps a `LatinCandidate`.
    Latin(LatinCandidate),
    /// Wraps a `NonLatinCasedCandidate`.
    NonLatinCased(NonLatinCasedCandidate),
    /// Wraps a `CaselessCandidate`.
    Caseless(CaselessCandidate),
    /// Wraps an `ArabicFrenchCandidate`.
    ArabicFrench(ArabicFrenchCandidate),
    /// Wraps a `LogicalCandidate`.
    Logical(LogicalCandidate),
    /// Wraps a `VisualCandidate`.
    Visual(VisualCandidate),
    /// Wraps a `Utf8Candidate`.
    Utf8(Utf8Candidate),
    /// Wraps an `Iso2022Candidate`.
    Iso2022(Iso2022Candidate),
    /// Wraps a `ShiftJisCandidate`.
    Shift(ShiftJisCandidate),
    /// Wraps an `EucJpCandidate`.
    EucJp(EucJpCandidate),
    /// Wraps an `EucKrCandidate`.
    EucKr(EucKrCandidate),
    /// Wraps a `Big5Candidate`.
    Big5(Big5Candidate),
    /// Wraps a `GbkCandidate`.
    Gbk(GbkCandidate),
}
2014
2015impl InnerCandidate {
2016 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
2017 match self {
2018 InnerCandidate::Latin(c) => {
2019 if let Some(new_score) = c.feed(buffer) {
2020 if last {
2021 if let Some(additional_score) = c.feed(b" ") {
2023 Some(new_score + additional_score)
2024 } else {
2025 None
2026 }
2027 } else {
2028 Some(new_score)
2029 }
2030 } else {
2031 None
2032 }
2033 }
2034 InnerCandidate::NonLatinCased(c) => {
2035 if let Some(new_score) = c.feed(buffer) {
2036 if last {
2037 if let Some(additional_score) = c.feed(b" ") {
2039 Some(new_score + additional_score)
2040 } else {
2041 None
2042 }
2043 } else {
2044 Some(new_score)
2045 }
2046 } else {
2047 None
2048 }
2049 }
2050 InnerCandidate::Caseless(c) => {
2051 if let Some(new_score) = c.feed(buffer) {
2052 if last {
2053 if let Some(additional_score) = c.feed(b" ") {
2055 Some(new_score + additional_score)
2056 } else {
2057 None
2058 }
2059 } else {
2060 Some(new_score)
2061 }
2062 } else {
2063 None
2064 }
2065 }
2066 InnerCandidate::ArabicFrench(c) => {
2067 if let Some(new_score) = c.feed(buffer) {
2068 if last {
2069 if let Some(additional_score) = c.feed(b" ") {
2071 Some(new_score + additional_score)
2072 } else {
2073 None
2074 }
2075 } else {
2076 Some(new_score)
2077 }
2078 } else {
2079 None
2080 }
2081 }
2082 InnerCandidate::Logical(c) => {
2083 if let Some(new_score) = c.feed(buffer) {
2084 if last {
2085 if let Some(additional_score) = c.feed(b" ") {
2087 Some(new_score + additional_score)
2088 } else {
2089 None
2090 }
2091 } else {
2092 Some(new_score)
2093 }
2094 } else {
2095 None
2096 }
2097 }
2098 InnerCandidate::Visual(c) => {
2099 if let Some(new_score) = c.feed(buffer) {
2100 if last {
2101 if let Some(additional_score) = c.feed(b" ") {
2103 Some(new_score + additional_score)
2104 } else {
2105 None
2106 }
2107 } else {
2108 Some(new_score)
2109 }
2110 } else {
2111 None
2112 }
2113 }
2114 InnerCandidate::Utf8(c) => c.feed(buffer, last),
2115 InnerCandidate::Iso2022(c) => c.feed(buffer, last),
2116 InnerCandidate::Shift(c) => c.feed(buffer, last),
2117 InnerCandidate::EucJp(c) => c.feed(buffer, last),
2118 InnerCandidate::EucKr(c) => c.feed(buffer, last),
2119 InnerCandidate::Big5(c) => c.feed(buffer, last),
2120 InnerCandidate::Gbk(c) => c.feed(buffer, last),
2121 }
2122 }
2123}
2124
2125fn encoding_for_tld(tld: Tld) -> usize {
2126 match tld {
2127 Tld::CentralWindows | Tld::CentralCyrillic => EncodingDetector::CENTRAL_WINDOWS_INDEX,
2128 Tld::Cyrillic => EncodingDetector::CYRILLIC_WINDOWS_INDEX,
2129 Tld::Generic | Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic | Tld::Eu => {
2130 EncodingDetector::WESTERN_INDEX
2131 }
2132 Tld::IcelandicFaroese => EncodingDetector::ICELANDIC_INDEX,
2133 Tld::Greek => EncodingDetector::GREEK_ISO_INDEX,
2134 Tld::TurkishAzeri => EncodingDetector::TURKISH_INDEX,
2135 Tld::Hebrew => EncodingDetector::LOGICAL_INDEX,
2136 Tld::Arabic => EncodingDetector::ARABIC_WINDOWS_INDEX,
2137 Tld::Baltic => EncodingDetector::BALTIC_WINDOWS_INDEX,
2138 Tld::Vietnamese => EncodingDetector::VIETNAMESE_INDEX,
2139 Tld::Thai => EncodingDetector::THAI_INDEX,
2140 Tld::Simplified | Tld::SimplifiedTraditional => EncodingDetector::GBK_INDEX,
2141 Tld::Traditional | Tld::TraditionalSimplified => EncodingDetector::BIG5_INDEX,
2142 Tld::Japanese => EncodingDetector::SHIFT_JIS_INDEX,
2143 Tld::Korean => EncodingDetector::EUC_KR_INDEX,
2144 Tld::CentralIso => EncodingDetector::CENTRAL_ISO_INDEX,
2145 }
2146}
2147
2148fn encoding_is_native_to_tld(tld: Tld, encoding: usize) -> bool {
2149 match tld {
2150 Tld::CentralWindows => encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX,
2151 Tld::Cyrillic => {
2152 encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2153 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2154 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2155 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2156 }
2157 Tld::Western => encoding == EncodingDetector::WESTERN_INDEX,
2158 Tld::Greek => {
2159 encoding == EncodingDetector::GREEK_WINDOWS_INDEX
2160 || encoding == EncodingDetector::GREEK_ISO_INDEX
2161 }
2162 Tld::TurkishAzeri => encoding == EncodingDetector::TURKISH_INDEX,
2163 Tld::Hebrew => encoding == EncodingDetector::LOGICAL_INDEX,
2164 Tld::Arabic => {
2165 encoding == EncodingDetector::ARABIC_WINDOWS_INDEX
2166 || encoding == EncodingDetector::ARABIC_ISO_INDEX
2167 }
2168 Tld::Baltic => {
2169 encoding == EncodingDetector::BALTIC_WINDOWS_INDEX
2170 || encoding == EncodingDetector::BALTIC_ISO13_INDEX
2171 || encoding == EncodingDetector::BALTIC_ISO4_INDEX
2172 }
2173 Tld::Vietnamese => encoding == EncodingDetector::VIETNAMESE_INDEX,
2174 Tld::Thai => encoding == EncodingDetector::THAI_INDEX,
2175 Tld::Simplified => encoding == EncodingDetector::GBK_INDEX,
2176 Tld::Traditional => encoding == EncodingDetector::BIG5_INDEX,
2177 Tld::Japanese => {
2178 encoding == EncodingDetector::SHIFT_JIS_INDEX
2179 || encoding == EncodingDetector::EUC_JP_INDEX
2180 }
2181 Tld::Korean => encoding == EncodingDetector::EUC_KR_INDEX,
2182 Tld::SimplifiedTraditional | Tld::TraditionalSimplified => {
2183 encoding == EncodingDetector::GBK_INDEX || encoding == EncodingDetector::BIG5_INDEX
2184 }
2185 Tld::CentralIso => encoding == EncodingDetector::CENTRAL_ISO_INDEX,
2186 Tld::IcelandicFaroese => encoding == EncodingDetector::ICELANDIC_INDEX,
2187 Tld::WesternCyrillic => {
2188 encoding == EncodingDetector::WESTERN_INDEX
2189 || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2190 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2191 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2192 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2193 }
2194 Tld::CentralCyrillic => {
2195 encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX
2196 || encoding == EncodingDetector::CENTRAL_ISO_INDEX
2197 || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2198 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2199 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2200 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2201 }
2202 Tld::WesternArabic => {
2203 encoding == EncodingDetector::WESTERN_INDEX
2204 || encoding == EncodingDetector::ARABIC_WINDOWS_INDEX
2205 || encoding == EncodingDetector::ARABIC_ISO_INDEX
2206 }
2207 Tld::Eu => {
2208 encoding == EncodingDetector::WESTERN_INDEX
2209 || encoding == EncodingDetector::ICELANDIC_INDEX
2210 || encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX
2211 || encoding == EncodingDetector::CENTRAL_ISO_INDEX
2212 || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2213 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2214 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2215 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2216 || encoding == EncodingDetector::GREEK_WINDOWS_INDEX
2217 || encoding == EncodingDetector::GREEK_ISO_INDEX
2218 || encoding == EncodingDetector::BALTIC_WINDOWS_INDEX
2219 || encoding == EncodingDetector::BALTIC_ISO13_INDEX
2220 || encoding == EncodingDetector::BALTIC_ISO4_INDEX
2221 }
2222 Tld::Generic => false,
2223 }
2224}
2225
2226fn score_adjustment(score: i64, encoding: usize, tld: Tld) -> i64 {
2227 if score < 1 {
2228 return 0;
2229 }
2230 let (divisor, constant) = match tld {
2232 Tld::Generic => {
2233 unreachable!();
2234 }
2235 Tld::CentralWindows | Tld::CentralIso => {
2236 match encoding {
2237 EncodingDetector::WESTERN_INDEX
2238 | EncodingDetector::ICELANDIC_INDEX
2239 | EncodingDetector::BALTIC_WINDOWS_INDEX
2240 | EncodingDetector::BALTIC_ISO4_INDEX
2241 | EncodingDetector::BALTIC_ISO13_INDEX
2242 | EncodingDetector::VIETNAMESE_INDEX
2243 | EncodingDetector::TURKISH_INDEX => {
2244 return score;
2246 }
2247 _ => (50, 60),
2248 }
2249 }
2250 Tld::Cyrillic => {
2251 match encoding {
2252 EncodingDetector::BIG5_INDEX
2253 | EncodingDetector::GBK_INDEX
2254 | EncodingDetector::EUC_JP_INDEX
2255 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2256 | EncodingDetector::CENTRAL_ISO_INDEX
2257 | EncodingDetector::GREEK_WINDOWS_INDEX
2258 | EncodingDetector::GREEK_ISO_INDEX
2259 | EncodingDetector::VISUAL_INDEX
2260 | EncodingDetector::LOGICAL_INDEX
2261 | EncodingDetector::BALTIC_WINDOWS_INDEX
2262 | EncodingDetector::BALTIC_ISO4_INDEX
2263 | EncodingDetector::BALTIC_ISO13_INDEX
2264 | EncodingDetector::TURKISH_INDEX => {
2265 return score;
2267 }
2268 _ => (50, 60),
2269 }
2270 }
2271 Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic => {
2272 match encoding {
2273 EncodingDetector::CENTRAL_WINDOWS_INDEX
2274 | EncodingDetector::CENTRAL_ISO_INDEX
2275 | EncodingDetector::BALTIC_WINDOWS_INDEX
2276 | EncodingDetector::BALTIC_ISO4_INDEX
2277 | EncodingDetector::BALTIC_ISO13_INDEX
2278 | EncodingDetector::TURKISH_INDEX
2279 | EncodingDetector::VIETNAMESE_INDEX => {
2280 return score;
2282 }
2283 _ => (50, 60),
2284 }
2285 }
2286 Tld::Greek => {
2287 match encoding {
2288 EncodingDetector::BIG5_INDEX
2289 | EncodingDetector::GBK_INDEX
2290 | EncodingDetector::EUC_JP_INDEX
2291 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2292 | EncodingDetector::CENTRAL_ISO_INDEX
2293 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2294 | EncodingDetector::CYRILLIC_ISO_INDEX
2295 | EncodingDetector::CYRILLIC_KOI_INDEX
2296 | EncodingDetector::CYRILLIC_IBM_INDEX
2297 | EncodingDetector::VISUAL_INDEX
2298 | EncodingDetector::LOGICAL_INDEX
2299 | EncodingDetector::BALTIC_WINDOWS_INDEX
2300 | EncodingDetector::BALTIC_ISO4_INDEX
2301 | EncodingDetector::BALTIC_ISO13_INDEX
2302 | EncodingDetector::TURKISH_INDEX => {
2303 return score;
2305 }
2306 _ => (50, 60),
2307 }
2308 }
2309 Tld::TurkishAzeri => {
2310 match encoding {
2311 EncodingDetector::CENTRAL_WINDOWS_INDEX
2312 | EncodingDetector::CENTRAL_ISO_INDEX
2313 | EncodingDetector::BALTIC_WINDOWS_INDEX
2314 | EncodingDetector::BALTIC_ISO4_INDEX
2315 | EncodingDetector::BALTIC_ISO13_INDEX
2316 | EncodingDetector::VIETNAMESE_INDEX
2317 | EncodingDetector::ICELANDIC_INDEX => {
2318 return score;
2320 }
2321 _ => (50, 60),
2322 }
2323 }
2324 Tld::Hebrew => {
2325 match encoding {
2326 EncodingDetector::CENTRAL_WINDOWS_INDEX
2327 | EncodingDetector::CENTRAL_ISO_INDEX
2328 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2329 | EncodingDetector::CYRILLIC_ISO_INDEX
2330 | EncodingDetector::CYRILLIC_KOI_INDEX
2331 | EncodingDetector::CYRILLIC_IBM_INDEX
2332 | EncodingDetector::GREEK_WINDOWS_INDEX
2333 | EncodingDetector::GREEK_ISO_INDEX
2334 | EncodingDetector::BALTIC_WINDOWS_INDEX
2335 | EncodingDetector::BALTIC_ISO4_INDEX
2336 | EncodingDetector::BALTIC_ISO13_INDEX
2337 | EncodingDetector::VIETNAMESE_INDEX
2338 | EncodingDetector::TURKISH_INDEX => {
2339 return score;
2341 }
2342 _ => (50, 60),
2343 }
2344 }
2345 Tld::Arabic => {
2346 match encoding {
2347 EncodingDetector::BIG5_INDEX
2348 | EncodingDetector::GBK_INDEX
2349 | EncodingDetector::EUC_JP_INDEX
2350 | EncodingDetector::EUC_KR_INDEX
2351 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2352 | EncodingDetector::CENTRAL_ISO_INDEX
2353 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2354 | EncodingDetector::CYRILLIC_ISO_INDEX
2355 | EncodingDetector::CYRILLIC_KOI_INDEX
2356 | EncodingDetector::CYRILLIC_IBM_INDEX
2357 | EncodingDetector::GREEK_WINDOWS_INDEX
2358 | EncodingDetector::GREEK_ISO_INDEX
2359 | EncodingDetector::VISUAL_INDEX
2360 | EncodingDetector::LOGICAL_INDEX
2361 | EncodingDetector::BALTIC_WINDOWS_INDEX
2362 | EncodingDetector::BALTIC_ISO4_INDEX
2363 | EncodingDetector::BALTIC_ISO13_INDEX
2364 | EncodingDetector::VIETNAMESE_INDEX
2365 | EncodingDetector::TURKISH_INDEX => {
2366 return score;
2368 }
2369 _ => (50, 60),
2370 }
2371 }
2372 Tld::Baltic => {
2373 match encoding {
2374 EncodingDetector::CENTRAL_WINDOWS_INDEX
2375 | EncodingDetector::CENTRAL_ISO_INDEX
2376 | EncodingDetector::ICELANDIC_INDEX
2377 | EncodingDetector::TURKISH_INDEX
2378 | EncodingDetector::VIETNAMESE_INDEX => {
2379 return score;
2381 }
2382 _ => (50, 60),
2383 }
2384 }
2385 Tld::Vietnamese => {
2386 match encoding {
2387 EncodingDetector::CENTRAL_WINDOWS_INDEX
2388 | EncodingDetector::CENTRAL_ISO_INDEX
2389 | EncodingDetector::BALTIC_WINDOWS_INDEX
2390 | EncodingDetector::BALTIC_ISO4_INDEX
2391 | EncodingDetector::BALTIC_ISO13_INDEX
2392 | EncodingDetector::TURKISH_INDEX
2393 | EncodingDetector::ICELANDIC_INDEX => {
2394 return score;
2396 }
2397 _ => (50, 60),
2398 }
2399 }
2400 Tld::Thai => {
2401 match encoding {
2402 EncodingDetector::BIG5_INDEX
2403 | EncodingDetector::GBK_INDEX
2404 | EncodingDetector::EUC_JP_INDEX
2405 | EncodingDetector::EUC_KR_INDEX
2406 | EncodingDetector::SHIFT_JIS_INDEX
2407 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2408 | EncodingDetector::CENTRAL_ISO_INDEX
2409 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2410 | EncodingDetector::CYRILLIC_ISO_INDEX
2411 | EncodingDetector::CYRILLIC_KOI_INDEX
2412 | EncodingDetector::CYRILLIC_IBM_INDEX
2413 | EncodingDetector::GREEK_WINDOWS_INDEX
2414 | EncodingDetector::GREEK_ISO_INDEX
2415 | EncodingDetector::ARABIC_WINDOWS_INDEX
2416 | EncodingDetector::ARABIC_ISO_INDEX
2417 | EncodingDetector::VISUAL_INDEX
2418 | EncodingDetector::LOGICAL_INDEX
2419 | EncodingDetector::BALTIC_WINDOWS_INDEX
2420 | EncodingDetector::BALTIC_ISO4_INDEX
2421 | EncodingDetector::BALTIC_ISO13_INDEX
2422 | EncodingDetector::TURKISH_INDEX => {
2423 return score;
2425 }
2426 _ => (50, 60),
2427 }
2428 }
2429 Tld::Simplified
2430 | Tld::Traditional
2431 | Tld::TraditionalSimplified
2432 | Tld::SimplifiedTraditional
2433 | Tld::Japanese
2434 | Tld::Korean => {
2435 return score;
2437 }
2438 Tld::IcelandicFaroese => {
2439 match encoding {
2440 EncodingDetector::CENTRAL_WINDOWS_INDEX
2441 | EncodingDetector::CENTRAL_ISO_INDEX
2442 | EncodingDetector::BALTIC_WINDOWS_INDEX
2443 | EncodingDetector::BALTIC_ISO4_INDEX
2444 | EncodingDetector::BALTIC_ISO13_INDEX
2445 | EncodingDetector::TURKISH_INDEX
2446 | EncodingDetector::VIETNAMESE_INDEX => {
2447 return score;
2449 }
2450 _ => (50, 60),
2451 }
2452 }
2453 Tld::CentralCyrillic => {
2454 match encoding {
2455 EncodingDetector::BIG5_INDEX
2456 | EncodingDetector::GBK_INDEX
2457 | EncodingDetector::EUC_JP_INDEX
2458 | EncodingDetector::GREEK_WINDOWS_INDEX
2459 | EncodingDetector::GREEK_ISO_INDEX
2460 | EncodingDetector::VISUAL_INDEX
2461 | EncodingDetector::LOGICAL_INDEX
2462 | EncodingDetector::BALTIC_WINDOWS_INDEX
2463 | EncodingDetector::BALTIC_ISO4_INDEX
2464 | EncodingDetector::BALTIC_ISO13_INDEX
2465 | EncodingDetector::TURKISH_INDEX => {
2466 return score;
2468 }
2469 _ => (50, 60),
2470 }
2471 }
2472 Tld::Eu => {
2473 match encoding {
2474 EncodingDetector::BIG5_INDEX
2475 | EncodingDetector::GBK_INDEX
2476 | EncodingDetector::EUC_JP_INDEX
2477 | EncodingDetector::TURKISH_INDEX
2478 | EncodingDetector::VIETNAMESE_INDEX => {
2479 return score;
2481 }
2482 _ => (50, 60),
2483 }
2484 }
2485 };
2486 (score / divisor) + constant
2487}
2488
cfg_if::cfg_if! {
    if #[cfg(feature = "multithreading")] {
        /// One detection candidate: encoding-specific scoring state plus the
        /// score accumulated so far.
        ///
        /// With the `multithreading` feature the struct is aligned to 64 bytes.
        /// NOTE(review): presumably this keeps candidates that are fed on
        /// different threads from sharing a cache line -- confirm.
        #[repr(align(64))] struct Candidate {
            // Encoding-specific state that turns input bytes into score deltas.
            inner: InnerCandidate,
            // Running total; becomes `None` once `inner` rejects the input.
            score: Option<i64>,
        }
    } else {
        /// One detection candidate: encoding-specific scoring state plus the
        /// score accumulated so far.
        struct Candidate {
            // Encoding-specific state that turns input bytes into score deltas.
            inner: InnerCandidate,
            // Running total; becomes `None` once `inner` rejects the input.
            score: Option<i64>,
        }
    }
}
2503
2504impl Candidate {
2505 fn feed(&mut self, buffer: &[u8], last: bool) {
2506 if let Some(old_score) = self.score {
2507 if let Some(new_score) = self.inner.feed(buffer, last) {
2508 self.score = Some(old_score + new_score);
2509 } else {
2510 self.score = None;
2511 }
2512 }
2513 }
2514
2515 #[cfg(feature = "multithreading")]
2516 fn qualified(&self) -> bool {
2517 !self.score.is_none()
2518 }
2519
2520 fn new_latin(data: &'static SingleByteData) -> Self {
2521 Candidate {
2522 inner: InnerCandidate::Latin(LatinCandidate::new(data)),
2523 score: Some(0),
2524 }
2525 }
2526
2527 fn new_non_latin_cased(data: &'static SingleByteData) -> Self {
2528 Candidate {
2529 inner: InnerCandidate::NonLatinCased(NonLatinCasedCandidate::new(data)),
2530 score: Some(0),
2531 }
2532 }
2533
2534 fn new_caseless(data: &'static SingleByteData) -> Self {
2535 Candidate {
2536 inner: InnerCandidate::Caseless(CaselessCandidate::new(data)),
2537 score: Some(0),
2538 }
2539 }
2540
2541 fn new_arabic_french(data: &'static SingleByteData) -> Self {
2542 Candidate {
2543 inner: InnerCandidate::ArabicFrench(ArabicFrenchCandidate::new(data)),
2544 score: Some(0),
2545 }
2546 }
2547
2548 fn new_logical(data: &'static SingleByteData) -> Self {
2549 Candidate {
2550 inner: InnerCandidate::Logical(LogicalCandidate::new(data)),
2551 score: Some(0),
2552 }
2553 }
2554
2555 fn new_visual(data: &'static SingleByteData) -> Self {
2556 Candidate {
2557 inner: InnerCandidate::Visual(VisualCandidate::new(data)),
2558 score: Some(0),
2559 }
2560 }
2561
2562 fn new_utf_8() -> Self {
2563 Candidate {
2564 inner: InnerCandidate::Utf8(Utf8Candidate {
2565 decoder: UTF_8.new_decoder_without_bom_handling(),
2566 }),
2567 score: Some(0),
2568 }
2569 }
2570
2571 fn new_iso_2022_jp() -> Self {
2572 Candidate {
2573 inner: InnerCandidate::Iso2022(Iso2022Candidate {
2574 decoder: ISO_2022_JP.new_decoder_without_bom_handling(),
2575 }),
2576 score: Some(0),
2577 }
2578 }
2579
2580 fn new_shift_jis() -> Self {
2581 Candidate {
2582 inner: InnerCandidate::Shift(ShiftJisCandidate {
2583 decoder: SHIFT_JIS.new_decoder_without_bom_handling(),
2584 half_width_katakana_seen: false,
2585 half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
2586 prev: LatinCj::Other,
2587 prev_byte: 0,
2588 pending_score: None,
2589 }),
2590 score: Some(0),
2591 }
2592 }
2593
2594 fn new_euc_jp() -> Self {
2595 Candidate {
2596 inner: InnerCandidate::EucJp(EucJpCandidate {
2597 decoder: EUC_JP.new_decoder_without_bom_handling(),
2598 non_ascii_seen: false,
2599 half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
2600 prev: LatinCj::Other,
2601 prev_byte: 0,
2602 prev_prev_byte: 0,
2603 }),
2604 score: Some(0),
2605 }
2606 }
2607
2608 fn new_euc_kr() -> Self {
2609 Candidate {
2610 inner: InnerCandidate::EucKr(EucKrCandidate {
2611 decoder: EUC_KR.new_decoder_without_bom_handling(),
2612 prev_byte: 0,
2613 prev_was_euc_range: false,
2614 prev: LatinKorean::Other,
2615 current_word_len: 0,
2616 pending_score: None,
2617 }),
2618 score: Some(0),
2619 }
2620 }
2621
2622 fn new_big5() -> Self {
2623 Candidate {
2624 inner: InnerCandidate::Big5(Big5Candidate {
2625 decoder: BIG5.new_decoder_without_bom_handling(),
2626 prev: LatinCj::Other,
2627 prev_byte: 0,
2628 pending_score: None,
2629 }),
2630 score: Some(0),
2631 }
2632 }
2633
2634 fn new_gbk() -> Self {
2635 Candidate {
2636 inner: InnerCandidate::Gbk(GbkCandidate {
2637 decoder: GBK.new_decoder_without_bom_handling(),
2638 prev: LatinCj::Other,
2639 prev_byte: 0,
2640 pending_score: None,
2641 }),
2642 score: Some(0),
2643 }
2644 }
2645
2646 fn score(&self, encoding: usize, tld: Tld, expectation_is_valid: bool) -> Option<i64> {
2647 match &self.inner {
2648 InnerCandidate::NonLatinCased(c) => {
2649 if c.longest_word < 2 {
2650 return None;
2651 }
2652 }
2653 InnerCandidate::Caseless(c) => {
2654 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2655 return None;
2656 }
2657 }
2658 InnerCandidate::ArabicFrench(c) => {
2659 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2660 return None;
2661 }
2662 }
2663 InnerCandidate::Logical(c) => {
2664 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2665 return None;
2666 }
2667 }
2668 InnerCandidate::Visual(c) => {
2669 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2670 return None;
2671 }
2672 }
2673 _ => {}
2674 }
2675 if tld == Tld::Generic {
2676 return self.score;
2677 }
2678 if let Some(score) = self.score {
2679 if encoding == encoding_for_tld(tld) {
2680 return Some(score + 1);
2681 }
2682 if encoding_is_native_to_tld(tld, encoding) {
2683 return Some(score);
2684 }
2685 if expectation_is_valid {
2686 return Some(score - score_adjustment(score, encoding, tld));
2687 }
2688 return Some(score);
2692 }
2693 None
2694 }
2695
2696 fn plausible_punctuation(&self) -> u64 {
2697 match &self.inner {
2698 InnerCandidate::Logical(c) => {
2699 return c.plausible_punctuation;
2700 }
2701 InnerCandidate::Visual(c) => {
2702 return c.plausible_punctuation;
2703 }
2704 _ => {
2705 unreachable!();
2706 }
2707 }
2708 }
2709
2710 fn encoding(&self) -> &'static Encoding {
2711 match &self.inner {
2712 InnerCandidate::Latin(c) => {
2713 return c.data.encoding;
2714 }
2715 InnerCandidate::NonLatinCased(c) => {
2716 return c.data.encoding;
2717 }
2718 InnerCandidate::Caseless(c) => {
2719 return c.data.encoding;
2720 }
2721 InnerCandidate::ArabicFrench(c) => {
2722 return c.data.encoding;
2723 }
2724 InnerCandidate::Logical(c) => {
2725 return c.data.encoding;
2726 }
2727 InnerCandidate::Visual(c) => {
2728 return c.data.encoding;
2729 }
2730 InnerCandidate::Shift(_) => {
2731 return SHIFT_JIS;
2732 }
2733 InnerCandidate::EucJp(_) => {
2734 return EUC_JP;
2735 }
2736 InnerCandidate::Big5(_) => {
2737 return BIG5;
2738 }
2739 InnerCandidate::EucKr(_) => {
2740 return EUC_KR;
2741 }
2742 InnerCandidate::Gbk(_) => {
2743 return GBK;
2744 }
2745 InnerCandidate::Utf8(_) => {
2746 return UTF_8;
2747 }
2748 InnerCandidate::Iso2022(_) => {
2749 return ISO_2022_JP;
2750 }
2751 }
2752 }
2753}
2754
2755cfg_if::cfg_if! {
2757 if #[cfg(target_feature = "sse2")] {
2758 fn count_non_ascii(buffer: &[u8]) -> u64 {
2759 let mut count = 0;
2760 let (prefix, simd, suffix) = unsafe { buffer.align_to::<__m128i>() };
2761 for &b in prefix {
2762 if b >= 0x80 {
2763 count += 1;
2764 }
2765 }
2766 for &s in simd {
2767 count += unsafe {_mm_movemask_epi8(s)}.count_ones() as u64;
2768 }
2769 for &b in suffix {
2770 if b >= 0x80 {
2771 count += 1;
2772 }
2773 }
2774 count
2775 }
2776 } else {
2777 fn count_non_ascii(buffer: &[u8]) -> u64 {
2778 let mut count = 0;
2779 for &b in buffer {
2780 if b >= 0x80 {
2781 count += 1;
2782 }
2783 }
2784 count
2785 }
2786 }
2787}
2788
/// Remembers up to the last two bytes that were pushed into it.
#[derive(Clone, Copy)]
enum BeforeNonAscii {
    /// Nothing has been pushed yet.
    None,
    /// Exactly one byte has been pushed.
    One([u8; 1]),
    /// The two most recently pushed bytes, oldest first.
    Two([u8; 2]),
}

impl BeforeNonAscii {
    /// Returns the remembered bytes (zero, one or two of them), oldest first.
    fn as_slice(&self) -> &[u8] {
        match self {
            BeforeNonAscii::None => &[],
            BeforeNonAscii::One(single) => &single[..],
            BeforeNonAscii::Two(pair) => &pair[..],
        }
    }

    /// Appends `buffer` to the remembered history, keeping only the last two
    /// bytes overall. An empty `buffer` leaves the state unchanged.
    fn push(&mut self, buffer: &[u8]) {
        let newest = match buffer.last() {
            Some(&byte) => byte,
            None => return,
        };
        // The byte preceding `newest` comes from the buffer itself when it is
        // long enough, otherwise from whatever was remembered before.
        let previous = if buffer.len() >= 2 {
            Some(buffer[buffer.len() - 2])
        } else {
            self.as_slice().last().cloned()
        };
        *self = match previous {
            Some(older) => BeforeNonAscii::Two([older, newest]),
            None => BeforeNonAscii::One([newest]),
        };
    }
}
2828
/// Streaming character-encoding detector.
///
/// Input is supplied with `feed` and a guess is obtained with `guess` or
/// `guess_assess`.
pub struct EncodingDetector {
    // One candidate per slot; the slots are addressed by the `*_INDEX`
    // associated constants and populated in that order by `new`.
    candidates: [Candidate; 27],
    // Number of bytes >= 0x80 that have been passed to the candidates.
    non_ascii_seen: u64,
    // Up to two trailing bytes of the input seen so far, kept while the input
    // contains neither a non-ASCII byte nor ESC.
    last_before_non_ascii: BeforeNonAscii,
    // Set once an ESC (0x1B) byte has been found in the ASCII prefix.
    esc_seen: bool,
    // Set once `feed` has been called with `last == true`.
    closed: bool,
}
2849
2850impl EncodingDetector {
2851 cfg_if::cfg_if! {
2852 if #[cfg(feature = "multithreading")] {
2853 fn feed_impl(&mut self, buffer: &[u8], last: bool) {
2854 if buffer.len() < 10 {
2855 self.candidates.iter_mut().for_each(|candidate| candidate.feed(buffer, last));
2856 self.non_ascii_seen += count_non_ascii(buffer);
2857 return;
2858 }
2859 let mut qualified = ArrayVec::<[_; 27]>::new();
2864 for candidate in self.candidates.iter_mut() {
2865 if candidate.qualified() {
2866 qualified.push(candidate);
2867 }
2868 }
2869 let (_, non_ascii) = rayon::join(|| qualified.par_iter_mut().for_each(|candidate| candidate.feed(buffer, last)),
2870 || count_non_ascii(buffer));
2871 self.non_ascii_seen += non_ascii;
2872 }
2873 } else {
2874 fn feed_impl(&mut self, buffer: &[u8], last: bool) {
2875 self.candidates.iter_mut().for_each(|candidate| candidate.feed(buffer, last));
2876 self.non_ascii_seen += count_non_ascii(buffer);
2877 }
2878 }
2879 }
2880
2881 pub fn feed(&mut self, buffer: &[u8], last: bool) -> bool {
2908 assert!(
2909 !self.closed,
2910 "Must not feed again after feeding with last equaling true."
2911 );
2912 if last {
2913 self.closed = true;
2914 }
2915 let start = if self.non_ascii_seen == 0 && !self.esc_seen {
2916 let up_to = Encoding::ascii_valid_up_to(buffer);
2917 let start = if let Some(escape) = memchr::memchr(0x1B, &buffer[..up_to]) {
2918 self.esc_seen = true;
2919 escape
2920 } else {
2921 up_to
2922 };
2923 if start == buffer.len() {
2924 self.last_before_non_ascii.push(buffer);
2925 return self.non_ascii_seen != 0;
2926 }
2927 if start == 0 || start == 1 {
2928 let last_before = self.last_before_non_ascii;
2929 self.last_before_non_ascii = BeforeNonAscii::None;
2930 self.feed_impl(last_before.as_slice(), false);
2931 0
2932 } else {
2933 start - 2
2934 }
2935 } else {
2936 0
2937 };
2938 self.feed_impl(&buffer[start..], last);
2939 self.non_ascii_seen != 0
2940 }
2941
2942 pub fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static Encoding {
2971 self.guess_assess(tld, allow_utf8).0
2972 }
2973
2974 pub fn guess_assess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> (&'static Encoding, bool) {
2979 let mut tld_type = tld.map_or(Tld::Generic, |tld| {
2980 assert!(!contains_upper_case_period_or_non_ascii(tld));
2981 classify_tld(tld)
2982 });
2983
2984 if self.non_ascii_seen == 0
2985 && self.esc_seen
2986 && self.candidates[Self::ISO_2022_JP_INDEX].score.is_some()
2987 {
2988 return (ISO_2022_JP, true);
2989 }
2990
2991 if self.candidates[Self::UTF_8_INDEX].score.is_some() {
2992 if allow_utf8 {
2993 return (UTF_8, true);
2994 }
2995 return (self.candidates[encoding_for_tld(tld_type)].encoding(), true);
3000 }
3001
3002 let mut encoding = self.candidates[encoding_for_tld(tld_type)].encoding();
3003 let mut max = 0i64;
3004 let mut expectation_is_valid = false;
3005 if tld_type != Tld::Generic {
3006 for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
3007 if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() {
3008 expectation_is_valid = true;
3009 break;
3010 }
3011 }
3012 }
3013 if !expectation_is_valid {
3014 match tld_type {
3016 Tld::Simplified => {
3017 if self.candidates[Self::BIG5_INDEX].score.is_some() {
3018 tld_type = Tld::Traditional;
3019 expectation_is_valid = true;
3020 }
3021 }
3022 Tld::Traditional => {
3023 if self.candidates[Self::GBK_INDEX].score.is_some() {
3024 tld_type = Tld::Simplified;
3025 expectation_is_valid = true;
3026 }
3027 }
3028 Tld::CentralWindows => {
3029 if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() {
3030 tld_type = Tld::CentralIso;
3031 expectation_is_valid = true;
3032 }
3033 }
3034 Tld::CentralIso => {
3035 if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() {
3036 tld_type = Tld::CentralWindows;
3037 expectation_is_valid = true;
3038 }
3039 }
3040 _ => {}
3041 }
3042 }
3043 for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
3044 if let Some(score) = candidate.score(i, tld_type, expectation_is_valid) {
3045 if score > max {
3046 max = score;
3047 encoding = candidate.encoding();
3048 }
3049 }
3050 }
3051 let visual = &self.candidates[Self::VISUAL_INDEX];
3052 if let Some(visual_score) = visual.score(Self::VISUAL_INDEX, tld_type, expectation_is_valid)
3053 {
3054 if (visual_score > max || encoding == WINDOWS_1255)
3055 && visual.plausible_punctuation()
3056 > self.candidates[Self::LOGICAL_INDEX].plausible_punctuation()
3057 {
3058 encoding = ISO_8859_8;
3060 }
3061 }
3062 (encoding, max >= 0)
3063 }
3064
3065 #[cfg(feature = "testing-only-no-semver-guarantees-do-not-use")]
3067 pub fn find_score(&self, encoding: &'static Encoding) -> Option<i64> {
3068 let mut tld_type = Tld::Generic;
3069 let mut expectation_is_valid = false;
3070 if tld_type != Tld::Generic {
3071 for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
3072 if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() {
3073 expectation_is_valid = true;
3074 break;
3075 }
3076 }
3077 }
3078 if !expectation_is_valid {
3079 match tld_type {
3081 Tld::Simplified => {
3082 if self.candidates[Self::BIG5_INDEX].score.is_some() {
3083 tld_type = Tld::Traditional;
3084 expectation_is_valid = true;
3085 }
3086 }
3087 Tld::Traditional => {
3088 if self.candidates[Self::GBK_INDEX].score.is_some() {
3089 tld_type = Tld::Simplified;
3090 expectation_is_valid = true;
3091 }
3092 }
3093 Tld::CentralWindows => {
3094 if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() {
3095 tld_type = Tld::CentralIso;
3096 expectation_is_valid = true;
3097 }
3098 }
3099 Tld::CentralIso => {
3100 if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() {
3101 tld_type = Tld::CentralWindows;
3102 expectation_is_valid = true;
3103 }
3104 }
3105 _ => {}
3106 }
3107 }
3108 for (i, candidate) in self.candidates.iter().enumerate() {
3109 if encoding == candidate.encoding() {
3110 return candidate.score(i, tld_type, expectation_is_valid);
3111 }
3112 }
3113 Some(0)
3114 }
3115
    // Indices into `candidates`. They must match the order in which `new`
    // populates the array.

    /// Index of the first candidate that takes part in the ordinary score
    /// comparison; candidates below it are handled specially.
    const FIRST_NORMAL: usize = 3;

    /// Index of the UTF-8 candidate.
    const UTF_8_INDEX: usize = 0;

    /// Index of the ISO-2022-JP candidate.
    const ISO_2022_JP_INDEX: usize = 1;

    /// Index of the visual candidate (ISO-8859-8 data).
    const VISUAL_INDEX: usize = 2;

    /// Index of the GBK candidate.
    const GBK_INDEX: usize = 3;

    /// Index of the EUC-JP candidate.
    const EUC_JP_INDEX: usize = 4;

    /// Index of the EUC-KR candidate.
    const EUC_KR_INDEX: usize = 5;

    /// Index of the Shift_JIS candidate.
    const SHIFT_JIS_INDEX: usize = 6;

    /// Index of the Big5 candidate.
    const BIG5_INDEX: usize = 7;

    /// Index of the windows-1252 candidate.
    const WESTERN_INDEX: usize = 8;

    /// Index of the windows-1251 candidate.
    const CYRILLIC_WINDOWS_INDEX: usize = 9;

    /// Index of the windows-1250 candidate.
    const CENTRAL_WINDOWS_INDEX: usize = 10;

    /// Index of the ISO-8859-2 candidate.
    const CENTRAL_ISO_INDEX: usize = 11;

    /// Index of the windows-1256 candidate.
    const ARABIC_WINDOWS_INDEX: usize = 12;

    /// Index of the candidate built from the Icelandic windows-1252 data.
    const ICELANDIC_INDEX: usize = 13;

    /// Index of the windows-1254 candidate.
    const TURKISH_INDEX: usize = 14;

    /// Index of the windows-874 candidate.
    const THAI_INDEX: usize = 15;

    /// Index of the logical candidate (windows-1255 data).
    const LOGICAL_INDEX: usize = 16;

    /// Index of the windows-1253 candidate.
    const GREEK_WINDOWS_INDEX: usize = 17;

    /// Index of the ISO-8859-7 candidate.
    const GREEK_ISO_INDEX: usize = 18;

    /// Index of the windows-1257 candidate.
    const BALTIC_WINDOWS_INDEX: usize = 19;

    /// Index of the ISO-8859-13 candidate.
    const BALTIC_ISO13_INDEX: usize = 20;

    /// Index of the KOI8-U candidate.
    const CYRILLIC_KOI_INDEX: usize = 21;

    /// Index of the IBM866 candidate.
    const CYRILLIC_IBM_INDEX: usize = 22;

    /// Index of the ISO-8859-6 candidate.
    const ARABIC_ISO_INDEX: usize = 23;

    /// Index of the windows-1258 candidate.
    const VIETNAMESE_INDEX: usize = 24;

    /// Index of the ISO-8859-4 candidate.
    const BALTIC_ISO4_INDEX: usize = 25;

    /// Index of the ISO-8859-5 candidate.
    const CYRILLIC_ISO_INDEX: usize = 26;
3171
3172 pub fn new() -> Self {
3174 EncodingDetector {
3175 candidates: [
3176 Candidate::new_utf_8(), Candidate::new_iso_2022_jp(), Candidate::new_visual(&SINGLE_BYTE_DATA[ISO_8859_8_INDEX]), Candidate::new_gbk(), Candidate::new_euc_jp(), Candidate::new_euc_kr(), Candidate::new_shift_jis(), Candidate::new_big5(), Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_INDEX]), Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1251_INDEX]), Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1250_INDEX]), Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_2_INDEX]), Candidate::new_arabic_french(&SINGLE_BYTE_DATA[WINDOWS_1256_INDEX]), Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_ICELANDIC_INDEX]), Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1254_INDEX]), Candidate::new_caseless(&SINGLE_BYTE_DATA[WINDOWS_874_INDEX]), Candidate::new_logical(&SINGLE_BYTE_DATA[WINDOWS_1255_INDEX]), Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1253_INDEX]), Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_7_INDEX]), Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1257_INDEX]), Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_13_INDEX]), Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[KOI8_U_INDEX]), Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[IBM866_INDEX]), Candidate::new_caseless(&SINGLE_BYTE_DATA[ISO_8859_6_INDEX]), Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1258_INDEX]), Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_4_INDEX]), Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_5_INDEX]), ],
3204 non_ascii_seen: 0,
3205 last_before_non_ascii: BeforeNonAscii::None,
3206 esc_seen: false,
3207 closed: false,
3208 }
3209 }
3210}
3211
// Unit tests for `EncodingDetector`. Every test guesses without a TLD and
// with UTF-8 disallowed.
#[cfg(test)]
mod tests {
    use super::*;
    use detone::IterDecomposeVietnamese;
    use encoding_rs::IBM866;
    use encoding_rs::ISO_8859_2;
    use encoding_rs::ISO_8859_4;
    use encoding_rs::ISO_8859_5;
    use encoding_rs::ISO_8859_6;
    use encoding_rs::ISO_8859_7;
    use encoding_rs::KOI8_U;
    use encoding_rs::WINDOWS_1250;
    use encoding_rs::WINDOWS_1251;
    use encoding_rs::WINDOWS_1252;
    use encoding_rs::WINDOWS_1253;
    use encoding_rs::WINDOWS_1254;
    use encoding_rs::WINDOWS_1256;
    use encoding_rs::WINDOWS_1257;
    use encoding_rs::WINDOWS_1258;
    use encoding_rs::WINDOWS_874;

    // Feeds `bytes` in a single call, prints the text decoded with the
    // guessed encoding (useful when a test fails) and asserts that the guess
    // equals `encoding`.
    fn check_bytes(bytes: &[u8], encoding: &'static Encoding) {
        let mut det = EncodingDetector::new();
        det.feed(bytes, true);
        let enc = det.guess(None, false);
        let (decoded, _) = enc.decode_without_bom_handling(bytes);
        println!("{:?}", decoded);
        assert_eq!(enc, encoding);
    }

    // Encodes `input` into `encoding` and checks that the detector guesses
    // that same encoding. For windows-1258 the input is first passed through
    // `decompose_vietnamese_tones` before encoding.
    fn check(input: &str, encoding: &'static Encoding) {
        let orthographic;
        let (bytes, _, _) = if encoding == WINDOWS_1258 {
            orthographic = input
                .chars()
                .decompose_vietnamese_tones(true)
                .collect::<String>();
            encoding.encode(&orthographic)
        } else {
            encoding.encode(input)
        };
        check_bytes(&bytes, encoding);
    }

    #[test]
    fn test_i_apostrophe() {
        let mut det = EncodingDetector::new();
        det.feed(b"I\x92", true);
        let enc = det.guess(None, false);
        assert_eq!(enc, WINDOWS_1252);
    }

    // The streaming tests below feed the same kind of input (ASCII, then the
    // byte 0xBA, then a digit) split across `feed` calls in different ways, so
    // that the ASCII context before the first non-ASCII byte arrives in
    // earlier buffers.

    #[test]
    fn test_streaming_numero_one_by_one() {
        let mut det = EncodingDetector::new();
        det.feed(b"n", false);
        det.feed(b".", false);
        det.feed(b"\xBA", false);
        det.feed(b"1", true);
        let enc = det.guess(None, false);
        assert_eq!(enc, WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_two_together() {
        let mut det = EncodingDetector::new();
        det.feed(b"n.", false);
        det.feed(b"\xBA", false);
        det.feed(b"1", true);
        let enc = det.guess(None, false);
        assert_eq!(enc, WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_by_one_extra_before() {
        let mut det = EncodingDetector::new();
        det.feed(b" n", false);
        det.feed(b".", false);
        det.feed(b"\xBA", false);
        det.feed(b"1", true);
        let enc = det.guess(None, false);
        assert_eq!(enc, WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_before() {
        let mut det = EncodingDetector::new();
        det.feed(b"n", false);
        det.feed(b".\xBA", false);
        det.feed(b"1", true);
        let enc = det.guess(None, false);
        assert_eq!(enc, WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_longer_first_buffer() {
        let mut det = EncodingDetector::new();
        det.feed(b"rrn.", false);
        det.feed(b"\xBA", false);
        det.feed(b"1", true);
        let enc = det.guess(None, false);
        assert_eq!(enc, WINDOWS_1252);
    }

    // Empty input: `feed` reports that no non-ASCII was seen and the guess is
    // windows-1252.
    #[test]
    fn test_empty() {
        let mut det = EncodingDetector::new();
        let seen_non_ascii = det.feed(b"", true);
        let enc = det.guess(None, false);
        assert_eq!(enc, WINDOWS_1252);
        assert!(!seen_non_ascii);
    }

    // Round-trip tests: natural-language text is encoded into the expected
    // encoding and the detector must guess that encoding back.

    #[test]
    fn test_fi() {
        check("Ääni", WINDOWS_1252);
    }

    #[test]
    fn test_fi_bis() {
        check("Tämä", WINDOWS_1252);
    }

    #[test]
    fn test_pt() {
        check(
            "Este é um teste de codificação de caracteres.",
            WINDOWS_1252,
        );
    }

    #[test]
    fn test_is() {
        check("Þetta er kóðunarpróf á staf. Fyrir sum tungumál sem nota latneska stafi þurfum við meira inntak til að taka ákvörðunina.", WINDOWS_1252);
    }

    #[test]
    fn test_ru_short() {
        check("Русский", WINDOWS_1251);
    }

    #[test]
    fn test_ru() {
        check("Это тест кодировки символов.", WINDOWS_1251);
    }

    #[test]
    fn test_ru_iso() {
        check("Это тест кодировки символов.", ISO_8859_5);
    }

    #[test]
    fn test_ru_ibm() {
        check("Это тест кодировки символов.", IBM866);
    }

    #[test]
    fn test_ru_koi() {
        check("Это тест кодировки символов.", KOI8_U);
    }

    #[test]
    fn test_uk() {
        check("Це тест на кодування символів.", WINDOWS_1251);
    }

    #[test]
    fn test_uk_koi() {
        check("Це тест на кодування символів.", KOI8_U);
    }

    #[test]
    fn test_el_short() {
        check("Ελληνικά", WINDOWS_1253);
    }

    #[test]
    fn test_el() {
        check(
            "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης",
            WINDOWS_1253,
        );
    }

    #[test]
    fn test_el_iso() {
        check(
            "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης",
            ISO_8859_7,
        );
    }

    #[test]
    fn test_de() {
        check("Straße", WINDOWS_1252);
    }

    // Raw bytes: the ASCII letters "Don", the byte 0xB4 (180), "t" and a
    // space.
    #[test]
    fn test_en_windows1252() {
        check_bytes(&[68, 111, 110, 180, 116, 32], WINDOWS_1252);
    }

    #[test]
    fn test_he() {
        check("\u{5E2}\u{5D1}\u{5E8}\u{5D9}\u{5EA}", WINDOWS_1255);
    }

    #[test]
    fn test_2022() {
        check("日本語", ISO_2022_JP);
    }

    #[test]
    fn test_th() {
        check("นี่คือการทดสอบการเข้ารหัสอักขระ", WINDOWS_874);
    }

    #[test]
    fn test_vi() {
        check("Đây là một thử nghiệm mã hóa ký tự.", WINDOWS_1258);
    }

    #[test]
    fn test_tr() {
        check("Bu bir karakter kodlama testidir. Latince karakterleri kullanan bazı dillerde karar vermek için daha fazla girdiye ihtiyacımız var.", WINDOWS_1254);
    }

    #[test]
    fn test_simplified() {
        check("这是一个字符编码测试。", GBK);
    }

    #[test]
    fn test_traditional() {
        check("這是一個字符編碼測試。", BIG5);
    }

    #[test]
    fn test_ko() {
        check("이것은 문자 인코딩 테스트입니다.", EUC_KR);
    }

    #[test]
    fn test_shift() {
        check("これは文字実験です。", SHIFT_JIS);
    }

    #[test]
    fn test_euc() {
        check("これは文字実験です。", EUC_JP);
    }

    #[test]
    fn test_ar() {
        check("هذا هو اختبار ترميز الأحرف.", WINDOWS_1256);
    }

    #[test]
    fn test_ar_iso() {
        check("هذا هو اختبار ترميز الأحرف.", ISO_8859_6);
    }

    #[test]
    fn test_fa() {
        check("این یک تست رمزگذاری کاراکتر است.", WINDOWS_1256);
    }

    // Expected to be detected as ISO-8859-8 rather than windows-1255.
    #[test]
    fn test_visual() {
        check(".םיוות דודיק ןחבמ והז", ISO_8859_8);
    }

    #[test]
    fn test_yi() {
        check("דאָס איז אַ טעסט פֿאַר קאָדירונג פון כאַראַקטער.", WINDOWS_1255);
    }

    // Very short Latin-script inputs.

    #[test]
    fn test_it() {
        check("è", WINDOWS_1252);
    }

    #[test]
    fn test_en() {
        check("isn’t", WINDOWS_1252);
    }

    #[test]
    fn test_en_bis() {
        check("Rock ’n Roll", WINDOWS_1252);
    }

    #[test]
    fn test_ca() {
        check("Codificació de caràcters", WINDOWS_1252);
    }

    #[test]
    fn test_et() {
        check("või", WINDOWS_1252);
    }

    // The same Polish text must be told apart as ISO-8859-2 vs. windows-1250
    // depending on how it was encoded.

    #[test]
    fn test_pl_iso() {
        check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", ISO_8859_2);
    }

    #[test]
    fn test_pl() {
        check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", WINDOWS_1250);
    }

    #[test]
    fn test_lt() {
        check("Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.", WINDOWS_1257);
    }

    #[test]
    fn test_lv() {
        check("Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", WINDOWS_1257);
    }

    #[test]
    fn test_lv_iso_8859_4() {
        check("Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", ISO_8859_4);
    }

    // Inputs whose only non-ASCII content is a no-break space, a symbol or an
    // ordinal indicator next to ASCII; all are expected to come out as
    // windows-1252.

    #[test]
    fn test_a0() {
        check("\u{A0}\u{A0} \u{A0}", WINDOWS_1252);
    }

    #[test]
    fn test_a0a0() {
        check("\u{A0}\u{A0}", WINDOWS_1252);
    }

    #[test]
    fn test_space_copyright_space() {
        check(" © ", WINDOWS_1252);
    }

    #[test]
    fn test_space_masculine_space() {
        check(" º ", WINDOWS_1252);
    }

    #[test]
    fn test_space_feminine_space() {
        check(" ª ", WINDOWS_1252);
    }

    #[test]
    fn test_period_masculine_space() {
        check(".º ", WINDOWS_1252);
    }

    #[test]
    fn test_period_feminine_space() {
        check(".ª ", WINDOWS_1252);
    }

    #[test]
    fn test_maria() {
        check(" Mª ", WINDOWS_1252);
    }

    #[test]
    fn test_dona() {
        check(" Dª ", WINDOWS_1252);
    }

    #[test]
    fn test_nuestra() {
        check(" Nª ", WINDOWS_1252);
    }

    #[test]
    fn test_senora() {
        check(" Sª ", WINDOWS_1252);
    }

    #[test]
    fn test_digit_feminine() {
        check(" 42ª ", WINDOWS_1252);
    }

    #[test]
    fn test_digit_masculine() {
        check(" 42º ", WINDOWS_1252);
    }

    #[test]
    fn test_roman_feminine() {
        check(" XIVª ", WINDOWS_1252);
    }

    #[test]
    fn test_roman_masculine() {
        check(" XIVº ", WINDOWS_1252);
    }

    #[test]
    fn test_numero_uno() {
        check("Nº1", WINDOWS_1252);
    }

    #[test]
    fn test_numero() {
        check("Nº", WINDOWS_1252);
    }

    #[test]
    fn test_euro() {
        check(" €9", WINDOWS_1252);
    }

    // NOTE(review): per the test name this is meant to exercise half-width
    // katakana; confirm the literal below still contains half-width forms.
    #[test]
    fn test_shift_jis_half_width_katakana() {
        check("ハードウェアハードウェアハードウェアハードウェアハードウェア", SHIFT_JIS);
    }

    // The remaining tests use hand-built byte sequences: one unusual byte
    // sequence combined with many repetitions of a common sequence. The
    // `test_not_*` variants expect the unusual sequence to rule the named
    // encoding out, so that a different encoding wins.

    #[test]
    fn test_big5_pua() {
        let mut v = Vec::new();
        for _ in 0..40 {
            v.extend_from_slice(b"\xA4\x40");
        }
        v.extend_from_slice(b"\x81\x40\xA4\x40");
        check_bytes(&v, BIG5);
    }

    #[test]
    fn test_big5_single_byte_a0() {
        let mut v = Vec::new();
        for _ in 0..80 {
            v.extend_from_slice(b"\xA4\x40");
        }
        v.extend_from_slice(b"\x81\x40\xA0 ");
        check_bytes(&v, BIG5);
    }

    #[test]
    fn test_big5_single_byte_ff() {
        let mut v = Vec::new();
        for _ in 0..80 {
            v.extend_from_slice(b"\xA4\x40");
        }
        v.extend_from_slice(b"\x81\x40\xFF ");
        check_bytes(&v, BIG5);
    }

    #[test]
    fn test_not_big5() {
        let mut v = Vec::new();
        for _ in 0..40 {
            v.extend_from_slice(b"\xA4\x40");
        }
        v.extend_from_slice(b"\x81\x40\xA0\xA0");
        check_bytes(&v, IBM866);
    }

    #[test]
    fn test_euc_kr_pua() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xC9\xA1\xB0\xA1 ");
        for _ in 0..40 {
            v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. ");
        }
        check_bytes(&v, EUC_KR);
    }

    #[test]
    fn test_euc_kr_pua_bis() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xFE\xA1\xB0\xA1 ");
        for _ in 0..40 {
            v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. ");
        }
        check_bytes(&v, EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_ff() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xFF ");
        for _ in 0..40 {
            v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. ");
        }
        check_bytes(&v, EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_81() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\x81 ");
        for _ in 0..40 {
            v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. ");
        }
        check_bytes(&v, EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_84() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\x84 ");
        for _ in 0..40 {
            v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. ");
        }
        check_bytes(&v, EUC_KR);
    }

    #[test]
    fn test_not_euc_kr() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xC9\xA0\xB0\xA1 ");
        for _ in 0..40 {
            v.extend_from_slice(b"\xC5\xD7\xBD\xBA\xC6\xAE. ");
        }
        check_bytes(&v, GBK);
    }

    #[test]
    fn test_shift_jis_x0213() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\x87\xE5");
        for _ in 0..40 {
            v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2");
        }
        check_bytes(&v, SHIFT_JIS);
    }

    #[test]
    fn test_shift_jis_single_byte_fd() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xFD");
        for _ in 0..40 {
            v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2");
        }
        check_bytes(&v, SHIFT_JIS);
    }

    #[test]
    fn test_not_shift_jis() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\x84\xE0");
        for _ in 0..40 {
            v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2");
        }
        check_bytes(&v, GBK);
    }

    #[test]
    fn test_not_shift_jis_bis() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\x87\x7D");
        for _ in 0..40 {
            v.extend_from_slice(b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2");
        }
        check_bytes(&v, GBK);
    }

    #[test]
    fn test_euc_jp_x0213() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xAD\xBF");
        for _ in 0..80 {
            v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4");
        }
        check_bytes(&v, EUC_JP);
    }

    #[test]
    fn test_euc_jp_x0213_other_plane() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\x8F\xFE\xF6");
        for _ in 0..80 {
            v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4");
        }
        check_bytes(&v, EUC_JP);
    }

    #[test]
    fn test_not_euc_jp() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\x8F\xFE\xF7");
        for _ in 0..80 {
            v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4");
        }
        check_bytes(&v, WINDOWS_1252);
    }

    #[test]
    fn test_not_euc_jp_bis() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xA8\xDF");
        for _ in 0..80 {
            v.extend_from_slice(b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4");
        }
        check_bytes(&v, BIG5);
    }

    #[test]
    fn test_gbk_single_byte_ff() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xFF");
        for _ in 0..80 {
            v.extend_from_slice(b"\xB5\xC4");
        }
        check_bytes(&v, GBK);
    }

    #[test]
    fn test_gbk_single_byte_a0() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xA0 ");
        for _ in 0..80 {
            v.extend_from_slice(b"\xB5\xC4");
        }
        check_bytes(&v, GBK);
    }

    #[test]
    fn test_gbk_single_byte_fe() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xFE ");
        for _ in 0..80 {
            v.extend_from_slice(b"\xB5\xC4");
        }
        check_bytes(&v, GBK);
    }

    #[test]
    fn test_not_gbk_single_byte_fc() {
        let mut v = Vec::new();
        v.extend_from_slice(b"\xFC ");
        for _ in 0..80 {
            v.extend_from_slice(b"\xB5\xC4");
        }
        check_bytes(&v, ISO_8859_5);
    }
}