Skip to main content

chardetng/
lib.rs

1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! `chardetng` is a character encoding detector for legacy Web content.
11//!
12//! It is optimized for binary size in applications that already depend
13//! on `encoding_rs` for other reasons.
14
15#![no_std]
16
17#[cfg(feature = "multithreading")]
18use rayon::prelude::*;
19
20#[cfg(feature = "multithreading")]
21use arrayvec::ArrayVec;
22
23#[cfg(all(target_arch = "x86", target_feature = "sse2"))]
24use core::arch::x86::__m128i;
25#[cfg(all(target_arch = "x86", target_feature = "sse2"))]
26use core::arch::x86::_mm_movemask_epi8;
27
28#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
29use core::arch::x86_64::__m128i;
30#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
31use core::arch::x86_64::_mm_movemask_epi8;
32
33use encoding_rs::Decoder;
34use encoding_rs::DecoderResult;
35use encoding_rs::Encoding;
36use encoding_rs::BIG5;
37use encoding_rs::EUC_JP;
38use encoding_rs::EUC_KR;
39use encoding_rs::GBK;
40use encoding_rs::ISO_2022_JP;
41use encoding_rs::ISO_8859_8;
42use encoding_rs::SHIFT_JIS;
43use encoding_rs::UTF_8;
44use encoding_rs::WINDOWS_1255;
45
46mod data;
47mod tld;
48use data::*;
49use tld::classify_tld;
50use tld::Tld;
51
/// Added to a candidate's score when a Latin letter and a non-Latin
/// alphabetic character are adjacent (in either order) and the pair is
/// not an ASCII pair.
const LATIN_ADJACENCY_PENALTY: i64 = -50;

/// Subtracted back out (i.e. undone) by the windows-1252 ordinal state
/// machine when an ordinal indicator directly follows a letter in one of
/// the recognized sequences.
// NOTE(review): the place where this penalty is originally applied is not
// visible in this section (presumably the pairwise score data) -- confirm.
const IMPLAUSIBILITY_PENALTY: i64 = -220;

/// Added by the windows-1252 ordinal state machine when one of the
/// recognized ordinal-indicator sequences is completed.
const ORDINAL_BONUS: i64 = 300;

/// Must match the ISO-8859-2 score for " Š ". Note: There
/// are four Slovenian Wikipedia list page titles where the
/// list is split by letter so that Š stands alone for the
/// list part for Š. Let's assume that's a special case not
/// worth detecting even though the copyright sign detection
/// makes Slovenian title detection round to one percentage
/// point worse.
const COPYRIGHT_BONUS: i64 = 222;

/// Applied by the Latin case state machines when a lower-case letter
/// follows two or more upper-case letters, or an upper-case letter follows
/// a lower-case letter, unless the pair is an ASCII pair.
const IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY: i64 = -180;

/// Awarded once per non-Latin word that starts with an upper-case letter
/// and continues in lower case.
const NON_LATIN_CAPITALIZATION_BONUS: i64 = 40;

/// Applied once per all-upper-case non-Latin word, for KOI8-U only.
const NON_LATIN_ALL_CAPS_PENALTY: i64 = -40;

/// Applied per letter of a non-Latin word whose casing is mixed.
const NON_LATIN_MIXED_CASE_PENALTY: i64 = -20;

// The CJK constants below are not used within this section of the file;
// the notes on them only spell out the values the expressions evaluate to.

// Manually calibrated relative to windows-1256 Arabic
const CJK_BASE_SCORE: i64 = 41;

const CJK_SECONDARY_BASE_SCORE: i64 = 20; // Was 20

const SHIFT_JIS_SCORE_PER_KANA: i64 = 20;

/// Evaluates to 41.
const SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

/// Evaluates to 20.
const SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

// Manually calibrated relative to windows-1256 Persian and Urdu
const SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY: i64 = -75;

const HALF_WIDTH_KATAKANA_SCORE: i64 = 1;

// Unclear if this is a good idea; seems not harmful, but can't be sure.
const HALF_WIDTH_KATAKANA_VOICING_SCORE: i64 = 10;

/// Evaluates to -410.
const SHIFT_JIS_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Should this be larger?

/// Evaluates to -820.
const SHIFT_JIS_EXTENSION_PENALTY: i64 = SHIFT_JIS_PUA_PENALTY * 2;

/// Evaluates to -820.
const SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY: i64 = SHIFT_JIS_EXTENSION_PENALTY;

/// Evaluates to 54 (41 + 41 / 3 with integer division).
const EUC_JP_SCORE_PER_KANA: i64 = CJK_BASE_SCORE + (CJK_BASE_SCORE / 3); // Relative to Big5

/// Evaluates to 40.
const EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA: i64 = CJK_BASE_SCORE - 1;

/// Evaluates to 41.
const EUC_JP_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

/// Evaluates to 20.
const EUC_JP_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

/// Evaluates to 5.
const EUC_JP_SCORE_PER_OTHER_KANJI: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Evaluates to -14.
const EUC_JP_INITIAL_KANA_PENALTY: i64 = -((CJK_BASE_SCORE / 3) + 1);

/// Evaluates to -2050.
const EUC_JP_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 50); // Needs to be more severe than for Shift_JIS to avoid misdetecting EUC-KR!

/// Evaluates to 41.
const BIG5_SCORE_PER_LEVEL_1_HANZI: i64 = CJK_BASE_SCORE;

/// Evaluates to 20.
const BIG5_SCORE_PER_OTHER_HANZI: i64 = CJK_SECONDARY_BASE_SCORE;

/// Evaluates to -1230.
const BIG5_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 30); // More severe than other PUA penalties to avoid misdetecting EUC-KR! (25 as the multiplier is too little)

/// Evaluates to -1640.
const BIG5_SINGLE_BYTE_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 40);

/// Evaluates to 42.
const EUC_KR_SCORE_PER_EUC_HANGUL: i64 = CJK_BASE_SCORE + 1;

/// Evaluates to 4.
const EUC_KR_SCORE_PER_NON_EUC_HANGUL: i64 = CJK_SECONDARY_BASE_SCORE / 5;

/// Evaluates to 10.
const EUC_KR_SCORE_PER_HANJA: i64 = CJK_SECONDARY_BASE_SCORE / 2;

/// Evaluates to -410.
const EUC_KR_HANJA_AFTER_HANGUL_PENALTY: i64 = -(CJK_BASE_SCORE * 10);

const EUC_KR_LONG_WORD_PENALTY: i64 = -6;

/// Evaluates to -411, i.e. one worse than `GBK_PUA_PENALTY`.
const EUC_KR_PUA_PENALTY: i64 = GBK_PUA_PENALTY - 1; // Break tie in favor of GBK

/// Evaluates to -822.
const EUC_KR_MAC_KOREAN_PENALTY: i64 = EUC_KR_PUA_PENALTY * 2;

/// Evaluates to -822.
const EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY: i64 = EUC_KR_MAC_KOREAN_PENALTY;

/// Evaluates to 41.
const GBK_SCORE_PER_LEVEL_1: i64 = CJK_BASE_SCORE;

/// Evaluates to 20.
const GBK_SCORE_PER_LEVEL_2: i64 = CJK_SECONDARY_BASE_SCORE;

/// Evaluates to 5.
const GBK_SCORE_PER_NON_EUC: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Evaluates to -410.
const GBK_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Factor should be at least 2, but should it be larger?

/// Evaluates to -1640.
const GBK_SINGLE_BYTE_EXTENSION_PENALTY: i64 = GBK_PUA_PENALTY * 4;

/// Evaluates to -41.
const CJK_LATIN_ADJACENCY_PENALTY: i64 = -CJK_BASE_SCORE; // smaller penalty than LATIN_ADJACENCY_PENALTY

/// Evaluates to 20.
const CJ_PUNCTUATION: i64 = CJK_BASE_SCORE / 2;

/// Evaluates to 5.
const CJK_OTHER: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Latin letter caseless class
const LATIN_LETTER: u8 = 1;
156
/// Returns `true` if `label` contains at least one byte that is
/// non-ASCII (`>= 0x80`), an ASCII period (`.`), or an ASCII upper-case
/// letter (`A`..=`Z`).
///
/// Returns `false` for an empty slice.
fn contains_upper_case_period_or_non_ascii(label: &[u8]) -> bool {
    label
        .iter()
        .any(|&b| !b.is_ascii() || b == b'.' || b.is_ascii_uppercase())
}
171
// For Latin, we only penalize pairwise bad transitions
// if one participant is non-ASCII. This avoids violating
// the principle that ASCII pairs never contribute to the
// score. (Maybe that's a bad principle, though!)
/// Case-tracking state used by the Latin case state machines.
#[derive(PartialEq)]
enum LatinCaseState {
    /// The previous byte was not a Latin letter (also the initial state).
    Space,
    /// The previous byte was an upper-case letter that followed a
    /// non-letter or a lower-case letter.
    Upper,
    /// The previous byte was a lower-case letter.
    Lower,
    /// The previous two or more bytes were upper-case letters.
    AllCaps,
}
183
// For non-Latin, we calculate case-related penalty
// or bonus on a per-non-Latin-word basis.
/// Case-tracking state for the current non-Latin word.
#[derive(PartialEq)]
enum NonLatinCaseState {
    /// Not inside a word (also the initial state).
    Space,
    /// The word so far is a single upper-case letter.
    Upper,
    /// The word so far consists of lower-case letters only.
    Lower,
    /// The word started with one upper-case letter and has continued
    /// in lower case.
    UpperLower,
    /// The word so far consists of two or more upper-case letters.
    AllCaps,
    /// Any other combination of cases, or the word contains a Latin letter.
    Mix,
}
195
/// Scoring state for a single-byte encoding candidate whose non-Latin
/// letters have upper and lower case.
struct NonLatinCasedCandidate {
    /// Classification and pairwise score data for the encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    /// Casing state of the word currently being scanned.
    case_state: NonLatinCaseState,
    /// Whether the previously fed byte was below 0x80 (`true` before any input).
    prev_ascii: bool,
    /// Number of consecutive non-Latin alphabetic bytes seen so far in the
    /// current word.
    current_word_len: u64,
    /// Length of the longest completed non-Latin word seen so far.
    longest_word: u64,
    /// Whether `data` is the IBM866 entry of `SINGLE_BYTE_DATA`.
    ibm866: bool,
    /// Whether the previously fed byte was 0xA0.
    prev_was_a0: bool, // Only used with IBM866
}
206
207impl NonLatinCasedCandidate {
208    fn new(data: &'static SingleByteData) -> Self {
209        NonLatinCasedCandidate {
210            data: data,
211            prev: 0,
212            case_state: NonLatinCaseState::Space,
213            prev_ascii: true,
214            current_word_len: 0,
215            longest_word: 0,
216            ibm866: data == &SINGLE_BYTE_DATA[IBM866_INDEX],
217            prev_was_a0: false,
218        }
219    }
220
221    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
222        let mut score = 0i64;
223        for &b in buffer {
224            let class = self.data.classify(b);
225            if class == 255 {
226                return None;
227            }
228            let caseless_class = class & 0x7F;
229
230            let ascii = b < 0x80;
231            let ascii_pair = self.prev_ascii && ascii;
232
233            let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
234
235            // The purpose of this state machine is to avoid misdetecting Greek as
236            // Cyrillic by:
237            //
238            // * Giving a small bonus to words that start with an upper-case letter
239            //   and are lower-case for the rest.
240            // * Giving a large penalty to start with one lower-case letter followed
241            //   by all upper-case (obviously upper and lower case inverted, which
242            //   unfortunately is possible due to KOI8-U).
243            // * Giving a small per-word penalty to all-uppercase KOI8-U (to favor
244            //   all-lowercase Greek over all-caps KOI8-U).
245            // * Giving large penalties for mixed-case other than initial upper-case.
246            //   This also helps relative to non-cased encodings.
247
248            // ASCII doesn't participate in non-Latin casing.
249            if caseless_class == LATIN_LETTER {
250                // Latin
251                // Mark this word as a mess. If there end up being non-Latin
252                // letters in this word, the ASCII-adjacency penalty gets
253                // applied to Latin/non-Latin pairs and the mix penalty
254                // to non-Latin/non-Latin pairs.
255                // XXX Apply penalty here
256                self.case_state = NonLatinCaseState::Mix;
257            } else if !non_ascii_alphabetic {
258                // Space
259                match self.case_state {
260                    NonLatinCaseState::Space
261                    | NonLatinCaseState::Upper
262                    | NonLatinCaseState::Lower => {}
263                    NonLatinCaseState::UpperLower => {
264                        // Intentionally applied only once per word.
265                        score += NON_LATIN_CAPITALIZATION_BONUS;
266                    }
267                    NonLatinCaseState::AllCaps => {
268                        // Intentionally applied only once per word.
269                        if self.data == &SINGLE_BYTE_DATA[KOI8_U_INDEX] {
270                            // Apply only to KOI8-U.
271                            score += NON_LATIN_ALL_CAPS_PENALTY;
272                        }
273                    }
274                    NonLatinCaseState::Mix => {
275                        // Per letter
276                        score += NON_LATIN_MIXED_CASE_PENALTY * (self.current_word_len as i64);
277                    }
278                }
279                self.case_state = NonLatinCaseState::Space;
280            } else if (class >> 7) == 0 {
281                // Lower case
282                match self.case_state {
283                    NonLatinCaseState::Space => {
284                        self.case_state = NonLatinCaseState::Lower;
285                    }
286                    NonLatinCaseState::Upper => {
287                        self.case_state = NonLatinCaseState::UpperLower;
288                    }
289                    NonLatinCaseState::Lower
290                    | NonLatinCaseState::UpperLower
291                    | NonLatinCaseState::Mix => {}
292                    NonLatinCaseState::AllCaps => {
293                        self.case_state = NonLatinCaseState::Mix;
294                    }
295                }
296            } else {
297                // Upper case
298                match self.case_state {
299                    NonLatinCaseState::Space => {
300                        self.case_state = NonLatinCaseState::Upper;
301                    }
302                    NonLatinCaseState::Upper => {
303                        self.case_state = NonLatinCaseState::AllCaps;
304                    }
305                    NonLatinCaseState::Lower | NonLatinCaseState::UpperLower => {
306                        self.case_state = NonLatinCaseState::Mix;
307                    }
308                    NonLatinCaseState::AllCaps | NonLatinCaseState::Mix => {}
309                }
310            }
311
312            // XXX Apply penalty if > 16
313            if non_ascii_alphabetic {
314                self.current_word_len += 1;
315            } else {
316                if self.current_word_len > self.longest_word {
317                    self.longest_word = self.current_word_len;
318                }
319                self.current_word_len = 0;
320            }
321
322            let is_a0 = b == 0xA0;
323            if !ascii_pair {
324                // 0xA0 is no-break space in many other encodings, so avoid
325                // assigning score to IBM866 when 0xA0 occurs next to itself
326                // or a space-like byte.
327                if !(self.ibm866
328                    && ((is_a0 && (self.prev_was_a0 || self.prev == 0))
329                        || caseless_class == 0 && self.prev_was_a0))
330                {
331                    score += self.data.score(caseless_class, self.prev, false);
332                }
333
334                if self.prev == LATIN_LETTER && non_ascii_alphabetic {
335                    score += LATIN_ADJACENCY_PENALTY;
336                } else if caseless_class == LATIN_LETTER
337                    && self.data.is_non_latin_alphabetic(self.prev, false)
338                {
339                    score += LATIN_ADJACENCY_PENALTY;
340                }
341            }
342
343            self.prev_ascii = ascii;
344            self.prev = caseless_class;
345            self.prev_was_a0 = is_a0;
346        }
347        Some(score)
348    }
349}
350
/// State of the windows-1252 state machine that recognizes ordinal
/// indicator sequences (bytes 0xAA and 0xBA) and the copyright sign
/// (byte 0xA9) surrounded by space-like bytes.
enum OrdinalState {
    /// Inside a run that cannot be part of a recognized sequence; waits
    /// for a class-0 (space-like) byte.
    Other,
    /// The previous byte was class 0 (also the initial state).
    Space,
    /// Saw `N` or `n` followed by `.`.
    PeriodAfterN,
    /// Saw 0xAA or 0xBA after a space or after ASCII digits; a class-0
    /// byte next awards `ORDINAL_BONUS`.
    OrdinalExpectingSpace,
    /// Saw an ordinal indicator directly after a letter (an abbreviation
    /// start letter or Roman numeral letters); a class-0 byte next awards
    /// `ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY`.
    OrdinalExpectingSpaceUndoImplausibility,
    /// Saw 0xBA after `N.` or `n.`; a class-0 byte or an ASCII digit next
    /// awards `ORDINAL_BONUS`.
    OrdinalExpectingSpaceOrDigit,
    /// Saw 0xBA directly after `N` or `n`; a class-0 byte or an ASCII
    /// digit next awards `ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY`.
    OrdinalExpectingSpaceOrDigitUndoImplausibily,
    /// Saw `N` after a space.
    UpperN,
    /// Saw `n` after a space.
    LowerN,
    /// Saw `M`, `D` or `S` after a space.
    FeminineAbbreviationStartLetter,
    /// Saw one or more ASCII digits after a space.
    Digit,
    /// Saw one or more bytes of the I, V or X letter classes after a space.
    Roman,
    /// Saw 0xA9 after a space; a class-0 byte next awards `COPYRIGHT_BONUS`.
    Copyright,
}
366
/// Scoring state for a Latin-script single-byte encoding candidate.
struct LatinCandidate {
    /// Classification and pairwise score data for the encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    /// Casing state of the letters seen most recently.
    case_state: LatinCaseState,
    /// Number of consecutive non-ASCII bytes immediately before the
    /// current byte; reset to 0 by every ASCII byte.
    prev_non_ascii: u32,
    /// State of the ordinal indicator / copyright sign state machine.
    ordinal_state: OrdinalState, // Used only when `windows1252 == true`
    /// Whether `data` is the windows-1252 entry of `SINGLE_BYTE_DATA`.
    windows1252: bool,
}
375
376impl LatinCandidate {
377    fn new(data: &'static SingleByteData) -> Self {
378        LatinCandidate {
379            data: data,
380            prev: 0,
381            case_state: LatinCaseState::Space,
382            prev_non_ascii: 0,
383            ordinal_state: OrdinalState::Space,
384            windows1252: data == &SINGLE_BYTE_DATA[WINDOWS_1252_INDEX],
385        }
386    }
387
388    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
389        let mut score = 0i64;
390        for &b in buffer {
391            let class = self.data.classify(b);
392            if class == 255 {
393                return None;
394            }
395            let caseless_class = class & 0x7F;
396
397            let ascii = b < 0x80;
398            let ascii_pair = self.prev_non_ascii == 0 && ascii;
399
400            let non_ascii_penalty = match self.prev_non_ascii {
401                0 | 1 | 2 => 0,
402                3 => -5,
403                4 => -20,
404                _ => -200,
405            };
406            score += non_ascii_penalty;
407            // XXX if has Vietnamese-only characters and word length > 7,
408            // apply penalty
409
410            if !self.data.is_latin_alphabetic(caseless_class) {
411                self.case_state = LatinCaseState::Space;
412            } else if (class >> 7) == 0 {
413                // Penalizing lower case after two upper case
414                // is important for avoiding misdetecting
415                // windows-1250 as windows-1252 (byte 0x9F).
416                if self.case_state == LatinCaseState::AllCaps && !ascii_pair {
417                    score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
418                }
419                self.case_state = LatinCaseState::Lower;
420            } else {
421                match self.case_state {
422                    LatinCaseState::Space => {
423                        self.case_state = LatinCaseState::Upper;
424                    }
425                    LatinCaseState::Upper | LatinCaseState::AllCaps => {
426                        self.case_state = LatinCaseState::AllCaps;
427                    }
428                    LatinCaseState::Lower => {
429                        if !ascii_pair {
430                            // XXX How bad is this for Irish Gaelic?
431                            score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
432                        }
433                        self.case_state = LatinCaseState::Upper;
434                    }
435                }
436            }
437
438            // Treat pairing space-like, which can be non-ASCII, with ASCII as
439            // ASCIIish enough not to get a score in order to avoid giving
440            // ASCII i and I in windows-1254 next to windows-125x apostrophe/quote
441            // a score. This avoids detecting English I’ as Turkish.
442            let ascii_ish_pair = ascii_pair
443                || (ascii && self.prev == 0)
444                || (caseless_class == 0 && self.prev_non_ascii == 0);
445
446            if !ascii_ish_pair {
447                score += self.data.score(caseless_class, self.prev, false);
448            }
449
450            if self.windows1252 {
451                // This state machine assigns score to the sequences
452                // * " º " (Spanish)
453                // * " ª " (Spanish)
454                // * ".ª " (Spanish)
455                // * ".º " (Spanish)
456                // * "n.º1" (Spanish)
457                // * " Mª " (Spanish)
458                // * " Dª " (Spanish)
459                // * " Nª " (Spanish)
460                // * " Sª " (Spanish)
461                // * " 3º " (Italian, where 3 is an ASCII digit)
462                // * " 3ª " (Italian, where 3 is an ASCII digit)
463                // * " Xº " (Italian, where X is a small Roman numeral)
464                // * " Xª " (Italian, where X is a small Roman numeral)
465                // * " Nº1" (Italian, where 1 is an ASCII digit)
466                // * " Nº " (Italian)
467                // * " © " (otherwise ASCII-only)
468                // which are problematic to deal with by pairwise scoring
469                // without messing up Romanian detection.
470                // Initial sc
471                match self.ordinal_state {
472                    OrdinalState::Other => {
473                        if caseless_class == 0 {
474                            self.ordinal_state = OrdinalState::Space;
475                        }
476                    }
477                    OrdinalState::Space => {
478                        if caseless_class == 0 {
479                            // pass
480                        } else if b == 0xAA || b == 0xBA {
481                            self.ordinal_state = OrdinalState::OrdinalExpectingSpace;
482                        } else if b == b'M' || b == b'D' || b == b'S' {
483                            self.ordinal_state = OrdinalState::FeminineAbbreviationStartLetter;
484                        } else if b == b'N' {
485                            // numero or Nuestra
486                            self.ordinal_state = OrdinalState::UpperN;
487                        } else if b == b'n' {
488                            // numero
489                            self.ordinal_state = OrdinalState::LowerN;
490                        } else if caseless_class == (ASCII_DIGIT as u8) {
491                            self.ordinal_state = OrdinalState::Digit;
492                        } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24
493                        /* X */
494                        {
495                            self.ordinal_state = OrdinalState::Roman;
496                        } else if b == 0xA9 {
497                            self.ordinal_state = OrdinalState::Copyright;
498                        } else {
499                            self.ordinal_state = OrdinalState::Other;
500                        }
501                    }
502                    OrdinalState::OrdinalExpectingSpace => {
503                        if caseless_class == 0 {
504                            score += ORDINAL_BONUS;
505                            self.ordinal_state = OrdinalState::Space;
506                        } else {
507                            self.ordinal_state = OrdinalState::Other;
508                        }
509                    }
510                    OrdinalState::OrdinalExpectingSpaceUndoImplausibility => {
511                        if caseless_class == 0 {
512                            score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
513                            self.ordinal_state = OrdinalState::Space;
514                        } else {
515                            self.ordinal_state = OrdinalState::Other;
516                        }
517                    }
518                    OrdinalState::OrdinalExpectingSpaceOrDigit => {
519                        if caseless_class == 0 {
520                            score += ORDINAL_BONUS;
521                            self.ordinal_state = OrdinalState::Space;
522                        } else if caseless_class == (ASCII_DIGIT as u8) {
523                            score += ORDINAL_BONUS;
524                            // Deliberately set to `Other`
525                            self.ordinal_state = OrdinalState::Other;
526                        } else {
527                            self.ordinal_state = OrdinalState::Other;
528                        }
529                    }
530                    OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily => {
531                        if caseless_class == 0 {
532                            score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
533                            self.ordinal_state = OrdinalState::Space;
534                        } else if caseless_class == (ASCII_DIGIT as u8) {
535                            score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
536                            // Deliberately set to `Other`
537                            self.ordinal_state = OrdinalState::Other;
538                        } else {
539                            self.ordinal_state = OrdinalState::Other;
540                        }
541                    }
542                    OrdinalState::UpperN => {
543                        if b == 0xAA {
544                            self.ordinal_state =
545                                OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
546                        } else if b == 0xBA {
547                            self.ordinal_state =
548                                OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
549                        } else if b == b'.' {
550                            self.ordinal_state = OrdinalState::PeriodAfterN;
551                        } else if caseless_class == 0 {
552                            self.ordinal_state = OrdinalState::Space;
553                        } else {
554                            self.ordinal_state = OrdinalState::Other;
555                        }
556                    }
557                    OrdinalState::LowerN => {
558                        if b == 0xBA {
559                            self.ordinal_state =
560                                OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
561                        } else if b == b'.' {
562                            self.ordinal_state = OrdinalState::PeriodAfterN;
563                        } else if caseless_class == 0 {
564                            self.ordinal_state = OrdinalState::Space;
565                        } else {
566                            self.ordinal_state = OrdinalState::Other;
567                        }
568                    }
569                    OrdinalState::FeminineAbbreviationStartLetter => {
570                        if b == 0xAA {
571                            self.ordinal_state =
572                                OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
573                        } else if caseless_class == 0 {
574                            self.ordinal_state = OrdinalState::Space;
575                        } else {
576                            self.ordinal_state = OrdinalState::Other;
577                        }
578                    }
579                    OrdinalState::Digit => {
580                        if b == 0xAA || b == 0xBA {
581                            self.ordinal_state = OrdinalState::OrdinalExpectingSpace;
582                        } else if caseless_class == 0 {
583                            self.ordinal_state = OrdinalState::Space;
584                        } else if caseless_class == (ASCII_DIGIT as u8) {
585                            // pass
586                        } else {
587                            self.ordinal_state = OrdinalState::Other;
588                        }
589                    }
590                    OrdinalState::Roman => {
591                        if b == 0xAA || b == 0xBA {
592                            self.ordinal_state =
593                                OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
594                        } else if caseless_class == 0 {
595                            self.ordinal_state = OrdinalState::Space;
596                        } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24
597                        /* X */
598                        {
599                            // pass
600                        } else {
601                            self.ordinal_state = OrdinalState::Other;
602                        }
603                    }
604                    OrdinalState::PeriodAfterN => {
605                        if b == 0xBA {
606                            self.ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit;
607                        } else if caseless_class == 0 {
608                            self.ordinal_state = OrdinalState::Space;
609                        } else {
610                            self.ordinal_state = OrdinalState::Other;
611                        }
612                    }
613                    OrdinalState::Copyright => {
614                        if caseless_class == 0 {
615                            score += COPYRIGHT_BONUS;
616                            self.ordinal_state = OrdinalState::Space;
617                        } else {
618                            self.ordinal_state = OrdinalState::Other;
619                        }
620                    }
621                }
622            }
623
624            if ascii {
625                self.prev_non_ascii = 0;
626            } else {
627                self.prev_non_ascii += 1;
628            }
629            self.prev = caseless_class;
630        }
631        Some(score)
632    }
633}
634
/// Scoring state for a candidate that scores Arabic text while applying
/// Latin case penalties for French.
struct ArabicFrenchCandidate {
    /// Classification and pairwise score data for the encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    /// Casing state of the Latin letters seen most recently.
    case_state: LatinCaseState,
    /// Whether the previously fed byte was below 0x80 (`true` before any input).
    prev_ascii: bool,
    /// Number of consecutive non-Latin alphabetic bytes seen so far in the
    /// current word.
    current_word_len: u64,
    /// Length of the longest completed non-Latin word seen so far.
    longest_word: u64,
}
643
644impl ArabicFrenchCandidate {
645    fn new(data: &'static SingleByteData) -> Self {
646        ArabicFrenchCandidate {
647            data: data,
648            prev: 0,
649            case_state: LatinCaseState::Space,
650            prev_ascii: true,
651            current_word_len: 0,
652            longest_word: 0,
653        }
654    }
655
656    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
657        let mut score = 0i64;
658        for &b in buffer {
659            let class = self.data.classify(b);
660            if class == 255 {
661                return None;
662            }
663            let caseless_class = class & 0x7F;
664
665            let ascii = b < 0x80;
666            let ascii_pair = self.prev_ascii && ascii;
667
668            if caseless_class != LATIN_LETTER {
669                // We compute case penalties for French only
670                self.case_state = LatinCaseState::Space;
671            } else if (class >> 7) == 0 {
672                if self.case_state == LatinCaseState::AllCaps && !ascii_pair {
673                    score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
674                }
675                self.case_state = LatinCaseState::Lower;
676            } else {
677                match self.case_state {
678                    LatinCaseState::Space => {
679                        self.case_state = LatinCaseState::Upper;
680                    }
681                    LatinCaseState::Upper | LatinCaseState::AllCaps => {
682                        self.case_state = LatinCaseState::AllCaps;
683                    }
684                    LatinCaseState::Lower => {
685                        if !ascii_pair {
686                            score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
687                        }
688                        self.case_state = LatinCaseState::Upper;
689                    }
690                }
691            }
692
693            // Count only Arabic word length and ignore French
694            let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, true);
695            // XXX apply penalty if > 23
696            if non_ascii_alphabetic {
697                self.current_word_len += 1;
698            } else {
699                if self.current_word_len > self.longest_word {
700                    self.longest_word = self.current_word_len;
701                }
702                self.current_word_len = 0;
703            }
704
705            if !ascii_pair {
706                score += self.data.score(caseless_class, self.prev, true);
707
708                if self.prev == LATIN_LETTER && non_ascii_alphabetic {
709                    score += LATIN_ADJACENCY_PENALTY;
710                } else if caseless_class == LATIN_LETTER
711                    && self.data.is_non_latin_alphabetic(self.prev, true)
712                {
713                    score += LATIN_ADJACENCY_PENALTY;
714                }
715            }
716
717            self.prev_ascii = ascii;
718            self.prev = caseless_class;
719        }
720        Some(score)
721    }
722}
723
/// Scoring state for a single-byte encoding candidate that is scored
/// without any case tracking.
struct CaselessCandidate {
    /// Classification and pairwise score data for the encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    /// Whether the previously fed byte was below 0x80 (`true` before any input).
    prev_ascii: bool,
    /// Number of consecutive non-Latin alphabetic bytes seen so far in the
    /// current word.
    current_word_len: u64,
    /// Length of the longest completed non-Latin word seen so far.
    longest_word: u64,
}
731
732impl CaselessCandidate {
733    fn new(data: &'static SingleByteData) -> Self {
734        CaselessCandidate {
735            data: data,
736            prev: 0,
737            prev_ascii: true,
738            current_word_len: 0,
739            longest_word: 0,
740        }
741    }
742
743    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
744        let mut score = 0i64;
745        for &b in buffer {
746            let class = self.data.classify(b);
747            if class == 255 {
748                return None;
749            }
750            let caseless_class = class & 0x7F;
751
752            let ascii = b < 0x80;
753            let ascii_pair = self.prev_ascii && ascii;
754
755            let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
756            // Apply penalty if > 23 and not Thai
757            if non_ascii_alphabetic {
758                self.current_word_len += 1;
759            } else {
760                if self.current_word_len > self.longest_word {
761                    self.longest_word = self.current_word_len;
762                }
763                self.current_word_len = 0;
764            }
765
766            if !ascii_pair {
767                score += self.data.score(caseless_class, self.prev, false);
768
769                if self.prev == LATIN_LETTER && non_ascii_alphabetic {
770                    score += LATIN_ADJACENCY_PENALTY;
771                } else if caseless_class == LATIN_LETTER
772                    && self.data.is_non_latin_alphabetic(self.prev, false)
773                {
774                    score += LATIN_ADJACENCY_PENALTY;
775                }
776            }
777
778            self.prev_ascii = ascii;
779            self.prev = caseless_class;
780        }
781        Some(score)
782    }
783}
784
/// Returns `true` when `byte` is one of the six ASCII punctuation marks
/// `.`, `,`, `:`, `;`, `?` or `!`.
fn is_ascii_punctuation(byte: u8) -> bool {
    const MARKS: &[u8] = b".,:;?!";
    MARKS.contains(&byte)
}
791
792struct LogicalCandidate {
793    data: &'static SingleByteData,
794    prev: u8,
795    prev_ascii: bool,
796    plausible_punctuation: u64,
797    current_word_len: u64,
798    longest_word: u64,
799}
800
801impl LogicalCandidate {
802    fn new(data: &'static SingleByteData) -> Self {
803        LogicalCandidate {
804            data: data,
805            prev: 0,
806            prev_ascii: true,
807            plausible_punctuation: 0,
808            current_word_len: 0,
809            longest_word: 0,
810        }
811    }
812
813    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
814        let mut score = 0i64;
815        for &b in buffer {
816            let class = self.data.classify(b);
817            if class == 255 {
818                return None;
819            }
820            let caseless_class = class & 0x7F;
821
822            let ascii = b < 0x80;
823            let ascii_pair = self.prev_ascii && ascii;
824
825            let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
826            // XXX apply penalty if > 22
827            if non_ascii_alphabetic {
828                self.current_word_len += 1;
829            } else {
830                if self.current_word_len > self.longest_word {
831                    self.longest_word = self.current_word_len;
832                }
833                self.current_word_len = 0;
834            }
835
836            if !ascii_pair {
837                score += self.data.score(caseless_class, self.prev, false);
838
839                let prev_non_ascii_alphabetic = self.data.is_non_latin_alphabetic(self.prev, false);
840                if caseless_class == 0 && prev_non_ascii_alphabetic && is_ascii_punctuation(b) {
841                    self.plausible_punctuation += 1;
842                }
843
844                if self.prev == LATIN_LETTER && non_ascii_alphabetic {
845                    score += LATIN_ADJACENCY_PENALTY;
846                } else if caseless_class == LATIN_LETTER && prev_non_ascii_alphabetic {
847                    score += LATIN_ADJACENCY_PENALTY;
848                }
849            }
850
851            self.prev_ascii = ascii;
852            self.prev = caseless_class;
853        }
854        Some(score)
855    }
856}
857
858struct VisualCandidate {
859    data: &'static SingleByteData,
860    prev: u8,
861    prev_ascii: bool,
862    prev_punctuation: bool,
863    plausible_punctuation: u64,
864    current_word_len: u64,
865    longest_word: u64,
866}
867
868impl VisualCandidate {
869    fn new(data: &'static SingleByteData) -> Self {
870        VisualCandidate {
871            data: data,
872            prev: 0,
873            prev_ascii: true,
874            prev_punctuation: false,
875            plausible_punctuation: 0,
876            current_word_len: 0,
877            longest_word: 0,
878        }
879    }
880
881    fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
882        let mut score = 0i64;
883        for &b in buffer {
884            let class = self.data.classify(b);
885            if class == 255 {
886                return None;
887            }
888            let caseless_class = class & 0x7F;
889
890            let ascii = b < 0x80;
891            let ascii_pair = self.prev_ascii && ascii;
892
893            let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
894            // XXX apply penalty if > 22
895            if non_ascii_alphabetic {
896                self.current_word_len += 1;
897            } else {
898                if self.current_word_len > self.longest_word {
899                    self.longest_word = self.current_word_len;
900                }
901                self.current_word_len = 0;
902            }
903
904            if !ascii_pair {
905                score += self.data.score(caseless_class, self.prev, false);
906
907                if non_ascii_alphabetic && self.prev_punctuation {
908                    self.plausible_punctuation += 1;
909                }
910
911                if self.prev == LATIN_LETTER && non_ascii_alphabetic {
912                    score += LATIN_ADJACENCY_PENALTY;
913                } else if caseless_class == LATIN_LETTER
914                    && self.data.is_non_latin_alphabetic(self.prev, false)
915                {
916                    score += LATIN_ADJACENCY_PENALTY;
917                }
918            }
919
920            self.prev_ascii = ascii;
921            self.prev = caseless_class;
922            self.prev_punctuation = caseless_class == 0 && is_ascii_punctuation(b);
923        }
924        Some(score)
925    }
926}
927
928struct Utf8Candidate {
929    decoder: Decoder,
930}
931
932impl Utf8Candidate {
933    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
934        let mut dst = [0u8; 1024];
935        let mut total_read = 0;
936        loop {
937            let (result, read, _) = self.decoder.decode_to_utf8_without_replacement(
938                &buffer[total_read..],
939                &mut dst,
940                last,
941            );
942            total_read += read;
943            match result {
944                DecoderResult::InputEmpty => {
945                    return Some(0);
946                }
947                DecoderResult::Malformed(_, _) => {
948                    return None;
949                }
950                DecoderResult::OutputFull => {
951                    continue;
952                }
953            }
954        }
955    }
956}
957
958struct Iso2022Candidate {
959    decoder: Decoder,
960}
961
962impl Iso2022Candidate {
963    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
964        let mut dst = [0u16; 1024];
965        let mut total_read = 0;
966        loop {
967            let (result, read, _) = self.decoder.decode_to_utf16_without_replacement(
968                &buffer[total_read..],
969                &mut dst,
970                last,
971            );
972            total_read += read;
973            match result {
974                DecoderResult::InputEmpty => {
975                    return Some(0);
976                }
977                DecoderResult::Malformed(_, _) => {
978                    return None;
979                }
980                DecoderResult::OutputFull => {
981                    continue;
982                }
983            }
984        }
985    }
986}
987
/// Coarse class of the previously scored character, kept by the Chinese and
/// Japanese candidates so that an ASCII letter directly next to a CJ
/// character can be penalized.
#[derive(PartialEq)]
enum LatinCj {
    /// The previous character was an ASCII letter (`A`-`Z` or `a`-`z`).
    AsciiLetter,
    /// The previous character was scored as kana or an ideograph.
    Cj,
    /// Anything else (punctuation, digits, private-use characters, ...).
    Other,
}
994
/// What the previously decoded character allows as the next half-width
/// voicing mark: U+FF9E (dakuten) or U+FF9F (handakuten).
#[derive(PartialEq, Copy, Clone)]
enum HalfWidthKatakana {
    /// Neither mark is plausible next; this is the state after any character
    /// that does not select one of the other two variants.
    DakutenForbidden,
    /// U+FF9E is plausible next; set after U+FF73 or U+FF76..=U+FF84.
    DakutenAllowed,
    /// U+FF9E or U+FF9F is plausible next; set after U+FF8A..=U+FF8E.
    DakutenOrHandakutenAllowed,
}
1001
/// Coarse character class with Korean-specific variants.
///
/// NOTE(review): presumably the Korean counterpart of `LatinCj`, tracking
/// the previous character for an EUC-KR candidate -- its uses are outside
/// this part of the file; confirm there.
#[derive(PartialEq)]
enum LatinKorean {
    /// An ASCII letter.
    AsciiLetter,
    /// A Hangul character.
    Hangul,
    /// A Hanja character.
    Hanja,
    /// Anything else.
    Other,
}
1009
/// Returns a small bonus for `u` based on where it first occurs in `table`.
///
/// An entry at index `i` earns `(128 - i) / 16` (integer division), so the
/// first entry earns 8 and the bonus falls to 0 for the last fifteen
/// entries. Code units absent from the table earn 0.
fn cjk_extra_score(u: u16, table: &'static [u16; 128]) -> i64 {
    table
        .iter()
        .position(|&entry| entry == u)
        .map_or(0, |index| ((table.len() - index) / 16) as i64)
}
1017
1018struct GbkCandidate {
1019    decoder: Decoder,
1020    prev_byte: u8,
1021    prev: LatinCj,
1022    pending_score: Option<i64>,
1023}
1024
1025impl GbkCandidate {
1026    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1027        assert!(self.pending_score.is_none());
1028        if self.prev == LatinCj::Cj || !more_problematic_lead(self.prev_byte) {
1029            s
1030        } else {
1031            self.pending_score = Some(s);
1032            0
1033        }
1034    }
1035
1036    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1037        let mut score = 0i64;
1038        let mut src = [0u8];
1039        let mut dst = [0u16; 2];
1040        for &b in buffer {
1041            src[0] = b;
1042            let (result, read, written) = self
1043                .decoder
1044                .decode_to_utf16_without_replacement(&src, &mut dst, false);
1045            if written == 1 {
1046                let u = dst[0];
1047                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1048                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1049                {
1050                    self.pending_score = None; // Discard pending score
1051                    if self.prev == LatinCj::Cj {
1052                        score += CJK_LATIN_ADJACENCY_PENALTY;
1053                    }
1054                    self.prev = LatinCj::AsciiLetter;
1055                } else if u == 0x20AC {
1056                    // euro sign
1057                    self.pending_score = None; // Discard pending score
1058                                               // Should there even be a penalty?
1059                    self.prev = LatinCj::Other;
1060                } else if u >= 0x4E00 && u <= 0x9FA5 {
1061                    if let Some(pending) = self.pending_score {
1062                        score += pending;
1063                        self.pending_score = None;
1064                    }
1065                    if b >= 0xA1 && b <= 0xFE {
1066                        match self.prev_byte {
1067                            0xA1..=0xD7 => {
1068                                score += GBK_SCORE_PER_LEVEL_1;
1069                                score +=
1070                                    cjk_extra_score(u, &data::DETECTOR_DATA.frequent_simplified);
1071                            }
1072                            0xD8..=0xFE => score += GBK_SCORE_PER_LEVEL_2,
1073                            _ => {
1074                                score += GBK_SCORE_PER_NON_EUC;
1075                            }
1076                        }
1077                    } else {
1078                        score += self.maybe_set_as_pending(GBK_SCORE_PER_NON_EUC);
1079                    }
1080                    if self.prev == LatinCj::AsciiLetter {
1081                        score += CJK_LATIN_ADJACENCY_PENALTY;
1082                    }
1083                    self.prev = LatinCj::Cj;
1084                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1085                    if let Some(pending) = self.pending_score {
1086                        score += pending;
1087                        self.pending_score = None;
1088                    }
1089                    // XXX score?
1090                    if self.prev == LatinCj::AsciiLetter {
1091                        score += CJK_LATIN_ADJACENCY_PENALTY;
1092                    }
1093                    self.prev = LatinCj::Cj;
1094                } else if u >= 0xE000 && u < 0xF900 {
1095                    if let Some(pending) = self.pending_score {
1096                        score += pending;
1097                        self.pending_score = None;
1098                    }
1099                    // Treat the GB18030-required PUA mappings as non-EUC ideographs.
1100                    match u {
1101                        0xE78D..=0xE796
1102                        | 0xE816..=0xE818
1103                        | 0xE81E
1104                        | 0xE826
1105                        | 0xE82B
1106                        | 0xE82C
1107                        | 0xE831
1108                        | 0xE832
1109                        | 0xE83B
1110                        | 0xE843
1111                        | 0xE854
1112                        | 0xE855
1113                        | 0xE864 => {
1114                            score += GBK_SCORE_PER_NON_EUC;
1115                            if self.prev == LatinCj::AsciiLetter {
1116                                score += CJK_LATIN_ADJACENCY_PENALTY;
1117                            }
1118                            self.prev = LatinCj::Cj;
1119                        }
1120                        _ => {
1121                            score += GBK_PUA_PENALTY;
1122                            self.prev = LatinCj::Other;
1123                        }
1124                    }
1125                } else {
1126                    match u {
1127                        0x3000 // Distinct from Korean, space
1128                        | 0x3001 // Distinct from Korean, enumeration comma
1129                        | 0x3002 // Distinct from Korean, full stop
1130                        | 0xFF08 // Distinct from Korean, parenthesis
1131                        | 0xFF09 // Distinct from Korean, parenthesis
1132                        | 0xFF01 // Distinct from Japanese, exclamation
1133                        | 0xFF0C // Distinct from Japanese, comma
1134                        | 0xFF1B // Distinct from Japanese, semicolon
1135                        | 0xFF1F // Distinct from Japanese, question
1136                        => {
1137                            if let Some(pending) = self.pending_score {
1138                                score += pending;
1139                                self.pending_score = None;
1140                            }
1141                            score += CJ_PUNCTUATION;
1142                        }
1143                        0..=0x7F => {
1144                            self.pending_score = None; // Discard pending score
1145                        }
1146                        _ => {
1147                            if let Some(pending) = self.pending_score {
1148                                score += pending;
1149                                self.pending_score = None;
1150                            }
1151                            score += CJK_OTHER;
1152                        }
1153                    }
1154                    self.prev = LatinCj::Other;
1155                }
1156            } else if written == 2 {
1157                if let Some(pending) = self.pending_score {
1158                    score += pending;
1159                    self.pending_score = None;
1160                }
1161                let u = dst[0];
1162                if u >= 0xDB80 && u <= 0xDBFF {
1163                    score += GBK_PUA_PENALTY;
1164                    self.prev = LatinCj::Other;
1165                } else if u >= 0xD480 && u < 0xD880 {
1166                    score += GBK_SCORE_PER_NON_EUC;
1167                    if self.prev == LatinCj::AsciiLetter {
1168                        score += CJK_LATIN_ADJACENCY_PENALTY;
1169                    }
1170                    self.prev = LatinCj::Cj;
1171                } else {
1172                    score += CJK_OTHER;
1173                    self.prev = LatinCj::Other;
1174                }
1175            }
1176            match result {
1177                DecoderResult::InputEmpty => {
1178                    assert_eq!(read, 1);
1179                }
1180                DecoderResult::Malformed(malformed_len, _) => {
1181                    if (self.prev_byte == 0xA0 || self.prev_byte == 0xFE || self.prev_byte == 0xFD)
1182                        && (b < 0x80 || b == 0xFF)
1183                    {
1184                        // Mac OS Chinese Simplified single-byte that conflicts with code page GBK lead byte
1185                        // followed by ASCII or a non-conflicting single-byte extension.
1186                        self.pending_score = None; // Just in case
1187                        score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
1188                        if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1189                            self.prev = LatinCj::AsciiLetter;
1190                        } else if b == 0xFF {
1191                            score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
1192                            self.prev = LatinCj::Other;
1193                        } else {
1194                            self.prev = LatinCj::Other;
1195                        }
1196                        // The GBK decoder has the pending ASCII concept, which is
1197                        // a problem with this trickery, so let's reset the state.
1198                        self.decoder = GBK.new_decoder_without_bom_handling();
1199                    } else if malformed_len == 1 && b == 0xFF {
1200                        // Mac OS Chinese Simplified single-byte extension that doesn't conflict with lead bytes
1201                        self.pending_score = None; // Just in case
1202                        score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
1203                        self.prev = LatinCj::Other;
1204                        // The GBK decoder has the pending ASCII concept, which is
1205                        // a problem with this trickery, so let's reset the state.
1206                        self.decoder = GBK.new_decoder_without_bom_handling();
1207                    } else {
1208                        return None;
1209                    }
1210                }
1211                DecoderResult::OutputFull => {
1212                    unreachable!();
1213                }
1214            }
1215            self.prev_byte = b;
1216        }
1217        if last {
1218            let (result, _, _) = self
1219                .decoder
1220                .decode_to_utf16_without_replacement(b"", &mut dst, true);
1221            match result {
1222                DecoderResult::InputEmpty => {}
1223                DecoderResult::Malformed(_, _) => {
1224                    return None;
1225                }
1226                DecoderResult::OutputFull => {
1227                    unreachable!();
1228                }
1229            }
1230        }
1231        Some(score)
1232    }
1233}
1234
/// Lead-byte test used for Shift_JIS and Big5.
///
/// Returns `true` for 0x91..=0x97 and for 0x8A, 0x8B, 0x8E, 0x9A, 0x9B,
/// 0x9E and 0xB0.
fn problematic_lead(b: u8) -> bool {
    const SINGLES: [u8; 7] = [0x8A, 0x8B, 0x8E, 0x9A, 0x9B, 0x9E, 0xB0];
    if b >= 0x91 && b <= 0x97 {
        return true;
    }
    SINGLES.contains(&b)
}
1242
1243// GBK and EUC-KR
1244fn more_problematic_lead(b: u8) -> bool {
1245    problematic_lead(b) || b == 0x82 || b == 0x84 || b == 0x85 || b == 0xA0
1246}
1247
1248struct ShiftJisCandidate {
1249    decoder: Decoder,
1250    half_width_katakana_seen: bool,
1251    half_width_katakana_state: HalfWidthKatakana,
1252    prev: LatinCj,
1253    prev_byte: u8,
1254    pending_score: Option<i64>,
1255}
1256
1257impl ShiftJisCandidate {
1258    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1259        assert!(self.pending_score.is_none());
1260        if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
1261            s
1262        } else {
1263            self.pending_score = Some(s);
1264            0
1265        }
1266    }
1267
1268    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1269        let mut score = 0i64;
1270        let mut src = [0u8];
1271        let mut dst = [0u16; 2];
1272        for &b in buffer {
1273            src[0] = b;
1274            let (result, read, written) = self
1275                .decoder
1276                .decode_to_utf16_without_replacement(&src, &mut dst, false);
1277            if written > 0 {
1278                let half_width_katakana_state = self.half_width_katakana_state;
1279                self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
1280                let u = dst[0];
1281                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1282                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1283                {
1284                    self.pending_score = None; // Discard pending score
1285                    if self.prev == LatinCj::Cj {
1286                        score += CJK_LATIN_ADJACENCY_PENALTY;
1287                    }
1288                    self.prev = LatinCj::AsciiLetter;
1289                } else if u >= 0xFF61 && u <= 0xFF9F {
1290                    if !self.half_width_katakana_seen {
1291                        self.half_width_katakana_seen = true;
1292                        // To avoid misdetecting title-length inputs
1293                        score += SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY;
1294                    }
1295                    self.pending_score = None; // Discard pending score
1296                    score += HALF_WIDTH_KATAKANA_SCORE;
1297
1298                    if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
1299                        self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
1300                    } else if u >= 0xFF8A && u <= 0xFF8E {
1301                        self.half_width_katakana_state =
1302                            HalfWidthKatakana::DakutenOrHandakutenAllowed;
1303                    } else if u == 0xFF9E {
1304                        if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
1305                            score += IMPLAUSIBILITY_PENALTY;
1306                        } else {
1307                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1308                        }
1309                    } else if u == 0xFF9F {
1310                        if half_width_katakana_state
1311                            != HalfWidthKatakana::DakutenOrHandakutenAllowed
1312                        {
1313                            score += IMPLAUSIBILITY_PENALTY;
1314                        } else {
1315                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1316                        }
1317                    }
1318
1319                    if self.prev == LatinCj::AsciiLetter {
1320                        score += CJK_LATIN_ADJACENCY_PENALTY;
1321                    }
1322                    self.prev = LatinCj::Cj;
1323                } else if u >= 0x3040 && u < 0x3100 {
1324                    if let Some(pending) = self.pending_score {
1325                        score += pending;
1326                        self.pending_score = None;
1327                    }
1328                    score += SHIFT_JIS_SCORE_PER_KANA;
1329                    if self.prev == LatinCj::AsciiLetter {
1330                        score += CJK_LATIN_ADJACENCY_PENALTY;
1331                    }
1332                    self.prev = LatinCj::Cj;
1333                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1334                    if let Some(pending) = self.pending_score {
1335                        score += pending;
1336                        self.pending_score = None;
1337                    }
1338                    if self.prev_byte < 0x98 || (self.prev_byte == 0x98 && b < 0x73) {
1339                        score += self.maybe_set_as_pending(
1340                            SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI
1341                                + cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji),
1342                        );
1343                    } else {
1344                        score += self.maybe_set_as_pending(SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI);
1345                    }
1346                    if self.prev == LatinCj::AsciiLetter {
1347                        score += CJK_LATIN_ADJACENCY_PENALTY;
1348                    }
1349                    self.prev = LatinCj::Cj;
1350                } else if u >= 0xE000 && u < 0xF900 {
1351                    if let Some(pending) = self.pending_score {
1352                        score += pending;
1353                        self.pending_score = None;
1354                    }
1355                    score += SHIFT_JIS_PUA_PENALTY;
1356                    self.prev = LatinCj::Other;
1357                } else {
1358                    match u {
1359                        0x3000 // Distinct from Korean, space
1360                        | 0x3001 // Distinct from Korean, enumeration comma
1361                        | 0x3002 // Distinct from Korean, full stop
1362                        | 0xFF08 // Distinct from Korean, parenthesis
1363                        | 0xFF09 // Distinct from Korean, parenthesis
1364                        => {
1365                            if let Some(pending) = self.pending_score {
1366                                score += pending;
1367                                self.pending_score = None;
1368                            }
1369                            // Not really needed for CJK distinction
1370                            // but let's give non-zero score for these
1371                            // common byte pairs anyway.
1372                            score += CJ_PUNCTUATION;
1373                        }
1374                        0..=0x7F => {
1375                            self.pending_score = None; // Discard pending score
1376                        }
1377                        0x80 => {
1378                            // This is a control character that overlaps euro
1379                            // in windows-1252 and happens to be a non-error
1380                            // is Shift_JIS.
1381                            self.pending_score = None; // Discard pending score
1382                            score += IMPLAUSIBILITY_PENALTY;
1383                        }
1384                        _ => {
1385                            if let Some(pending) = self.pending_score {
1386                                score += pending;
1387                                self.pending_score = None;
1388                            }
1389                            score += CJK_OTHER;
1390                        }
1391                    }
1392                    self.prev = LatinCj::Other;
1393                }
1394            }
1395            match result {
1396                DecoderResult::InputEmpty => {
1397                    assert_eq!(read, 1);
1398                }
1399                DecoderResult::Malformed(malformed_len, _) => {
1400                    if (((self.prev_byte >= 0x81 && self.prev_byte <= 0x9F)
1401                        || (self.prev_byte >= 0xE0 && self.prev_byte <= 0xFC))
1402                        && ((b >= 0x40 && b <= 0x7E) || (b >= 0x80 && b <= 0xFC)))
1403                        && !((self.prev_byte == 0x82 && b >= 0xFA)
1404                            || (self.prev_byte == 0x84 && ((b >= 0xDD && b <= 0xE4) || b >= 0xFB))
1405                            || (self.prev_byte == 0x86 && b >= 0xF2 && b <= 0xFA)
1406                            || (self.prev_byte == 0x87 && b >= 0x77 && b <= 0x7D)
1407                            || (self.prev_byte == 0xFC && b >= 0xF5))
1408                    {
1409                        // Shift_JIS2004 or MacJapanese
1410                        if let Some(pending) = self.pending_score {
1411                            score += pending;
1412                            self.pending_score = None;
1413                        }
1414                        score += SHIFT_JIS_EXTENSION_PENALTY;
1415                        // Approximate boundary
1416                        if self.prev_byte < 0x87 {
1417                            self.prev = LatinCj::Other;
1418                        } else {
1419                            if self.prev == LatinCj::AsciiLetter {
1420                                score += CJK_LATIN_ADJACENCY_PENALTY;
1421                            }
1422                            self.prev = LatinCj::Cj;
1423                        }
1424                    } else if malformed_len == 1 && (b == 0xA0 || b >= 0xFD) {
1425                        self.pending_score = None; // Just in case
1426                        score += SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY;
1427                        self.prev = LatinCj::Other;
1428                    } else {
1429                        return None;
1430                    }
1431                }
1432                DecoderResult::OutputFull => {
1433                    unreachable!();
1434                }
1435            }
1436            self.prev_byte = b;
1437        }
1438        if last {
1439            let (result, _, _) = self
1440                .decoder
1441                .decode_to_utf16_without_replacement(b"", &mut dst, true);
1442            match result {
1443                DecoderResult::InputEmpty => {}
1444                DecoderResult::Malformed(_, _) => {
1445                    return None;
1446                }
1447                DecoderResult::OutputFull => {
1448                    unreachable!();
1449                }
1450            }
1451        }
1452        Some(score)
1453    }
1454}
1455
/// Scoring state for the EUC-JP candidate.
struct EucJpCandidate {
    /// Streaming decoder, fed one input byte at a time.
    decoder: Decoder,
    /// Set when the first decoded code unit at or above 0x80 is seen; if
    /// that first unit is in U+3040..U+3100, a one-time penalty is applied.
    non_ascii_seen: bool,
    /// Which half-width voicing mark the previous character allows next.
    half_width_katakana_state: HalfWidthKatakana,
    /// Coarse class of the previously scored character.
    prev: LatinCj,
    /// Compared against 0xD0 to tell level 1 from level 2 kanji.
    /// NOTE(review): presumably the previous input byte -- its update is
    /// outside this excerpt; confirm.
    prev_byte: u8,
    /// Compared against 0x8F to score a kanji as "other".
    /// NOTE(review): presumably the input byte two positions back -- its
    /// update is outside this excerpt; confirm.
    prev_prev_byte: u8,
}
1464
1465impl EucJpCandidate {
1466    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1467        let mut score = 0i64;
1468        let mut src = [0u8];
1469        let mut dst = [0u16; 2];
1470        for &b in buffer {
1471            src[0] = b;
1472            let (result, read, written) = self
1473                .decoder
1474                .decode_to_utf16_without_replacement(&src, &mut dst, false);
1475            if written > 0 {
1476                let half_width_katakana_state = self.half_width_katakana_state;
1477                self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
1478                let u = dst[0];
1479                if !self.non_ascii_seen && u >= 0x80 {
1480                    self.non_ascii_seen = true;
1481                    if u >= 0xFF61 && u <= 0xFF9F {
1482                        // return None;
1483                    }
1484                    if u >= 0x3040 && u < 0x3100 {
1485                        // Remove the kana advantage over initial Big5
1486                        // hanzi.
1487                        score += EUC_JP_INITIAL_KANA_PENALTY;
1488                    }
1489                }
1490                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1491                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1492                {
1493                    if self.prev == LatinCj::Cj {
1494                        score += CJK_LATIN_ADJACENCY_PENALTY;
1495                    }
1496                    self.prev = LatinCj::AsciiLetter;
1497                } else if u >= 0xFF61 && u <= 0xFF9F {
1498                    score += HALF_WIDTH_KATAKANA_SCORE;
1499
1500                    if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
1501                        self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
1502                    } else if u >= 0xFF8A && u <= 0xFF8E {
1503                        self.half_width_katakana_state =
1504                            HalfWidthKatakana::DakutenOrHandakutenAllowed;
1505                    } else if u == 0xFF9E {
1506                        if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
1507                            score += IMPLAUSIBILITY_PENALTY;
1508                        } else {
1509                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1510                        }
1511                    } else if u == 0xFF9F {
1512                        if half_width_katakana_state
1513                            != HalfWidthKatakana::DakutenOrHandakutenAllowed
1514                        {
1515                            score += IMPLAUSIBILITY_PENALTY;
1516                        } else {
1517                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1518                        }
1519                    }
1520
1521                    if self.prev == LatinCj::AsciiLetter {
1522                        score += CJK_LATIN_ADJACENCY_PENALTY;
1523                    }
1524                    self.prev = LatinCj::Other;
1525                } else if (u >= 0x3041 && u <= 0x3093) || (u >= 0x30A1 && u <= 0x30F6) {
1526                    match u {
1527                        0x3090 // hiragana wi
1528                        | 0x3091 // hiragana we
1529                        | 0x30F0 // katakana wi
1530                        | 0x30F1 // katakana we
1531                        => {
1532                            // Remove advantage over Big5 Hanzi
1533                            score += EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA;
1534                        }
1535                        _ => {
1536                            score += EUC_JP_SCORE_PER_KANA;
1537                        }
1538                    }
1539                    if self.prev == LatinCj::AsciiLetter {
1540                        score += CJK_LATIN_ADJACENCY_PENALTY;
1541                    }
1542                    self.prev = LatinCj::Cj;
1543                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1544                    if self.prev_prev_byte == 0x8F {
1545                        score += EUC_JP_SCORE_PER_OTHER_KANJI;
1546                    } else if self.prev_byte < 0xD0 {
1547                        score += EUC_JP_SCORE_PER_LEVEL_1_KANJI;
1548                        score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji);
1549                    } else {
1550                        score += EUC_JP_SCORE_PER_LEVEL_2_KANJI;
1551                    }
1552                    if self.prev == LatinCj::AsciiLetter {
1553                        score += CJK_LATIN_ADJACENCY_PENALTY;
1554                    }
1555                    self.prev = LatinCj::Cj;
1556                } else {
1557                    match u {
1558                        0x3000 // Distinct from Korean, space
1559                        | 0x3001 // Distinct from Korean, enumeration comma
1560                        | 0x3002 // Distinct from Korean, full stop
1561                        | 0xFF08 // Distinct from Korean, parenthesis
1562                        | 0xFF09 // Distinct from Korean, parenthesis
1563                        => {
1564                            score += CJ_PUNCTUATION;
1565                        }
1566                        0..=0x7F => {}
1567                        _ => {
1568                            score += CJK_OTHER;
1569                        }
1570                    }
1571                    self.prev = LatinCj::Other;
1572                }
1573            }
1574            match result {
1575                DecoderResult::InputEmpty => {
1576                    assert_eq!(read, 1);
1577                }
1578                DecoderResult::Malformed(_, _) => {
1579                    if b >= 0xA1
1580                        && b <= 0xFE
1581                        && self.prev_byte >= 0xA1
1582                        && self.prev_byte <= 0xFE
1583                        && ((self.prev_prev_byte != 0x8F
1584                            && !(self.prev_byte == 0xA8 && b >= 0xDF && b <= 0xE6)
1585                            && !(self.prev_byte == 0xAC && b >= 0xF4 && b <= 0xFC)
1586                            && !(self.prev_byte == 0xAD && b >= 0xD8 && b <= 0xDE))
1587                            || (self.prev_prev_byte == 0x8F
1588                                && self.prev_byte != 0xA2
1589                                && self.prev_byte != 0xA6
1590                                && self.prev_byte != 0xA7
1591                                && self.prev_byte != 0xA9
1592                                && self.prev_byte != 0xAA
1593                                && self.prev_byte != 0xAB
1594                                && self.prev_byte != 0xED
1595                                && !(self.prev_byte == 0xFE && b >= 0xF7)))
1596                    {
1597                        score += EUC_JP_EXTENSION_PENALTY;
1598                        if self.prev == LatinCj::AsciiLetter {
1599                            score += CJK_LATIN_ADJACENCY_PENALTY;
1600                        }
1601                        self.prev = LatinCj::Cj;
1602                    } else {
1603                        return None;
1604                    }
1605                }
1606                DecoderResult::OutputFull => {
1607                    unreachable!();
1608                }
1609            }
1610            self.prev_prev_byte = self.prev_byte;
1611            self.prev_byte = b;
1612        }
1613        if last {
1614            let (result, _, _) = self
1615                .decoder
1616                .decode_to_utf16_without_replacement(b"", &mut dst, true);
1617            match result {
1618                DecoderResult::InputEmpty => {}
1619                DecoderResult::Malformed(_, _) => {
1620                    return None;
1621                }
1622                DecoderResult::OutputFull => {
1623                    unreachable!();
1624                }
1625            }
1626        }
1627        Some(score)
1628    }
1629}
1630
1631struct Big5Candidate {
1632    decoder: Decoder,
1633    prev: LatinCj,
1634    prev_byte: u8,
1635    pending_score: Option<i64>,
1636}
1637
1638impl Big5Candidate {
1639    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1640        assert!(self.pending_score.is_none());
1641        if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
1642            s
1643        } else {
1644            self.pending_score = Some(s);
1645            0
1646        }
1647    }
1648
1649    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1650        let mut score = 0i64;
1651        let mut src = [0u8];
1652        let mut dst = [0u16; 2];
1653        for &b in buffer {
1654            src[0] = b;
1655            let (result, read, written) = self
1656                .decoder
1657                .decode_to_utf16_without_replacement(&src, &mut dst, false);
1658            if written == 1 {
1659                let u = dst[0];
1660                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1661                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1662                {
1663                    self.pending_score = None; // Discard pending score
1664                    if self.prev == LatinCj::Cj {
1665                        score += CJK_LATIN_ADJACENCY_PENALTY;
1666                    }
1667                    self.prev = LatinCj::AsciiLetter;
1668                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1669                    if let Some(pending) = self.pending_score {
1670                        score += pending;
1671                        self.pending_score = None;
1672                    }
1673                    match self.prev_byte {
1674                        0xA4..=0xC6 => {
1675                            score += self.maybe_set_as_pending(BIG5_SCORE_PER_LEVEL_1_HANZI);
1676                            // score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_traditional);
1677                        }
1678                        _ => {
1679                            score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
1680                        }
1681                    }
1682                    if self.prev == LatinCj::AsciiLetter {
1683                        score += CJK_LATIN_ADJACENCY_PENALTY;
1684                    }
1685                    self.prev = LatinCj::Cj;
1686                } else {
1687                    match u {
1688                        0x3000 // Distinct from Korean, space
1689                        | 0x3001 // Distinct from Korean, enumeration comma
1690                        | 0x3002 // Distinct from Korean, full stop
1691                        | 0xFF08 // Distinct from Korean, parenthesis
1692                        | 0xFF09 // Distinct from Korean, parenthesis
1693                        | 0xFF01 // Distinct from Japanese, exclamation
1694                        | 0xFF0C // Distinct from Japanese, comma
1695                        | 0xFF1B // Distinct from Japanese, semicolon
1696                        | 0xFF1F // Distinct from Japanese, question
1697                        => {
1698                            if let Some(pending) = self.pending_score {
1699                                score += pending;
1700                                self.pending_score = None;
1701                            }
1702                            // Not really needed for CJK distinction
1703                            // but let's give non-zero score for these
1704                            // common byte pairs anyway.
1705                            score += CJ_PUNCTUATION;
1706                        }
1707                        0..=0x7F => {
1708                            self.pending_score = None; // Discard pending score
1709                        }
1710                        _ => {
1711                            if let Some(pending) = self.pending_score {
1712                                score += pending;
1713                                self.pending_score = None;
1714                            }
1715                            score += CJK_OTHER;
1716                        }
1717                    }
1718                    self.prev = LatinCj::Other;
1719                }
1720            } else if written == 2 {
1721                if let Some(pending) = self.pending_score {
1722                    score += pending;
1723                    self.pending_score = None;
1724                }
1725                if dst[0] == 0xCA || dst[0] == 0xEA {
1726                    score += CJK_OTHER;
1727                    self.prev = LatinCj::Other;
1728                } else {
1729                    debug_assert!(dst[0] >= 0xD480 && dst[0] < 0xD880);
1730                    score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
1731                    if self.prev == LatinCj::AsciiLetter {
1732                        score += CJK_LATIN_ADJACENCY_PENALTY;
1733                    }
1734                    self.prev = LatinCj::Cj;
1735                }
1736            }
1737            match result {
1738                DecoderResult::InputEmpty => {
1739                    assert_eq!(read, 1);
1740                }
1741                DecoderResult::Malformed(malformed_len, _) => {
1742                    if self.prev_byte >= 0x81
1743                        && self.prev_byte <= 0xFE
1744                        && ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE))
1745                    {
1746                        // The byte pair is in the Big5 range but unmapped.
1747                        // Treat as PUA to avoid rejecting Big5-UAO, etc.
1748                        // We don't reprocess `b` even if ASCII, since it's
1749                        // logically part of the pair.
1750                        if let Some(pending) = self.pending_score {
1751                            score += pending;
1752                            self.pending_score = None;
1753                        }
1754                        score += BIG5_PUA_PENALTY;
1755                        // Assume Hanzi semantics
1756                        if self.prev == LatinCj::AsciiLetter {
1757                            score += CJK_LATIN_ADJACENCY_PENALTY;
1758                        }
1759                        self.prev = LatinCj::Cj;
1760                    } else if (self.prev_byte == 0xA0
1761                        || self.prev_byte == 0xFD
1762                        || self.prev_byte == 0xFE)
1763                        && (b < 0x80 || b == 0xFF)
1764                    {
1765                        // Mac OS Chinese Traditional single-byte that conflicts with code page Big5 lead byte
1766                        // followed by ASCII or a non-conflicting single-byte extension.
1767                        self.pending_score = None; // Just in case
1768                        score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1769                        if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1770                            self.prev = LatinCj::AsciiLetter;
1771                        } else if b == 0xFF {
1772                            score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1773                            self.prev = LatinCj::Other;
1774                        } else {
1775                            self.prev = LatinCj::Other;
1776                        }
1777                    } else if malformed_len == 1 && b == 0xFF {
1778                        // Mac OS Chinese Traditional single-byte extension that doesn't conflict with lead bytes
1779                        self.pending_score = None; // Just in case
1780                        score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1781                        self.prev = LatinCj::Other;
1782                    } else {
1783                        return None;
1784                    }
1785                }
1786                DecoderResult::OutputFull => {
1787                    unreachable!();
1788                }
1789            }
1790            self.prev_byte = b;
1791        }
1792        if last {
1793            let (result, _, _) = self
1794                .decoder
1795                .decode_to_utf16_without_replacement(b"", &mut dst, true);
1796            match result {
1797                DecoderResult::InputEmpty => {}
1798                DecoderResult::Malformed(_, _) => {
1799                    return None;
1800                }
1801                DecoderResult::OutputFull => {
1802                    unreachable!();
1803                }
1804            }
1805        }
1806        Some(score)
1807    }
1808}
1809
1810struct EucKrCandidate {
1811    decoder: Decoder,
1812    prev_byte: u8,
1813    prev_was_euc_range: bool,
1814    prev: LatinKorean,
1815    current_word_len: u64,
1816    pending_score: Option<i64>,
1817}
1818
1819impl EucKrCandidate {
1820    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1821        assert!(self.pending_score.is_none());
1822        if self.prev == LatinKorean::Hangul || !more_problematic_lead(self.prev_byte) {
1823            s
1824        } else {
1825            self.pending_score = Some(s);
1826            0
1827        }
1828    }
1829
1830    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1831        let mut score = 0i64;
1832        let mut src = [0u8];
1833        let mut dst = [0u16; 2];
1834        for &b in buffer {
1835            let in_euc_range = b >= 0xA1 && b <= 0xFE;
1836            src[0] = b;
1837            let (result, read, written) = self
1838                .decoder
1839                .decode_to_utf16_without_replacement(&src, &mut dst, false);
1840            if written > 0 {
1841                let u = dst[0];
1842                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1843                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1844                {
1845                    self.pending_score = None; // Discard pending score
1846                    match self.prev {
1847                        LatinKorean::Hangul | LatinKorean::Hanja => {
1848                            score += CJK_LATIN_ADJACENCY_PENALTY;
1849                        }
1850                        _ => {}
1851                    }
1852                    self.prev = LatinKorean::AsciiLetter;
1853                    self.current_word_len = 0;
1854                } else if u >= 0xAC00 && u <= 0xD7A3 {
1855                    if let Some(pending) = self.pending_score {
1856                        score += pending;
1857                        self.pending_score = None;
1858                    }
1859                    if self.prev_was_euc_range && in_euc_range {
1860                        score += EUC_KR_SCORE_PER_EUC_HANGUL;
1861                        score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_hangul);
1862                    } else {
1863                        score += self.maybe_set_as_pending(EUC_KR_SCORE_PER_NON_EUC_HANGUL);
1864                    }
1865                    if self.prev == LatinKorean::AsciiLetter {
1866                        score += CJK_LATIN_ADJACENCY_PENALTY;
1867                    }
1868                    self.prev = LatinKorean::Hangul;
1869                    self.current_word_len += 1;
1870                    if self.current_word_len > 5 {
1871                        score += EUC_KR_LONG_WORD_PENALTY;
1872                    }
1873                } else if (u >= 0x4E00 && u < 0xAC00) || (u >= 0xF900 && u <= 0xFA0B) {
1874                    if let Some(pending) = self.pending_score {
1875                        score += pending;
1876                        self.pending_score = None;
1877                    }
1878                    score += EUC_KR_SCORE_PER_HANJA;
1879                    match self.prev {
1880                        LatinKorean::AsciiLetter => {
1881                            score += CJK_LATIN_ADJACENCY_PENALTY;
1882                        }
1883                        LatinKorean::Hangul => {
1884                            score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY;
1885                        }
1886                        _ => {}
1887                    }
1888                    self.prev = LatinKorean::Hanja;
1889                    self.current_word_len += 1;
1890                    if self.current_word_len > 5 {
1891                        score += EUC_KR_LONG_WORD_PENALTY;
1892                    }
1893                } else {
1894                    if u >= 0x80 {
1895                        if let Some(pending) = self.pending_score {
1896                            score += pending;
1897                            self.pending_score = None;
1898                        }
1899                        score += CJK_OTHER;
1900                    } else {
1901                        self.pending_score = None; // Discard pending score
1902                    }
1903                    self.prev = LatinKorean::Other;
1904                    self.current_word_len = 0;
1905                }
1906            }
1907            match result {
1908                DecoderResult::InputEmpty => {
1909                    assert_eq!(read, 1);
1910                }
1911                DecoderResult::Malformed(malformed_len, _) => {
1912                    if (self.prev_byte == 0xC9 || self.prev_byte == 0xFE) && b >= 0xA1 && b <= 0xFE
1913                    {
1914                        if let Some(pending) = self.pending_score {
1915                            score += pending;
1916                            self.pending_score = None;
1917                        }
1918                        // The byte pair is in code page 949 EUDC range
1919                        score += EUC_KR_PUA_PENALTY;
1920                        // Assume Hanja semantics
1921                        match self.prev {
1922                            LatinKorean::AsciiLetter => {
1923                                score += CJK_LATIN_ADJACENCY_PENALTY;
1924                            }
1925                            LatinKorean::Hangul => {
1926                                score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY;
1927                            }
1928                            _ => {}
1929                        }
1930                        self.prev = LatinKorean::Hanja;
1931                        self.current_word_len += 1;
1932                        if self.current_word_len > 5 {
1933                            score += EUC_KR_LONG_WORD_PENALTY;
1934                        }
1935                    } else if (self.prev_byte == 0xA1
1936                        || (self.prev_byte >= 0xA3 && self.prev_byte <= 0xA8)
1937                        || (self.prev_byte >= 0xAA && self.prev_byte <= 0xAD))
1938                        && (b >= 0x7B && b <= 0x7D)
1939                    {
1940                        if let Some(pending) = self.pending_score {
1941                            score += pending;
1942                            self.pending_score = None;
1943                        }
1944                        // MacKorean symbols in range not part of code page 949
1945                        score += EUC_KR_MAC_KOREAN_PENALTY;
1946                        self.prev = LatinKorean::Other;
1947                        self.current_word_len = 0;
1948                    } else if (self.prev_byte >= 0x81 && self.prev_byte <= 0x84)
1949                        && (b <= 0x80 || b == 0xFF)
1950                    {
1951                        // MacKorean single-byte that conflicts with code page 949 lead byte
1952                        // followed by ASCII or a non-conflicting single-byte extension.
1953                        self.pending_score = None; // Just in case
1954                        score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
1955                        if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1956                            self.prev = LatinKorean::AsciiLetter;
1957                        } else if b == 0x80 || b == 0xFF {
1958                            score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
1959                            self.prev = LatinKorean::Other;
1960                        } else {
1961                            self.prev = LatinKorean::Other;
1962                        }
1963                        self.current_word_len = 0;
1964                    } else if malformed_len == 1 && (b == 0x80 || b == 0xFF) {
1965                        // MacKorean single-byte extensions that don't conflict with lead bytes
1966                        self.pending_score = None; // Just in case
1967                        score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
1968                        self.prev = LatinKorean::Other;
1969                        self.current_word_len = 0;
1970                    } else {
1971                        return None;
1972                    }
1973                }
1974                DecoderResult::OutputFull => {
1975                    unreachable!();
1976                }
1977            }
1978            self.prev_was_euc_range = in_euc_range;
1979            self.prev_byte = b;
1980        }
1981        if last {
1982            let (result, _, _) = self
1983                .decoder
1984                .decode_to_utf16_without_replacement(b"", &mut dst, true);
1985            match result {
1986                DecoderResult::InputEmpty => {}
1987                DecoderResult::Malformed(_, _) => {
1988                    return None;
1989                }
1990                DecoderResult::OutputFull => {
1991                    unreachable!();
1992                }
1993            }
1994        }
1995        Some(score)
1996    }
1997}
1998
1999enum InnerCandidate {
2000    Latin(LatinCandidate),
2001    NonLatinCased(NonLatinCasedCandidate),
2002    Caseless(CaselessCandidate),
2003    ArabicFrench(ArabicFrenchCandidate),
2004    Logical(LogicalCandidate),
2005    Visual(VisualCandidate),
2006    Utf8(Utf8Candidate),
2007    Iso2022(Iso2022Candidate),
2008    Shift(ShiftJisCandidate),
2009    EucJp(EucJpCandidate),
2010    EucKr(EucKrCandidate),
2011    Big5(Big5Candidate),
2012    Gbk(GbkCandidate),
2013}
2014
2015impl InnerCandidate {
2016    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
2017        match self {
2018            InnerCandidate::Latin(c) => {
2019                if let Some(new_score) = c.feed(buffer) {
2020                    if last {
2021                        // Treat EOF as space-like
2022                        if let Some(additional_score) = c.feed(b" ") {
2023                            Some(new_score + additional_score)
2024                        } else {
2025                            None
2026                        }
2027                    } else {
2028                        Some(new_score)
2029                    }
2030                } else {
2031                    None
2032                }
2033            }
2034            InnerCandidate::NonLatinCased(c) => {
2035                if let Some(new_score) = c.feed(buffer) {
2036                    if last {
2037                        // Treat EOF as space-like
2038                        if let Some(additional_score) = c.feed(b" ") {
2039                            Some(new_score + additional_score)
2040                        } else {
2041                            None
2042                        }
2043                    } else {
2044                        Some(new_score)
2045                    }
2046                } else {
2047                    None
2048                }
2049            }
2050            InnerCandidate::Caseless(c) => {
2051                if let Some(new_score) = c.feed(buffer) {
2052                    if last {
2053                        // Treat EOF as space-like
2054                        if let Some(additional_score) = c.feed(b" ") {
2055                            Some(new_score + additional_score)
2056                        } else {
2057                            None
2058                        }
2059                    } else {
2060                        Some(new_score)
2061                    }
2062                } else {
2063                    None
2064                }
2065            }
2066            InnerCandidate::ArabicFrench(c) => {
2067                if let Some(new_score) = c.feed(buffer) {
2068                    if last {
2069                        // Treat EOF as space-like
2070                        if let Some(additional_score) = c.feed(b" ") {
2071                            Some(new_score + additional_score)
2072                        } else {
2073                            None
2074                        }
2075                    } else {
2076                        Some(new_score)
2077                    }
2078                } else {
2079                    None
2080                }
2081            }
2082            InnerCandidate::Logical(c) => {
2083                if let Some(new_score) = c.feed(buffer) {
2084                    if last {
2085                        // Treat EOF as space-like
2086                        if let Some(additional_score) = c.feed(b" ") {
2087                            Some(new_score + additional_score)
2088                        } else {
2089                            None
2090                        }
2091                    } else {
2092                        Some(new_score)
2093                    }
2094                } else {
2095                    None
2096                }
2097            }
2098            InnerCandidate::Visual(c) => {
2099                if let Some(new_score) = c.feed(buffer) {
2100                    if last {
2101                        // Treat EOF as space-like
2102                        if let Some(additional_score) = c.feed(b" ") {
2103                            Some(new_score + additional_score)
2104                        } else {
2105                            None
2106                        }
2107                    } else {
2108                        Some(new_score)
2109                    }
2110                } else {
2111                    None
2112                }
2113            }
2114            InnerCandidate::Utf8(c) => c.feed(buffer, last),
2115            InnerCandidate::Iso2022(c) => c.feed(buffer, last),
2116            InnerCandidate::Shift(c) => c.feed(buffer, last),
2117            InnerCandidate::EucJp(c) => c.feed(buffer, last),
2118            InnerCandidate::EucKr(c) => c.feed(buffer, last),
2119            InnerCandidate::Big5(c) => c.feed(buffer, last),
2120            InnerCandidate::Gbk(c) => c.feed(buffer, last),
2121        }
2122    }
2123}
2124
2125fn encoding_for_tld(tld: Tld) -> usize {
2126    match tld {
2127        Tld::CentralWindows | Tld::CentralCyrillic => EncodingDetector::CENTRAL_WINDOWS_INDEX,
2128        Tld::Cyrillic => EncodingDetector::CYRILLIC_WINDOWS_INDEX,
2129        Tld::Generic | Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic | Tld::Eu => {
2130            EncodingDetector::WESTERN_INDEX
2131        }
2132        Tld::IcelandicFaroese => EncodingDetector::ICELANDIC_INDEX,
2133        Tld::Greek => EncodingDetector::GREEK_ISO_INDEX,
2134        Tld::TurkishAzeri => EncodingDetector::TURKISH_INDEX,
2135        Tld::Hebrew => EncodingDetector::LOGICAL_INDEX,
2136        Tld::Arabic => EncodingDetector::ARABIC_WINDOWS_INDEX,
2137        Tld::Baltic => EncodingDetector::BALTIC_WINDOWS_INDEX,
2138        Tld::Vietnamese => EncodingDetector::VIETNAMESE_INDEX,
2139        Tld::Thai => EncodingDetector::THAI_INDEX,
2140        Tld::Simplified | Tld::SimplifiedTraditional => EncodingDetector::GBK_INDEX,
2141        Tld::Traditional | Tld::TraditionalSimplified => EncodingDetector::BIG5_INDEX,
2142        Tld::Japanese => EncodingDetector::SHIFT_JIS_INDEX,
2143        Tld::Korean => EncodingDetector::EUC_KR_INDEX,
2144        Tld::CentralIso => EncodingDetector::CENTRAL_ISO_INDEX,
2145    }
2146}
2147
2148fn encoding_is_native_to_tld(tld: Tld, encoding: usize) -> bool {
2149    match tld {
2150        Tld::CentralWindows => encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX,
2151        Tld::Cyrillic => {
2152            encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2153                || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2154                || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2155                || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2156        }
2157        Tld::Western => encoding == EncodingDetector::WESTERN_INDEX,
2158        Tld::Greek => {
2159            encoding == EncodingDetector::GREEK_WINDOWS_INDEX
2160                || encoding == EncodingDetector::GREEK_ISO_INDEX
2161        }
2162        Tld::TurkishAzeri => encoding == EncodingDetector::TURKISH_INDEX,
2163        Tld::Hebrew => encoding == EncodingDetector::LOGICAL_INDEX,
2164        Tld::Arabic => {
2165            encoding == EncodingDetector::ARABIC_WINDOWS_INDEX
2166                || encoding == EncodingDetector::ARABIC_ISO_INDEX
2167        }
2168        Tld::Baltic => {
2169            encoding == EncodingDetector::BALTIC_WINDOWS_INDEX
2170                || encoding == EncodingDetector::BALTIC_ISO13_INDEX
2171                || encoding == EncodingDetector::BALTIC_ISO4_INDEX
2172        }
2173        Tld::Vietnamese => encoding == EncodingDetector::VIETNAMESE_INDEX,
2174        Tld::Thai => encoding == EncodingDetector::THAI_INDEX,
2175        Tld::Simplified => encoding == EncodingDetector::GBK_INDEX,
2176        Tld::Traditional => encoding == EncodingDetector::BIG5_INDEX,
2177        Tld::Japanese => {
2178            encoding == EncodingDetector::SHIFT_JIS_INDEX
2179                || encoding == EncodingDetector::EUC_JP_INDEX
2180        }
2181        Tld::Korean => encoding == EncodingDetector::EUC_KR_INDEX,
2182        Tld::SimplifiedTraditional | Tld::TraditionalSimplified => {
2183            encoding == EncodingDetector::GBK_INDEX || encoding == EncodingDetector::BIG5_INDEX
2184        }
2185        Tld::CentralIso => encoding == EncodingDetector::CENTRAL_ISO_INDEX,
2186        Tld::IcelandicFaroese => encoding == EncodingDetector::ICELANDIC_INDEX,
2187        Tld::WesternCyrillic => {
2188            encoding == EncodingDetector::WESTERN_INDEX
2189                || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2190                || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2191                || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2192                || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2193        }
2194        Tld::CentralCyrillic => {
2195            encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX
2196                || encoding == EncodingDetector::CENTRAL_ISO_INDEX
2197                || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2198                || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2199                || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2200                || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2201        }
2202        Tld::WesternArabic => {
2203            encoding == EncodingDetector::WESTERN_INDEX
2204                || encoding == EncodingDetector::ARABIC_WINDOWS_INDEX
2205                || encoding == EncodingDetector::ARABIC_ISO_INDEX
2206        }
2207        Tld::Eu => {
2208            encoding == EncodingDetector::WESTERN_INDEX
2209                || encoding == EncodingDetector::ICELANDIC_INDEX
2210                || encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX
2211                || encoding == EncodingDetector::CENTRAL_ISO_INDEX
2212                || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2213                || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2214                || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2215                || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2216                || encoding == EncodingDetector::GREEK_WINDOWS_INDEX
2217                || encoding == EncodingDetector::GREEK_ISO_INDEX
2218                || encoding == EncodingDetector::BALTIC_WINDOWS_INDEX
2219                || encoding == EncodingDetector::BALTIC_ISO13_INDEX
2220                || encoding == EncodingDetector::BALTIC_ISO4_INDEX
2221        }
2222        Tld::Generic => false,
2223    }
2224}
2225
2226fn score_adjustment(score: i64, encoding: usize, tld: Tld) -> i64 {
2227    if score < 1 {
2228        return 0;
2229    }
2230    // This is the most ad hoc part of this library.
2231    let (divisor, constant) = match tld {
2232        Tld::Generic => {
2233            unreachable!();
2234        }
2235        Tld::CentralWindows | Tld::CentralIso => {
2236            match encoding {
2237                EncodingDetector::WESTERN_INDEX
2238                | EncodingDetector::ICELANDIC_INDEX
2239                | EncodingDetector::BALTIC_WINDOWS_INDEX
2240                | EncodingDetector::BALTIC_ISO4_INDEX
2241                | EncodingDetector::BALTIC_ISO13_INDEX
2242                | EncodingDetector::VIETNAMESE_INDEX
2243                | EncodingDetector::TURKISH_INDEX => {
2244                    // XXX Tune this better instead of this kind of absolute.
2245                    return score;
2246                }
2247                _ => (50, 60),
2248            }
2249        }
2250        Tld::Cyrillic => {
2251            match encoding {
2252                EncodingDetector::BIG5_INDEX
2253                | EncodingDetector::GBK_INDEX
2254                | EncodingDetector::EUC_JP_INDEX
2255                | EncodingDetector::CENTRAL_WINDOWS_INDEX
2256                | EncodingDetector::CENTRAL_ISO_INDEX
2257                | EncodingDetector::GREEK_WINDOWS_INDEX
2258                | EncodingDetector::GREEK_ISO_INDEX
2259                | EncodingDetector::VISUAL_INDEX
2260                | EncodingDetector::LOGICAL_INDEX
2261                | EncodingDetector::BALTIC_WINDOWS_INDEX
2262                | EncodingDetector::BALTIC_ISO4_INDEX
2263                | EncodingDetector::BALTIC_ISO13_INDEX
2264                | EncodingDetector::TURKISH_INDEX => {
2265                    // XXX Tune this better instead of this kind of absolute.
2266                    return score;
2267                }
2268                _ => (50, 60),
2269            }
2270        }
2271        Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic => {
2272            match encoding {
2273                EncodingDetector::CENTRAL_WINDOWS_INDEX
2274                | EncodingDetector::CENTRAL_ISO_INDEX
2275                | EncodingDetector::BALTIC_WINDOWS_INDEX
2276                | EncodingDetector::BALTIC_ISO4_INDEX
2277                | EncodingDetector::BALTIC_ISO13_INDEX
2278                | EncodingDetector::TURKISH_INDEX
2279                | EncodingDetector::VIETNAMESE_INDEX => {
2280                    // XXX Tune this better instead of this kind of absolute.
2281                    return score;
2282                }
2283                _ => (50, 60),
2284            }
2285        }
2286        Tld::Greek => {
2287            match encoding {
2288                EncodingDetector::BIG5_INDEX
2289                | EncodingDetector::GBK_INDEX
2290                | EncodingDetector::EUC_JP_INDEX
2291                | EncodingDetector::CENTRAL_WINDOWS_INDEX
2292                | EncodingDetector::CENTRAL_ISO_INDEX
2293                | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2294                | EncodingDetector::CYRILLIC_ISO_INDEX
2295                | EncodingDetector::CYRILLIC_KOI_INDEX
2296                | EncodingDetector::CYRILLIC_IBM_INDEX
2297                | EncodingDetector::VISUAL_INDEX
2298                | EncodingDetector::LOGICAL_INDEX
2299                | EncodingDetector::BALTIC_WINDOWS_INDEX
2300                | EncodingDetector::BALTIC_ISO4_INDEX
2301                | EncodingDetector::BALTIC_ISO13_INDEX
2302                | EncodingDetector::TURKISH_INDEX => {
2303                    // XXX Tune this better instead of this kind of absolute.
2304                    return score;
2305                }
2306                _ => (50, 60),
2307            }
2308        }
2309        Tld::TurkishAzeri => {
2310            match encoding {
2311                EncodingDetector::CENTRAL_WINDOWS_INDEX
2312                | EncodingDetector::CENTRAL_ISO_INDEX
2313                | EncodingDetector::BALTIC_WINDOWS_INDEX
2314                | EncodingDetector::BALTIC_ISO4_INDEX
2315                | EncodingDetector::BALTIC_ISO13_INDEX
2316                | EncodingDetector::VIETNAMESE_INDEX
2317                | EncodingDetector::ICELANDIC_INDEX => {
2318                    // XXX Tune this better instead of this kind of absolute.
2319                    return score;
2320                }
2321                _ => (50, 60),
2322            }
2323        }
2324        Tld::Hebrew => {
2325            match encoding {
2326                EncodingDetector::CENTRAL_WINDOWS_INDEX
2327                | EncodingDetector::CENTRAL_ISO_INDEX
2328                | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2329                | EncodingDetector::CYRILLIC_ISO_INDEX
2330                | EncodingDetector::CYRILLIC_KOI_INDEX
2331                | EncodingDetector::CYRILLIC_IBM_INDEX
2332                | EncodingDetector::GREEK_WINDOWS_INDEX
2333                | EncodingDetector::GREEK_ISO_INDEX
2334                | EncodingDetector::BALTIC_WINDOWS_INDEX
2335                | EncodingDetector::BALTIC_ISO4_INDEX
2336                | EncodingDetector::BALTIC_ISO13_INDEX
2337                | EncodingDetector::VIETNAMESE_INDEX
2338                | EncodingDetector::TURKISH_INDEX => {
2339                    // XXX Tune this better instead of this kind of absolute.
2340                    return score;
2341                }
2342                _ => (50, 60),
2343            }
2344        }
2345        Tld::Arabic => {
2346            match encoding {
2347                EncodingDetector::BIG5_INDEX
2348                | EncodingDetector::GBK_INDEX
2349                | EncodingDetector::EUC_JP_INDEX
2350                | EncodingDetector::EUC_KR_INDEX
2351                | EncodingDetector::CENTRAL_WINDOWS_INDEX
2352                | EncodingDetector::CENTRAL_ISO_INDEX
2353                | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2354                | EncodingDetector::CYRILLIC_ISO_INDEX
2355                | EncodingDetector::CYRILLIC_KOI_INDEX
2356                | EncodingDetector::CYRILLIC_IBM_INDEX
2357                | EncodingDetector::GREEK_WINDOWS_INDEX
2358                | EncodingDetector::GREEK_ISO_INDEX
2359                | EncodingDetector::VISUAL_INDEX
2360                | EncodingDetector::LOGICAL_INDEX
2361                | EncodingDetector::BALTIC_WINDOWS_INDEX
2362                | EncodingDetector::BALTIC_ISO4_INDEX
2363                | EncodingDetector::BALTIC_ISO13_INDEX
2364                | EncodingDetector::VIETNAMESE_INDEX
2365                | EncodingDetector::TURKISH_INDEX => {
2366                    // XXX Tune this better instead of this kind of absolute.
2367                    return score;
2368                }
2369                _ => (50, 60),
2370            }
2371        }
2372        Tld::Baltic => {
2373            match encoding {
2374                EncodingDetector::CENTRAL_WINDOWS_INDEX
2375                | EncodingDetector::CENTRAL_ISO_INDEX
2376                | EncodingDetector::ICELANDIC_INDEX
2377                | EncodingDetector::TURKISH_INDEX
2378                | EncodingDetector::VIETNAMESE_INDEX => {
2379                    // XXX Tune this better instead of this kind of absolute.
2380                    return score;
2381                }
2382                _ => (50, 60),
2383            }
2384        }
2385        Tld::Vietnamese => {
2386            match encoding {
2387                EncodingDetector::CENTRAL_WINDOWS_INDEX
2388                | EncodingDetector::CENTRAL_ISO_INDEX
2389                | EncodingDetector::BALTIC_WINDOWS_INDEX
2390                | EncodingDetector::BALTIC_ISO4_INDEX
2391                | EncodingDetector::BALTIC_ISO13_INDEX
2392                | EncodingDetector::TURKISH_INDEX
2393                | EncodingDetector::ICELANDIC_INDEX => {
2394                    // XXX Tune this better instead of this kind of absolute.
2395                    return score;
2396                }
2397                _ => (50, 60),
2398            }
2399        }
2400        Tld::Thai => {
2401            match encoding {
2402                EncodingDetector::BIG5_INDEX
2403                | EncodingDetector::GBK_INDEX
2404                | EncodingDetector::EUC_JP_INDEX
2405                | EncodingDetector::EUC_KR_INDEX
2406                | EncodingDetector::SHIFT_JIS_INDEX
2407                | EncodingDetector::CENTRAL_WINDOWS_INDEX
2408                | EncodingDetector::CENTRAL_ISO_INDEX
2409                | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2410                | EncodingDetector::CYRILLIC_ISO_INDEX
2411                | EncodingDetector::CYRILLIC_KOI_INDEX
2412                | EncodingDetector::CYRILLIC_IBM_INDEX
2413                | EncodingDetector::GREEK_WINDOWS_INDEX
2414                | EncodingDetector::GREEK_ISO_INDEX
2415                | EncodingDetector::ARABIC_WINDOWS_INDEX
2416                | EncodingDetector::ARABIC_ISO_INDEX
2417                | EncodingDetector::VISUAL_INDEX
2418                | EncodingDetector::LOGICAL_INDEX
2419                | EncodingDetector::BALTIC_WINDOWS_INDEX
2420                | EncodingDetector::BALTIC_ISO4_INDEX
2421                | EncodingDetector::BALTIC_ISO13_INDEX
2422                | EncodingDetector::TURKISH_INDEX => {
2423                    // XXX Tune this better instead of this kind of absolute.
2424                    return score;
2425                }
2426                _ => (50, 60),
2427            }
2428        }
2429        Tld::Simplified
2430        | Tld::Traditional
2431        | Tld::TraditionalSimplified
2432        | Tld::SimplifiedTraditional
2433        | Tld::Japanese
2434        | Tld::Korean => {
2435            // If TLD default is valid, everything else scores zero
2436            return score;
2437        }
2438        Tld::IcelandicFaroese => {
2439            match encoding {
2440                EncodingDetector::CENTRAL_WINDOWS_INDEX
2441                | EncodingDetector::CENTRAL_ISO_INDEX
2442                | EncodingDetector::BALTIC_WINDOWS_INDEX
2443                | EncodingDetector::BALTIC_ISO4_INDEX
2444                | EncodingDetector::BALTIC_ISO13_INDEX
2445                | EncodingDetector::TURKISH_INDEX
2446                | EncodingDetector::VIETNAMESE_INDEX => {
2447                    // XXX Tune this better instead of this kind of absolute.
2448                    return score;
2449                }
2450                _ => (50, 60),
2451            }
2452        }
2453        Tld::CentralCyrillic => {
2454            match encoding {
2455                EncodingDetector::BIG5_INDEX
2456                | EncodingDetector::GBK_INDEX
2457                | EncodingDetector::EUC_JP_INDEX
2458                | EncodingDetector::GREEK_WINDOWS_INDEX
2459                | EncodingDetector::GREEK_ISO_INDEX
2460                | EncodingDetector::VISUAL_INDEX
2461                | EncodingDetector::LOGICAL_INDEX
2462                | EncodingDetector::BALTIC_WINDOWS_INDEX
2463                | EncodingDetector::BALTIC_ISO4_INDEX
2464                | EncodingDetector::BALTIC_ISO13_INDEX
2465                | EncodingDetector::TURKISH_INDEX => {
2466                    // XXX Tune this better instead of this kind of absolute.
2467                    return score;
2468                }
2469                _ => (50, 60),
2470            }
2471        }
2472        Tld::Eu => {
2473            match encoding {
2474                EncodingDetector::BIG5_INDEX
2475                | EncodingDetector::GBK_INDEX
2476                | EncodingDetector::EUC_JP_INDEX
2477                | EncodingDetector::TURKISH_INDEX
2478                | EncodingDetector::VIETNAMESE_INDEX => {
2479                    // XXX Tune this better instead of this kind of absolute.
2480                    return score;
2481                }
2482                _ => (50, 60),
2483            }
2484        }
2485    };
2486    (score / divisor) + constant
2487}
2488
2489cfg_if::cfg_if! {
2490    if #[cfg(feature = "multithreading")] {
2491        #[repr(align(64))] // Align to cache lines to avoid false sharing in the Rayon case
2492        struct Candidate {
2493            inner: InnerCandidate,
2494            score: Option<i64>,
2495        }
2496    } else {
2497        struct Candidate {
2498            inner: InnerCandidate,
2499            score: Option<i64>,
2500        }
2501    }
2502}
2503
2504impl Candidate {
2505    fn feed(&mut self, buffer: &[u8], last: bool) {
2506        if let Some(old_score) = self.score {
2507            if let Some(new_score) = self.inner.feed(buffer, last) {
2508                self.score = Some(old_score + new_score);
2509            } else {
2510                self.score = None;
2511            }
2512        }
2513    }
2514
2515    #[cfg(feature = "multithreading")]
2516    fn qualified(&self) -> bool {
2517        !self.score.is_none()
2518    }
2519
2520    fn new_latin(data: &'static SingleByteData) -> Self {
2521        Candidate {
2522            inner: InnerCandidate::Latin(LatinCandidate::new(data)),
2523            score: Some(0),
2524        }
2525    }
2526
2527    fn new_non_latin_cased(data: &'static SingleByteData) -> Self {
2528        Candidate {
2529            inner: InnerCandidate::NonLatinCased(NonLatinCasedCandidate::new(data)),
2530            score: Some(0),
2531        }
2532    }
2533
2534    fn new_caseless(data: &'static SingleByteData) -> Self {
2535        Candidate {
2536            inner: InnerCandidate::Caseless(CaselessCandidate::new(data)),
2537            score: Some(0),
2538        }
2539    }
2540
2541    fn new_arabic_french(data: &'static SingleByteData) -> Self {
2542        Candidate {
2543            inner: InnerCandidate::ArabicFrench(ArabicFrenchCandidate::new(data)),
2544            score: Some(0),
2545        }
2546    }
2547
2548    fn new_logical(data: &'static SingleByteData) -> Self {
2549        Candidate {
2550            inner: InnerCandidate::Logical(LogicalCandidate::new(data)),
2551            score: Some(0),
2552        }
2553    }
2554
2555    fn new_visual(data: &'static SingleByteData) -> Self {
2556        Candidate {
2557            inner: InnerCandidate::Visual(VisualCandidate::new(data)),
2558            score: Some(0),
2559        }
2560    }
2561
2562    fn new_utf_8() -> Self {
2563        Candidate {
2564            inner: InnerCandidate::Utf8(Utf8Candidate {
2565                decoder: UTF_8.new_decoder_without_bom_handling(),
2566            }),
2567            score: Some(0),
2568        }
2569    }
2570
2571    fn new_iso_2022_jp() -> Self {
2572        Candidate {
2573            inner: InnerCandidate::Iso2022(Iso2022Candidate {
2574                decoder: ISO_2022_JP.new_decoder_without_bom_handling(),
2575            }),
2576            score: Some(0),
2577        }
2578    }
2579
2580    fn new_shift_jis() -> Self {
2581        Candidate {
2582            inner: InnerCandidate::Shift(ShiftJisCandidate {
2583                decoder: SHIFT_JIS.new_decoder_without_bom_handling(),
2584                half_width_katakana_seen: false,
2585                half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
2586                prev: LatinCj::Other,
2587                prev_byte: 0,
2588                pending_score: None,
2589            }),
2590            score: Some(0),
2591        }
2592    }
2593
2594    fn new_euc_jp() -> Self {
2595        Candidate {
2596            inner: InnerCandidate::EucJp(EucJpCandidate {
2597                decoder: EUC_JP.new_decoder_without_bom_handling(),
2598                non_ascii_seen: false,
2599                half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
2600                prev: LatinCj::Other,
2601                prev_byte: 0,
2602                prev_prev_byte: 0,
2603            }),
2604            score: Some(0),
2605        }
2606    }
2607
2608    fn new_euc_kr() -> Self {
2609        Candidate {
2610            inner: InnerCandidate::EucKr(EucKrCandidate {
2611                decoder: EUC_KR.new_decoder_without_bom_handling(),
2612                prev_byte: 0,
2613                prev_was_euc_range: false,
2614                prev: LatinKorean::Other,
2615                current_word_len: 0,
2616                pending_score: None,
2617            }),
2618            score: Some(0),
2619        }
2620    }
2621
2622    fn new_big5() -> Self {
2623        Candidate {
2624            inner: InnerCandidate::Big5(Big5Candidate {
2625                decoder: BIG5.new_decoder_without_bom_handling(),
2626                prev: LatinCj::Other,
2627                prev_byte: 0,
2628                pending_score: None,
2629            }),
2630            score: Some(0),
2631        }
2632    }
2633
2634    fn new_gbk() -> Self {
2635        Candidate {
2636            inner: InnerCandidate::Gbk(GbkCandidate {
2637                decoder: GBK.new_decoder_without_bom_handling(),
2638                prev: LatinCj::Other,
2639                prev_byte: 0,
2640                pending_score: None,
2641            }),
2642            score: Some(0),
2643        }
2644    }
2645
2646    fn score(&self, encoding: usize, tld: Tld, expectation_is_valid: bool) -> Option<i64> {
2647        match &self.inner {
2648            InnerCandidate::NonLatinCased(c) => {
2649                if c.longest_word < 2 {
2650                    return None;
2651                }
2652            }
2653            InnerCandidate::Caseless(c) => {
2654                if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2655                    return None;
2656                }
2657            }
2658            InnerCandidate::ArabicFrench(c) => {
2659                if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2660                    return None;
2661                }
2662            }
2663            InnerCandidate::Logical(c) => {
2664                if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2665                    return None;
2666                }
2667            }
2668            InnerCandidate::Visual(c) => {
2669                if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2670                    return None;
2671                }
2672            }
2673            _ => {}
2674        }
2675        if tld == Tld::Generic {
2676            return self.score;
2677        }
2678        if let Some(score) = self.score {
2679            if encoding == encoding_for_tld(tld) {
2680                return Some(score + 1);
2681            }
2682            if encoding_is_native_to_tld(tld, encoding) {
2683                return Some(score);
2684            }
2685            if expectation_is_valid {
2686                return Some(score - score_adjustment(score, encoding, tld));
2687            }
2688            // If expectation is no longer valid, fall back to
2689            // generic behavior.
2690            // XXX Flipped Chinese and Central
2691            return Some(score);
2692        }
2693        None
2694    }
2695
2696    fn plausible_punctuation(&self) -> u64 {
2697        match &self.inner {
2698            InnerCandidate::Logical(c) => {
2699                return c.plausible_punctuation;
2700            }
2701            InnerCandidate::Visual(c) => {
2702                return c.plausible_punctuation;
2703            }
2704            _ => {
2705                unreachable!();
2706            }
2707        }
2708    }
2709
2710    fn encoding(&self) -> &'static Encoding {
2711        match &self.inner {
2712            InnerCandidate::Latin(c) => {
2713                return c.data.encoding;
2714            }
2715            InnerCandidate::NonLatinCased(c) => {
2716                return c.data.encoding;
2717            }
2718            InnerCandidate::Caseless(c) => {
2719                return c.data.encoding;
2720            }
2721            InnerCandidate::ArabicFrench(c) => {
2722                return c.data.encoding;
2723            }
2724            InnerCandidate::Logical(c) => {
2725                return c.data.encoding;
2726            }
2727            InnerCandidate::Visual(c) => {
2728                return c.data.encoding;
2729            }
2730            InnerCandidate::Shift(_) => {
2731                return SHIFT_JIS;
2732            }
2733            InnerCandidate::EucJp(_) => {
2734                return EUC_JP;
2735            }
2736            InnerCandidate::Big5(_) => {
2737                return BIG5;
2738            }
2739            InnerCandidate::EucKr(_) => {
2740                return EUC_KR;
2741            }
2742            InnerCandidate::Gbk(_) => {
2743                return GBK;
2744            }
2745            InnerCandidate::Utf8(_) => {
2746                return UTF_8;
2747            }
2748            InnerCandidate::Iso2022(_) => {
2749                return ISO_2022_JP;
2750            }
2751        }
2752    }
2753}
2754
// LLVM doesn't autovectorize this properly for SSE2, so let's help manually.

/// Counts the bytes in `buffer` that have their most significant bit
/// set, i.e. the non-ASCII bytes (SSE2 flavor).
#[cfg(target_feature = "sse2")]
fn count_non_ascii(buffer: &[u8]) -> u64 {
    let scalar_count = |bytes: &[u8]| bytes.iter().filter(|&&byte| byte >= 0x80).count() as u64;
    // SAFETY: every bit pattern is a valid `__m128i`, so viewing the
    // aligned middle of a byte slice as `__m128i` values is sound.
    let (head, lanes, tail) = unsafe { buffer.align_to::<__m128i>() };
    let vector_count: u64 = lanes
        .iter()
        .map(|&lane| {
            // SAFETY: this function is only compiled when the `sse2`
            // target feature is enabled.
            let high_bits = unsafe { _mm_movemask_epi8(lane) };
            // One mask bit per byte whose most significant bit is set.
            high_bits.count_ones() as u64
        })
        .sum();
    scalar_count(head) + vector_count + scalar_count(tail)
}

/// Counts the bytes in `buffer` that have their most significant bit
/// set, i.e. the non-ASCII bytes (portable flavor).
#[cfg(not(target_feature = "sse2"))]
fn count_non_ascii(buffer: &[u8]) -> u64 {
    buffer.iter().filter(|&&byte| byte >= 0x80).count() as u64
}
2788
/// Up to two of the most recently pushed bytes.
#[derive(Clone, Copy)]
enum BeforeNonAscii {
    /// No byte remembered.
    None,
    /// Exactly one byte remembered.
    One([u8; 1]),
    /// The two most recent bytes, oldest first.
    Two([u8; 2]),
}

impl BeforeNonAscii {
    /// Returns the remembered bytes, oldest first.
    fn as_slice(&self) -> &[u8] {
        match self {
            BeforeNonAscii::None => &[],
            BeforeNonAscii::One(bytes) => bytes,
            BeforeNonAscii::Two(bytes) => bytes,
        }
    }

    /// Updates the remembered bytes as if `buffer` had been appended.
    ///
    /// An empty `buffer` leaves the state untouched, a one-byte
    /// `buffer` shifts that byte in, and a longer `buffer` replaces the
    /// state with its last two bytes.
    fn push(&mut self, buffer: &[u8]) {
        match buffer {
            [] => {}
            [byte] => {
                *self = match *self {
                    BeforeNonAscii::None => BeforeNonAscii::One([*byte]),
                    BeforeNonAscii::One([prev]) | BeforeNonAscii::Two([_, prev]) => {
                        BeforeNonAscii::Two([prev, *byte])
                    }
                };
            }
            [.., second_to_last, last] => {
                *self = BeforeNonAscii::Two([*second_to_last, *last]);
            }
        }
    }
}
2828
/// A Web browser-oriented detector for guessing what character
/// encoding a stream of bytes is encoded in.
///
/// The bytes are fed to the detector incrementally using the `feed`
/// method. The current guess of the detector can be queried using
/// the `guess` method. The guessing parameters are arguments to the
/// `guess` method rather than arguments to the constructor in order
/// to enable the application to check if the arguments affect the
/// guessing outcome. (The specific use case is to disable UI for
/// re-running the detector with UTF-8 allowed and the top-level
/// domain name ignored if those arguments don't change the guess.)
pub struct EncodingDetector {
    // The encoding hypotheses being scored; addressed through the
    // `*_INDEX` associated constants of this type.
    candidates: [Candidate; 27],
    // Running count of bytes >= 0x80 in the input passed on to the
    // candidates.
    non_ascii_seen: u64,
    // We need to feed up to two bytes of context before non-ASCII
    // thanks to Spanish n.º.
    last_before_non_ascii: BeforeNonAscii,
    // Set once an ESC byte (0x1B) has been found while the input was
    // still pure ASCII.
    esc_seen: bool,
    // Set once `feed` has been called with `last` set to `true`;
    // feeding again after that panics.
    closed: bool,
}
2849
2850impl EncodingDetector {
2851    cfg_if::cfg_if! {
2852        if #[cfg(feature = "multithreading")] {
2853            fn feed_impl(&mut self, buffer: &[u8], last: bool) {
2854                if buffer.len() < 10 {
2855                    self.candidates.iter_mut().for_each(|candidate| candidate.feed(buffer, last));
2856                    self.non_ascii_seen += count_non_ascii(buffer);
2857                    return;
2858                }
2859                // Collect only qualified candidates to avoid Rayon
2860                // performing thread synchronization only to bail
2861                // out immediately when trying a disqualified
2862                // candidate.
2863                let mut qualified = ArrayVec::<[_; 27]>::new();
2864                for candidate in self.candidates.iter_mut() {
2865                    if candidate.qualified() {
2866                        qualified.push(candidate);
2867                    }
2868                }
2869                let (_, non_ascii) = rayon::join(|| qualified.par_iter_mut().for_each(|candidate| candidate.feed(buffer, last)),
2870                                                 || count_non_ascii(buffer));
2871                self.non_ascii_seen += non_ascii;
2872            }
2873        } else {
2874            fn feed_impl(&mut self, buffer: &[u8], last: bool) {
2875                self.candidates.iter_mut().for_each(|candidate| candidate.feed(buffer, last));
2876                self.non_ascii_seen += count_non_ascii(buffer);
2877            }
2878        }
2879    }
2880
2881    /// Inform the detector of a chunk of input.
2882    ///
2883    /// The byte stream is represented as a sequence of calls to this
2884    /// method such that the concatenation of the arguments to this
2885    /// method form the byte stream. It does not matter how the application
2886    /// chooses to chunk the stream. It is OK to call this method with
2887    /// a zero-length byte slice.
2888    ///
2889    /// The end of the stream is indicated by calling this method with
2890    /// `last` set to `true`. In that case, the end of the stream is
2891    /// considered to occur after the last byte of the `buffer` (which
2892    /// may be zero-length) passed in the same call. Once this method
2893    /// has been called with `last` set to `true` this method must not
2894    /// be called again.
2895    ///
2896    /// If you want to perform detection on just the prefix of a longer
2897    /// stream, do not pass `last=true` after the prefix if the stream
2898    /// actually still continues.
2899    ///
2900    /// Returns `true` if after processing `buffer` the stream has
2901    /// contained at least one non-ASCII byte and `false` if only
2902    /// ASCII has been seen so far.
2903    ///
2904    /// # Panics
2905    ///
2906    /// If this method has previously been called with `last` set to `true`.
2907    pub fn feed(&mut self, buffer: &[u8], last: bool) -> bool {
2908        assert!(
2909            !self.closed,
2910            "Must not feed again after feeding with last equaling true."
2911        );
2912        if last {
2913            self.closed = true;
2914        }
2915        let start = if self.non_ascii_seen == 0 && !self.esc_seen {
2916            let up_to = Encoding::ascii_valid_up_to(buffer);
2917            let start = if let Some(escape) = memchr::memchr(0x1B, &buffer[..up_to]) {
2918                self.esc_seen = true;
2919                escape
2920            } else {
2921                up_to
2922            };
2923            if start == buffer.len() {
2924                self.last_before_non_ascii.push(buffer);
2925                return self.non_ascii_seen != 0;
2926            }
2927            if start == 0 || start == 1 {
2928                let last_before = self.last_before_non_ascii;
2929                self.last_before_non_ascii = BeforeNonAscii::None;
2930                self.feed_impl(last_before.as_slice(), false);
2931                0
2932            } else {
2933                start - 2
2934            }
2935        } else {
2936            0
2937        };
2938        self.feed_impl(&buffer[start..], last);
2939        self.non_ascii_seen != 0
2940    }
2941
2942    /// Guess the encoding given the bytes pushed to the detector so far
2943    /// (via `feed()`), the top-level domain name from which the bytes were
2944    /// loaded, and an indication of whether to consider UTF-8 as a permissible
2945    /// guess.
2946    ///
2947    /// The `tld` argument takes the rightmost DNS label of the hostname of the
2948    /// host the stream was loaded from in lower-case ASCII form. That is, if
2949    /// the label is an internationalized top-level domain name, it must be
2950    /// provided in its Punycode form. If the TLD that the stream was loaded
2951    /// from is unavalable, `None` may be passed instead, which is equivalent
2952    /// to passing `Some(b"com")`.
2953    ///
2954    /// If the `allow_utf8` argument is set to `false`, the return value of
2955    /// this method won't be `encoding_rs::UTF_8`. When performing detection
2956    /// on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
2957    /// unless the user has taken a specific contextual action to request an
2958    /// override. This way, Web developers cannot start depending on UTF-8
2959    /// detection. Such reliance would make the Web Platform more brittle.
2960    ///
2961    /// Returns the guessed encoding.
2962    ///
2963    /// # Panics
2964    ///
2965    /// If `tld` contains non-ASCII, period, or upper-case letters. (The panic
2966    /// condition is intentionally limited to signs of failing to extract the
2967    /// label correctly, failing to provide it in its Punycode form, and failure
2968    /// to lower-case it. Full DNS label validation is intentionally not performed
2969    /// to avoid panics when the reality doesn't match the specs.)
2970    pub fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static Encoding {
2971        self.guess_assess(tld, allow_utf8).0
2972    }
2973
2974    /// Same as `guess()`, but also returns a Boolean indicating
2975    /// whether the guessed encoding had a higher score than at least
2976    /// one other candidate. If this method returns `false`, the
2977    /// guessed encoding is likely to be wrong.
2978    pub fn guess_assess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> (&'static Encoding, bool) {
2979        let mut tld_type = tld.map_or(Tld::Generic, |tld| {
2980            assert!(!contains_upper_case_period_or_non_ascii(tld));
2981            classify_tld(tld)
2982        });
2983
2984        if self.non_ascii_seen == 0
2985            && self.esc_seen
2986            && self.candidates[Self::ISO_2022_JP_INDEX].score.is_some()
2987        {
2988            return (ISO_2022_JP, true);
2989        }
2990
2991        if self.candidates[Self::UTF_8_INDEX].score.is_some() {
2992            if allow_utf8 {
2993                return (UTF_8, true);
2994            }
2995            // Various test cases that prohibit UTF-8 detection want to
2996            // see windows-1252 specifically. These tests run on generic
2997            // domains. However, if we returned windows-1252 on
2998            // some non-generic domains, we'd cause reloads.
2999            return (self.candidates[encoding_for_tld(tld_type)].encoding(), true);
3000        }
3001
3002        let mut encoding = self.candidates[encoding_for_tld(tld_type)].encoding();
3003        let mut max = 0i64;
3004        let mut expectation_is_valid = false;
3005        if tld_type != Tld::Generic {
3006            for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
3007                if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() {
3008                    expectation_is_valid = true;
3009                    break;
3010                }
3011            }
3012        }
3013        if !expectation_is_valid {
3014            // Flip Chinese and Central around
3015            match tld_type {
3016                Tld::Simplified => {
3017                    if self.candidates[Self::BIG5_INDEX].score.is_some() {
3018                        tld_type = Tld::Traditional;
3019                        expectation_is_valid = true;
3020                    }
3021                }
3022                Tld::Traditional => {
3023                    if self.candidates[Self::GBK_INDEX].score.is_some() {
3024                        tld_type = Tld::Simplified;
3025                        expectation_is_valid = true;
3026                    }
3027                }
3028                Tld::CentralWindows => {
3029                    if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() {
3030                        tld_type = Tld::CentralIso;
3031                        expectation_is_valid = true;
3032                    }
3033                }
3034                Tld::CentralIso => {
3035                    if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() {
3036                        tld_type = Tld::CentralWindows;
3037                        expectation_is_valid = true;
3038                    }
3039                }
3040                _ => {}
3041            }
3042        }
3043        for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
3044            if let Some(score) = candidate.score(i, tld_type, expectation_is_valid) {
3045                if score > max {
3046                    max = score;
3047                    encoding = candidate.encoding();
3048                }
3049            }
3050        }
3051        let visual = &self.candidates[Self::VISUAL_INDEX];
3052        if let Some(visual_score) = visual.score(Self::VISUAL_INDEX, tld_type, expectation_is_valid)
3053        {
3054            if (visual_score > max || encoding == WINDOWS_1255)
3055                && visual.plausible_punctuation()
3056                    > self.candidates[Self::LOGICAL_INDEX].plausible_punctuation()
3057            {
3058                // max = visual_score;
3059                encoding = ISO_8859_8;
3060            }
3061        }
3062        (encoding, max >= 0)
3063    }
3064
3065    // XXX Test-only API
3066    #[cfg(feature = "testing-only-no-semver-guarantees-do-not-use")]
3067    pub fn find_score(&self, encoding: &'static Encoding) -> Option<i64> {
3068        let mut tld_type = Tld::Generic;
3069        let mut expectation_is_valid = false;
3070        if tld_type != Tld::Generic {
3071            for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
3072                if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() {
3073                    expectation_is_valid = true;
3074                    break;
3075                }
3076            }
3077        }
3078        if !expectation_is_valid {
3079            // Flip Chinese and Central around
3080            match tld_type {
3081                Tld::Simplified => {
3082                    if self.candidates[Self::BIG5_INDEX].score.is_some() {
3083                        tld_type = Tld::Traditional;
3084                        expectation_is_valid = true;
3085                    }
3086                }
3087                Tld::Traditional => {
3088                    if self.candidates[Self::GBK_INDEX].score.is_some() {
3089                        tld_type = Tld::Simplified;
3090                        expectation_is_valid = true;
3091                    }
3092                }
3093                Tld::CentralWindows => {
3094                    if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() {
3095                        tld_type = Tld::CentralIso;
3096                        expectation_is_valid = true;
3097                    }
3098                }
3099                Tld::CentralIso => {
3100                    if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() {
3101                        tld_type = Tld::CentralWindows;
3102                        expectation_is_valid = true;
3103                    }
3104                }
3105                _ => {}
3106            }
3107        }
3108        for (i, candidate) in self.candidates.iter().enumerate() {
3109            if encoding == candidate.encoding() {
3110                return candidate.score(i, tld_type, expectation_is_valid);
3111            }
3112        }
3113        Some(0)
3114    }
3115
3116    const FIRST_NORMAL: usize = 3;
3117
3118    const UTF_8_INDEX: usize = 0;
3119
3120    const ISO_2022_JP_INDEX: usize = 1;
3121
3122    const VISUAL_INDEX: usize = 2;
3123
3124    const GBK_INDEX: usize = 3;
3125
3126    const EUC_JP_INDEX: usize = 4;
3127
3128    const EUC_KR_INDEX: usize = 5;
3129
3130    const SHIFT_JIS_INDEX: usize = 6;
3131
3132    const BIG5_INDEX: usize = 7;
3133
3134    const WESTERN_INDEX: usize = 8;
3135
3136    const CYRILLIC_WINDOWS_INDEX: usize = 9;
3137
3138    const CENTRAL_WINDOWS_INDEX: usize = 10;
3139
3140    const CENTRAL_ISO_INDEX: usize = 11;
3141
3142    const ARABIC_WINDOWS_INDEX: usize = 12;
3143
3144    const ICELANDIC_INDEX: usize = 13;
3145
3146    const TURKISH_INDEX: usize = 14;
3147
3148    const THAI_INDEX: usize = 15;
3149
3150    const LOGICAL_INDEX: usize = 16;
3151
3152    const GREEK_WINDOWS_INDEX: usize = 17;
3153
3154    const GREEK_ISO_INDEX: usize = 18;
3155
3156    const BALTIC_WINDOWS_INDEX: usize = 19;
3157
3158    const BALTIC_ISO13_INDEX: usize = 20;
3159
3160    const CYRILLIC_KOI_INDEX: usize = 21;
3161
3162    const CYRILLIC_IBM_INDEX: usize = 22;
3163
3164    const ARABIC_ISO_INDEX: usize = 23;
3165
3166    const VIETNAMESE_INDEX: usize = 24;
3167
3168    const BALTIC_ISO4_INDEX: usize = 25;
3169
3170    const CYRILLIC_ISO_INDEX: usize = 26;
3171
3172    /// Creates a new instance of the detector.
3173    pub fn new() -> Self {
3174        EncodingDetector {
3175            candidates: [
3176                Candidate::new_utf_8(),                                                // 0
3177                Candidate::new_iso_2022_jp(),                                          // 1
3178                Candidate::new_visual(&SINGLE_BYTE_DATA[ISO_8859_8_INDEX]),            // 2
3179                Candidate::new_gbk(),                                                  // 3
3180                Candidate::new_euc_jp(),                                               // 4
3181                Candidate::new_euc_kr(),                                               // 5
3182                Candidate::new_shift_jis(),                                            // 6
3183                Candidate::new_big5(),                                                 // 7
3184                Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_INDEX]),           // 8
3185                Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1251_INDEX]), // 9
3186                Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1250_INDEX]),           // 10
3187                Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_2_INDEX]),             // 11
3188                Candidate::new_arabic_french(&SINGLE_BYTE_DATA[WINDOWS_1256_INDEX]),   // 12
3189                Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_ICELANDIC_INDEX]), // 13
3190                Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1254_INDEX]),           // 14
3191                Candidate::new_caseless(&SINGLE_BYTE_DATA[WINDOWS_874_INDEX]),         // 15
3192                Candidate::new_logical(&SINGLE_BYTE_DATA[WINDOWS_1255_INDEX]),         // 16
3193                Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1253_INDEX]), // 17
3194                Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_7_INDEX]),   // 18
3195                Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1257_INDEX]),           // 19
3196                Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_13_INDEX]),            // 20
3197                Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[KOI8_U_INDEX]),       // 21
3198                Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[IBM866_INDEX]),       // 22
3199                Candidate::new_caseless(&SINGLE_BYTE_DATA[ISO_8859_6_INDEX]),          // 23
3200                Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1258_INDEX]),           // 24
3201                Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_4_INDEX]),             // 25
3202                Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_5_INDEX]),   // 26
3203            ],
3204            non_ascii_seen: 0,
3205            last_before_non_ascii: BeforeNonAscii::None,
3206            esc_seen: false,
3207            closed: false,
3208        }
3209    }
3210}
3211
#[cfg(test)]
mod tests {
    use super::*;
    use detone::IterDecomposeVietnamese;
    use encoding_rs::{
        IBM866, ISO_8859_2, ISO_8859_4, ISO_8859_5, ISO_8859_6, ISO_8859_7, KOI8_U, WINDOWS_1250,
        WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1256, WINDOWS_1257,
        WINDOWS_1258, WINDOWS_874,
    };

    // Sentences that are checked against more than one encoding.
    const RU_SENTENCE: &str = "Это тест кодировки символов.";
    const UK_SENTENCE: &str = "Це тест на кодування символів.";
    const EL_SENTENCE: &str = "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης";
    const JA_SENTENCE: &str = "これは文字実験です。";
    const AR_SENTENCE: &str = "هذا هو اختبار ترميز الأحرف.";
    const PL_SENTENCE: &str = "To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.";
    const LV_SENTENCE: &str = "Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.";

    // Byte sequences that the multi-byte tests repeat many times.
    const BIG5_UNIT: &[u8] = b"\xA4\x40";
    const EUC_KR_UNIT: &[u8] = b"\xC5\xD7\xBD\xBA\xC6\xAE. ";
    const SHIFT_JIS_UNIT: &[u8] = b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2";
    const EUC_JP_UNIT: &[u8] = b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4";
    const GBK_UNIT: &[u8] = b"\xB5\xC4";

    /// Feeds `chunks` in order to a fresh detector, marking only the final
    /// chunk as the last one, and returns the guess made without a TLD and
    /// with UTF-8 disallowed.
    fn guess_streamed(chunks: &[&[u8]]) -> &'static Encoding {
        let (final_chunk, earlier) = chunks
            .split_last()
            .expect("at least one chunk is required");
        let mut detector = EncodingDetector::new();
        for chunk in earlier {
            detector.feed(chunk, false);
        }
        detector.feed(final_chunk, true);
        detector.guess(None, false)
    }

    /// Asserts that `bytes`, fed as one final buffer, are detected as
    /// `encoding`. The bytes decoded with the guessed encoding are printed
    /// before the assertion.
    fn check_bytes(bytes: &[u8], encoding: &'static Encoding) {
        let guessed = guess_streamed(&[bytes]);
        let (decoded, _) = guessed.decode_without_bom_handling(bytes);
        println!("{:?}", decoded);
        assert_eq!(guessed, encoding);
    }

    /// Encodes `input` with `encoding` and asserts that the resulting bytes
    /// are detected as `encoding`. For windows-1258 the input first has its
    /// Vietnamese tones decomposed.
    fn check(input: &str, encoding: &'static Encoding) {
        if encoding == WINDOWS_1258 {
            let orthographic: String = input.chars().decompose_vietnamese_tones(true).collect();
            let (bytes, _, _) = encoding.encode(&orthographic);
            check_bytes(&bytes, encoding);
        } else {
            let (bytes, _, _) = encoding.encode(input);
            check_bytes(&bytes, encoding);
        }
    }

    /// Builds `prefix`, then `unit` repeated `times` times, then `suffix`,
    /// and asserts that the result is detected as `encoding`.
    fn check_repeated(
        prefix: &[u8],
        unit: &[u8],
        times: usize,
        suffix: &[u8],
        encoding: &'static Encoding,
    ) {
        let mut bytes = Vec::with_capacity(prefix.len() + unit.len() * times + suffix.len());
        bytes.extend_from_slice(prefix);
        for _ in 0..times {
            bytes.extend_from_slice(unit);
        }
        bytes.extend_from_slice(suffix);
        check_bytes(&bytes, encoding);
    }

    /// Expands every `name: text => encoding,` row into a `#[test]` function
    /// called `name` whose body is `check(text, encoding)`.
    macro_rules! text_cases {
        ($($name:ident: $text:expr => $encoding:expr,)+) => {
            $(
                #[test]
                fn $name() {
                    check($text, $encoding);
                }
            )+
        };
    }

    #[test]
    fn test_i_apostrophe() {
        assert_eq!(guess_streamed(&[b"I\x92"]), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_by_one() {
        assert_eq!(guess_streamed(&[b"n", b".", b"\xBA", b"1"]), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_two_together() {
        assert_eq!(guess_streamed(&[b"n.", b"\xBA", b"1"]), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_by_one_extra_before() {
        assert_eq!(guess_streamed(&[b" n", b".", b"\xBA", b"1"]), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_before() {
        assert_eq!(guess_streamed(&[b"n", b".\xBA", b"1"]), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_longer_first_buffer() {
        assert_eq!(guess_streamed(&[b"rrn.", b"\xBA", b"1"]), WINDOWS_1252);
    }

    #[test]
    fn test_empty() {
        let mut detector = EncodingDetector::new();
        let seen_non_ascii = detector.feed(b"", true);
        assert_eq!(detector.guess(None, false), WINDOWS_1252);
        assert!(!seen_non_ascii);
    }

    #[test]
    fn test_en_windows1252() {
        // "Don't ", with byte 0xB4 standing in for the apostrophe.
        check_bytes(b"Don\xB4t ", WINDOWS_1252);
    }

    text_cases! {
        test_fi: "Ääni" => WINDOWS_1252,
        test_fi_bis: "Tämä" => WINDOWS_1252,
        test_pt: "Este é um teste de codificação de caracteres." => WINDOWS_1252,
        test_is: "Þetta er kóðunarpróf á staf. Fyrir sum tungumál sem nota latneska stafi þurfum við meira inntak til að taka ákvörðunina." => WINDOWS_1252,
        test_ru_short: "Русский" => WINDOWS_1251,
        test_ru: RU_SENTENCE => WINDOWS_1251,
        test_ru_iso: RU_SENTENCE => ISO_8859_5,
        test_ru_ibm: RU_SENTENCE => IBM866,
        test_ru_koi: RU_SENTENCE => KOI8_U,
        test_uk: UK_SENTENCE => WINDOWS_1251,
        test_uk_koi: UK_SENTENCE => KOI8_U,
        test_el_short: "Ελληνικά" => WINDOWS_1253,
        test_el: EL_SENTENCE => WINDOWS_1253,
        test_el_iso: EL_SENTENCE => ISO_8859_7,
        test_de: "Straße" => WINDOWS_1252,
        test_he: "\u{5E2}\u{5D1}\u{5E8}\u{5D9}\u{5EA}" => WINDOWS_1255,
        test_2022: "日本語" => ISO_2022_JP,
        test_th: "นี่คือการทดสอบการเข้ารหัสอักขระ" => WINDOWS_874,
        test_vi: "Đây là một thử nghiệm mã hóa ký tự." => WINDOWS_1258,
        test_tr: "Bu bir karakter kodlama testidir. Latince karakterleri kullanan bazı dillerde karar vermek için daha fazla girdiye ihtiyacımız var." => WINDOWS_1254,
        test_simplified: "这是一个字符编码测试。" => GBK,
        test_traditional: "這是一個字符編碼測試。" => BIG5,
        test_ko: "이것은 문자 인코딩 테스트입니다." => EUC_KR,
        test_shift: JA_SENTENCE => SHIFT_JIS,
        test_euc: JA_SENTENCE => EUC_JP,
        test_ar: AR_SENTENCE => WINDOWS_1256,
        test_ar_iso: AR_SENTENCE => ISO_8859_6,
        test_fa: "این یک تست رمزگذاری کاراکتر است." => WINDOWS_1256,
        test_visual: ".םיוות דודיק ןחבמ והז" => ISO_8859_8,
        test_yi: "דאָס איז אַ טעסט פֿאַר קאָדירונג פון כאַראַקטער." => WINDOWS_1255,
        test_it: "è" => WINDOWS_1252,
        test_en: "isn’t" => WINDOWS_1252,
        test_en_bis: "Rock ’n Roll" => WINDOWS_1252,
        test_ca: "Codificació de caràcters" => WINDOWS_1252,
        test_et: "või" => WINDOWS_1252,
        test_pl_iso: PL_SENTENCE => ISO_8859_2,
        test_pl: PL_SENTENCE => WINDOWS_1250,
        test_lt: "Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą." => WINDOWS_1257,
        // TODO: Detected as ISO-8859-2.
        // test_lt_windows_iso_8859_4: "Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą." => ISO_8859_4,
        test_lv: LV_SENTENCE => WINDOWS_1257,
        test_lv_iso_8859_4: LV_SENTENCE => ISO_8859_4,
        // Test that this isn't IBM866. TODO: What about GBK with fully paired 0xA0?
        test_a0: "\u{A0}\u{A0} \u{A0}" => WINDOWS_1252,
        // Test that this isn't GBK or EUC-KR.
        test_a0a0: "\u{A0}\u{A0}" => WINDOWS_1252,
        test_space_copyright_space: " © " => WINDOWS_1252,
        test_space_masculine_space: " º " => WINDOWS_1252,
        test_space_feminine_space: " ª " => WINDOWS_1252,
        test_period_masculine_space: ".º " => WINDOWS_1252,
        test_period_feminine_space: ".ª " => WINDOWS_1252,
        test_maria: " Mª " => WINDOWS_1252,
        test_dona: " Dª " => WINDOWS_1252,
        test_nuestra: " Nª " => WINDOWS_1252,
        test_senora: " Sª " => WINDOWS_1252,
        test_digit_feminine: " 42ª " => WINDOWS_1252,
        test_digit_masculine: " 42º " => WINDOWS_1252,
        test_roman_feminine: " XIVª " => WINDOWS_1252,
        test_roman_masculine: " XIVº " => WINDOWS_1252,
        test_numero_uno: "Nº1" => WINDOWS_1252,
        test_numero: "Nº" => WINDOWS_1252,
        test_euro: " €9" => WINDOWS_1252,
        // NOTE(review): the name says half-width katakana, but the literal
        // below consists of full-width katakana. Confirm which was intended.
        test_shift_jis_half_width_katakana: "ハードウェアハードウェアハードウェアハードウェアハードウェア" => SHIFT_JIS,
    }

    #[test]
    fn test_big5_pua() {
        check_repeated(b"", BIG5_UNIT, 40, b"\x81\x40\xA4\x40", BIG5);
    }

    #[test]
    fn test_big5_single_byte_a0() {
        check_repeated(b"", BIG5_UNIT, 80, b"\x81\x40\xA0 ", BIG5);
    }

    #[test]
    fn test_big5_single_byte_ff() {
        check_repeated(b"", BIG5_UNIT, 80, b"\x81\x40\xFF ", BIG5);
    }

    #[test]
    fn test_not_big5() {
        check_repeated(b"", BIG5_UNIT, 40, b"\x81\x40\xA0\xA0", IBM866);
    }

    #[test]
    fn test_euc_kr_pua() {
        check_repeated(b"\xC9\xA1\xB0\xA1 ", EUC_KR_UNIT, 40, b"", EUC_KR);
    }

    #[test]
    fn test_euc_kr_pua_bis() {
        check_repeated(b"\xFE\xA1\xB0\xA1 ", EUC_KR_UNIT, 40, b"", EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_ff() {
        check_repeated(b"\xFF ", EUC_KR_UNIT, 40, b"", EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_81() {
        check_repeated(b"\x81 ", EUC_KR_UNIT, 40, b"", EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_84() {
        check_repeated(b"\x84 ", EUC_KR_UNIT, 40, b"", EUC_KR);
    }

    #[test]
    fn test_not_euc_kr() {
        check_repeated(b"\xC9\xA0\xB0\xA1 ", EUC_KR_UNIT, 40, b"", GBK);
    }

    #[test]
    fn test_shift_jis_x0213() {
        check_repeated(b"\x87\xE5", SHIFT_JIS_UNIT, 40, b"", SHIFT_JIS);
    }

    #[test]
    fn test_shift_jis_single_byte_fd() {
        check_repeated(b"\xFD", SHIFT_JIS_UNIT, 40, b"", SHIFT_JIS);
    }

    #[test]
    fn test_not_shift_jis() {
        check_repeated(b"\x84\xE0", SHIFT_JIS_UNIT, 40, b"", GBK);
    }

    #[test]
    fn test_not_shift_jis_bis() {
        check_repeated(b"\x87\x7D", SHIFT_JIS_UNIT, 40, b"", GBK);
    }

    #[test]
    fn test_euc_jp_x0213() {
        check_repeated(b"\xAD\xBF", EUC_JP_UNIT, 80, b"", EUC_JP);
    }

    #[test]
    fn test_euc_jp_x0213_other_plane() {
        check_repeated(b"\x8F\xFE\xF6", EUC_JP_UNIT, 80, b"", EUC_JP);
    }

    #[test]
    fn test_not_euc_jp() {
        check_repeated(b"\x8F\xFE\xF7", EUC_JP_UNIT, 80, b"", WINDOWS_1252);
    }

    #[test]
    fn test_not_euc_jp_bis() {
        check_repeated(b"\xA8\xDF", EUC_JP_UNIT, 80, b"", BIG5);
    }

    #[test]
    fn test_gbk_single_byte_ff() {
        check_repeated(b"\xFF", GBK_UNIT, 80, b"", GBK);
    }

    #[test]
    fn test_gbk_single_byte_a0() {
        check_repeated(b"\xA0 ", GBK_UNIT, 80, b"", GBK);
    }

    #[test]
    fn test_gbk_single_byte_fe() {
        check_repeated(b"\xFE ", GBK_UNIT, 80, b"", GBK);
    }

    #[test]
    fn test_not_gbk_single_byte_fc() {
        check_repeated(b"\xFC ", GBK_UNIT, 80, b"", ISO_8859_5);
    }
}