base/
text.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5use std::iter::Sum;
6use std::ops::{Add, AddAssign, Sub, SubAssign};
7
8use malloc_size_of_derive::MallocSizeOf;
9
10pub use crate::unicode_block::{UnicodeBlock, UnicodeBlockMethod};
11
/// Returns `true` if `c` is one of the Unicode bidirectional formatting characters
/// recognised by this module.
///
/// The accepted code points are:
/// - U+061C, U+200E and U+200F (the Arabic letter mark and the left-to-right /
///   right-to-left marks),
/// - U+202A through U+202E (embeddings, overrides and their terminator),
/// - U+2066 through U+2069 (isolates and their terminator).
pub fn is_bidi_control(c: char) -> bool {
    let is_mark = matches!(c, '\u{061C}' | '\u{200E}' | '\u{200F}');
    let is_embedding_or_override = ('\u{202A}'..='\u{202E}').contains(&c);
    let is_isolate = ('\u{2066}'..='\u{2069}').contains(&c);

    is_mark || is_embedding_or_override || is_isolate
}
15
/// Returns the index of the Unicode plane containing `codepoint`.
///
/// Each plane spans 2^16 code points, so the plane index is the scalar value with its low
/// 16 bits shifted away (0 for the Basic Multilingual Plane, up to 16 for U+10FFFF).
pub fn unicode_plane(codepoint: char) -> u32 {
    const PLANE_SHIFT: u32 = 16;
    u32::from(codepoint) >> PLANE_SHIFT
}
19
20pub fn is_cjk(codepoint: char) -> bool {
21    if let Some(
22        UnicodeBlock::CJKRadicalsSupplement |
23        UnicodeBlock::KangxiRadicals |
24        UnicodeBlock::IdeographicDescriptionCharacters |
25        UnicodeBlock::CJKSymbolsandPunctuation |
26        UnicodeBlock::Hiragana |
27        UnicodeBlock::Katakana |
28        UnicodeBlock::Bopomofo |
29        UnicodeBlock::HangulCompatibilityJamo |
30        UnicodeBlock::Kanbun |
31        UnicodeBlock::BopomofoExtended |
32        UnicodeBlock::CJKStrokes |
33        UnicodeBlock::KatakanaPhoneticExtensions |
34        UnicodeBlock::EnclosedCJKLettersandMonths |
35        UnicodeBlock::CJKCompatibility |
36        UnicodeBlock::CJKUnifiedIdeographsExtensionA |
37        UnicodeBlock::YijingHexagramSymbols |
38        UnicodeBlock::CJKUnifiedIdeographs |
39        UnicodeBlock::CJKCompatibilityIdeographs |
40        UnicodeBlock::CJKCompatibilityForms |
41        UnicodeBlock::HalfwidthandFullwidthForms,
42    ) = codepoint.block()
43    {
44        return true;
45    }
46
47    // https://en.wikipedia.org/wiki/Plane_(Unicode)#Supplementary_Ideographic_Plane
48    // https://en.wikipedia.org/wiki/Plane_(Unicode)#Tertiary_Ideographic_Plane
49    unicode_plane(codepoint) == 2 || unicode_plane(codepoint) == 3
50}
51
52macro_rules! unicode_length_type {
53    ($type_name:ident) => {
54        /// A length in code units of the given text encoding. For instance, `Utf8CodeUnitLength`
55        /// is a length in UTF-8 code units (one byte each). `Utf16CodeUnitLength` is a length in
56        /// UTF-16 code units (two bytes each). This type is used to more reliable work with
57        /// lengths in different encodings.
58        #[derive(Clone, Copy, Debug, Default, Eq, MallocSizeOf, Ord, PartialEq, PartialOrd)]
59        pub struct $type_name(pub usize);
60
61        impl $type_name {
62            pub fn zero() -> Self {
63                Self(0)
64            }
65
66            pub fn one() -> Self {
67                Self(1)
68            }
69
70            pub fn saturating_sub(self, value: Self) -> Self {
71                Self(self.0.saturating_sub(value.0))
72            }
73        }
74
75        impl From<u32> for $type_name {
76            fn from(value: u32) -> Self {
77                Self(value as usize)
78            }
79        }
80
81        impl From<isize> for $type_name {
82            fn from(value: isize) -> Self {
83                Self(value as usize)
84            }
85        }
86
87        impl Add for $type_name {
88            type Output = Self;
89            fn add(self, other: Self) -> Self {
90                Self(self.0 + other.0)
91            }
92        }
93
94        impl AddAssign for $type_name {
95            fn add_assign(&mut self, other: Self) {
96                *self = Self(self.0 + other.0)
97            }
98        }
99
100        impl Sub for $type_name {
101            type Output = Self;
102            fn sub(self, value: Self) -> Self {
103                Self(self.0 - value.0)
104            }
105        }
106
107        impl SubAssign for $type_name {
108            fn sub_assign(&mut self, other: Self) {
109                *self = Self(self.0 - other.0)
110            }
111        }
112
113        impl Sum for $type_name {
114            fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
115                iter.fold(Self::zero(), |a, b| Self(a.0 + b.0))
116            }
117        }
118    };
119}
120
121unicode_length_type!(Utf8CodeUnitLength);
122unicode_length_type!(Utf16CodeUnitLength);
123
#[cfg(test)]
mod test {
    use super::*;

    /// Checks `is_cjk` against sample characters that must and must not be classified as CJK.
    #[test]
    fn test_is_cjk() {
        // Test characters from different CJK blocks
        let cjk_samples = ['〇', '㐀', 'あ', 'ア', '㆒', 'ㆣ', '龥', '𰾑', '𰻝'];
        for sample in cjk_samples {
            assert!(is_cjk(sample), "expected {:?} to be CJK", sample);
        }

        // Test characters from outside CJK blocks
        let non_cjk_samples = ['a', '🙂', '©'];
        for sample in non_cjk_samples {
            assert!(!is_cjk(sample), "expected {:?} not to be CJK", sample);
        }
    }
}