base/
text.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5use std::iter::Sum;
6use std::ops::{Add, AddAssign, Range, Sub, SubAssign};
7
8use malloc_size_of_derive::MallocSizeOf;
9
10pub use crate::unicode_block::{UnicodeBlock, UnicodeBlockMethod};
11
/// Returns `true` when `c` is one of the Unicode bidirectional formatting characters.
///
/// The accepted set, in code point order, is U+061C, U+200E..=U+200F,
/// U+202A..=U+202E and U+2066..=U+2069.
pub fn is_bidi_control(c: char) -> bool {
    matches!(
        c,
        // Arabic letter mark.
        '\u{061C}'
            // Left-to-right and right-to-left marks.
            | '\u{200E}'..='\u{200F}'
            // Embedding and override controls, plus pop directional formatting.
            | '\u{202A}'..='\u{202E}'
            // Isolate controls, plus pop directional isolate.
            | '\u{2066}'..='\u{2069}'
    )
}
15
/// Returns the index of the Unicode plane containing `codepoint`.
///
/// Each plane spans 0x10000 code points, so the plane index is the scalar value with its
/// low 16 bits shifted away.
pub fn unicode_plane(codepoint: char) -> u32 {
    let scalar_value = u32::from(codepoint);
    scalar_value >> 16
}
19
20pub fn is_cjk(codepoint: char) -> bool {
21    if let Some(
22        UnicodeBlock::CJKRadicalsSupplement |
23        UnicodeBlock::KangxiRadicals |
24        UnicodeBlock::IdeographicDescriptionCharacters |
25        UnicodeBlock::CJKSymbolsandPunctuation |
26        UnicodeBlock::Hiragana |
27        UnicodeBlock::Katakana |
28        UnicodeBlock::Bopomofo |
29        UnicodeBlock::HangulCompatibilityJamo |
30        UnicodeBlock::Kanbun |
31        UnicodeBlock::BopomofoExtended |
32        UnicodeBlock::CJKStrokes |
33        UnicodeBlock::KatakanaPhoneticExtensions |
34        UnicodeBlock::EnclosedCJKLettersandMonths |
35        UnicodeBlock::CJKCompatibility |
36        UnicodeBlock::CJKUnifiedIdeographsExtensionA |
37        UnicodeBlock::YijingHexagramSymbols |
38        UnicodeBlock::CJKUnifiedIdeographs |
39        UnicodeBlock::CJKCompatibilityIdeographs |
40        UnicodeBlock::CJKCompatibilityForms |
41        UnicodeBlock::HalfwidthandFullwidthForms,
42    ) = codepoint.block()
43    {
44        return true;
45    }
46
47    // https://en.wikipedia.org/wiki/Plane_(Unicode)#Supplementary_Ideographic_Plane
48    // https://en.wikipedia.org/wiki/Plane_(Unicode)#Tertiary_Ideographic_Plane
49    unicode_plane(codepoint) == 2 || unicode_plane(codepoint) == 3
50}
51
macro_rules! unicode_length_type {
    ($type_name:ident) => {
        /// A count of code units in one particular text encoding. `Utf8CodeUnitLength`, for
        /// example, counts UTF-8 code units (one byte each), whereas `Utf16CodeUnitLength`
        /// counts UTF-16 code units (two bytes each). Giving each encoding its own type makes
        /// it harder to mix up lengths that were measured in different encodings.
        #[derive(Clone, Copy, Debug, Default, Eq, MallocSizeOf, Ord, PartialEq, PartialOrd)]
        pub struct $type_name(pub usize);

        impl $type_name {
            /// A length of zero code units.
            pub fn zero() -> Self {
                $type_name(0)
            }

            /// A length of exactly one code unit.
            pub fn one() -> Self {
                $type_name(1)
            }

            /// Strips the wrapper from both ends of a range, yielding a plain `usize` range.
            pub fn unwrap_range(byte_range: Range<Self>) -> Range<usize> {
                let Range { start, end } = byte_range;
                start.0..end.0
            }

            /// Subtracts `value` from `self`, clamping the result at zero instead of
            /// underflowing.
            pub fn saturating_sub(self, value: Self) -> Self {
                let remaining = self.0.saturating_sub(value.0);
                $type_name(remaining)
            }
        }

        impl From<u32> for $type_name {
            fn from(value: u32) -> Self {
                $type_name(value as usize)
            }
        }

        impl From<isize> for $type_name {
            // NOTE: this is an `as` cast, so a negative `value` wraps around to a very large
            // length rather than being rejected.
            fn from(value: isize) -> Self {
                $type_name(value as usize)
            }
        }

        impl Add for $type_name {
            type Output = Self;

            /// Adds the two underlying counts with plain `usize` addition.
            fn add(self, other: Self) -> Self {
                let (Self(lhs), Self(rhs)) = (self, other);
                $type_name(lhs + rhs)
            }
        }

        impl AddAssign for $type_name {
            fn add_assign(&mut self, other: Self) {
                self.0 += other.0;
            }
        }

        impl Sub for $type_name {
            type Output = Self;

            /// Subtracts with plain `usize` subtraction; use `saturating_sub` when `value`
            /// may exceed `self`.
            fn sub(self, value: Self) -> Self {
                let (Self(lhs), Self(rhs)) = (self, value);
                $type_name(lhs - rhs)
            }
        }

        impl SubAssign for $type_name {
            fn sub_assign(&mut self, other: Self) {
                self.0 -= other.0;
            }
        }

        impl Sum for $type_name {
            /// Totals every length produced by `iter`, starting from zero.
            fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
                iter.fold(Self::zero(), |total, item| total + item)
            }
        }
    };
}
124
// Expand the macro once per encoding. Each expansion produces an independent tuple struct
// around `usize`, together with the constructors, conversions and arithmetic impls that
// `unicode_length_type!` defines, so the two lengths are distinct types.
unicode_length_type!(Utf8CodeUnitLength);
unicode_length_type!(Utf16CodeUnitLength);
127
#[cfg(test)]
mod test {
    use super::*;

    /// Checks `is_cjk` against a fixed table of characters that must and must not be
    /// classified as CJK.
    #[test]
    fn test_is_cjk() {
        // Characters taken from different CJK blocks.
        let cjk = ['〇', '㐀', 'あ', 'ア', '㆒', 'ㆣ', '龥', '𰾑', '𰻝'];
        for c in cjk {
            assert!(is_cjk(c), "expected {:?} to be classified as CJK", c);
        }

        // Characters from outside the CJK blocks.
        let not_cjk = ['a', '🙂', '©'];
        for c in not_cjk {
            assert!(!is_cjk(c), "expected {:?} not to be classified as CJK", c);
        }
    }
}