Skip to main content

unicode_categories/
lib.rs

1//! `unicode_categories` is a crate that adds extensions to the
2//! `char` primitive type that allow for a char to be queried
3//! about whether or not it belongs to a particular Unicode category.
4//!
5//! These extensions exist on the `UnicodeCategories` trait, so
6//! by importing it the extensions will be active on all chars:
7//!
8//! ```
9//! use unicode_categories::UnicodeCategories;
10//!
11//! assert!('a'.is_letter_lowercase());
12//! assert!('A'.is_letter_uppercase());
13//! assert!('\n'.is_other_control());
14//! ```
15//!
16//! `UnicodeCategories` is the only item exported
17//! by this crate and contains all of the methods that allow
18//! for category queries.
19
20mod tables;
21
/// Queries about the Unicode general category of a value.
///
/// Implementors provide one predicate per two-letter general category
/// (for example `Ll` or `Zs`). The provided methods (`is_letter`,
/// `is_mark`, ...) combine those predicates into the one-letter major
/// classes by OR-ing the relevant sub-category checks.
pub trait UnicodeCategories : Sized + Copy {

    /// Returns `true` if this value is a member
    /// of the "Other, Control" (Cc) category.
    fn is_other_control(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Other, Format" (Cf) category.
    fn is_other_format(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Other, Private Use" (Co) category.
    fn is_other_private_use(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Letter, Lowercase" (Ll) category.
    fn is_letter_lowercase(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Letter, Modifier" (Lm) category.
    fn is_letter_modifier(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Letter, Other" (Lo) category.
    fn is_letter_other(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Letter, Titlecase" (Lt) category.
    fn is_letter_titlecase(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Letter, Uppercase" (Lu) category.
    fn is_letter_uppercase(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Mark, Spacing Combining" (Mc) category.
    fn is_mark_spacing_combining(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Mark, Enclosing" (Me) category.
    fn is_mark_enclosing(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Mark, Nonspacing" (Mn) category.
    fn is_mark_nonspacing(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Number, Decimal Digit" (Nd) category.
    fn is_number_decimal_digit(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Number, Letter" (Nl) category.
    fn is_number_letter(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Number, Other" (No) category.
    fn is_number_other(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Punctuation, Connector" (Pc) category.
    fn is_punctuation_connector(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Punctuation, Dash" (Pd) category.
    fn is_punctuation_dash(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Punctuation, Close" (Pe) category.
    fn is_punctuation_close(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Punctuation, Final Quote" (Pf) category.
    fn is_punctuation_final_quote(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Punctuation, Initial Quote" (Pi) category.
    fn is_punctuation_initial_quote(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Punctuation, Other" (Po) category.
    fn is_punctuation_other(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Punctuation, Open" (Ps) category.
    fn is_punctuation_open(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Symbol, Currency" (Sc) category.
    fn is_symbol_currency(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Symbol, Modifier" (Sk) category.
    fn is_symbol_modifier(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Symbol, Math" (Sm) category.
    fn is_symbol_math(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Symbol, Other" (So) category.
    fn is_symbol_other(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Separator, Line" (Zl) category.
    fn is_separator_line(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Separator, Paragraph" (Zp) category.
    fn is_separator_paragraph(self) -> bool;

    /// Returns `true` if this value is a member
    /// of the "Separator, Space" (Zs) category.
    fn is_separator_space(self) -> bool;

    /// Returns `true` if this value is a member of one of the
    /// "Other" categories checked by this trait: Cc, Cf, or Co.
    ///
    /// Unassigned code points (Cn) are not checked, because the trait
    /// has no predicate for them. Surrogates (Cs) cannot be `char`s in
    /// Rust, so they are not included either.
    #[inline]
    fn is_other(self) -> bool {
        self.is_other_control()
            || self.is_other_format()
            || self.is_other_private_use()
    }

    /// Returns `true` if this value is a member of a
    /// "Letter" category: Ll, Lm, Lo, Lt, or Lu.
    ///
    /// The "Cased Letter" grouping (Lc) is the union of Ll, Lt and Lu,
    /// so it is covered by those three checks.
    #[inline]
    fn is_letter(self) -> bool {
        self.is_letter_lowercase()
            || self.is_letter_modifier()
            || self.is_letter_other()
            || self.is_letter_titlecase()
            || self.is_letter_uppercase()
    }

    /// Returns `true` if this value is a member of a
    /// "Mark" category: Mc, Me, or Mn.
    #[inline]
    fn is_mark(self) -> bool {
        self.is_mark_spacing_combining()
            || self.is_mark_enclosing()
            || self.is_mark_nonspacing()
    }

    /// Returns `true` if this value is a member of a
    /// "Number" category: Nd, Nl, or No.
    #[inline]
    fn is_number(self) -> bool {
        self.is_number_decimal_digit()
            || self.is_number_letter()
            || self.is_number_other()
    }

    /// Returns `true` if this value is a member of a
    /// "Punctuation" category: Pc, Pd, Pe, Pf, Pi, Po, or Ps.
    #[inline]
    fn is_punctuation(self) -> bool {
        // Each sub-category is consulted at most once; `||` stops at the
        // first predicate that matches.
        self.is_punctuation_connector()
            || self.is_punctuation_dash()
            || self.is_punctuation_close()
            || self.is_punctuation_final_quote()
            || self.is_punctuation_initial_quote()
            || self.is_punctuation_other()
            || self.is_punctuation_open()
    }

    /// Returns `true` if this value is a member of a
    /// "Symbol" category: Sc, Sk, Sm, or So.
    #[inline]
    fn is_symbol(self) -> bool {
        self.is_symbol_currency()
            || self.is_symbol_modifier()
            || self.is_symbol_math()
            || self.is_symbol_other()
    }

    /// Returns `true` if this value is a member of a
    /// "Separator" category: Zl, Zp, or Zs.
    #[inline]
    fn is_separator(self) -> bool {
        self.is_separator_line()
            || self.is_separator_paragraph()
            || self.is_separator_space()
    }
}
209
/// Reports whether `target` occurs in `table`.
///
/// The lookup is a binary search, so `table` must already be sorted in
/// ascending order for the answer to be meaningful.
fn table_binary_search(target: char, table: &'static [char]) -> bool {
    table
        .binary_search_by(|entry| entry.cmp(&target))
        .is_ok()
}
213
214impl UnicodeCategories for char {
215    #[inline]
216    fn is_other_control(self) -> bool {
217        table_binary_search(self, tables::OTHER_CONTROL)
218    }
219
220    #[inline]
221    fn is_other_format(self) -> bool {
222        table_binary_search(self, tables::OTHER_FORMAT)
223    }
224
225    #[inline]
226    fn is_other_private_use(self) -> bool {
227        match self {
228            // Private Use
229            '\u{E000}'...'\u{F8FF}' => true,
230            // Plane 15, Private Use
231            '\u{F0000}'...'\u{FFFFD}' => true,
232            // Plane 16, private Use
233            '\u{100000}'...'\u{10FFFD}' => true,
234            _ => table_binary_search(self, tables::OTHER_PRIVATE_USE)
235        }
236    }
237
238    #[inline]
239    fn is_letter_lowercase(self) -> bool {
240        table_binary_search(self, tables::LETTER_LOWERCASED)
241    }
242
243    #[inline]
244    fn is_letter_modifier(self) -> bool {
245        table_binary_search(self, tables::LETTER_MODIFIER)
246    }
247
248    #[inline]
249    fn is_letter_other(self) -> bool {
250        match self {
251            // CJK Ideograph Extension A
252            '\u{3400}'...'\u{4DB5}' => true,
253            // CJK Ideograph
254            '\u{4E00}'...'\u{9FD5}' => true,
255            // Hangul Syllable
256            '\u{AC00}'...'\u{D7A3}' => true,
257            // Tangut Ideograph
258            '\u{17000}'...'\u{187EC}' => true,
259            // CJK Ideograph Extension B
260            '\u{20000}'...'\u{2A6D6}' => true,
261            // CJK Ideograph Extension C
262            '\u{2A700}'...'\u{2B734}' => true,
263            // CJK Ideograph Extension D
264            '\u{2B740}'...'\u{2B81D}' => true,
265            // CJK Ideograph Extension E
266            '\u{2B820}'...'\u{2CEA1}' => true,
267            _ => table_binary_search(self, tables::LETTER_OTHER)
268        }
269    }
270
271    #[inline]
272    fn is_letter_titlecase(self) -> bool {
273        table_binary_search(self, tables::LETTER_TITLECASE)
274    }
275
276    #[inline]
277    fn is_letter_uppercase(self) -> bool {
278        table_binary_search(self, tables::LETTER_UPPERCASE)
279    }
280
281    #[inline]
282    fn is_mark_spacing_combining(self) -> bool {
283        table_binary_search(self, tables::MARK_SPACE_COMBINING)
284    }
285
286    #[inline]
287    fn is_mark_enclosing(self) -> bool {
288        table_binary_search(self, tables::MARK_ENCLOSING)
289    }
290
291    #[inline]
292    fn is_mark_nonspacing(self) -> bool {
293        table_binary_search(self, tables::MARK_NONSPACING)
294    }
295
296    #[inline]
297    fn is_number_decimal_digit(self) -> bool {
298        table_binary_search(self, tables::NUMBER_DECIMAL_DIGIT)
299    }
300
301    #[inline]
302    fn is_number_letter(self) -> bool {
303        table_binary_search(self, tables::NUMBER_LETTER)
304    }
305
306    #[inline]
307    fn is_number_other(self) -> bool {
308        table_binary_search(self, tables::NUMBER_OTHER)
309    }
310
311    #[inline]
312    fn is_punctuation_connector(self) -> bool {
313        table_binary_search(self, tables::PUNCTUATION_CONNECTOR)
314    }
315
316    #[inline]
317    fn is_punctuation_dash(self) -> bool {
318        table_binary_search(self, tables::PUNCTUATION_DASH)
319    }
320
321    #[inline]
322    fn is_punctuation_close(self) -> bool {
323        table_binary_search(self, tables::PUNCTUATION_CLOSE)
324    }
325
326    #[inline]
327    fn is_punctuation_final_quote(self) -> bool {
328        table_binary_search(self, tables::PUNCTUATION_FINAL_QUOTE)
329    }
330
331    #[inline]
332    fn is_punctuation_initial_quote(self) -> bool {
333        table_binary_search(self, tables::PUNCTUATION_INITIAL_QUOTE)
334    }
335
336    #[inline]
337    fn is_punctuation_other(self) -> bool {
338        table_binary_search(self, tables::PUNCTUATION_OTHER)
339    }
340
341    #[inline]
342    fn is_punctuation_open(self) -> bool {
343        table_binary_search(self, tables::PUNCTUATION_OPEN)
344    }
345
346    #[inline]
347    fn is_symbol_currency(self) -> bool {
348        table_binary_search(self, tables::SYMBOL_CURRENCY)
349    }
350
351    #[inline]
352    fn is_symbol_modifier(self) -> bool {
353        table_binary_search(self, tables::SYMBOL_MODIFIER)
354    }
355
356    #[inline]
357    fn is_symbol_math(self) -> bool {
358        table_binary_search(self, tables::SYMBOL_MATH)
359    }
360
361    #[inline]
362    fn is_symbol_other(self) -> bool {
363        table_binary_search(self, tables::SYMBOL_OTHER)
364    }
365
366    #[inline]
367    fn is_separator_line(self) -> bool {
368        table_binary_search(self, tables::SEPARATOR_LINE)
369    }
370
371    #[inline]
372    fn is_separator_paragraph(self) -> bool {
373        table_binary_search(self, tables::SEPARATOR_PARAGRAPH)
374    }
375
376    #[inline]
377    fn is_separator_space(self) -> bool {
378        table_binary_search(self, tables::SEPARATOR_SPACE)
379    }
380}
381
#[cfg(test)]
mod tests {
    use super::UnicodeCategories;

    // NUL and DEL must be reported as Cc; a plain ASCII letter must not.
    #[test]
    fn is_other_control() {
        let nul = '\0';
        let del = '\u{007F}';
        let ascii_letter = 'f';

        assert!(UnicodeCategories::is_other_control(nul));
        assert!(UnicodeCategories::is_other_control(del));
        assert!(!UnicodeCategories::is_other_control(ascii_letter));
    }

    // An Arabic sign literal must be reported as Cf; an ASCII digit must not.
    #[test]
    fn is_other_format() {
        let format_sign = '؁';
        let ascii_digit = '0';

        assert!(UnicodeCategories::is_other_format(format_sign));
        assert!(!UnicodeCategories::is_other_format(ascii_digit));
    }

    // U+F8FF must be reported as Co; an ASCII letter must not.
    #[test]
    fn is_other_private_use() {
        let private_use = '\u{F8FF}';
        let ascii_letter = 'n';

        assert!(UnicodeCategories::is_other_private_use(private_use));
        assert!(!UnicodeCategories::is_other_private_use(ascii_letter));
    }

    // A lowercase ASCII letter is Ll; an uppercase one is not.
    #[test]
    fn is_letter_lowercase() {
        let lower = 'q';
        let upper = 'N';

        assert!(UnicodeCategories::is_letter_lowercase(lower));
        assert!(!UnicodeCategories::is_letter_lowercase(upper));
    }

    // A modifier letter is Lm; an ordinary lowercase letter is not.
    #[test]
    fn is_letter_modifier() {
        let modifier = 'ˢ';
        let plain = 'm';

        assert!(UnicodeCategories::is_letter_modifier(modifier));
        assert!(!UnicodeCategories::is_letter_modifier(plain));
    }

    // A CJK ideograph must be reported as Lo.
    #[test]
    fn is_letter_range() {
        let ideograph = '界';

        assert!(UnicodeCategories::is_letter_other(ideograph));
    }
}