1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
use ucd::{Codepoint, UnicodeBlock};
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CompressionMode {
CompressNone,
CompressWhitespace,
CompressWhitespaceNewline,
DiscardNewline,
}
// ported from Gecko's nsTextFrameUtils::TransformText.
//
// High level TODOs:
//
// * Issue #113: consider incoming text state (arabic, etc)
// and propagate outgoing text state (dual of above)
//
// * Issue #114: record skipped and kept chars for mapping original to new text
//
// * Untracked: various edge cases for bidi, CJK, etc.
pub fn transform_text(
text: &str,
mode: CompressionMode,
incoming_whitespace: bool,
output_text: &mut String,
) -> bool {
let out_whitespace = match mode {
CompressionMode::CompressNone | CompressionMode::DiscardNewline => {
for ch in text.chars() {
if is_discardable_char(ch, mode) {
// TODO: record skipped char
} else {
// TODO: record kept char
if ch == '\t' {
// TODO: set "has tab" flag
}
output_text.push(ch);
}
}
false
},
CompressionMode::CompressWhitespace | CompressionMode::CompressWhitespaceNewline => {
let mut in_whitespace: bool = incoming_whitespace;
for ch in text.chars() {
// TODO: discard newlines between CJK chars
let mut next_in_whitespace: bool = is_in_whitespace(ch, mode);
if !next_in_whitespace {
if is_always_discardable_char(ch) {
// revert whitespace setting, since this char was discarded
next_in_whitespace = in_whitespace;
// TODO: record skipped char
} else {
// TODO: record kept char
output_text.push(ch);
}
} else {
/* next_in_whitespace; possibly add a space char */
if in_whitespace {
// TODO: record skipped char
} else {
// TODO: record kept char
output_text.push(' ');
}
}
// save whitespace context for next char
in_whitespace = next_in_whitespace;
} /* /for str::each_char */
in_whitespace
},
};
return out_whitespace;
fn is_in_whitespace(ch: char, mode: CompressionMode) -> bool {
match (ch, mode) {
(' ', _) => true,
('\t', _) => true,
('\n', CompressionMode::CompressWhitespaceNewline) => true,
(_, _) => false,
}
}
fn is_discardable_char(ch: char, mode: CompressionMode) -> bool {
if is_always_discardable_char(ch) {
return true;
}
match mode {
CompressionMode::DiscardNewline | CompressionMode::CompressWhitespaceNewline => {
ch == '\n'
},
_ => false,
}
}
fn is_always_discardable_char(ch: char) -> bool {
// TODO: check for soft hyphens.
is_bidi_control(ch)
}
}
pub fn float_to_fixed(before: usize, f: f64) -> i32 {
((1i32 << before) as f64 * f) as i32
}
pub fn fixed_to_float(before: usize, f: i32) -> f64 {
f as f64 * 1.0f64 / ((1i32 << before) as f64)
}
pub fn is_bidi_control(c: char) -> bool {
matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}' | '\u{200E}' | '\u{200F}' | '\u{061C}')
}
pub fn unicode_plane(codepoint: char) -> u32 {
(codepoint as u32) >> 16
}
pub fn is_cjk(codepoint: char) -> bool {
if let Some(block) = codepoint.block() {
match block {
UnicodeBlock::CJKRadicalsSupplement |
UnicodeBlock::KangxiRadicals |
UnicodeBlock::IdeographicDescriptionCharacters |
UnicodeBlock::CJKSymbolsandPunctuation |
UnicodeBlock::Hiragana |
UnicodeBlock::Katakana |
UnicodeBlock::Bopomofo |
UnicodeBlock::HangulCompatibilityJamo |
UnicodeBlock::Kanbun |
UnicodeBlock::BopomofoExtended |
UnicodeBlock::CJKStrokes |
UnicodeBlock::KatakanaPhoneticExtensions |
UnicodeBlock::EnclosedCJKLettersandMonths |
UnicodeBlock::CJKCompatibility |
UnicodeBlock::CJKUnifiedIdeographsExtensionA |
UnicodeBlock::YijingHexagramSymbols |
UnicodeBlock::CJKUnifiedIdeographs |
UnicodeBlock::CJKCompatibilityIdeographs |
UnicodeBlock::CJKCompatibilityForms |
UnicodeBlock::HalfwidthandFullwidthForms => return true,
_ => {},
}
}
// https://en.wikipedia.org/wiki/Plane_(Unicode)#Supplementary_Ideographic_Plane
// https://en.wikipedia.org/wiki/Plane_(Unicode)#Tertiary_Ideographic_Plane
unicode_plane(codepoint) == 2 || unicode_plane(codepoint) == 3
}