#![cfg_attr(not(test), no_std)]
#![cfg_attr(feature = "bench", feature(test))]
mod tables;
use core::convert::TryFrom;
use core::fmt;
use core::u64;
pub use tables::script_extensions;
use tables::{get_script, get_script_extension, NEXT_SCRIPT};
pub use tables::{Script, UNICODE_VERSION};
impl Script {
pub fn full_name(self) -> &'static str {
self.inner_full_name()
}
pub fn from_full_name(input: &str) -> Option<Self> {
Self::inner_from_full_name(input)
}
pub fn short_name(self) -> &'static str {
self.inner_short_name()
}
pub fn from_short_name(input: &str) -> Option<Self> {
Self::inner_from_short_name(input)
}
pub fn is_recommended(self) -> bool {
use Script::*;
match self {
Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari
| Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew
| Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya
| Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true,
_ => false,
}
}
}
impl From<Script> for ScriptExtension {
fn from(script: Script) -> Self {
if script == Script::Common {
ScriptExtension::new_common()
} else if script == Script::Inherited {
ScriptExtension::new_inherited()
} else if script == Script::Unknown {
ScriptExtension::new_unknown()
} else {
let mut first = 0;
let mut second = 0;
let mut third = 0;
let bit = script as u8;
if bit < 64 {
first = 1 << bit as u64;
} else if bit < 128 {
second = 1 << (bit - 64) as u64;
} else {
third = 1 << (bit - 128) as u32;
}
ScriptExtension::new(first, second, third)
}
}
}
impl TryFrom<ScriptExtension> for Script {
type Error = ();
fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
if ext.is_common_or_inherited() {
if ext.common {
Ok(Script::Common)
} else {
Ok(Script::Inherited)
}
} else if ext.is_empty() {
Ok(Script::Unknown)
} else {
let fo = ext.first.count_ones();
let so = ext.second.count_ones();
let to = ext.third.count_ones();
if fo == 1 && so == 0 && to == 0 {
Ok(Script::for_integer(ext.first.trailing_zeros() as u8))
} else if fo == 0 && so == 1 && to == 0 {
Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8))
} else if fo == 0 && so == 0 && to == 1 {
Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8))
} else {
Err(())
}
}
}
}
impl Default for Script {
fn default() -> Self {
Script::Common
}
}
impl From<char> for Script {
fn from(o: char) -> Self {
o.script()
}
}
impl fmt::Display for Script {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.full_name())
}
}
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub struct ScriptExtension {
first: u64,
second: u64,
third: u64,
common: bool,
}
impl ScriptExtension {
const THIRD_MAX: u64 = ((1 << (NEXT_SCRIPT - 128)) - 1);
pub(crate) const fn new(first: u64, second: u64, third: u64) -> Self {
ScriptExtension {
first,
second,
third,
common: false,
}
}
pub(crate) const fn new_common() -> Self {
ScriptExtension {
first: u64::MAX,
second: u64::MAX,
third: Self::THIRD_MAX,
common: true,
}
}
pub(crate) const fn new_inherited() -> Self {
ScriptExtension {
first: u64::MAX,
second: u64::MAX,
third: Self::THIRD_MAX,
common: false,
}
}
pub(crate) const fn new_unknown() -> Self {
ScriptExtension {
first: 0,
second: 0,
third: 0,
common: false,
}
}
const fn is_common_or_inherited(self) -> bool {
(self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX)
}
pub const fn is_common(self) -> bool {
self.is_common_or_inherited() & self.common
}
pub const fn is_inherited(self) -> bool {
self.is_common_or_inherited() & !self.common
}
pub const fn is_empty(self) -> bool {
(self.first == 0) & (self.second == 0) & (self.third == 0)
}
pub fn len(self) -> usize {
if self.is_common_or_inherited() {
1
} else {
(self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize
}
}
pub fn intersect_with(&mut self, other: Self) {
*self = self.intersection(other)
}
pub const fn intersection(self, other: Self) -> Self {
let first = self.first & other.first;
let second = self.second & other.second;
let third = self.third & other.third;
let common = self.common & other.common;
ScriptExtension {
first,
second,
third,
common,
}
}
pub const fn union(self, other: Self) -> Self {
let first = self.first | other.first;
let second = self.second | other.second;
let third = self.third | other.third;
let common = self.common | other.common;
ScriptExtension {
first,
second,
third,
common,
}
}
pub fn contains_script(self, script: Script) -> bool {
!self.intersection(script.into()).is_empty()
}
pub fn for_str(x: &str) -> Self {
let mut ext = ScriptExtension::default();
for ch in x.chars() {
ext.intersect_with(ch.into());
}
ext
}
pub fn iter(self) -> ScriptIterator {
ScriptIterator { ext: self }
}
}
impl Default for ScriptExtension {
fn default() -> Self {
ScriptExtension::new_common()
}
}
impl From<char> for ScriptExtension {
fn from(o: char) -> Self {
o.script_extension()
}
}
impl From<&'_ str> for ScriptExtension {
fn from(o: &'_ str) -> Self {
Self::for_str(o)
}
}
impl fmt::Debug for ScriptExtension {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "ScriptExtension(")?;
fmt::Display::fmt(self, f)?;
write!(f, ")")
}
}
impl fmt::Display for ScriptExtension {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.is_common() {
write!(f, "Common")?;
} else if self.is_inherited() {
write!(f, "Inherited")?;
} else if self.is_empty() {
write!(f, "Unknown")?;
} else {
let mut first = true;
for script in self.iter() {
if !first {
write!(f, " + ")?;
first = false;
}
script.full_name().fmt(f)?;
}
}
Ok(())
}
}
pub trait UnicodeScript {
fn script(&self) -> Script;
fn script_extension(&self) -> ScriptExtension;
}
impl UnicodeScript for char {
fn script(&self) -> Script {
get_script(*self).unwrap_or(Script::Unknown)
}
fn script_extension(&self) -> ScriptExtension {
get_script_extension(*self).unwrap_or_else(|| self.script().into())
}
}
pub struct ScriptIterator {
ext: ScriptExtension,
}
impl Iterator for ScriptIterator {
type Item = Script;
fn next(&mut self) -> Option<Script> {
if self.ext.is_common_or_inherited() {
let common = self.ext.common;
self.ext = ScriptExtension::new_unknown();
if common {
Some(Script::Common)
} else {
Some(Script::Inherited)
}
} else if self.ext.first != 0 {
let bit = self.ext.first.trailing_zeros();
self.ext.first &= !(1 << bit);
Some(Script::for_integer(bit as u8))
} else if self.ext.second != 0 {
let bit = self.ext.second.trailing_zeros();
self.ext.second &= !(1 << bit);
Some(Script::for_integer(64 + bit as u8))
} else if self.ext.third != 0 {
let bit = self.ext.third.trailing_zeros();
self.ext.third &= !(1 << bit);
Some(Script::for_integer(128 + bit as u8))
} else {
None
}
}
}
#[cfg(test)]
mod tests {
use crate::*;
use std::collections::HashSet;
use std::convert::TryInto;
#[cfg(feature = "bench")]
use test::bench::Bencher;
#[cfg(feature = "bench")]
extern crate test;
#[test]
fn test_conversion() {
let mut seen_scripts = HashSet::new();
let mut seen_exts = HashSet::new();
for bit in 0..NEXT_SCRIPT {
let script = Script::for_integer(bit);
let ext = script.into();
if seen_scripts.contains(&script) {
panic!("Found script {:?} twice!", script)
}
if seen_exts.contains(&ext) {
panic!("Found extension {:?} twice!", ext)
}
seen_scripts.insert(script);
seen_exts.insert(ext);
assert_eq!(script as u8, bit);
assert!(!ScriptExtension::new_common().intersection(ext).is_empty());
assert!(!ScriptExtension::new_inherited()
.intersection(ext)
.is_empty());
assert!(ScriptExtension::new_unknown().intersection(ext).is_empty());
assert_eq!(ext.iter().collect::<Vec<_>>(), vec![script]);
assert_eq!(Ok(script), ext.try_into());
}
}
#[test]
fn test_specific() {
let s = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";
let ext = ScriptExtension::for_str(s);
assert_eq!(ext, script_extensions::DEVA);
println!(
"{:?}",
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
);
println!(
"{:?}",
ext.intersection(
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
)
);
assert!(!ext
.intersection(script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH)
.is_empty());
let u = ext.union(Script::Dogra.into());
assert_eq!(
u.intersection(
script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
),
u
);
}
#[test]
fn test_specific_ext() {
let ext = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;
let all: HashSet<_> = ext.iter().collect();
for bit in 0..NEXT_SCRIPT {
let script = Script::for_integer(bit);
if all.contains(&script) {
assert!(ext.contains_script(script))
} else {
assert!(!ext.contains_script(script))
}
}
assert!(ext.contains_script(Script::Devanagari));
assert!(ext.contains_script(Script::Dogra));
assert!(ext.contains_script(Script::Gujarati));
assert!(ext.contains_script(Script::Gurmukhi));
assert!(ext.contains_script(Script::Khojki));
assert!(ext.contains_script(Script::Kaithi));
assert!(ext.contains_script(Script::Mahajani));
assert!(ext.contains_script(Script::Modi));
assert!(ext.contains_script(Script::Khudawadi));
assert!(ext.contains_script(Script::Takri));
assert!(ext.contains_script(Script::Tirhuta));
let scr: Result<Script, _> = ext.try_into();
assert!(scr.is_err());
}
#[cfg(feature = "bench")]
#[bench]
fn bench_script_intersection(b: &mut Bencher) {
b.iter(|| {
let script = test::black_box(Script::Devanagari);
let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
test::black_box(ext.intersection(script.into()));
})
}
#[cfg(feature = "bench")]
#[bench]
fn bench_ext_to_script(b: &mut Bencher) {
let ext: ScriptExtension = Script::Devanagari.into();
b.iter(|| {
let ext = test::black_box(ext);
let script: Result<Script, _> = ext.try_into();
let _ = test::black_box(script);
})
}
#[cfg(feature = "bench")]
#[bench]
fn bench_script_to_ext(b: &mut Bencher) {
b.iter(|| {
let script = test::black_box(Script::Devanagari);
let ext: ScriptExtension = script.into();
test::black_box(ext);
})
}
#[cfg(feature = "bench")]
#[bench]
fn bench_ext_intersection(b: &mut Bencher) {
b.iter(|| {
let e1 = test::black_box(script_extensions::ARAB_GARA_NKOO_ROHG_SYRC_THAA_YEZI);
let e2 = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
test::black_box(e2.intersection(e1));
})
}
#[cfg(feature = "bench")]
#[bench]
fn bench_to_vec(b: &mut Bencher) {
b.iter(|| {
let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
test::black_box(ext.iter().collect::<Vec<_>>());
})
}
#[cfg(feature = "bench")]
#[bench]
fn bench_string_ext(b: &mut Bencher) {
b.iter(|| {
let s = test::black_box("सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.");
test::black_box(ScriptExtension::for_str(s));
})
}
}