idna/lib.rs
1// Copyright 2016 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! This Rust crate implements IDNA
10//! [per the WHATWG URL Standard](https://url.spec.whatwg.org/#idna).
11//!
12//! It also exposes the underlying algorithms from [*Unicode IDNA Compatibility Processing*
13//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
14//! and [Punycode (RFC 3492)](https://tools.ietf.org/html/rfc3492).
15//!
16//! Quoting from [UTS #46’s introduction](http://www.unicode.org/reports/tr46/#Introduction):
17//!
18//! > Initially, domain names were restricted to ASCII characters.
19//! > A system was introduced in 2003 for internationalized domain names (IDN).
20//! > This system is called Internationalizing Domain Names for Applications,
21//! > or IDNA2003 for short.
22//! > This mechanism supports IDNs by means of a client software transformation
23//! > into a format known as Punycode.
24//! > A revision of IDNA was approved in 2010 (IDNA2008).
25//! > This revision has a number of incompatibilities with IDNA2003.
26//! >
27//! > The incompatibilities force implementers of client software,
28//! > such as browsers and emailers,
29//! > to face difficult choices during the transition period
30//! > as registries shift from IDNA2003 to IDNA2008.
31//! > This document specifies a mechanism
32//! > that minimizes the impact of this transition for client software,
33//! > allowing client software to access domains that are valid under either system.
34#![no_std]
35
36// For forwards compatibility
37#[cfg(feature = "std")]
38extern crate std;
39
40extern crate alloc;
41
42#[cfg(not(feature = "alloc"))]
43compile_error!("the `alloc` feature must be enabled");
44
45// Avoid a breaking change if in the future there's a use case for
46// having a Bring-Your-Own-ICU4X-Data constructor for `Uts46` and
47// not also having compiled data in the binary.
48#[cfg(not(feature = "compiled_data"))]
49compile_error!("the `compiled_data` feature must be enabled");
50
51use alloc::borrow::Cow;
52use alloc::string::String;
53pub use uts46::AsciiDenyList;
54use uts46::Uts46;
55
56mod deprecated;
57pub mod punycode;
58pub mod uts46;
59
60#[allow(deprecated)]
61pub use crate::deprecated::{Config, Idna};
62
/// Marker type reporting that UTS #46 processing encountered errors.
///
/// The struct has no fields, so a value only signals that something failed;
/// it carries no detail about which check failed.
#[non_exhaustive]
#[derive(Debug, Default)]
pub struct Errors {}

/// Lets an `Errors` value be turned directly into a failed `Result<(), Errors>`.
impl From<Errors> for Result<(), Errors> {
    fn from(errors: Errors) -> Self {
        Self::Err(errors)
    }
}

// With the `std` feature the trait is named through `std`; without it, the
// same-named trait is taken from `core`.
#[cfg(feature = "std")]
impl std::error::Error for Errors {}

#[cfg(not(feature = "std"))]
impl core::error::Error for Errors {}

/// The `Display` output is identical to the `Debug` output.
impl core::fmt::Display for Errors {
    fn fmt(&self, formatter: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        <Self as core::fmt::Debug>::fmt(self, formatter)
    }
}
85
86/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm;
87/// version returning a `Cow`.
88///
89/// Most applications should be using this function or `domain_to_ascii_from_cow` rather
90/// than the sibling functions, and most applications should pass [`AsciiDenyList::URL`] as
91/// the second argument. Passing [`AsciiDenyList::URL`] as the second argument makes this function also
92/// perform the [forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point)
93/// check in addition to the [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii)
94/// algorithm.
95///
96/// Returns the ASCII representation a domain name,
97/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
98/// and using Punycode as necessary.
99///
100/// This process may fail.
101///
102/// If you have a `&str` instead of `&[u8]`, just call `.as_bytes()` on it before
103/// passing it to this function. It's still preferable to use this function over
104/// the sibling functions that take `&str`.
105pub fn domain_to_ascii_cow(
106 domain: &[u8],
107 ascii_deny_list: AsciiDenyList,
108) -> Result<Cow<'_, str>, Errors> {
109 Uts46::new().to_ascii(
110 domain,
111 ascii_deny_list,
112 uts46::Hyphens::Allow,
113 uts46::DnsLength::Ignore,
114 )
115}
116
117/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm;
118/// version accepting and returning a `Cow`.
119///
120/// Most applications should be using this function or `domain_to_ascii_cow` rather
121/// than the sibling functions, and most applications should pass [`AsciiDenyList::URL`] as
122/// the second argument. Passing [`AsciiDenyList::URL`] as the second argument makes this function also
123/// perform the [forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point)
124/// check in addition to the [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii)
125/// algorithm.
126///
127/// Return the ASCII representation a domain name,
128/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
129/// and using Punycode as necessary.
130///
131/// This process may fail.
132pub fn domain_to_ascii_from_cow(
133 domain: Cow<'_, [u8]>,
134 ascii_deny_list: AsciiDenyList,
135) -> Result<Cow<'_, str>, Errors> {
136 Uts46::new().to_ascii_from_cow(
137 domain,
138 ascii_deny_list,
139 uts46::Hyphens::Allow,
140 uts46::DnsLength::Ignore,
141 )
142}
143
144/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm;
145/// version returning `String` and no ASCII deny list (i.e. _UseSTD3ASCIIRules=false_).
146///
147/// This function exists for backward-compatibility. Consider using [`domain_to_ascii_cow`]
148/// instead.
149///
150/// Return the ASCII representation a domain name,
151/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
152/// and using Punycode as necessary.
153///
154/// This process may fail.
155pub fn domain_to_ascii(domain: &str) -> Result<String, Errors> {
156 domain_to_ascii_cow(domain.as_bytes(), AsciiDenyList::EMPTY).map(|cow| cow.into_owned())
157}
158
159/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm,
160/// with the `beStrict` flag set.
161///
162/// Note that this rejects various real-world names including:
163/// * YouTube CDN nodes
164/// * Some GitHub user pages
165/// * Pseudo-hosts used by various TXT record-based protocols.
166pub fn domain_to_ascii_strict(domain: &str) -> Result<String, Errors> {
167 Uts46::new()
168 .to_ascii(
169 domain.as_bytes(),
170 uts46::AsciiDenyList::STD3,
171 uts46::Hyphens::Check,
172 uts46::DnsLength::Verify,
173 )
174 .map(|cow| cow.into_owned())
175}
176
177/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm;
178/// version returning `String` and no ASCII deny list (i.e. _UseSTD3ASCIIRules=false_).
179///
180/// This function exists for backward-compatibility. Consider using [`Uts46::to_user_interface`]
181/// or [`Uts46::to_unicode`].
182///
183/// Return the Unicode representation of a domain name,
184/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
185/// and decoding Punycode as necessary.
186///
187/// If the second item of the tuple indicates an error, the first item of the tuple
188/// denotes errors using the REPLACEMENT CHARACTERs in order to be able to illustrate
189/// errors to the user. When the second item of the return tuple signals an error,
190/// the first item of the tuple must not be used in a network protocol.
191pub fn domain_to_unicode(domain: &str) -> (String, Result<(), Errors>) {
192 let (cow, result) = Uts46::new().to_unicode(
193 domain.as_bytes(),
194 uts46::AsciiDenyList::EMPTY,
195 uts46::Hyphens::Allow,
196 );
197 (cow.into_owned(), result)
198}