bpaf/
arg.rs

1use std::ffi::{OsStr, OsString};
2
/// A single command line item after preprocessing
///
/// The [`OsString`] stored in `Short` and `Long` is the original command line item
/// the variant was created from; it is kept around for error messages.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) enum Arg {
    /// Short flag, such as `-f`
    ///
    /// Fields: flag name, a bool telling if the following item is also part of this
    /// `Short`, and the original command line item.
    ///
    // NOTE(review): the original description of the bool was cut off mid-sentence
    // ("... part of this Short (created") - confirm the exact meaning with the code
    // that constructs this variant.
    Short(char, bool, OsString),

    /// Long flag, such as `--flag`
    ///
    /// Fields: flag name, a bool telling if the item looked like `--key=val` or not,
    /// and the original command line item.
    Long(String, bool, OsString),

    /// The "val" part of `--key=val`, `-k=val` or `-kval`
    ArgWord(OsString),

    /// A separate word: can be a command, a positional item or a separate argument
    /// to a flag, for example `hello`
    ///
    /// Can start with `-` or `--` and doesn't have to be valid utf8.
    Word(OsString),

    /// A separate word that goes after `--`, strictly positional
    ///
    /// Can start with `-` or `--` and doesn't have to be valid utf8.
    PosWord(OsString),
}
32
33impl Arg {
34    pub(crate) fn os_str(&self) -> &OsStr {
35        match self {
36            Arg::Short(_, _, s)
37            | Arg::Long(_, _, s)
38            | Arg::ArgWord(s)
39            | Arg::Word(s)
40            | Arg::PosWord(s) => s.as_ref(),
41        }
42    }
43
44    pub(crate) fn match_short(&self, val: char) -> bool {
45        match self {
46            Arg::Short(s, _, _) => *s == val,
47            Arg::ArgWord(_) | Arg::Long(_, _, _) | Arg::Word(_) | Arg::PosWord(_) => false,
48        }
49    }
50
51    pub(crate) fn match_long(&self, val: &str) -> bool {
52        match self {
53            Arg::Long(s, _, _) => *s == val,
54            Arg::Short(_, _, _) | Arg::ArgWord(_) | Arg::Word(_) | Arg::PosWord(_) => false,
55        }
56    }
57}
58
59// short flag disambiguations:
60//
61// Short flags | short arg
62// No          | No        | no problem
63// Yes         | No        | use flag
64// No          | Yes       | use arg
65// Yes         | Yes       | ask user?
66//
67// -a  - just a regular short flag: "-a"
68// -abc - assuming there are short flags a, b and c: "-a -b -c", assuming utf8 values AND there's no argument -a
69// -abc - assuming there's no -a -b -c: "-a bc"
// -abc - assuming both short a b c AND there's argument -a - need to disambiguate on a context level
71//
// 1. parse argument into ambiguous representation that can store both short flags and argument
// 2. collect short flag/arg when entering the subparser
74// 3. when reaching ambi
75//
76
77impl std::fmt::Display for Arg {
78    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79        match self {
80            Arg::Short(s, _, _) => write!(f, "-{}", s),
81            Arg::Long(l, _, _) => write!(f, "--{}", l),
82            Arg::ArgWord(w) | Arg::Word(w) | Arg::PosWord(w) => {
83                write!(f, "{}", w.to_string_lossy())
84            }
85        }
86    }
87}
88
/// Kind of a named command line item, decided by the number of leading dashes
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum ArgType {
    /// item starts with a single dash: `-f`
    Short,
    /// item starts with two dashes: `--flag`
    Long,
}
94
95/// split [`OsString`] into argument specific bits
96///
97/// takes a possibly non-utf8 string looking like "--name=value" and splits it into bits:
98/// "--" - type, "name" - name, must be representable as utf8, "=" - optional, "value" - flag
99///
100/// dashes and equals sign are low codepoint values and - can look for them literally in a string.
101/// This probably means not supporting dashes with diacritics, but that's okay
102///
103/// name must be valid utf8 after conversion and must not include `=`
104///
105/// argument is optional and can be non valid utf8.
106///
107/// The idea is to split the [`OsString`] into opaque parts by looking only at the parts simple parts
108/// and let stdlib to handle the decoding of those parts.
109///
110/// performance wise this (at least on unix) works some small number percentage slower than the
111/// previous version
112///
113///
114/// Notation -fbar is ambigous and could mean either `-f -b -a -r` or `-f=bar`, resolve it into
115/// [`Arg::Ambiguity`] and let subparser disambiguate it later depending on available short flag and
116/// arguments
117pub(crate) fn split_os_argument(input: &std::ffi::OsStr) -> Option<(ArgType, String, Option<Arg>)> {
118    #[cfg(any(unix, windows))]
119    {
120        // OsString are sequences of smaller smaller elements - bytes in unix and
121        // possibly invalid utf16 items on windows
122        #[cfg(unix)]
123        type Elt = u8;
124        #[cfg(windows)]
125        type Elt = u16;
126
127        // reuse allocation on unix, don't reuse allocations on windows
128        // either case - pack a vector of elements back into OsString
129        fn os_from_vec(vec: Vec<Elt>) -> OsString {
130            #[cfg(unix)]
131            {
132                <OsString as std::os::unix::ffi::OsStringExt>::from_vec(vec)
133            }
134            #[cfg(windows)]
135            {
136                <OsString as std::os::windows::ffi::OsStringExt>::from_wide(&vec)
137            }
138        }
139
140        // try to decode elements into a String
141        fn str_from_vec(vec: Vec<Elt>) -> Option<String> {
142            Some(os_from_vec(vec).to_str()?.to_owned())
143        }
144
145        // but in either case dashes and equals are just literal values just with different width
146        const DASH: Elt = b'-' as Elt;
147        const EQUALS: Elt = b'=' as Elt;
148
149        // preallocate something to store the name. oversized but avoids extra allocations/copying
150        let mut name = Vec::with_capacity(input.len());
151
152        let mut items;
153        #[cfg(unix)]
154        {
155            items = std::os::unix::ffi::OsStrExt::as_bytes(input)
156                .iter()
157                .copied();
158        }
159        #[cfg(windows)]
160        {
161            items = std::os::windows::ffi::OsStrExt::encode_wide(input);
162        }
163
164        // first item must be dash, otherwise it's positional or a flag value
165        if items.next()? != DASH {
166            return None;
167        }
168
169        // second item may or may not be, but should be present
170        let ty;
171        match items.next()? {
172            DASH => ty = ArgType::Long,
173            val => {
174                ty = ArgType::Short;
175                name.push(val);
176            }
177        }
178
179        // keep collecting until = or the end of the input
180        loop {
181            match items.next() {
182                Some(EQUALS) => {
183                    if ty == ArgType::Short && name.len() > 1 {
184                        let mut body = name.drain(1..).collect::<Vec<_>>();
185                        body.push(EQUALS);
186                        body.extend(items);
187                        name.truncate(1);
188                        let os = Arg::ArgWord(os_from_vec(body));
189                        return Some((ty, str_from_vec(name)?, Some(os)));
190                    }
191                    break;
192                }
193                Some(val) => name.push(val),
194                None => {
195                    if name.is_empty() {
196                        return None;
197                    }
198                    return Some((ty, str_from_vec(name)?, None));
199                }
200            }
201        }
202
203        let name = str_from_vec(name)?;
204        let word = {
205            let os = os_from_vec(items.collect());
206            Arg::ArgWord(os)
207        };
208        Some((ty, name, Some(word)))
209    }
210    #[cfg(not(any(unix, windows)))]
211    {
212        split_os_argument_fallback(input)
213    }
214}
215
/// Similar to [`split_os_argument`] but only works for utf8 values, used as a fallback
/// function on non windows/unix OSes
///
/// Returns `None` when the input is not valid utf8, doesn't start with a dash or is exactly
/// `-` or `--`. Otherwise returns the type of the item, its name and an optional value
/// wrapped in [`Arg::ArgWord`].
///
/// For a short item with `=` after more than one character (`-abc=d`) the name is the
/// first character only and the value is everything that follows, `=` included (`bc=d`).
/// The split is made on a character boundary, so names starting with a multi-byte
/// character (`-ö=val`) are handled as well.
#[cfg(any(all(not(windows), not(unix)), test))]
pub(crate) fn split_os_argument_fallback(
    input: &std::ffi::OsStr,
) -> Option<(ArgType, String, Option<Arg>)> {
    // fallback supports only valid utf8 os strings, matches old behavior
    let string = input.to_str()?;

    let mut chars = string.chars();
    let mut name = String::with_capacity(string.len());

    // first character must be dash, otherwise it's positional or a flag value
    if chars.next()? != '-' {
        return None;
    }

    // second character may or may not be a dash, but should be present
    let ty = match chars.next()? {
        '-' => ArgType::Long,
        val => {
            name.push(val);
            ArgType::Short
        }
    };

    // collect the argument's name up to '=' or until the end
    // if it's a flag
    loop {
        match chars.next() {
            Some('=') => {
                if ty == ArgType::Short {
                    // `String::len` counts bytes, so compare against the byte width of
                    // the first character: splitting at byte 1 would panic when the
                    // name starts with a multi-byte character
                    let first = name.chars().next().map_or(0, char::len_utf8);
                    if name.len() > first {
                        // `-abc=d`: name is `a`, value is `bc=d`
                        let mut body = name.split_off(first);
                        body.push('=');
                        body.extend(chars);
                        let os = Arg::ArgWord(OsString::from(body));
                        return Some((ty, name, Some(os)));
                    }
                }
                break;
            }

            Some(val) => name.push(val),
            None => {
                // nothing but dashes
                if name.is_empty() {
                    return None;
                }
                return Some((ty, name, None));
            }
        }
    }

    // `=` was found: the value is whatever is left in the input
    let word = Arg::ArgWord(OsString::from(chars.as_str()));
    Some((ty, name, Some(word)))
}