bpaf/arg.rs
1use std::ffi::{OsStr, OsString};
2
/// Preprocessed command line argument
///
/// [`OsString`] in Short/Long corresponds to the original command line item and is used for
/// error messages
#[derive(Debug, Clone, Eq, PartialEq)]
#[allow(clippy::enum_variant_names)]
pub(crate) enum Arg {
    /// short flag: `-f`
    ///
    /// bool indicates if the following item is also part of this Short.
    ///
    /// NOTE(review): the original sentence was cut off after "(created"; presumably it refers
    /// to an [`Arg::ArgWord`] created from the same command line item, such as `-f=val` -
    /// confirm against the code that constructs this variant.
    Short(char, bool, OsString),

    /// long flag: `--flag`
    ///
    /// bool tells if it looks like `--key=val` or not
    Long(String, bool, OsString),

    /// "val" part of `--key=val`, `-k=val`, `-kval`
    ArgWord(OsString),

    /// separate word that can be command, positional or a separate argument to a flag
    ///
    /// Can start with `-` or `--`, doesn't have to be valid utf8
    ///
    /// `hello`
    Word(OsString),

    /// separate word that goes after `--`, strictly positional
    ///
    /// Can start with `-` or `--`, doesn't have to be valid utf8
    PosWord(OsString),
}
33
34impl Arg {
35 pub(crate) fn os_str(&self) -> &OsStr {
36 match self {
37 Arg::Short(_, _, s)
38 | Arg::Long(_, _, s)
39 | Arg::ArgWord(s)
40 | Arg::Word(s)
41 | Arg::PosWord(s) => s.as_ref(),
42 }
43 }
44
45 pub(crate) fn match_short(&self, val: char) -> bool {
46 match self {
47 Arg::Short(s, _, _) => *s == val,
48 Arg::ArgWord(_) | Arg::Long(_, _, _) | Arg::Word(_) | Arg::PosWord(_) => false,
49 }
50 }
51
52 pub(crate) fn match_long(&self, val: &str) -> bool {
53 match self {
54 Arg::Long(s, _, _) => *s == val,
55 Arg::Short(_, _, _) | Arg::ArgWord(_) | Arg::Word(_) | Arg::PosWord(_) => false,
56 }
57 }
58}
59
// short flag disambiguations:
//
// Short flags | short arg
// No          | No        | no problem
// Yes         | No        | use flag
// No          | Yes       | use arg
// Yes         | Yes       | ask user?
//
// -a   - just a regular short flag: "-a"
// -abc - assuming there are short flags a, b and c: "-a -b -c", assuming utf8 values AND there's no argument -a
// -abc - assuming there's no -a -b -c: "-a bc"
// -abc - assuming both short a b c AND there's argument -a - need to disambiguate on a context level
//
// 1. parse argument into an ambiguous representation that can store both short flags and argument
// 2. collect short flag/arg when entering the subparser
// 3. when reaching an ambiguous item - resolve it according to the table above
//
77
78impl std::fmt::Display for Arg {
79 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
80 match self {
81 Arg::Short(s, _, _) => write!(f, "-{}", s),
82 Arg::Long(l, _, _) => write!(f, "--{}", l),
83 Arg::ArgWord(w) | Arg::Word(w) | Arg::PosWord(w) => {
84 write!(f, "{}", w.to_string_lossy())
85 }
86 }
87 }
88}
89
/// Kind of a named command line item, decided by the number of leading dashes
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub(crate) enum ArgType {
    /// item starts with a single dash: `-f`
    Short,
    /// item starts with two dashes: `--flag`
    Long,
}
95
96/// split [`OsString`] into argument specific bits
97///
98/// takes a possibly non-utf8 string looking like "--name=value" and splits it into bits:
99/// "--" - type, "name" - name, must be representable as utf8, "=" - optional, "value" - flag
100///
101/// dashes and equals sign are low codepoint values and - can look for them literally in a string.
102/// This probably means not supporting dashes with diacritics, but that's okay
103///
104/// name must be valid utf8 after conversion and must not include `=`
105///
106/// argument is optional and can be non valid utf8.
107///
108/// The idea is to split the [`OsString`] into opaque parts by looking only at the parts simple parts
109/// and let stdlib to handle the decoding of those parts.
110///
111/// performance wise this (at least on unix) works some small number percentage slower than the
112/// previous version
113///
114///
115/// Notation -fbar is ambigous and could mean either `-f -b -a -r` or `-f=bar`, resolve it into
116/// [`Arg::Ambiguity`] and let subparser disambiguate it later depending on available short flag and
117/// arguments
118pub(crate) fn split_os_argument(input: &std::ffi::OsStr) -> Option<(ArgType, String, Option<Arg>)> {
119 #[cfg(any(unix, windows))]
120 {
121 // OsString are sequences of smaller smaller elements - bytes in unix and
122 // possibly invalid utf16 items on windows
123 #[cfg(unix)]
124 type Elt = u8;
125 #[cfg(windows)]
126 type Elt = u16;
127
128 // reuse allocation on unix, don't reuse allocations on windows
129 // either case - pack a vector of elements back into OsString
130 fn os_from_vec(vec: Vec<Elt>) -> OsString {
131 #[cfg(unix)]
132 {
133 <OsString as std::os::unix::ffi::OsStringExt>::from_vec(vec)
134 }
135 #[cfg(windows)]
136 {
137 <OsString as std::os::windows::ffi::OsStringExt>::from_wide(&vec)
138 }
139 }
140
141 // try to decode elements into a String
142 fn str_from_vec(vec: Vec<Elt>) -> Option<String> {
143 Some(os_from_vec(vec).to_str()?.to_owned())
144 }
145
146 // but in either case dashes and equals are just literal values just with different width
147 const DASH: Elt = b'-' as Elt;
148 const EQUALS: Elt = b'=' as Elt;
149
150 // preallocate something to store the name. oversized but avoids extra allocations/copying
151 let mut name = Vec::with_capacity(input.len());
152
153 let mut items;
154 #[cfg(unix)]
155 {
156 items = std::os::unix::ffi::OsStrExt::as_bytes(input)
157 .iter()
158 .copied();
159 }
160 #[cfg(windows)]
161 {
162 items = std::os::windows::ffi::OsStrExt::encode_wide(input);
163 }
164
165 // first item must be dash, otherwise it's positional or a flag value
166 if items.next()? != DASH {
167 return None;
168 }
169
170 // second item may or may not be, but should be present
171 let ty;
172 match items.next()? {
173 DASH => ty = ArgType::Long,
174 val => {
175 ty = ArgType::Short;
176 name.push(val);
177 }
178 }
179
180 // keep collecting until = or the end of the input
181 loop {
182 match items.next() {
183 Some(EQUALS) => {
184 if ty == ArgType::Short && name.len() > 1 {
185 let mut body = name.drain(1..).collect::<Vec<_>>();
186 body.push(EQUALS);
187 body.extend(items);
188 name.truncate(1);
189 let os = Arg::ArgWord(os_from_vec(body));
190 return Some((ty, str_from_vec(name)?, Some(os)));
191 }
192 break;
193 }
194 Some(val) => name.push(val),
195 None => {
196 if name.is_empty() {
197 return None;
198 }
199 return Some((ty, str_from_vec(name)?, None));
200 }
201 }
202 }
203
204 let name = str_from_vec(name)?;
205 let word = {
206 let os = os_from_vec(items.collect());
207 Arg::ArgWord(os)
208 };
209 Some((ty, name, Some(word)))
210 }
211 #[cfg(not(any(unix, windows)))]
212 {
213 split_os_argument_fallback(input)
214 }
215}
216
/// Similar to [`split_os_argument`] but only works for utf8 values, used as a fallback function
/// on non windows/unix OSes
///
/// Returns `None` if the input is not valid utf8, is empty, does not start with a dash or is
/// exactly `-` or `--`. Otherwise returns the type, the name and an optional [`Arg::ArgWord`]
/// with the value that goes after `=`.
///
/// For short items followed by `=` only the first character is the name: `-fbar=baz` gives
/// name `f` and value `bar=baz`.
#[cfg(any(all(not(windows), not(unix)), test))]
pub(crate) fn split_os_argument_fallback(
    input: &std::ffi::OsStr,
) -> Option<(ArgType, String, Option<Arg>)> {
    // fallback supports only valid utf8 os strings, matches old behavior
    let string = input.to_str()?;

    let mut chars = string.chars();
    let mut name = String::with_capacity(string.len());

    // first character must be dash, otherwise it's positional or a flag value
    if chars.next()? != '-' {
        return None;
    }

    // second character may or may not be a dash, but should be present
    let ty = match chars.next()? {
        '-' => ArgType::Long,
        val => {
            // for short items second character is already a part of the name
            name.push(val);
            ArgType::Short
        }
    };

    // collect the argument's name up to '=' or until the end
    // if it's a flag
    loop {
        match chars.next() {
            Some('=') => {
                if ty == ArgType::Short {
                    // width of the first character in bytes: the name must be split on a
                    // character boundary, splitting after the first byte panics for names
                    // starting with a multi-byte character
                    let head = name.chars().next().map_or(0, char::len_utf8);
                    if name.len() > head {
                        let mut body = name.split_off(head);
                        body.push('=');
                        body.push_str(chars.as_str());
                        let word = Arg::ArgWord(OsString::from(body));
                        return Some((ty, name, Some(word)));
                    }
                }
                break;
            }

            Some(val) => name.push(val),
            None => {
                // nothing but dashes: `--`
                if name.is_empty() {
                    return None;
                }
                return Some((ty, name, None));
            }
        }
    }

    // `--name=value` or `-n=value`: whatever is left after `=` is the value
    let word = Arg::ArgWord(OsString::from(chars.as_str()));
    Some((ty, name, Some(word)))
}