bpaf/arg.rs
1use std::ffi::{OsStr, OsString};
2
/// A single command line item after preprocessing
///
/// The [`OsString`] carried by `Short`/`Long` is the original command line item, it is kept
/// so error messages can refer to it
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) enum Arg {
    /// Short flag: `-f`
    ///
    /// The `bool` indicates whether the item that follows is also a part of this `Short`
    // NOTE(review): the original comment was cut off mid-sentence ("(created"), confirm the
    // exact meaning of the `bool` against the code that constructs this variant
    Short(char, bool, OsString),

    /// Long flag: `--flag`
    ///
    /// The `bool` tells whether the item looked like `--key=val` or not
    Long(String, bool, OsString),

    /// The "val" part of `--key=val`, `-k=val` or `-kval`
    ArgWord(OsString),

    /// A separate word: can be a command, a positional item or a separate argument to a flag
    ///
    /// Can start with `-` or `--`, doesn't have to be valid utf8
    ///
    /// `hello`
    Word(OsString),

    /// A separate word that goes after `--`, strictly positional
    ///
    /// Can start with `-` or `--`, doesn't have to be valid utf8
    PosWord(OsString),
}
32
33impl Arg {
34 pub(crate) fn os_str(&self) -> &OsStr {
35 match self {
36 Arg::Short(_, _, s)
37 | Arg::Long(_, _, s)
38 | Arg::ArgWord(s)
39 | Arg::Word(s)
40 | Arg::PosWord(s) => s.as_ref(),
41 }
42 }
43
44 pub(crate) fn match_short(&self, val: char) -> bool {
45 match self {
46 Arg::Short(s, _, _) => *s == val,
47 Arg::ArgWord(_) | Arg::Long(_, _, _) | Arg::Word(_) | Arg::PosWord(_) => false,
48 }
49 }
50
51 pub(crate) fn match_long(&self, val: &str) -> bool {
52 match self {
53 Arg::Long(s, _, _) => *s == val,
54 Arg::Short(_, _, _) | Arg::ArgWord(_) | Arg::Word(_) | Arg::PosWord(_) => false,
55 }
56 }
57}
58
59// short flag disambiguations:
60//
61// Short flags | short arg
62// No | No | no problem
63// Yes | No | use flag
64// No | Yes | use arg
65// Yes | Yes | ask user?
66//
67// -a - just a regular short flag: "-a"
68// -abc - assuming there are short flags a, b and c: "-a -b -c", assuming utf8 values AND there's no argument -a
69// -abc - assuming there's no -a -b -c: "-a bc"
70// -abc - assuming both short a b c AND there's argument -a - need to disambiguate on a context level
71//
// 1. parse argument into ambiguous representation that can store both short flags and argument
// 2. collect short flag/arg when entering the subparser
74// 3. when reaching ambi
75//
76
77impl std::fmt::Display for Arg {
78 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79 match self {
80 Arg::Short(s, _, _) => write!(f, "-{}", s),
81 Arg::Long(l, _, _) => write!(f, "--{}", l),
82 Arg::ArgWord(w) | Arg::Word(w) | Arg::PosWord(w) => {
83 write!(f, "{}", w.to_string_lossy())
84 }
85 }
86 }
87}
88
/// Kind of a named argument as told apart by its dash prefix
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum ArgType {
    /// item starts with a single dash: `-f`
    Short,
    /// item starts with two dashes: `--flag`
    Long,
}
94
95/// split [`OsString`] into argument specific bits
96///
97/// takes a possibly non-utf8 string looking like "--name=value" and splits it into bits:
98/// "--" - type, "name" - name, must be representable as utf8, "=" - optional, "value" - flag
99///
100/// dashes and equals sign are low codepoint values and - can look for them literally in a string.
101/// This probably means not supporting dashes with diacritics, but that's okay
102///
103/// name must be valid utf8 after conversion and must not include `=`
104///
105/// argument is optional and can be non valid utf8.
106///
107/// The idea is to split the [`OsString`] into opaque parts by looking only at the parts simple parts
108/// and let stdlib to handle the decoding of those parts.
109///
110/// performance wise this (at least on unix) works some small number percentage slower than the
111/// previous version
112///
113///
114/// Notation -fbar is ambigous and could mean either `-f -b -a -r` or `-f=bar`, resolve it into
115/// [`Arg::Ambiguity`] and let subparser disambiguate it later depending on available short flag and
116/// arguments
117pub(crate) fn split_os_argument(input: &std::ffi::OsStr) -> Option<(ArgType, String, Option<Arg>)> {
118 #[cfg(any(unix, windows))]
119 {
120 // OsString are sequences of smaller smaller elements - bytes in unix and
121 // possibly invalid utf16 items on windows
122 #[cfg(unix)]
123 type Elt = u8;
124 #[cfg(windows)]
125 type Elt = u16;
126
127 // reuse allocation on unix, don't reuse allocations on windows
128 // either case - pack a vector of elements back into OsString
129 fn os_from_vec(vec: Vec<Elt>) -> OsString {
130 #[cfg(unix)]
131 {
132 <OsString as std::os::unix::ffi::OsStringExt>::from_vec(vec)
133 }
134 #[cfg(windows)]
135 {
136 <OsString as std::os::windows::ffi::OsStringExt>::from_wide(&vec)
137 }
138 }
139
140 // try to decode elements into a String
141 fn str_from_vec(vec: Vec<Elt>) -> Option<String> {
142 Some(os_from_vec(vec).to_str()?.to_owned())
143 }
144
145 // but in either case dashes and equals are just literal values just with different width
146 const DASH: Elt = b'-' as Elt;
147 const EQUALS: Elt = b'=' as Elt;
148
149 // preallocate something to store the name. oversized but avoids extra allocations/copying
150 let mut name = Vec::with_capacity(input.len());
151
152 let mut items;
153 #[cfg(unix)]
154 {
155 items = std::os::unix::ffi::OsStrExt::as_bytes(input)
156 .iter()
157 .copied();
158 }
159 #[cfg(windows)]
160 {
161 items = std::os::windows::ffi::OsStrExt::encode_wide(input);
162 }
163
164 // first item must be dash, otherwise it's positional or a flag value
165 if items.next()? != DASH {
166 return None;
167 }
168
169 // second item may or may not be, but should be present
170 let ty;
171 match items.next()? {
172 DASH => ty = ArgType::Long,
173 val => {
174 ty = ArgType::Short;
175 name.push(val);
176 }
177 }
178
179 // keep collecting until = or the end of the input
180 loop {
181 match items.next() {
182 Some(EQUALS) => {
183 if ty == ArgType::Short && name.len() > 1 {
184 let mut body = name.drain(1..).collect::<Vec<_>>();
185 body.push(EQUALS);
186 body.extend(items);
187 name.truncate(1);
188 let os = Arg::ArgWord(os_from_vec(body));
189 return Some((ty, str_from_vec(name)?, Some(os)));
190 }
191 break;
192 }
193 Some(val) => name.push(val),
194 None => {
195 if name.is_empty() {
196 return None;
197 }
198 return Some((ty, str_from_vec(name)?, None));
199 }
200 }
201 }
202
203 let name = str_from_vec(name)?;
204 let word = {
205 let os = os_from_vec(items.collect());
206 Arg::ArgWord(os)
207 };
208 Some((ty, name, Some(word)))
209 }
210 #[cfg(not(any(unix, windows)))]
211 {
212 split_os_argument_fallback(input)
213 }
214}
215
/// Similar to [`split_os_argument`] but only works for utf8 values, used as a fallback function
/// on non windows/unix OSes
///
/// # Returns
///
/// - `None` if the input is not valid utf8, does not start with a dash or is a lone `-` or `--`
/// - `Some((ty, name, None))` if there is no `=`, for short items the name holds everything
///   after the dash
/// - `Some((ty, name, Some(Arg::ArgWord(value))))` if there is a `=`. For short items only the
///   first character is the name, anything between it and the `=` stays in the value:
///   `-abc=d` gives `"a"` and `"bc=d"`
#[cfg(any(all(not(windows), not(unix)), test))]
pub(crate) fn split_os_argument_fallback(
    input: &std::ffi::OsStr,
) -> Option<(ArgType, String, Option<Arg>)> {
    // fallback supports only valid utf8 os strings, matches old behavior
    let string = input.to_str()?;

    let mut chars = string.chars();
    let mut name = String::with_capacity(string.len());

    // first character must be dash, otherwise it's positional or a flag value
    if chars.next()? != '-' {
        return None;
    }

    // second character may or may not be
    let ty = match chars.next()? {
        '-' => ArgType::Long,
        val => {
            name.push(val);
            ArgType::Short
        }
    };

    // collect the argument's name up to '=' or until the end
    // if it's a flag
    loop {
        match chars.next() {
            Some('=') => {
                // byte width of the first character: a non ASCII short flag is wider than
                // one byte and splitting the name inside of a character would panic
                let head = name.chars().next().map_or(0, char::len_utf8);
                if ty == ArgType::Short && name.len() > head {
                    // -abc=d: only the first character is the name, the rest including
                    // the equals sign belongs to the value
                    let mut body = name.split_off(head);
                    body.push('=');
                    body.push_str(chars.as_str());
                    let os = Arg::ArgWord(OsString::from(body));
                    return Some((ty, name, Some(os)));
                }
                break;
            }

            Some(val) => name.push(val),
            None => {
                // nothing but dashes
                if name.is_empty() {
                    return None;
                }
                return Some((ty, name, None));
            }
        }
    }

    // got here by hitting `=` right after the name, whatever is left is the value
    Some((ty, name, Some(Arg::ArgWord(OsString::from(chars.as_str())))))
}