gaol/platform/linux/
seccomp.rs

1// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! `seccomp-bpf` support on recent Linux kernels.
12//!
13//! This works in tandem with `namespace` in order to implement sandbox profiles. It is generally
14//! the weaker of the two approaches, because BPF is limited, but it's useful for reducing kernel
15//! attack surface area and implementing coarse-grained policies.
16
17#![allow(non_upper_case_globals, unused_imports)]
18
19use profile::{Operation, Profile};
20
21use libc::{self, CLONE_CHILD_CLEARTID, CLONE_FILES, CLONE_FS,
22           CLONE_PARENT_SETTID, CLONE_SETTLS, CLONE_SIGHAND, CLONE_SYSVSEM,
23           CLONE_THREAD, CLONE_VM};
24use libc::{AF_INET, AF_INET6, AF_UNIX, AF_NETLINK};
25use libc::{c_char, c_int, c_ulong, c_ushort, c_void};
26use libc::{O_NONBLOCK, O_RDONLY, O_NOCTTY, O_CLOEXEC, FIONREAD, FIOCLEX};
27use libc::{MADV_NORMAL, MADV_RANDOM, MADV_SEQUENTIAL, MADV_WILLNEED, MADV_DONTNEED};
28use std::ffi::CString;
29use std::mem;
30
// Exactly one `ARCH_NR` definition below is compiled in, chosen by the build target. It is the
// value `VALIDATE_ARCHITECTURE_1` compares the loaded architecture word against.

/// The architecture number for x86.
#[cfg(target_arch="x86")]
const ARCH_NR: u32 = AUDIT_ARCH_X86;
/// The architecture number for x86-64.
#[cfg(target_arch="x86_64")]
const ARCH_NR: u32 = AUDIT_ARCH_X86_64;
/// The architecture number for ARM.
#[cfg(target_arch="arm")]
const ARCH_NR: u32 = AUDIT_ARCH_ARM;
/// The architecture number for ARM 64-bit.
#[cfg(target_arch="aarch64")]
const ARCH_NR: u32 = AUDIT_ARCH_AARCH64;
/// The architecture number for ppc.
#[cfg(target_arch="powerpc")]
const ARCH_NR: u32 = AUDIT_ARCH_PPC;
/// The architecture number for ppc64 (big-endian targets).
#[cfg(all(target_arch="powerpc64", target_endian="big"))]
const ARCH_NR: u32 = AUDIT_ARCH_PPC64;
/// The architecture number for ppc64le (little-endian targets).
#[cfg(all(target_arch="powerpc64", target_endian="little"))]
const ARCH_NR: u32 = AUDIT_ARCH_PPC64LE;
49
// Filter verdicts, used as the `k` operand of a `RET` instruction.

/// Verdict that terminates the offending task.
const SECCOMP_RET_KILL: u32 = 0;
/// Verdict that lets the syscall proceed.
const SECCOMP_RET_ALLOW: u32 = 0x7fff_0000;

// Instruction classes. A `sock_filter::code` value is built by adding one class to the
// size/mode/condition/source bits defined below (e.g. `LD + W + ABS`, `JMP + JEQ + K`).

/// Instruction class: load into the accumulator.
const LD: u16 = 0x00;
/// Instruction class: (conditional) jump.
const JMP: u16 = 0x05;
/// Instruction class: return a verdict.
const RET: u16 = 0x06;

/// Load size: one 32-bit word.
const W: u16 = 0;
/// Load mode: absolute offset into the `seccomp_data` record.
const ABS: u16 = 0x20;

/// Jump condition: accumulator equals the operand.
const JEQ: u16 = 0x10;
/// Jump condition: accumulator has at least one bit of the operand set.
const JSET: u16 = 0x40;

/// Operand source: the immediate `k` field of the instruction.
const K: u16 = 0x00;

// Byte offsets into the `seccomp_data` record that the filter is evaluated against.

/// Offset of the syscall number.
const SYSCALL_NR_OFFSET: u32 = 0;
/// Offset of the architecture number.
const ARCH_NR_OFFSET: u32 = 4;

/// Offset, inside one 8-byte syscall-argument slot, of the 32-bit word holding the low half of
/// the argument.
///
/// The filter only ever loads 32-bit words (`LD + W + ABS`). On little-endian targets the low
/// half comes first in the slot; on big-endian targets it comes second.
#[cfg(target_endian = "little")]
const ARG_LOW_WORD_OFFSET: u32 = 0;
/// Offset, inside one 8-byte syscall-argument slot, of the 32-bit word holding the low half of
/// the argument.
///
/// The filter only ever loads 32-bit words (`LD + W + ABS`). On little-endian targets the low
/// half comes first in the slot; on big-endian targets it comes second.
#[cfg(target_endian = "big")]
const ARG_LOW_WORD_OFFSET: u32 = 4;

/// Offset of the low 32 bits of syscall argument 0.
const ARG_0_OFFSET: u32 = 16 + ARG_LOW_WORD_OFFSET;
/// Offset of the low 32 bits of syscall argument 1.
const ARG_1_OFFSET: u32 = 24 + ARG_LOW_WORD_OFFSET;
/// Offset of the low 32 bits of syscall argument 2.
const ARG_2_OFFSET: u32 = 32 + ARG_LOW_WORD_OFFSET;
70
/// The only protocol value accepted for `AF_NETLINK` sockets; `Filter::new` compares it against
/// argument 2 of `socket`.
const NETLINK_ROUTE: c_int = 0;

// Machine identifiers. Each one is OR-ed with the flag bits below to form the matching
// `AUDIT_ARCH_*` value. (Presumably the ELF `e_machine` numbers -- confirm against <elf.h>.)
const EM_386: u32 = 3;
const EM_PPC: u32 = 20;
const EM_PPC64: u32 = 21;
const EM_ARM: u32 = 40;
const EM_X86_64: u32 = 62;
const EM_AARCH64: u32 = 183;

/// A flag set in the architecture number for all 64-bit architectures.
const __AUDIT_ARCH_64BIT: u32 = 0x8000_0000;
/// A flag set in the architecture number for all little-endian architectures.
const __AUDIT_ARCH_LE: u32 = 0x4000_0000;
/// The architecture number for x86.
const AUDIT_ARCH_X86: u32 = EM_386 | __AUDIT_ARCH_LE;
/// The architecture number for x86-64.
const AUDIT_ARCH_X86_64: u32 = EM_X86_64 | __AUDIT_ARCH_64BIT | __AUDIT_ARCH_LE;
/// The architecture number for ARM.
const AUDIT_ARCH_ARM: u32 = EM_ARM | __AUDIT_ARCH_LE;
/// The architecture number for ARM 64-bit.
const AUDIT_ARCH_AARCH64: u32 = EM_AARCH64 | __AUDIT_ARCH_64BIT | __AUDIT_ARCH_LE;
/// The architecture number for ppc.
const AUDIT_ARCH_PPC: u32 = EM_PPC;
/// The architecture number for ppc64.
const AUDIT_ARCH_PPC64: u32 = EM_PPC64 | __AUDIT_ARCH_64BIT;
/// The architecture number for ppc64le.
const AUDIT_ARCH_PPC64LE: u32 = EM_PPC64 | __AUDIT_ARCH_64BIT | __AUDIT_ARCH_LE;

/// `prctl` option used by `Filter::activate` to install the filter program.
const PR_SET_SECCOMP: c_int = 22;
/// `prctl` option that `Filter::activate` sets to 1 before installing the filter.
const PR_SET_NO_NEW_PRIVS: c_int = 38;

/// Mode argument passed together with `PR_SET_SECCOMP`.
const SECCOMP_MODE_FILTER: c_ulong = 2;
103
/// Instructions every filter starts with: load the architecture word, compare it with
/// `ARCH_NR`, and kill the process on a mismatch.
static FILTER_PROLOGUE: [sock_filter; 3] = [
    VALIDATE_ARCHITECTURE_0,
    VALIDATE_ARCHITECTURE_1,
    VALIDATE_ARCHITECTURE_2,
];

// A most untimely end...
// This is the last instruction of every filter: whatever was not explicitly allowed before
// reaching it is killed.
static FILTER_EPILOGUE: [sock_filter; 1] = [
    KILL_PROCESS,
];
114
/// Syscalls that are always allowed.
///
/// `Filter::new` emits an unconditional allow rule for each entry, whatever the profile says.
pub static ALLOWED_SYSCALLS: [u32; 21] = [
    libc::SYS_brk as u32,
    libc::SYS_close as u32,
    libc::SYS_exit as u32,
    libc::SYS_exit_group as u32,
    libc::SYS_futex as u32,
    libc::SYS_getrandom as u32,
    libc::SYS_getuid as u32,
    libc::SYS_mmap as u32,
    libc::SYS_mprotect as u32,
    libc::SYS_munmap as u32,
    libc::SYS_poll as u32,
    libc::SYS_read as u32,
    libc::SYS_recvfrom as u32,
    libc::SYS_recvmsg as u32,
    libc::SYS_rt_sigreturn as u32,
    libc::SYS_sched_getaffinity as u32,
    libc::SYS_sendmmsg as u32,
    libc::SYS_sendto as u32,
    libc::SYS_set_robust_list as u32,
    libc::SYS_sigaltstack as u32,
    libc::SYS_write as u32,
];

/// Syscalls additionally allowed when the profile contains a `FileReadAll` or
/// `FileReadMetadata` operation.
static ALLOWED_SYSCALLS_FOR_FILE_READ: [u32; 5] = [
    libc::SYS_access as u32,
    libc::SYS_fstat as u32,
    libc::SYS_lseek as u32,
    libc::SYS_readlink as u32,
    libc::SYS_stat as u32,
];

/// Syscalls additionally allowed when the profile contains a `NetworkOutbound` operation.
static ALLOWED_SYSCALLS_FOR_NETWORK_OUTBOUND: [u32; 3] = [
    libc::SYS_bind as u32,
    libc::SYS_connect as u32,
    libc::SYS_getsockname as u32,
];
153
/// Instruction that returns the `SECCOMP_RET_ALLOW` verdict.
const ALLOW_SYSCALL: sock_filter = sock_filter {
    code: RET + K,
    k: SECCOMP_RET_ALLOW,
    jt: 0,
    jf: 0,
};

/// Instruction that returns the `SECCOMP_RET_KILL` verdict.
const KILL_PROCESS: sock_filter = sock_filter {
    code: RET + K,
    k: SECCOMP_RET_KILL,
    jt: 0,
    jf: 0,
};

/// Instruction that loads the word at `SYSCALL_NR_OFFSET` (the syscall number).
const EXAMINE_SYSCALL: sock_filter = sock_filter {
    code: LD + W + ABS,
    k: SYSCALL_NR_OFFSET,
    jt: 0,
    jf: 0,
};

/// Instruction that loads the word at `ARG_0_OFFSET` (syscall argument 0).
const EXAMINE_ARG_0: sock_filter = sock_filter {
    code: LD + W + ABS,
    k: ARG_0_OFFSET,
    jt: 0,
    jf: 0,
};

/// Instruction that loads the word at `ARG_1_OFFSET` (syscall argument 1).
const EXAMINE_ARG_1: sock_filter = sock_filter {
    code: LD + W + ABS,
    k: ARG_1_OFFSET,
    jt: 0,
    jf: 0,
};

/// Instruction that loads the word at `ARG_2_OFFSET` (syscall argument 2).
const EXAMINE_ARG_2: sock_filter = sock_filter {
    code: LD + W + ABS,
    k: ARG_2_OFFSET,
    jt: 0,
    jf: 0,
};

/// First prologue instruction: load the word at `ARCH_NR_OFFSET`.
const VALIDATE_ARCHITECTURE_0: sock_filter = sock_filter {
    code: LD + W + ABS,
    k: ARCH_NR_OFFSET,
    jt: 0,
    jf: 0,
};

/// Second prologue instruction: compare the loaded word with `ARCH_NR`.
///
/// On a match, `jt: 1` skips the kill instruction that follows; on a mismatch, `jf: 0` falls
/// through into it.
const VALIDATE_ARCHITECTURE_1: sock_filter = sock_filter {
    code: JMP + JEQ + K,
    k: ARCH_NR,
    jt: 1,
    jf: 0,
};

/// Third prologue instruction: kill the process (reached only on an architecture mismatch).
const VALIDATE_ARCHITECTURE_2: sock_filter = KILL_PROCESS;
211
/// A seccomp filter program built from a sandbox `Profile`.
///
/// Built with `Filter::new` and installed with `Filter::activate`.
pub struct Filter {
    /// The instructions, in execution order: prologue, allow rules, epilogue.
    program: Vec<sock_filter>,
}
215
216impl Filter {
217    pub fn new(profile: &Profile) -> Filter {
218        let mut filter = Filter {
219            program: FILTER_PROLOGUE.iter().map(|x| *x).collect(),
220        };
221        filter.allow_syscalls(&ALLOWED_SYSCALLS);
222
223        if profile.allowed_operations().iter().any(|operation| {
224            match *operation {
225                Operation::FileReadAll(_) | Operation::FileReadMetadata(_) => true,
226                _ => false,
227            }
228        }) {
229            filter.allow_syscalls(&ALLOWED_SYSCALLS_FOR_FILE_READ);
230
231            // Only allow file reading.
232            filter.if_syscall_is(libc::SYS_open as u32, |filter| {
233                filter.if_arg1_hasnt_set(!(O_RDONLY | O_CLOEXEC | O_NOCTTY | O_NONBLOCK) as u32,
234                                         |filter| filter.allow_this_syscall())
235            });
236
237            // Only allow the `FIONREAD` or `FIOCLEX` `ioctl`s to be performed.
238            filter.if_syscall_is(libc::SYS_ioctl as u32, |filter| {
239                filter.if_arg1_is(FIONREAD as u32, |filter| filter.allow_this_syscall());
240                filter.if_arg1_is(FIOCLEX as u32, |filter| filter.allow_this_syscall())
241            })
242        }
243
244        if profile.allowed_operations().iter().any(|operation| {
245            match *operation {
246                Operation::NetworkOutbound(_) => true,
247                _ => false,
248            }
249        }) {
250            filter.allow_syscalls(&ALLOWED_SYSCALLS_FOR_NETWORK_OUTBOUND);
251
252            // Only allow Unix, IPv4, IPv6, and netlink route sockets to be created.
253            filter.if_syscall_is(libc::SYS_socket as u32, |filter| {
254                filter.if_arg0_is(AF_UNIX as u32, |filter| filter.allow_this_syscall());
255                filter.if_arg0_is(AF_INET as u32, |filter| filter.allow_this_syscall());
256                filter.if_arg0_is(AF_INET6 as u32, |filter| filter.allow_this_syscall());
257                filter.if_arg0_is(AF_NETLINK as u32, |filter| {
258                    filter.if_arg2_is(NETLINK_ROUTE as u32, |filter| filter.allow_this_syscall())
259                })
260            })
261        }
262
263        // Only allow normal threads to be created.
264        filter.if_syscall_is(libc::SYS_clone as u32, |filter| {
265            filter.if_arg0_is((CLONE_VM |
266                               CLONE_FS |
267                               CLONE_FILES |
268                               CLONE_SIGHAND |
269                               CLONE_THREAD |
270                               CLONE_SYSVSEM |
271                               CLONE_SETTLS |
272                               CLONE_PARENT_SETTID |
273                               CLONE_CHILD_CLEARTID) as u32,
274                              |filter| filter.allow_this_syscall())
275        });
276
277        // Only allow the POSIX values for `madvise`.
278        filter.if_syscall_is(libc::SYS_madvise as u32, |filter| {
279            for mode in [
280                MADV_NORMAL,
281                MADV_RANDOM,
282                MADV_SEQUENTIAL,
283                MADV_WILLNEED,
284                MADV_DONTNEED
285            ].iter() {
286                filter.if_arg2_is(*mode as u32, |filter| filter.allow_this_syscall())
287            }
288        });
289
290        filter.program.extend_from_slice(&FILTER_EPILOGUE);
291        filter
292    }
293
294    /// Dumps this filter to a temporary file.
295    #[cfg(dump_bpf_sockets)]
296    pub fn dump(&self) {
297        let path = CString::from_slice(b"/tmp/gaol-bpf.XXXXXX");
298        let mut path = path.as_bytes_with_nul().to_vec();
299        let fd = unsafe {
300            libc::mkstemp(path.as_mut_ptr() as *mut c_char)
301        };
302        let nbytes = self.program.len() * mem::size_of::<sock_filter>();
303        unsafe {
304            assert!(libc::write(fd, self.program.as_ptr() as *const c_void, nbytes as u64) ==
305                    nbytes as i64);
306            libc::close(fd);
307        }
308    }
309
310    #[cfg(not(dump_bpf_sockets))]
311    pub fn dump(&self) {}
312
313    /// Activates this filter, applying all of its restrictions forevermore. This can only be done
314    /// once.
315    pub fn activate(&self) -> Result<(),c_int> {
316        unsafe {
317            let result = libc::prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
318            if result != 0 {
319                return Err(result)
320            }
321
322            let program = sock_fprog {
323                len: self.program.len() as c_ushort,
324                filter: self.program.as_ptr(),
325            };
326            let result = libc::prctl(PR_SET_SECCOMP,
327                                     SECCOMP_MODE_FILTER,
328                                     &program as *const sock_fprog as usize as c_ulong,
329                                     !0,
330                                     0);
331            if result == 0 {
332                Ok(())
333            } else {
334                Err(result)
335            }
336        }
337    }
338
339    fn allow_this_syscall(&mut self) {
340        self.program.push(ALLOW_SYSCALL)
341    }
342
343    fn allow_syscalls(&mut self, syscalls: &[u32]) {
344        for &syscall in syscalls.iter() {
345            self.if_syscall_is(syscall, |filter| filter.allow_this_syscall())
346        }
347    }
348
349    fn if_syscall_is<F>(&mut self, number: u32, then: F) where F: FnMut(&mut Filter) {
350        self.program.push(EXAMINE_SYSCALL);
351        self.if_k_is(number, then)
352    }
353
354    fn if_arg0_is<F>(&mut self, value: u32, then: F) where F: FnMut(&mut Filter) {
355        self.program.push(EXAMINE_ARG_0);
356        self.if_k_is(value, then)
357    }
358
359    fn if_arg1_is<F>(&mut self, value: u32, then: F) where F: FnMut(&mut Filter) {
360        self.program.push(EXAMINE_ARG_1);
361        self.if_k_is(value, then)
362    }
363
364    fn if_arg1_hasnt_set<F>(&mut self, value: u32, then: F) where F: FnMut(&mut Filter) {
365        self.program.push(EXAMINE_ARG_1);
366        self.if_k_hasnt_set(value, then)
367    }
368
369    fn if_arg2_is<F>(&mut self, value: u32, then: F) where F: FnMut(&mut Filter) {
370        self.program.push(EXAMINE_ARG_2);
371        self.if_k_is(value, then)
372    }
373
374    fn if_k_is<F>(&mut self, value: u32, mut then: F) where F: FnMut(&mut Filter) {
375        let index = self.program.len();
376        self.program.push(sock_filter {
377            code: JMP + JEQ + K,
378            k: value,
379            jt: 0,
380            jf: 0,
381        });
382        then(self);
383        self.program[index].jf = (self.program.len() - index - 1) as u8;
384    }
385
386    fn if_k_hasnt_set<F>(&mut self, value: u32, mut then: F) where F: FnMut(&mut Filter) {
387        let index = self.program.len();
388        self.program.push(sock_filter {
389            code: JMP + JSET + K,
390            k: value,
391            jt: 0,
392            jf: 0,
393        });
394        then(self);
395        self.program[index].jt = (self.program.len() - index - 1) as u8;
396    }
397}
398
/// One instruction of the filter program.
///
/// `#[repr(C)]` because `Filter::activate` hands an array of these to `prctl` by raw pointer.
#[repr(C)]
#[derive(Copy, Clone)]
struct sock_filter {
    /// Operation code, built by adding the instruction-class, size, mode and condition constants.
    code: u16,
    /// For jumps: number of following instructions to skip when the condition holds.
    jt: u8,
    /// For jumps: number of following instructions to skip when the condition does not hold.
    jf: u8,
    /// Immediate operand: a load offset, a comparison value, or a return verdict.
    k: u32,
}
407
/// Descriptor of a whole filter program: an instruction count plus a pointer to the
/// instructions.
///
/// `#[repr(C)]` because `Filter::activate` passes its address to `prctl`.
#[repr(C)]
#[derive(Copy, Clone)]
struct sock_fprog {
    /// Number of instructions `filter` points to.
    len: c_ushort,
    /// Pointer to the first instruction.
    filter: *const sock_filter,
}