aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2020-10-29 13:45:02 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2020-10-29 13:45:02 +1100
commit329f3452c781ee2361aa8016ea06e845a11cf299 (patch)
treeea39800671b57bd46b1fd221fc54399812b02dff
parent0753b9531ca88b88ffc99576250d4668c540dd02 (diff)
parent57a339117e52f06b821698a982b77ed6e9bc38ae (diff)
Merge remote-tracking branch 'seccomp/for-next/seccomp' into master
-rw-r--r--arch/x86/include/asm/seccomp.h20
-rw-r--r--kernel/seccomp.c233
-rw-r--r--tools/testing/selftests/seccomp/seccomp_benchmark.c151
-rw-r--r--tools/testing/selftests/seccomp/settings2
4 files changed, 382 insertions, 24 deletions
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 2bd1338de236..fef16e398161 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -16,6 +16,26 @@
#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
#endif
+#ifdef CONFIG_X86_64
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "x86_64"
+# ifdef CONFIG_COMPAT
+# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386
+# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls
+# define SECCOMP_ARCH_COMPAT_NAME "ia32"
+# endif
+/*
+ * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support
+ * caching them and they are treated as out of range syscalls, which will
+ * always pass through the BPF filter.
+ */
+#else /* !CONFIG_X86_64 */
+# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386
+# define SECCOMP_ARCH_NATIVE_NR NR_syscalls
+# define SECCOMP_ARCH_NATIVE_NAME "ia32"
+#endif
+
#include <asm-generic/seccomp.h>
#endif /* _ASM_X86_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 8ad7a293255a..d8cf468dbe1e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,6 +143,38 @@ struct notification {
struct list_head notifications;
};
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * struct action_cache - per-filter cache of seccomp actions per
+ * arch/syscall pair
+ *
+ * @allow_native: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * native architecture.
+ * @allow_compat: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * compat architecture.
+ */
+struct action_cache {
+ DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
+#ifdef SECCOMP_ARCH_COMPAT
+ DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+#else
+struct action_cache { };
+
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ return false;
+}
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* struct seccomp_filter - container for seccomp BPF programs
*
@@ -159,6 +191,7 @@ struct notification {
* this filter after reaching 0. The @users count is always smaller
* or equal to @refs. Hence, reaching 0 for @users does not mean
* the filter can be freed.
+ * @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
@@ -180,6 +213,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
+ struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
@@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
return 0;
}
+#ifdef SECCOMP_ARCH_NATIVE
+static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
+ size_t bitmap_size,
+ int syscall_nr)
+{
+ if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+ return false;
+ syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+ return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check_allow - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ int syscall_nr = sd->nr;
+ const struct action_cache *cache = &sfilter->cache;
+
+#ifndef SECCOMP_ARCH_COMPAT
+ /* A native-only architecture doesn't need to check sd->arch. */
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+#else
+ if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+ if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+ return seccomp_cache_check_allow_bitmap(cache->allow_compat,
+ SECCOMP_ARCH_COMPAT_NR,
+ syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ WARN_ON_ONCE(true);
+ return false;
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
@@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;
+ if (seccomp_cache_check_allow(f, sd))
+ return SECCOMP_RET_ALLOW;
+
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
@@ -544,7 +627,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
- const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+ const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+ true;
+#else
+ false;
+#endif
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -610,6 +698,148 @@ out:
return filter;
}
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * seccomp_is_const_allow - check if filter is constant allow with given data
+ * @fprog: The BPF programs
+ * @sd: The seccomp data to check against, only syscall number and arch
+ * number are considered constant.
+ */
+static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+ struct seccomp_data *sd)
+{
+ unsigned int reg_value = 0;
+ unsigned int pc;
+ bool op_res;
+
+ if (WARN_ON_ONCE(!fprog))
+ return false;
+
+ for (pc = 0; pc < fprog->len; pc++) {
+ struct sock_filter *insn = &fprog->filter[pc];
+ u16 code = insn->code;
+ u32 k = insn->k;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ switch (k) {
+ case offsetof(struct seccomp_data, nr):
+ reg_value = sd->nr;
+ break;
+ case offsetof(struct seccomp_data, arch):
+ reg_value = sd->arch;
+ break;
+ default:
+ /* can't optimize (non-constant value load) */
+ return false;
+ }
+ break;
+ case BPF_RET | BPF_K:
+ /* reached return with constant values only, check allow */
+ return k == SECCOMP_RET_ALLOW;
+ case BPF_JMP | BPF_JA:
+ pc += insn->k;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ switch (BPF_OP(code)) {
+ case BPF_JEQ:
+ op_res = reg_value == k;
+ break;
+ case BPF_JGE:
+ op_res = reg_value >= k;
+ break;
+ case BPF_JGT:
+ op_res = reg_value > k;
+ break;
+ case BPF_JSET:
+ op_res = !!(reg_value & k);
+ break;
+ default:
+ /* can't optimize (unknown jump) */
+ return false;
+ }
+
+ pc += op_res ? insn->jt : insn->jf;
+ break;
+ case BPF_ALU | BPF_AND | BPF_K:
+ reg_value &= k;
+ break;
+ default:
+ /* can't optimize (unknown insn) */
+ return false;
+ }
+ }
+
+ /* ran off the end of the filter?! */
+ WARN_ON(1);
+ return false;
+}
+
+static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
+ void *bitmap, const void *bitmap_prev,
+ size_t bitmap_size, int arch)
+{
+ struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+ struct seccomp_data sd;
+ int nr;
+
+ if (bitmap_prev) {
+ /* The new filter must be as restrictive as the last. */
+ bitmap_copy(bitmap, bitmap_prev, bitmap_size);
+ } else {
+ /* Before any filters, all syscalls are always allowed. */
+ bitmap_fill(bitmap, bitmap_size);
+ }
+
+ for (nr = 0; nr < bitmap_size; nr++) {
+ /* No bitmap change: not a cacheable action. */
+ if (!test_bit(nr, bitmap))
+ continue;
+
+ sd.nr = nr;
+ sd.arch = arch;
+
+ /* No bitmap change: continue to always allow. */
+ if (seccomp_is_const_allow(fprog, &sd))
+ continue;
+
+ /*
+ * Not a cacheable action: always run filters.
+ * atomic clear_bit() not needed, filter not visible yet.
+ */
+ __clear_bit(nr, bitmap);
+ }
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cachable syscalls
+ * @sfilter: The seccomp filter
+ *
+ * Returns 0 if successful or -errno if error occurred.
+ */
+static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+ struct action_cache *cache = &sfilter->cache;
+ const struct action_cache *cache_prev =
+ sfilter->prev ? &sfilter->prev->cache : NULL;
+
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
+ cache_prev ? cache_prev->allow_native : NULL,
+ SECCOMP_ARCH_NATIVE_NR,
+ SECCOMP_ARCH_NATIVE);
+
+#ifdef SECCOMP_ARCH_COMPAT
+ seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
+ cache_prev ? cache_prev->allow_compat : NULL,
+ SECCOMP_ARCH_COMPAT_NR,
+ SECCOMP_ARCH_COMPAT);
+#endif /* SECCOMP_ARCH_COMPAT */
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
@@ -659,6 +889,7 @@ static long seccomp_attach_filter(unsigned int flags,
* task reference.
*/
filter->prev = current->seccomp.filter;
+ seccomp_cache_prepare(filter);
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);
diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c
index 91f5a89cadac..fcc806585266 100644
--- a/tools/testing/selftests/seccomp/seccomp_benchmark.c
+++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c
@@ -4,12 +4,16 @@
*/
#define _GNU_SOURCE
#include <assert.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
+#include <sys/param.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
@@ -70,18 +74,74 @@ unsigned long long calibrate(void)
return samples * seconds;
}
+bool approx(int i_one, int i_two)
+{
+ double one = i_one, one_bump = one * 0.01;
+ double two = i_two, two_bump = two * 0.01;
+
+ one_bump = one + MAX(one_bump, 2.0);
+ two_bump = two + MAX(two_bump, 2.0);
+
+ /* Equal to, or within 1% or 2 digits */
+ if (one == two ||
+ (one > two && one <= two_bump) ||
+ (two > one && two <= one_bump))
+ return true;
+ return false;
+}
+
+bool le(int i_one, int i_two)
+{
+ if (i_one <= i_two)
+ return true;
+ return false;
+}
+
+long compare(const char *name_one, const char *name_eval, const char *name_two,
+ unsigned long long one, bool (*eval)(int, int), unsigned long long two)
+{
+ bool good;
+
+ printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two,
+ (long long)one, name_eval, (long long)two);
+ if (one > INT_MAX) {
+ printf("Miscalculation! Measurement went negative: %lld\n", (long long)one);
+ return 1;
+ }
+ if (two > INT_MAX) {
+ printf("Miscalculation! Measurement went negative: %lld\n", (long long)two);
+ return 1;
+ }
+
+ good = eval(one, two);
+ printf("%s\n", good ? "✔️" : "❌");
+
+ return good ? 0 : 1;
+}
+
int main(int argc, char *argv[])
{
+ struct sock_filter bitmap_filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog bitmap_prog = {
+ .len = (unsigned short)ARRAY_SIZE(bitmap_filter),
+ .filter = bitmap_filter,
+ };
struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
.len = (unsigned short)ARRAY_SIZE(filter),
.filter = filter,
};
- long ret;
- unsigned long long samples;
- unsigned long long native, filter1, filter2;
+
+ long ret, bits;
+ unsigned long long samples, calc;
+ unsigned long long native, filter1, filter2, bitmap1, bitmap2;
+ unsigned long long entry, per_filter1, per_filter2;
printf("Current BPF sysctl settings:\n");
system("sysctl net.core.bpf_jit_enable");
@@ -101,35 +161,82 @@ int main(int argc, char *argv[])
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
assert(ret == 0);
- /* One filter */
- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ /* One filter resulting in a bitmap */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
assert(ret == 0);
- filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
- printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1);
+ bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+ printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1);
+
+ /* Second filter resulting in a bitmap */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
+ assert(ret == 0);
- if (filter1 == native)
- printf("No overhead measured!? Try running again with more samples.\n");
+ bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+ printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2);
- /* Two filters */
+ /* Third filter, can no longer be converted to bitmap */
ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
assert(ret == 0);
- filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
- printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2);
-
- /* Calculations */
- printf("Estimated total seccomp overhead for 1 filter: %llu ns\n",
- filter1 - native);
+ filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+ printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1);
- printf("Estimated total seccomp overhead for 2 filters: %llu ns\n",
- filter2 - native);
+ /* Fourth filter, can not be converted to bitmap because of filter 3 */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog);
+ assert(ret == 0);
- printf("Estimated seccomp per-filter overhead: %llu ns\n",
- filter2 - filter1);
+ filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+ printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2);
+
+ /* Estimations */
+#define ESTIMATE(fmt, var, what) do { \
+ var = (what); \
+ printf("Estimated " fmt ": %llu ns\n", var); \
+ if (var > INT_MAX) \
+ goto more_samples; \
+ } while (0)
+
+ ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc,
+ bitmap1 - native);
+ ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc,
+ bitmap2 - native);
+ ESTIMATE("total seccomp overhead for 3 full filters", calc,
+ filter1 - native);
+ ESTIMATE("total seccomp overhead for 4 full filters", calc,
+ filter2 - native);
+ ESTIMATE("seccomp entry overhead", entry,
+ bitmap1 - native - (bitmap2 - bitmap1));
+ ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1,
+ filter2 - filter1);
+ ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2,
+ (filter2 - native - entry) / 4);
+
+ printf("Expectations:\n");
+ ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1);
+ bits = compare("native", "≤", "1 filter", native, le, filter1);
+ if (bits)
+ goto more_samples;
+
+ ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)",
+ per_filter1, approx, per_filter2);
+
+ bits = compare("1 bitmapped", "≈", "2 bitmapped",
+ bitmap1 - native, approx, bitmap2 - native);
+ if (bits) {
+ printf("Skipping constant action bitmap expectations: they appear unsupported.\n");
+ goto out;
+ }
- printf("Estimated seccomp entry overhead: %llu ns\n",
- filter1 - native - (filter2 - filter1));
+ ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native);
+ ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native);
+ ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total",
+ entry + (per_filter1 * 4) + native, approx, filter2);
+ if (ret == 0)
+ goto out;
+more_samples:
+ printf("Saw unexpected benchmark result. Try running again with more samples?\n");
+out:
return 0;
}
diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings
index ba4d85f74cd6..6091b45d226b 100644
--- a/tools/testing/selftests/seccomp/settings
+++ b/tools/testing/selftests/seccomp/settings
@@ -1 +1 @@
-timeout=90
+timeout=120