From 431e8fb6f9970414aae6ebd9e3d4bd3e6eb3ba55 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 18 Aug 2015 09:54:20 +1000 Subject: namei: fix warning while make xmldocs caused by namei.c Fix the following warnings: Warning(.//fs/namei.c:2422): No description found for parameter 'nd' Warning(.//fs/namei.c:2422): Excess function parameter 'nameidata' description in 'path_mountpoint' Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: Andrew Morton --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 034fc7d9d04c..d68c21fe4598 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2512,7 +2512,7 @@ done: /** * path_mountpoint - look up a path to be umounted - * @nameidata: lookup context + * @nd: lookup context * @flags: lookup flags * @path: pointer to container for result * -- cgit v1.2.3 From d98c2bf93b5d5deee3d25ba5a33302c81024dc78 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 18 Aug 2015 09:54:20 +1000 Subject: fs/seq_file: convert int seq_vprint/seq_printf/etc... returns to void The seq_ function return values were frequently misused. See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to seq_has_overflowed() and make public") All uses of these return values have been removed, so convert the return types to void. Miscellanea: o Move seq_put_decimal_ and seq_escape prototypes closer the other seq_vprintf prototypes o Reorder seq_putc and seq_puts to return early on overflow o Add argument names to seq_vprintf and seq_printf o Update the seq_escape kernel-doc o Convert a couple of leading spaces to tabs in seq_escape Signed-off-by: Joe Perches Cc: Al Viro Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/seq_file.c | 70 +++++++++++++++++++++--------------------------- include/linux/seq_file.h | 19 ++++++------- 2 files changed, 41 insertions(+), 48 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index 263b125dbcf4..225586e141ca 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -372,16 +372,16 @@ EXPORT_SYMBOL(seq_release); * @esc: set of characters that need escaping * * Puts string into buffer, replacing each occurrence of character from - * @esc with usual octal escape. Returns 0 in case of success, -1 - in - * case of overflow. + * @esc with usual octal escape. + * Use seq_has_overflowed() to check for errors. */ -int seq_escape(struct seq_file *m, const char *s, const char *esc) +void seq_escape(struct seq_file *m, const char *s, const char *esc) { char *end = m->buf + m->size; - char *p; + char *p; char c; - for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { + for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) { if (!strchr(esc, c)) { *p++ = c; continue; @@ -394,14 +394,13 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc) continue; } seq_set_overflow(m); - return -1; - } + return; + } m->count = p - m->buf; - return 0; } EXPORT_SYMBOL(seq_escape); -int seq_vprintf(struct seq_file *m, const char *f, va_list args) +void seq_vprintf(struct seq_file *m, const char *f, va_list args) { int len; @@ -409,24 +408,20 @@ int seq_vprintf(struct seq_file *m, const char *f, va_list args) len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); if (m->count + len < m->size) { m->count += len; - return 0; + return; } } seq_set_overflow(m); - return -1; } EXPORT_SYMBOL(seq_vprintf); -int seq_printf(struct seq_file *m, const char *f, ...) +void seq_printf(struct seq_file *m, const char *f, ...) { - int ret; va_list args; va_start(args, f); - ret = seq_vprintf(m, f, args); + seq_vprintf(m, f, args); va_end(args); - - return ret; } EXPORT_SYMBOL(seq_printf); @@ -664,26 +659,25 @@ int seq_open_private(struct file *filp, const struct seq_operations *ops, } EXPORT_SYMBOL(seq_open_private); -int seq_putc(struct seq_file *m, char c) +void seq_putc(struct seq_file *m, char c) { - if (m->count < m->size) { - m->buf[m->count++] = c; - return 0; - } - return -1; + if (m->count >= m->size) + return; + + m->buf[m->count++] = c; } EXPORT_SYMBOL(seq_putc); -int seq_puts(struct seq_file *m, const char *s) +void seq_puts(struct seq_file *m, const char *s) { int len = strlen(s); - if (m->count + len < m->size) { - memcpy(m->buf + m->count, s, len); - m->count += len; - return 0; + + if (m->count + len >= m->size) { + seq_set_overflow(m); + return; } - seq_set_overflow(m); - return -1; + memcpy(m->buf + m->count, s, len); + m->count += len; } EXPORT_SYMBOL(seq_puts); @@ -694,8 +688,8 @@ EXPORT_SYMBOL(seq_puts); * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ -int seq_put_decimal_ull(struct seq_file *m, char delimiter, - unsigned long long num) +void seq_put_decimal_ull(struct seq_file *m, char delimiter, + unsigned long long num) { int len; @@ -707,35 +701,33 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter, if (num < 10) { m->buf[m->count++] = num + '0'; - return 0; + return; } len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; m->count += len; - return 0; + return; + overflow: seq_set_overflow(m); - return -1; } EXPORT_SYMBOL(seq_put_decimal_ull); -int seq_put_decimal_ll(struct seq_file *m, char delimiter, - long long num) +void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num) { if (num < 0) { if (m->count + 3 >= m->size) { seq_set_overflow(m); - return -1; + return; } if (delimiter) m->buf[m->count++] = delimiter; num = -num; delimiter = '-'; } - return seq_put_decimal_ull(m, delimiter, num); - + seq_put_decimal_ull(m, delimiter, num); } EXPORT_SYMBOL(seq_put_decimal_ll); diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index adeadbd6d7bf..dde00defbaa5 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -114,13 +114,18 @@ int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); -int seq_escape(struct seq_file *, const char *, const char *); -int seq_putc(struct seq_file *m, char c); -int seq_puts(struct seq_file *m, const char *s); int seq_write(struct seq_file *seq, const void *data, size_t len); -__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...); -__printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args); +__printf(2, 0) +void seq_vprintf(struct seq_file *m, const char *fmt, va_list args); +__printf(2, 3) +void seq_printf(struct seq_file *m, const char *fmt, ...); +void seq_putc(struct seq_file *m, char c); +void seq_puts(struct seq_file *m, const char *s); +void seq_put_decimal_ull(struct seq_file *m, char delimiter, + unsigned long long num); +void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num); +void seq_escape(struct seq_file *m, const char *s, const char *esc); void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, @@ -138,10 +143,6 @@ int single_release(struct inode *, struct file *); void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); -int seq_put_decimal_ull(struct seq_file *m, char delimiter, - unsigned long long num); -int seq_put_decimal_ll(struct seq_file *m, char delimiter, - long long num); static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { -- cgit v1.2.3 From f847f3e5c19133cb3f5f806199fd1c2bc112e482 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Aug 2015 09:54:20 +1000 Subject: fs-seq_file-convert-int-seq_vprint-seq_printf-etc-returns-to-void-fix Cc: Al Viro Cc: Joe Perches Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/nsfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index e4905fbf3396..8f20d6016e20 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -142,7 +142,8 @@ static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = dentry->d_fsdata; - return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); + seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); + return 0; } static const struct super_operations nsfs_ops = { -- cgit v1.2.3 From 5d873fcb4e77c22f27a1457f33d637c77a44ee55 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Aug 2015 09:54:20 +1000 Subject: fs-seq_file-convert-int-seq_vprint-seq_printf-etc-returns-to-void-fix-fix fix drivers/iommu/omap-iommu-debug.c for fs-seq_file-convert-int-seq_vprint-seq_printf-etc-returns-to-void.patch Cc: Mark Brown Cc: Al Viro Cc: Joe Perches Cc: Steven Rostedt Cc: Stephen Rothwell Cc: Joerg Roedel Signed-off-by: Andrew Morton --- drivers/iommu/omap-iommu-debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/omap-iommu-debug.c b/drivers/iommu/omap-iommu-debug.c index 0717aa96ce39..9bc20e2119a3 100644 --- a/drivers/iommu/omap-iommu-debug.c +++ b/drivers/iommu/omap-iommu-debug.c @@ -135,8 +135,9 @@ __dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num) static ssize_t iotlb_dump_cr(struct omap_iommu *obj, struct cr_regs *cr, struct seq_file *s) { - return seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram, + seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram, (cr->cam & MMU_CAM_P) ? 1 : 0); + return 0; } static size_t omap_dump_tlb_entries(struct omap_iommu *obj, struct seq_file *s) -- cgit v1.2.3 From 051222d6f62f0342b2913e7c878656b71a2dd705 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 18 Aug 2015 09:54:21 +1000 Subject: mm: mark most vm_operations_struct const With two exceptions (drm/qxl and drm/radeon) all vm_operations_struct structs should be constant. Signed-off-by: Kirill A. Shutemov Reviewed-by: Oleg Nesterov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Minchan Kim Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- drivers/android/binder.c | 2 +- drivers/gpu/drm/vgem/vgem_drv.c | 2 +- drivers/hsi/clients/cmt_speech.c | 2 +- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- drivers/infiniband/hw/qib/qib_mmap.c | 2 +- drivers/media/platform/omap/omap_vout.c | 2 +- drivers/misc/genwqe/card_dev.c | 2 +- drivers/staging/android/ion/ion.c | 2 +- drivers/staging/comedi/comedi_fops.c | 2 +- drivers/video/fbdev/omap2/omapfb/omapfb-main.c | 2 +- drivers/xen/gntalloc.c | 2 +- drivers/xen/gntdev.c | 2 +- drivers/xen/privcmd.c | 4 ++-- fs/ceph/addr.c | 2 +- fs/cifs/file.c | 2 +- security/selinux/selinuxfs.c | 2 +- 17 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 26a46f44e298..b160c0c6baed 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma) { return "[vsyscall]"; } -static struct vm_operations_struct gate_vma_ops = { +static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; static struct vm_area_struct gate_vma = { diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 6607f3c6ace1..a39e85f9efa9 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2834,7 +2834,7 @@ static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -static struct vm_operations_struct binder_vm_ops = { +static const struct vm_operations_struct binder_vm_ops = { .open = binder_vma_open, .close = binder_vma_close, .fault = binder_vm_fault, diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 6394547cf67a..860062ef8814 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -125,7 +125,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } } -static struct vm_operations_struct vgem_gem_vm_ops = { +static const struct vm_operations_struct vgem_gem_vm_ops = { .fault = vgem_gem_fault, .open = drm_gem_vm_open, .close = drm_gem_vm_close, diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c index d04643f9548b..95638df73d1c 100644 --- a/drivers/hsi/clients/cmt_speech.c +++ b/drivers/hsi/clients/cmt_speech.c @@ -1110,7 +1110,7 @@ static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -static struct vm_operations_struct cs_char_vm_ops = { +static const struct vm_operations_struct cs_char_vm_ops = { .fault = cs_char_vma_fault, }; diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 725881890c4a..e449e394963f 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -908,7 +908,7 @@ static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -static struct vm_operations_struct qib_file_vm_ops = { +static const struct vm_operations_struct qib_file_vm_ops = { .fault = qib_file_vma_fault, }; diff --git a/drivers/infiniband/hw/qib/qib_mmap.c b/drivers/infiniband/hw/qib/qib_mmap.c index 146cf29a2e1d..34927b700b0e 100644 --- a/drivers/infiniband/hw/qib/qib_mmap.c +++ b/drivers/infiniband/hw/qib/qib_mmap.c @@ -75,7 +75,7 @@ static void qib_vma_close(struct vm_area_struct *vma) kref_put(&ip->ref, qib_release_mmap_info); } -static struct vm_operations_struct qib_vm_ops = { +static const struct vm_operations_struct qib_vm_ops = { .open = qib_vma_open, .close = qib_vma_close, }; diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c index 7feb6394f111..70c28d19ea04 100644 --- a/drivers/media/platform/omap/omap_vout.c +++ b/drivers/media/platform/omap/omap_vout.c @@ -865,7 +865,7 @@ static void omap_vout_vm_close(struct vm_area_struct *vma) vout->mmap_count--; } -static struct vm_operations_struct omap_vout_vm_ops = { +static const struct vm_operations_struct omap_vout_vm_ops = { .open = omap_vout_vm_open, .close = omap_vout_vm_close, }; diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c index c49d244265ec..70e62d6a3231 100644 --- a/drivers/misc/genwqe/card_dev.c +++ b/drivers/misc/genwqe/card_dev.c @@ -418,7 +418,7 @@ static void genwqe_vma_close(struct vm_area_struct *vma) kfree(dma_map); } -static struct vm_operations_struct genwqe_vma_ops = { +static const struct vm_operations_struct genwqe_vma_ops = { .open = genwqe_vma_open, .close = genwqe_vma_close, }; diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index eec878e183f5..217aa537c4eb 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -997,7 +997,7 @@ static void ion_vm_close(struct vm_area_struct *vma) mutex_unlock(&buffer->lock); } -static struct vm_operations_struct ion_vma_ops = { +static const struct vm_operations_struct ion_vma_ops = { .open = ion_vm_open, .close = ion_vm_close, .fault = ion_vm_fault, diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c index fd54d098ab02..0e8a45102933 100644 --- a/drivers/staging/comedi/comedi_fops.c +++ b/drivers/staging/comedi/comedi_fops.c @@ -2156,7 +2156,7 @@ static void comedi_vm_close(struct vm_area_struct *area) comedi_buf_map_put(bm); } -static struct vm_operations_struct comedi_vm_ops = { +static const struct vm_operations_struct comedi_vm_ops = { .open = comedi_vm_open, .close = comedi_vm_close, }; diff --git a/drivers/video/fbdev/omap2/omapfb/omapfb-main.c b/drivers/video/fbdev/omap2/omapfb/omapfb-main.c index 4f0cbb54d4db..d3af01c94a58 100644 --- a/drivers/video/fbdev/omap2/omapfb/omapfb-main.c +++ b/drivers/video/fbdev/omap2/omapfb/omapfb-main.c @@ -1091,7 +1091,7 @@ static void mmap_user_close(struct vm_area_struct *vma) omapfb_put_mem_region(rg); } -static struct vm_operations_struct mmap_user_ops = { +static const struct vm_operations_struct mmap_user_ops = { .open = mmap_user_open, .close = mmap_user_close, }; diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 14370df9ac1c..4547a91bca67 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -494,7 +494,7 @@ static void gntalloc_vma_close(struct vm_area_struct *vma) mutex_unlock(&gref_mutex); } -static struct vm_operations_struct gntalloc_vmops = { +static const struct vm_operations_struct gntalloc_vmops = { .open = gntalloc_vma_open, .close = gntalloc_vma_close, }; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 0dbb222daaf1..2ea0b3b2a91d 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -433,7 +433,7 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma, return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT]; } -static struct vm_operations_struct gntdev_vmops = { +static const struct vm_operations_struct gntdev_vmops = { .open = gntdev_vma_open, .close = gntdev_vma_close, .find_special_page = gntdev_vma_find_special_page, diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index c6deb87c5c69..5e9adac928e6 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -414,7 +414,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) return 0; } -static struct vm_operations_struct privcmd_vm_ops; +static const struct vm_operations_struct privcmd_vm_ops; static long privcmd_ioctl_mmap_batch(void __user *udata, int version) { @@ -605,7 +605,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -static struct vm_operations_struct privcmd_vm_ops = { +static const struct vm_operations_struct privcmd_vm_ops = { .close = privcmd_close, .fault = privcmd_fault }; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 890c50971a69..a268abfe60ac 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1593,7 +1593,7 @@ out: return err; } -static struct vm_operations_struct ceph_vmops = { +static const struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, }; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2ac2d8471393..94f81962368c 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_LOCKED; } -static struct vm_operations_struct cifs_file_vm_ops = { +static const struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 3d2201413028..5bed7716f8ab 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -472,7 +472,7 @@ static int sel_mmap_policy_fault(struct vm_area_struct *vma, return 0; } -static struct vm_operations_struct sel_mmap_policy_ops = { +static const struct vm_operations_struct sel_mmap_policy_ops = { .fault = sel_mmap_policy_fault, .page_mkwrite = sel_mmap_policy_fault, }; -- cgit v1.2.3 From 8cb1b3e0e2161e329381d2330bb63b1552fe3766 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 18 Aug 2015 09:54:21 +1000 Subject: mm, mpx: add "vm_flags_t vm_flags" arg to do_mmap_pgoff() Add the additional "vm_flags_t vm_flags" argument to do_mmap_pgoff(), rename it to do_mmap(), and re-introduce do_mmap_pgoff() as a simple wrapper on top of do_mmap(). Perhaps we should update the callers of do_mmap_pgoff() and kill it later. This way mpx_mmap() can simply call do_mmap(vm_flags => VM_MPX) and do not play with vm internals. After this change mmap_region() has a single user outside of mmap.c, arch/tile/mm/elf.c:arch_setup_additional_pages(). It would be nice to change arch/tile/ and unexport mmap_region(). Signed-off-by: Oleg Nesterov Acked-by: Dave Hansen Tested-by: Dave Hansen Signed-off-by: Kirill A. Shutemov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Minchan Kim Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/mm/mpx.c | 51 +++++++-------------------------------------------- include/linux/mm.h | 12 ++++++++++-- mm/mmap.c | 10 ++++------ mm/nommu.c | 15 ++++++++------- 4 files changed, 29 insertions(+), 59 deletions(-) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index db1b0bc5017c..134948b0926f 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) */ static unsigned long mpx_mmap(unsigned long len) { - unsigned long ret; - unsigned long addr, pgoff; struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; - struct vm_area_struct *vma; + unsigned long addr, populate; /* Only bounds table can be allocated here */ if (len != mpx_bt_size_bytes(mm)) return -EINVAL; down_write(&mm->mmap_sem); - - /* Too many mappings? */ - if (mm->map_count > sysctl_max_map_count) { - ret = -ENOMEM; - goto out; - } - - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. - */ - addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); - if (addr & ~PAGE_MASK) { - ret = addr; - goto out; - } - - vm_flags = VM_READ | VM_WRITE | VM_MPX | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - - /* Set pgoff according to addr for anon_vma */ - pgoff = addr >> PAGE_SHIFT; - - ret = mmap_region(NULL, addr, len, vm_flags, pgoff); - if (IS_ERR_VALUE(ret)) - goto out; - - vma = find_vma(mm, ret); - if (!vma) { - ret = -ENOMEM; - goto out; - } - - if (vm_flags & VM_LOCKED) { - up_write(&mm->mmap_sem); - mm_populate(ret, len); - return ret; - } - -out: + addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); up_write(&mm->mmap_sem); - return ret; + if (populate) + mm_populate(addr, populate); + + return addr; } enum reg_type { diff --git a/include/linux/mm.h b/include/linux/mm.h index 9dd5da678005..d34da8f5e278 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1878,11 +1878,19 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); -extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate); + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate); extern int do_munmap(struct mm_struct *, unsigned long, size_t); +static inline unsigned long +do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + unsigned long pgoff, unsigned long *populate) +{ + return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate); +} + #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); diff --git a/mm/mmap.c b/mm/mmap.c index 76ada5dd968b..e2d2072c589d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1260,14 +1260,12 @@ static inline int mlock_future_check(struct mm_struct *mm, /* * The caller must hold down_write(¤t->mm->mmap_sem). */ - -unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff, - unsigned long *populate) + unsigned long flags, vm_flags_t vm_flags, + unsigned long pgoff, unsigned long *populate) { struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; *populate = 0; @@ -1311,7 +1309,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) diff --git a/mm/nommu.c b/mm/nommu.c index 1cc0709fcaa5..f1035e29f0a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1233,13 +1233,14 @@ enomem: /* * handle mapping creation for uClinux */ -unsigned long do_mmap_pgoff(struct file *file, - unsigned long addr, - unsigned long len, - unsigned long prot, - unsigned long flags, - unsigned long pgoff, - unsigned long *populate) +unsigned long do_mmap(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + vm_flags_t vm_flags, + unsigned long pgoff, + unsigned long *populate) { struct vm_area_struct *vma; struct vm_region *region; -- cgit v1.2.3 From dcdc170d7a8b4b9c7a3360ef59f7b6bfbb98c3a8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 18 Aug 2015 09:54:21 +1000 Subject: mm-mpx-add-vm_flags_t-vm_flags-arg-to-do_mmap_pgoff-fix mm/nommu.c: In function 'do_mmap': >> mm/nommu.c:1248:30: error: 'vm_flags' redeclared as different kind of symbol unsigned long capabilities, vm_flags, result; ^ mm/nommu.c:1241:15: note: previous definition of 'vm_flags' was here vm_flags_t vm_flags, Reported-by: Wu Fengguang Reported-by: Oleg Nesterov Cc: Stephen Rothwell Tested-by: Mark Salter Signed-off-by: Andrew Morton --- mm/nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index f1035e29f0a7..d28173689f52 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1245,7 +1245,7 @@ unsigned long do_mmap(struct file *file, struct vm_area_struct *vma; struct vm_region *region; struct rb_node *rb; - unsigned long capabilities, vm_flags, result; + unsigned long capabilities, result; int ret; *populate = 0; @@ -1263,7 +1263,7 @@ unsigned long do_mmap(struct file *file, /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ - vm_flags = determine_vm_flags(file, prot, flags, capabilities); + vm_flags |= determine_vm_flags(file, prot, flags, capabilities); /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); -- cgit v1.2.3 From c52f66db48bc33872d87eb4bc88d662e67cf2297 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Aug 2015 09:54:21 +1000 Subject: mm-mpx-add-vm_flags_t-vm_flags-arg-to-do_mmap_pgoff-fix-checkpatch-fixes WARNING: Possible unwrapped commit description (prefer a maximum 75 chars per line) #5: >> mm/nommu.c:1248:30: error: 'vm_flags' redeclared as different kind of symbol WARNING: please, no spaces at the start of a line #28: FILE: mm/nommu.c:1248: + unsigned long capabilities, result;$ WARNING: please, no spaces at the start of a line #37: FILE: mm/nommu.c:1266: + vm_flags |= determine_vm_flags(file, prot, flags, capabilities);$ total: 0 errors, 3 warnings, 16 lines checked ./patches/mm-mpx-add-vm_flags_t-vm_flags-arg-to-do_mmap_pgoff-fix.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton --- mm/nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index d28173689f52..ab14a2014dea 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1245,7 +1245,7 @@ unsigned long do_mmap(struct file *file, struct vm_area_struct *vma; struct vm_region *region; struct rb_node *rb; - unsigned long capabilities, result; + unsigned long capabilities, result; int ret; *populate = 0; @@ -1263,7 +1263,7 @@ unsigned long do_mmap(struct file *file, /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ - vm_flags |= determine_vm_flags(file, prot, flags, capabilities); + vm_flags |= determine_vm_flags(file, prot, flags, capabilities); /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); -- cgit v1.2.3 From 019350542c6ad04ad85a5dc7f9336b9073c72f93 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 18 Aug 2015 09:54:22 +1000 Subject: mm: make sure all file VMAs have ->vm_ops set We rely on vma->vm_ops == NULL to detect anonymous VMA: see vma_is_anonymous(), but some drivers doesn't set ->vm_ops. As a result we can end up with anonymous page in private file mapping. That should not lead to serious misbehaviour, but nevertheless is wrong. Let's fix by setting up dummy ->vm_ops for file mmapping if f_op->mmap() didn't set its own. The patch also adds sanity check into __vma_link_rb(). It will help catch broken VMAs which inserted directly into mm_struct via insert_vm_struct(). Signed-off-by: Kirill A. Shutemov Reviewed-by: Oleg Nesterov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Minchan Kim Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- mm/mmap.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index e2d2072c589d..8e0366e0bd1d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -612,6 +612,8 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, struct rb_node **rb_link, struct rb_node *rb_parent) { + WARN_ONCE(vma->vm_file && !vma->vm_ops, "missing vma->vm_ops"); + /* Update tracking information for the gap following the new vma. */ if (vma->vm_next) vma_gap_update(vma->vm_next); @@ -1636,6 +1638,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ WARN_ON_ONCE(addr != vma->vm_start); + /* All file mapping must have ->vm_ops set */ + if (!vma->vm_ops) { + static const struct vm_operations_struct dummy_ops = {}; + vma->vm_ops = &dummy_ops; + } + addr = vma->vm_start; vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { -- cgit v1.2.3 From d322499daaf60bca3732b491e214d660d6edd9f8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 18 Aug 2015 09:54:22 +1000 Subject: mm: use vma_is_anonymous() in create_huge_pmd() and wp_huge_pmd() Let's use helper rather than direct check of vma->vm_ops to distinguish anonymous VMA. Signed-off-by: Kirill A. Shutemov Reviewed-by: Oleg Nesterov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Minchan Kim Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 558ee16167d9..7f6a9563d5a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3233,7 +3233,7 @@ out: static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { - if (!vma->vm_ops) + if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); if (vma->vm_ops->pmd_fault) return vma->vm_ops->pmd_fault(vma, address, pmd, flags); @@ -3244,7 +3244,7 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd, unsigned int flags) { - if (!vma->vm_ops) + if (vma_is_anonymous(vma)) return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); if (vma->vm_ops->pmd_fault) return vma->vm_ops->pmd_fault(vma, address, pmd, flags); -- cgit v1.2.3 From de7eef4cf917fb39cd05f83a11e08e3859ea02f4 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 18 Aug 2015 09:54:22 +1000 Subject: mm, madvise: use vma_is_anonymous() to check for anon VMA !vma->vm_file is not reliable to detect anon VMA, because not all drivers bother set it. Let's use vma_is_anonymous() instead. Signed-off-by: Kirill A. Shutemov Acked-by: Minchan Kim Reviewed-by: Oleg Nesterov Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- mm/madvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index fa6479aca0c9..16441f550cdf 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -393,7 +393,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, return -EINVAL; /* MADV_FREE works for only anon vma at the moment */ - if (vma->vm_file) + if (!vma_is_anonymous(vma)) return -EINVAL; start = max(vma->vm_start, start_addr); -- cgit v1.2.3 From 7c4c91d7a4b964a061842b98ded7c0a1ecfe4985 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 18 Aug 2015 09:54:22 +1000 Subject: sys_membarrier(): system-wide memory barrier (generic, x86) Here is an implementation of a new system call, sys_membarrier(), which executes a memory barrier on all threads running on the system. It is implemented by calling synchronize_sched(). It can be used to distribute the cost of user-space memory barriers asymmetrically by transforming pairs of memory barriers into pairs consisting of sys_membarrier() and a compiler barrier. For synchronization primitives that distinguish between read-side and write-side (e.g. userspace RCU [1], rwlocks), the read-side can be accelerated significantly by moving the bulk of the memory barrier overhead to the write-side. The existing applications of which I am aware that would be improved by this system call are as follows: * Through Userspace RCU library (http://urcu.so) - DNS server (Knot DNS) https://www.knot-dns.cz/ - Network sniffer (http://netsniff-ng.org/) - Distributed object storage (https://sheepdog.github.io/sheepdog/) - User-space tracing (http://lttng.org) - Network storage system (https://www.gluster.org/) - Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf) - Financial software (https://lkml.org/lkml/2015/3/23/189) Those projects use RCU in userspace to increase read-side speed and scalability compared to locking. Especially in the case of RCU used by libraries, sys_membarrier can speed up the read-side by moving the bulk of the memory barrier cost to synchronize_rcu(). * Direct users of sys_membarrier - core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198) Microsoft core dotnet GC developers are planning to use the mprotect() side-effect of issuing memory barriers through IPIs as a way to implement Windows FlushProcessWriteBuffers() on Linux. They are referring to sys_membarrier in their github thread, specifically stating that sys_membarrier() is what they are looking for. To explain the benefit of this scheme, let's introduce two example threads: Thread A (non-frequent, e.g. executing liburcu synchronize_rcu()) Thread B (frequent, e.g. executing liburcu rcu_read_lock()/rcu_read_unlock()) In a scheme where all smp_mb() in thread A are ordering memory accesses with respect to smp_mb() present in Thread B, we can change each smp_mb() within Thread A into calls to sys_membarrier() and each smp_mb() within Thread B into compiler barriers "barrier()". Before the change, we had, for each smp_mb() pairs: Thread A Thread B previous mem accesses previous mem accesses smp_mb() smp_mb() following mem accesses following mem accesses After the change, these pairs become: Thread A Thread B prev mem accesses prev mem accesses sys_membarrier() barrier() follow mem accesses follow mem accesses As we can see, there are two possible scenarios: either Thread B memory accesses do not happen concurrently with Thread A accesses (1), or they do (2). 1) Non-concurrent Thread A vs Thread B accesses: Thread A Thread B prev mem accesses sys_membarrier() follow mem accesses prev mem accesses barrier() follow mem accesses In this case, thread B accesses will be weakly ordered. This is OK, because at that point, thread A is not particularly interested in ordering them with respect to its own accesses. 2) Concurrent Thread A vs Thread B accesses Thread A Thread B prev mem accesses prev mem accesses sys_membarrier() barrier() follow mem accesses follow mem accesses In this case, thread B accesses, which are ensured to be in program order thanks to the compiler barrier, will be "upgraded" to full smp_mb() by synchronize_sched(). * Benchmarks On Intel Xeon E5405 (8 cores) (one thread is calling sys_membarrier, the other 7 threads are busy looping) 1000 non-expedited sys_membarrier calls in 33s =3D 33 milliseconds/call. * User-space user of this system call: Userspace RCU library Both the signal-based and the sys_membarrier userspace RCU schemes permit us to remove the memory barrier from the userspace RCU rcu_read_lock() and rcu_read_unlock() primitives, thus significantly accelerating them. These memory barriers are replaced by compiler barriers on the read-side, and all matching memory barriers on the write-side are turned into an invocation of a memory barrier on all active threads in the process. By letting the kernel perform this synchronization rather than dumbly sending a signal to every process threads (as we currently do), we diminish the number of unnecessary wake ups and only issue the memory barriers on active threads. Non-running threads do not need to execute such barrier anyway, because these are implied by the scheduler context switches. Results in liburcu: Operations in 10s, 6 readers, 2 writers: memory barriers in reader: 1701557485 reads, 2202847 writes signal-based scheme: 9830061167 reads, 6700 writes sys_membarrier: 9952759104 reads, 425 writes sys_membarrier (dyn. check): 7970328887 reads, 425 writes The dynamic sys_membarrier availability check adds some overhead to the read-side compared to the signal-based scheme, but besides that, sys_membarrier slightly outperforms the signal-based scheme. However, this non-expedited sys_membarrier implementation has a much slower grace period than signal and memory barrier schemes. Besides diminishing the number of wake-ups, one major advantage of the membarrier system call over the signal-based scheme is that it does not need to reserve a signal. This plays much more nicely with libraries, and with processes injected into for tracing purposes, for which we cannot expect that signals will be unused by the application. An expedited version of this system call can be added later on to speed up the grace period. Its implementation will likely depend on reading the cpu_curr()->mm without holding each CPU's rq lock. This patch adds the system call to x86 and to asm-generic. [1] http://urcu.so membarrier(2) man page: MEMBARRIER(2) Linux Programmer's Manual MEMBARRIER(2) NAME membarrier - issue memory barriers on a set of threads SYNOPSIS #include int membarrier(int cmd, int flags); DESCRIPTION The cmd argument is one of the following: MEMBARRIER_CMD_QUERY Query the set of supported commands. It returns a bitmask of supported commands. MEMBARRIER_CMD_SHARED Execute a memory barrier on all threads running on the system. Upon return from system call, the caller thread is ensured that all running threads have passed through a state where all memory accesses to user-space addresses match program order between entry to and return from the system call (non-running threads are de facto in such a state). This covers threads from all pro=E2=80=90 cesses running on the system. This command returns 0. The flags argument needs to be 0. For future extensions. All memory accesses performed in program order from each targeted thread is guaranteed to be ordered with respect to sys_membarrier(). If we use the semantic "barrier()" to represent a compiler barrier forcing memory accesses to be performed in program order across the barrier, and smp_mb() to represent explicit memory barriers forcing full memory ordering across the barrier, we have the following ordering table for each pair of barrier(), sys_membarrier() and smp_mb(): The pair ordering is detailed as (O: ordered, X: not ordered): barrier() smp_mb() sys_membarrier() barrier() X X O smp_mb() X O O sys_membarrier() O O O RETURN VALUE On success, these system calls return zero. On error, -1 is returned, and errno is set appropriately. For a given command, with flags argument set to 0, this system call is guaranteed to always return the same value until reboot. ERRORS ENOSYS System call is not implemented. EINVAL Invalid arguments. Linux 2015-04-15 MEMBARRIER(2) Signed-off-by: Mathieu Desnoyers Reviewed-by: Paul E. McKenney Reviewed-by: Josh Triplett Cc: KOSAKI Motohiro Cc: Steven Rostedt Cc: Nicholas Miell Cc: Linus Torvalds Cc: Ingo Molnar Cc: Alan Cox Cc: Lai Jiangshan Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: David Howells Cc: Pranith Kumar Cc: Michael Kerrisk Cc: Shuah Khan Signed-off-by: Andrew Morton --- MAINTAINERS | 8 +++++ arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 2 ++ include/uapi/asm-generic/unistd.h | 4 ++- include/uapi/linux/Kbuild | 1 + include/uapi/linux/membarrier.h | 53 +++++++++++++++++++++++++++ init/Kconfig | 12 +++++++ kernel/Makefile | 1 + kernel/membarrier.c | 66 ++++++++++++++++++++++++++++++++++ kernel/sys_ni.c | 3 ++ 11 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/membarrier.h create mode 100644 kernel/membarrier.c diff --git a/MAINTAINERS b/MAINTAINERS index 1f25f3c168f4..bee86ed96a92 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6787,6 +6787,14 @@ W: http://www.mellanox.com Q: http://patchwork.ozlabs.org/project/netdev/list/ F: drivers/net/ethernet/mellanox/mlxsw/ +MEMBARRIER SUPPORT +M: Mathieu Desnoyers +M: "Paul E. McKenney" +L: linux-kernel@vger.kernel.org +S: Supported +F: kernel/membarrier.c +F: include/uapi/linux/membarrier.h + MEMORY MANAGEMENT L: linux-mm@kvack.org W: http://www.linux-mm.org diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 41e72a50c2ed..24310bbf8688 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -382,3 +382,4 @@ 373 i386 shutdown sys_shutdown 374 i386 userfaultfd sys_userfaultfd 375 i386 mlock2 sys_mlock2 +376 i386 membarrier sys_membarrier diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 23669007b85d..6a8737e6f15e 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -331,6 +331,7 @@ 322 64 execveat stub_execveat 323 common userfaultfd sys_userfaultfd 324 common mlock2 sys_mlock2 +325 common membarrier sys_membarrier # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 890632cbf353..9ca6a26558b8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -887,4 +887,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename, asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags); +asmlinkage long sys_membarrier(int cmd, int flags); + #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 14a6013cbdac..2803f41d8a7e 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -711,9 +711,11 @@ __SYSCALL(__NR_bpf, sys_bpf) __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) #define __NR_mlock2 282 __SYSCALL(__NR_mlock2, sys_mlock2) +#define __NR_membarrier 283 +__SYSCALL(__NR_membarrier, sys_membarrier) #undef __NR_syscalls -#define __NR_syscalls 283 +#define __NR_syscalls 284 /* * All syscalls below here should go away really, diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index ba94b9bb7bee..e77707802dcc 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -253,6 +253,7 @@ header-y += mdio.h header-y += media.h header-y += media-bus-format.h header-y += mei.h +header-y += membarrier.h header-y += memfd.h header-y += mempolicy.h header-y += meye.h diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h new file mode 100644 index 000000000000..e0b108bd2624 --- /dev/null +++ b/include/uapi/linux/membarrier.h @@ -0,0 +1,53 @@ +#ifndef _UAPI_LINUX_MEMBARRIER_H +#define _UAPI_LINUX_MEMBARRIER_H + +/* + * linux/membarrier.h + * + * membarrier system call API + * + * Copyright (c) 2010, 2015 Mathieu Desnoyers + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * enum membarrier_cmd - membarrier system call command + * @MEMBARRIER_CMD_QUERY: Query the set of supported commands. It returns + * a bitmask of valid commands. + * @MEMBARRIER_CMD_SHARED: Execute a memory barrier on all running threads. + * Upon return from system call, the caller thread + * is ensured that all running threads have passed + * through a state where all memory accesses to + * user-space addresses match program order between + * entry to and return from the system call + * (non-running threads are de facto in such a + * state). This covers threads from all processes + * running on the system. This command returns 0. + * + * Command to be passed to the membarrier system call. The commands need to + * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to + * the value 0. + */ +enum membarrier_cmd { + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_SHARED = (1 << 0), +}; + +#endif /* _UAPI_LINUX_MEMBARRIER_H */ diff --git a/init/Kconfig b/init/Kconfig index 88ee0fe59048..7a645f7380d2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1690,6 +1690,18 @@ config PCI_QUIRKS bugs/quirks. Disable this only if your target machine is unaffected by PCI quirks. +config MEMBARRIER + bool "Enable membarrier() system call" if EXPERT + default y + help + Enable the membarrier() system call that allows issuing memory + barriers across all running threads, which can be used to distribute + the cost of user-space memory barriers asymmetrically by transforming + pairs of memory barriers into pairs consisting of membarrier() and a + compiler barrier. + + If unsure, say Y. + config EMBEDDED bool "Embedded system" option allnoconfig_y diff --git a/kernel/Makefile b/kernel/Makefile index 0edb1d10b800..f07f8d18b3d6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -101,6 +101,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/membarrier.c b/kernel/membarrier.c new file mode 100644 index 000000000000..536c727a56e9 --- /dev/null +++ b/kernel/membarrier.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2010, 2015 Mathieu Desnoyers + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, or if the command argument is invalid, + * this system call returns -EINVAL. For a given command, with flags argument + * set to 0, this system call is guaranteed to always return the same value + * until reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + return MEMBARRIER_CMD_BITMASK; + case MEMBARRIER_CMD_SHARED: + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 8de5b2645796..0623787ec67a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -246,3 +246,6 @@ cond_syscall(sys_bpf); /* execveat */ cond_syscall(sys_execveat); + +/* membarrier */ +cond_syscall(sys_membarrier); -- cgit v1.2.3 From 5edfc1ccd054fc515ba26eacc1974fca0d72a923 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 18 Aug 2015 09:54:23 +1000 Subject: selftests: add membarrier syscall test Add a self test for the membarrier system call. Signed-off-by: Pranith Kumar Signed-off-by: Mathieu Desnoyers Cc: Michael Ellerman Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/membarrier/.gitignore | 1 + tools/testing/selftests/membarrier/Makefile | 11 ++++ .../testing/selftests/membarrier/membarrier_test.c | 71 ++++++++++++++++++++++ 4 files changed, 84 insertions(+) create mode 100644 tools/testing/selftests/membarrier/.gitignore create mode 100644 tools/testing/selftests/membarrier/Makefile create mode 100644 tools/testing/selftests/membarrier/membarrier_test.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2cb20b7188ae..87b4609eb817 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -7,6 +7,7 @@ TARGETS += ftrace TARGETS += futex TARGETS += kcmp TARGETS += kdbus +TARGETS += membarrier TARGETS += memfd TARGETS += memory-hotplug TARGETS += mount diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore new file mode 100644 index 000000000000..020c44f49a9e --- /dev/null +++ b/tools/testing/selftests/membarrier/.gitignore @@ -0,0 +1 @@ +membarrier_test diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile new file mode 100644 index 000000000000..877a50355d7f --- /dev/null +++ b/tools/testing/selftests/membarrier/Makefile @@ -0,0 +1,11 @@ +CFLAGS += -g -I../../../../usr/include/ + +all: + $(CC) $(CFLAGS) membarrier_test.c -o membarrier_test + +TEST_PROGS := membarrier_test + +include ../lib.mk + +clean: + $(RM) membarrier_test diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c new file mode 100644 index 000000000000..3c9f2179acd8 --- /dev/null +++ b/tools/testing/selftests/membarrier/membarrier_test.c @@ -0,0 +1,71 @@ +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +static int sys_membarrier(int cmd, int flags) +{ + return syscall(__NR_membarrier, cmd, flags); +} + +static void test_membarrier_fail(void) +{ + int cmd = -1, flags = 0; + + if (sys_membarrier(cmd, flags) != -1) { + printf("membarrier: Should fail but passed\n"); + ksft_exit_fail(); + } +} + +static void test_membarrier_success(void) +{ + int flags = 0; + + if (sys_membarrier(MEMBARRIER_CMD_SHARED, flags) != 0) { + printf("membarrier: Executing MEMBARRIER failed, %s\n", + strerror(errno)); + ksft_exit_fail(); + } + + printf("membarrier: MEMBARRIER_CMD_SHARED success\n"); +} + +static void test_membarrier(void) +{ + test_membarrier_fail(); + test_membarrier_success(); +} + +static int test_membarrier_exists(void) +{ + int flags = 0; + + if (sys_membarrier(MEMBARRIER_CMD_QUERY, flags)) + return 0; + + return 1; +} + +int main(int argc, char **argv) +{ + printf("membarrier: MEMBARRIER_CMD_QUERY "); + if (test_membarrier_exists()) { + printf("syscall implemented\n"); + test_membarrier(); + } else { + printf("syscall not implemented!\n"); + return ksft_exit_fail(); + } + + printf("membarrier: tests done!\n"); + + return ksft_exit_pass(); +} -- cgit v1.2.3 From 1ad880cc38899f12d097272e7d03104b0ae6b7d9 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 18 Aug 2015 09:54:23 +1000 Subject: selftests: enhance membarrier syscall test Update the membarrier syscall self-test to match the membarrier interface. Extend coverage of the interface. Consider ENOSYS as a "SKIP" test, since it is a valid configuration, but does not allow testing the system call. Signed-off-by: Mathieu Desnoyers Cc: Michael Ellerman Cc: Pranith Kumar Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../testing/selftests/membarrier/membarrier_test.c | 100 +++++++++++++++------ 1 file changed, 75 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c index 3c9f2179acd8..dde312508007 100644 --- a/tools/testing/selftests/membarrier/membarrier_test.c +++ b/tools/testing/selftests/membarrier/membarrier_test.c @@ -10,62 +10,112 @@ #include "../kselftest.h" +enum test_membarrier_status { + TEST_MEMBARRIER_PASS = 0, + TEST_MEMBARRIER_FAIL, + TEST_MEMBARRIER_SKIP, +}; + static int sys_membarrier(int cmd, int flags) { return syscall(__NR_membarrier, cmd, flags); } -static void test_membarrier_fail(void) +static enum test_membarrier_status test_membarrier_cmd_fail(void) { int cmd = -1, flags = 0; if (sys_membarrier(cmd, flags) != -1) { - printf("membarrier: Should fail but passed\n"); - ksft_exit_fail(); + printf("membarrier: Wrong command should fail but passed.\n"); + return TEST_MEMBARRIER_FAIL; + } + return TEST_MEMBARRIER_PASS; +} + +static enum test_membarrier_status test_membarrier_flags_fail(void) +{ + int cmd = MEMBARRIER_CMD_QUERY, flags = 1; + + if (sys_membarrier(cmd, flags) != -1) { + printf("membarrier: Wrong flags should fail but passed.\n"); + return TEST_MEMBARRIER_FAIL; } + return TEST_MEMBARRIER_PASS; } -static void test_membarrier_success(void) +static enum test_membarrier_status test_membarrier_success(void) { - int flags = 0; + int cmd = MEMBARRIER_CMD_SHARED, flags = 0; - if (sys_membarrier(MEMBARRIER_CMD_SHARED, flags) != 0) { - printf("membarrier: Executing MEMBARRIER failed, %s\n", + if (sys_membarrier(cmd, flags) != 0) { + printf("membarrier: Executing MEMBARRIER_CMD_SHARED failed. %s.\n", strerror(errno)); - ksft_exit_fail(); + return TEST_MEMBARRIER_FAIL; } - printf("membarrier: MEMBARRIER_CMD_SHARED success\n"); + printf("membarrier: MEMBARRIER_CMD_SHARED success.\n"); + return TEST_MEMBARRIER_PASS; } -static void test_membarrier(void) +static enum test_membarrier_status test_membarrier(void) { - test_membarrier_fail(); - test_membarrier_success(); + enum test_membarrier_status status; + + status = test_membarrier_cmd_fail(); + if (status) + return status; + status = test_membarrier_flags_fail(); + if (status) + return status; + status = test_membarrier_success(); + if (status) + return status; + return TEST_MEMBARRIER_PASS; } -static int test_membarrier_exists(void) +static enum test_membarrier_status test_membarrier_query(void) { - int flags = 0; - - if (sys_membarrier(MEMBARRIER_CMD_QUERY, flags)) - return 0; + int flags = 0, ret; - return 1; + printf("membarrier MEMBARRIER_CMD_QUERY "); + ret = sys_membarrier(MEMBARRIER_CMD_QUERY, flags); + if (ret < 0) { + printf("failed. %s.\n", strerror(errno)); + switch (errno) { + case ENOSYS: + /* + * It is valid to build a kernel with + * CONFIG_MEMBARRIER=n. However, this skips the tests. + */ + return TEST_MEMBARRIER_SKIP; + case EINVAL: + default: + return TEST_MEMBARRIER_FAIL; + } + } + if (!(ret & MEMBARRIER_CMD_SHARED)) { + printf("command MEMBARRIER_CMD_SHARED is not supported.\n"); + return TEST_MEMBARRIER_FAIL; + } + printf("syscall available.\n"); + return TEST_MEMBARRIER_PASS; } int main(int argc, char **argv) { - printf("membarrier: MEMBARRIER_CMD_QUERY "); - if (test_membarrier_exists()) { - printf("syscall implemented\n"); - test_membarrier(); - } else { - printf("syscall not implemented!\n"); + switch (test_membarrier_query()) { + case TEST_MEMBARRIER_FAIL: return ksft_exit_fail(); + case TEST_MEMBARRIER_SKIP: + return ksft_exit_skip(); + } + switch (test_membarrier()) { + case TEST_MEMBARRIER_FAIL: + return ksft_exit_fail(); + case TEST_MEMBARRIER_SKIP: + return ksft_exit_skip(); } printf("membarrier: tests done!\n"); - return ksft_exit_pass(); } -- cgit v1.2.3 From 47407f1ce4351bb5106172e437450224b069ea57 Mon Sep 17 00:00:00 2001 From: Levente Kurusa Date: Tue, 18 Aug 2015 09:54:23 +1000 Subject: drivers/w1/w1_int.c: call put_device if device_register fails Currently, memsetting and kfreeing the device is bad behaviour. The device will have a reference count of 1 and hence can cause trouble because it has kfree'd. Proper way to handle a failed device_register is to call put_device right after it fails. Signed-off-by: Levente Kurusa Acked-by: Evgeniy Polyakov Signed-off-by: Andrew Morton --- drivers/w1/w1_int.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index 47249a30eae3..20f766afa4c7 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -91,8 +91,7 @@ static struct w1_master *w1_alloc_dev(u32 id, int slave_count, int slave_ttl, err = device_register(&dev->dev); if (err) { pr_err("Failed to register master device. err=%d\n", err); - memset(dev, 0, sizeof(struct w1_master)); - kfree(dev); + put_device(&dev->dev); dev = NULL; } -- cgit v1.2.3