From bf80e8b13d777f7b068fd97a0602b1f9fbf2050b Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Fri, 12 Mar 2010 17:13:25 -0800 Subject: LINARO: ubuntu sauce based on UBUNTU: (no-up) fold down debian for ubuntu-q v3.5-rc1 rebase With packaging bits removed Ignore: yes Signed-off-by: Leann Ogasawara Signed-off-by: John Rigby --- Makefile | 13 +++++++++++-- arch/arm/Kconfig | 2 ++ arch/powerpc/Kconfig | 2 ++ arch/x86/Kconfig | 8 ++++++++ ubuntu/Kconfig | 17 +++++++++++++++++ ubuntu/Makefile | 20 ++++++++++++++++++++ ubuntu/include/README | 4 ++++ 7 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 ubuntu/Kconfig create mode 100644 ubuntu/Makefile create mode 100644 ubuntu/include/README diff --git a/Makefile b/Makefile index d845c2a1aa68..b1e029a80064 100644 --- a/Makefile +++ b/Makefile @@ -349,14 +349,23 @@ CFLAGS_KERNEL = AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage +# Prefer linux-backports-modules +ifneq ($(KBUILD_SRC),) +ifneq ($(shell if test -e $(KBUILD_OUTPUT)/ubuntu-build; then echo yes; fi),yes) +UBUNTUINCLUDE := -I/usr/src/linux-headers-lbm-$(KERNELRELEASE) +endif +endif # Use LINUXINCLUDE when you must reference the include/ directory. # Needed to be compatible with the O= option -LINUXINCLUDE := -I$(srctree)/arch/$(hdr-arch)/include \ +LINUXINCLUDE := $(UBUNTUINCLUDE) -I$(srctree)/arch/$(hdr-arch)/include \ -Iarch/$(hdr-arch)/include/generated -Iinclude \ $(if $(KBUILD_SRC), -I$(srctree)/include) \ -include $(srctree)/include/linux/kconfig.h +# UBUNTU: Include our third party driver stuff too +LINUXINCLUDE += -Iubuntu/include $(if $(KBUILD_SRC),-I$(srctree)/ubuntu/include) + KBUILD_CPPFLAGS := -D__KERNEL__ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ @@ -504,7 +513,7 @@ scripts: scripts_basic include/config/auto.conf include/config/tristate.conf # Objects we will link into vmlinux / subdirs we need to visit init-y := init/ -drivers-y := drivers/ sound/ firmware/ +drivers-y := drivers/ sound/ firmware/ ubuntu/ net-y := net/ libs-y := lib/ core-y := usr/ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b4524b9bc191..9176e10249cd 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2314,6 +2314,8 @@ source "net/Kconfig" source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "fs/Kconfig" source "arch/arm/Kconfig.debug" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 050cb371a69e..a9363d28dbbd 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -986,6 +986,8 @@ source "net/Kconfig" source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "fs/Kconfig" source "arch/powerpc/sysdev/qe_lib/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c70684f859e1..8e083c36f519 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -381,6 +381,12 @@ config X86_UV # Following is an alphabetically sorted list of 32 bit extended platforms # Please maintain the alphabetic order if and when there are additions +config X86_LPIA + bool "LPIA-compatible" + depends on X86_32 && X86_PC + help + Choose this option if your computer is an LPIA platform. + config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI @@ -2220,6 +2226,8 @@ source "net/Kconfig" source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "drivers/firmware/Kconfig" source "fs/Kconfig" diff --git a/ubuntu/Kconfig b/ubuntu/Kconfig new file mode 100644 index 000000000000..f15ccd8b7f4c --- /dev/null +++ b/ubuntu/Kconfig @@ -0,0 +1,17 @@ +menu "Ubuntu Supplied Third-Party Device Drivers" + +# +# NOTE: to allow drivers to be added and removed without causing merge +# collisions you should add new entries in the middle of the six lines +# of ## at the bottom of the list. Always add three lines of ## above +# your new entry and maintain the six lines below. +# + +## +## +## +## +## +## + +endmenu diff --git a/ubuntu/Makefile b/ubuntu/Makefile new file mode 100644 index 000000000000..5823ca0b2f65 --- /dev/null +++ b/ubuntu/Makefile @@ -0,0 +1,20 @@ +# +# Makefile for the Linux kernel ubuntu supplied third-party device drivers. +# + +# +# NOTE: to allow drivers to be added and removed without causing merge +# collisions you should add new entries in the middle of the six lines +# of ## at the bottom of the list. Always add three lines of ## above +# your new entry and maintain the six lines below. +# + +## +## +## +## +## +## + +# This is a stupid trick to get kbuild to create ubuntu/built-in.o +obj- += foo.o diff --git a/ubuntu/include/README b/ubuntu/include/README new file mode 100644 index 000000000000..adc8d33e6d8e --- /dev/null +++ b/ubuntu/include/README @@ -0,0 +1,4 @@ +Only use this directory for things which need to share their headers with +other parts of the kernel or other modules in ubuntu/ + +Otherwise, keep them local to the module directory. -- cgit v1.2.3 From 8da1b7a13a9f4123491a551ee0d9501923f91937 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Thu, 14 Feb 2008 09:55:49 -0500 Subject: UBUNTU: ubuntu: dm-raid4-5 -- (no-up) Export dm_disk function of device-mapper This function is externally used by the dm-raid4-5 module. Signed-off-by: Stefan Bader Signed-off-by: Ben Collins --- drivers/md/dm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e24143cc2040..ea1f089b128e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2653,6 +2653,7 @@ struct gendisk *dm_disk(struct mapped_device *md) { return md->disk; } +EXPORT_SYMBOL_GPL(dm_disk); struct kobject *dm_kobject(struct mapped_device *md) { -- cgit v1.2.3 From 89b0f2ad42d7c0ccd732cbe3654acf47c63806f9 Mon Sep 17 00:00:00 2001 From: Manoj Iyer Date: Wed, 1 Jul 2009 17:51:07 -0500 Subject: UBUNTU: ubuntu: dm-raid45 -- version 2009.04.24 (2.6.30-rc3) ExternalDriver: dm-raid45 Description: This software extends device-mapper by RAID4 and RAID5 mappings. Url: http://people.redhat.com/~heinzm/sw/dm/dm-raid45/ Version: 2009.04.24 (2.6.30-rc3) Signed-off-by: Manoj Iyer Signed-off-by: Tim Gardner --- ubuntu/Kconfig | 4 + ubuntu/Makefile | 4 + ubuntu/dm-raid4-5/BOM | 3 + ubuntu/dm-raid4-5/Kconfig | 6 + ubuntu/dm-raid4-5/Makefile | 4 + ubuntu/dm-raid4-5/dm-memcache.c | 301 +++ ubuntu/dm-raid4-5/dm-memcache.h | 68 + ubuntu/dm-raid4-5/dm-message.c | 183 ++ ubuntu/dm-raid4-5/dm-message.h | 91 + ubuntu/dm-raid4-5/dm-raid4-5.c | 4547 ++++++++++++++++++++++++++++++++++++ ubuntu/dm-raid4-5/dm-raid4-5.h | 27 + ubuntu/dm-raid4-5/dm-raid45.h | 28 + ubuntu/dm-raid4-5/dm-region-hash.c | 718 ++++++ ubuntu/dm-raid4-5/dm-region-hash.h | 108 + 14 files changed, 6092 insertions(+) create mode 100644 ubuntu/dm-raid4-5/BOM create mode 100644 ubuntu/dm-raid4-5/Kconfig create mode 100644 ubuntu/dm-raid4-5/Makefile create mode 100644 ubuntu/dm-raid4-5/dm-memcache.c create mode 100644 ubuntu/dm-raid4-5/dm-memcache.h create mode 100644 ubuntu/dm-raid4-5/dm-message.c create mode 100644 ubuntu/dm-raid4-5/dm-message.h create mode 100644 ubuntu/dm-raid4-5/dm-raid4-5.c create mode 100644 ubuntu/dm-raid4-5/dm-raid4-5.h create mode 100644 ubuntu/dm-raid4-5/dm-raid45.h create mode 100644 ubuntu/dm-raid4-5/dm-region-hash.c create mode 100644 ubuntu/dm-raid4-5/dm-region-hash.h diff --git a/ubuntu/Kconfig b/ubuntu/Kconfig index f15ccd8b7f4c..042f8ad518b3 100644 --- a/ubuntu/Kconfig +++ b/ubuntu/Kconfig @@ -7,6 +7,10 @@ menu "Ubuntu Supplied Third-Party Device Drivers" # your new entry and maintain the six lines below. # +## +## +## +source "ubuntu/dm-raid4-5/Kconfig" ## ## ## diff --git a/ubuntu/Makefile b/ubuntu/Makefile index 5823ca0b2f65..4ec1f3d6677d 100644 --- a/ubuntu/Makefile +++ b/ubuntu/Makefile @@ -9,6 +9,10 @@ # your new entry and maintain the six lines below. # +## +## +## +obj-$(CONFIG_DM_RAID45) += dm-raid4-5/ ## ## ## diff --git a/ubuntu/dm-raid4-5/BOM b/ubuntu/dm-raid4-5/BOM new file mode 100644 index 000000000000..dd29442289da --- /dev/null +++ b/ubuntu/dm-raid4-5/BOM @@ -0,0 +1,3 @@ +Downloaded from: http://people.redhat.com/~heinzm/sw/dm/dm-raid45/ +Current Version: 2009.04.24 (2.6.30-rc3) +Comments: All of the patches to dmraid1/dm-log, etc are upstream. diff --git a/ubuntu/dm-raid4-5/Kconfig b/ubuntu/dm-raid4-5/Kconfig new file mode 100644 index 000000000000..3ce3296ca094 --- /dev/null +++ b/ubuntu/dm-raid4-5/Kconfig @@ -0,0 +1,6 @@ +config DM_RAID45 + tristate "RAID 4/5 target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + default m + ---help--- + A target that supports RAID4 and RAID5 mappings. diff --git a/ubuntu/dm-raid4-5/Makefile b/ubuntu/dm-raid4-5/Makefile new file mode 100644 index 000000000000..9a32796d73f1 --- /dev/null +++ b/ubuntu/dm-raid4-5/Makefile @@ -0,0 +1,4 @@ +EXTRA_CFLAGS += -I$(srctree)/drivers/md + +obj-$(CONFIG_DM_RAID45) := dm-raid45.o +dm-raid45-objs := dm-raid4-5.o dm-memcache.o dm-region-hash.o dm-message.o diff --git a/ubuntu/dm-raid4-5/dm-memcache.c b/ubuntu/dm-raid4-5/dm-memcache.c new file mode 100644 index 000000000000..4e3731ca1f17 --- /dev/null +++ b/ubuntu/dm-raid4-5/dm-memcache.c @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * Device-mapper memory object handling: + * + * o allocate/free total_pages in a per client page pool. + * + * o allocate/free memory objects with chunks (1..n) of + * pages_per_chunk pages hanging off. + * + * This file is released under the GPL. + */ + +#define DM_MEM_CACHE_VERSION "0.2" + +#include "dm.h" +#include "dm-memcache.h" +#include + +struct dm_mem_cache_client { + spinlock_t lock; + mempool_t *objs_pool; + struct page_list *free_list; + unsigned objects; + unsigned chunks; + unsigned pages_per_chunk; + unsigned free_pages; + unsigned total_pages; +}; + +/* + * Free pages and page_list elements of client. + */ +static void free_cache_pages(struct page_list *list) +{ + while (list) { + struct page_list *pl = list; + + list = pl->next; + BUG_ON(!pl->page); + __free_page(pl->page); + kfree(pl); + } +} + +/* + * Alloc number of pages and page_list elements as required by client. + */ +static struct page_list *alloc_cache_pages(unsigned pages) +{ + struct page_list *pl, *ret = NULL; + struct page *page; + + while (pages--) { + page = alloc_page(GFP_NOIO); + if (!page) + goto err; + + pl = kmalloc(sizeof(*pl), GFP_NOIO); + if (!pl) { + __free_page(page); + goto err; + } + + pl->page = page; + pl->next = ret; + ret = pl; + } + + return ret; + +err: + free_cache_pages(ret); + return NULL; +} + +/* + * Allocate page_list elements from the pool to chunks of the memory object. + */ +static void alloc_chunks(struct dm_mem_cache_client *cl, + struct dm_mem_cache_object *obj) +{ + unsigned chunks = cl->chunks; + unsigned long flags; + + local_irq_save(flags); + local_irq_disable(); + while (chunks--) { + unsigned p = cl->pages_per_chunk; + + obj[chunks].pl = NULL; + + while (p--) { + struct page_list *pl; + + /* Take next element from free list */ + spin_lock(&cl->lock); + pl = cl->free_list; + BUG_ON(!pl); + cl->free_list = pl->next; + spin_unlock(&cl->lock); + + pl->next = obj[chunks].pl; + obj[chunks].pl = pl; + } + } + + local_irq_restore(flags); +} + +/* + * Free page_list elements putting them back onto free list + */ +static void free_chunks(struct dm_mem_cache_client *cl, + struct dm_mem_cache_object *obj) +{ + unsigned chunks = cl->chunks; + unsigned long flags; + struct page_list *next, *pl; + + local_irq_save(flags); + local_irq_disable(); + while (chunks--) { + for (pl = obj[chunks].pl; pl; pl = next) { + next = pl->next; + + spin_lock(&cl->lock); + pl->next = cl->free_list; + cl->free_list = pl; + cl->free_pages++; + spin_unlock(&cl->lock); + } + } + + local_irq_restore(flags); +} + +/* + * Create/destroy dm memory cache client resources. + */ +struct dm_mem_cache_client * +dm_mem_cache_client_create(unsigned objects, unsigned chunks, + unsigned pages_per_chunk) +{ + unsigned total_pages = objects * chunks * pages_per_chunk; + struct dm_mem_cache_client *client; + + BUG_ON(!total_pages); + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (!client) + return ERR_PTR(-ENOMEM); + + client->objs_pool = mempool_create_kmalloc_pool(objects, + chunks * sizeof(struct dm_mem_cache_object)); + if (!client->objs_pool) + goto err; + + client->free_list = alloc_cache_pages(total_pages); + if (!client->free_list) + goto err1; + + spin_lock_init(&client->lock); + client->objects = objects; + client->chunks = chunks; + client->pages_per_chunk = pages_per_chunk; + client->free_pages = client->total_pages = total_pages; + return client; + +err1: + mempool_destroy(client->objs_pool); +err: + kfree(client); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(dm_mem_cache_client_create); + +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl) +{ + BUG_ON(cl->free_pages != cl->total_pages); + free_cache_pages(cl->free_list); + mempool_destroy(cl->objs_pool); + kfree(cl); +} +EXPORT_SYMBOL(dm_mem_cache_client_destroy); + +/* + * Grow a clients cache by an amount of pages. + * + * Don't call from interrupt context! + */ +int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects) +{ + unsigned pages = objects * cl->chunks * cl->pages_per_chunk; + struct page_list *pl, *last; + + BUG_ON(!pages); + pl = alloc_cache_pages(pages); + if (!pl) + return -ENOMEM; + + last = pl; + while (last->next) + last = last->next; + + spin_lock_irq(&cl->lock); + last->next = cl->free_list; + cl->free_list = pl; + cl->free_pages += pages; + cl->total_pages += pages; + cl->objects++; + spin_unlock_irq(&cl->lock); + + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO); + return 0; +} +EXPORT_SYMBOL(dm_mem_cache_grow); + +/* Shrink a clients cache by an amount of pages */ +int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects) +{ + int r; + unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages; + unsigned long flags; + struct page_list *last = NULL, *pl, *pos; + + BUG_ON(!pages); + + spin_lock_irqsave(&cl->lock, flags); + pl = pos = cl->free_list; + while (p-- && pos->next) { + last = pos; + pos = pos->next; + } + + if (++p) + r = -ENOMEM; + else { + r = 0; + cl->free_list = pos; + cl->free_pages -= pages; + cl->total_pages -= pages; + cl->objects--; + last->next = NULL; + } + spin_unlock_irqrestore(&cl->lock, flags); + + if (!r) { + free_cache_pages(pl); + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO); + } + + return r; +} +EXPORT_SYMBOL(dm_mem_cache_shrink); + +/* + * Allocate/free a memory object + * + * Can be called from interrupt context + */ +struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl) +{ + int r = 0; + unsigned pages = cl->chunks * cl->pages_per_chunk; + unsigned long flags; + struct dm_mem_cache_object *obj; + + obj = mempool_alloc(cl->objs_pool, GFP_NOIO); + if (!obj) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&cl->lock, flags); + if (pages > cl->free_pages) + r = -ENOMEM; + else + cl->free_pages -= pages; + spin_unlock_irqrestore(&cl->lock, flags); + + if (r) { + mempool_free(obj, cl->objs_pool); + return ERR_PTR(r); + } + + alloc_chunks(cl, obj); + return obj; +} +EXPORT_SYMBOL(dm_mem_cache_alloc); + +void dm_mem_cache_free(struct dm_mem_cache_client *cl, + struct dm_mem_cache_object *obj) +{ + free_chunks(cl, obj); + mempool_free(obj, cl->objs_pool); +} +EXPORT_SYMBOL(dm_mem_cache_free); + +MODULE_DESCRIPTION(DM_NAME " dm memory cache"); +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); diff --git a/ubuntu/dm-raid4-5/dm-memcache.h b/ubuntu/dm-raid4-5/dm-memcache.h new file mode 100644 index 000000000000..87e4256daf5d --- /dev/null +++ b/ubuntu/dm-raid4-5/dm-memcache.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * Device-mapper memory object handling: + * + * o allocate/free total_pages in a per client page pool. + * + * o allocate/free memory objects with chunks (1..n) of + * pages_per_chunk pages hanging off. + * + * This file is released under the GPL. + */ + +#ifndef _DM_MEM_CACHE_H +#define _DM_MEM_CACHE_H + +#define DM_MEM_CACHE_H_VERSION "0.1" + +#include "dm.h" +#include + +static inline struct page_list *pl_elem(struct page_list *pl, unsigned p) +{ + while (pl && p--) + pl = pl->next; + + return pl; +} + +struct dm_mem_cache_object { + struct page_list *pl; /* Dynamically allocated array */ + void *private; /* Caller context reference */ +}; + +struct dm_mem_cache_client; + +/* + * Create/destroy dm memory cache client resources. + * + * On creation, a number of @objects with @chunks of + * @pages_per_chunk pages will be allocated. + */ +struct dm_mem_cache_client * +dm_mem_cache_client_create(unsigned objects, unsigned chunks, + unsigned pages_per_chunk); +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *client); + +/* + * Grow/shrink a dm memory cache client resources + * by @objetcs amount of objects. + */ +int dm_mem_cache_grow(struct dm_mem_cache_client *client, unsigned objects); +int dm_mem_cache_shrink(struct dm_mem_cache_client *client, unsigned objects); + +/* + * Allocate/free a memory object + * + * On allocation one object with an amount of chunks and + * an amount of pages per chunk will be returned on success. + */ +struct dm_mem_cache_object * +dm_mem_cache_alloc(struct dm_mem_cache_client *client); +void dm_mem_cache_free(struct dm_mem_cache_client *client, + struct dm_mem_cache_object *object); + +#endif diff --git a/ubuntu/dm-raid4-5/dm-message.c b/ubuntu/dm-raid4-5/dm-message.c new file mode 100644 index 000000000000..a66b0152cb3d --- /dev/null +++ b/ubuntu/dm-raid4-5/dm-message.c @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2007,2008 Red Hat Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * General device-mapper message interface argument parser. + * + * This file is released under the GPL. + * + * device-mapper message parser. + * + */ + +#include "dm.h" +#include "dm-message.h" +#include + +#define DM_MSG_PREFIX "dm_message" + +/* Basename of a path. */ +static inline char * +basename(char *s) +{ + char *p = strrchr(s, '/'); + + return p ? p + 1 : s; +} + +/* Get an argument depending on type. */ +static void +message_arguments(struct dm_msg *msg, int argc, char **argv) +{ + + if (argc) { + int i; + struct dm_message_argument *args = msg->spec->args; + + for (i = 0; i < args->num_args; i++) { + int r; + unsigned long **ptr = args->ptr; + enum dm_message_argument_type type = args->types[i]; + + switch (type) { + case dm_msg_base_t: + ((char **) ptr)[i] = basename(argv[i]); + break; + + case dm_msg_str_t: + ((char **) ptr)[i] = argv[i]; + break; + + case dm_msg_int_t: + r = sscanf(argv[i], "%d", ((int **) ptr)[i]); + goto check; + + case dm_msg_uint_t: + r = sscanf(argv[i], "%u", + ((unsigned **) ptr)[i]); + goto check; + + case dm_msg_uint64_t: + r = sscanf(argv[i], "%llu", + ((unsigned long long **) ptr)[i]); + +check: + if (r != 1) { + set_bit(dm_msg_ret_undef, &msg->ret); + set_bit(dm_msg_ret_arg, &msg->ret); + } + } + } + } +} + +/* Parse message options. */ +static void +message_options_parse(struct dm_msg *msg, int argc, char **argv) +{ + int hit = 0; + unsigned long *action; + size_t l1 = strlen(*argv), l_hit = 0; + struct dm_message_option *o = msg->spec->options; + char **option, **option_end = o->options + o->num_options; + + for (option = o->options, action = o->actions; + option < option_end; option++, action++) { + size_t l2 = strlen(*option); + + if (!strnicmp(*argv, *option, min(l1, l2))) { + hit++; + l_hit = l2; + set_bit(*action, &msg->action); + } + } + + /* Assume error. */ + msg->ret = 0; + set_bit(dm_msg_ret_option, &msg->ret); + if (!hit || l1 > l_hit) + set_bit(dm_msg_ret_undef, &msg->ret); /* Undefined option. */ + else if (hit > 1) + set_bit(dm_msg_ret_ambiguous, &msg->ret); /* Ambiguous option.*/ + else { + clear_bit(dm_msg_ret_option, &msg->ret); /* Option OK. */ + message_arguments(msg, --argc, ++argv); + } +} + +static inline void +print_ret(const char *caller, unsigned long ret) +{ + struct { + unsigned long err; + const char *err_str; + } static err_msg[] = { + { dm_msg_ret_ambiguous, "message ambiguous" }, + { dm_msg_ret_inval, "message invalid" }, + { dm_msg_ret_undef, "message undefined" }, + { dm_msg_ret_arg, "message argument" }, + { dm_msg_ret_argcount, "message argument count" }, + { dm_msg_ret_option, "option" }, + }, *e = ARRAY_END(err_msg); + + while (e-- > err_msg) { + if (test_bit(e->err, &ret)) + DMERR("%s %s", caller, e->err_str); + } +} + +/* Parse a message action. */ +int +dm_message_parse(const char *caller, struct dm_msg *msg, void *context, + int argc, char **argv) +{ + int hit = 0; + size_t l1, l_hit = 0; + struct dm_msg_spec *s, *s_hit = NULL, + *s_end = msg->specs + msg->num_specs; + + if (argc < 2) + return -EINVAL; + + l1 = strlen(*argv); + for (s = msg->specs; s < s_end; s++) { + size_t l2 = strlen(s->cmd); + + if (!strnicmp(*argv, s->cmd, min(l1, l2))) { + hit++; + l_hit = l2; + s_hit = s; + } + } + + msg->ret = 0; + if (!hit || l1 > l_hit) /* No hit or message string too long. */ + set_bit(dm_msg_ret_undef, &msg->ret); + else if (hit > 1) /* Ambiguous message. */ + set_bit(dm_msg_ret_ambiguous, &msg->ret); + else if (argc - 2 != s_hit->args->num_args) { + set_bit(dm_msg_ret_undef, &msg->ret); + set_bit(dm_msg_ret_argcount, &msg->ret); + } + + if (msg->ret) + goto bad; + + msg->action = 0; + msg->spec = s_hit; + set_bit(s_hit->action, &msg->action); + message_options_parse(msg, --argc, ++argv); + + if (!msg->ret) + return msg->spec->f(msg, context); + +bad: + print_ret(caller, msg->ret); + return -EINVAL; +} +EXPORT_SYMBOL(dm_message_parse); + +MODULE_DESCRIPTION(DM_NAME " device-mapper target message parser"); +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); diff --git a/ubuntu/dm-raid4-5/dm-message.h b/ubuntu/dm-raid4-5/dm-message.h new file mode 100644 index 000000000000..2024534c5bf0 --- /dev/null +++ b/ubuntu/dm-raid4-5/dm-message.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2007,2008 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * General device-mapper message interface argument parser. + * + * This file is released under the GPL. + * + */ + +#ifndef DM_MESSAGE_H +#define DM_MESSAGE_H + +/* Factor out to dm.h. */ +/* Reference to array end. */ +#define ARRAY_END(a) ((a) + ARRAY_SIZE(a)) + +/* Message return bits. */ +enum dm_message_return { + dm_msg_ret_ambiguous, /* Action ambiguous. */ + dm_msg_ret_inval, /* Action invalid. */ + dm_msg_ret_undef, /* Action undefined. */ + + dm_msg_ret_option, /* Option error. */ + dm_msg_ret_arg, /* Argument error. */ + dm_msg_ret_argcount, /* Argument count error. */ +}; + +/* Message argument type conversions. */ +enum dm_message_argument_type { + dm_msg_base_t, /* Basename string. */ + dm_msg_str_t, /* String. */ + dm_msg_int_t, /* Signed int. */ + dm_msg_uint_t, /* Unsigned int. */ + dm_msg_uint64_t, /* Unsigned int 64. */ +}; + +/* A message option. */ +struct dm_message_option { + unsigned num_options; + char **options; + unsigned long *actions; +}; + +/* Message arguments and types. */ +struct dm_message_argument { + unsigned num_args; + unsigned long **ptr; + enum dm_message_argument_type types[]; +}; + +/* Client message. */ +struct dm_msg { + unsigned long action; /* Identified action. */ + unsigned long ret; /* Return bits. */ + unsigned num_specs; /* # of sepcifications listed. */ + struct dm_msg_spec *specs; /* Specification list. */ + struct dm_msg_spec *spec; /* Specification selected. */ +}; + +/* Secification of the message. */ +struct dm_msg_spec { + const char *cmd; /* Name of the command (i.e. 'bandwidth'). */ + unsigned long action; + struct dm_message_option *options; + struct dm_message_argument *args; + unsigned long parm; /* Parameter to pass through to callback. */ + /* Function to process for action. */ + int (*f) (struct dm_msg *msg, void *context); +}; + +/* Parameter access macros. */ +#define DM_MSG_PARM(msg) ((msg)->spec->parm) + +#define DM_MSG_STR_ARGS(msg, idx) ((char *) *(msg)->spec->args->ptr[idx]) +#define DM_MSG_INT_ARGS(msg, idx) ((int) *(msg)->spec->args->ptr[idx]) +#define DM_MSG_UINT_ARGS(msg, idx) ((unsigned) DM_MSG_INT_ARG(msg, idx)) +#define DM_MSG_UINT64_ARGS(msg, idx) ((uint64_t) *(msg)->spec->args->ptr[idx]) + +#define DM_MSG_STR_ARG(msg) DM_MSG_STR_ARGS(msg, 0) +#define DM_MSG_INT_ARG(msg) DM_MSG_INT_ARGS(msg, 0) +#define DM_MSG_UINT_ARG(msg) DM_MSG_UINT_ARGS(msg, 0) +#define DM_MSG_UINT64_ARG(msg) DM_MSG_UINT64_ARGS(msg, 0) + + +/* Parse a message and its options and optionally call a function back. */ +int dm_message_parse(const char *caller, struct dm_msg *msg, void *context, + int argc, char **argv); + +#endif diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.c b/ubuntu/dm-raid4-5/dm-raid4-5.c new file mode 100644 index 000000000000..52b21e9d189e --- /dev/null +++ b/ubuntu/dm-raid4-5/dm-raid4-5.c @@ -0,0 +1,4547 @@ +/*[A[A + * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * + * Linux 2.6 Device Mapper RAID4 and RAID5 target. + * + * Supports: + * o RAID4 with dedicated and selectable parity device + * o RAID5 with rotating parity (left+right, symmetric+asymmetric) + * o recovery of out of sync device for initial + * RAID set creation or after dead drive replacement + * o run time optimization of xor algorithm used to calculate parity + * + * + * Thanks to MD for: + * o the raid address calculation algorithm + * o the base of the biovec <-> page list copier. + * + * + * Uses region hash to keep track of how many writes are in flight to + * regions in order to use dirty log to keep state of regions to recover: + * + * o clean regions (those which are synchronized + * and don't have write io in flight) + * o dirty regions (those with write io in flight) + * + * + * On startup, any dirty regions are migrated to the + * 'nosync' state and are subject to recovery by the daemon. + * + * See raid_ctr() for table definition. + * + * FIXME: recovery bandwidth + */ + +static const char *version = "v0.2594b"; + +#include "dm.h" +#include "dm-memcache.h" +#include "dm-message.h" +#include "dm-raid45.h" + +#include +#include +#include + +#include +#include +#include +#include "dm-region-hash.h" + + +/* + * Configurable parameters + */ + +/* Minimum/maximum and default # of selectable stripes. */ +#define STRIPES_MIN 8 +#define STRIPES_MAX 16384 +#define STRIPES_DEFAULT 80 + +/* Maximum and default chunk size in sectors if not set in constructor. */ +#define CHUNK_SIZE_MIN 8 +#define CHUNK_SIZE_MAX 16384 +#define CHUNK_SIZE_DEFAULT 64 + +/* Default io size in sectors if not set in constructor. */ +#define IO_SIZE_MIN CHUNK_SIZE_MIN +#define IO_SIZE_DEFAULT IO_SIZE_MIN + +/* Recover io size default in sectors. */ +#define RECOVER_IO_SIZE_MIN 64 +#define RECOVER_IO_SIZE_DEFAULT 256 + +/* Default, minimum and maximum percentage of recover io bandwidth. */ +#define BANDWIDTH_DEFAULT 10 +#define BANDWIDTH_MIN 1 +#define BANDWIDTH_MAX 100 + +/* # of parallel recovered regions */ +#define RECOVERY_STRIPES_MIN 1 +#define RECOVERY_STRIPES_MAX 64 +#define RECOVERY_STRIPES_DEFAULT RECOVERY_STRIPES_MIN +/* + * END Configurable parameters + */ + +#define TARGET "dm-raid45" +#define DAEMON "kraid45d" +#define DM_MSG_PREFIX TARGET + +#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT) + +/* Amount/size for __xor(). */ +#define XOR_SIZE PAGE_SIZE + +/* Check value in range. */ +#define range_ok(i, min, max) (i >= min && i <= max) + +/* Check argument is power of 2. */ +#define POWER_OF_2(a) (!(a & (a - 1))) + +/* Structure access macros. */ +/* Derive raid_set from stripe_cache pointer. */ +#define RS(x) container_of(x, struct raid_set, sc) + +/* Page reference. */ +#define PAGE(stripe, p) ((stripe)->obj[p].pl->page) + +/* Stripe chunk reference. */ +#define CHUNK(stripe, p) ((stripe)->chunk + p) + +/* Bio list reference. */ +#define BL(stripe, p, rw) (stripe->chunk[p].bl + rw) +#define BL_CHUNK(chunk, rw) (chunk->bl + rw) + +/* Page list reference. */ +#define PL(stripe, p) (stripe->obj[p].pl) +/* END: structure access macros. */ + +/* Factor out to dm-bio-list.h */ +static inline void bio_list_push(struct bio_list *bl, struct bio *bio) +{ + bio->bi_next = bl->head; + bl->head = bio; + + if (!bl->tail) + bl->tail = bio; +} + +/* Factor out to dm.h */ +#define TI_ERR_RET(str, ret) \ + do { ti->error = str; return ret; } while (0); +#define TI_ERR(str) TI_ERR_RET(str, -EINVAL) + +/* Macro to define access IO flags access inline functions. */ +#define BITOPS(name, what, var, flag) \ +static inline int TestClear ## name ## what(struct var *v) \ +{ return test_and_clear_bit(flag, &v->io.flags); } \ +static inline int TestSet ## name ## what(struct var *v) \ +{ return test_and_set_bit(flag, &v->io.flags); } \ +static inline void Clear ## name ## what(struct var *v) \ +{ clear_bit(flag, &v->io.flags); } \ +static inline void Set ## name ## what(struct var *v) \ +{ set_bit(flag, &v->io.flags); } \ +static inline int name ## what(struct var *v) \ +{ return test_bit(flag, &v->io.flags); } + +/*----------------------------------------------------------------- + * Stripe cache + * + * Cache for all reads and writes to raid sets (operational or degraded) + * + * We need to run all data to and from a RAID set through this cache, + * because parity chunks need to get calculated from data chunks + * or, in the degraded/resynchronization case, missing chunks need + * to be reconstructed using the other chunks of the stripe. + *---------------------------------------------------------------*/ +/* A chunk within a stripe (holds bios hanging off). */ +/* IO status flags for chunks of a stripe. */ +enum chunk_flags { + CHUNK_DIRTY, /* Pages of chunk dirty; need writing. */ + CHUNK_ERROR, /* IO error on any chunk page. */ + CHUNK_IO, /* Allow/prohibit IO on chunk pages. */ + CHUNK_LOCKED, /* Chunk pages locked during IO. */ + CHUNK_MUST_IO, /* Chunk must io. */ + CHUNK_UNLOCK, /* Enforce chunk unlock. */ + CHUNK_UPTODATE, /* Chunk pages are uptodate. */ +}; + +#if READ != 0 || WRITE != 1 +#error dm-raid45: READ/WRITE != 0/1 used as index!!! +#endif + +enum bl_type { + WRITE_QUEUED = WRITE + 1, + WRITE_MERGED, + NR_BL_TYPES, /* Must be last one! */ +}; +struct stripe_chunk { + atomic_t cnt; /* Reference count. */ + struct stripe *stripe; /* Backpointer to stripe for endio(). */ + /* Bio lists for reads, writes, and writes merged. */ + struct bio_list bl[NR_BL_TYPES]; + struct { + unsigned long flags; /* IO status flags. */ + } io; +}; + +/* Define chunk bit operations. */ +BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY) +BITOPS(Chunk, Error, stripe_chunk, CHUNK_ERROR) +BITOPS(Chunk, Io, stripe_chunk, CHUNK_IO) +BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED) +BITOPS(Chunk, MustIo, stripe_chunk, CHUNK_MUST_IO) +BITOPS(Chunk, Unlock, stripe_chunk, CHUNK_UNLOCK) +BITOPS(Chunk, Uptodate, stripe_chunk, CHUNK_UPTODATE) + +/* + * Stripe linked list indexes. Keep order, because the stripe + * and the stripe cache rely on the first 3! + */ +enum list_types { + LIST_FLUSH, /* Stripes to flush for io. */ + LIST_ENDIO, /* Stripes to endio. */ + LIST_LRU, /* Least recently used stripes. */ + SC_NR_LISTS, /* # of lists in stripe cache. */ + LIST_HASH = SC_NR_LISTS, /* Hashed stripes. */ + LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */ + STRIPE_NR_LISTS,/* To size array in struct stripe. */ +}; + +/* Adressing region recovery. */ +struct recover_addr { + struct dm_region *reg; /* Actual region to recover. */ + sector_t pos; /* Position within region to recover. */ + sector_t end; /* End of region to recover. */ +}; + +/* A stripe: the io object to handle all reads and writes to a RAID set. */ +struct stripe { + atomic_t cnt; /* Reference count. */ + struct stripe_cache *sc; /* Backpointer to stripe cache. */ + + /* + * 4 linked lists: + * o io list to flush io + * o endio list + * o LRU list to put stripes w/o reference count on + * o stripe cache hash + */ + struct list_head lists[STRIPE_NR_LISTS]; + + sector_t key; /* Hash key. */ + region_t region; /* Region stripe is mapped to. */ + + struct { + unsigned long flags; /* Stripe state flags (see below). */ + + /* + * Pending ios in flight: + * + * used to control move of stripe to endio list + */ + atomic_t pending; + + /* Sectors to read and write for multi page stripe sets. */ + unsigned size; + } io; + + /* Address region recovery. */ + struct recover_addr *recover; + + /* Lock on stripe (Future: for clustering). */ + void *lock; + + struct { + unsigned short parity; /* Parity chunk index. */ + short recover; /* Recovery chunk index. */ + } idx; + + /* + * This stripe's memory cache object (dm-mem-cache); + * i.e. the io chunk pages. + */ + struct dm_mem_cache_object *obj; + + /* Array of stripe sets (dynamically allocated). */ + struct stripe_chunk chunk[0]; +}; + +/* States stripes can be in (flags field). */ +enum stripe_states { + STRIPE_ERROR, /* io error on stripe. */ + STRIPE_MERGED, /* Writes got merged to be written. */ + STRIPE_RBW, /* Read-before-write stripe. */ + STRIPE_RECONSTRUCT, /* Reconstruct of a missing chunk required. */ + STRIPE_RECONSTRUCTED, /* Reconstructed of a missing chunk. */ + STRIPE_RECOVER, /* Stripe used for RAID set recovery. */ +}; + +/* Define stripe bit operations. */ +BITOPS(Stripe, Error, stripe, STRIPE_ERROR) +BITOPS(Stripe, Merged, stripe, STRIPE_MERGED) +BITOPS(Stripe, RBW, stripe, STRIPE_RBW) +BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT) +BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED) +BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER) + +/* A stripe hash. */ +struct stripe_hash { + struct list_head *hash; + unsigned buckets; + unsigned mask; + unsigned prime; + unsigned shift; +}; + +enum sc_lock_types { + LOCK_ENDIO, /* Protect endio list. */ + LOCK_LRU, /* Protect LRU list. */ + NR_LOCKS, /* To size array in struct stripe_cache. */ +}; + +/* A stripe cache. */ +struct stripe_cache { + /* Stripe hash. */ + struct stripe_hash hash; + + spinlock_t locks[NR_LOCKS]; /* Locks to protect lists. */ + + /* Stripes with io to flush, stripes to endio and LRU lists. */ + struct list_head lists[SC_NR_LISTS]; + + /* Slab cache to allocate stripes from. */ + struct { + struct kmem_cache *cache; /* Cache itself. */ + char name[32]; /* Unique name. */ + } kc; + + struct dm_io_client *dm_io_client; /* dm-io client resource context. */ + + /* dm-mem-cache client resource context. */ + struct dm_mem_cache_client *mem_cache_client; + + int stripes_parm; /* # stripes parameter from constructor. */ + atomic_t stripes; /* actual # of stripes in cache. */ + atomic_t stripes_to_set; /* # of stripes to resize cache to. */ + atomic_t stripes_last; /* last # of stripes in cache. */ + atomic_t active_stripes; /* actual # of active stripes in cache. */ + + /* REMOVEME: */ + atomic_t active_stripes_max; /* actual # of active stripes in cache. */ +}; + +/* Flag specs for raid_dev */ ; +enum raid_dev_flags { + DEV_FAILED, /* Device failed. */ + DEV_IO_QUEUED, /* Io got queued to device. */ +}; + +/* The raid device in a set. */ +struct raid_dev { + struct dm_dev *dev; + sector_t start; /* Offset to map to. */ + struct { /* Using struct to be able to BITOPS(). */ + unsigned long flags; /* raid_dev_flags. */ + } io; +}; + +BITOPS(Dev, Failed, raid_dev, DEV_FAILED) +BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED) + +/* Flags spec for raid_set. */ +enum raid_set_flags { + RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */ + RS_DEAD, /* RAID set inoperational. */ + RS_DEGRADED, /* Io errors on RAID device. */ + RS_DEVEL_STATS, /* REMOVEME: display status information. */ + RS_RECOVER, /* Do recovery. */ + RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */ + RS_SC_BUSY, /* Stripe cache busy -> send an event. */ + RS_SUSPEND, /* Suspend RAID set. */ +}; + +/* REMOVEME: devel stats counters. */ +enum stats_types { + S_BIOS_READ, + S_BIOS_ADDED_READ, + S_BIOS_ENDIO_READ, + S_BIOS_WRITE, + S_BIOS_ADDED_WRITE, + S_BIOS_ENDIO_WRITE, + S_CAN_MERGE, + S_CANT_MERGE, + S_CONGESTED, + S_DM_IO_READ, + S_DM_IO_WRITE, + S_BANDWIDTH, + S_BARRIER, + S_BIO_COPY_PL_NEXT, + S_DEGRADED, + S_DELAYED_BIOS, + S_FLUSHS, + S_HITS_1ST, + S_IOS_POST, + S_INSCACHE, + S_MAX_LOOKUP, + S_CHUNK_LOCKED, + S_NO_BANDWIDTH, + S_NOT_CONGESTED, + S_NO_RW, + S_NOSYNC, + S_OVERWRITE, + S_PROHIBITCHUNKIO, + S_RECONSTRUCT_EI, + S_RECONSTRUCT_DEV, + S_RECONSTRUCT_SET, + S_RECONSTRUCTED, + S_REQUEUE, + S_STRIPE_ERROR, + S_SUM_DELAYED_BIOS, + S_XORS, + S_NR_STATS, /* # of stats counters. Must be last! */ +}; + +/* Status type -> string mappings. */ +struct stats_map { + const enum stats_types type; + const char *str; +}; + +static struct stats_map stats_map[] = { + { S_BIOS_READ, "r=" }, + { S_BIOS_ADDED_READ, "/" }, + { S_BIOS_ENDIO_READ, "/" }, + { S_BIOS_WRITE, " w=" }, + { S_BIOS_ADDED_WRITE, "/" }, + { S_BIOS_ENDIO_WRITE, "/" }, + { S_DM_IO_READ, " rc=" }, + { S_DM_IO_WRITE, " wc=" }, + { S_BANDWIDTH, "\nbw=" }, + { S_NO_BANDWIDTH, " no_bw=" }, + { S_BARRIER, "\nbarrier=" }, + { S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" }, + { S_CAN_MERGE, "\nmerge=" }, + { S_CANT_MERGE, "/no_merge=" }, + { S_CHUNK_LOCKED, "\nchunk_locked=" }, + { S_CONGESTED, "\ncgst=" }, + { S_NOT_CONGESTED, "/not_cgst=" }, + { S_DEGRADED, "\ndegraded=" }, + { S_DELAYED_BIOS, "\ndel_bios=" }, + { S_SUM_DELAYED_BIOS, "/sum_del_bios=" }, + { S_FLUSHS, "\nflushs=" }, + { S_HITS_1ST, "\nhits_1st=" }, + { S_IOS_POST, " ios_post=" }, + { S_INSCACHE, " inscache=" }, + { S_MAX_LOOKUP, " maxlookup=" }, + { S_NO_RW, "\nno_rw=" }, + { S_NOSYNC, " nosync=" }, + { S_OVERWRITE, " ovr=" }, + { S_PROHIBITCHUNKIO, " prhbt_io=" }, + { S_RECONSTRUCT_EI, "\nrec_ei=" }, + { S_RECONSTRUCT_DEV, " rec_dev=" }, + { S_RECONSTRUCT_SET, " rec_set=" }, + { S_RECONSTRUCTED, " rec=" }, + { S_REQUEUE, " requeue=" }, + { S_STRIPE_ERROR, " stripe_err=" }, + { S_XORS, " xors=" }, +}; + +/* + * A RAID set. + */ +#define dm_rh_client dm_region_hash +enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT }; +typedef void (*xor_function_t)(unsigned count, unsigned long **data); +struct raid_set { + struct dm_target *ti; /* Target pointer. */ + + struct { + unsigned long flags; /* State flags. */ + struct mutex in_lock; /* Protects central input list below. */ + struct bio_list in; /* Pending ios (central input list). */ + struct bio_list work; /* ios work set. */ + wait_queue_head_t suspendq; /* suspend synchronization. */ + atomic_t in_process; /* counter of queued bios (suspendq). */ + atomic_t in_process_max;/* counter of queued bios max. */ + + /* io work. */ + struct workqueue_struct *wq; + struct delayed_work dws_do_raid; /* For main worker. */ + struct work_struct ws_do_table_event; /* For event worker. */ + } io; + + /* Stripe locking abstraction. */ + struct dm_raid45_locking_type *locking; + + struct stripe_cache sc; /* Stripe cache for this set. */ + + /* Xor optimization. */ + struct { + struct xor_func *f; + unsigned chunks; + unsigned speed; + } xor; + + /* Recovery parameters. */ + struct recover { + struct dm_dirty_log *dl; /* Dirty log. */ + struct dm_rh_client *rh; /* Region hash. */ + + struct dm_io_client *dm_io_client; /* recovery dm-io client. */ + /* dm-mem-cache client resource context for recovery stripes. */ + struct dm_mem_cache_client *mem_cache_client; + + struct list_head stripes; /* List of recovery stripes. */ + + region_t nr_regions; + region_t nr_regions_to_recover; + region_t nr_regions_recovered; + unsigned long start_jiffies; + unsigned long end_jiffies; + + unsigned bandwidth; /* Recovery bandwidth [%]. */ + unsigned bandwidth_work; /* Recovery bandwidth [factor]. */ + unsigned bandwidth_parm; /* " constructor parm. */ + unsigned io_size; /* recovery io size <= region size. */ + unsigned io_size_parm; /* recovery io size ctr parameter. */ + unsigned recovery; /* Recovery allowed/prohibited. */ + unsigned recovery_stripes; /* # of parallel recovery stripes. */ + + /* recovery io throttling. */ + atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/ + unsigned long last_jiffies; + } recover; + + /* RAID set parameters. */ + struct { + struct raid_type *raid_type; /* RAID type (eg, RAID4). */ + unsigned raid_parms; /* # variable raid parameters. */ + + unsigned chunk_size; /* Sectors per chunk. */ + unsigned chunk_size_parm; + unsigned chunk_shift; /* rsector chunk size shift. */ + + unsigned io_size; /* Sectors per io. */ + unsigned io_size_parm; + unsigned io_mask; /* Mask for bio_copy_page_list(). */ + unsigned io_inv_mask; /* Mask for raid_address(). */ + + sector_t sectors_per_dev; /* Sectors per device. */ + + atomic_t failed_devs; /* Amount of devices failed. */ + + /* Index of device to initialize. */ + int dev_to_init; + int dev_to_init_parm; + + /* Raid devices dynamically allocated. */ + unsigned raid_devs; /* # of RAID devices below. */ + unsigned data_devs; /* # of RAID data devices. */ + + int ei; /* index of failed RAID device. */ + + /* Index of dedicated parity device (i.e. RAID4). */ + int pi; + int pi_parm; /* constructor parm for status output. */ + } set; + + /* REMOVEME: devel stats counters. */ + atomic_t stats[S_NR_STATS]; + + /* Dynamically allocated temporary pointers for xor(). */ + unsigned long **data; + + /* Dynamically allocated RAID devices. Alignment? */ + struct raid_dev dev[0]; +}; + +/* Define RAID set bit operations. */ +BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH) +BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE) +BITOPS(RS, Dead, raid_set, RS_DEAD) +BITOPS(RS, Degraded, raid_set, RS_DEGRADED) +BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS) +BITOPS(RS, Recover, raid_set, RS_RECOVER) +BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY) +BITOPS(RS, Suspend, raid_set, RS_SUSPEND) +#undef BITOPS + +/*----------------------------------------------------------------- + * Raid-4/5 set structures. + *---------------------------------------------------------------*/ +/* RAID level definitions. */ +enum raid_level { + raid4, + raid5, +}; + +/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */ +enum raid_algorithm { + none, + left_asym, + right_asym, + left_sym, + right_sym, +}; + +struct raid_type { + const char *name; /* RAID algorithm. */ + const char *descr; /* Descriptor text for logging. */ + const unsigned parity_devs; /* # of parity devices. */ + const unsigned minimal_devs; /* minimal # of devices in set. */ + const enum raid_level level; /* RAID level. */ + const enum raid_algorithm algorithm; /* RAID algorithm. */ +}; + +/* Supported raid types and properties. */ +static struct raid_type raid_types[] = { + {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none}, + {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym}, + {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym}, + {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym}, + {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym}, +}; + +/* Address as calculated by raid_address(). */ +struct raid_address { + sector_t key; /* Hash key (address of stripe % chunk_size). */ + unsigned di, pi; /* Data and parity disks index. */ +}; + +/* REMOVEME: reset statistics counters. */ +static void stats_reset(struct raid_set *rs) +{ + unsigned s = S_NR_STATS; + + while (s--) + atomic_set(rs->stats + s, 0); +} + +/*---------------------------------------------------------------- + * RAID set management routines. + *--------------------------------------------------------------*/ +/* + * Begin small helper functions. + */ +/* No need to be called from region hash indirectly at dm_rh_dec(). */ +static void wake_dummy(void *context) {} + +/* Return # of io reference. */ +static int io_ref(struct raid_set *rs) +{ + return atomic_read(&rs->io.in_process); +} + +/* Get an io reference. */ +static void io_get(struct raid_set *rs) +{ + int p = atomic_inc_return(&rs->io.in_process); + + if (p > atomic_read(&rs->io.in_process_max)) + atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */ +} + +/* Put the io reference and conditionally wake io waiters. */ +static void io_put(struct raid_set *rs) +{ + /* Intel: rebuild data corrupter? */ + if (atomic_dec_and_test(&rs->io.in_process)) + wake_up(&rs->io.suspendq); + else + BUG_ON(io_ref(rs) < 0); +} + +/* Wait until all io has been processed. */ +static void wait_ios(struct raid_set *rs) +{ + wait_event(rs->io.suspendq, !io_ref(rs)); +} + +/* Queue (optionally delayed) io work. */ +static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay) +{ + queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay); +} + +/* Queue io work immediately (called from region hash too). */ +static void wake_do_raid(void *context) +{ + struct raid_set *rs = context; + + queue_work(rs->io.wq, &rs->io.dws_do_raid.work); +} + +/* Calculate device sector offset. */ +static sector_t _sector(struct raid_set *rs, struct bio *bio) +{ + sector_t sector = bio->bi_sector; + + sector_div(sector, rs->set.data_devs); + return sector; +} + +/* Return # of active stripes in stripe cache. */ +static int sc_active(struct stripe_cache *sc) +{ + return atomic_read(&sc->active_stripes); +} + +/* Stripe cache busy indicator. */ +static int sc_busy(struct raid_set *rs) +{ + return sc_active(&rs->sc) > + atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2); +} + +/* Set chunks states. */ +enum chunk_dirty_type { CLEAN, DIRTY, ERROR }; +static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type) +{ + switch (type) { + case CLEAN: + ClearChunkDirty(chunk); + break; + case DIRTY: + SetChunkDirty(chunk); + break; + case ERROR: + SetChunkError(chunk); + SetStripeError(chunk->stripe); + return; + default: + BUG(); + } + + SetChunkUptodate(chunk); + SetChunkIo(chunk); + ClearChunkError(chunk); +} + +/* Return region state for a sector. */ +static int region_state(struct raid_set *rs, sector_t sector, + enum dm_rh_region_states state) +{ + struct dm_rh_client *rh = rs->recover.rh; + region_t region = dm_rh_sector_to_region(rh, sector); + + return !!(dm_rh_get_state(rh, region, 1) & state); +} + +/* + * Return true in case a chunk should be read/written + * + * Conditions to read/write: + * o chunk not uptodate + * o chunk dirty + * + * Conditios to avoid io: + * o io already ongoing on chunk + * o io explitely prohibited + */ +static int chunk_io(struct stripe_chunk *chunk) +{ + /* 2nd run optimization (flag set below on first run). */ + if (TestClearChunkMustIo(chunk)) + return 1; + + /* Avoid io if prohibited or a locked chunk. */ + if (!ChunkIo(chunk) || ChunkLocked(chunk)) + return 0; + + if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) { + SetChunkMustIo(chunk); /* 2nd run optimization. */ + return 1; + } + + return 0; +} + +/* Call a function on each chunk needing io unless device failed. */ +static unsigned for_each_io_dev(struct stripe *stripe, + void (*f_io)(struct stripe *stripe, unsigned p)) +{ + struct raid_set *rs = RS(stripe->sc); + unsigned p, r = 0; + + for (p = 0; p < rs->set.raid_devs; p++) { + if (chunk_io(CHUNK(stripe, p)) && !DevFailed(rs->dev + p)) { + f_io(stripe, p); + r++; + } + } + + return r; +} + +/* + * Index of device to calculate parity on. + * + * Either the parity device index *or* the selected + * device to init after a spare replacement. + */ +static int dev_for_parity(struct stripe *stripe, int *sync) +{ + struct raid_set *rs = RS(stripe->sc); + int r = region_state(rs, stripe->key, DM_RH_NOSYNC | DM_RH_RECOVERING); + + *sync = !r; + + /* Reconstruct a particular device ?. */ + if (r && rs->set.dev_to_init > -1) + return rs->set.dev_to_init; + else if (rs->set.raid_type->level == raid4) + return rs->set.pi; + else if (!StripeRecover(stripe)) + return stripe->idx.parity; + else + return -1; +} + +/* RAID set congested function. */ +static int rs_congested(void *congested_data, int bdi_bits) +{ + int r; + unsigned p; + struct raid_set *rs = congested_data; + + if (sc_busy(rs) || RSSuspend(rs)) + r = 1; + else for (r = 0, p = rs->set.raid_devs; !r && p--; ) { + /* If any of our component devices are overloaded. */ + struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev); + + r |= bdi_congested(&q->backing_dev_info, bdi_bits); + } + + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED)); + return r; +} + +/* RAID device degrade check. */ +static void rs_check_degrade_dev(struct raid_set *rs, + struct stripe *stripe, unsigned p) +{ + if (TestSetDevFailed(rs->dev + p)) + return; + + /* Through an event in case of member device errors. */ + if (atomic_inc_return(&rs->set.failed_devs) > + rs->set.raid_type->parity_devs && + !TestSetRSDead(rs)) { + /* Display RAID set dead message once. */ + unsigned p; + char buf[BDEVNAME_SIZE]; + + DMERR("FATAL: too many devices failed -> RAID set broken"); + for (p = 0; p < rs->set.raid_devs; p++) { + if (DevFailed(rs->dev + p)) + DMERR("device /dev/%s failed", + bdevname(rs->dev[p].dev->bdev, buf)); + } + } + + /* Only log the first member error. */ + if (!TestSetRSDegraded(rs)) { + char buf[BDEVNAME_SIZE]; + + /* Store index for recovery. */ + rs->set.ei = p; + DMERR("CRITICAL: %sio error on device /dev/%s " + "in region=%llu; DEGRADING RAID set\n", + stripe ? "" : "FAKED ", + bdevname(rs->dev[p].dev->bdev, buf), + (unsigned long long) (stripe ? stripe->key : 0)); + DMERR("further device error messages suppressed"); + } + + schedule_work(&rs->io.ws_do_table_event); +} + +/* RAID set degrade check. */ +static void rs_check_degrade(struct stripe *stripe) +{ + struct raid_set *rs = RS(stripe->sc); + unsigned p = rs->set.raid_devs; + + while (p--) { + if (ChunkError(CHUNK(stripe, p))) + rs_check_degrade_dev(rs, stripe, p); + } +} + +/* Lookup a RAID device by name or by major:minor number. */ +static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup) +{ + unsigned p; + struct raid_dev *dev; + + /* + * Must be an incremental loop, because the device array + * can have empty slots still on calls from raid_ctr() + */ + for (dev = rs->dev, p = 0; + dev->dev && p < rs->set.raid_devs; + dev++, p++) { + if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev) + return p; + } + + return -ENODEV; +} +/* + * End small helper functions. + */ + +/* + * Stripe hash functions + */ +/* Initialize/destroy stripe hash. */ +static int hash_init(struct stripe_hash *hash, unsigned stripes) +{ + unsigned buckets = 2, max_buckets = stripes >> 1; + static unsigned hash_primes[] = { + /* Table of primes for hash_fn/table size optimization. */ + 1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769, + 1543, 3079, 6151, 12289, 24593, 49157, 98317, + }; + + /* Calculate number of buckets (2^^n <= stripes / 2). */ + while (buckets < max_buckets) + buckets <<= 1; + + /* Allocate stripe hash buckets. */ + hash->hash = vmalloc(buckets * sizeof(*hash->hash)); + if (!hash->hash) + return -ENOMEM; + + hash->buckets = buckets; + hash->mask = buckets - 1; + hash->shift = ffs(buckets); + if (hash->shift > ARRAY_SIZE(hash_primes)) + hash->shift = ARRAY_SIZE(hash_primes) - 1; + + BUG_ON(hash->shift < 2); + hash->prime = hash_primes[hash->shift]; + + /* Initialize buckets. */ + while (buckets--) + INIT_LIST_HEAD(hash->hash + buckets); + return 0; +} + +static void hash_exit(struct stripe_hash *hash) +{ + if (hash->hash) { + vfree(hash->hash); + hash->hash = NULL; + } +} + +static unsigned hash_fn(struct stripe_hash *hash, sector_t key) +{ + return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask); +} + +static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key) +{ + return hash->hash + hash_fn(hash, key); +} + +/* Insert an entry into a hash. */ +static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe) +{ + list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key)); +} + +/* Lookup an entry in the stripe hash. */ +static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key) +{ + unsigned look = 0; + struct stripe *stripe; + struct list_head *bucket = hash_bucket(&sc->hash, key); + + list_for_each_entry(stripe, bucket, lists[LIST_HASH]) { + look++; + + if (stripe->key == key) { + /* REMOVEME: statisics. */ + if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP)) + atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look); + return stripe; + } + } + + return NULL; +} + +/* Resize the stripe cache hash on size changes. */ +static int sc_hash_resize(struct stripe_cache *sc) +{ + /* Resize indicated ? */ + if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) { + int r; + struct stripe_hash hash; + + r = hash_init(&hash, atomic_read(&sc->stripes)); + if (r) + return r; + + if (sc->hash.hash) { + unsigned b = sc->hash.buckets; + struct list_head *pos, *tmp; + + /* Walk old buckets and insert into new. */ + while (b--) { + list_for_each_safe(pos, tmp, sc->hash.hash + b) + stripe_insert(&hash, + list_entry(pos, struct stripe, + lists[LIST_HASH])); + } + + } + + hash_exit(&sc->hash); + memcpy(&sc->hash, &hash, sizeof(sc->hash)); + atomic_set(&sc->stripes_last, atomic_read(&sc->stripes)); + } + + return 0; +} +/* End hash stripe hash function. */ + +/* List add, delete, push and pop functions. */ +/* Add stripe to flush list. */ +#define DEL_LIST(lh) \ + if (!list_empty(lh)) \ + list_del_init(lh); + +/* Delete stripe from hash. */ +static void stripe_hash_del(struct stripe *stripe) +{ + DEL_LIST(stripe->lists + LIST_HASH); +} + +/* Return stripe reference count. */ +static inline int stripe_ref(struct stripe *stripe) +{ + return atomic_read(&stripe->cnt); +} + +static void stripe_flush_add(struct stripe *stripe) +{ + struct stripe_cache *sc = stripe->sc; + struct list_head *lh = stripe->lists + LIST_FLUSH; + + if (!StripeReconstruct(stripe) && list_empty(lh)) + list_add_tail(lh, sc->lists + LIST_FLUSH); +} + +/* + * Add stripe to LRU (inactive) list. + * + * Need lock, because of concurrent access from message interface. + */ +static void stripe_lru_add(struct stripe *stripe) +{ + if (!StripeRecover(stripe)) { + unsigned long flags; + struct list_head *lh = stripe->lists + LIST_LRU; + spinlock_t *lock = stripe->sc->locks + LOCK_LRU; + + spin_lock_irqsave(lock, flags); + if (list_empty(lh)) + list_add_tail(lh, stripe->sc->lists + LIST_LRU); + spin_unlock_irqrestore(lock, flags); + } +} + +#define POP_LIST(list) \ + do { \ + if (list_empty(sc->lists + (list))) \ + stripe = NULL; \ + else { \ + stripe = list_first_entry(sc->lists + (list), \ + struct stripe, \ + lists[(list)]); \ + list_del_init(stripe->lists + (list)); \ + } \ + } while (0); + +/* Pop an available stripe off the LRU list. */ +static struct stripe *stripe_lru_pop(struct stripe_cache *sc) +{ + struct stripe *stripe; + spinlock_t *lock = sc->locks + LOCK_LRU; + + spin_lock_irq(lock); + POP_LIST(LIST_LRU); + spin_unlock_irq(lock); + + return stripe; +} + +/* Pop an available stripe off the io list. */ +static struct stripe *stripe_io_pop(struct stripe_cache *sc) +{ + struct stripe *stripe; + + POP_LIST(LIST_FLUSH); + return stripe; +} + +/* Push a stripe safely onto the endio list to be handled by do_endios(). */ +static void stripe_endio_push(struct stripe *stripe) +{ + unsigned long flags; + struct stripe_cache *sc = stripe->sc; + struct list_head *stripe_list = stripe->lists + LIST_ENDIO, + *sc_list = sc->lists + LIST_ENDIO; + spinlock_t *lock = sc->locks + LOCK_ENDIO; + + /* This runs in parallel with do_endios(). */ + spin_lock_irqsave(lock, flags); + if (list_empty(stripe_list)) + list_add_tail(stripe_list, sc_list); + spin_unlock_irqrestore(lock, flags); + + wake_do_raid(RS(sc)); /* Wake myself. */ +} + +/* Pop a stripe off safely off the endio list. */ +static struct stripe *stripe_endio_pop(struct stripe_cache *sc) +{ + struct stripe *stripe; + spinlock_t *lock = sc->locks + LOCK_ENDIO; + + /* This runs in parallel with endio(). */ + spin_lock_irq(lock); + POP_LIST(LIST_ENDIO) + spin_unlock_irq(lock); + return stripe; +} +#undef POP_LIST + +/* + * Stripe cache locking functions + */ +/* Dummy lock function for single host RAID4+5. */ +static void *no_lock(sector_t key, enum dm_lock_type type) +{ + return &no_lock; +} + +/* Dummy unlock function for single host RAID4+5. */ +static void no_unlock(void *lock_handle) +{ +} + +/* No locking (for single host RAID 4+5). */ +static struct dm_raid45_locking_type locking_none = { + .lock = no_lock, + .unlock = no_unlock, +}; + +/* Lock a stripe (for clustering). */ +static int +stripe_lock(struct stripe *stripe, int rw, sector_t key) +{ + stripe->lock = RS(stripe->sc)->locking->lock(key, rw == READ ? DM_RAID45_SHARED : DM_RAID45_EX); + return stripe->lock ? 0 : -EPERM; +} + +/* Unlock a stripe (for clustering). */ +static void stripe_unlock(struct stripe *stripe) +{ + RS(stripe->sc)->locking->unlock(stripe->lock); + stripe->lock = NULL; +} + +/* Test io pending on stripe. */ +static int stripe_io_ref(struct stripe *stripe) +{ + return atomic_read(&stripe->io.pending); +} + +static void stripe_io_get(struct stripe *stripe) +{ + if (atomic_inc_return(&stripe->io.pending) == 1) + /* REMOVEME: statistics */ + atomic_inc(&stripe->sc->active_stripes); + else + BUG_ON(stripe_io_ref(stripe) < 0); +} + +static void stripe_io_put(struct stripe *stripe) +{ + if (atomic_dec_and_test(&stripe->io.pending)) { + if (unlikely(StripeRecover(stripe))) + /* Don't put recovery stripe on endio list. */ + wake_do_raid(RS(stripe->sc)); + else + /* Add regular stripe to endio list and wake daemon. */ + stripe_endio_push(stripe); + + /* REMOVEME: statistics */ + atomic_dec(&stripe->sc->active_stripes); + } else + BUG_ON(stripe_io_ref(stripe) < 0); +} + +/* Take stripe reference out. */ +static int stripe_get(struct stripe *stripe) +{ + int r; + struct list_head *lh = stripe->lists + LIST_LRU; + spinlock_t *lock = stripe->sc->locks + LOCK_LRU; + + /* Delete stripe from LRU (inactive) list if on. */ + spin_lock_irq(lock); + DEL_LIST(lh); + spin_unlock_irq(lock); + + BUG_ON(stripe_ref(stripe) < 0); + + /* Lock stripe on first reference */ + r = (atomic_inc_return(&stripe->cnt) == 1) ? + stripe_lock(stripe, WRITE, stripe->key) : 0; + + return r; +} +#undef DEL_LIST + +/* Return references on a chunk. */ +static int chunk_ref(struct stripe_chunk *chunk) +{ + return atomic_read(&chunk->cnt); +} + +/* Take out reference on a chunk. */ +static int chunk_get(struct stripe_chunk *chunk) +{ + return atomic_inc_return(&chunk->cnt); +} + +/* Drop reference on a chunk. */ +static void chunk_put(struct stripe_chunk *chunk) +{ + BUG_ON(atomic_dec_return(&chunk->cnt) < 0); +} + +/* + * Drop reference on a stripe. + * + * Move it to list of LRU stripes if zero. + */ +static void stripe_put(struct stripe *stripe) +{ + if (atomic_dec_and_test(&stripe->cnt)) { + BUG_ON(stripe_io_ref(stripe)); + stripe_unlock(stripe); + } else + BUG_ON(stripe_ref(stripe) < 0); +} + +/* Helper needed by for_each_io_dev(). */ +static void stripe_get_references(struct stripe *stripe, unsigned p) +{ + + /* + * Another one to reference the stripe in + * order to protect vs. LRU list moves. + */ + io_get(RS(stripe->sc)); /* Global io references. */ + stripe_get(stripe); + stripe_io_get(stripe); /* One for each chunk io. */ +} + +/* Helper for endio() to put all take references. */ +static void stripe_put_references(struct stripe *stripe) +{ + stripe_io_put(stripe); /* One for each chunk io. */ + stripe_put(stripe); + io_put(RS(stripe->sc)); +} + +/* + * Stripe cache functions. + */ +/* + * Invalidate all chunks (i.e. their pages) of a stripe. + * + * I only keep state for the whole chunk. + */ +static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk) +{ + chunk->io.flags = 0; +} + +static void +stripe_chunks_invalidate(struct stripe *stripe) +{ + unsigned p = RS(stripe->sc)->set.raid_devs; + + while (p--) + stripe_chunk_invalidate(CHUNK(stripe, p)); +} + +/* Prepare stripe for (re)use. */ +static void stripe_invalidate(struct stripe *stripe) +{ + stripe->io.flags = 0; + stripe->idx.parity = stripe->idx.recover = -1; + stripe_chunks_invalidate(stripe); +} + +/* + * Allow io on all chunks of a stripe. + * If not set, IO will not occur; i.e. it's prohibited. + * + * Actual IO submission for allowed chunks depends + * on their !uptodate or dirty state. + */ +static void stripe_allow_io(struct stripe *stripe) +{ + unsigned p = RS(stripe->sc)->set.raid_devs; + + while (p--) + SetChunkIo(CHUNK(stripe, p)); +} + +/* Initialize a stripe. */ +static void stripe_init(struct stripe_cache *sc, struct stripe *stripe) +{ + unsigned i, p = RS(sc)->set.raid_devs; + + /* Work all io chunks. */ + while (p--) { + struct stripe_chunk *chunk = CHUNK(stripe, p); + + atomic_set(&chunk->cnt, 0); + chunk->stripe = stripe; + i = ARRAY_SIZE(chunk->bl); + while (i--) + bio_list_init(chunk->bl + i); + } + + stripe->sc = sc; + + + i = ARRAY_SIZE(stripe->lists); + while (i--) + INIT_LIST_HEAD(stripe->lists + i); + + stripe->io.size = RS(sc)->set.io_size; + atomic_set(&stripe->cnt, 0); + atomic_set(&stripe->io.pending, 0); + stripe_invalidate(stripe); +} + +/* Number of pages per chunk. */ +static inline unsigned chunk_pages(unsigned sectors) +{ + return dm_div_up(sectors, SECTORS_PER_PAGE); +} + +/* Number of pages per stripe. */ +static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size) +{ + return chunk_pages(io_size) * rs->set.raid_devs; +} + +/* Initialize part of page_list (recovery). */ +static void stripe_zero_pl_part(struct stripe *stripe, int p, + unsigned start, unsigned count) +{ + unsigned o = start / SECTORS_PER_PAGE, pages = chunk_pages(count); + /* Get offset into the page_list. */ + struct page_list *pl = pl_elem(PL(stripe, p), o); + + BUG_ON(!pl); + while (pl && pages--) { + BUG_ON(!pl->page); + memset(page_address(pl->page), 0, PAGE_SIZE); + pl = pl->next; + } +} + +/* Initialize parity chunk of stripe. */ +static void stripe_zero_chunk(struct stripe *stripe, int p) +{ + if (p > -1) + stripe_zero_pl_part(stripe, p, 0, stripe->io.size); +} + +/* Return dynamic stripe structure size. */ +static size_t stripe_size(struct raid_set *rs) +{ + return sizeof(struct stripe) + + rs->set.raid_devs * sizeof(struct stripe_chunk); +} + +/* Allocate a stripe and its memory object. */ +/* XXX adjust to cope with stripe cache and recovery stripe caches. */ +enum grow { SC_GROW, SC_KEEP }; +static struct stripe *stripe_alloc(struct stripe_cache *sc, + struct dm_mem_cache_client *mc, + enum grow grow) +{ + int r; + struct stripe *stripe; + + stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL); + if (stripe) { + /* Grow the dm-mem-cache by one object. */ + if (grow == SC_GROW) { + r = dm_mem_cache_grow(mc, 1); + if (r) + goto err_free; + } + + stripe->obj = dm_mem_cache_alloc(mc); + if (!stripe->obj) + goto err_shrink; + + stripe_init(sc, stripe); + } + + return stripe; + +err_shrink: + if (grow == SC_GROW) + dm_mem_cache_shrink(mc, 1); +err_free: + kmem_cache_free(sc->kc.cache, stripe); + return NULL; +} + +/* + * Free a stripes memory object, shrink the + * memory cache and free the stripe itself. + */ +static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc) +{ + dm_mem_cache_free(mc, stripe->obj); + dm_mem_cache_shrink(mc, 1); + kmem_cache_free(stripe->sc->kc.cache, stripe); +} + +/* Free the recovery stripe. */ +static void stripe_recover_free(struct raid_set *rs) +{ + struct recover *rec = &rs->recover; + struct dm_mem_cache_client *mc; + + mc = rec->mem_cache_client; + rec->mem_cache_client = NULL; + if (mc) { + struct stripe *stripe; + + while (!list_empty(&rec->stripes)) { + stripe = list_first_entry(&rec->stripes, struct stripe, + lists[LIST_RECOVER]); + list_del(stripe->lists + LIST_RECOVER); + kfree(stripe->recover); + stripe_free(stripe, mc); + } + + dm_mem_cache_client_destroy(mc); + dm_io_client_destroy(rec->dm_io_client); + rec->dm_io_client = NULL; + } +} + +/* Grow stripe cache. */ +static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow) +{ + int r = 0; + + /* Try to allocate this many (additional) stripes. */ + while (stripes--) { + struct stripe *stripe = + stripe_alloc(sc, sc->mem_cache_client, grow); + + if (likely(stripe)) { + stripe_lru_add(stripe); + atomic_inc(&sc->stripes); + } else { + r = -ENOMEM; + break; + } + } + + return r ? r : sc_hash_resize(sc); +} + +/* Shrink stripe cache. */ +static int sc_shrink(struct stripe_cache *sc, unsigned stripes) +{ + int r = 0; + + /* Try to get unused stripe from LRU list. */ + while (stripes--) { + struct stripe *stripe; + + stripe = stripe_lru_pop(sc); + if (stripe) { + /* An LRU stripe may never have ios pending! */ + BUG_ON(stripe_io_ref(stripe)); + BUG_ON(stripe_ref(stripe)); + atomic_dec(&sc->stripes); + /* Remove from hash if on before deletion. */ + stripe_hash_del(stripe); + stripe_free(stripe, sc->mem_cache_client); + } else { + r = -ENOENT; + break; + } + } + + /* Check if stats are still sane. */ + if (atomic_read(&sc->active_stripes_max) > + atomic_read(&sc->stripes)) + atomic_set(&sc->active_stripes_max, 0); + + if (r) + return r; + + return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0; +} + +/* Create stripe cache and recovery. */ +static int sc_init(struct raid_set *rs, unsigned stripes) +{ + unsigned i, r, rstripes; + struct stripe_cache *sc = &rs->sc; + struct stripe *stripe; + struct recover *rec = &rs->recover; + struct mapped_device *md; + struct gendisk *disk; + + /* Initialize lists and locks. */ + i = ARRAY_SIZE(sc->lists); + while (i--) + INIT_LIST_HEAD(sc->lists + i); + + INIT_LIST_HEAD(&rec->stripes); + + /* Initialize endio and LRU list locks. */ + i = NR_LOCKS; + while (i--) + spin_lock_init(sc->locks + i); + + /* Initialize atomic variables. */ + atomic_set(&sc->stripes, 0); + atomic_set(&sc->stripes_to_set, 0); + atomic_set(&sc->active_stripes, 0); + atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */ + + /* + * We need a runtime unique # to suffix the kmem cache name + * because we'll have one for each active RAID set. + */ + md = dm_table_get_md(rs->ti->table); + disk = dm_disk(md); + sprintf(sc->kc.name, "%s-%d", TARGET, disk->first_minor); + dm_put(md); + sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs), + 0, 0, NULL); + if (!sc->kc.cache) + return -ENOMEM; + + /* Create memory cache client context for RAID stripe cache. */ + sc->mem_cache_client = + dm_mem_cache_client_create(stripes, rs->set.raid_devs, + chunk_pages(rs->set.io_size)); + if (IS_ERR(sc->mem_cache_client)) + return PTR_ERR(sc->mem_cache_client); + + /* Create memory cache client context for RAID recovery stripe(s). */ + rstripes = rec->recovery_stripes; + rec->mem_cache_client = + dm_mem_cache_client_create(rstripes, rs->set.raid_devs, + chunk_pages(rec->io_size)); + if (IS_ERR(rec->mem_cache_client)) + return PTR_ERR(rec->mem_cache_client); + + /* Create dm-io client context for IO stripes. */ + sc->dm_io_client = + dm_io_client_create((stripes > 32 ? 32 : stripes) * + rs->set.raid_devs * + chunk_pages(rs->set.io_size)); + if (IS_ERR(sc->dm_io_client)) + return PTR_ERR(sc->dm_io_client); + + /* FIXME: intermingeled with stripe cache initialization. */ + /* Create dm-io client context for recovery stripes. */ + rec->dm_io_client = + dm_io_client_create(rstripes * rs->set.raid_devs * + chunk_pages(rec->io_size)); + if (IS_ERR(rec->dm_io_client)) + return PTR_ERR(rec->dm_io_client); + + /* Allocate stripes for set recovery. */ + while (rstripes--) { + stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP); + if (!stripe) + return -ENOMEM; + + stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL); + if (!stripe->recover) { + stripe_free(stripe, rec->mem_cache_client); + return -ENOMEM; + } + + SetStripeRecover(stripe); + stripe->io.size = rec->io_size; + list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes); + /* Don't add recovery stripes to LRU list! */ + } + + /* + * Allocate the stripe objetcs from the + * cache and add them to the LRU list. + */ + r = sc_grow(sc, stripes, SC_KEEP); + if (!r) + atomic_set(&sc->stripes_last, stripes); + + return r; +} + +/* Destroy the stripe cache. */ +static void sc_exit(struct stripe_cache *sc) +{ + struct raid_set *rs = RS(sc); + + if (sc->kc.cache) { + stripe_recover_free(rs); + BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes))); + kmem_cache_destroy(sc->kc.cache); + sc->kc.cache = NULL; + + if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client)) + dm_mem_cache_client_destroy(sc->mem_cache_client); + + if (sc->dm_io_client && !IS_ERR(sc->dm_io_client)) + dm_io_client_destroy(sc->dm_io_client); + + hash_exit(&sc->hash); + } +} + +/* + * Calculate RAID address + * + * Delivers tuple with the index of the data disk holding the chunk + * in the set, the parity disks index and the start of the stripe + * within the address space of the set (used as the stripe cache hash key). + */ +/* thx MD. */ +static struct raid_address *raid_address(struct raid_set *rs, sector_t sector, + struct raid_address *addr) +{ + sector_t stripe, tmp; + + /* + * chunk_number = sector / chunk_size + * stripe_number = chunk_number / data_devs + * di = stripe % data_devs; + */ + stripe = sector >> rs->set.chunk_shift; + addr->di = sector_div(stripe, rs->set.data_devs); + + switch (rs->set.raid_type->level) { + case raid4: + addr->pi = rs->set.pi; + goto check_shift_di; + case raid5: + tmp = stripe; + addr->pi = sector_div(tmp, rs->set.raid_devs); + + switch (rs->set.raid_type->algorithm) { + case left_asym: /* Left asymmetric. */ + addr->pi = rs->set.data_devs - addr->pi; + case right_asym: /* Right asymmetric. */ +check_shift_di: + if (addr->di >= addr->pi) + addr->di++; + break; + case left_sym: /* Left symmetric. */ + addr->pi = rs->set.data_devs - addr->pi; + case right_sym: /* Right symmetric. */ + addr->di = (addr->pi + addr->di + 1) % + rs->set.raid_devs; + break; + case none: /* Ain't happen: RAID4 algorithm placeholder. */ + BUG(); + } + } + + /* + * Start offset of the stripes chunk on any single device of the RAID + * set, adjusted in case io size differs from chunk size. + */ + addr->key = (stripe << rs->set.chunk_shift) + + (sector & rs->set.io_inv_mask); + return addr; +} + +/* + * Copy data across between stripe pages and bio vectors. + * + * Pay attention to data alignment in stripe and bio pages. + */ +static void bio_copy_page_list(int rw, struct stripe *stripe, + struct page_list *pl, struct bio *bio) +{ + unsigned i, page_offset; + void *page_addr; + struct raid_set *rs = RS(stripe->sc); + struct bio_vec *bv; + + /* Get start page in page list for this sector. */ + i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE; + pl = pl_elem(pl, i); + BUG_ON(!pl); + BUG_ON(!pl->page); + + page_addr = page_address(pl->page); + page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1)); + + /* Walk all segments and copy data across between bio_vecs and pages. */ + bio_for_each_segment(bv, bio, i) { + int len = bv->bv_len, size; + unsigned bio_offset = 0; + void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0); +redo: + size = (page_offset + len > PAGE_SIZE) ? + PAGE_SIZE - page_offset : len; + + if (rw == READ) + memcpy(bio_addr + bio_offset, + page_addr + page_offset, size); + else + memcpy(page_addr + page_offset, + bio_addr + bio_offset, size); + + page_offset += size; + if (page_offset == PAGE_SIZE) { + /* + * We reached the end of the chunk page -> + * need to refer to the next one to copy more data. + */ + len -= size; + if (len) { + /* Get next page. */ + pl = pl->next; + BUG_ON(!pl); + BUG_ON(!pl->page); + page_addr = page_address(pl->page); + page_offset = 0; + bio_offset += size; + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT); + goto redo; + } + } + + __bio_kunmap_atomic(bio_addr, KM_USER0); + } +} + +/* + * Xor optimization macros. + */ +/* Xor data pointer declaration and initialization macros. */ +#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1] +#define DECLARE_3 DECLARE_2, *d2 = data[2] +#define DECLARE_4 DECLARE_3, *d3 = data[3] +#define DECLARE_5 DECLARE_4, *d4 = data[4] +#define DECLARE_6 DECLARE_5, *d5 = data[5] +#define DECLARE_7 DECLARE_6, *d6 = data[6] +#define DECLARE_8 DECLARE_7, *d7 = data[7] + +/* Xor unrole macros. */ +#define D2(n) d0[n] = d0[n] ^ d1[n] +#define D3(n) D2(n) ^ d2[n] +#define D4(n) D3(n) ^ d3[n] +#define D5(n) D4(n) ^ d4[n] +#define D6(n) D5(n) ^ d5[n] +#define D7(n) D6(n) ^ d6[n] +#define D8(n) D7(n) ^ d7[n] + +#define X_2(macro, offset) macro(offset); macro(offset + 1); +#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2); +#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4); +#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8); +#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16); +#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32); + +/* Define a _xor_#chunks_#xors_per_run() function. */ +#define _XOR(chunks, xors_per_run) \ +static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \ +{ \ + unsigned end = XOR_SIZE / sizeof(data[0]), i; \ + DECLARE_ ## chunks; \ +\ + for (i = 0; i < end; i += xors_per_run) { \ + X_ ## xors_per_run(D ## chunks, i); \ + } \ +} + +/* Define xor functions for 2 - 8 chunks and xors per run. */ +#define MAKE_XOR_PER_RUN(xors_per_run) \ + _XOR(2, xors_per_run); _XOR(3, xors_per_run); \ + _XOR(4, xors_per_run); _XOR(5, xors_per_run); \ + _XOR(6, xors_per_run); _XOR(7, xors_per_run); \ + _XOR(8, xors_per_run); + +MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */ +MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */ +MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */ +MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */ + +#define MAKE_XOR(xors_per_run) \ +struct { \ + void (*f)(unsigned long **); \ +} static xor_funcs ## xors_per_run[] = { \ + { NULL }, /* NULL pointers to optimize indexing in xor(). */ \ + { NULL }, \ + { _xor2_ ## xors_per_run }, \ + { _xor3_ ## xors_per_run }, \ + { _xor4_ ## xors_per_run }, \ + { _xor5_ ## xors_per_run }, \ + { _xor6_ ## xors_per_run }, \ + { _xor7_ ## xors_per_run }, \ + { _xor8_ ## xors_per_run }, \ +}; \ +\ +static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \ +{ \ + /* Call respective function for amount of chunks. */ \ + xor_funcs ## xors_per_run[n].f(data); \ +} + +/* Define xor_8() - xor_64 functions. */ +MAKE_XOR(8) +MAKE_XOR(16) +MAKE_XOR(32) +MAKE_XOR(64) + +/* Maximum number of chunks, which can be xor'ed in one go. */ +#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1) + +static void xor_blocks_wrapper(unsigned n, unsigned long **data) +{ + BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1); + xor_blocks(n - 1, XOR_SIZE, (void *) data[0], (void **) data + 1); +} + +struct xor_func { + xor_function_t f; + const char *name; +} static xor_funcs[] = { + { xor_8, "xor_8" }, + { xor_16, "xor_16" }, + { xor_32, "xor_32" }, + { xor_64, "xor_64" }, + { xor_blocks_wrapper, "xor_blocks" }, +}; + +/* + * Check, if chunk has to be xored in/out: + * + * o if writes are queued + * o if writes are merged + * o if stripe is to be reconstructed + * o if recovery stripe + */ +static inline int chunk_must_xor(struct stripe_chunk *chunk) +{ + if (ChunkUptodate(chunk)) { + BUG_ON(!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) && + !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))); + + if (!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) || + !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) + return 1; + + if (StripeReconstruct(chunk->stripe) || + StripeRecover(chunk->stripe)) + return 1; + } + + return 0; +} + +/* + * Calculate crc. + * + * This indexes into the chunks of a stripe and their pages. + * + * All chunks will be xored into the indexed (@pi) + * chunk in maximum groups of xor.chunks. + * + */ +static void xor(struct stripe *stripe, unsigned pi, unsigned sector) +{ + struct raid_set *rs = RS(stripe->sc); + unsigned max_chunks = rs->xor.chunks, n = 1, + o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */ + p = rs->set.raid_devs; + unsigned long **d = rs->data; + xor_function_t xor_f = rs->xor.f->f; + + BUG_ON(sector > stripe->io.size); + + /* Address of parity page to xor into. */ + d[0] = page_address(pl_elem(PL(stripe, pi), o)->page); + + while (p--) { + /* Preset pointers to data pages. */ + if (p != pi && chunk_must_xor(CHUNK(stripe, p))) + d[n++] = page_address(pl_elem(PL(stripe, p), o)->page); + + /* If max chunks -> xor. */ + if (n == max_chunks) { + xor_f(n, d); + n = 1; + } + } + + /* If chunks -> xor. */ + if (n > 1) + xor_f(n, d); +} + +/* Common xor loop through all stripe page lists. */ +static void common_xor(struct stripe *stripe, sector_t count, + unsigned off, unsigned pi) +{ + unsigned sector; + + BUG_ON(!count); + for (sector = off; sector < count; sector += SECTORS_PER_PAGE) + xor(stripe, pi, sector); + + /* Set parity page uptodate and clean. */ + chunk_set(CHUNK(stripe, pi), CLEAN); + atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */ +} + +/* + * Calculate parity sectors on intact stripes. + * + * Need to calculate raid address for recover stripe, because its + * chunk sizes differs and is typically larger than io chunk size. + */ +static void parity_xor(struct stripe *stripe) +{ + struct raid_set *rs = RS(stripe->sc); + unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size, + xor_size = chunk_size > io_size ? io_size : chunk_size; + sector_t off; + + /* This can be the recover stripe with a larger io size. */ + for (off = 0; off < io_size; off += xor_size) { + /* + * Recover stripe is likely bigger than regular io + * ones and has no precalculated parity disk index -> + * need to calculate RAID address. + */ + if (unlikely(StripeRecover(stripe))) { + struct raid_address addr; + + raid_address(rs, + (stripe->key + off) * rs->set.data_devs, + &addr); + stripe->idx.parity = addr.pi; + stripe_zero_pl_part(stripe, addr.pi, off, xor_size); + } + + common_xor(stripe, xor_size, off, stripe->idx.parity); + chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY); + } +} + +/* Reconstruct missing chunk. */ +static void stripe_reconstruct(struct stripe *stripe) +{ + struct raid_set *rs = RS(stripe->sc); + int p = rs->set.raid_devs, pr = stripe->idx.recover; + + BUG_ON(pr < 0); + + /* Check if all but the chunk to be reconstructed are uptodate. */ + while (p--) + BUG_ON(p != pr && !ChunkUptodate(CHUNK(stripe, p))); + + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + (RSDegraded(rs) ? S_RECONSTRUCT_EI : + S_RECONSTRUCT_DEV)); + /* Zero chunk to be reconstructed. */ + stripe_zero_chunk(stripe, pr); + common_xor(stripe, stripe->io.size, 0, pr); + stripe->idx.recover = -1; +} + +/* + * Recovery io throttling + */ +/* Conditionally reset io counters. */ +static int recover_io_reset(struct raid_set *rs) +{ + unsigned long j = jiffies; + + /* Pay attention to jiffies overflows. */ + if (j > rs->recover.last_jiffies + HZ / 20 || + j < rs->recover.last_jiffies) { + atomic_set(rs->recover.io_count + IO_WORK, 0); + atomic_set(rs->recover.io_count + IO_RECOVER, 0); + rs->recover.last_jiffies = j; + return 1; + } + + return 0; +} + +/* Count ios. */ +static void recover_io_count(struct stripe *stripe) +{ + struct raid_set *rs = RS(stripe->sc); + + recover_io_reset(rs); + atomic_inc(rs->recover.io_count + + (StripeRecover(stripe) ? IO_RECOVER : IO_WORK)); +} + +/* Try getting a stripe either from the hash or from the LRU list. */ +static struct stripe *stripe_find(struct raid_set *rs, + struct raid_address *addr) +{ + int r; + struct stripe_cache *sc = &rs->sc; + struct stripe *stripe; + + /* Try stripe from hash. */ + stripe = stripe_lookup(sc, addr->key); + if (stripe) { + r = stripe_get(stripe); + if (r) + goto get_lock_failed; + + atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */ + } else { + /* Not in hash -> try to get an LRU stripe. */ + stripe = stripe_lru_pop(sc); + if (stripe) { + /* + * An LRU stripe may not be referenced + * and may never have ios pending! + */ + BUG_ON(stripe_ref(stripe)); + BUG_ON(stripe_io_ref(stripe)); + + /* Remove from hash if on before reuse. */ + stripe_hash_del(stripe); + + /* Invalidate before reinserting with changed key. */ + stripe_invalidate(stripe); + + stripe->key = addr->key; + stripe->region = dm_rh_sector_to_region(rs->recover.rh, + addr->key); + stripe->idx.parity = addr->pi; + r = stripe_get(stripe); + if (r) + goto get_lock_failed; + + /* Insert stripe into the stripe hash. */ + stripe_insert(&sc->hash, stripe); + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_INSCACHE); + } + } + + return stripe; + +get_lock_failed: + stripe_put(stripe); + return NULL; +} + +/* + * Process end io + * + * I need to do it here because I can't in interrupt + */ +/* End io all bios on a bio list. */ +static void bio_list_endio(struct stripe *stripe, struct bio_list *bl, + int p, int error) +{ + struct raid_set *rs = RS(stripe->sc); + struct bio *bio; + struct page_list *pl = PL(stripe, p); + struct stripe_chunk *chunk = CHUNK(stripe, p); + + /* Update region counters. */ + while ((bio = bio_list_pop(bl))) { + if (bio_data_dir(bio) == WRITE) + /* Drop io pending count for any writes. */ + dm_rh_dec(rs->recover.rh, stripe->region); + else if (!error) + /* Copy data accross. */ + bio_copy_page_list(READ, stripe, pl, bio); + + bio_endio(bio, error); + + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + (bio_data_dir(bio) == READ ? + S_BIOS_ENDIO_READ : S_BIOS_ENDIO_WRITE)); + + chunk_put(chunk); + stripe_put(stripe); + io_put(rs); /* Wake any suspend waiters on last bio. */ + } +} + +/* + * End io all reads/writes on a stripe copying + * read data accross from stripe to bios and + * decrementing region counters for writes. + * + * Processing of ios depeding on state: + * o no chunk error -> endio ok + * o degraded: + * - chunk error and read -> ignore to be requeued + * - chunk error and write -> endio ok + * o dead (more than parity_devs failed) and chunk_error-> endio failed + */ +static void stripe_endio(int rw, struct stripe *stripe) +{ + struct raid_set *rs = RS(stripe->sc); + unsigned p = rs->set.raid_devs; + int write = (rw != READ); + + while (p--) { + struct stripe_chunk *chunk = CHUNK(stripe, p); + struct bio_list *bl; + + BUG_ON(ChunkLocked(chunk)); + + bl = BL_CHUNK(chunk, rw); + if (bio_list_empty(bl)) + continue; + + if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) { + /* RAID set dead. */ + if (unlikely(RSDead(rs))) + bio_list_endio(stripe, bl, p, -EIO); + /* RAID set degraded. */ + else if (write) + bio_list_endio(stripe, bl, p, 0); + } else { + BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk)); + bio_list_endio(stripe, bl, p, 0); + } + } +} + +/* Fail all ios hanging off all bio lists of a stripe. */ +static void stripe_fail_io(struct stripe *stripe) +{ + struct raid_set *rs = RS(stripe->sc); + unsigned p = rs->set.raid_devs; + + while (p--) { + struct stripe_chunk *chunk = CHUNK(stripe, p); + int i = ARRAY_SIZE(chunk->bl); + + /* Fail all bios on all bio lists of the stripe. */ + while (i--) { + struct bio_list *bl = chunk->bl + i; + + if (!bio_list_empty(bl)) + bio_list_endio(stripe, bl, p, -EIO); + } + } + + /* Put stripe on LRU list. */ + BUG_ON(stripe_io_ref(stripe)); + BUG_ON(stripe_ref(stripe)); +} + +/* Unlock all required chunks. */ +static void stripe_chunks_unlock(struct stripe *stripe) +{ + unsigned p = RS(stripe->sc)->set.raid_devs; + struct stripe_chunk *chunk; + + while (p--) { + chunk = CHUNK(stripe, p); + + if (TestClearChunkUnlock(chunk)) + ClearChunkLocked(chunk); + } +} + +/* + * Queue reads and writes to a stripe by hanging + * their bios off the stripesets read/write lists. + */ +static int stripe_queue_bio(struct raid_set *rs, struct bio *bio, + struct bio_list *reject) +{ + struct raid_address addr; + struct stripe *stripe; + + stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr)); + if (stripe) { + int r = 0, rw = bio_data_dir(bio); + + /* Distinguish reads and writes. */ + bio_list_add(BL(stripe, addr.di, rw), bio); + + if (rw == READ) + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_BIOS_ADDED_READ); + else { + /* Inrement pending write count on region. */ + dm_rh_inc(rs->recover.rh, stripe->region); + r = 1; + + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_BIOS_ADDED_WRITE); + } + + /* + * Put on io (flush) list in case of + * initial bio queued to chunk. + */ + if (chunk_get(CHUNK(stripe, addr.di)) == 1) + stripe_flush_add(stripe); + + return r; + } + + /* Got no stripe from cache or failed to lock it -> reject bio. */ + bio_list_add(reject, bio); + atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */ + return 0; +} + +/* + * Handle all stripes by handing them to the daemon, because we can't + * map their chunk pages to copy the data in interrupt context. + * + * We don't want to handle them here either, while interrupts are disabled. + */ + +/* Read/write endio function for dm-io (interrupt context). */ +static void endio(unsigned long error, void *context) +{ + struct stripe_chunk *chunk = context; + + if (unlikely(error)) { + chunk_set(chunk, ERROR); + /* REMOVEME: statistics. */ + atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR); + } else + chunk_set(chunk, CLEAN); + + /* + * For recovery stripes, I need to reset locked locked + * here, because those aren't processed in do_endios(). + */ + if (unlikely(StripeRecover(chunk->stripe))) + ClearChunkLocked(chunk); + else + SetChunkUnlock(chunk); + + /* Indirectly puts stripe on cache's endio list via stripe_io_put(). */ + stripe_put_references(chunk->stripe); +} + +/* Read/Write a chunk asynchronously. */ +static void stripe_chunk_rw(struct stripe *stripe, unsigned p) +{ + struct stripe_cache *sc = stripe->sc; + struct raid_set *rs = RS(sc); + struct dm_mem_cache_object *obj = stripe->obj + p; + struct page_list *pl = obj->pl; + struct stripe_chunk *chunk = CHUNK(stripe, p); + struct raid_dev *dev = rs->dev + p; + struct dm_io_region io = { + .bdev = dev->dev->bdev, + .sector = stripe->key, + .count = stripe->io.size, + }; + struct dm_io_request control = { + .bi_rw = ChunkDirty(chunk) ? WRITE : READ, + .mem = { + .type = DM_IO_PAGE_LIST, + .ptr.pl = pl, + .offset = 0, + }, + .notify = { + .fn = endio, + .context = chunk, + }, + .client = StripeRecover(stripe) ? rs->recover.dm_io_client : + sc->dm_io_client, + }; + + BUG_ON(ChunkLocked(chunk)); + BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk)); + BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk)); + + /* + * Don't rw past end of device, which can happen, because + * typically sectors_per_dev isn't divisible by io_size. + */ + if (unlikely(io.sector + io.count > rs->set.sectors_per_dev)) + io.count = rs->set.sectors_per_dev - io.sector; + + BUG_ON(!io.count); + io.sector += dev->start; /* Add . */ + if (RSRecover(rs)) + recover_io_count(stripe); /* Recovery io accounting. */ + + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE : + S_DM_IO_READ)); + SetChunkLocked(chunk); + SetDevIoQueued(dev); + BUG_ON(dm_io(&control, 1, &io, NULL)); +} + +/* + * Write dirty or read not uptodate page lists of a stripe. + */ +static int stripe_chunks_rw(struct stripe *stripe) +{ + int r; + struct raid_set *rs = RS(stripe->sc); + + /* + * Increment the pending count on the stripe + * first, so that we don't race in endio(). + * + * An inc (IO) is needed for any chunk unless !ChunkIo(chunk): + * + * o not uptodate + * o dirtied by writes merged + * o dirtied by parity calculations + */ + r = for_each_io_dev(stripe, stripe_get_references); + if (r) { + /* Io needed: chunks are either not uptodate or dirty. */ + int max; /* REMOVEME: */ + struct stripe_cache *sc = &rs->sc; + + /* Submit actual io. */ + for_each_io_dev(stripe, stripe_chunk_rw); + + /* REMOVEME: statistics */ + max = sc_active(sc); + if (atomic_read(&sc->active_stripes_max) < max) + atomic_set(&sc->active_stripes_max, max); + + atomic_inc(rs->stats + S_FLUSHS); + /* END REMOVEME: statistics */ + } + + return r; +} + +/* Merge in all writes hence dirtying respective chunks. */ +static void stripe_merge_writes(struct stripe *stripe) +{ + unsigned p = RS(stripe->sc)->set.raid_devs; + + while (p--) { + struct stripe_chunk *chunk = CHUNK(stripe, p); + struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED); + + if (!bio_list_empty(write)) { + struct bio *bio; + struct page_list *pl = stripe->obj[p].pl; + + /* + * We can play with the lists without holding a lock, + * because it is just us accessing them anyway. + */ + bio_list_for_each(bio, write) + bio_copy_page_list(WRITE, stripe, pl, bio); + + bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write); + bio_list_init(write); + chunk_set(chunk, DIRTY); + } + } +} + +/* Queue all writes to get merged. */ +static int stripe_queue_writes(struct stripe *stripe) +{ + int r = 0; + unsigned p = RS(stripe->sc)->set.raid_devs; + + while (p--) { + struct stripe_chunk *chunk = CHUNK(stripe, p); + struct bio_list *write = BL_CHUNK(chunk, WRITE); + + if (!bio_list_empty(write)) { + bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write); + bio_list_init(write); +SetChunkIo(chunk); + r = 1; + } + } + + return r; +} + + +/* Check, if a chunk gets completely overwritten. */ +static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p) +{ + unsigned sectors = 0; + struct bio *bio; + struct bio_list *bl = BL(stripe, p, WRITE_QUEUED); + + bio_list_for_each(bio, bl) + sectors += bio_sectors(bio); + + BUG_ON(sectors > RS(stripe->sc)->set.io_size); + return sectors == RS(stripe->sc)->set.io_size; +} + +/* + * Avoid io on broken/reconstructed drive in order to + * reconstruct date on endio. + * + * (*1*) We set StripeReconstruct() in here, so that _do_endios() + * will trigger a reconstruct call before resetting it. + */ +static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr) +{ + struct stripe_chunk *chunk = CHUNK(stripe, pr); + + /* + * Allow io on all chunks but the indexed one, + * because we're either degraded or prohibit it + * on the one for later reconstruction. + */ + /* Includes ClearChunkIo(), ClearChunkUptodate(). */ + stripe_chunk_invalidate(chunk); + stripe->idx.recover = pr; + SetStripeReconstruct(stripe); + + /* REMOVEME: statistics. */ + atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO); + return -EPERM; +} + +/* Chunk locked/uptodate and device failed tests. */ +static struct stripe_chunk * +stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate) +{ + struct raid_set *rs = RS(stripe->sc); + struct stripe_chunk *chunk = CHUNK(stripe, p); + + /* Can't access active chunks. */ + if (ChunkLocked(chunk)) { + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_CHUNK_LOCKED); + return NULL; + } + + /* Can't access broken devive. */ + if (ChunkError(chunk) || DevFailed(rs->dev + p)) + return NULL; + + /* Can access uptodate chunks. */ + if (ChunkUptodate(chunk)) { + (*chunks_uptodate)++; + return NULL; + } + + return chunk; +} + +/* + * Degraded/reconstruction mode. + * + * Check stripe state to figure which chunks don't need IO. + * + * Returns 0 for fully operational, -EPERM for degraded/resynchronizing. + */ +static int stripe_check_reconstruct(struct stripe *stripe) +{ + struct raid_set *rs = RS(stripe->sc); + + if (RSDead(rs)) { + ClearStripeReconstruct(stripe); + ClearStripeReconstructed(stripe); + stripe_allow_io(stripe); + return 0; + } + + /* Avoid further reconstruction setting, when already set. */ + if (StripeReconstruct(stripe)) { + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_RECONSTRUCT_SET); + return -EBUSY; + } + + /* Initially allow io on all chunks. */ + stripe_allow_io(stripe); + + /* Return if stripe is already reconstructed. */ + if (StripeReconstructed(stripe)) { + atomic_inc(rs->stats + S_RECONSTRUCTED); + return 0; + } + + /* + * Degraded/reconstruction mode (device failed) -> + * avoid io on the failed device. + */ + if (unlikely(RSDegraded(rs))) { + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_DEGRADED); + /* Allow IO on all devices but the dead one. */ + BUG_ON(rs->set.ei < 0); + return stripe_chunk_set_io_flags(stripe, rs->set.ei); + } else { + int sync, pi = dev_for_parity(stripe, &sync); + + /* + * Reconstruction mode (ie. a particular (replaced) device or + * some (rotating) parity chunk is being resynchronized) -> + * o make sure all needed chunks are read in + * o writes are allowed to go through + */ + if (!sync) { + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_NOSYNC); + /* Allow IO on all devs but the one to reconstruct. */ + return stripe_chunk_set_io_flags(stripe, pi); + } + } + + return 0; +} + +/* + * Check, if stripe is ready to merge writes. + * I.e. if all chunks present to allow to merge bios. + * + * We prohibit io on: + * + * o chunks without bios + * o chunks which get completely written over + */ +static int stripe_merge_possible(struct stripe *stripe, int nosync) +{ + struct raid_set *rs = RS(stripe->sc); + unsigned chunks_overwrite = 0, chunks_prohibited = 0, + chunks_uptodate = 0, p = rs->set.raid_devs; + + /* Walk all chunks. */ + while (p--) { + struct stripe_chunk *chunk; + + /* Prohibit io on broken devices. */ + if (DevFailed(rs->dev + p)) { + chunk = CHUNK(stripe, p); + goto prohibit_io; + } + + /* We can't optimize any further if no chunk. */ + chunk = stripe_chunk_check(stripe, p, &chunks_uptodate); + if (!chunk || nosync) + continue; + + /* + * We have a chunk, which is not uptodate. + * + * If this is not parity and we don't have + * reads queued, we can optimize further. + */ + if (p != stripe->idx.parity && + bio_list_empty(BL_CHUNK(chunk, READ)) && + bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) { + if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED))) + goto prohibit_io; + else if (RSCheckOverwrite(rs) && + stripe_check_chunk_overwrite(stripe, p)) + /* Completely overwritten chunk. */ + chunks_overwrite++; + } + + /* Allow io for chunks with bios and overwritten ones. */ + SetChunkIo(chunk); + continue; + +prohibit_io: + /* No io for broken devices or for chunks w/o bios. */ + ClearChunkIo(chunk); + chunks_prohibited++; + /* REMOVEME: statistics. */ + atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO); + } + + /* All data chunks will get written over. */ + if (chunks_overwrite == rs->set.data_devs) + atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/ + else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) { + /* We don't have enough chunks to merge. */ + atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/ + return -EPERM; + } + + /* + * If we have all chunks up to date or overwrite them, we + * just zero the parity chunk and let stripe_rw() recreate it. + */ + if (chunks_uptodate == rs->set.raid_devs || + chunks_overwrite == rs->set.data_devs) { + stripe_zero_chunk(stripe, stripe->idx.parity); + BUG_ON(StripeReconstruct(stripe)); + SetStripeReconstruct(stripe); /* Enforce xor in caller. */ + } else { + /* + * With less chunks, we xor parity out. + * + * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(), + * so that only chunks with queued or merged writes + * are being xored. + */ + parity_xor(stripe); + } + + /* + * We do have enough chunks to merge. + * All chunks are uptodate or get written over. + */ + atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */ + return 0; +} + +/* + * Avoid reading chunks in case we're fully operational. + * + * We prohibit io on any chunks without bios but the parity chunk. + */ +static void stripe_avoid_reads(struct stripe *stripe) +{ + struct raid_set *rs = RS(stripe->sc); + unsigned dummy = 0, p = rs->set.raid_devs; + + /* Walk all chunks. */ + while (p--) { + struct stripe_chunk *chunk = + stripe_chunk_check(stripe, p, &dummy); + + if (!chunk) + continue; + + /* If parity or any bios pending -> allow io. */ + if (chunk_ref(chunk) || p == stripe->idx.parity) + SetChunkIo(chunk); + else { + ClearChunkIo(chunk); + /* REMOVEME: statistics. */ + atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO); + } + } +} + +/* + * Read/write a stripe. + * + * All stripe read/write activity goes through this function + * unless recovery, which has to call stripe_chunk_rw() directly. + * + * Make sure we don't try already merged stripes in order + * to avoid data corruption. + * + * Check the state of the RAID set and if degraded (or + * resynchronizing for reads), read in all other chunks but + * the one on the dead/resynchronizing device in order to be + * able to reconstruct the missing one in _do_endios(). + * + * Can be called on active stripes in order + * to dispatch new io on inactive chunks. + * + * States to cover: + * o stripe to read and/or write + * o stripe with error to reconstruct + */ +static void stripe_rw(struct stripe *stripe) +{ + int nosync, r; + struct raid_set *rs = RS(stripe->sc); + + /* + * Check, if a chunk needs to be reconstructed + * because of a degraded set or a region out of sync. + */ + nosync = stripe_check_reconstruct(stripe); + switch (nosync) { + case -EBUSY: + return; /* Wait for stripe reconstruction to finish. */ + case -EPERM: + goto io; + } + + /* + * If we don't have merged writes pending, we can schedule + * queued writes to be merged next without corrupting data. + */ + if (!StripeMerged(stripe)) { + r = stripe_queue_writes(stripe); + if (r) + /* Writes got queued -> flag RBW. */ + SetStripeRBW(stripe); + } + + /* + * Merge all writes hanging off uptodate/overwritten + * chunks of the stripe. + */ + if (StripeRBW(stripe)) { + r = stripe_merge_possible(stripe, nosync); + if (!r) { /* Merge possible. */ + struct stripe_chunk *chunk; + + /* + * I rely on valid parity in order + * to xor a fraction of chunks out + * of parity and back in. + */ + stripe_merge_writes(stripe); /* Merge writes in. */ + parity_xor(stripe); /* Update parity. */ + ClearStripeReconstruct(stripe); /* Reset xor enforce. */ + SetStripeMerged(stripe); /* Writes merged. */ + ClearStripeRBW(stripe); /* Disable RBW. */ + + /* + * REMOVEME: sanity check on parity chunk + * states after writes got merged. + */ + chunk = CHUNK(stripe, stripe->idx.parity); + BUG_ON(ChunkLocked(chunk)); + BUG_ON(!ChunkUptodate(chunk)); + BUG_ON(!ChunkDirty(chunk)); + BUG_ON(!ChunkIo(chunk)); + } + } else if (!nosync && !StripeMerged(stripe)) + /* Read avoidance if not degraded/resynchronizing/merged. */ + stripe_avoid_reads(stripe); + +io: + /* Now submit any reads/writes for non-uptodate or dirty chunks. */ + r = stripe_chunks_rw(stripe); + if (!r) { + /* + * No io submitted because of chunk io + * prohibited or locked chunks/failed devices + * -> push to end io list for processing. + */ + stripe_endio_push(stripe); + atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */ + } +} + +/* + * Recovery functions + */ +/* Read a stripe off a raid set for recovery. */ +static int stripe_recover_read(struct stripe *stripe, int pi) +{ + BUG_ON(stripe_io_ref(stripe)); + + /* Invalidate all chunks so that they get read in. */ + stripe_chunks_invalidate(stripe); + stripe_allow_io(stripe); /* Allow io on all recovery chunks. */ + + /* + * If we are reconstructing a perticular device, we can avoid + * reading the respective chunk in, because we're going to + * reconstruct it anyway. + * + * We can't do that for resynchronization of rotating parity, + * because the recovery stripe chunk size is typically larger + * than the sets chunk size. + */ + if (pi > -1) + ClearChunkIo(CHUNK(stripe, pi)); + + return stripe_chunks_rw(stripe); +} + +/* Write a stripe to a raid set for recovery. */ +static int stripe_recover_write(struct stripe *stripe, int pi) +{ + BUG_ON(stripe_io_ref(stripe)); + + /* + * If this is a reconstruct of a particular device, then + * reconstruct the respective chunk, else create parity chunk. + */ + if (pi > -1) { + stripe_zero_chunk(stripe, pi); + common_xor(stripe, stripe->io.size, 0, pi); + chunk_set(CHUNK(stripe, pi), DIRTY); + } else + parity_xor(stripe); + + return stripe_chunks_rw(stripe); +} + +/* Read/write a recovery stripe. */ +static int stripe_recover_rw(struct stripe *stripe) +{ + int r = 0, sync = 0; + + /* Read/write flip-flop. */ + if (TestClearStripeRBW(stripe)) { + SetStripeMerged(stripe); + stripe->key = stripe->recover->pos; + r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync)); + BUG_ON(!r); + } else if (TestClearStripeMerged(stripe)) { + r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync)); + BUG_ON(!r); + } + + BUG_ON(sync); + return r; +} + +/* Recover bandwidth available ?. */ +static int recover_bandwidth(struct raid_set *rs) +{ + int r, work; + + /* On reset or when bios delayed -> allow recovery. */ + r = recover_io_reset(rs); + if (r || RSBandwidth(rs)) + goto out; + + work = atomic_read(rs->recover.io_count + IO_WORK); + if (work) { + /* Pay attention to larger recover stripe size. */ + int recover = atomic_read(rs->recover.io_count + IO_RECOVER) * + rs->recover.io_size / rs->set.io_size; + + /* + * Don't use more than given bandwidth + * of the work io for recovery. + */ + if (recover > work / rs->recover.bandwidth_work) { + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_NO_BANDWIDTH); + return 0; + } + } + +out: + atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */ + return 1; +} + +/* Try to get a region to recover. */ +static int stripe_recover_get_region(struct stripe *stripe) +{ + struct raid_set *rs = RS(stripe->sc); + struct recover *rec = &rs->recover; + struct recover_addr *addr = stripe->recover; + struct dm_dirty_log *dl = rec->dl; + struct dm_rh_client *rh = rec->rh; + + BUG_ON(!dl); + BUG_ON(!rh); + + /* Return, that we have region first to finish it during suspension. */ + if (addr->reg) + return 1; + + if (RSSuspend(rs)) + return -EPERM; + + if (dl->type->get_sync_count(dl) >= rec->nr_regions) + return -ENOENT; + + /* If we don't have enough bandwidth, we don't proceed recovering. */ + if (!recover_bandwidth(rs)) + return -EAGAIN; + + /* Start quiescing a region. */ + dm_rh_recovery_prepare(rh); + addr->reg = dm_rh_recovery_start(rh); + if (!addr->reg) + return -EAGAIN; + + addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg)); + addr->end = addr->pos + dm_rh_get_region_size(rh); + + /* + * Take one global io reference out for the + * whole region, which is going to be released + * when the region is completely done with. + */ + io_get(rs); + return 0; +} + +/* Update region hash state. */ +enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 }; +static void recover_rh_update(struct stripe *stripe, enum recover_type success) +{ + struct recover_addr *addr = stripe->recover; + struct raid_set *rs = RS(stripe->sc); + struct recover *rec = &rs->recover; + + if (!addr->reg) { + DMERR("%s- Called w/o region", __func__); + return; + } + + dm_rh_recovery_end(addr->reg, success); + if (success) + rec->nr_regions_recovered++; + + addr->reg = NULL; + + /* + * Completely done with this region -> + * release the 1st io reference. + */ + io_put(rs); +} + +/* Set start of recovery state. */ +static void set_start_recovery(struct raid_set *rs) +{ + /* Initialize recovery. */ + rs->recover.start_jiffies = jiffies; + rs->recover.end_jiffies = 0; +} + +/* Set end of recovery state. */ +static void set_end_recovery(struct raid_set *rs) +{ + ClearRSRecover(rs); + rs->set.dev_to_init = -1; + + /* Check for jiffies overrun. */ + rs->recover.end_jiffies = jiffies; + if (rs->recover.end_jiffies < rs->recover.start_jiffies) + rs->recover.end_jiffies = ~0; +} + +/* Handle recovery on one recovery stripe. */ +static int _do_recovery(struct stripe *stripe) +{ + int r; + struct raid_set *rs = RS(stripe->sc); + struct recover_addr *addr = stripe->recover; + + /* If recovery is active -> return. */ + if (stripe_io_ref(stripe)) + return 1; + + /* IO error is fatal for recovery -> stop it. */ + if (unlikely(StripeError(stripe))) + goto err; + + /* Recovery end required. */ + if (!RSRecover(rs)) + goto err; + + /* Get a region to recover. */ + r = stripe_recover_get_region(stripe); + switch (r) { + case 0: /* Got a new region: flag initial read before write. */ + SetStripeRBW(stripe); + case 1: /* Have a region in the works. */ + break; + case -EAGAIN: + /* No bandwidth/quiesced region yet, try later. */ + if (!io_ref(rs)) + wake_do_raid_delayed(rs, HZ / 4); + case -EPERM: + /* Suspend. */ + return 1; + case -ENOENT: /* No more regions to recover. */ + schedule_work(&rs->io.ws_do_table_event); + return 0; + default: + BUG(); + } + + /* Read/write a recover stripe. */ + r = stripe_recover_rw(stripe); + if (r) + /* IO initiated. */ + return 1; + + /* Read and write finished-> update recovery position within region. */ + addr->pos += stripe->io.size; + + /* If we're at end of region, update region hash. */ + if (addr->pos >= addr->end || + addr->pos >= rs->set.sectors_per_dev) + recover_rh_update(stripe, REC_SUCCESS); + else + /* Prepare to read next region segment. */ + SetStripeRBW(stripe); + + /* Schedule myself for another round... */ + wake_do_raid(rs); + return 1; + +err: + /* FIXME: rather try recovering other regions on error? */ + rs_check_degrade(stripe); + recover_rh_update(stripe, REC_FAILURE); + + /* Check state of partially recovered array. */ + if (RSDegraded(rs) && !RSDead(rs) && + rs->set.dev_to_init != -1 && + rs->set.ei != rs->set.dev_to_init) + /* Broken drive != drive to recover -> FATAL. */ + SetRSDead(rs); + + if (StripeError(stripe)) { + char buf[BDEVNAME_SIZE]; + + DMERR("stopping recovery due to " + "ERROR on /dev/%s, stripe at offset %llu", + bdevname(rs->dev[rs->set.ei].dev->bdev, buf), + (unsigned long long) stripe->key); + + } + + /* Make sure, that all quiesced regions get released. */ + while (addr->reg) { + dm_rh_recovery_end(addr->reg, -EIO); + addr->reg = dm_rh_recovery_start(rs->recover.rh); + } + + return 0; +} + +/* Called by main io daemon to recover regions. */ +static void do_recovery(struct raid_set *rs) +{ + if (RSRecover(rs)) { + int r = 0; + struct stripe *stripe; + + list_for_each_entry(stripe, &rs->recover.stripes, + lists[LIST_RECOVER]) + r += _do_recovery(stripe); + + if (!r) { + set_end_recovery(rs); + stripe_recover_free(rs); + } + } +} + +/* + * END recovery functions + */ + +/* End io process all stripes handed in by endio() callback. */ +static void _do_endios(struct raid_set *rs, struct stripe *stripe, + struct list_head *flush_list) +{ + /* First unlock all required chunks. */ + stripe_chunks_unlock(stripe); + + /* + * If an io error on a stripe occured, degrade the RAID set + * and try to endio as many bios as possible. If any bios can't + * be endio processed, requeue the stripe (stripe_ref() != 0). + */ + if (TestClearStripeError(stripe)) { + /* + * FIXME: if read, rewrite the failed chunk after reconstruction + * in order to trigger disk bad sector relocation. + */ + rs_check_degrade(stripe); /* Resets ChunkError(). */ + ClearStripeReconstruct(stripe); + ClearStripeReconstructed(stripe); + } + + /* Got to reconstruct a missing chunk. */ + if (StripeReconstruct(stripe)) { + /* + * (*2*) We use StripeReconstruct() to allow for + * all chunks to be xored into the reconstructed + * one (see chunk_must_xor()). + */ + stripe_reconstruct(stripe); + + /* + * (*3*) Now we reset StripeReconstruct() and flag + * StripeReconstructed() to show to stripe_rw(), + * that we have reconstructed a missing chunk. + */ + ClearStripeReconstruct(stripe); + SetStripeReconstructed(stripe); + + /* FIXME: reschedule to be written in case of read. */ + // if (!StripeRBW(stripe)) { + // chunk_set(CHUNK(stripe, pr), DIRTY); + // stripe_chunks_rw(stripe); + // } + } + + /* + * Now that we eventually got a complete stripe, we + * can process the rest of the end ios on reads. + */ + stripe_endio(READ, stripe); + + /* End io all merged writes. */ + if (TestClearStripeMerged(stripe)) + stripe_endio(WRITE_MERGED, stripe); + + /* If RAID set is dead -> fail any ios to dead drives. */ + if (RSDead(rs)) { + DMERR_LIMIT("RAID set dead: failing ios to dead devices"); + stripe_fail_io(stripe); + } + + /* + * We have stripe references still, + * beacuse of read befeore writes or IO errors -> + * got to put on flush list for processing. + */ + if (stripe_ref(stripe)) { + BUG_ON(!list_empty(stripe->lists + LIST_LRU)); + list_add_tail(stripe->lists + LIST_FLUSH, flush_list); + atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */ + } else + stripe_lru_add(stripe); +} + +/* Pop any endio stripes off of the endio list and belabour them. */ +static void do_endios(struct raid_set *rs) +{ + struct stripe_cache *sc = &rs->sc; + struct stripe *stripe; + /* IO flush list for sorted requeued stripes. */ + struct list_head flush_list; + + INIT_LIST_HEAD(&flush_list); + + while ((stripe = stripe_endio_pop(sc))) { + /* Avoid endio on stripes with newly io'ed chunks. */ + if (!stripe_io_ref(stripe)) + _do_endios(rs, stripe, &flush_list); + } + + /* + * Insert any requeued stripes in the proper + * order at the beginning of the io (flush) list. + */ + list_splice(&flush_list, sc->lists + LIST_FLUSH); +} + +/* Flush any stripes on the io list. */ +static void do_flush(struct raid_set *rs) +{ + struct stripe *stripe; + + while ((stripe = stripe_io_pop(&rs->sc))) + stripe_rw(stripe); /* Read/write stripe. */ +} + +/* Stripe cache resizing. */ +static void do_sc_resize(struct raid_set *rs) +{ + unsigned set = atomic_read(&rs->sc.stripes_to_set); + + if (set) { + unsigned cur = atomic_read(&rs->sc.stripes); + int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) : + sc_shrink(&rs->sc, cur - set); + + /* Flag end of resizeing if ok. */ + if (!r) + atomic_set(&rs->sc.stripes_to_set, 0); + } +} + +/* + * Process all ios + * + * We do different things with the io depending + * on the state of the region that it is in: + * + * o reads: hang off stripe cache or postpone if full + * + * o writes: + * + * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set. + * In case stripe cache is full or busy, postpone the io. + * + * RECOVERING: delay the io until recovery of the region completes. + * + */ +static void do_ios(struct raid_set *rs, struct bio_list *ios) +{ + int r; + unsigned flush = 0, delay = 0; + sector_t sector; + struct dm_rh_client *rh = rs->recover.rh; + struct bio *bio; + struct bio_list reject; + + bio_list_init(&reject); + + /* + * Classify each io: + * o delay writes to recovering regions (let reads go through) + * o queue io to all other regions + */ + while ((bio = bio_list_pop(ios))) { + /* + * In case we get a barrier bio, push it back onto + * the input queue unless all work queues are empty + * and the stripe cache is inactive. + */ + if (unlikely(bio_barrier(bio))) { + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + S_BARRIER); + if (delay || + !list_empty(rs->sc.lists + LIST_FLUSH) || + !bio_list_empty(&reject) || + sc_active(&rs->sc)) { + bio_list_push(ios, bio); + break; + } + } + + /* Check for recovering regions. */ + sector = _sector(rs, bio); + r = region_state(rs, sector, DM_RH_RECOVERING); + if (unlikely(r && bio_data_dir(bio) == WRITE)) { + delay++; + /* Wait writing to recovering regions. */ + dm_rh_delay_by_region(rh, bio, + dm_rh_sector_to_region(rh, + sector)); + /* REMOVEME: statistics.*/ + atomic_inc(rs->stats + S_DELAYED_BIOS); + atomic_inc(rs->stats + S_SUM_DELAYED_BIOS); + + /* Force bandwidth tests in recovery. */ + SetRSBandwidth(rs); + } else { + /* + * Process ios to non-recovering regions by queueing + * them to stripes (does dm_rh_inc()) for writes). + */ + flush += stripe_queue_bio(rs, bio, &reject); + } + } + + if (flush) { + /* FIXME: better error handling. */ + r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */ + if (r) + DMERR_LIMIT("dirty log flush"); + } + + /* Merge any rejected bios back to the head of the input list. */ + bio_list_merge_head(ios, &reject); +} + +/* Unplug: let any queued io role on the sets devices. */ +static void do_unplug(struct raid_set *rs) +{ + struct raid_dev *dev = rs->dev + rs->set.raid_devs; + + while (dev-- > rs->dev) { + /* Only call any device unplug function, if io got queued. */ + if (TestClearDevIoQueued(dev)) + blk_unplug(bdev_get_queue(dev->dev->bdev)); + } +} + +/* Send an event in case we're getting too busy. */ +static void do_busy_event(struct raid_set *rs) +{ + if (sc_busy(rs)) { + if (!TestSetRSScBusy(rs)) + schedule_work(&rs->io.ws_do_table_event); + } + + ClearRSScBusy(rs); +} + +/* Throw an event. */ +static void do_table_event(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, + io.ws_do_table_event); + dm_table_event(rs->ti->table); +} + + +/*----------------------------------------------------------------- + * RAID daemon + *---------------------------------------------------------------*/ +/* + * o belabour all end ios + * o update the region hash states + * o optionally shrink the stripe cache + * o optionally do recovery + * o unplug any component raid devices with queued bios + * o grab the input queue + * o work an all requeued or new ios and perform stripe cache flushs + * o unplug any component raid devices with queued bios + * o check, if the stripe cache gets too busy and throw an event if so + */ +static void do_raid(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, + io.dws_do_raid.work); + struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in; + + /* + * We always need to end io, so that ios can get errored in + * case the set failed and the region counters get decremented + * before we update region hash states and go any further. + */ + do_endios(rs); + dm_rh_update_states(rs->recover.rh, 1); + + /* + * Now that we've end io'd, which may have put stripes on the LRU list + * to allow for shrinking, we resize the stripe cache if requested. + */ + do_sc_resize(rs); + + /* Try to recover regions. */ + do_recovery(rs); + do_unplug(rs); /* Unplug the sets device queues. */ + + /* Quickly grab all new ios queued and add them to the work list. */ + mutex_lock(&rs->io.in_lock); + bio_list_merge(ios, ios_in); + bio_list_init(ios_in); + mutex_unlock(&rs->io.in_lock); + + if (!bio_list_empty(ios)) + do_ios(rs, ios); /* Got ios to work into the cache. */ + + do_flush(rs); /* Flush any stripes on io list. */ + do_unplug(rs); /* Unplug the sets device queues. */ + do_busy_event(rs); /* Check if we got too busy. */ +} + +/* + * Callback for region hash to dispatch + * delayed bios queued to recovered regions + * (gets called via dm_rh_update_states()). + */ +static void dispatch_delayed_bios(void *context, struct bio_list *bl) +{ + struct raid_set *rs = context; + struct bio *bio; + + /* REMOVEME: statistics; decrement pending delayed bios counter. */ + bio_list_for_each(bio, bl) + atomic_dec(rs->stats + S_DELAYED_BIOS); + + /* Merge region hash private list to work list. */ + bio_list_merge_head(&rs->io.work, bl); + bio_list_init(bl); + ClearRSBandwidth(rs); +} + +/************************************************************* + * Constructor helpers + *************************************************************/ +/* Calculate MB/sec. */ +static unsigned mbpers(struct raid_set *rs, unsigned speed) +{ + return to_bytes(speed * rs->set.data_devs * + rs->recover.io_size * HZ >> 10) >> 10; +} + +/* + * Discover fastest xor algorithm and # of chunks combination. + */ +/* Calculate speed for algorithm and # of chunks. */ +static unsigned xor_speed(struct stripe *stripe) +{ + unsigned r = 0; + unsigned long j; + + /* Wait for next tick. */ + for (j = jiffies; j == jiffies; ) + ; + + /* Do xors for a full tick. */ + for (j = jiffies; j == jiffies; ) { + mb(); + common_xor(stripe, stripe->io.size, 0, 0); + mb(); + r++; + } + + return r; +} + +/* Optimize xor algorithm for this RAID set. */ +static unsigned xor_optimize(struct raid_set *rs) +{ + unsigned chunks_max = 2, p = rs->set.raid_devs, speed_max = 0; + struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL; + struct stripe *stripe; + + BUG_ON(list_empty(&rs->recover.stripes)); + stripe = list_first_entry(&rs->recover.stripes, struct stripe, + lists[LIST_RECOVER]); + + /* Must set uptodate so that xor() will belabour chunks. */ + while (p--) + SetChunkUptodate(CHUNK(stripe, p)); + + /* Try all xor functions. */ + while (f-- > xor_funcs) { + unsigned speed; + + /* Set actual xor function for common_xor(). */ + rs->xor.f = f; + rs->xor.chunks = (f->f == xor_blocks_wrapper ? + (MAX_XOR_BLOCKS + 1) : XOR_CHUNKS_MAX) + 1; + + while (rs->xor.chunks-- > 2) { + speed = xor_speed(stripe); + if (speed > speed_max) { + speed_max = speed; + chunks_max = rs->xor.chunks; + f_max = f; + } + } + } + + /* Memorize optimum parameters. */ + rs->xor.f = f_max; + rs->xor.chunks = chunks_max; + return speed_max; +} + +/* + * Allocate a RAID context (a RAID set) + */ +/* Structure for variable RAID parameters. */ +struct variable_parms { + int bandwidth; + int bandwidth_parm; + int chunk_size; + int chunk_size_parm; + int io_size; + int io_size_parm; + int stripes; + int stripes_parm; + int recover_io_size; + int recover_io_size_parm; + int raid_parms; + int recovery; + int recovery_stripes; + int recovery_stripes_parm; +}; + +static struct raid_set * +context_alloc(struct raid_type *raid_type, struct variable_parms *p, + unsigned raid_devs, sector_t sectors_per_dev, + struct dm_target *ti, unsigned dl_parms, char **argv) +{ + int r; + size_t len; + sector_t region_size, ti_len; + struct raid_set *rs = NULL; + struct dm_dirty_log *dl; + struct recover *rec; + + /* + * Create the dirty log + * + * We need to change length for the dirty log constructor, + * because we want an amount of regions for all stripes derived + * from the single device size, so that we can keep region + * size = 2^^n independant of the number of devices + */ + ti_len = ti->len; + ti->len = sectors_per_dev; + dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2); + ti->len = ti_len; + if (!dl) + goto bad_dirty_log; + + /* Chunk size *must* be smaller than region size. */ + region_size = dl->type->get_region_size(dl); + if (p->chunk_size > region_size) + goto bad_chunk_size; + + /* Recover io size *must* be smaller than region size as well. */ + if (p->recover_io_size > region_size) + goto bad_recover_io_size; + + /* Size and allocate the RAID set structure. */ + len = sizeof(*rs->data) + sizeof(*rs->dev); + if (dm_array_too_big(sizeof(*rs), len, raid_devs)) + goto bad_array; + + len = sizeof(*rs) + raid_devs * len; + rs = kzalloc(len, GFP_KERNEL); + if (!rs) + goto bad_alloc; + + rec = &rs->recover; + atomic_set(&rs->io.in_process, 0); + atomic_set(&rs->io.in_process_max, 0); + rec->io_size = p->recover_io_size; + + /* Pointer to data array. */ + rs->data = (unsigned long **) + ((void *) rs->dev + raid_devs * sizeof(*rs->dev)); + rec->dl = dl; + rs->set.raid_devs = raid_devs; + rs->set.data_devs = raid_devs - raid_type->parity_devs; + rs->set.raid_type = raid_type; + + rs->set.raid_parms = p->raid_parms; + rs->set.chunk_size_parm = p->chunk_size_parm; + rs->set.io_size_parm = p->io_size_parm; + rs->sc.stripes_parm = p->stripes_parm; + rec->io_size_parm = p->recover_io_size_parm; + rec->bandwidth_parm = p->bandwidth_parm; + rec->recovery = p->recovery; + rec->recovery_stripes = p->recovery_stripes; + + /* + * Set chunk and io size and respective shifts + * (used to avoid divisions) + */ + rs->set.chunk_size = p->chunk_size; + rs->set.chunk_shift = ffs(p->chunk_size) - 1; + + rs->set.io_size = p->io_size; + rs->set.io_mask = p->io_size - 1; + /* Mask to adjust address key in case io_size != chunk_size. */ + rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask; + + rs->set.sectors_per_dev = sectors_per_dev; + + rs->set.ei = -1; /* Indicate no failed device. */ + atomic_set(&rs->set.failed_devs, 0); + + rs->ti = ti; + + atomic_set(rec->io_count + IO_WORK, 0); + atomic_set(rec->io_count + IO_RECOVER, 0); + + /* Initialize io lock and queues. */ + mutex_init(&rs->io.in_lock); + bio_list_init(&rs->io.in); + bio_list_init(&rs->io.work); + + init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */ + + rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size); + rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios, + wake_dummy, wake_do_raid, 0, p->recovery_stripes, + dl, region_size, rec->nr_regions); + if (IS_ERR(rec->rh)) + goto bad_rh; + + /* Initialize stripe cache. */ + r = sc_init(rs, p->stripes); + if (r) + goto bad_sc; + + /* REMOVEME: statistics. */ + stats_reset(rs); + ClearRSDevelStats(rs); /* Disnable development status. */ + return rs; + +bad_dirty_log: + TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM)); + +bad_chunk_size: + dm_dirty_log_destroy(dl); + TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL)); + +bad_recover_io_size: + dm_dirty_log_destroy(dl); + TI_ERR_RET("Recover stripe io size larger than region size", + ERR_PTR(-EINVAL)); + +bad_array: + dm_dirty_log_destroy(dl); + TI_ERR_RET("Arry too big", ERR_PTR(-EINVAL)); + +bad_alloc: + dm_dirty_log_destroy(dl); + TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM)); + +bad_rh: + dm_dirty_log_destroy(dl); + ti->error = DM_MSG_PREFIX "Error creating dirty region hash"; + goto free_rs; + +bad_sc: + dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */ + sc_exit(&rs->sc); + ti->error = DM_MSG_PREFIX "Error creating stripe cache"; +free_rs: + kfree(rs); + return ERR_PTR(-ENOMEM); +} + +/* Free a RAID context (a RAID set). */ +static void context_free(struct raid_set *rs, unsigned p) +{ + while (p--) + dm_put_device(rs->ti, rs->dev[p].dev); + + sc_exit(&rs->sc); + dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */ + kfree(rs); +} + +/* Create work queue and initialize delayed work. */ +static int rs_workqueue_init(struct raid_set *rs) +{ + struct dm_target *ti = rs->ti; + + rs->io.wq = create_singlethread_workqueue(DAEMON); + if (!rs->io.wq) + TI_ERR_RET("failed to create " DAEMON, -ENOMEM); + + INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid); + INIT_WORK(&rs->io.ws_do_table_event, do_table_event); + return 0; +} + +/* Return pointer to raid_type structure for raid name. */ +static struct raid_type *get_raid_type(char *name) +{ + struct raid_type *r = ARRAY_END(raid_types); + + while (r-- > raid_types) { + if (!strcmp(r->name, name)) + return r; + } + + return NULL; +} + +/* FIXME: factor out to dm core. */ +static int multiple(sector_t a, sector_t b, sector_t *n) +{ + sector_t r = a; + + sector_div(r, b); + *n = r; + return a == r * b; +} + +/* Log RAID set information to kernel log. */ +static void rs_log(struct raid_set *rs, unsigned speed) +{ + unsigned p; + char buf[BDEVNAME_SIZE]; + + for (p = 0; p < rs->set.raid_devs; p++) + DMINFO("/dev/%s is raid disk %u%s", + bdevname(rs->dev[p].dev->bdev, buf), p, + (p == rs->set.pi) ? " (parity)" : ""); + + DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n" + "algorithm \"%s\", %u chunks with %uMB/s\n" + "%s set with net %u/%u devices", + rs->set.chunk_size, rs->set.io_size, rs->recover.io_size, + atomic_read(&rs->sc.stripes), + rs->xor.f->name, rs->xor.chunks, mbpers(rs, speed), + rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs); +} + +/* Get all devices and offsets. */ +static int dev_parms(struct raid_set *rs, char **argv, int *p) +{ + struct dm_target *ti = rs->ti; + + for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) { + int r; + unsigned long long tmp; + struct raid_dev *dev = rs->dev + *p; + + /* Get offset and device. */ + if (sscanf(argv[1], "%llu", &tmp) != 1 || + tmp > rs->set.sectors_per_dev) + TI_ERR("Invalid RAID device offset parameter"); + + dev->start = tmp; + r = dm_get_device(ti, *argv, dev->start, + rs->set.sectors_per_dev, + dm_table_get_mode(ti->table), &dev->dev); + if (r) + TI_ERR_RET("RAID device lookup failure", r); + + r = raid_dev_lookup(rs, dev); + if (r != -ENODEV && r < *p) { + (*p)++; /* Ensure dm_put_device() on actual device. */ + TI_ERR_RET("Duplicate RAID device", -ENXIO); + } + } + + return 0; +} + +/* Set recovery bandwidth. */ +static void +recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth) +{ + rs->recover.bandwidth = bandwidth; + rs->recover.bandwidth_work = 100 / bandwidth; +} + +/* Handle variable number of RAID parameters. */ +static int get_raid_variable_parms(struct dm_target *ti, char **argv, + struct variable_parms *vp) +{ + int p, value; + struct { + int action; /* -1: skip, 0: no pwer2 check, 1: power2 check */ + char *errmsg; + int min, max; + int *var, *var2, *var3; + } argctr[] = { + { 1, + "Invalid chunk size; must be -1 or 2^^n and <= 16384", + IO_SIZE_MIN, CHUNK_SIZE_MAX, + &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size }, + { 0, + "Invalid number of stripes: must be -1 or >= 8 and <= 16384", + STRIPES_MIN, STRIPES_MAX, + &vp->stripes_parm, &vp->stripes, NULL }, + { 1, + "Invalid io size; must -1 or >= 8, 2^^n and less equal " + "min(BIO_MAX_SECTORS/2, chunk size)", + IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */ + &vp->io_size_parm, &vp->io_size, NULL }, + { 1, + "Invalid recovery io size; must be -1 or " + "2^^n and less equal BIO_MAX_SECTORS/2", + RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2, + &vp->recover_io_size_parm, &vp->recover_io_size, NULL }, + { 0, + "Invalid recovery bandwidth percentage; " + "must be -1 or > 0 and <= 100", + BANDWIDTH_MIN, BANDWIDTH_MAX, + &vp->bandwidth_parm, &vp->bandwidth, NULL }, + /* Handle sync argument seperately in loop. */ + { -1, + "Invalid recovery switch; must be \"sync\" or \"nosync\"" }, + { 0, + "Invalid number of recovery stripes;" + "must be -1, > 0 and <= 16384", + RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX, + &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL }, + }, *varp; + + /* Fetch # of variable raid parameters. */ + if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 || + !range_ok(vp->raid_parms, 0, 7)) + TI_ERR("Bad variable raid parameters number"); + + /* Preset variable RAID parameters. */ + vp->chunk_size = CHUNK_SIZE_DEFAULT; + vp->io_size = IO_SIZE_DEFAULT; + vp->stripes = STRIPES_DEFAULT; + vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT; + vp->bandwidth = BANDWIDTH_DEFAULT; + vp->recovery = 1; + vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT; + + /* Walk the array of argument constraints for all given ones. */ + for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) { + BUG_ON(varp >= ARRAY_END(argctr)); + + /* Special case for "[no]sync" string argument. */ + if (varp->action < 0) { + if (!strcmp(*argv, "sync")) + ; + else if (!strcmp(*argv, "nosync")) + vp->recovery = 0; + else + TI_ERR(varp->errmsg); + + argv++; + continue; + } + + /* + * Special case for io_size depending + * on previously set chunk size. + */ + if (p == 2) + varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size); + + if (sscanf(*(argv++), "%d", &value) != 1 || + (value != -1 && + ((varp->action && !POWER_OF_2(value)) || + !range_ok(value, varp->min, varp->max)))) + TI_ERR(varp->errmsg); + + *varp->var = value; + if (value != -1) { + if (varp->var2) + *varp->var2 = value; + if (varp->var3) + *varp->var3 = value; + } + } + + return 0; +} + +/* Parse optional locking parameters. */ +static int get_raid_locking_parms(struct dm_target *ti, char **argv, + int *locking_parms, + struct dm_raid45_locking_type **locking_type) +{ + if (!strnicmp(argv[0], "locking", strlen(argv[0]))) { + char *lckstr = argv[1]; + size_t lcksz = strlen(lckstr); + + if (!strnicmp(lckstr, "none", lcksz)) { + *locking_type = &locking_none; + *locking_parms = 2; + } else if (!strnicmp(lckstr, "cluster", lcksz)) { + DMERR("locking type \"%s\" not yet implemented", + lckstr); + return -EINVAL; + } else { + DMERR("unknown locking type \"%s\"", lckstr); + return -EINVAL; + } + } + + *locking_parms = 0; + *locking_type = &locking_none; + return 0; +} + +/* Set backing device read ahead properties of RAID set. */ +static void rs_set_read_ahead(struct raid_set *rs, + unsigned sectors, unsigned stripes) +{ + unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE); + struct mapped_device *md = dm_table_get_md(rs->ti->table); + struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info; + + /* Set read-ahead for the RAID set and the component devices. */ + if (ra_pages) { + unsigned p = rs->set.raid_devs; + + bdi->ra_pages = stripes * ra_pages * rs->set.data_devs; + + while (p--) { + struct request_queue *q = + bdev_get_queue(rs->dev[p].dev->bdev); + + q->backing_dev_info.ra_pages = ra_pages; + } + } + + dm_put(md); +} + +/* Set congested function. */ +static void rs_set_congested_fn(struct raid_set *rs) +{ + struct mapped_device *md = dm_table_get_md(rs->ti->table); + struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info; + + /* Set congested function and data. */ + bdi->congested_fn = rs_congested; + bdi->congested_data = rs; + dm_put(md); +} + +/* + * Construct a RAID4/5 mapping: + * + * log_type #log_params \ + * raid_type [#parity_dev] #raid_variable_params \ + * [locking "none"/"cluster"] + * #raid_devs #dev_to_initialize [ ]{3,} + * + * log_type = "core"/"disk", + * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only) + * log_params = [dirty_log_path] region_size [[no]sync]) + * + * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs" + * + * #parity_dev = N if raid_type = "raid4" + * o N = -1: pick default = last device + * o N >= 0 and < #raid_devs: parity device index + * + * #raid_variable_params = 0-7; raid_params (-1 = default): + * [chunk_size [#stripes [io_size [recover_io_size \ + * [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]] + * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8 + * and <= CHUNK_SIZE_MAX) + * o #stripes is number of stripes allocated to stripe cache + * (must be > 1 and < STRIPES_MAX) + * o io_size (io unit size per device in sectors; must be 2^^n and > 8) + * o recover_io_size (io unit size per device for recovery in sectors; + must be 2^^n, > SECTORS_PER_PAGE and <= region_size) + * o %recovery_bandwith is the maximum amount spend for recovery during + * application io (1-100%) + * o recovery switch = [sync|nosync] + * o #recovery_stripes is the number of recovery stripes used for + * parallel recovery of the RAID set + * If raid_variable_params = 0, defaults will be used. + * Any raid_variable_param can be set to -1 to apply a default + * + * #raid_devs = N (N >= 3) + * + * #dev_to_initialize = N + * -1: initialize parity on all devices + * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction + * of a failed devices content after replacement + * + * = device_path (eg, /dev/sdd1) + * = begin at offset on + * + */ +#define MIN_PARMS 13 +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int dev_to_init, dl_parms, i, locking_parms, + parity_parm, pi = -1, r, raid_devs; + unsigned speed; + sector_t tmp, sectors_per_dev; + struct dm_raid45_locking_type *locking; + struct raid_set *rs; + struct raid_type *raid_type; + struct variable_parms parms; + + /* Ensure minimum number of parameters. */ + if (argc < MIN_PARMS) + TI_ERR("Not enough parameters"); + + /* Fetch # of dirty log parameters. */ + if (sscanf(argv[1], "%d", &dl_parms) != 1 || + !range_ok(dl_parms, 1, 4711)) /* ;-) */ + TI_ERR("Bad dirty log parameters number"); + + /* Check raid_type. */ + raid_type = get_raid_type(argv[dl_parms + 2]); + if (!raid_type) + TI_ERR("Bad raid type"); + + /* In case of RAID4, parity drive is selectable. */ + parity_parm = !!(raid_type->level == raid4); + + /* Handle variable number of RAID parameters. */ + r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3, + &parms); + if (r) + return r; + + /* Handle any locking parameters. */ + r = get_raid_locking_parms(ti, + argv + dl_parms + parity_parm + + parms.raid_parms + 4, + &locking_parms, &locking); + if (r) + return r; + + /* # of raid devices. */ + i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4; + if (sscanf(argv[i], "%d", &raid_devs) != 1 || + raid_devs < raid_type->minimal_devs) + TI_ERR("Invalid number of raid devices"); + + /* In case of RAID4, check parity drive index is in limits. */ + if (raid_type->level == raid4) { + /* Fetch index of parity device. */ + if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 || + (pi != -1 && !range_ok(pi, 0, raid_devs - 1))) + TI_ERR("Invalid RAID4 parity device index"); + } + + /* + * Index of device to initialize starts at 0 + * + * o -1 -> don't initialize a selected device; + * initialize parity conforming to algorithm + * o 0..raid_devs-1 -> initialize respective device + * (used for reconstruction of a replaced device) + */ + if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms + + locking_parms + 5], "%d", &dev_to_init) != 1 || + !range_ok(dev_to_init, -1, raid_devs - 1)) + TI_ERR("Invalid number for raid device to initialize"); + + /* Check # of raid device arguments. */ + if (argc - dl_parms - parity_parm - parms.raid_parms - 6 != + 2 * raid_devs) + TI_ERR("Wrong number of raid device/offset arguments"); + + /* + * Check that the table length is devisable + * w/o rest by (raid_devs - parity_devs) + */ + if (!multiple(ti->len, raid_devs - raid_type->parity_devs, + §ors_per_dev)) + TI_ERR("Target length not divisible by number of data devices"); + + /* + * Check that the device size is + * devisable w/o rest by chunk size + */ + if (!multiple(sectors_per_dev, parms.chunk_size, &tmp)) + TI_ERR("Device length not divisible by chunk_size"); + + /**************************************************************** + * Now that we checked the constructor arguments -> + * let's allocate the RAID set + ****************************************************************/ + rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev, + ti, dl_parms, argv); + if (IS_ERR(rs)) + return PTR_ERR(rs); + + + rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init; + rs->set.pi = rs->set.pi_parm = pi; + + /* Set RAID4 parity drive index. */ + if (raid_type->level == raid4) + rs->set.pi = (pi == -1) ? rs->set.data_devs : pi; + + recover_set_bandwidth(rs, parms.bandwidth); + + /* Use locking type to lock stripe access. */ + rs->locking = locking; + + /* Get the device/offset tupels. */ + argv += dl_parms + 6 + parity_parm + parms.raid_parms; + r = dev_parms(rs, argv, &i); + if (r) + goto err; + + /* Set backing device information (eg. read ahead). */ + rs_set_read_ahead(rs, 2 * rs->set.chunk_size, 4 /* stripes */); + rs_set_congested_fn(rs); /* Set congested function. */ + SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */ + speed = xor_optimize(rs); /* Select best xor algorithm. */ + + /* Set for recovery of any nosync regions. */ + if (parms.recovery) + SetRSRecover(rs); + else { + /* + * Need to free recovery stripe(s) here in case + * of nosync, because xor_optimize uses one. + */ + set_start_recovery(rs); + set_end_recovery(rs); + stripe_recover_free(rs); + } + + /* + * Make sure that dm core only hands maximum io size + * length down and pays attention to io boundaries. + */ + ti->split_io = rs->set.io_size; + ti->private = rs; + + /* Initialize work queue to handle this RAID set's io. */ + r = rs_workqueue_init(rs); + if (r) + goto err; + + rs_log(rs, speed); /* Log information about RAID set. */ + return 0; + +err: + context_free(rs, i); + return r; +} + +/* + * Destruct a raid mapping + */ +static void raid_dtr(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + destroy_workqueue(rs->io.wq); + context_free(rs, rs->set.raid_devs); +} + +/* Raid mapping function. */ +static int raid_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + /* I don't want to waste stripe cache capacity. */ + if (bio_rw(bio) == READA) + return -EIO; + else { + struct raid_set *rs = ti->private; + + /* + * Get io reference to be waiting for to drop + * to zero on device suspension/destruction. + */ + io_get(rs); + bio->bi_sector -= ti->begin; /* Remap sector. */ + + /* Queue io to RAID set. */ + mutex_lock(&rs->io.in_lock); + bio_list_add(&rs->io.in, bio); + mutex_unlock(&rs->io.in_lock); + + /* Wake daemon to process input list. */ + wake_do_raid(rs); + + /* REMOVEME: statistics. */ + atomic_inc(rs->stats + (bio_data_dir(bio) == READ ? + S_BIOS_READ : S_BIOS_WRITE)); + return DM_MAPIO_SUBMITTED; /* Handle later. */ + } +} + +/* Device suspend. */ +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + struct dm_dirty_log *dl = rs->recover.dl; + + SetRSSuspend(rs); + + if (RSRecover(rs)) + dm_rh_stop_recovery(rs->recover.rh); + + cancel_delayed_work(&rs->io.dws_do_raid); + flush_workqueue(rs->io.wq); + wait_ios(rs); /* Wait for completion of all ios being processed. */ + + if (dl->type->presuspend && dl->type->presuspend(dl)) + /* FIXME: need better error handling. */ + DMWARN("log presuspend failed"); +} + +static void raid_postsuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + struct dm_dirty_log *dl = rs->recover.dl; + + if (dl->type->postsuspend && dl->type->postsuspend(dl)) + /* FIXME: need better error handling. */ + DMWARN("log postsuspend failed"); + +} + +/* Device resume. */ +static void raid_resume(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + struct recover *rec = &rs->recover; + struct dm_dirty_log *dl = rec->dl; + + if (dl->type->resume && dl->type->resume(dl)) + /* Resume dirty log. */ + /* FIXME: need better error handling. */ + DMWARN("log resume failed"); + + rec->nr_regions_to_recover = + rec->nr_regions - dl->type->get_sync_count(dl); + + /* Restart any unfinished recovery. */ + if (RSRecover(rs)) { + set_start_recovery(rs); + dm_rh_start_recovery(rec->rh); + } + + ClearRSSuspend(rs); + wake_do_raid(rs); +} + +/* Return stripe cache size. */ +static unsigned sc_size(struct raid_set *rs) +{ + return to_sector(atomic_read(&rs->sc.stripes) * + (sizeof(struct stripe) + + (sizeof(struct stripe_chunk) + + (sizeof(struct page_list) + + to_bytes(rs->set.io_size) * + rs->set.raid_devs)) + + (rs->recover.end_jiffies ? + 0 : rs->recover.recovery_stripes * + to_bytes(rs->set.raid_devs * rs->recover.io_size)))); +} + +/* REMOVEME: status output for development. */ +static void raid_devel_stats(struct dm_target *ti, char *result, + unsigned *size, unsigned maxlen) +{ + unsigned sz = *size; + unsigned long j; + char buf[BDEVNAME_SIZE], *p; + struct stats_map *sm; + struct raid_set *rs = ti->private; + struct recover *rec = &rs->recover; + struct timespec ts; + + DMEMIT("%s %s %u\n", version, rs->xor.f->name, rs->xor.chunks); + DMEMIT("act_ios=%d ", io_ref(rs)); + DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max)); + DMEMIT("act_stripes=%d ", sc_active(&rs->sc)); + DMEMIT("act_stripes_max=%d\n", + atomic_read(&rs->sc.active_stripes_max)); + + for (sm = stats_map; sm < ARRAY_END(stats_map); sm++) + DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type)); + + DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off"); + DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size, + atomic_read(&rs->sc.stripes), rs->set.io_size, + rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets, + sc_size(rs)); + + j = (rec->end_jiffies ? rec->end_jiffies : jiffies) - + rec->start_jiffies; + jiffies_to_timespec(j, &ts); + sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec); + p = strchr(buf, '.'); + p[3] = 0; + + DMEMIT("rg=%llu/%llu/%llu/%u %s\n", + (unsigned long long) rec->nr_regions_recovered, + (unsigned long long) rec->nr_regions_to_recover, + (unsigned long long) rec->nr_regions, rec->bandwidth, buf); + + *size = sz; +} + +static int raid_status(struct dm_target *ti, status_type_t type, + char *result, unsigned maxlen) +{ + unsigned p, sz = 0; + char buf[BDEVNAME_SIZE]; + struct raid_set *rs = ti->private; + int raid_parms[] = { + rs->set.chunk_size_parm, + rs->sc.stripes_parm, + rs->set.io_size_parm, + rs->recover.io_size_parm, + rs->recover.bandwidth_parm, + -2, + rs->recover.recovery_stripes, + }; + + switch (type) { + case STATUSTYPE_INFO: + /* REMOVEME: statistics. */ + if (RSDevelStats(rs)) + raid_devel_stats(ti, result, &sz, maxlen); + + DMEMIT("%u ", rs->set.raid_devs); + + for (p = 0; p < rs->set.raid_devs; p++) + DMEMIT("%s ", + format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev)); + + DMEMIT("1 "); + for (p = 0; p < rs->set.raid_devs; p++) { + DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D'); + + if (p == rs->set.pi) + DMEMIT("p"); + + if (rs->set.dev_to_init == p) + DMEMIT("i"); + } + + break; + case STATUSTYPE_TABLE: + sz = rs->recover.dl->type->status(rs->recover.dl, type, + result, maxlen); + DMEMIT("%s %u ", rs->set.raid_type->name, + rs->set.raid_parms); + + for (p = 0; p < rs->set.raid_parms; p++) { + if (raid_parms[p] > -2) + DMEMIT("%d ", raid_parms[p]); + else + DMEMIT("%s ", rs->recover.recovery ? + "sync" : "nosync"); + } + + DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init); + + for (p = 0; p < rs->set.raid_devs; p++) + DMEMIT("%s %llu ", + format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev), + (unsigned long long) rs->dev[p].start); + } + + return 0; +} + +/* + * Message interface + */ +enum raid_msg_actions { + act_bw, /* Recovery bandwidth switch. */ + act_dev, /* Device failure switch. */ + act_overwrite, /* Stripe overwrite check. */ + act_stats, /* Development statistics switch. */ + act_sc, /* Stripe cache switch. */ + + act_on, /* Set entity on. */ + act_off, /* Set entity off. */ + act_reset, /* Reset entity. */ + + act_set = act_on, /* Set # absolute. */ + act_grow = act_off, /* Grow # by an amount. */ + act_shrink = act_reset, /* Shrink # by an amount. */ +}; + +/* Turn a delta into an absolute value. */ +static int _absolute(unsigned long action, int act, int r) +{ + /* Make delta absolute. */ + if (test_bit(act_set, &action)) + ; + else if (test_bit(act_grow, &action)) + r += act; + else if (test_bit(act_shrink, &action)) + r = act - r; + else + r = -EINVAL; + + return r; +} + + /* Change recovery io bandwidth. */ +static int bandwidth_change(struct dm_msg *msg, void *context) +{ + struct raid_set *rs = context; + int act = rs->recover.bandwidth; + int bandwidth = DM_MSG_INT_ARG(msg); + + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) { + /* Make delta bandwidth absolute. */ + bandwidth = _absolute(msg->action, act, bandwidth); + + /* Check range. */ + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) { + recover_set_bandwidth(rs, bandwidth); + return 0; + } + } + + set_bit(dm_msg_ret_arg, &msg->ret); + set_bit(dm_msg_ret_inval, &msg->ret); + return -EINVAL; +} + +/* Set/reset development feature flags. */ +static int devel_flags(struct dm_msg *msg, void *context) +{ + struct raid_set *rs = context; + + if (test_bit(act_on, &msg->action)) + return test_and_set_bit(msg->spec->parm, + &rs->io.flags) ? -EPERM : 0; + else if (test_bit(act_off, &msg->action)) + return test_and_clear_bit(msg->spec->parm, + &rs->io.flags) ? 0 : -EPERM; + else if (test_bit(act_reset, &msg->action)) { + if (test_bit(act_stats, &msg->action)) { + stats_reset(rs); + goto on; + } else if (test_bit(act_overwrite, &msg->action)) { +on: + set_bit(msg->spec->parm, &rs->io.flags); + return 0; + } + } + + return -EINVAL; +} + +/* Resize the stripe cache. */ +static int sc_resize(struct dm_msg *msg, void *context) +{ + int act, stripes; + struct raid_set *rs = context; + + /* Deny permission in case the daemon is still resizing!. */ + if (atomic_read(&rs->sc.stripes_to_set)) + return -EPERM; + + stripes = DM_MSG_INT_ARG(msg); + if (stripes > 0) { + act = atomic_read(&rs->sc.stripes); + + /* Make delta stripes absolute. */ + stripes = _absolute(msg->action, act, stripes); + + /* + * Check range and that the # of stripes changes. + * We leave the resizing to the wroker. + */ + if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) && + stripes != atomic_read(&rs->sc.stripes)) { + atomic_set(&rs->sc.stripes_to_set, stripes); + wake_do_raid(rs); + return 0; + } + } + + set_bit(dm_msg_ret_arg, &msg->ret); + set_bit(dm_msg_ret_inval, &msg->ret); + return -EINVAL; +} + +/* Parse the RAID message action. */ +/* + * 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g 'ba se 50' + * "o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of' + * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of' + * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024' + * + */ +static int raid_message(struct dm_target *ti, unsigned argc, char **argv) +{ + /* Variables to store the parsed parameters im. */ + static int i[2]; + static unsigned long *i_arg[] = { + (unsigned long *) i + 0, + (unsigned long *) i + 1, + }; + + /* Declare all message option strings. */ + static char *str_sgs[] = { "set", "grow", "shrink" }; + static char *str_oor[] = { "on", "off", "reset" }; + + /* Declare all actions. */ + static unsigned long act_sgs[] = { act_set, act_grow, act_shrink }; + static unsigned long act_oor[] = { act_on, act_off, act_reset }; + + /* Bandwidth option. */ + static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs }; + static struct dm_message_argument bw_args = { + 1, i_arg, { dm_msg_int_t } + }; + + static struct dm_message_argument null_args = { + 0, NULL, { dm_msg_int_t } + }; + + /* Overwrite and statistics option. */ + static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor }; + + /* Sripecache option. */ + static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs }; + + /* Declare messages. */ + static struct dm_msg_spec specs[] = { + { "bandwidth", act_bw, &bw_opt, &bw_args, + 0, bandwidth_change }, + { "overwrite", act_overwrite, &ovr_stats_opt, &null_args, + RS_CHECK_OVERWRITE, devel_flags }, + { "statistics", act_stats, &ovr_stats_opt, &null_args, + RS_DEVEL_STATS, devel_flags }, + { "stripecache", act_sc, &stripe_opt, &bw_args, + 0, sc_resize }, + }; + + /* The message for the parser. */ + struct dm_msg msg = { + .num_specs = ARRAY_SIZE(specs), + .specs = specs, + }; + + return dm_message_parse(TARGET, &msg, ti->private, argc, argv); +} +/* + * END message interface + */ + +static struct target_type raid_target = { + .name = "raid45", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = raid_ctr, + .dtr = raid_dtr, + .map = raid_map, + .presuspend = raid_presuspend, + .postsuspend = raid_postsuspend, + .resume = raid_resume, + .status = raid_status, + .message = raid_message, +}; + +static void init_exit(const char *bad_msg, const char *good_msg, int r) +{ + if (r) + DMERR("Failed to %sregister target [%d]", bad_msg, r); + else + DMINFO("%s %s", good_msg, version); +} + +static int __init dm_raid_init(void) +{ + int r = dm_register_target(&raid_target); + + init_exit("", "initialized", r); + return r; +} + +static void __exit dm_raid_exit(void) +{ + dm_unregister_target(&raid_target); + init_exit("un", "exit", 0); +} + +/* Module hooks. */ +module_init(dm_raid_init); +module_exit(dm_raid_exit); + +MODULE_DESCRIPTION(DM_NAME " raid4/5 target"); +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("dm-raid4"); +MODULE_ALIAS("dm-raid5"); diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.h b/ubuntu/dm-raid4-5/dm-raid4-5.h new file mode 100644 index 000000000000..a0fe7c0621fa --- /dev/null +++ b/ubuntu/dm-raid4-5/dm-raid4-5.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2006 Red Hat GmbH + * + * Module Author: Heinz Mauelshagen (Mauelshagen@RedHat.com) + * + * This file is released under the GPL. + * + */ + +#ifndef _DM_RAID45_H +#define _DM_RAID45_H + +/* Factor out to dm.h! */ +#define STR_LEN(ptr, str) ptr, str, strlen(ptr) + +enum lock_type { RAID45_EX, RAID45_SHARED }; + +struct dmraid45_locking_type { + /* Request a lock on a stripe. */ + void* (*lock)(sector_t key, enum lock_type type); + + /* Release a lock on a stripe. */ + void (*unlock)(void *lock_handle); + +}; + +#endif diff --git a/ubuntu/dm-raid4-5/dm-raid45.h b/ubuntu/dm-raid4-5/dm-raid45.h new file mode 100644 index 000000000000..786ba7af6425 --- /dev/null +++ b/ubuntu/dm-raid4-5/dm-raid45.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen (Mauelshagen@RedHat.com) + * + * Locking definitions for the device-mapper RAID45 target. + * + * This file is released under the GPL. + * + */ + +#ifndef _DM_RAID45_H +#define _DM_RAID45_H + +/* Factor out to dm.h! */ +#define STR_LEN(ptr, str) (ptr), (str), strlen((ptr)) + +enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED }; + +struct dm_raid45_locking_type { + /* Request a lock on a stripe. */ + void* (*lock)(sector_t key, enum dm_lock_type type); + + /* Release a lock on a stripe. */ + void (*unlock)(void *lock_handle); +}; + +#endif diff --git a/ubuntu/dm-raid4-5/dm-region-hash.c b/ubuntu/dm-raid4-5/dm-region-hash.c new file mode 100644 index 000000000000..3d33af6be8f6 --- /dev/null +++ b/ubuntu/dm-raid4-5/dm-region-hash.c @@ -0,0 +1,718 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include +#include "dm-region-hash.h" + +#include +#include +#include +#include + +#include "dm.h" + +#define DM_MSG_PREFIX "region hash" + +/*----------------------------------------------------------------- + * Region hash + * + * The mirror splits itself up into discrete regions. Each + * region can be in one of three states: clean, dirty, + * nosync. There is no need to put clean regions in the hash. + * + * In addition to being present in the hash table a region _may_ + * be present on one of three lists. + * + * clean_regions: Regions on this list have no io pending to + * them, they are in sync, we are no longer interested in them, + * they are dull. dm_rh_update_states() will remove them from the + * hash table. + * + * quiesced_regions: These regions have been spun down, ready + * for recovery. rh_recovery_start() will remove regions from + * this list and hand them to kmirrord, which will schedule the + * recovery io with kcopyd. + * + * recovered_regions: Regions that kcopyd has successfully + * recovered. dm_rh_update_states() will now schedule any delayed + * io, up the recovery_count, and remove the region from the + * hash. + * + * There are 2 locks: + * A rw spin lock 'hash_lock' protects just the hash table, + * this is never held in write mode from interrupt context, + * which I believe means that we only have to disable irqs when + * doing a write lock. + * + * An ordinary spin lock 'region_lock' that protects the three + * lists in the region_hash, with the 'state', 'list' and + * 'delayed_bios' fields of the regions. This is used from irq + * context, so all other uses will have to suspend local irqs. + *---------------------------------------------------------------*/ +struct dm_region_hash { + uint32_t region_size; + unsigned region_shift; + + /* holds persistent region state */ + struct dm_dirty_log *log; + + /* hash table */ + rwlock_t hash_lock; + mempool_t *region_pool; + unsigned mask; + unsigned nr_buckets; + unsigned prime; + unsigned shift; + struct list_head *buckets; + + unsigned max_recovery; /* Max # of regions to recover in parallel */ + + spinlock_t region_lock; + atomic_t recovery_in_flight; + struct semaphore recovery_count; + struct list_head clean_regions; + struct list_head quiesced_regions; + struct list_head recovered_regions; + struct list_head failed_recovered_regions; + + void *context; + sector_t target_begin; + + /* Callback function to schedule bios writes */ + void (*dispatch_bios)(void *context, struct bio_list *bios); + + /* Callback function to wakeup callers worker thread. */ + void (*wakeup_workers)(void *context); + + /* Callback function to wakeup callers recovery waiters. */ + void (*wakeup_all_recovery_waiters)(void *context); +}; + +struct dm_region { + struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ + region_t key; + int state; + + struct list_head hash_list; + struct list_head list; + + atomic_t pending; + struct bio_list delayed_bios; +}; + +/* + * Conversion fns + */ +region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector) +{ + return sector >> rh->region_shift; +} +// EXPORT_SYMBOL_GPL(dm_rh_sector_to_region); + +sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) +{ + return region << rh->region_shift; +} +// EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); + +region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) +{ + return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); +} +// EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); + +void *dm_rh_region_context(struct dm_region *reg) +{ + return reg->rh->context; +} +// EXPORT_SYMBOL_GPL(dm_rh_region_context); + +region_t dm_rh_get_region_key(struct dm_region *reg) +{ + return reg->key; +} +// EXPORT_SYMBOL_GPL(dm_rh_get_region_key); + +sector_t dm_rh_get_region_size(struct dm_region_hash *rh) +{ + return rh->region_size; +} +// EXPORT_SYMBOL_GPL(dm_rh_get_region_size); + +/* + * FIXME: shall we pass in a structure instead of all these args to + * dm_region_hash_create()???? + */ +#define RH_HASH_MULT 2654435387U +#define RH_HASH_SHIFT 12 + +#define MIN_REGIONS 64 +struct dm_region_hash *dm_region_hash_create( + void *context, void (*dispatch_bios)(void *context, + struct bio_list *bios), + void (*wakeup_workers)(void *context), + void (*wakeup_all_recovery_waiters)(void *context), + sector_t target_begin, unsigned max_recovery, + struct dm_dirty_log *log, uint32_t region_size, + region_t nr_regions) +{ + struct dm_region_hash *rh; + unsigned nr_buckets, max_buckets; + size_t i; + + /* + * Calculate a suitable number of buckets for our hash + * table. + */ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) + ; + nr_buckets >>= 1; + + rh = kmalloc(sizeof(*rh), GFP_KERNEL); + if (!rh) { + DMERR("unable to allocate region hash memory"); + return ERR_PTR(-ENOMEM); + } + + rh->context = context; + rh->dispatch_bios = dispatch_bios; + rh->wakeup_workers = wakeup_workers; + rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; + rh->target_begin = target_begin; + rh->max_recovery = max_recovery; + rh->log = log; + rh->region_size = region_size; + rh->region_shift = ffs(region_size) - 1; + rwlock_init(&rh->hash_lock); + rh->mask = nr_buckets - 1; + rh->nr_buckets = nr_buckets; + + rh->shift = RH_HASH_SHIFT; + rh->prime = RH_HASH_MULT; + + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); + if (!rh->buckets) { + DMERR("unable to allocate region hash bucket memory"); + kfree(rh); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(rh->buckets + i); + + spin_lock_init(&rh->region_lock); + sema_init(&rh->recovery_count, 0); + atomic_set(&rh->recovery_in_flight, 0); + INIT_LIST_HEAD(&rh->clean_regions); + INIT_LIST_HEAD(&rh->quiesced_regions); + INIT_LIST_HEAD(&rh->recovered_regions); + INIT_LIST_HEAD(&rh->failed_recovered_regions); + + rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, + sizeof(struct dm_region)); + if (!rh->region_pool) { + vfree(rh->buckets); + kfree(rh); + rh = ERR_PTR(-ENOMEM); + } + + return rh; +} +// EXPORT_SYMBOL_GPL(dm_region_hash_create); + +void dm_region_hash_destroy(struct dm_region_hash *rh) +{ + unsigned h; + struct dm_region *reg, *nreg; + + BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { + list_for_each_entry_safe(reg, nreg, rh->buckets + h, + hash_list) { + BUG_ON(atomic_read(®->pending)); + mempool_free(reg, rh->region_pool); + } + } + + if (rh->log) + dm_dirty_log_destroy(rh->log); + + if (rh->region_pool) + mempool_destroy(rh->region_pool); + + vfree(rh->buckets); + kfree(rh); +} +// EXPORT_SYMBOL_GPL(dm_region_hash_destroy); + +struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) +{ + return rh->log; +} +// EXPORT_SYMBOL_GPL(dm_rh_dirty_log); + +static unsigned rh_hash(struct dm_region_hash *rh, region_t region) +{ + return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; +} + +static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + struct list_head *bucket = rh->buckets + rh_hash(rh, region); + + list_for_each_entry(reg, bucket, hash_list) + if (reg->key == region) + return reg; + + return NULL; +} + +static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg) +{ + list_add(®->hash_list, rh->buckets + rh_hash(rh, reg->key)); +} + +static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg, *nreg; + + nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); + if (unlikely(!nreg)) + nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); + + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? + DM_RH_CLEAN : DM_RH_NOSYNC; + nreg->rh = rh; + nreg->key = region; + INIT_LIST_HEAD(&nreg->list); + atomic_set(&nreg->pending, 0); + bio_list_init(&nreg->delayed_bios); + + write_lock_irq(&rh->hash_lock); + reg = __rh_lookup(rh, region); + if (reg) + /* We lost the race. */ + mempool_free(nreg, rh->region_pool); + else { + __rh_insert(rh, nreg); + if (nreg->state == DM_RH_CLEAN) { + spin_lock(&rh->region_lock); + list_add(&nreg->list, &rh->clean_regions); + spin_unlock(&rh->region_lock); + } + + reg = nreg; + } + write_unlock_irq(&rh->hash_lock); + + return reg; +} + +static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + + reg = __rh_lookup(rh, region); + if (!reg) { + read_unlock(&rh->hash_lock); + reg = __rh_alloc(rh, region); + read_lock(&rh->hash_lock); + } + + return reg; +} + +int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block) +{ + int r; + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (reg) + return reg->state; + + /* + * The region wasn't in the hash, so we fall back to the + * dirty log. + */ + r = rh->log->type->in_sync(rh->log, region, may_block); + + /* + * Any error from the dirty log (eg. -EWOULDBLOCK) gets + * taken as a DM_RH_NOSYNC + */ + return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; +} +// EXPORT_SYMBOL_GPL(dm_rh_get_state); + +static void complete_resync_work(struct dm_region *reg, int success) +{ + struct dm_region_hash *rh = reg->rh; + + rh->log->type->set_region_sync(rh->log, reg->key, success); + + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. + */ + rh->dispatch_bios(rh->context, ®->delayed_bios); + if (atomic_dec_and_test(&rh->recovery_in_flight)) + rh->wakeup_all_recovery_waiters(rh->context); + up(&rh->recovery_count); +} + +/* dm_rh_mark_nosync + * @ms + * @bio + * @done + * @error + * + * The bio was written on some mirror(s) but failed on other mirror(s). + * We can successfully endio the bio but should avoid the region being + * marked clean by setting the state DM_RH_NOSYNC. + * + * This function is _not_ safe in interrupt context! + */ +void dm_rh_mark_nosync(struct dm_region_hash *rh, + struct bio *bio, unsigned done, int error) +{ + unsigned long flags; + struct dm_dirty_log *log = rh->log; + struct dm_region *reg; + region_t region = dm_rh_bio_to_region(rh, bio); + int recovering = 0; + + /* We must inform the log that the sync count has changed. */ + log->type->set_region_sync(log, region, 0); + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + /* region hash entry should exist because write was in-flight */ + BUG_ON(!reg); + BUG_ON(!list_empty(®->list)); + + spin_lock_irqsave(&rh->region_lock, flags); + /* + * Possible cases: + * 1) DM_RH_DIRTY + * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed + * 3) DM_RH_RECOVERING: flushing pending writes + * Either case, the region should have not been connected to list. + */ + recovering = (reg->state == DM_RH_RECOVERING); + reg->state = DM_RH_NOSYNC; + BUG_ON(!list_empty(®->list)); + spin_unlock_irqrestore(&rh->region_lock, flags); + + bio_endio(bio, error); + if (recovering) + complete_resync_work(reg, 0); +} +// EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); + +void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) +{ + struct dm_region *reg, *next; + + LIST_HEAD(clean); + LIST_HEAD(recovered); + LIST_HEAD(failed_recovered); + + /* + * Quickly grab the lists. + */ + write_lock_irq(&rh->hash_lock); + spin_lock(&rh->region_lock); + if (!list_empty(&rh->clean_regions)) { + list_splice_init(&rh->clean_regions, &clean); + + list_for_each_entry(reg, &clean, list) + list_del(®->hash_list); + } + + if (!list_empty(&rh->recovered_regions)) { + list_splice_init(&rh->recovered_regions, &recovered); + + list_for_each_entry(reg, &recovered, list) + list_del(®->hash_list); + } + + if (!list_empty(&rh->failed_recovered_regions)) { + list_splice_init(&rh->failed_recovered_regions, + &failed_recovered); + + list_for_each_entry(reg, &failed_recovered, list) + list_del(®->hash_list); + } + + spin_unlock(&rh->region_lock); + write_unlock_irq(&rh->hash_lock); + + /* + * All the regions on the recovered and clean lists have + * now been pulled out of the system, so no need to do + * any more locking. + */ + list_for_each_entry_safe(reg, next, &recovered, list) { + rh->log->type->clear_region(rh->log, reg->key); + complete_resync_work(reg, 1); + mempool_free(reg, rh->region_pool); + } + + list_for_each_entry_safe(reg, next, &failed_recovered, list) { + complete_resync_work(reg, errors_handled ? 0 : 1); + mempool_free(reg, rh->region_pool); + } + + list_for_each_entry_safe(reg, next, &clean, list) { + rh->log->type->clear_region(rh->log, reg->key); + mempool_free(reg, rh->region_pool); + } + + rh->log->type->flush(rh->log); +} +// EXPORT_SYMBOL_GPL(dm_rh_update_states); + +void dm_rh_inc(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + + spin_lock_irq(&rh->region_lock); + atomic_inc(®->pending); + + if (reg->state == DM_RH_CLEAN) { + reg->state = DM_RH_DIRTY; + list_del_init(®->list); /* take off the clean list */ + spin_unlock_irq(&rh->region_lock); + + rh->log->type->mark_region(rh->log, reg->key); + } else + spin_unlock_irq(&rh->region_lock); + + + read_unlock(&rh->hash_lock); +} +// EXPORT_SYMBOL_GPL(dm_rh_inc); + +void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) +{ + struct bio *bio; + + for (bio = bios->head; bio; bio = bio->bi_next) + dm_rh_inc(rh, dm_rh_bio_to_region(rh, bio)); +} +// EXPORT_SYMBOL_GPL(dm_rh_inc_pending); + +void dm_rh_dec(struct dm_region_hash *rh, region_t region) +{ + unsigned long flags; + struct dm_region *reg; + int should_wake = 0; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irqsave(&rh->region_lock, flags); + if (atomic_dec_and_test(®->pending)) { + /* + * There is no pending I/O for this region. + * We can move the region to corresponding list for next action. + * At this point, the region is not yet connected to any list. + * + * If the state is DM_RH_NOSYNC, the region should be kept off + * from clean list. + * The hash entry for DM_RH_NOSYNC will remain in memory + * until the region is recovered or the map is reloaded. + */ + + /* do nothing for DM_RH_NOSYNC */ + if (reg->state == DM_RH_RECOVERING) { + list_add_tail(®->list, &rh->quiesced_regions); + } else if (reg->state == DM_RH_DIRTY) { + reg->state = DM_RH_CLEAN; + list_add(®->list, &rh->clean_regions); + } + should_wake = 1; + } + spin_unlock_irqrestore(&rh->region_lock, flags); + + if (should_wake) + rh->wakeup_workers(rh->context); +} +// EXPORT_SYMBOL_GPL(dm_rh_dec); + +/* + * Starts quiescing a region in preparation for recovery. + */ +static int __rh_recovery_prepare(struct dm_region_hash *rh) +{ + int r; + region_t region; + struct dm_region *reg; + + /* + * Ask the dirty log what's next. + */ + r = rh->log->type->get_resync_work(rh->log, ®ion); + if (r <= 0) + return r; + + /* + * Get this region, and start it quiescing by setting the + * recovering flag. + */ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_RECOVERING; + + /* Already quiesced ? */ + if (atomic_read(®->pending)) + list_del_init(®->list); + else + list_move(®->list, &rh->quiesced_regions); + + spin_unlock_irq(&rh->region_lock); + + return 1; +} + +void dm_rh_recovery_prepare(struct dm_region_hash *rh) +{ + /* Extra reference to avoid race with dm_rh_stop_recovery */ + atomic_inc(&rh->recovery_in_flight); + + while (!down_trylock(&rh->recovery_count)) { + atomic_inc(&rh->recovery_in_flight); + if (__rh_recovery_prepare(rh) <= 0) { + atomic_dec(&rh->recovery_in_flight); + up(&rh->recovery_count); + break; + } + } + + /* Drop the extra reference */ + if (atomic_dec_and_test(&rh->recovery_in_flight)) + rh->wakeup_all_recovery_waiters(rh->context); +} +// EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); + +/* + * Returns any quiesced regions. + */ +struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh) +{ + struct dm_region *reg = NULL; + + spin_lock_irq(&rh->region_lock); + if (!list_empty(&rh->quiesced_regions)) { + reg = list_entry(rh->quiesced_regions.next, + struct dm_region, list); + list_del_init(®->list); /* remove from the quiesced list */ + } + spin_unlock_irq(&rh->region_lock); + + return reg; +} +// EXPORT_SYMBOL_GPL(dm_rh_recovery_start); + +void dm_rh_recovery_end(struct dm_region *reg, int success) +{ + struct dm_region_hash *rh = reg->rh; + + spin_lock_irq(&rh->region_lock); + if (success) + list_add(®->list, ®->rh->recovered_regions); + else { + reg->state = DM_RH_NOSYNC; + list_add(®->list, ®->rh->failed_recovered_regions); + } + spin_unlock_irq(&rh->region_lock); + + rh->wakeup_workers(rh->context); +} +// EXPORT_SYMBOL_GPL(dm_rh_recovery_end); + +/* Return recovery in flight count. */ +int dm_rh_recovery_in_flight(struct dm_region_hash *rh) +{ + return atomic_read(&rh->recovery_in_flight); +} +// EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); + +int dm_rh_flush(struct dm_region_hash *rh) +{ + return rh->log->type->flush(rh->log); +} +// EXPORT_SYMBOL_GPL(dm_rh_flush); + +void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +{ + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); + bio_list_add(®->delayed_bios, bio); + read_unlock(&rh->hash_lock); +} +// EXPORT_SYMBOL_GPL(dm_rh_delay); + +void dm_rh_delay_by_region(struct dm_region_hash *rh, + struct bio *bio, region_t region) +{ + struct dm_region *reg; + + /* FIXME: locking. */ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + bio_list_add(®->delayed_bios, bio); + read_unlock(&rh->hash_lock); +} +// EXPORT_SYMBOL_GPL(dm_rh_delay_by_region); + +void dm_rh_stop_recovery(struct dm_region_hash *rh) +{ + int i; + + /* wait for any recovering regions */ + for (i = 0; i < rh->max_recovery; i++) + down(&rh->recovery_count); +} +// EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); + +void dm_rh_start_recovery(struct dm_region_hash *rh) +{ + int i; + + for (i = 0; i < rh->max_recovery; i++) + up(&rh->recovery_count); + + rh->wakeup_workers(rh->context); +} +// EXPORT_SYMBOL_GPL(dm_rh_start_recovery); + +MODULE_DESCRIPTION(DM_NAME " region hash"); +MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); diff --git a/ubuntu/dm-raid4-5/dm-region-hash.h b/ubuntu/dm-raid4-5/dm-region-hash.h new file mode 100644 index 000000000000..bfd21cb9f768 --- /dev/null +++ b/ubuntu/dm-raid4-5/dm-region-hash.h @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * Device-Mapper dirty region hash interface. + * + * This file is released under the GPL. + */ + +#ifndef DM_REGION_HASH_H +#define DM_REGION_HASH_H + +#include + +/*----------------------------------------------------------------- + * Region hash + *----------------------------------------------------------------*/ +struct dm_region_hash; +struct dm_region; + +/* + * States a region can have. + */ +enum dm_rh_region_states { + DM_RH_CLEAN = 0x01, /* No writes in flight. */ + DM_RH_DIRTY = 0x02, /* Writes in flight. */ + DM_RH_NOSYNC = 0x04, /* Out of sync. */ + DM_RH_RECOVERING = 0x08, /* Under resynchronization. */ +}; + +/* + * Region hash create/destroy. + */ +struct bio_list; +struct dm_region_hash *dm_region_hash_create( + void *context, void (*dispatch_bios)(void *context, + struct bio_list *bios), + void (*wakeup_workers)(void *context), + void (*wakeup_all_recovery_waiters)(void *context), + sector_t target_begin, unsigned max_recovery, + struct dm_dirty_log *log, uint32_t region_size, + region_t nr_regions); +void dm_region_hash_destroy(struct dm_region_hash *rh); + +struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh); + +/* + * Conversion functions. + */ +region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio); +sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region); +region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector); +void *dm_rh_region_context(struct dm_region *reg); + +/* + * Get region size and key (ie. number of the region). + */ +sector_t dm_rh_get_region_size(struct dm_region_hash *rh); +region_t dm_rh_get_region_key(struct dm_region *reg); + +/* + * Get/set/update region state (and dirty log). + * + */ +int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block); +void dm_rh_set_state(struct dm_region_hash *rh, region_t region, + enum dm_rh_region_states state, int may_block); + +/* Non-zero errors_handled leaves the state of the region NOSYNC */ +void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled); + +/* Flush the region hash and dirty log. */ +int dm_rh_flush(struct dm_region_hash *rh); + +/* Inc/dec pending count on regions. */ +void dm_rh_inc(struct dm_region_hash *rh, region_t region); +void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios); +void dm_rh_dec(struct dm_region_hash *rh, region_t region); + +/* Delay bios on regions. */ +void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio); +void dm_rh_delay_by_region(struct dm_region_hash *rh, struct bio *bio, + region_t region); + +void dm_rh_mark_nosync(struct dm_region_hash *rh, + struct bio *bio, unsigned done, int error); + +/* + * Region recovery control. + */ + +/* Prepare some regions for recovery by starting to quiesce them. */ +void dm_rh_recovery_prepare(struct dm_region_hash *rh); + +/* Try fetching a quiesced region for recovery. */ +struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh); + +/* Report recovery end on a region. */ +void dm_rh_recovery_end(struct dm_region *reg, int error); + +/* Returns number of regions with recovery work outstanding. */ +int dm_rh_recovery_in_flight(struct dm_region_hash *rh); + +/* Start/stop recovery. */ +void dm_rh_start_recovery(struct dm_region_hash *rh); +void dm_rh_stop_recovery(struct dm_region_hash *rh); + +#endif /* DM_REGION_HASH_H */ -- cgit v1.2.3 From 74414500c31298947b9124c679622b07f04651f0 Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Tue, 14 Oct 2008 11:30:10 -0400 Subject: UBUNTU: SAUCE: (no-up) Modularize vesafb Signed-off-by: Ben Collins --- drivers/video/Kconfig | 4 ++-- drivers/video/vesafb.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index b6ea1b569100..b786a00023ab 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -755,8 +755,8 @@ config FB_UVESA If unsure, say N. config FB_VESA - bool "VESA VGA graphics support" - depends on (FB = y) && X86 + tristate "VESA VGA graphics support" + depends on FB && X86 select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index 501b3406c6d5..97af4869c5a6 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -27,6 +27,12 @@ #define dac_reg (0x3c8) #define dac_val (0x3c9) +struct vesafb_info +{ + u32 pseudo_palette[256]; + int mtrr_hdl; +}; + /* --------------------------------------------------------------------- */ static struct fb_var_screeninfo vesafb_defined __initdata = { @@ -46,16 +52,37 @@ static struct fb_fix_screeninfo vesafb_fix __initdata = { .accel = FB_ACCEL_NONE, }; +#ifndef MODULE static int inverse __read_mostly; +#endif static int mtrr __read_mostly; /* disable mtrr */ static int vram_remap __initdata; /* Set amount of memory to be used */ static int vram_total __initdata; /* Set total amount of memory */ static int pmi_setpal __read_mostly = 1; /* pmi for palette changes ??? */ +static int redraw __read_mostly; static int ypan __read_mostly; /* 0..nothing, 1..ypan, 2..ywrap */ +static int ywrap __read_mostly; static void (*pmi_start)(void) __read_mostly; static void (*pmi_pal) (void) __read_mostly; static int depth __read_mostly; static int vga_compat __read_mostly; + +module_param(redraw, bool, 0); +module_param(ypan, bool, 0); +module_param(ywrap, bool, 0); +module_param_named(vgapal, pmi_setpal, invbool, 0); +MODULE_PARM_DESC(vgapal, "Use VGA for setting palette (default)"); +module_param_named(pmipal, pmi_setpal, bool, 0); +MODULE_PARM_DESC(pmipal, "Use PMI for setting palette"); +module_param(mtrr, bool, 0); +MODULE_PARM_DESC(mtrr, "Enable MTRR support (default)"); +module_param_named(nomtrr, mtrr, invbool, 0); +MODULE_PARM_DESC(nomtrr, "Disable MTRR support"); +module_param(vram_remap, int, 0); +MODULE_PARM_DESC(vram_remap, "Set total amount of memory to be used"); +module_param(vram_total, int, 0); +MODULE_PARM_DESC(vram_total, "Total amount of memory"); + /* --------------------------------------------------------------------- */ static int vesafb_pan_display(struct fb_var_screeninfo *var, @@ -192,6 +219,7 @@ static struct fb_ops vesafb_ops = { .fb_imageblit = cfb_imageblit, }; +#ifndef MODULE static int __init vesafb_setup(char *options) { char *this_opt; @@ -225,6 +253,7 @@ static int __init vesafb_setup(char *options) } return 0; } +#endif static int __init vesafb_probe(struct platform_device *dev) { @@ -495,7 +524,27 @@ err: return err; } +static int __exit vesafb_remove(struct platform_device *device) +{ + struct fb_info *info = dev_get_drvdata(&device->dev); + + unregister_framebuffer(info); +#ifdef CONFIG_MTRR + { + struct vesafb_info *vfb_info = (struct vesafb_info *) info->par; + if (vfb_info->mtrr_hdl >= 0) + mtrr_del(vfb_info->mtrr_hdl, 0, 0); + } +#endif + iounmap(info->screen_base); + framebuffer_release(info); + release_mem_region(vesafb_fix.smem_start, vesafb_fix.smem_len); + + return 0; +} + static struct platform_driver vesafb_driver = { + .remove = vesafb_remove, .driver = { .name = "vesafb", }, @@ -506,11 +555,18 @@ static struct platform_device *vesafb_device; static int __init vesafb_init(void) { int ret; +#ifndef MODULE char *option = NULL; /* ignore error return of fb_get_options */ fb_get_options("vesafb", &option); vesafb_setup(option); +#else + if (redraw) + ypan = 0; + if (ywrap) + ypan = 2; +#endif vesafb_device = platform_device_alloc("vesafb", 0); if (!vesafb_device) @@ -530,6 +586,14 @@ static int __init vesafb_init(void) return ret; } + +static void __exit vesafb_exit(void) +{ + platform_device_unregister(vesafb_device); + platform_driver_unregister(&vesafb_driver); +} + module_init(vesafb_init); +module_exit(vesafb_exit); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 37fbee5261e74f6c8786c3b80a78920df4d76097 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 27 Nov 2008 19:12:07 +0000 Subject: UBUNTU: SAUCE: (no-up) version: Implement version_signature proc file. Signed-off-by: Andy Whitcroft Acked-by: Tim Gardener --- fs/proc/Makefile | 1 + fs/proc/version_signature.c | 31 +++++++++++++++++++++++++++++++ init/Kconfig | 9 +++++++++ init/version.c | 6 +++++- 4 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 fs/proc/version_signature.c diff --git a/fs/proc/Makefile b/fs/proc/Makefile index c1c729335924..cc68b63d572b 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -28,3 +28,4 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o +proc-y += version_signature.o diff --git a/fs/proc/version_signature.c b/fs/proc/version_signature.c new file mode 100644 index 000000000000..859fb6092a6c --- /dev/null +++ b/fs/proc/version_signature.c @@ -0,0 +1,31 @@ +#include +#include +#include +#include +#include +#include + +static int version_signature_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", CONFIG_VERSION_SIGNATURE); + return 0; +} + +static int version_signature_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, version_signature_proc_show, NULL); +} + +static const struct file_operations version_signature_proc_fops = { + .open = version_signature_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_version_signature_init(void) +{ + proc_create("version_signature", 0, NULL, &version_signature_proc_fops); + return 0; +} +module_init(proc_version_signature_init); diff --git a/init/Kconfig b/init/Kconfig index 7c329b6d4a13..3a5028fe833e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -214,6 +214,15 @@ config DEFAULT_HOSTNAME but you may wish to use a different default here to make a minimal system more usable with less configuration. +config VERSION_SIGNATURE + string "Arbitrary version signature" + help + This string will be created in a file, /proc/version_signature. It + is useful in determining arbitrary data about your kernel. For instance, + if you have several kernels of the same version, but need to keep track + of a revision of the same kernel, but not affect it's ability to load + compatible modules, this is the easiest way to do that. + config SWAP bool "Support for paging of anonymous memory (swap)" depends on MMU && BLOCK diff --git a/init/version.c b/init/version.c index 86fe0ccb997a..3a9433a39d2c 100644 --- a/init/version.c +++ b/init/version.c @@ -40,7 +40,11 @@ EXPORT_SYMBOL_GPL(init_uts_ns); /* FIXED STRINGS! Don't touch! */ const char linux_banner[] = "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION +#ifdef CONFIG_VERSION_SIGNATURE + " (" CONFIG_VERSION_SIGNATURE ")" +#endif + "\n"; const char linux_proc_banner[] = "%s version %s" -- cgit v1.2.3 From 27d76a9ada38a63fe603a2bbc2cce5524169c432 Mon Sep 17 00:00:00 2001 From: Scott James Remnant Date: Tue, 3 Mar 2009 14:24:52 +0000 Subject: UBUNTU: SAUCE: (no-up) hostap: Change initial operation mode to managed (infra) This was previously changed by using an "options" line in a modprobe.d file, however that practice is now deprecated. This is because module names, option names, their values and even their current defaults can all change inside the kernel and module-init-tools has never been kept in sync. In addition, changing the kernel means that the option change will apply if the module is built in by users or the OEM team. Signed-off-by: Scott James Remnant Signed-off-by: Tim Gardner --- drivers/net/wireless/hostap/hostap_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/hostap/hostap_hw.c b/drivers/net/wireless/hostap/hostap_hw.c index 50f87b60b0bd..16e2fee4d246 100644 --- a/drivers/net/wireless/hostap/hostap_hw.c +++ b/drivers/net/wireless/hostap/hostap_hw.c @@ -68,7 +68,7 @@ static char essid[33] = "test"; module_param_string(essid, essid, sizeof(essid), 0444); MODULE_PARM_DESC(essid, "Host AP's ESSID"); -static int iw_mode[MAX_PARM_DEVICES] = { IW_MODE_MASTER, DEF_INTS }; +static int iw_mode[MAX_PARM_DEVICES] = { IW_MODE_INFRA, DEF_INTS }; module_param_array(iw_mode, int, NULL, 0444); MODULE_PARM_DESC(iw_mode, "Initial operation mode"); -- cgit v1.2.3 From d1880b11e5ac4338ee7d99482ba93557bced1b40 Mon Sep 17 00:00:00 2001 From: Scott James Remnant Date: Mon, 16 Mar 2009 15:03:07 +0000 Subject: UBUNTU: SAUCE: (no-up) nbd: Change default partitions per device to 15 This was previously changed by using an "options" line in a modprobe.d file, however that practice is now deprecated. This is because module names, option names, their values and even their current defaults can all change inside the kernel and module-init-tools has never been kept in sync. In addition, changing the kernel means that the option change will apply if the module is built in by users or the OEM team. Bug: #342563 Signed-off-by: Scott James Remnant Signed-off-by: Tim Gardner --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 061427a75d37..bb8b53da1db7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -57,7 +57,7 @@ static unsigned int debugflags; static unsigned int nbds_max = 16; static struct nbd_device *nbd_dev; -static int max_part; +static int max_part = 15; /* * Use just one lock (or at most 1 per NIC). Two arguments for this: -- cgit v1.2.3 From d20211ae671c382ffa60deaaca58578b1608de96 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Tue, 21 Apr 2009 11:09:10 -0600 Subject: UBUNTU: SAUCE: (no-up) Sony laptop: Some Sony Vaia laptops do not enable wwan power by default. Bug: https://bugs.launchpad.net/bugs/364678 Added quirk to enable wwan power based on DMI information already present in the module. Ity appears that Vaio's do not enable power to wwan from a cold boot. We'll carry this patch indefinitely. Signed-off-by: Tim Gardner --- drivers/platform/x86/sony-laptop.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 210d4ae547c2..9e9a55ec5e9c 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -2760,6 +2760,10 @@ struct sonypi_eventtypes { struct sonypi_event *events; }; +struct sony_pic_quirk_entry { + u8 set_wwan_power; +}; + struct sony_pic_dev { struct acpi_device *acpi_dev; struct sony_pic_irq *cur_irq; @@ -2770,6 +2774,7 @@ struct sony_pic_dev { struct sonypi_eventtypes *event_types; int (*handle_irq)(const u8, const u8); int model; + struct sony_pic_quirk_entry *quirks; u16 evport_offset; u8 camera_power; u8 bluetooth_power; @@ -4204,6 +4209,12 @@ static int sony_pic_add(struct acpi_device *device) if (result) goto err_remove_pf; + if (spic_dev.quirks && spic_dev.quirks->set_wwan_power) { + /* + * Power isn't enabled by default. + */ + __sony_pic_set_wwanpower(1); + } return 0; err_remove_pf: @@ -4274,6 +4285,16 @@ static struct acpi_driver sony_pic_driver = { }, }; +static struct sony_pic_quirk_entry sony_pic_vaio_vgn = { + .set_wwan_power = 1, +}; + +static int dmi_matched(const struct dmi_system_id *dmi) +{ + spic_dev.quirks = dmi->driver_data; + return 0; +} + static struct dmi_system_id __initdata sonypi_dmi_table[] = { { .ident = "Sony Vaio", @@ -4288,6 +4309,8 @@ static struct dmi_system_id __initdata sonypi_dmi_table[] = { DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "VGN-"), }, + .callback = dmi_matched, + .driver_data = &sony_pic_vaio_vgn, }, { } }; -- cgit v1.2.3 From de559c0063e985c2293c5d580f7f1601aff78672 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 26 Jun 2009 10:59:30 +0100 Subject: UBUNTU: SAUCE: (no-up) disable adding scsi headers to linux-libc-dev Currently scsi headers are generated by the kernel and by libc6-dev. We need to coordinate any switch over to the kernel. Temporarily disabled these headers in the kernel package. Signed-off-by: Andy Whitcroft --- include/Kbuild | 1 - include/scsi/Kbuild | 4 ---- 2 files changed, 5 deletions(-) delete mode 100644 include/scsi/Kbuild diff --git a/include/Kbuild b/include/Kbuild index 8d226bfa2696..fe36accd4328 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -9,4 +9,3 @@ header-y += rdma/ header-y += video/ header-y += drm/ header-y += xen/ -header-y += scsi/ diff --git a/include/scsi/Kbuild b/include/scsi/Kbuild deleted file mode 100644 index f2b94918994d..000000000000 --- a/include/scsi/Kbuild +++ /dev/null @@ -1,4 +0,0 @@ -header-y += scsi_netlink.h -header-y += scsi_netlink_fc.h -header-y += scsi_bsg_fc.h -header-y += fc/ -- cgit v1.2.3 From 2ae51dd7171b299c0b78e83104617fa19448a799 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 6 Oct 2009 09:33:30 +0100 Subject: UBUNTU: ubuntu: dm-raid-45 -- update to compile with 2.6.32 This commit changed bio_barrier() to direct flag accessors, convert dm-raid-4-5 to match: commit 1f98a13f623e0ef666690a18c1250335fc6d7ef1 Author: Jens Axboe Date: Fri Sep 11 14:32:04 2009 +0200 bio: first step in sanitizing the bio->bi_rw flag testing Signed-off-by: Andy Whitcroft --- ubuntu/dm-raid4-5/dm-raid4-5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.c b/ubuntu/dm-raid4-5/dm-raid4-5.c index 52b21e9d189e..4efe1fc496b1 100644 --- a/ubuntu/dm-raid4-5/dm-raid4-5.c +++ b/ubuntu/dm-raid4-5/dm-raid4-5.c @@ -3227,7 +3227,7 @@ static void do_ios(struct raid_set *rs, struct bio_list *ios) * the input queue unless all work queues are empty * and the stripe cache is inactive. */ - if (unlikely(bio_barrier(bio))) { + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { /* REMOVEME: statistics. */ atomic_inc(rs->stats + S_BARRIER); if (delay || -- cgit v1.2.3 From d6cf77bc185d836901cdfe6bdd1685d5e1c5030c Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Thu, 8 Oct 2009 11:42:36 -0600 Subject: UBUNTU: SAUCE: (no-up) raise the default console 'quiet' level to 2 In the interests of providing a clean boot experience, i.e., a blank screen before X starts, supress noisy driver messages. See https://lists.ubuntu.com/archives/kernel-team/2009-October/007476.html for the original diatribe. Signed-off-by: Tim Gardner --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index b5cc0a7c4708..0af72b99d009 100644 --- a/init/main.c +++ b/init/main.c @@ -199,7 +199,7 @@ static int __init debug_kernel(char *str) static int __init quiet_kernel(char *str) { - console_loglevel = 4; + console_loglevel = 2; return 0; } -- cgit v1.2.3 From 4ec0ab4b42becb55ab895a8453fef596ac8e1a35 Mon Sep 17 00:00:00 2001 From: Scott James Remnant Date: Tue, 27 Oct 2009 10:05:32 +0000 Subject: UBUNTU: SAUCE: (no-up) trace: add trace events for open(), exec() and uselib() BugLink: http://bugs.launchpad.net/bugs/462111 This patch uses TRACE_EVENT to add tracepoints for the open(), exec() and uselib() syscalls so that ureadahead can cheaply trace the boot sequence to determine what to read to speed up the next. It's not upstream because it will need to be rebased onto the syscall trace events whenever that gets merged, and is a stop-gap. Signed-off-by: Scott James Remnant Acked-by: Stefan Bader Acked-by: Andy Whitcroft Signed-off-by: Stefan Bader --- fs/exec.c | 4 ++++ fs/open.c | 4 ++++ include/trace/events/fs.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 include/trace/events/fs.h diff --git a/fs/exec.c b/fs/exec.c index a79786a8d2c8..88af8a242f91 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -56,6 +56,8 @@ #include #include +#include + #include #include #include @@ -781,6 +783,8 @@ struct file *open_exec(const char *name) fsnotify_open(file); + trace_open_exec(name); + err = deny_write_access(file); if (err) goto exit; diff --git a/fs/open.c b/fs/open.c index d6c79a0dffc7..ec05458f9883 100644 --- a/fs/open.c +++ b/fs/open.c @@ -33,6 +33,9 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include + int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { @@ -1035,6 +1038,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) } else { fsnotify_open(f); fd_install(fd, f); + trace_do_sys_open(tmp, flags, mode); } } putname(tmp); diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h new file mode 100644 index 000000000000..2cbb8fed1ae6 --- /dev/null +++ b/include/trace/events/fs.h @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_H + +#include +#include + +TRACE_EVENT(do_sys_open, + + TP_PROTO(char *filename, int flags, int mode), + + TP_ARGS(filename, flags, mode), + + TP_STRUCT__entry( + __string( filename, filename ) + __field( int, flags ) + __field( int, mode ) + ), + + TP_fast_assign( + __assign_str(filename, filename); + __entry->flags = flags; + __entry->mode = mode; + ), + + TP_printk("\"%s\" %x %o", + __get_str(filename), __entry->flags, __entry->mode) +); + +TRACE_EVENT(open_exec, + + TP_PROTO(char *filename), + + TP_ARGS(filename), + + TP_STRUCT__entry( + __string( filename, filename ) + ), + + TP_fast_assign( + __assign_str(filename, filename); + ), + + TP_printk("\"%s\"", + __get_str(filename)) +); + +#endif /* _TRACE_FS_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From cf16097621de94e33477ea40002abff8ab793ab6 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 9 Nov 2009 17:30:29 +0000 Subject: UBUNTU: SAUCE: (no-up) set /proc/acpi/video/*/DOS to 4 by default BugLink: http://bugs.launchpad.net/bugs/458982 acpi_video_bus_start_devices() in drivers/acpi/video.c sets the default ACPI DOS value to 0, which lets the OS handle toggling of display output but still leaves the BIOS handling brightness levels automatically when connecting/disconnecting AC. We want the OS to handle both where possible, so a better default would be to set this to 4. This likely will regress systems using proprietary video drivers which will need to change this setting back using the sysfs interfaces. Signed-off-by: Andy Whitcroft --- drivers/acpi/video.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index a576575617d7..9006942eacbd 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -1423,7 +1423,7 @@ static int acpi_video_bus_put_devices(struct acpi_video_bus *video) static int acpi_video_bus_start_devices(struct acpi_video_bus *video) { - return acpi_video_bus_DOS(video, 0, 0); + return acpi_video_bus_DOS(video, 0, 1); } static int acpi_video_bus_stop_devices(struct acpi_video_bus *video) -- cgit v1.2.3 From a9e942e3363dabc7a46d8e321965673af74a8322 Mon Sep 17 00:00:00 2001 From: Surbhi Palande Date: Tue, 8 Dec 2009 11:44:34 +0200 Subject: UBUNTU: SAUCE: Make populate_rootfs asynchronous The expansion of the initramfs is completely independant of other boot activities. The original data is already present at boot and the filesystem is not required until we are ready to start init. It is therefore reasonable to populate the rootfs asynchronously. Move this processing to an async call. This reduces kernel initialisation time (the time from bootloader to starting userspace) by several 10ths of a second on a selection of test hardware particularly SMP systems, although UP system also benefit. Signed-off-by: Surbhi Palande Signed-off-by: Andy Whitcroft --- include/linux/init.h | 2 ++ init/initramfs.c | 15 ++++++++++++--- init/main.c | 6 ++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/linux/init.h b/include/linux/init.h index 6b951095a42f..280d329b882c 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -223,6 +223,8 @@ extern bool initcall_debug; static initcall_t __initcall_##fn \ __used __section(.security_initcall.init) = fn +extern struct list_head populate_rootfs_domain; + struct obs_kernel_param { const char *str; int (*setup_func)(char *); diff --git a/init/initramfs.c b/init/initramfs.c index 84c6bf111300..928a3f2b81da 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -18,6 +18,7 @@ #include #include #include +#include static __initdata char *message; static void __init error(char *x) @@ -579,7 +580,9 @@ static void __init clean_rootfs(void) } #endif -static int __init populate_rootfs(void) +LIST_HEAD(populate_rootfs_domain); + +static void __init async_populate_rootfs(void) { char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) @@ -592,7 +595,7 @@ static int __init populate_rootfs(void) initrd_end - initrd_start); if (!err) { free_initrd(); - return 0; + return; } else { clean_rootfs(); unpack_to_rootfs(__initramfs_start, __initramfs_size); @@ -616,6 +619,12 @@ static int __init populate_rootfs(void) free_initrd(); #endif } - return 0; + return; } + +static int __init populate_rootfs(void) +{ + async_schedule_domain(async_populate_rootfs, NULL, &populate_rootfs_domain); +} + rootfs_initcall(populate_rootfs); diff --git a/init/main.c b/init/main.c index 0af72b99d009..b0751f0bfe04 100644 --- a/init/main.c +++ b/init/main.c @@ -870,6 +870,12 @@ static int __init kernel_init(void * unused) (void) sys_dup(0); (void) sys_dup(0); + /* + * We need to ensure that the filesystem is ready by this point, wait for + * async_populate_rootfs to complete. + */ + async_synchronize_full_domain(&populate_rootfs_domain); + /* * check if there is an early userspace init. If yes, let it do all * the work -- cgit v1.2.3 From f5f84a1333a114de14eaf341965385cc9b0a3447 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 2 Dec 2009 14:41:53 +0000 Subject: UBUNTU: SAUCE: isapnp_init: make isa PNP scans occur async The results of scanning for devices is to trigger udev events therefore we can push this processing async. This reduces kernel initialisation time (the time from bootloader to starting userspace) by several 10ths of a second x86 32bit systems. Signed-off-by: Andy Whitcroft --- drivers/pnp/isapnp/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/pnp/isapnp/core.c b/drivers/pnp/isapnp/core.c index 918d5f044865..972fde662338 100644 --- a/drivers/pnp/isapnp/core.c +++ b/drivers/pnp/isapnp/core.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include "../base.h" @@ -1001,7 +1002,7 @@ struct pnp_protocol isapnp_protocol = { .disable = isapnp_disable_resources, }; -static int __init isapnp_init(void) +static int __init real_isapnp_init(void) { int cards; struct pnp_card *card; @@ -1095,6 +1096,15 @@ static int __init isapnp_init(void) return 0; } +static void __init async_isapnp_init(void *unused, async_cookie_t cookie) +{ + (void)real_isapnp_init(); +} + +static int __init isapnp_init(void) +{ + async_schedule(async_isapnp_init, NULL); +} device_initcall(isapnp_init); /* format is: noisapnp */ -- cgit v1.2.3 From 8b3a76a7497607e7f33702f6c80844e1fc20b4d5 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Dec 2009 16:43:07 +0000 Subject: UBUNTU: SAUCE: async_populate_rootfs: move rootfs init earlier Check to see if the machine has more than one active CPU, if it does then it is worth starting the decode of the rootfs earlier. Signed-off-by: Andy Whitcroft --- include/asm-generic/vmlinux.lds.h | 1 + include/linux/init.h | 1 + init/initramfs.c | 15 ++++++++++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 4e2e1cc505ab..b5590e10b3a7 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -626,6 +626,7 @@ *(.initcallearly.init) \ INIT_CALLS_LEVEL(0) \ INIT_CALLS_LEVEL(1) \ + INIT_CALLS_LEVEL(earlyrootfs) \ INIT_CALLS_LEVEL(2) \ INIT_CALLS_LEVEL(3) \ INIT_CALLS_LEVEL(4) \ diff --git a/include/linux/init.h b/include/linux/init.h index 280d329b882c..ad021e87d5a3 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -196,6 +196,7 @@ extern bool initcall_debug; #define core_initcall(fn) __define_initcall("1",fn,1) #define core_initcall_sync(fn) __define_initcall("1s",fn,1s) +#define earlyrootfs_initcall(fn) __define_initcall("earlyrootfs",fn,rootfs) #define postcore_initcall(fn) __define_initcall("2",fn,2) #define postcore_initcall_sync(fn) __define_initcall("2s",fn,2s) #define arch_initcall(fn) __define_initcall("3",fn,3) diff --git a/init/initramfs.c b/init/initramfs.c index 928a3f2b81da..1ae8873570c6 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -622,9 +622,22 @@ static void __init async_populate_rootfs(void) return; } +static int __initdata rootfs_populated; + +static int __init populate_rootfs_early(void) +{ + if (num_online_cpus() > 1) { + rootfs_populated = 1; + async_schedule_domain(async_populate_rootfs, NULL, + &populate_rootfs_domain); + } +} static int __init populate_rootfs(void) { - async_schedule_domain(async_populate_rootfs, NULL, &populate_rootfs_domain); + if (!rootfs_populated) + async_schedule_domain(async_populate_rootfs, NULL, + &populate_rootfs_domain); } +earlyrootfs_initcall(populate_rootfs_early); rootfs_initcall(populate_rootfs); -- cgit v1.2.3 From 8d085d890023720d2c2b0ac1e742844bfd68a3cf Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 24 Dec 2009 02:09:03 +0000 Subject: UBUNTU: SAUCE: acpi battery -- move first lookup asynchronous BugLink: http://bugs.launchpad.net/bugs/507211 When instantiating the battery object on to the acpi bus in the kernel we talk to the BIOS to get the current battery state. This can take a long time and holds the acpi bus object locked for the duration. This leads to any other object wishing to add itself that bus blocking. This leads to unpredicatable delays of up to .3s when initialising the hpet during boot depending on execution order. Make the first update of the battery asynchronous. Move the acpi bus handling back synchronous. Signed-off-by: Andy Whitcroft --- drivers/acpi/battery.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 7dd3f9fb9f3f..f22741069adb 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -976,6 +976,18 @@ static int battery_notify(struct notifier_block *nb, return 0; } +static LIST_HEAD(acpi_battery_domain); + +static void acpi_battery_update_async(struct acpi_device *device, async_cookie_t cookie) +{ + struct acpi_battery *battery = acpi_driver_data(device); + + acpi_battery_update(battery); + printk(KERN_INFO PREFIX "%s Slot [%s] (battery %s)\n", + ACPI_BATTERY_DEVICE_NAME, acpi_device_bid(device), + device->status.battery_present ? "present" : "absent"); +} + static int acpi_battery_add(struct acpi_device *device) { int result = 0; @@ -995,13 +1007,16 @@ static int acpi_battery_add(struct acpi_device *device) if (ACPI_SUCCESS(acpi_get_handle(battery->device->handle, "_BIX", &handle))) set_bit(ACPI_BATTERY_XINFO_PRESENT, &battery->flags); - result = acpi_battery_update(battery); - if (result) - goto fail; + + /* Mark the battery for update at first access. */ + battery->update_time = 0; #ifdef CONFIG_ACPI_PROCFS_POWER result = acpi_battery_add_fs(device); #endif - if (result) { + if (!result) { + async_schedule_domain(acpi_battery_update_async, device, &acpi_battery_domain); + + } else { #ifdef CONFIG_ACPI_PROCFS_POWER acpi_battery_remove_fs(device); #endif @@ -1031,6 +1046,10 @@ static int acpi_battery_remove(struct acpi_device *device, int type) if (!device || !acpi_driver_data(device)) return -EINVAL; + + /* Ensure all async updates are complete before freeing the battery. */ + async_synchronize_full_domain(&acpi_battery_domain); + battery = acpi_driver_data(device); unregister_pm_notifier(&battery->pm_nb); #ifdef CONFIG_ACPI_PROCFS_POWER @@ -1068,27 +1087,21 @@ static struct acpi_driver acpi_battery_driver = { }, }; -static void __init acpi_battery_init_async(void *unused, async_cookie_t cookie) +static int __init acpi_battery_init(void) { if (acpi_disabled) return; #ifdef CONFIG_ACPI_PROCFS_POWER acpi_battery_dir = acpi_lock_battery_dir(); if (!acpi_battery_dir) - return; + return -1; #endif if (acpi_bus_register_driver(&acpi_battery_driver) < 0) { #ifdef CONFIG_ACPI_PROCFS_POWER acpi_unlock_battery_dir(acpi_battery_dir); #endif - return; + return -1; } - return; -} - -static int __init acpi_battery_init(void) -{ - async_schedule(acpi_battery_init_async, NULL); return 0; } -- cgit v1.2.3 From 3545969eca4b3060bcde1358f7da28afd44cda8d Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 17 Feb 2010 10:13:42 +0000 Subject: UBUNTU: SAUCE: khubd -- switch USB product/manufacturer/serial handling to RCU BugLink: http://bugs.launchpad.net/bugs/510937 With the introduction of wireless USB hubs the product, manufacturer, and serial number are now mutable. This necessitates new locking in the consumers of these values including the sysfs read routines in order to prevent use-after-free acces to these values. These extra locks create significant lock contention leading to increased boot times (0.3s for an example Atom based system). Move update of these values to RCU based locking. Signed-off-by: Andy Whitcroft --- drivers/usb/core/hub.c | 34 ++++++++++++++++++++++++++++------ drivers/usb/core/sysfs.c | 6 +++--- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 04fb834c3fa1..78c26de8d425 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -2218,6 +2219,10 @@ fail: */ int usb_deauthorize_device(struct usb_device *usb_dev) { + char *product = NULL; + char *manufacturer = NULL; + char *serial = NULL; + usb_lock_device(usb_dev); if (usb_dev->authorized == 0) goto out_unauthorized; @@ -2225,11 +2230,12 @@ int usb_deauthorize_device(struct usb_device *usb_dev) usb_dev->authorized = 0; usb_set_configuration(usb_dev, -1); - kfree(usb_dev->product); + product = usb_dev->product; + manufacturer = usb_dev->manufacturer; + serial = usb_dev->serial; + usb_dev->product = kstrdup("n/a (unauthorized)", GFP_KERNEL); - kfree(usb_dev->manufacturer); usb_dev->manufacturer = kstrdup("n/a (unauthorized)", GFP_KERNEL); - kfree(usb_dev->serial); usb_dev->serial = kstrdup("n/a (unauthorized)", GFP_KERNEL); usb_destroy_configuration(usb_dev); @@ -2237,6 +2243,12 @@ int usb_deauthorize_device(struct usb_device *usb_dev) out_unauthorized: usb_unlock_device(usb_dev); + if (product || manufacturer || serial) { + synchronize_rcu(); + kfree(product); + kfree(manufacturer); + kfree(serial); + } return 0; } @@ -2244,6 +2256,9 @@ out_unauthorized: int usb_authorize_device(struct usb_device *usb_dev) { int result = 0, c; + char *product = NULL; + char *manufacturer = NULL; + char *serial = NULL; usb_lock_device(usb_dev); if (usb_dev->authorized == 1) @@ -2262,11 +2277,12 @@ int usb_authorize_device(struct usb_device *usb_dev) goto error_device_descriptor; } - kfree(usb_dev->product); + product = usb_dev->product; + manufacturer = usb_dev->manufacturer; + serial = usb_dev->serial; + usb_dev->product = NULL; - kfree(usb_dev->manufacturer); usb_dev->manufacturer = NULL; - kfree(usb_dev->serial); usb_dev->serial = NULL; usb_dev->authorized = 1; @@ -2294,6 +2310,12 @@ error_device_descriptor: error_autoresume: out_authorized: usb_unlock_device(usb_dev); // complements locktree + if (product || manufacturer || serial) { + synchronize_rcu(); + kfree(product); + kfree(manufacturer); + kfree(serial); + } return result; } diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c index 9a56e3adf476..45801b5678fc 100644 --- a/drivers/usb/core/sysfs.c +++ b/drivers/usb/core/sysfs.c @@ -85,9 +85,9 @@ static ssize_t show_##name(struct device *dev, \ int retval; \ \ udev = to_usb_device(dev); \ - usb_lock_device(udev); \ - retval = sprintf(buf, "%s\n", udev->name); \ - usb_unlock_device(udev); \ + rcu_read_lock(); \ + retval = sprintf(buf, "%s\n", rcu_dereference(udev->name)); \ + rcu_read_unlock(); \ return retval; \ } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL); -- cgit v1.2.3 From 1b948c07a87391dc95cb3ac01aa8a5465781b813 Mon Sep 17 00:00:00 2001 From: Colin Watson Date: Thu, 25 Feb 2010 13:21:02 +0000 Subject: UBUNTU: ubuntu: dm-raid4-5: Depend on XOR_BLOCKS Needed so that 'make defconfig && make' works. Signed-off-by: Colin Watson Signed-off-by: Andy Whitcroft --- ubuntu/dm-raid4-5/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/dm-raid4-5/Kconfig b/ubuntu/dm-raid4-5/Kconfig index 3ce3296ca094..03fab6438d4d 100644 --- a/ubuntu/dm-raid4-5/Kconfig +++ b/ubuntu/dm-raid4-5/Kconfig @@ -1,6 +1,6 @@ config DM_RAID45 tristate "RAID 4/5 target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + depends on BLK_DEV_DM && XOR_BLOCKS && EXPERIMENTAL default m ---help--- A target that supports RAID4 and RAID5 mappings. -- cgit v1.2.3 From 3ef6ae4407c5ebd703b373ebf2adb7f7e4cf85f9 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 11 Mar 2010 10:33:36 +0000 Subject: UBUNTU: SAUCE: (no-up) cdrom -- default to not locking the tray when in use BugLink: http://bugs.launchpad.net/bugs/397734 It seems that users are have a high expectation that the eject button on their CDROM drive will eject the disk regardless of whether it is in use or not. To this end we are now changing the default LOCK mode for mounted CDROMS to 0 to allow ejects. This however does not handle the direct open cases like music and video players. From the launchpad bug commentary: So, according to the upstream discussion David Zeuthen recommended to just not lock CD-ROM trays by default. Kernel/userspace already handles prematurely removed USB storage devices reasonably, and with read-only devices like CD-ROMs it is even less of an issue. So we should just set /proc/sys/dev/cdrom/lock to 0 by default. Note that we still will have the drive mounted after the eject. There is a media change uevent generated and this will be used to trigger the unmount of the drive in udisks. The burner software will also have to be looked at to ensure they are explicitly locking the drive closed during the burn. This will all be handled under the bug above. Signed-off-by: Andy Whitcroft Acked-by: Tim Gardner Acked-by: Colin King --- drivers/cdrom/cdrom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index d620b4495745..72cf3159c67c 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -289,7 +289,7 @@ static bool debug; /* default compatibility mode */ static bool autoclose=1; static bool autoeject; -static bool lockdoor = 1; +static bool lockdoor = 0; /* will we ever get to use this... sigh. */ static bool check_media_type; /* automatically restart mrw format */ -- cgit v1.2.3 From 02d933970209bff47b9ffa002f6c0051be7b70d0 Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Wed, 24 Mar 2010 10:36:13 -0700 Subject: UBUNTU: ubuntu: dm-raid4-5 -- update to compile with 2.6.33 The following commit adds a fn callback arguement to dm_dirty_log_create(). convert dm-raid4-5 to match: commit 87a8f240e9bcf025ba45e4563c842b0d59c5e8ef Author: Mikulas Patocka Date: Thu Dec 10 23:52:01 2009 +0000 dm log: add flush callback fn Signed-off-by: Leann Ogasawara --- ubuntu/dm-raid4-5/dm-raid4-5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.c b/ubuntu/dm-raid4-5/dm-raid4-5.c index 4efe1fc496b1..c081825906af 100644 --- a/ubuntu/dm-raid4-5/dm-raid4-5.c +++ b/ubuntu/dm-raid4-5/dm-raid4-5.c @@ -3495,7 +3495,7 @@ context_alloc(struct raid_type *raid_type, struct variable_parms *p, */ ti_len = ti->len; ti->len = sectors_per_dev; - dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2); + dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2); ti->len = ti_len; if (!dl) goto bad_dirty_log; -- cgit v1.2.3 From 9b98f34601b9d5ca03cdf22a82ca3d005ce4ea4d Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Fri, 26 Mar 2010 15:15:51 -0700 Subject: UBUNTU: ubuntu: dm-raid4-5 -- update to compile with 2.6.34-rc2 The following commit removes unused params from dm_get_device(). Convert dm-raid4-5 to match: commit 8215d6ec5fee1e76545decea2cd73717efb5cb42 Author: Nikanth Karthikesan Date: Sat Mar 6 02:32:27 2010 +0000 dm table: remove unused dm_get_device range parameters Signed-off-by: Leann Ogasawara --- ubuntu/dm-raid4-5/dm-raid4-5.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.c b/ubuntu/dm-raid4-5/dm-raid4-5.c index c081825906af..7354d6cb16fd 100644 --- a/ubuntu/dm-raid4-5/dm-raid4-5.c +++ b/ubuntu/dm-raid4-5/dm-raid4-5.c @@ -3705,9 +3705,7 @@ static int dev_parms(struct raid_set *rs, char **argv, int *p) TI_ERR("Invalid RAID device offset parameter"); dev->start = tmp; - r = dm_get_device(ti, *argv, dev->start, - rs->set.sectors_per_dev, - dm_table_get_mode(ti->table), &dev->dev); + r = dm_get_device(ti, *argv, dm_table_get_mode(ti->table), &dev->dev); if (r) TI_ERR_RET("RAID device lookup failure", r); -- cgit v1.2.3 From 33594811679de8640e444c5646517602e0c284fc Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Wed, 14 Apr 2010 08:27:05 -0700 Subject: UBUNTU: ubuntu: dm-raid4-5 -- update to compile with 2.6.34-rc4 Need to include linux/slab.h to prevent implicit declaration of functions which otherwise result in build failures. Signed-off-by: Leann Ogasawara --- ubuntu/dm-raid4-5/dm-memcache.c | 1 + ubuntu/dm-raid4-5/dm-raid4-5.c | 1 + ubuntu/dm-raid4-5/dm-region-hash.c | 1 + 3 files changed, 3 insertions(+) diff --git a/ubuntu/dm-raid4-5/dm-memcache.c b/ubuntu/dm-raid4-5/dm-memcache.c index 4e3731ca1f17..1b15859b2762 100644 --- a/ubuntu/dm-raid4-5/dm-memcache.c +++ b/ubuntu/dm-raid4-5/dm-memcache.c @@ -18,6 +18,7 @@ #include "dm.h" #include "dm-memcache.h" #include +#include struct dm_mem_cache_client { spinlock_t lock; diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.c b/ubuntu/dm-raid4-5/dm-raid4-5.c index 7354d6cb16fd..7f4565fa3de3 100644 --- a/ubuntu/dm-raid4-5/dm-raid4-5.c +++ b/ubuntu/dm-raid4-5/dm-raid4-5.c @@ -53,6 +53,7 @@ static const char *version = "v0.2594b"; #include #include "dm-region-hash.h" +#include /* * Configurable parameters diff --git a/ubuntu/dm-raid4-5/dm-region-hash.c b/ubuntu/dm-raid4-5/dm-region-hash.c index 3d33af6be8f6..59429f68d2b5 100644 --- a/ubuntu/dm-raid4-5/dm-region-hash.c +++ b/ubuntu/dm-raid4-5/dm-region-hash.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "dm.h" -- cgit v1.2.3 From c1a774967f4e94a78e3fe8abacdb39b124d8c03b Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Mon, 29 Mar 2010 19:12:36 +0200 Subject: UBUNTU: SAUCE: (no-up) dma-mapping: Remove WARN_ON in dma_free_coherent BugLink: http://bugs.launchpad.net/bugs/458201 Triggered by the following backtrace: WARNING: at /build/buildd/linux-2.6.32/arch/x86/include/asm/dma-mapping.h:154 ___free_dma_mem_cluster+0x102/0x110() [] warn_slowpath_common+0x7b/0xc0 [] warn_slowpath_null+0x14/0x20 [] ___free_dma_mem_cluster+0x102/0x110 [] __sym_mfree+0xd2/0x100 [] __sym_mfree_dma+0x69/0x100 [] sym_hcb_free+0x8f/0x1f0 This patch never will be accepted upstream because the WARN_ON is supposed to perevent driver development which is only compatible with x86 on x86 (ARM can sleep in that function). The right way to fix it would be to make the offending function use locks in the right way but that requires careful implementation. Signed-off-by: Stefan Bader Acked-by: Andy Whitcroft Signed-off-by: Andy Whitcroft --- arch/x86/include/asm/dma-mapping.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index f7b4c7903e7e..fab11b0ab4ff 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -162,8 +162,6 @@ static inline void dma_free_attrs(struct device *dev, size_t size, { struct dma_map_ops *ops = get_dma_ops(dev); - WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_from_coherent(dev, get_order(size), vaddr)) return; -- cgit v1.2.3 From 4d61f7bf351f274c2de214995ab0972edafbb536 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 17 Jun 2010 10:46:47 +0100 Subject: UBUNTU: SAUCE: add option to hand off all kernel parameters to init BugLink: http://bugs.launchpad.net/bugs/586386 Some init packages such as upstart find having all of the kernel parameters passed in useful. Currently they have to open up /proc/cmdline and reparse that to obtain this information. Add a kernel configuration option to enable passing of all options. Note, enabling this option will reduce the chances that a fallback from /sbin/init to /bin/bash or /bin/sh will succeed. Though it should be noted that there are commonly unknown options present which would already break this fallback. init=/bin/foo provides explicit control over options which is unaffected by this change. Signed-off-by: Andy Whitcroft Signed-off-by: Leann Ogasawara --- include/linux/moduleparam.h | 4 ++-- init/Kconfig | 7 +++++++ init/main.c | 33 ++++++++++++++++++++++++++------- kernel/params.c | 20 ++++++++++++++------ 4 files changed, 49 insertions(+), 15 deletions(-) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index d6a58065c09c..66cbc87d7636 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -320,8 +320,8 @@ extern int parse_args(const char *name, unsigned num, s16 level_min, s16 level_max, - int (*unknown)(char *param, char *val, - const char *doing)); + int (*handle)(char *param, char *val, + const char *doing, int known)); /* Called by module remove. */ #ifdef CONFIG_SYSFS diff --git a/init/Kconfig b/init/Kconfig index 3a5028fe833e..6a6878189265 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -79,6 +79,13 @@ config INIT_ENV_ARG_LIMIT Maximum of each of the number of arguments and environment variables passed to init from the kernel command line. +config INIT_PASS_ALL_PARAMS + bool "Pass all (known and unknown) kernel parameters to init" + default n + help + Pass all kernel command line parameters to init, this includes + those consumed by kernel modules. This is useful for upstart + based systems. If in doubt say N. config CROSS_COMPILE string "Cross-compiler tool prefix" diff --git a/init/main.c b/init/main.c index b0751f0bfe04..5267741da32d 100644 --- a/init/main.c +++ b/init/main.c @@ -112,6 +112,11 @@ EXPORT_SYMBOL(system_state); */ #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT +#ifdef CONFIG_INIT_PASS_ALL_PARAMS +#define INIT_PASS_FUNCTION pass_all_bootoptions +#else +#define INIT_PASS_FUNCTION pass_unknown_bootoptions +#endif extern void time_init(void); /* Default late time init is NULL. archs can override this later. */ @@ -243,19 +248,20 @@ static int __init repair_env_string(char *param, char *val, const char *unused) } /* - * Unknown boot options get handed to init, unless they look like - * unused parameters (modprobe will find them in /proc/cmdline). + * Select boot options to hand to init. If all is set hand off them all + * otherwise only hand off unused ones which do not apply to modules + * (modprobe will find them in /proc/cmdline). */ -static int __init unknown_bootoption(char *param, char *val, const char *unused) +static int __init pass_bootoption(char *param, char *val, const char *unused, int all) { repair_env_string(param, val, unused); /* Handle obsolete-style parameters */ - if (obsolete_checksetup(param)) + if (obsolete_checksetup(param) && !all) return 0; /* Unused module parameter. */ - if (strchr(param, '.') && (!val || strchr(param, '.') < val)) + if (!all && strchr(param, '.') && (!val || strchr(param, '.') < val)) return 0; if (panic_later) @@ -286,6 +292,16 @@ static int __init unknown_bootoption(char *param, char *val, const char *unused) } return 0; } +static int __init pass_unknown_bootoptions(char *param, char *val, const char *unused, int known) +{ + if (known) + return 0; + return pass_bootoption(param, val, unused, 0); +} +static int __init pass_all_bootoptions(char *param, char *val, const char *unused, int known) +{ + return pass_bootoption(param, val, unused, 1); +} static int __init init_setup(char *str) { @@ -385,10 +401,13 @@ static noinline void __init_refok rest_init(void) } /* Check for early params. */ -static int __init do_early_param(char *param, char *val, const char *unused) +static int __init do_early_param(char *param, char *val, const char *unused, int known) { const struct obs_kernel_param *p; + if (known) + return 0; + for (p = __setup_start; p < __setup_end; p++) { if ((p->early && parameq(param, p->str)) || (strcmp(param, "console") == 0 && @@ -508,7 +527,7 @@ asmlinkage void __init start_kernel(void) parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, - -1, -1, &unknown_bootoption); + -1, -1, &INIT_PASS_FUNCTION); jump_label_init(); diff --git a/kernel/params.c b/kernel/params.c index ed35345be536..8b8b3144793e 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -90,8 +90,8 @@ static int parse_one(char *param, unsigned num_params, s16 min_level, s16 max_level, - int (*handle_unknown)(char *param, char *val, - const char *doing)) + int (*handle_arg)(char *param, char *val, + const char *doing, int known)) { unsigned int i; int err; @@ -106,6 +106,14 @@ static int parse_one(char *param, if (!val && params[i].ops->set != param_set_bool && params[i].ops->set != param_set_bint) return -EINVAL; + if (handle_arg) { + int ret; + pr_debug("Valid argument: calling %p\n", + handle_arg); + ret = handle_arg(param, val, doing, 1); + if (ret) + return ret; + } pr_debug("handling %s with %p\n", param, params[i].ops->set); mutex_lock(¶m_lock); @@ -115,9 +123,9 @@ static int parse_one(char *param, } } - if (handle_unknown) { + if (handle_arg) { pr_debug("doing %s: %s='%s'\n", doing, param, val); - return handle_unknown(param, val, doing); + return handle_arg(param, val, doing, 0); } pr_debug("Unknown argument '%s'\n", param); @@ -183,7 +191,7 @@ int parse_args(const char *doing, unsigned num, s16 min_level, s16 max_level, - int (*unknown)(char *param, char *val, const char *doing)) + int (*handle_arg)(char *param, char *val, const char *doing, int arg)) { char *param, *val; @@ -200,7 +208,7 @@ int parse_args(const char *doing, args = next_arg(args, ¶m, &val); irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, - min_level, max_level, unknown); + min_level, max_level, handle_arg); if (irq_was_disabled && !irqs_disabled()) pr_warn("%s: option '%s' enabled irq's!\n", doing, param); -- cgit v1.2.3 From 3edfe9dff6d7ebe82395aa0412ebe578ddd42dd8 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Tue, 13 Jul 2010 06:41:16 -0600 Subject: UBUNTU: SAUCE: fix build error with CONFIG_BLK_DEV_INITRD=n The original aynchronous boot patch (UBUNTU: SAUCE: Make populate_rootfs asynchronous) did not take into consideration the case when CONFIG_BLK_DEV_INITRD=n, e.g., populate_rootfs_domain becomes undefined. Therefore, add it to noinitramfs.c where its use is benign. Signed-off-by: Tim Gardner Original-patch-by: Nicolas Pitre Cc: Surbhi Palande Signed-off-by: Leann Ogasawara --- init/noinitramfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/init/noinitramfs.c b/init/noinitramfs.c index 267739d85179..e0f648cad6e6 100644 --- a/init/noinitramfs.c +++ b/init/noinitramfs.c @@ -22,6 +22,8 @@ #include #include +LIST_HEAD(populate_rootfs_domain); + /* * Create a simple rootfs that is similar to the default initramfs */ -- cgit v1.2.3 From 50e162b535a6e7bd25ac5a2a532bc8b6a6af73fa Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 16 Jul 2010 09:37:31 +0100 Subject: UBUNTU: SAUCE: (no-up) Stop ARM boards crashing when CUPS is loaded BugLink: http://bugs.launchpad.net/bugs/601226 When CUPS loads, it tries to load several drivers it may need. When one of these drivers, specifically parport_pc is loaded, it attempts to write to address space normally reserved for ISA transactions. On OMAP based systems, this causes a segmentation fault. Signed-off-by: Lee Jones Signed-off-by: Leann Ogasawara --- drivers/parport/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/parport/Kconfig b/drivers/parport/Kconfig index d92185a5523b..9654a9e83200 100644 --- a/drivers/parport/Kconfig +++ b/drivers/parport/Kconfig @@ -36,7 +36,7 @@ if PARPORT config PARPORT_PC tristate "PC-style hardware" depends on (!SPARC64 || PCI) && !SPARC32 && !M32R && !FRV && \ - (!M68K || ISA) && !MN10300 && !AVR32 && !BLACKFIN + (!M68K || ISA) && !MN10300 && !AVR32 && !BLACKFIN && !ARCH_OMAP ---help--- You should say Y here if you have a PC-style parallel port. All IBM PC compatible computers and some Alphas have PC-style -- cgit v1.2.3 From ee7bf2f4d3643193e1c7502c2890e00a4b2d33db Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 29 Jul 2010 09:42:21 +0100 Subject: UBUNTU: SAUCE: (no-up) Modularize vesafb -- fix initialisation When this patch was rolled forward, likely between Dapper and Hardy a chunk of initialisation was lost. Pull this back in so we actually have an vesafb_info structure initialised. Else we may well panic when we rmmod vesafb. Signed-off-by: Andy Whitcroft Acked-by: Stefan Bader Acked-by: Brad Figg Signed-off-by: Leann Ogasawara --- drivers/video/vesafb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index 97af4869c5a6..e3acd0131ee3 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -258,6 +258,7 @@ static int __init vesafb_setup(char *options) static int __init vesafb_probe(struct platform_device *dev) { struct fb_info *info; + struct vesafb_info *vfb_info; int i, err; unsigned int size_vmode; unsigned int size_remap; @@ -316,13 +317,14 @@ static int __init vesafb_probe(struct platform_device *dev) spaces our resource handlers simply don't know about */ } - info = framebuffer_alloc(sizeof(u32) * 256, &dev->dev); + info = framebuffer_alloc(sizeof(struct vesafb_info), &dev->dev); if (!info) { release_mem_region(vesafb_fix.smem_start, size_total); return -ENOMEM; } - info->pseudo_palette = info->par; - info->par = NULL; + vfb_info = (struct vesafb_info *) info->par; + vfb_info->mtrr_hdl = -1; + info->pseudo_palette = vfb_info->pseudo_palette; /* set vesafb aperture size for generic probing */ info->apertures = alloc_apertures(1); @@ -452,17 +454,15 @@ static int __init vesafb_probe(struct platform_device *dev) } if (type) { - int rc; - /* Find the largest power-of-two */ temp_size = roundup_pow_of_two(temp_size); /* Try and find a power of two to add */ do { - rc = mtrr_add(vesafb_fix.smem_start, temp_size, + vfb_info->mtrr_hdl = mtrr_add(vesafb_fix.smem_start, temp_size, type, 1); temp_size >>= 1; - } while (temp_size >= PAGE_SIZE && rc == -EINVAL); + } while (temp_size >= PAGE_SIZE && vfb_info->mtrr_hdl == -EINVAL); } } #endif -- cgit v1.2.3 From 6b49db640c6bba945d2b5cec77edf7170c2d11a5 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 27 Jul 2010 06:06:07 -0700 Subject: UBUNTU: SAUCE: fix pv-ops for legacy Xen Import fix_xen_guest_on_old_EC2.patch from fedora 14 Legacy hypervisors (RHEL 5.0 and RHEL 5.1) do not handle guest writes to cr4 gracefully. If a guest attempts to write a bit of cr4 that is unsupported, then the HV is so offended it crashes the domain. While later guest kernels (such as RHEL6) don't assume the HV supports all features, they do expect nicer responses. That assumption introduced code that probes whether or not xsave is supported early in the boot. So now when attempting to boot a RHEL6 guest on RHEL5.0 or RHEL5.1 an early crash will occur. This patch is quite obviously an undesirable hack. The real fix for this problem should be in the HV, and is, in later HVs. However, to support running on old HVs, RHEL6 can take this small change. No impact will occur for running on any RHEL HV (not even RHEL 5.5 supports xsave). There is only potential for guest performance loss on upstream Xen. All this by way of explanation for why is this patch not going upstream. Signed-off-by: John Johansen Signed-off-by: Tim Gardner Signed-off-by: Leann Ogasawara --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e74df9548a02..bfe6bc0c310d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -941,6 +941,7 @@ static void xen_write_cr4(unsigned long cr4) { cr4 &= ~X86_CR4_PGE; cr4 &= ~X86_CR4_PSE; + cr4 &= ~X86_CR4_OSXSAVE; native_write_cr4(cr4); } -- cgit v1.2.3 From 259e3b3bafcb2ead4a53329c67272c1e44cc13e4 Mon Sep 17 00:00:00 2001 From: "Mathieu J. Poirier" Date: Tue, 17 Aug 2010 11:16:41 -0400 Subject: UBUNTU: SAUCE: (no-up) ARM: Resetting power_mode to its original value. This reverts the second part of patch: commit 6da20c89af64b75302399369a90b9d50c1a87665 Author: Adrian Hunter Date: Mon Feb 15 10:03:34 2010 -0800 omap_hsmmc: Ensure regulator enable / disable are paired Without this the kernel fails to initialize the SDHC card and find the root partition. Work is currently underway with the community to find a real solution to the problem. This a temporary measure to unblock developers and will have to be reverted when the real fix gets upstream. BugLink: https://bugs.launchpad.net/bugs/591941 Signed-off-by: Mathieu Poirier Signed-off-by: Leann Ogasawara --- drivers/mmc/host/omap_hsmmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 9219ad0ed9a6..f67ab02f8e62 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -1858,7 +1858,7 @@ static int __devinit omap_hsmmc_probe(struct platform_device *pdev) host->slot_id = 0; host->mapbase = res->start + pdata->reg_offset; host->base = ioremap(host->mapbase, SZ_4K); - host->power_mode = MMC_POWER_OFF; + host->power_mode = -1; host->flags = AUTO_CMD12; host->next_data.cookie = 1; -- cgit v1.2.3 From b66c2c9330971503c63d995750c1d869a6b230c4 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 15 Sep 2010 10:28:20 -0700 Subject: UBUNTU: SAUCE: Improve Amazon EBS performance for EC2 OriginalAuthor: Amazona from Ben Howard BugLink: http://bugs.launchpad.net/bugs/634316 The pv-ops kernel suffers from poor performance when using Amazon's Elastic block storage (EBS). This patch from Amazon improves pv-ops kernel performance, and has not exhibited any regressions. Signed-off-by: John Johansen Signed-off-by: Leann Ogasawara --- drivers/block/xen-blkfront.c | 120 ++++++++++++++++++++++++++++----------- include/xen/interface/io/blkif.h | 12 ++++ 2 files changed, 100 insertions(+), 32 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 60eed4bdd2e4..31b3541f8096 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -73,7 +73,8 @@ struct blk_shadow { static DEFINE_MUTEX(blkfront_mutex); static const struct block_device_operations xlvbd_block_fops; -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) +#define BLK_MAX_RING_AREA_SIZE (BLKIF_MAX_NUM_RING_PAGES * PAGE_SIZE) +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, BLK_MAX_RING_AREA_SIZE) /* * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -89,14 +90,15 @@ struct blkfront_info int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref; + int num_ring_pages; + int ring_ref[BLKIF_MAX_NUM_RING_PAGES]; struct blkif_front_ring ring; struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int evtchn, irq; struct request_queue *rq; struct work_struct work; struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_RING_SIZE]; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; unsigned long shadow_free; unsigned int feature_flush; unsigned int flush_op; @@ -135,7 +137,8 @@ static DEFINE_SPINLOCK(minor_lock); static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE); + int ring_size = __RING_SIZE((struct blkif_sring *)0, info->num_ring_pages * PAGE_SIZE); + BUG_ON(free >= ring_size); info->shadow_free = info->shadow[free].req.u.rw.id; info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; @@ -685,6 +688,8 @@ static void blkif_restart_queue(struct work_struct *work) static void blkif_free(struct blkfront_info *info, int suspend) { + int i; + /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&info->io_lock); info->connected = suspend ? @@ -700,10 +705,17 @@ static void blkif_free(struct blkfront_info *info, int suspend) flush_work_sync(&info->work); /* Free resources associated with old device channel. */ - if (info->ring_ref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref, 0, - (unsigned long)info->ring.sring); - info->ring_ref = GRANT_INVALID_REF; + for (i = 0; i < info->num_ring_pages; i++) { + /* Free resources associated with old device channel. */ + if (info->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref[i], 0, 0L); + info->ring_ref[i] = GRANT_INVALID_REF; + } + } + if (info->ring.sring) { + int ring_area_size = info->num_ring_pages * PAGE_SIZE; + free_pages((unsigned long)info->ring.sring, + get_order(ring_area_size)); info->ring.sring = NULL; } if (info->irq) @@ -828,27 +840,32 @@ static int setup_blkring(struct xenbus_device *dev, struct blkfront_info *info) { struct blkif_sring *sring; - int err; + int i, order, err; + int ring_area_size = info->num_ring_pages * PAGE_SIZE; - info->ring_ref = GRANT_INVALID_REF; + for (i = 0; i < info->num_ring_pages; i++) { + info->ring_ref[i] = GRANT_INVALID_REF; + } - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH); + order = get_order(ring_area_size); + sring = (struct blkif_sring *)__get_free_pages(GFP_KERNEL, order); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); - - sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); - if (err < 0) { - free_page((unsigned long)sring); - info->ring.sring = NULL; - goto fail; + FRONT_RING_INIT(&info->ring, sring, ring_area_size); + + for (i = 0; i < info->num_ring_pages; i++) { + unsigned long addr = (unsigned long)info->ring.sring + i * PAGE_SIZE; + err = xenbus_grant_ring(dev, virt_to_mfn(addr)); + if (err < 0) { + free_pages((unsigned long)sring, order); + info->ring.sring = NULL; + goto fail; + } + info->ring_ref[i] = err; } - info->ring_ref = err; err = xenbus_alloc_evtchn(dev, &info->evtchn); if (err) @@ -877,7 +894,13 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err; + int err, i; + + BUILD_BUG_ON(BLKIF_MAX_NUM_RING_PAGES != 1 && + BLKIF_MAX_NUM_RING_PAGES != 2 && + BLKIF_MAX_NUM_RING_PAGES != 4 && + BLKIF_MAX_NUM_RING_PAGES != 8 && + BLKIF_MAX_NUM_RING_PAGES != 16); /* Create shared ring, alloc event channel. */ err = setup_blkring(dev, info); @@ -891,11 +914,30 @@ again: goto destroy_blkring; } - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref); - if (err) { - message = "writing ring-ref"; - goto abort_transaction; + if (info->num_ring_pages == 1) { + err = xenbus_printf(xbt, dev->nodename, + "ring-ref", "%u", info->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + err = xenbus_printf(xbt, dev->nodename, "num-ring-pages", "%u", + info->num_ring_pages); + if (err) { + message = "writing num-ring-pages"; + goto abort_transaction; + } + for (i = 0; i < info->num_ring_pages; i++) { + char buf[16]; + snprintf(buf, sizeof(buf), "ring-ref%d", i); + err = xenbus_printf(xbt, dev->nodename, buf, "%u", + info->ring_ref[i]); + if (err) { + message = "writing ring-refs"; + goto abort_transaction; + } + } } err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", info->evtchn); @@ -943,6 +985,7 @@ static int blkfront_probe(struct xenbus_device *dev, { int err, vdevice, i; struct blkfront_info *info; + int ring_size, max_ring_pages; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -956,6 +999,10 @@ static int blkfront_probe(struct xenbus_device *dev, return err; } } + err = xenbus_scanf(XBT_NIL, dev->otherend, + "max-ring-pages", "%u", &max_ring_pages); + if (err != 1) + max_ring_pages = 1; if (xen_hvm_domain()) { char *type; @@ -999,9 +1046,13 @@ static int blkfront_probe(struct xenbus_device *dev, info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); - for (i = 0; i < BLK_RING_SIZE; i++) + info->num_ring_pages = min(max_ring_pages, BLKIF_MAX_NUM_RING_PAGES); + + ring_size = __RING_SIZE((struct blkif_sring *)0, + info->num_ring_pages * PAGE_SIZE); + for (i = 0; i < ring_size; i++) info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; + info->shadow[ring_size-1].req.u.rw.id = 0x0fffffff; /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -1014,6 +1065,9 @@ static int blkfront_probe(struct xenbus_device *dev, return err; } + printk(KERN_INFO "blkfront %s num-ring-pages %d nr_ents %d.\n", + dev->nodename, info->num_ring_pages, ring_size); + return 0; } @@ -1024,6 +1078,7 @@ static int blkif_recover(struct blkfront_info *info) struct blkif_request *req; struct blk_shadow *copy; int j; + int ring_size = __RING_SIZE((struct blkif_sring *)0, info->num_ring_pages * PAGE_SIZE); /* Stage 1: Make a safe copy of the shadow state. */ copy = kmalloc(sizeof(info->shadow), @@ -1034,13 +1089,13 @@ static int blkif_recover(struct blkfront_info *info) /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE; i++) + for (i = 0; i < ring_size; i++) info->shadow[i].req.u.rw.id = i+1; info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; + info->shadow[ring_size-1].req.u.rw.id = 0x0fffffff; /* Stage 3: Find pending requests and requeue them. */ - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < ring_size; i++) { /* Not in use? */ if (!copy[i].request) continue; @@ -1511,3 +1566,4 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR); MODULE_ALIAS("xen:vbd"); MODULE_ALIAS("xenblk"); + diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index ee338bfde18b..31063bd1b691 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -168,6 +168,18 @@ struct blkif_response { DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response); +/* + * Maximum number of pages used for a blkif ring + * max-ring-pages advertised by blkback to blkfront may be lowered at blkback + * mod load time. Load time param set to default. + */ +#define BLKIF_MAX_NUM_RING_PAGES 16 +#define BLKIF_MAX_NUM_RING_PAGES_DFLT 4 +#if BLKIF_MAX_NUM_RING_PAGES < BLKIF_MAX_NUM_RING_PAGES_DFLT +#undef BLKIF_MAX_NUM_RING_PAGES_DFLT +#define BLKIF_MAX_NUM_RING_PAGES_DFLT BLKIF_MAX_NUM_RING_PAGES +#endif + #define VDISK_CDROM 0x1 #define VDISK_REMOVABLE 0x2 #define VDISK_READONLY 0x4 -- cgit v1.2.3 From 74c82944f2d70251cba4fab04a6679d6f66cfee5 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 29 Jul 2010 16:48:21 +0100 Subject: UBUNTU: SAUCE: drm -- stop early access to drm devices When a drm driver is initialised we first allocate and initialise the drm minor numbers including creating the sysfs files, then we trigger the driver load method. The act of creating the sysfs files triggers the uevent. This means udev may start programs which open /dev/dri/card0 and other interfaces, this can occur before the load method has even started and thus before the driver has fully initialised its data structures. In the case of plymouthd this leads to it opening and closing (in disgust) the interface, which in turn leads to a kernel panic as the mutexes are yet to be initialised. This patch delays the linking up of the drm devices minor numbers until the driver is fully initialised. As it is possible for consumers of these interfaces to reach them before they are fully initialised we arrange for opens of these devices to return EAGAIN until the device is fully initialised. Signed-off-by: Andy Whitcroft Acked-by: Stefan Bader Signed-off-by: Leann Ogasawara --- drivers/gpu/drm/drm_fops.c | 8 ++++++-- drivers/gpu/drm/drm_pci.c | 4 ++++ drivers/gpu/drm/drm_platform.c | 4 ++++ drivers/gpu/drm/drm_stub.c | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c index 123de28f94ef..9c14f633772c 100644 --- a/drivers/gpu/drm/drm_fops.c +++ b/drivers/gpu/drm/drm_fops.c @@ -129,7 +129,8 @@ int drm_open(struct inode *inode, struct file *filp) minor = idr_find(&drm_minors_idr, minor_id); if (!minor) return -ENODEV; - + if (IS_ERR(minor)) + return PTR_ERR(minor); if (!(dev = minor->dev)) return -ENODEV; @@ -180,7 +181,10 @@ int drm_stub_open(struct inode *inode, struct file *filp) minor = idr_find(&drm_minors_idr, minor_id); if (!minor) goto out; - + if (IS_ERR(minor)) { + err = PTR_ERR(minor); + goto out; + } if (!(dev = minor->dev)) goto out; diff --git a/drivers/gpu/drm/drm_pci.c b/drivers/gpu/drm/drm_pci.c index 13f3d936472f..b3216724122a 100644 --- a/drivers/gpu/drm/drm_pci.c +++ b/drivers/gpu/drm/drm_pci.c @@ -367,6 +367,10 @@ int drm_get_pci_dev(struct pci_dev *pdev, const struct pci_device_id *ent, list_add_tail(&dev->driver_item, &driver->device_list); + if (drm_core_check_feature(dev, DRIVER_MODESET)) + idr_replace(&drm_minors_idr, dev->control, dev->control->index); + idr_replace(&drm_minors_idr, dev->primary, dev->primary->index); + DRM_INFO("Initialized %s %d.%d.%d %s for %s on minor %d\n", driver->name, driver->major, driver->minor, driver->patchlevel, driver->date, pci_name(pdev), dev->primary->index); diff --git a/drivers/gpu/drm/drm_platform.c b/drivers/gpu/drm/drm_platform.c index 82431dcae37b..d749389a15a0 100644 --- a/drivers/gpu/drm/drm_platform.c +++ b/drivers/gpu/drm/drm_platform.c @@ -90,6 +90,10 @@ int drm_get_platform_dev(struct platform_device *platdev, list_add_tail(&dev->driver_item, &driver->device_list); + if (drm_core_check_feature(dev, DRIVER_MODESET)) + idr_replace(&drm_minors_idr, dev->control, dev->control->index); + idr_replace(&drm_minors_idr, dev->primary, dev->primary->index); + mutex_unlock(&drm_global_mutex); DRM_INFO("Initialized %s %d.%d.%d %s on minor %d\n", diff --git a/drivers/gpu/drm/drm_stub.c b/drivers/gpu/drm/drm_stub.c index 21bcd4a555d8..0251c38ac267 100644 --- a/drivers/gpu/drm/drm_stub.c +++ b/drivers/gpu/drm/drm_stub.c @@ -356,7 +356,7 @@ int drm_get_minor(struct drm_device *dev, struct drm_minor **minor, int type) new_minor->index = minor_id; INIT_LIST_HEAD(&new_minor->master_list); - idr_replace(&drm_minors_idr, new_minor, minor_id); + idr_replace(&drm_minors_idr, ERR_PTR(-EAGAIN), minor_id); if (type == DRM_MINOR_LEGACY) { ret = drm_proc_init(new_minor, minor_id, drm_proc_root); -- cgit v1.2.3 From eb3936b173eafecc8ebfb2abac322cdc084ef0ab Mon Sep 17 00:00:00 2001 From: Manoj Iyer Date: Mon, 6 Apr 2009 10:58:49 -0500 Subject: UBUNTU: SAUCE: Added quirk to recognize GE0301 3G modem as an interface. OriginalAuthor: Timo Aaltonen BugLink: http://bugs.launchpad.net/bugs/348861 Signed-off-by: Manoj Iyer Signed-off-by: Tim Gardner Signed-off-by: Andy Whitcroft Acked-by: Tim Gardner Acked-by: Stefan Bader --- drivers/usb/storage/unusual_devs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 1719886bb9be..d1630aeae363 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1163,6 +1163,13 @@ UNUSUAL_DEV( 0x0af0, 0x6971, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, option_ms_init, 0), +/* Reported by Timo Aaltonen */ +UNUSUAL_DEV( 0x0af0, 0x7011, 0x0000, 0x9999, + "Option", + "Mass Storage", + USB_SC_DEVICE, USB_PR_DEVICE, option_ms_init, + 0 ), + /* Reported by F. Aben * This device (wrongly) has a vendor-specific device descriptor. * The entry is needed so usb-storage can bind to it's mass-storage -- cgit v1.2.3 From 35cad26cdcce2616351bd6a92089481f9b5558cb Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 3 Dec 2010 09:51:33 +0000 Subject: UBUNTU: (no-up) add support for installed header files to ubuntu directory BugLink: http://bugs.launchpad.net/bugs/684666 We need the aufs headers in the linux-libc-headers, add support for including files from the ubuntu include directory. Signed-off-by: Andy Whitcroft --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index b1e029a80064..c1464f4dd33d 100644 --- a/Makefile +++ b/Makefile @@ -905,6 +905,7 @@ headers_install: __headers $(error Headers not exportable for the $(SRCARCH) architecture)) $(Q)$(MAKE) $(hdr-inst)=include $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/asm $(hdr-dst) + $(Q)$(MAKE) $(hdr-inst)=ubuntu/include dst=include oldheaders= PHONY += headers_check_all headers_check_all: headers_install_all @@ -914,6 +915,7 @@ PHONY += headers_check headers_check: headers_install $(Q)$(MAKE) $(hdr-inst)=include HDRCHECK=1 $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/asm $(hdr-dst) HDRCHECK=1 + $(Q)$(MAKE) $(hdr-inst)=ubuntu/include dst=include oldheaders= HDRCHECK=1 # --------------------------------------------------------------------------- # Modules -- cgit v1.2.3 From 884e9eb0c01d97d847f8ebf7ada0417a5ba442f2 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 8 Dec 2010 20:29:46 +0000 Subject: UBUNTU: ubuntu: dm-raid4-5 -- follow changes to bio flags commit 7b6d91daee5cac6402186ff224c3af39d79f4a0e Author: Christoph Hellwig Date: Sat Aug 7 18:20:39 2010 +0200 block: unify flags for struct bio and struct request Follow the following transitions: bio_rw_flagged(bio, BIO_RW_BARRIER)) -> (bio->bi_rw & REQ_HARDBARRIER) -> (bio->bi_rw & REQ_FLUSH) Signed-off-by: Andy Whitcroft --- ubuntu/dm-raid4-5/dm-raid4-5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.c b/ubuntu/dm-raid4-5/dm-raid4-5.c index 7f4565fa3de3..504aee38d037 100644 --- a/ubuntu/dm-raid4-5/dm-raid4-5.c +++ b/ubuntu/dm-raid4-5/dm-raid4-5.c @@ -3228,7 +3228,7 @@ static void do_ios(struct raid_set *rs, struct bio_list *ios) * the input queue unless all work queues are empty * and the stripe cache is inactive. */ - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { + if (unlikely(bio->bi_rw & REQ_FLUSH)) { /* REMOVEME: statistics. */ atomic_inc(rs->stats + S_BARRIER); if (delay || -- cgit v1.2.3 From 823a2f892d4ad419252b04ccf8684983e8a61f6f Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Thu, 5 May 2011 06:51:39 -0700 Subject: UBUNTU: SAUCE: [arm] fixup __aeabi_uldivmod undefined build error When building on arm we run into the following build error due to gcc-4.6 optimizing do_div into a uldivmod call: ERROR: "__aeabi_uldivmod" [drivers/scsi/megaraid/megaraid_sas.ko] undefined! Inline some assembly to prevent the compiler optimization. Signed-off-by: Leann Ogasawara --- arch/arm/include/asm/div64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h index fe92ccf1d0b0..2dedab7ae089 100644 --- a/arch/arm/include/asm/div64.h +++ b/arch/arm/include/asm/div64.h @@ -73,6 +73,7 @@ #define do_div(n, base) \ ({ \ unsigned int __r, __b = (base); \ + asm("" : "+r" (__b)); \ if (!__builtin_constant_p(__b) || __b == 0 || \ (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \ /* non-constant divisor (or zero): slow path */ \ -- cgit v1.2.3 From 6a6d02fbe6ef09c80517383dcb019a336968dee6 Mon Sep 17 00:00:00 2001 From: Thomas Schlichter Date: Thu, 19 May 2011 14:42:26 +0100 Subject: UBUNTU: SAUCE: vesafb: mtrr module parameter is uint, not bool BugLink: http://bugs.launchpad.net/bugs/778043 As noted by the reporter the mtrr kernel command line option is actually a positive numeric not a boolean, move the module parameter we add to match. Acked-by: Leann Ogasawara Acked-by: Brad Figg Signed-off-by: Andy Whitcroft --- drivers/video/vesafb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index e3acd0131ee3..9081d294745a 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -74,7 +74,7 @@ module_param_named(vgapal, pmi_setpal, invbool, 0); MODULE_PARM_DESC(vgapal, "Use VGA for setting palette (default)"); module_param_named(pmipal, pmi_setpal, bool, 0); MODULE_PARM_DESC(pmipal, "Use PMI for setting palette"); -module_param(mtrr, bool, 0); +module_param(mtrr, uint, 0); MODULE_PARM_DESC(mtrr, "Enable MTRR support (default)"); module_param_named(nomtrr, mtrr, invbool, 0); MODULE_PARM_DESC(nomtrr, "Disable MTRR support"); -- cgit v1.2.3 From c9aa855f40b6bbb9f6f2c18e627c7b1ab2f25d4f Mon Sep 17 00:00:00 2001 From: Thomas Schlichter Date: Thu, 19 May 2011 14:44:05 +0100 Subject: UBUNTU: SAUCE: vesafb: enable mtrr WC by default BugLink: http://bugs.launchpad.net/bugs/778043 Move to enabling a write-combining MTRR by default, this then matches the uvesafb module. Acked-by: Leann Ogasawara Acked-by: Brad Figg Signed-off-by: Andy Whitcroft --- drivers/video/vesafb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index 9081d294745a..9d6995d2039f 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -55,7 +55,7 @@ static struct fb_fix_screeninfo vesafb_fix __initdata = { #ifndef MODULE static int inverse __read_mostly; #endif -static int mtrr __read_mostly; /* disable mtrr */ +static int mtrr __read_mostly = 3; /* disable mtrr */ static int vram_remap __initdata; /* Set amount of memory to be used */ static int vram_total __initdata; /* Set total amount of memory */ static int pmi_setpal __read_mostly = 1; /* pmi for palette changes ??? */ -- cgit v1.2.3 From 6c2a3f6868d8a5fc925f0b3c36cb945d8b3061fa Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 23 May 2011 15:06:50 +0100 Subject: UBUNTU: SAUCE: S3 early resume debug via keyboard LEDs Add support to debug S3 early resume by flashing the keyboard LEDs three times in the realmode path. This is useful to allow one to determine if S3 hangs occur in the BIOS or during the early resume phase. Add kernel parameter acpi_sleep=s3_leds to enable the s3 debugging option. This can also be enabled by writing 8 to /proc/sys/kernel/acpi_video_flags. Signed-off-by: Colin Ian King Acked-by: Stefan Bader Acked-by: John Johansen Signed-off-by: Leann Ogasawara --- Documentation/kernel-parameters.txt | 5 ++++- arch/x86/kernel/acpi/sleep.c | 2 ++ arch/x86/realmode/rm/wakemain.c | 27 +++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a92c5ebf373e..d79efa337357 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -252,12 +252,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. For broken nForce2 BIOS resulting in XT-PIC timer. acpi_sleep= [HW,ACPI] Sleep options - Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, + Format: { s3_bios, s3_mode, s3_beep, s3_leds, s4_nohwsig, old_ordering, nonvs, sci_force_enable } See Documentation/power/video.txt for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. + s3_leds is for debugging; it flashes the keyboard LEDs + 3 times as soon as the kernel's real-mode entry point is + called. s4_nohwsig prevents ACPI hardware signature from being used during resume from hibernation. old_ordering causes the ACPI 1.0 ordering of the _PTS diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 95bf99de9058..79357aae9f10 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -94,6 +94,8 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; + if (strncmp(str, "s3_leds", 7) == 0) + acpi_realmode_flags |= 8; #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c index 91405d515ec6..810b9c13c7ab 100644 --- a/arch/x86/realmode/rm/wakemain.c +++ b/arch/x86/realmode/rm/wakemain.c @@ -61,6 +61,30 @@ static void send_morse(const char *pattern) } } +#define I8042_STATUS_REG 0x64 +#define I8042_DATA_REG 0x60 +#define I8042_SET_LED_BITS 0xed +#define I8042_STR_IBF 0x02 + +static void flash_keyboard_leds(void) +{ + int i; + unsigned char leds = 7; + + /* Flash keyboard LEDs 3 times */ + for (i = 0; i < 6; i++) { + while (inb(I8042_STATUS_REG) & I8042_STR_IBF) + ; + outb(I8042_SET_LED_BITS, I8042_DATA_REG); + while (inb(I8042_STATUS_REG) & I8042_STR_IBF) + ; + outb(leds, I8042_DATA_REG); + leds ^= 7; + udelay(500000); + } +} + + void main(void) { /* Kill machine if structures are wrong */ @@ -79,4 +103,7 @@ void main(void) probe_cards(0); set_mode(wakeup_header.video_mode); } + + if (wakeup_header.realmode_flags & 8) + flash_keyboard_leds(); } -- cgit v1.2.3 From b15dedd2b55cbffe2271866cc6f3f0dad2ecc318 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Thu, 26 May 2011 13:19:57 +0200 Subject: UBUNTU: SAUCE: Convert dm-raid45 to new block plugging Plugging for IOs to block devices was changed to an explicit, per task base. This converts the module to the new framework, fixing the compile failure. Signed-off-by: Stefan Bader Signed-off-by: Leann Ogasawara --- ubuntu/dm-raid4-5/dm-raid4-5.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.c b/ubuntu/dm-raid4-5/dm-raid4-5.c index 504aee38d037..fcc782c80fb8 100644 --- a/ubuntu/dm-raid4-5/dm-raid4-5.c +++ b/ubuntu/dm-raid4-5/dm-raid4-5.c @@ -3275,18 +3275,6 @@ static void do_ios(struct raid_set *rs, struct bio_list *ios) bio_list_merge_head(ios, &reject); } -/* Unplug: let any queued io role on the sets devices. */ -static void do_unplug(struct raid_set *rs) -{ - struct raid_dev *dev = rs->dev + rs->set.raid_devs; - - while (dev-- > rs->dev) { - /* Only call any device unplug function, if io got queued. */ - if (TestClearDevIoQueued(dev)) - blk_unplug(bdev_get_queue(dev->dev->bdev)); - } -} - /* Send an event in case we're getting too busy. */ static void do_busy_event(struct raid_set *rs) { @@ -3326,6 +3314,7 @@ static void do_raid(struct work_struct *ws) struct raid_set *rs = container_of(ws, struct raid_set, io.dws_do_raid.work); struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in; + struct blk_plug plug; /* * We always need to end io, so that ios can get errored in @@ -3342,8 +3331,9 @@ static void do_raid(struct work_struct *ws) do_sc_resize(rs); /* Try to recover regions. */ + blk_start_plug(&plug); do_recovery(rs); - do_unplug(rs); /* Unplug the sets device queues. */ + blk_finish_plug(&plug); /* Unplug the queue */ /* Quickly grab all new ios queued and add them to the work list. */ mutex_lock(&rs->io.in_lock); @@ -3351,11 +3341,12 @@ static void do_raid(struct work_struct *ws) bio_list_init(ios_in); mutex_unlock(&rs->io.in_lock); + blk_start_plug(&plug); if (!bio_list_empty(ios)) do_ios(rs, ios); /* Got ios to work into the cache. */ do_flush(rs); /* Flush any stripes on io list. */ - do_unplug(rs); /* Unplug the sets device queues. */ + blk_finish_plug(&plug); /* Unplug the queue */ do_busy_event(rs); /* Check if we got too busy. */ } -- cgit v1.2.3 From f861c36c42fe065901a6f2d9ae8379088577737a Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Mon, 30 May 2011 16:35:15 -0700 Subject: UBUNTU: ubuntu: dm-raid4-5 fix up build failure ubuntu-2.6/ubuntu/dm-raid4-5/dm-raid4-5.c:1579:9: error: too many arguments to function 'dm_io_client_create' The above build failure was a result of upstream commit: commit bda8efec5c706a672e0714d341a342e811f0262a Author: Mikulas Patocka Date: Sun May 29 13:03:09 2011 +0100 dm io: use fixed initial mempool size Signed-off-by: Leann Ogasawara --- ubuntu/dm-raid4-5/dm-raid4-5.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.c b/ubuntu/dm-raid4-5/dm-raid4-5.c index fcc782c80fb8..88db7958b8d8 100644 --- a/ubuntu/dm-raid4-5/dm-raid4-5.c +++ b/ubuntu/dm-raid4-5/dm-raid4-5.c @@ -1574,17 +1574,14 @@ static int sc_init(struct raid_set *rs, unsigned stripes) /* Create dm-io client context for IO stripes. */ sc->dm_io_client = - dm_io_client_create((stripes > 32 ? 32 : stripes) * - rs->set.raid_devs * - chunk_pages(rs->set.io_size)); + dm_io_client_create(); if (IS_ERR(sc->dm_io_client)) return PTR_ERR(sc->dm_io_client); /* FIXME: intermingeled with stripe cache initialization. */ /* Create dm-io client context for recovery stripes. */ rec->dm_io_client = - dm_io_client_create(rstripes * rs->set.raid_devs * - chunk_pages(rec->io_size)); + dm_io_client_create(); if (IS_ERR(rec->dm_io_client)) return PTR_ERR(rec->dm_io_client); -- cgit v1.2.3 From a60bcee09919a0dd7ed3855934d81b28e7c38dcf Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 29 Jul 2010 09:46:41 +0100 Subject: UBUNTU: SAUCE: (no-up) add tracing for user initiated readahead requests Track pages which undergo readahead and for each record which were actually consumed, via either read or faulted into a map. This allows userspace readahead applications (such as ureadahead) to track which pages in core at the end of a boot are actually required and generate an optimal readahead pack. It also allows pack adjustment and optimisation in parallel with readahead, allowing the pack to evolve to be accurate as userspace paths change. The status of the pages are reported back via the mincore() call using a newly allocated bit. Signed-off-by: Andy Whitcroft Acked-by: Stefan Bader Signed-off-by: Leann Ogasawara --- include/linux/page-flags.h | 3 +++ mm/filemap.c | 3 +++ mm/memory.c | 7 ++++++- mm/mincore.c | 2 ++ mm/page_alloc.c | 1 + mm/readahead.c | 1 + 6 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c88d2a9451af..3610ab7659f9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -108,6 +108,7 @@ enum pageflags { #ifdef CONFIG_TRANSPARENT_HUGEPAGE PG_compound_lock, #endif + PG_readaheadunused, /* user oriented readahead as yet unused*/ __NR_PAGEFLAGS, /* Filesystems */ @@ -231,6 +232,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk) PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */ +PAGEFLAG(ReadaheadUnused, readaheadunused) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not diff --git a/mm/filemap.c b/mm/filemap.c index a4a5260b0279..c97ce2627a8a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1304,6 +1304,9 @@ int file_read_actor(read_descriptor_t *desc, struct page *page, if (size > count) size = count; + if (PageReadaheadUnused(page)) + ClearPageReadaheadUnused(page); + /* * Faults on the destination of a read are common, so do it before * taking the kmap. diff --git a/mm/memory.c b/mm/memory.c index 1b7dc662bf9f..fcee128452b0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3231,10 +3231,15 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, else VM_BUG_ON(!PageLocked(vmf.page)); + page = vmf.page; + + /* Mark the page as used on fault. */ + if (PageReadaheadUnused(page)) + ClearPageReadaheadUnused(page); + /* * Should we do an early C-O-W break? */ - page = vmf.page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { page = cow_page; diff --git a/mm/mincore.c b/mm/mincore.c index 936b4cee8cb1..7c2874a9a731 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -80,6 +80,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) #endif if (page) { present = PageUptodate(page); + if (present) + present |= (PageReadaheadUnused(page) << 7); page_cache_release(page); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bbfaa29232f1..e08dc611d74a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5971,6 +5971,7 @@ static const struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE {1UL << PG_compound_lock, "compound_lock" }, #endif + {1UL << PG_readaheadunused, "readaheadunused"}, }; static void dump_page_flags(unsigned long flags) diff --git a/mm/readahead.c b/mm/readahead.c index ea8f8fa21649..3de0a1975735 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -189,6 +189,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, list_add(&page->lru, &page_pool); if (page_idx == nr_to_read - lookahead_size) SetPageReadahead(page); + SetPageReadaheadUnused(page); ret++; } -- cgit v1.2.3 From bf4795ca1b15f2e99493d2c0beba0831c844d58f Mon Sep 17 00:00:00 2001 From: John Rigby Date: Tue, 12 Jun 2012 16:14:33 -0600 Subject: UBUNTU: SAUCE: (no-up) vfs: Add a trace point in the mark_inode_dirty function [apw@canonical.com: This has no upstream traction but is used by powertop, so its worth carrying.] PowerTOP would like to be able to show who is keeping the disk busy by dirtying data. The most logical spot for this is in the vfs in the mark_inode_dirty() function. Doing this on the block level is not possible because by the time the IO hits the block layer the guilty party can no longer be found ("kjournald" and "pdflush" are not useful answers to "who caused this file to be dirty). The trace point follows the same logic/style as the block_dump code and pretty much dumps the same data, just not to dmesg (and thus to /var/log/messages) but via the trace events streams. Note: This patch was posted to lkml and might potentially go into 2.6.33 but I have not seen which maintainer will take it. Signed-of-by: Arjan van de Ven Signed-off-by: Amit Kucheria Signed-off-by: Andy Whitcroft Conflicts: fs/fs-writeback.c --- fs/fs-writeback.c | 3 +++ fs/inode.c | 4 ++++ include/trace/events/vfs.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 include/trace/events/vfs.h diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9c9e0cba7c06..74b3a13afdba 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "internal.h" /* @@ -1170,6 +1171,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) if ((inode->i_state & flags) == flags) return; + trace_dirty_inode(inode, current); + if (unlikely(block_dump > 1)) block_dump___mark_inode_dirty(inode); diff --git a/fs/inode.c b/fs/inode.c index c99163b1b310..ebc1c1091680 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1883,3 +1883,7 @@ void inode_dio_done(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } EXPORT_SYMBOL(inode_dio_done); + +#define CREATE_TRACE_POINTS +#include + diff --git a/include/trace/events/vfs.h b/include/trace/events/vfs.h new file mode 100644 index 000000000000..261112909819 --- /dev/null +++ b/include/trace/events/vfs.h @@ -0,0 +1,55 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vfs + +#include + +#if !defined(_TRACE_VFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VFS_H + +/* + * Tracepoint for dirtying an inode: + */ +TRACE_EVENT(dirty_inode, + + TP_PROTO(struct inode *inode, struct task_struct *task), + + TP_ARGS(inode, task), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __array( char, dev, 16 ) + __array( char, file, 32 ) + ), + + TP_fast_assign( + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->pid = task->pid; + strlcpy(__entry->file, name, 32); + strlcpy(__entry->dev, inode->i_sb->s_id, 16); + + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } + ), + + TP_printk("task=%i (%s) file=%s dev=%s", + __entry->pid, __entry->comm, __entry->file, __entry->dev) +); + +#endif /* _TRACE_VFS_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From c363cd46a110ddb2fa274b44ee894d5f468e8763 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 29 Jul 2010 16:36:30 +0100 Subject: UBUNTU: SAUCE: vt -- maintain bootloader screen mode and content until vt switch Introduce a new VT mode KD_TRANSPARENT which endevours to leave the current content of the framebuffer untouched. This allows the bootloader to insert a graphical splash and have the kernel maintain it until the OS splash can take over. When we finally switch away (either through programs like plymouth or manually) the content is lost and the VT reverts to text mode. Signed-off-by: Andy Whitcroft Acked-by: Stefan Bader Signed-off-by: Leann Ogasawara --- drivers/tty/vt/vt.c | 36 +++++++++++++++++++++++++++++++----- drivers/tty/vt/vt_ioctl.c | 10 +++++----- include/linux/kd.h | 1 + include/linux/vt_kern.h | 2 +- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 84cbf298c094..8e865d321462 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -102,6 +102,7 @@ #include #include #include +#include #define MAX_NR_CON_DRIVER 16 @@ -144,7 +145,7 @@ static const struct consw *con_driver_map[MAX_NR_CONSOLES]; static int con_open(struct tty_struct *, struct file *); static void vc_init(struct vc_data *vc, unsigned int rows, - unsigned int cols, int do_clear); + unsigned int cols, int do_clear, int mode); static void gotoxy(struct vc_data *vc, int new_x, int new_y); static void save_cur(struct vc_data *vc); static void reset_terminal(struct vc_data *vc, int do_clear); @@ -165,6 +166,9 @@ module_param(global_cursor_default, int, S_IRUGO | S_IWUSR); static int cur_default = CUR_DEFAULT; module_param(cur_default, int, S_IRUGO | S_IWUSR); +int vt_handoff = 0; +module_param_named(handoff, vt_handoff, int, S_IRUGO | S_IWUSR); + /* * ignore_poke: don't unblank the screen when things are typed. This is * mainly for the privacy of braille terminal users. @@ -694,6 +698,13 @@ void redraw_screen(struct vc_data *vc, int is_switch) } if (tty0dev) sysfs_notify(&tty0dev->kobj, NULL, "active"); + /* + * If we are switching away from a transparent VT the contents + * will be lost, convert it into a blank text console then + * it will be repainted blank if we ever switch back. + */ + if (old_vc->vc_mode == KD_TRANSPARENT) + old_vc->vc_mode = KD_TEXT; } else { hide_cursor(vc); redraw = 1; @@ -807,7 +818,7 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ if (global_cursor_default == -1) global_cursor_default = 1; - vc_init(vc, vc->vc_rows, vc->vc_cols, 1); + vc_init(vc, vc->vc_rows, vc->vc_cols, 1, KD_TEXT); vcs_make_sysfs(currcons); atomic_notifier_call_chain(&vt_notifier_list, VT_ALLOCATE, ¶m); } @@ -2848,7 +2859,7 @@ module_param_named(italic, default_italic_color, int, S_IRUGO | S_IWUSR); module_param_named(underline, default_underline_color, int, S_IRUGO | S_IWUSR); static void vc_init(struct vc_data *vc, unsigned int rows, - unsigned int cols, int do_clear) + unsigned int cols, int do_clear, int mode) { int j, k ; @@ -2859,7 +2870,7 @@ static void vc_init(struct vc_data *vc, unsigned int rows, set_origin(vc); vc->vc_pos = vc->vc_origin; - reset_vc(vc); + reset_vc(vc, mode); for (j=k=0; j<16; j++) { vc->vc_palette[k++] = default_red[j] ; vc->vc_palette[k++] = default_grn[j] ; @@ -2916,16 +2927,31 @@ static int __init con_init(void) mod_timer(&console_timer, jiffies + (blankinterval * HZ)); } + if (vt_handoff > 0 && vt_handoff <= MAX_NR_CONSOLES) { + currcons = vt_handoff - 1; + vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT); + INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); + visual_init(vc, currcons, 1); + vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT); + vc_init(vc, vc->vc_rows, vc->vc_cols, 0, KD_TRANSPARENT); + } for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) { + if (currcons == vt_handoff - 1) + continue; vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT); INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); tty_port_init(&vc->port); visual_init(vc, currcons, 1); vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT); vc_init(vc, vc->vc_rows, vc->vc_cols, - currcons || !vc->vc_sw->con_save_screen); + currcons || !vc->vc_sw->con_save_screen, KD_TEXT); } currcons = fg_console = 0; + if (vt_handoff > 0) { + printk(KERN_INFO "vt handoff: transparent VT on vt#%d\n", + vt_handoff); + currcons = fg_console = vt_handoff - 1; + } master_display_fg = vc = vc_cons[currcons].d; set_origin(vc); save_screen(vc); diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index 64618547be11..ff5496af5faa 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -996,9 +996,9 @@ out: return ret; } -void reset_vc(struct vc_data *vc) +void reset_vc(struct vc_data *vc, int mode) { - vc->vc_mode = KD_TEXT; + vc->vc_mode = mode; vt_reset_unicode(vc->vc_num); vc->vt_mode.mode = VT_AUTO; vc->vt_mode.waitv = 0; @@ -1030,7 +1030,7 @@ void vc_SAK(struct work_struct *work) */ if (tty) __do_SAK(tty); - reset_vc(vc); + reset_vc(vc, KD_TEXT); } console_unlock(); } @@ -1287,7 +1287,7 @@ static void complete_change_console(struct vc_data *vc) * this outside of VT_PROCESS but there is no single process * to account for and tracking tty count may be undesirable. */ - reset_vc(vc); + reset_vc(vc, KD_TEXT); if (old_vc_mode != vc->vc_mode) { if (vc->vc_mode == KD_TEXT) @@ -1359,7 +1359,7 @@ void change_console(struct vc_data *new_vc) * this outside of VT_PROCESS but there is no single process * to account for and tracking tty count may be undesirable. */ - reset_vc(vc); + reset_vc(vc, KD_TEXT); /* * Fall through to normal (VT_AUTO) handling of the switch... diff --git a/include/linux/kd.h b/include/linux/kd.h index c36d8476db55..5cc3c171d5b9 100644 --- a/include/linux/kd.h +++ b/include/linux/kd.h @@ -45,6 +45,7 @@ struct consolefontdesc { #define KD_GRAPHICS 0x01 #define KD_TEXT0 0x02 /* obsolete */ #define KD_TEXT1 0x03 /* obsolete */ +#define KD_TRANSPARENT 0x04 #define KDGETMODE 0x4B3B /* get current mode */ #define KDMAPDISP 0x4B3C /* map display into address space */ diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h index 50ae7d0c279e..8f227f735b2f 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -129,7 +129,7 @@ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc) void vt_event_post(unsigned int event, unsigned int old, unsigned int new); int vt_waitactive(int n); void change_console(struct vc_data *new_vc); -void reset_vc(struct vc_data *vc); +void reset_vc(struct vc_data *vc, int mode); extern int unbind_con_driver(const struct consw *csw, int first, int last, int deflt); int vty_init(const struct file_operations *console_fops); -- cgit v1.2.3 From 4eb75c8488c3740aaeb804dbee4b49ea35327e4f Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 29 Jul 2010 16:36:31 +0100 Subject: UBUNTU: SAUCE: vt -- allow grub to request automatic vt_handoff Grub may be able to select a graphics mode and paint a splash screen for us. If so it needs to be able to tell us it has done so. Add support for detecting a new graphics mode selected bit in the screen_info passed over at boot. Use this to automatically enable vt_handoff mode. Signed-off-by: Andy Whitcroft Acked-by: Stefan Bader Signed-off-by: Leann Ogasawara --- drivers/tty/vt/vt.c | 7 +++++++ include/linux/screen_info.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 8e865d321462..509037447161 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -2896,6 +2896,13 @@ static int __init con_init(void) struct vc_data *vc; unsigned int currcons = 0, i; + if (screen_info.flags & VIDEO_FLAGS_HANDOFF) { + if (vt_handoff == 0) + vt_handoff = 8; + printk(KERN_INFO "vt handoff: grub requested handoff (vt#%d)\n", + vt_handoff); + } + console_lock(); if (conswitchp) diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h index 899fbb487c94..e699dd416f91 100644 --- a/include/linux/screen_info.h +++ b/include/linux/screen_info.h @@ -67,6 +67,7 @@ struct screen_info { #define VIDEO_TYPE_EFI 0x70 /* EFI graphic mode */ #define VIDEO_FLAGS_NOCURSOR (1 << 0) /* The video mode has no cursor set */ +#define VIDEO_FLAGS_HANDOFF (1 << 1) /* Video buffer is alredy painted */ #ifdef __KERNEL__ extern struct screen_info screen_info; -- cgit v1.2.3 From bdc96ca7d599b1fd5c2fee741a7dbef3f6e9b68a Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Wed, 31 Aug 2011 10:25:24 -0700 Subject: UBUNTU: SAUCE: (no-up) x86: reboot: Make Dell Latitude E6520 use reboot=pci The Dell Latitude E6520 doesn't reboot unless reboot=pci is set. BugLink: http://bugs.launchpad.net/bugs/833705 Cc: Signed-off-by: Leann Ogasawara Signed-off-by: Tim Gardner --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 25b48edb847c..76fca743ab02 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -451,6 +451,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), }, }, + { /* Handle problems with rebooting on the Latitude E6520. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6520", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6520"), + }, + }, { } }; -- cgit v1.2.3 From 685dd175a10a5dd4a186afad7b1f015f2187d845 Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Fri, 9 Sep 2011 13:23:51 -0700 Subject: UBUNTU: SAUCE: (no-up) x86: reboot: Make Dell Optiplex 790 use reboot=pci BugLink: http://bugs.launchpad.net/bugs/818933 The Dell Optiplex 790 doesn't reboot unless reboot=pci is set. Signed-off-by: Leann Ogasawara Signed-off-by: Tim Gardner --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 76fca743ab02..03fc26e2ea22 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -459,6 +459,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6520"), }, }, + { /* Handle problems with rebooting on the OptiPlex 790. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 790", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 790"), + }, + }, { } }; -- cgit v1.2.3 From 41bf177fc45db47a86235d4f889cac490cbb47f3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Sep 2011 10:01:24 +0800 Subject: UBUNTU: SAUCE: fireware: add NO_MSI quirks for o2micro controller Disable MSI for the O2 Micro, Inc. firewire controller. BugLink: http://bugs.launchpad.net/bugs/801719 Upstream: http://marc.info/?t=131475896500002&r=1&w=2 Signed-off-by: Ming Lei Signed-off-by: Tim Gardner --- drivers/firewire/ohci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index c1af05e834b6..6ac61488082e 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -320,6 +320,8 @@ static const struct { {PCI_VENDOR_ID_VIA, PCI_ANY_ID, PCI_ANY_ID, QUIRK_CYCLE_TIMER | QUIRK_NO_MSI}, + {PCI_VENDOR_ID_O2, PCI_ANY_ID, PCI_ANY_ID, + QUIRK_NO_MSI}, }; /* This overrides anything that was found in ohci_quirks[]. */ -- cgit v1.2.3 From 4b653c62a45e105515468f9a1afffa2f0f28e637 Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Wed, 14 Sep 2011 11:27:20 -0700 Subject: UBUNTU: SAUCE: (no-up) x86: reboot: Make Dell Optiplex 990 use reboot=pci BugLink: http://bugs.launchpad.net/bugs/768039 The Dell Optiplex 990 doesn't reboot unless reboot=pci is set. Signed-off-by: Leann Ogasawara Signed-off-by: Tim Gardner --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 03fc26e2ea22..60f820c8198c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -467,6 +467,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 790"), }, }, + { /* Handle problems with rebooting on the OptiPlex 990. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + }, + }, { } }; -- cgit v1.2.3 From 43653f13a532da8634b0e6e85fd2ea1d7ae061cb Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Fri, 26 Aug 2011 07:30:16 -0700 Subject: UBUNTU: SAUCE: (no-up) x86: reboot: Make Dell Latitude E6220 use reboot=pci BugLink: http://bugs.launchpad.net/bugs/838402 The Dell Latitude E6220 doesn't reboot unless reboot=pci is set. Signed-off-by: Leann Ogasawara Acked-by: Tim Gardner Acked-by: Seth Forshee --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 60f820c8198c..a108d81539ea 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -435,6 +435,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), }, }, + { /* Handle problems with rebooting on the Latitude E6220. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6220", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6220"), + }, + }, { /* Handle problems with rebooting on the Latitude E6420. */ .callback = set_pci_reboot, .ident = "Dell Latitude E6420", -- cgit v1.2.3 From 8ac5a681338a3cfdbbd9d17693c76fa53caf6b11 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 16 Sep 2011 13:20:31 +0100 Subject: UBUNTU: SAUCE: headers_install: fix #include "..." usage for userspace When headers are converted to userspace headers they may include relative includes. For example x86 has the following in its asm/posix_types.h: # ifdef __i386__ # include "posix_types_32.h" # else # include "posix_types_64.h" # endif However this is not safe in the face of the gcc option -I- which removes "the directory the file I am being included from" from the search list. Convert these references to form avoiding this. BugLink: http://bugs.launchpad.net/bugs/824377 Signed-off-by: Andy Whitcroft Acked-by: Tim Gardner Signed-off-by: Leann Ogasawara --- scripts/Makefile.headersinst | 6 +++--- scripts/headers_install.pl | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/Makefile.headersinst b/scripts/Makefile.headersinst index d3bae5e7b601..2a3c1b79b862 100644 --- a/scripts/Makefile.headersinst +++ b/scripts/Makefile.headersinst @@ -55,9 +55,9 @@ printdir = $(patsubst $(INSTALL_HDR_PATH)/%/,%,$(dir $@)) quiet_cmd_install = INSTALL $(printdir) ($(words $(all-files))\ file$(if $(word 2, $(all-files)),s)) cmd_install = \ - $(PERL) $< $(srctree)/$(obj) $(install) $(SRCARCH) $(header-y); \ - $(PERL) $< $(objtree)/$(obj) $(install) $(SRCARCH) $(objhdr-y); \ - $(PERL) $< $(objtree)/$(gen) $(install) $(SRCARCH) $(genhdr-y); \ + $(PERL) $< $(srctree)/$(obj) $(install) $(SRCARCH) $(printdir) $(header-y); \ + $(PERL) $< $(objtree)/$(obj) $(install) $(SRCARCH) $(printdir) $(objhdr-y); \ + $(PERL) $< $(objtree)/$(gen) $(install) $(SRCARCH) $(printdir) $(genhdr-y); \ for F in $(wrapper-files); do \ echo "\#include " > $(install)/$$F; \ done; \ diff --git a/scripts/headers_install.pl b/scripts/headers_install.pl index 48462be328bb..9f56fc501168 100644 --- a/scripts/headers_install.pl +++ b/scripts/headers_install.pl @@ -18,7 +18,9 @@ use strict; -my ($readdir, $installdir, $arch, @files) = @ARGV; +my ($readdir, $installdir, $arch, $printdir, @files) = @ARGV; + +$printdir =~ s@^include/@@; my $unifdef = "scripts/unifdef -U__KERNEL__ -D__EXPORTED_HEADERS__"; @@ -30,6 +32,10 @@ foreach my $file (@files) { open(my $out, '>', $tmpfile) or die "$tmpfile: $!\n"; while (my $line = <$in>) { + # Any #include which uses "" and does not have a path needs + # rewriting so that the resultant user space headers are + # safe against the use of -I-. + $line =~ s/^(\s*#\s*include\s+)"([^\/]*?)"/$1<$printdir\/$2>/; $line =~ s/([\s(])__user\s/$1/g; $line =~ s/([\s(])__force\s/$1/g; $line =~ s/([\s(])__iomem\s/$1/g; -- cgit v1.2.3 From 8fb753a2fc7335bf4711551e5c398123faee17d0 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Tue, 8 Nov 2011 10:42:43 -0700 Subject: UBUNTU: [Config] Disabled dm-raid4-5 Signed-off-by: Tim Gardner --- ubuntu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/Makefile b/ubuntu/Makefile index 4ec1f3d6677d..bad1866141f7 100644 --- a/ubuntu/Makefile +++ b/ubuntu/Makefile @@ -12,7 +12,7 @@ ## ## ## -obj-$(CONFIG_DM_RAID45) += dm-raid4-5/ +#obj-$(CONFIG_DM_RAID45) += dm-raid4-5/ ## ## ## -- cgit v1.2.3 From a6b531d74a9b4acb62e31c52ce49cc95358a717f Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Tue, 8 Nov 2011 10:52:05 -0700 Subject: UBUNTU: [Config] Disable vt6656 Signed-off-by: Tim Gardner --- drivers/staging/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index a987b3ad380b..3b32dd97dae0 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -27,7 +27,7 @@ obj-$(CONFIG_LINE6_USB) += line6/ obj-$(CONFIG_USB_SERIAL_QUATECH2) += serqt_usb2/ obj-$(CONFIG_OCTEON_ETHERNET) += octeon/ obj-$(CONFIG_VT6655) += vt6655/ -obj-$(CONFIG_VT6656) += vt6656/ +#obj-$(CONFIG_VT6656) += vt6656/ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_IPACK_BUS) += ipack/ obj-$(CONFIG_DX_SEP) += sep/ -- cgit v1.2.3 From c523dcf5c36a857741ef91c6419e543d8a8d8713 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 6 Jan 2011 18:42:23 +0000 Subject: UBUNTU: SAUCE: ensure root is ready before running usermodehelpers in it Signed-off-by: Andy Whitcroft --- include/linux/init.h | 2 -- include/linux/kmod.h | 2 ++ init/initramfs.c | 7 +++++++ init/main.c | 2 +- kernel/kmod.c | 2 ++ 5 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/init.h b/include/linux/init.h index ad021e87d5a3..bc7379f6625f 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -224,8 +224,6 @@ extern bool initcall_debug; static initcall_t __initcall_##fn \ __used __section(.security_initcall.init) = fn -extern struct list_head populate_rootfs_domain; - struct obs_kernel_param { const char *str; int (*setup_func)(char *); diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 5398d5807075..6f571b1be39e 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -71,6 +71,8 @@ call_usermodehelper_fns(char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); +void populate_rootfs_wait(void); + static inline int call_usermodehelper(char *path, char **argv, char **envp, int wait) { diff --git a/init/initramfs.c b/init/initramfs.c index 1ae8873570c6..c5a412346663 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -19,6 +19,7 @@ #include #include #include +#include static __initdata char *message; static void __init error(char *x) @@ -582,6 +583,12 @@ static void __init clean_rootfs(void) LIST_HEAD(populate_rootfs_domain); +void populate_rootfs_wait(void) +{ + async_synchronize_full_domain(&populate_rootfs_domain); +} +EXPORT_SYMBOL(populate_rootfs_wait); + static void __init async_populate_rootfs(void) { char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); diff --git a/init/main.c b/init/main.c index 5267741da32d..d0c1cfeb6084 100644 --- a/init/main.c +++ b/init/main.c @@ -893,7 +893,7 @@ static int __init kernel_init(void * unused) * We need to ensure that the filesystem is ready by this point, wait for * async_populate_rootfs to complete. */ - async_synchronize_full_domain(&populate_rootfs_domain); + populate_rootfs_wait(); /* * check if there is an early userspace init. If yes, let it do all diff --git a/kernel/kmod.c b/kernel/kmod.c index ff2c7cb86d77..9d1b6b26dd96 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -585,6 +585,8 @@ int call_usermodehelper_fns( struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; + populate_rootfs_wait(); + info = call_usermodehelper_setup(path, argv, envp, gfp_mask); if (info == NULL) -- cgit v1.2.3 From fbf7cc01929f5214fc8f9de404b66bb878d8efd1 Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Wed, 16 Nov 2011 10:30:48 -0800 Subject: UBUNTU: SAUCE: include and for mmc_core arm build Fixes arm build failure: drivers/net/ethernet/stmicro/stmmac/mmc_core.c:142:2: error: implicit declaration of function 'pr_debug' [-Werror=implicit-function-declaration] include/linux/printk.h:47:2: error: unknown type name 'va_list' Signed-off-by: Leann Ogasawara --- drivers/net/ethernet/stmicro/stmmac/mmc_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c index c07cfe989f6e..2de44aee73cb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c @@ -23,7 +23,9 @@ *******************************************************************************/ #include +#include #include +#include #include "mmc.h" /* MAC Management Counters register offset */ -- cgit v1.2.3 From 4bb1e1da1fc1ad29f04645d7944a3c8dea3e62db Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Fri, 9 Dec 2011 11:08:03 -0600 Subject: UBUNTU: SAUCE: dell-wmi: Demote unknown WMI event message to pr_debug BugLink: http://bugs.launchpad.net/bugs/581312 This message is informational in nature but is causing users to think that there's a problem. Demote to pr_debug to silence it by default. Signed-off-by: Seth Forshee Acked-by: Tim Gardner Acked-by: Herton Ronaldo Krzesinski Signed-off-by: Leann Ogasawara --- drivers/platform/x86/dell-wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c index fa9a2171cc13..3ab412c18d23 100644 --- a/drivers/platform/x86/dell-wmi.c +++ b/drivers/platform/x86/dell-wmi.c @@ -165,7 +165,7 @@ static void dell_wmi_notify(u32 value, void *context) u16 *buffer_entry = (u16 *)obj->buffer.pointer; if (dell_new_hk_type && (buffer_entry[1] != 0x10)) { - pr_info("Received unknown WMI event (0x%x)\n", + pr_debug("Received unknown WMI event (0x%x)\n", buffer_entry[1]); kfree(obj); return; -- cgit v1.2.3 From 5ee501ddf7ac51638381fe03046bb4631e369eca Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 2 Jul 2010 18:45:50 +0100 Subject: UBUNTU: ubuntu: AUFS -- add BOM and automated update script Signed-off-by: Andy Whitcroft Signed-off-by: Leann Ogasawara --- ubuntu/aufs-update | 37 +++++++++++++++++++++++++++++++++++++ ubuntu/aufs/BOM | 2 ++ 2 files changed, 39 insertions(+) create mode 100644 ubuntu/aufs-update create mode 100644 ubuntu/aufs/BOM diff --git a/ubuntu/aufs-update b/ubuntu/aufs-update new file mode 100644 index 000000000000..869e9d663cad --- /dev/null +++ b/ubuntu/aufs-update @@ -0,0 +1,37 @@ +#!/bin/bash + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " 1>&2 + exit 1 +fi +aufs="$1" + +# Get the current tip name +{ + read x url + read x osha1 +} Date: Fri, 3 Dec 2010 11:12:17 +0000 Subject: UBUNTU: ubuntu: AUFS -- include the aufs_types.h file in linux-libc-headers BugLink: http://bugs.launchpad.net/bugs/684666 Signed-off-by: Andy Whitcroft --- ubuntu/include/Kbuild | 1 + ubuntu/include/linux/Kbuild | 1 + 2 files changed, 2 insertions(+) create mode 100644 ubuntu/include/Kbuild create mode 100644 ubuntu/include/linux/Kbuild diff --git a/ubuntu/include/Kbuild b/ubuntu/include/Kbuild new file mode 100644 index 000000000000..c9a138f17b7e --- /dev/null +++ b/ubuntu/include/Kbuild @@ -0,0 +1 @@ +header-y += linux/ diff --git a/ubuntu/include/linux/Kbuild b/ubuntu/include/linux/Kbuild new file mode 100644 index 000000000000..50d9e8c3c627 --- /dev/null +++ b/ubuntu/include/linux/Kbuild @@ -0,0 +1 @@ +header-y += aufs_type.h -- cgit v1.2.3 From 69013aeb564f79ee30d195929b5ceedc780b60b8 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 6 Jan 2011 16:12:34 +0000 Subject: UBUNTU: ubuntu: AUFS -- update aufs-update to track new locations of headers Track the new location of the headers as per the commit below: commit de699ab60a2f8a55b9c8313a04c7863897fb88bd Author: Andy Whitcroft Date: Fri Dec 3 11:12:17 2010 +0000 UBUNTU: ubuntu: AUFS -- include the aufs_types.h file in linux-libc-headers Signed-off-by: Andy Whitcroft --- ubuntu/aufs-update | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ubuntu/aufs-update b/ubuntu/aufs-update index 869e9d663cad..d1ac0f97ca0c 100644 --- a/ubuntu/aufs-update +++ b/ubuntu/aufs-update @@ -15,18 +15,19 @@ aufs="$1" # Update aufs to the latest. git rm -rf aufs cp -rp "$aufs/fs/aufs" aufs -cp -rp "$aufs/include" aufs +#cp -rp "$aufs/include" aufs +cp -rp "$aufs/include/linux"/*.h include/linux git checkout -f HEAD -- aufs/BOM # Reinsert the include update. -sed -i -e '1iEXTRA_CFLAGS += -I$(src)/include' aufs/Makefile +#sed -i -e '1iEXTRA_CFLAGS += -I$(src)/include' aufs/Makefile # Find the latest commit. read x nsha1 <"$aufs/ChangeLog" # Insert the new commit ID and commit the result. sed -i -e "s/^COMMIT: .*/COMMIT: $nsha1/" aufs/BOM -git add aufs +git add aufs include/linux { echo "UBUNTU: ubuntu: AUFS -- update to $nsha1" echo "" -- cgit v1.2.3 From 27a157ab970f348ccf2d37cde0d2a74644d1708d Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 16 May 2011 14:50:42 +0100 Subject: UBUNTU: ubuntu: AUFS -- clean up the aufs updater and BOM Clean up the updater to record and use the real sha1 of the tip of the standalone tree as well as recording and tracking the nominal tip in the changelog for commit generation. Signed-off-by: Andy Whitcroft --- ubuntu/aufs-update | 19 ++++++++++++------- ubuntu/aufs/BOM | 3 ++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/ubuntu/aufs-update b/ubuntu/aufs-update index d1ac0f97ca0c..f50b1c806ce6 100644 --- a/ubuntu/aufs-update +++ b/ubuntu/aufs-update @@ -9,9 +9,13 @@ aufs="$1" # Get the current tip name { read x url - read x osha1 + read x o_tip_sha1 + read x o_log_sha1 } Date: Mon, 16 May 2011 15:18:37 +0100 Subject: UBUNTU: ubuntu: AUFS -- documentation on updating aufs2 Signed-off-by: Andy Whitcroft --- ubuntu/aufs-update | 5 ++- ubuntu/aufs/BOM.UPDATING | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 ubuntu/aufs/BOM.UPDATING diff --git a/ubuntu/aufs-update b/ubuntu/aufs-update index f50b1c806ce6..5b4b5a6daef0 100644 --- a/ubuntu/aufs-update +++ b/ubuntu/aufs-update @@ -1,4 +1,7 @@ #!/bin/bash +# +# NOTE: See aufs/BOM.UPDATING for instructions on how to use this updater. +# if [ "$#" -ne 1 ]; then echo "Usage: $0 " 1>&2 @@ -21,7 +24,7 @@ git rm -rf aufs cp -rp "$aufs/fs/aufs" aufs #cp -rp "$aufs/include" aufs cp -rp "$aufs/include/linux"/*.h include/linux -git checkout -f HEAD -- aufs/BOM +git checkout -f HEAD -- aufs/BOM aufs/BOM.UPDATING # Reinsert the include update. #sed -i -e '1iEXTRA_CFLAGS += -I$(src)/include' aufs/Makefile diff --git a/ubuntu/aufs/BOM.UPDATING b/ubuntu/aufs/BOM.UPDATING new file mode 100644 index 000000000000..e2975b3ddf18 --- /dev/null +++ b/ubuntu/aufs/BOM.UPDATING @@ -0,0 +1,80 @@ += How to update AUFS2 = + +This document covers the process for updating aufs2 within the Ubuntu +drivers directory. + +== Background == + +Upstream aufs2 is maintained in a git tree as below (which is _not_ used +when updating Ubuntu). This represents the primary upstream source tree: + + http://git.c3sl.ufpr.br/pub/scm/aufs/aufs2-2.6.git + +However this is on an http: transport and incredibly slow. It is advisable +to also add the gitorius mirror to your tree, fetching that before origin +to reduce the objects you need from the main tree: + + git://gitorious.org/aufs2/aufs2.git + +Ubuntu updates are actually taken from the aufs2-standalone tree, somewhat +similar to the compat-wireless tree. The version in this tree is based on +the mainline tree above but is modularisable, a key requirement for Ubuntu. +This tree is found at the git tree below (check the BOM for the definative +location): + + http://git.c3sl.ufpr.br/pub/scm/aufs/aufs2-standalone.git + +This tree contains a set of branches, one per upstream release. The +aufs2.1 branch represents the mainline tracking branch, with aufs-2.1-38 +representing the delta to mainline 2.6.38. Each branch contains a set +of patches which affect the core enabling the application of aufs2, plus +an open tree for the fs/aufs tree. The core patches tend to be static +from early in the line of an upstream release, and thus generally do not +need updating as often. + +== Updating the core patches == + +Ubuntu uses two of the core enablement patches, which are applied in the +order below. The remaining changes required to enable aufs are Ubuntu +specific and static: + + aufs2-base.patch + aufs2-standalone.patch + +In order to update these patches it is simplest to simply revert the +existing pair and apply these as two commit. This allows us to more +easily drop aufs in the future should that be required. + +As these core enablement patches are very slow in changing we normally +do not need to update these more than once per upstream mainline release +though sometimes changes do affect these. + +== Updating aufs2 == + +Once the enablement patches are updated we can simply use the aufs-update +script in the ubuntu/ directory to update the existing source. There are +regular updates to the aufs2 upstream and it is likely we will see updates +there more than once a month. + +Firstly checkout the upstream standalone tree (see the BOM for the +location) and checkout an appropriate branch for the release you are +updating. Then simply run the aufs-update command from within the ubuntu/ +drivers directory, supplying the location of the local aufs2-standalone tree +on the command line: + + cd ubuntu + sh aufs-update ../../aufs2-standalone + +This will result in an update commit containing the updated code and +also updating the BOM. Note that any local modifications are now lost. +Where those exist they should be cherry-picked and then squashed into +the update commit to ensure we retain them. + +== Ubuntu modifications == + +We carry a very small delta to the upstream aufs2 source. Any such changes +need to be cherry-picked from the history and squashed into any update +commit in order to prevent regression. Any current patches are listed below: + +Lucid/Maverick/Natty/Oneiric: + UBUNTU: ubuntu: AUFS -- suppress benign plink warning messages -- cgit v1.2.3 From 24392cccf47e93f8767a18ffc5d5d392d20186b5 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 5 Jan 2012 14:25:47 +0000 Subject: UBUNTU: ubuntu: AUFS -- aufs3-base.patch Signed-off-by: Andy Whitcroft --- fs/namei.c | 2 +- fs/splice.c | 10 +++++----- include/linux/namei.h | 1 + include/linux/splice.h | 6 ++++++ 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 7d694194024a..18c9782669f1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1864,7 +1864,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, * needs parent already locked. Doesn't follow mounts. * SMP-safe. */ -static struct dentry *lookup_hash(struct nameidata *nd) +struct dentry *lookup_hash(struct nameidata *nd) { return __lookup_hash(&nd->last, nd->path.dentry, nd); } diff --git a/fs/splice.c b/fs/splice.c index c9f1318a3b82..490239f3dc93 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1086,8 +1086,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); @@ -1114,9 +1114,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); diff --git a/include/linux/namei.h b/include/linux/namei.h index ffc02135c483..ef35a3198c94 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -85,6 +85,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, int (*open)(struct inode *, struct file *)); +extern struct dentry *lookup_hash(struct nameidata *nd); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern int follow_down_one(struct path *); diff --git a/include/linux/splice.h b/include/linux/splice.h index 26e5b613deda..3ffef2fe0fd2 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -91,4 +91,10 @@ extern void splice_shrink_spd(struct pipe_inode_info *, extern void spd_release_page(struct splice_pipe_desc *, unsigned int); extern const struct pipe_buf_operations page_cache_pipe_buf_ops; + +extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags); +extern long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); #endif -- cgit v1.2.3 From 47999db26badef577ae0eeac92bf07776cec1ac4 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 5 Jan 2012 14:28:31 +0000 Subject: UBUNTU: ubuntu: AUFS -- aufs3-standalone.patch Signed-off-by: Andy Whitcroft --- fs/file_table.c | 2 ++ fs/inode.c | 1 + fs/namei.c | 1 + fs/namespace.c | 1 + fs/notify/group.c | 3 +++ fs/notify/mark.c | 4 ++++ fs/open.c | 1 + fs/splice.c | 2 ++ security/commoncap.c | 2 ++ security/device_cgroup.c | 1 + security/security.c | 9 +++++++++ 11 files changed, 27 insertions(+) diff --git a/fs/file_table.c b/fs/file_table.c index a305d9e2d1b2..cc998cad4e83 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -441,6 +441,8 @@ void file_sb_list_del(struct file *file) } } +EXPORT_SYMBOL(file_sb_list_del); + #ifdef CONFIG_SMP /* diff --git a/fs/inode.c b/fs/inode.c index ebc1c1091680..0e99c4d8abfb 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -56,6 +56,7 @@ static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); +EXPORT_SYMBOL(inode_sb_list_lock); /* * Empty aops. Can be used for the cases where the user does not diff --git a/fs/namei.c b/fs/namei.c index 18c9782669f1..f09edf3eeef2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1868,6 +1868,7 @@ struct dentry *lookup_hash(struct nameidata *nd) { return __lookup_hash(&nd->last, nd->path.dentry, nd); } +EXPORT_SYMBOL(lookup_hash); /** * lookup_one_len - filesystem helper to lookup single pathname component diff --git a/fs/namespace.c b/fs/namespace.c index 1e4a5fe3d7b7..7db5f2b15274 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1341,6 +1341,7 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, } return 0; } +EXPORT_SYMBOL(iterate_mounts); static void cleanup_group_ids(struct mount *mnt, struct mount *end) { diff --git a/fs/notify/group.c b/fs/notify/group.c index 63fc294a4692..6f4adcab9a5b 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "fsnotify.h" @@ -70,6 +71,7 @@ void fsnotify_put_group(struct fsnotify_group *group) if (atomic_dec_and_test(&group->refcnt)) fsnotify_destroy_group(group); } +EXPORT_SYMBOL(fsnotify_put_group); /* * Create a new fsnotify_group and hold a reference for the group returned. @@ -102,3 +104,4 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } +EXPORT_SYMBOL(fsnotify_alloc_group); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index f104d565b682..54f36db45def 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -112,6 +112,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) if (atomic_dec_and_test(&mark->refcnt)) mark->free_mark(mark); } +EXPORT_SYMBOL(fsnotify_put_mark); /* * Any time a mark is getting freed we end up here. @@ -191,6 +192,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) if (unlikely(atomic_dec_and_test(&group->num_marks))) fsnotify_final_destroy_group(group); } +EXPORT_SYMBOL(fsnotify_destroy_mark); void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) { @@ -278,6 +280,7 @@ err: return ret; } +EXPORT_SYMBOL(fsnotify_add_mark); /* * clear any marks in a group in which mark->flags & flags is true @@ -333,6 +336,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } +EXPORT_SYMBOL(fsnotify_init_mark); static int fsnotify_mark_destroy(void *ignored) { diff --git a/fs/open.c b/fs/open.c index ec05458f9883..1637b209fa53 100644 --- a/fs/open.c +++ b/fs/open.c @@ -63,6 +63,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, mutex_unlock(&dentry->d_inode->i_mutex); return ret; } +EXPORT_SYMBOL(do_truncate); static long do_sys_truncate(const char __user *pathname, loff_t length) { diff --git a/fs/splice.c b/fs/splice.c index 490239f3dc93..701d34a9c8eb 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1110,6 +1110,7 @@ long do_splice_from(struct pipe_inode_info *pipe, struct file *out, return splice_write(pipe, out, ppos, len, flags); } +EXPORT_SYMBOL(do_splice_from); /* * Attempt to initiate a splice from a file to a pipe. @@ -1136,6 +1137,7 @@ long do_splice_to(struct file *in, loff_t *ppos, return splice_read(in, ppos, pipe, len, flags); } +EXPORT_SYMBOL(do_splice_to); /** * splice_direct_to_actor - splices data directly between two non-pipes diff --git a/security/commoncap.c b/security/commoncap.c index 8b431e4e5916..107919d99b08 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -996,3 +996,5 @@ int cap_mmap_file(struct file *file, unsigned long reqprot, { return 0; } + +EXPORT_SYMBOL(cap_mmap_file); diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 442204cc22d9..d7b5673b503f 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -493,6 +493,7 @@ found: return -EPERM; } +EXPORT_SYMBOL(__devcgroup_inode_permission); int devcgroup_inode_mknod(int mode, dev_t dev) { diff --git a/security/security.c b/security/security.c index 3efc9b12aef4..3fe42f6d3448 100644 --- a/security/security.c +++ b/security/security.c @@ -383,6 +383,7 @@ int security_path_rmdir(struct path *dir, struct dentry *dentry) return 0; return security_ops->path_rmdir(dir, dentry); } +EXPORT_SYMBOL(security_path_rmdir); int security_path_unlink(struct path *dir, struct dentry *dentry) { @@ -399,6 +400,7 @@ int security_path_symlink(struct path *dir, struct dentry *dentry, return 0; return security_ops->path_symlink(dir, dentry, old_name); } +EXPORT_SYMBOL(security_path_symlink); int security_path_link(struct dentry *old_dentry, struct path *new_dir, struct dentry *new_dentry) @@ -425,6 +427,7 @@ int security_path_truncate(struct path *path) return 0; return security_ops->path_truncate(path); } +EXPORT_SYMBOL(security_path_truncate); int security_path_chmod(struct path *path, umode_t mode) { @@ -432,6 +435,7 @@ int security_path_chmod(struct path *path, umode_t mode) return 0; return security_ops->path_chmod(path, mode); } +EXPORT_SYMBOL(security_path_chmod); int security_path_chown(struct path *path, uid_t uid, gid_t gid) { @@ -439,6 +443,7 @@ int security_path_chown(struct path *path, uid_t uid, gid_t gid) return 0; return security_ops->path_chown(path, uid, gid); } +EXPORT_SYMBOL(security_path_chown); int security_path_chroot(struct path *path) { @@ -515,6 +520,7 @@ int security_inode_readlink(struct dentry *dentry) return 0; return security_ops->inode_readlink(dentry); } +EXPORT_SYMBOL(security_inode_readlink); int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd) { @@ -529,6 +535,7 @@ int security_inode_permission(struct inode *inode, int mask) return 0; return security_ops->inode_permission(inode, mask); } +EXPORT_SYMBOL(security_inode_permission); int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { @@ -644,6 +651,7 @@ int security_file_permission(struct file *file, int mask) return fsnotify_perm(file, mask); } +EXPORT_SYMBOL(security_file_permission); int security_file_alloc(struct file *file) { @@ -704,6 +712,7 @@ int security_mmap_file(struct file *file, unsigned long prot, return ret; return ima_file_mmap(file, prot); } +EXPORT_SYMBOL(security_mmap_file); int security_mmap_addr(unsigned long addr) { -- cgit v1.2.3 From d7d09de9f23de3475c7748d9fdf10aeecaf225da Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 5 Jan 2012 15:32:44 +0000 Subject: UBUNTU: ubuntu: AUFS -- fix undefined __devcgroup_inode_permission Fix: ERROR: "__devcgroup_inode_permission" [ubuntu/aufs/aufs.ko] undefined! Signed-off-by: Andy Whitcroft --- security/device_cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/device_cgroup.c b/security/device_cgroup.c index d7b5673b503f..74aa1f9ac7ad 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -13,6 +13,7 @@ #include #include #include +#include #define ACC_MKNOD 1 #define ACC_READ 2 -- cgit v1.2.3 From 71d61644a7a55a720d9e02fda6f21d23d1b9176c Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 5 Jan 2012 15:38:31 +0000 Subject: UBUNTU: ubuntu: AUFS -- fix undefined security_path_link Fix: ERROR: "security_path_link" [ubuntu/aufs/aufs.ko] undefined! Signed-off-by: Andy Whitcroft --- security/security.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/security.c b/security/security.c index 3fe42f6d3448..c7afd35aa2b0 100644 --- a/security/security.c +++ b/security/security.c @@ -409,6 +409,7 @@ int security_path_link(struct dentry *old_dentry, struct path *new_dir, return 0; return security_ops->path_link(old_dentry, new_dir, new_dentry); } +EXPORT_SYMBOL(security_path_link); int security_path_rename(struct path *old_dir, struct dentry *old_dentry, struct path *new_dir, struct dentry *new_dentry) -- cgit v1.2.3 From e16a89c3276db3f74a2ac99353468df716625fb5 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 5 Jan 2012 14:06:50 +0000 Subject: UBUNTU: ubuntu: AUFS -- update to 4cf5db36bcd9748e8e7270022f295f84d1fc2245 Signed-off-by: Andy Whitcroft --- ubuntu/aufs/BOM | 2 +- ubuntu/aufs/Kconfig | 203 +++++ ubuntu/aufs/Makefile | 38 + ubuntu/aufs/aufs.h | 60 ++ ubuntu/aufs/branch.c | 1170 ++++++++++++++++++++++++++ ubuntu/aufs/branch.h | 232 ++++++ ubuntu/aufs/conf.mk | 38 + ubuntu/aufs/cpup.c | 1081 ++++++++++++++++++++++++ ubuntu/aufs/cpup.h | 83 ++ ubuntu/aufs/dbgaufs.c | 334 ++++++++ ubuntu/aufs/dbgaufs.h | 52 ++ ubuntu/aufs/dcsub.c | 243 ++++++ ubuntu/aufs/dcsub.h | 95 +++ ubuntu/aufs/debug.c | 490 +++++++++++ ubuntu/aufs/debug.h | 252 ++++++ ubuntu/aufs/dentry.c | 1140 ++++++++++++++++++++++++++ ubuntu/aufs/dentry.h | 238 ++++++ ubuntu/aufs/dinfo.c | 543 ++++++++++++ ubuntu/aufs/dir.c | 635 ++++++++++++++ ubuntu/aufs/dir.h | 138 ++++ ubuntu/aufs/dynop.c | 377 +++++++++ ubuntu/aufs/dynop.h | 80 ++ ubuntu/aufs/export.c | 805 ++++++++++++++++++ ubuntu/aufs/f_op.c | 731 +++++++++++++++++ ubuntu/aufs/f_op_sp.c | 299 +++++++ ubuntu/aufs/file.c | 676 +++++++++++++++ ubuntu/aufs/file.h | 299 +++++++ ubuntu/aufs/finfo.c | 157 ++++ ubuntu/aufs/fstype.h | 497 +++++++++++ ubuntu/aufs/hfsnotify.c | 247 ++++++ ubuntu/aufs/hfsplus.c | 58 ++ ubuntu/aufs/hnotify.c | 712 ++++++++++++++++ ubuntu/aufs/i_op.c | 994 ++++++++++++++++++++++ ubuntu/aufs/i_op_add.c | 711 ++++++++++++++++ ubuntu/aufs/i_op_del.c | 478 +++++++++++ ubuntu/aufs/i_op_ren.c | 1017 +++++++++++++++++++++++ ubuntu/aufs/iinfo.c | 264 ++++++ ubuntu/aufs/inode.c | 471 +++++++++++ ubuntu/aufs/inode.h | 556 +++++++++++++ ubuntu/aufs/ioctl.c | 197 +++++ ubuntu/aufs/loop.c | 133 +++ ubuntu/aufs/loop.h | 50 ++ ubuntu/aufs/magic.mk | 54 ++ ubuntu/aufs/module.c | 195 +++++ ubuntu/aufs/module.h | 107 +++ ubuntu/aufs/opts.c | 1679 ++++++++++++++++++++++++++++++++++++++ ubuntu/aufs/opts.h | 210 +++++ ubuntu/aufs/plink.c | 515 ++++++++++++ ubuntu/aufs/poll.c | 56 ++ ubuntu/aufs/procfs.c | 170 ++++ ubuntu/aufs/rdu.c | 385 +++++++++ ubuntu/aufs/rwsem.h | 189 +++++ ubuntu/aufs/sbinfo.c | 344 ++++++++ ubuntu/aufs/spl.h | 66 ++ ubuntu/aufs/super.c | 939 +++++++++++++++++++++ ubuntu/aufs/super.h | 547 +++++++++++++ ubuntu/aufs/sysaufs.c | 107 +++ ubuntu/aufs/sysaufs.h | 105 +++ ubuntu/aufs/sysfs.c | 260 ++++++ ubuntu/aufs/sysrq.c | 151 ++++ ubuntu/aufs/vdir.c | 886 ++++++++++++++++++++ ubuntu/aufs/vfsub.c | 837 +++++++++++++++++++ ubuntu/aufs/vfsub.h | 232 ++++++ ubuntu/aufs/wbr_policy.c | 700 ++++++++++++++++ ubuntu/aufs/whout.c | 1050 ++++++++++++++++++++++++ ubuntu/aufs/whout.h | 89 ++ ubuntu/aufs/wkq.c | 214 +++++ ubuntu/aufs/wkq.h | 96 +++ ubuntu/aufs/xino.c | 1266 ++++++++++++++++++++++++++++ ubuntu/include/linux/aufs_type.h | 224 +++++ 70 files changed, 28548 insertions(+), 1 deletion(-) create mode 100644 ubuntu/aufs/Kconfig create mode 100644 ubuntu/aufs/Makefile create mode 100644 ubuntu/aufs/aufs.h create mode 100644 ubuntu/aufs/branch.c create mode 100644 ubuntu/aufs/branch.h create mode 100644 ubuntu/aufs/conf.mk create mode 100644 ubuntu/aufs/cpup.c create mode 100644 ubuntu/aufs/cpup.h create mode 100644 ubuntu/aufs/dbgaufs.c create mode 100644 ubuntu/aufs/dbgaufs.h create mode 100644 ubuntu/aufs/dcsub.c create mode 100644 ubuntu/aufs/dcsub.h create mode 100644 ubuntu/aufs/debug.c create mode 100644 ubuntu/aufs/debug.h create mode 100644 ubuntu/aufs/dentry.c create mode 100644 ubuntu/aufs/dentry.h create mode 100644 ubuntu/aufs/dinfo.c create mode 100644 ubuntu/aufs/dir.c create mode 100644 ubuntu/aufs/dir.h create mode 100644 ubuntu/aufs/dynop.c create mode 100644 ubuntu/aufs/dynop.h create mode 100644 ubuntu/aufs/export.c create mode 100644 ubuntu/aufs/f_op.c create mode 100644 ubuntu/aufs/f_op_sp.c create mode 100644 ubuntu/aufs/file.c create mode 100644 ubuntu/aufs/file.h create mode 100644 ubuntu/aufs/finfo.c create mode 100644 ubuntu/aufs/fstype.h create mode 100644 ubuntu/aufs/hfsnotify.c create mode 100644 ubuntu/aufs/hfsplus.c create mode 100644 ubuntu/aufs/hnotify.c create mode 100644 ubuntu/aufs/i_op.c create mode 100644 ubuntu/aufs/i_op_add.c create mode 100644 ubuntu/aufs/i_op_del.c create mode 100644 ubuntu/aufs/i_op_ren.c create mode 100644 ubuntu/aufs/iinfo.c create mode 100644 ubuntu/aufs/inode.c create mode 100644 ubuntu/aufs/inode.h create mode 100644 ubuntu/aufs/ioctl.c create mode 100644 ubuntu/aufs/loop.c create mode 100644 ubuntu/aufs/loop.h create mode 100644 ubuntu/aufs/magic.mk create mode 100644 ubuntu/aufs/module.c create mode 100644 ubuntu/aufs/module.h create mode 100644 ubuntu/aufs/opts.c create mode 100644 ubuntu/aufs/opts.h create mode 100644 ubuntu/aufs/plink.c create mode 100644 ubuntu/aufs/poll.c create mode 100644 ubuntu/aufs/procfs.c create mode 100644 ubuntu/aufs/rdu.c create mode 100644 ubuntu/aufs/rwsem.h create mode 100644 ubuntu/aufs/sbinfo.c create mode 100644 ubuntu/aufs/spl.h create mode 100644 ubuntu/aufs/super.c create mode 100644 ubuntu/aufs/super.h create mode 100644 ubuntu/aufs/sysaufs.c create mode 100644 ubuntu/aufs/sysaufs.h create mode 100644 ubuntu/aufs/sysfs.c create mode 100644 ubuntu/aufs/sysrq.c create mode 100644 ubuntu/aufs/vdir.c create mode 100644 ubuntu/aufs/vfsub.c create mode 100644 ubuntu/aufs/vfsub.h create mode 100644 ubuntu/aufs/wbr_policy.c create mode 100644 ubuntu/aufs/whout.c create mode 100644 ubuntu/aufs/whout.h create mode 100644 ubuntu/aufs/wkq.c create mode 100644 ubuntu/aufs/wkq.h create mode 100644 ubuntu/aufs/xino.c create mode 100644 ubuntu/include/linux/aufs_type.h diff --git a/ubuntu/aufs/BOM b/ubuntu/aufs/BOM index 79443f7d8873..04dc856855fb 100644 --- a/ubuntu/aufs/BOM +++ b/ubuntu/aufs/BOM @@ -1,3 +1,3 @@ URL: git://aufs.git.sourceforge.net/gitroot/aufs/aufs3-standalone.git CHANGELOG: -COMMIT: a9be01e5e9688018ebe9ef46ec5414bb356bc556 +COMMIT: 4cf5db36bcd9748e8e7270022f295f84d1fc2245 diff --git a/ubuntu/aufs/Kconfig b/ubuntu/aufs/Kconfig new file mode 100644 index 000000000000..7a96cb5c26ba --- /dev/null +++ b/ubuntu/aufs/Kconfig @@ -0,0 +1,203 @@ +config AUFS_FS + tristate "Aufs (Advanced multi layered unification filesystem) support" + depends on EXPERIMENTAL + help + Aufs is a stackable unification filesystem such as Unionfs, + which unifies several directories and provides a merged single + directory. + In the early days, aufs was entirely re-designed and + re-implemented Unionfs Version 1.x series. Introducing many + original ideas, approaches and improvements, it becomes totally + different from Unionfs while keeping the basic features. + +if AUFS_FS +choice + prompt "Maximum number of branches" + default AUFS_BRANCH_MAX_127 + help + Specifies the maximum number of branches (or member directories) + in a single aufs. The larger value consumes more system + resources and has a minor impact to performance. +config AUFS_BRANCH_MAX_127 + bool "127" + help + Specifies the maximum number of branches (or member directories) + in a single aufs. The larger value consumes more system + resources and has a minor impact to performance. +config AUFS_BRANCH_MAX_511 + bool "511" + help + Specifies the maximum number of branches (or member directories) + in a single aufs. The larger value consumes more system + resources and has a minor impact to performance. +config AUFS_BRANCH_MAX_1023 + bool "1023" + help + Specifies the maximum number of branches (or member directories) + in a single aufs. The larger value consumes more system + resources and has a minor impact to performance. +config AUFS_BRANCH_MAX_32767 + bool "32767" + help + Specifies the maximum number of branches (or member directories) + in a single aufs. The larger value consumes more system + resources and has a minor impact to performance. +endchoice + +config AUFS_SBILIST + bool + depends on AUFS_MAGIC_SYSRQ || PROC_FS + default y + help + Automatic configuration for internal use. + When aufs supports Magic SysRq or /proc, enabled automatically. + +config AUFS_HNOTIFY + bool "Detect direct branch access (bypassing aufs)" + help + If you want to modify files on branches directly, eg. bypassing aufs, + and want aufs to detect the changes of them fully, then enable this + option and use 'udba=notify' mount option. + Currently there is only one available configuration, "fsnotify". + It will have a negative impact to the performance. + See detail in aufs.5. + +choice + prompt "method" if AUFS_HNOTIFY + default AUFS_HFSNOTIFY +config AUFS_HFSNOTIFY + bool "fsnotify" + select FSNOTIFY +endchoice + +config AUFS_EXPORT + bool "NFS-exportable aufs" + depends on EXPORTFS + help + If you want to export your mounted aufs via NFS, then enable this + option. There are several requirements for this configuration. + See detail in aufs.5. + +config AUFS_INO_T_64 + bool + depends on AUFS_EXPORT + depends on 64BIT && !(ALPHA || S390) + default y + help + Automatic configuration for internal use. + /* typedef unsigned long/int __kernel_ino_t */ + /* alpha and s390x are int */ + +config AUFS_RDU + bool "Readdir in userspace" + help + Aufs has two methods to provide a merged view for a directory, + by a user-space library and by kernel-space natively. The latter + is always enabled but sometimes large and slow. + If you enable this option, install the library in aufs2-util + package, and set some environment variables for your readdir(3), + then the work will be handled in user-space which generally + shows better performance in most cases. + See detail in aufs.5. + +config AUFS_PROC_MAP + bool "support for /proc/maps and lsof(1)" + depends on PROC_FS + help + When you issue mmap(2) in aufs, it is actually a direct mmap(2) + call to the file on the branch fs since the file in aufs is + purely virtual. And the file path printed in /proc/maps (and + others) will be the path on the branch fs. In most cases, it + does no harm. But some utilities like lsof(1) may confuse since + the utility or user may expect the file path in aufs to be + printed. + To address this issue, aufs provides a patch which introduces a + new member called vm_prfile into struct vm_are_struct. The patch + is meaningless without enabling this configuration since nobody + sets the new vm_prfile member. + If you don't apply the patch, then enabling this configuration + will cause a compile error. + This approach is fragile since if someone else make some changes + around vm_file, then vm_prfile may not work anymore. As a + workaround such case, aufs provides this configuration. If you + disable it, then lsof(1) may produce incorrect result but the + problem will be gone even if the aufs patch is applied (I hope). + +config AUFS_SP_IATTR + bool "Respect the attributes (mtime/ctime mainly) of special files" + help + When you write something to a special file, some attributes of it + (mtime/ctime mainly) may be updated. Generally such updates are + less important (actually some device drivers and NFS ignore + it). But some applications (such like test program) requires + such updates. If you need these updates, then enable this + configuration which introduces some overhead. + Currently this configuration handles FIFO only. + +config AUFS_SHWH + bool "Show whiteouts" + help + If you want to make the whiteouts in aufs visible, then enable + this option and specify 'shwh' mount option. Although it may + sounds like philosophy or something, but in technically it + simply shows the name of whiteout with keeping its behaviour. + +config AUFS_BR_RAMFS + bool "Ramfs (initramfs/rootfs) as an aufs branch" + help + If you want to use ramfs as an aufs branch fs, then enable this + option. Generally tmpfs is recommended. + Aufs prohibited them to be a branch fs by default, because + initramfs becomes unusable after switch_root or something + generally. If you sets initramfs as an aufs branch and boot your + system by switch_root, you will meet a problem easily since the + files in initramfs may be inaccessible. + Unless you are going to use ramfs as an aufs branch fs without + switch_root or something, leave it N. + +config AUFS_BR_FUSE + bool "Fuse fs as an aufs branch" + depends on FUSE_FS + select AUFS_POLL + help + If you want to use fuse-based userspace filesystem as an aufs + branch fs, then enable this option. + It implements the internal poll(2) operation which is + implemented by fuse only (curretnly). + +config AUFS_POLL + bool + help + Automatic configuration for internal use. + +config AUFS_BR_HFSPLUS + bool "Hfsplus as an aufs branch" + depends on HFSPLUS_FS + default y + help + If you want to use hfsplus fs as an aufs branch fs, then enable + this option. This option introduces a small overhead at + copying-up a file on hfsplus. + +config AUFS_BDEV_LOOP + bool + depends on BLK_DEV_LOOP + default y + help + Automatic configuration for internal use. + Convert =[ym] into =y. + +config AUFS_DEBUG + bool "Debug aufs" + help + Enable this to compile aufs internal debug code. + It will have a negative impact to the performance. + +config AUFS_MAGIC_SYSRQ + bool + depends on AUFS_DEBUG && MAGIC_SYSRQ + default y + help + Automatic configuration for internal use. + When aufs supports Magic SysRq, enabled automatically. +endif diff --git a/ubuntu/aufs/Makefile b/ubuntu/aufs/Makefile new file mode 100644 index 000000000000..46615fe113e6 --- /dev/null +++ b/ubuntu/aufs/Makefile @@ -0,0 +1,38 @@ + +include ${src}/magic.mk +ifeq (${CONFIG_AUFS_FS},m) +include ${src}/conf.mk +endif +-include ${src}/priv_def.mk + +# cf. include/linux/kernel.h +# enable pr_debug +ccflags-y += -DDEBUG +# sparse doesn't allow spaces +ccflags-y += -D'pr_fmt(fmt)=AUFS_NAME"\040%s:%d:%s[%d]:\040"fmt,__func__,__LINE__,current->comm,current->pid' + +obj-$(CONFIG_AUFS_FS) += aufs.o +aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \ + wkq.o vfsub.o dcsub.o \ + cpup.o whout.o wbr_policy.o \ + dinfo.o dentry.o \ + dynop.o \ + finfo.o file.o f_op.o \ + dir.o vdir.o \ + iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \ + ioctl.o + +# all are boolean +aufs-$(CONFIG_PROC_FS) += procfs.o plink.o +aufs-$(CONFIG_SYSFS) += sysfs.o +aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o +aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o +aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o +aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o +aufs-$(CONFIG_AUFS_EXPORT) += export.o +aufs-$(CONFIG_AUFS_POLL) += poll.o +aufs-$(CONFIG_AUFS_RDU) += rdu.o +aufs-$(CONFIG_AUFS_SP_IATTR) += f_op_sp.o +aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o +aufs-$(CONFIG_AUFS_DEBUG) += debug.o +aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o diff --git a/ubuntu/aufs/aufs.h b/ubuntu/aufs/aufs.h new file mode 100644 index 000000000000..3fd15c26ff41 --- /dev/null +++ b/ubuntu/aufs/aufs.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * all header files + */ + +#ifndef __AUFS_H__ +#define __AUFS_H__ + +#ifdef __KERNEL__ + +#define AuStub(type, name, body, ...) \ + static inline type name(__VA_ARGS__) { body; } + +#define AuStubVoid(name, ...) \ + AuStub(void, name, , __VA_ARGS__) +#define AuStubInt0(name, ...) \ + AuStub(int, name, return 0, __VA_ARGS__) + +#include "debug.h" + +#include "branch.h" +#include "cpup.h" +#include "dcsub.h" +#include "dbgaufs.h" +#include "dentry.h" +#include "dir.h" +#include "dynop.h" +#include "file.h" +#include "fstype.h" +#include "inode.h" +#include "loop.h" +#include "module.h" +#include "opts.h" +#include "rwsem.h" +#include "spl.h" +#include "super.h" +#include "sysaufs.h" +#include "vfsub.h" +#include "whout.h" +#include "wkq.h" + +#endif /* __KERNEL__ */ +#endif /* __AUFS_H__ */ diff --git a/ubuntu/aufs/branch.c b/ubuntu/aufs/branch.c new file mode 100644 index 000000000000..dccc7409931d --- /dev/null +++ b/ubuntu/aufs/branch.c @@ -0,0 +1,1170 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * branch management + */ + +#include +#include +#include +#include "aufs.h" + +/* + * free a single branch + */ +static void au_br_do_free(struct au_branch *br) +{ + int i; + struct au_wbr *wbr; + struct au_dykey **key; + + au_hnotify_fin_br(br); + + if (br->br_xino.xi_file) + fput(br->br_xino.xi_file); + mutex_destroy(&br->br_xino.xi_nondir_mtx); + + AuDebugOn(atomic_read(&br->br_count)); + + wbr = br->br_wbr; + if (wbr) { + for (i = 0; i < AuBrWh_Last; i++) + dput(wbr->wbr_wh[i]); + AuDebugOn(atomic_read(&wbr->wbr_wh_running)); + AuRwDestroy(&wbr->wbr_wh_rwsem); + } + + key = br->br_dykey; + for (i = 0; i < AuBrDynOp; i++, key++) + if (*key) + au_dy_put(*key); + else + break; + + mntput(br->br_mnt); + kfree(wbr); + kfree(br); +} + +/* + * frees all branches + */ +void au_br_free(struct au_sbinfo *sbinfo) +{ + aufs_bindex_t bmax; + struct au_branch **br; + + AuRwMustWriteLock(&sbinfo->si_rwsem); + + bmax = sbinfo->si_bend + 1; + br = sbinfo->si_branch; + while (bmax--) + au_br_do_free(*br++); +} + +/* + * find the index of a branch which is specified by @br_id. + */ +int au_br_index(struct super_block *sb, aufs_bindex_t br_id) +{ + aufs_bindex_t bindex, bend; + + bend = au_sbend(sb); + for (bindex = 0; bindex <= bend; bindex++) + if (au_sbr_id(sb, bindex) == br_id) + return bindex; + return -1; +} + +/* ---------------------------------------------------------------------- */ + +/* + * add a branch + */ + +static int test_overlap(struct super_block *sb, struct dentry *h_adding, + struct dentry *h_root) +{ + if (unlikely(h_adding == h_root + || au_test_loopback_overlap(sb, h_adding))) + return 1; + if (h_adding->d_sb != h_root->d_sb) + return 0; + return au_test_subdir(h_adding, h_root) + || au_test_subdir(h_root, h_adding); +} + +/* + * returns a newly allocated branch. @new_nbranch is a number of branches + * after adding a branch. + */ +static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch, + int perm) +{ + struct au_branch *add_branch; + struct dentry *root; + int err; + + err = -ENOMEM; + root = sb->s_root; + add_branch = kmalloc(sizeof(*add_branch), GFP_NOFS); + if (unlikely(!add_branch)) + goto out; + + err = au_hnotify_init_br(add_branch, perm); + if (unlikely(err)) + goto out_br; + + add_branch->br_wbr = NULL; + if (au_br_writable(perm)) { + /* may be freed separately at changing the branch permission */ + add_branch->br_wbr = kmalloc(sizeof(*add_branch->br_wbr), + GFP_NOFS); + if (unlikely(!add_branch->br_wbr)) + goto out_hnotify; + } + + err = au_sbr_realloc(au_sbi(sb), new_nbranch); + if (!err) + err = au_di_realloc(au_di(root), new_nbranch); + if (!err) + err = au_ii_realloc(au_ii(root->d_inode), new_nbranch); + if (!err) + return add_branch; /* success */ + + kfree(add_branch->br_wbr); + +out_hnotify: + au_hnotify_fin_br(add_branch); +out_br: + kfree(add_branch); +out: + return ERR_PTR(err); +} + +/* + * test if the branch permission is legal or not. + */ +static int test_br(struct inode *inode, int brperm, char *path) +{ + int err; + + err = (au_br_writable(brperm) && IS_RDONLY(inode)); + if (!err) + goto out; + + err = -EINVAL; + pr_err("write permission for readonly mount or inode, %s\n", path); + +out: + return err; +} + +/* + * returns: + * 0: success, the caller will add it + * plus: success, it is already unified, the caller should ignore it + * minus: error + */ +static int test_add(struct super_block *sb, struct au_opt_add *add, int remount) +{ + int err; + aufs_bindex_t bend, bindex; + struct dentry *root; + struct inode *inode, *h_inode; + + root = sb->s_root; + bend = au_sbend(sb); + if (unlikely(bend >= 0 + && au_find_dbindex(root, add->path.dentry) >= 0)) { + err = 1; + if (!remount) { + err = -EINVAL; + pr_err("%s duplicated\n", add->pathname); + } + goto out; + } + + err = -ENOSPC; /* -E2BIG; */ + if (unlikely(AUFS_BRANCH_MAX <= add->bindex + || AUFS_BRANCH_MAX - 1 <= bend)) { + pr_err("number of branches exceeded %s\n", add->pathname); + goto out; + } + + err = -EDOM; + if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) { + pr_err("bad index %d\n", add->bindex); + goto out; + } + + inode = add->path.dentry->d_inode; + err = -ENOENT; + if (unlikely(!inode->i_nlink)) { + pr_err("no existence %s\n", add->pathname); + goto out; + } + + err = -EINVAL; + if (unlikely(inode->i_sb == sb)) { + pr_err("%s must be outside\n", add->pathname); + goto out; + } + + if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) { + pr_err("unsupported filesystem, %s (%s)\n", + add->pathname, au_sbtype(inode->i_sb)); + goto out; + } + + err = test_br(add->path.dentry->d_inode, add->perm, add->pathname); + if (unlikely(err)) + goto out; + + if (bend < 0) + return 0; /* success */ + + err = -EINVAL; + for (bindex = 0; bindex <= bend; bindex++) + if (unlikely(test_overlap(sb, add->path.dentry, + au_h_dptr(root, bindex)))) { + pr_err("%s is overlapped\n", add->pathname); + goto out; + } + + err = 0; + if (au_opt_test(au_mntflags(sb), WARN_PERM)) { + h_inode = au_h_dptr(root, 0)->d_inode; + if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO) + || h_inode->i_uid != inode->i_uid + || h_inode->i_gid != inode->i_gid) + pr_warning("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n", + add->pathname, + inode->i_uid, inode->i_gid, + (inode->i_mode & S_IALLUGO), + h_inode->i_uid, h_inode->i_gid, + (h_inode->i_mode & S_IALLUGO)); + } + +out: + return err; +} + +/* + * initialize or clean the whiteouts for an adding branch + */ +static int au_br_init_wh(struct super_block *sb, struct au_branch *br, + int new_perm, struct dentry *h_root) +{ + int err, old_perm; + aufs_bindex_t bindex; + struct mutex *h_mtx; + struct au_wbr *wbr; + struct au_hinode *hdir; + + wbr = br->br_wbr; + old_perm = br->br_perm; + br->br_perm = new_perm; + hdir = NULL; + h_mtx = NULL; + bindex = au_br_index(sb, br->br_id); + if (0 <= bindex) { + hdir = au_hi(sb->s_root->d_inode, bindex); + au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); + } else { + h_mtx = &h_root->d_inode->i_mutex; + mutex_lock_nested(h_mtx, AuLsc_I_PARENT); + } + if (!wbr) + err = au_wh_init(h_root, br, sb); + else { + wbr_wh_write_lock(wbr); + err = au_wh_init(h_root, br, sb); + wbr_wh_write_unlock(wbr); + } + if (hdir) + au_hn_imtx_unlock(hdir); + else + mutex_unlock(h_mtx); + br->br_perm = old_perm; + + if (!err && wbr && !au_br_writable(new_perm)) { + kfree(wbr); + br->br_wbr = NULL; + } + + return err; +} + +static int au_wbr_init(struct au_branch *br, struct super_block *sb, + int perm, struct path *path) +{ + int err; + struct kstatfs kst; + struct au_wbr *wbr; + struct dentry *h_dentry; + + wbr = br->br_wbr; + au_rw_init(&wbr->wbr_wh_rwsem); + memset(wbr->wbr_wh, 0, sizeof(wbr->wbr_wh)); + atomic_set(&wbr->wbr_wh_running, 0); + wbr->wbr_bytes = 0; + + /* + * a limit for rmdir/rename a dir + * cf. AUFS_MAX_NAMELEN in include/linux/aufs_type.h + */ + err = vfs_statfs(path, &kst); + if (unlikely(err)) + goto out; + err = -EINVAL; + h_dentry = path->dentry; + if (kst.f_namelen >= NAME_MAX) + err = au_br_init_wh(sb, br, perm, h_dentry); + else + pr_err("%.*s(%s), unsupported namelen %ld\n", + AuDLNPair(h_dentry), au_sbtype(h_dentry->d_sb), + kst.f_namelen); + +out: + return err; +} + +/* intialize a new branch */ +static int au_br_init(struct au_branch *br, struct super_block *sb, + struct au_opt_add *add) +{ + int err; + + err = 0; + memset(&br->br_xino, 0, sizeof(br->br_xino)); + mutex_init(&br->br_xino.xi_nondir_mtx); + br->br_perm = add->perm; + br->br_mnt = add->path.mnt; /* set first, mntget() later */ + spin_lock_init(&br->br_dykey_lock); + memset(br->br_dykey, 0, sizeof(br->br_dykey)); + atomic_set(&br->br_count, 0); + br->br_xino_upper = AUFS_XINO_TRUNC_INIT; + atomic_set(&br->br_xino_running, 0); + br->br_id = au_new_br_id(sb); + AuDebugOn(br->br_id < 0); + + if (au_br_writable(add->perm)) { + err = au_wbr_init(br, sb, add->perm, &add->path); + if (unlikely(err)) + goto out_err; + } + + if (au_opt_test(au_mntflags(sb), XINO)) { + err = au_xino_br(sb, br, add->path.dentry->d_inode->i_ino, + au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1); + if (unlikely(err)) { + AuDebugOn(br->br_xino.xi_file); + goto out_err; + } + } + + sysaufs_br_init(br); + mntget(add->path.mnt); + goto out; /* success */ + +out_err: + br->br_mnt = NULL; +out: + return err; +} + +static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex, + struct au_branch *br, aufs_bindex_t bend, + aufs_bindex_t amount) +{ + struct au_branch **brp; + + AuRwMustWriteLock(&sbinfo->si_rwsem); + + brp = sbinfo->si_branch + bindex; + memmove(brp + 1, brp, sizeof(*brp) * amount); + *brp = br; + sbinfo->si_bend++; + if (unlikely(bend < 0)) + sbinfo->si_bend = 0; +} + +static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex, + aufs_bindex_t bend, aufs_bindex_t amount) +{ + struct au_hdentry *hdp; + + AuRwMustWriteLock(&dinfo->di_rwsem); + + hdp = dinfo->di_hdentry + bindex; + memmove(hdp + 1, hdp, sizeof(*hdp) * amount); + au_h_dentry_init(hdp); + dinfo->di_bend++; + if (unlikely(bend < 0)) + dinfo->di_bstart = 0; +} + +static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex, + aufs_bindex_t bend, aufs_bindex_t amount) +{ + struct au_hinode *hip; + + AuRwMustWriteLock(&iinfo->ii_rwsem); + + hip = iinfo->ii_hinode + bindex; + memmove(hip + 1, hip, sizeof(*hip) * amount); + hip->hi_inode = NULL; + au_hn_init(hip); + iinfo->ii_bend++; + if (unlikely(bend < 0)) + iinfo->ii_bstart = 0; +} + +static void au_br_do_add(struct super_block *sb, struct dentry *h_dentry, + struct au_branch *br, aufs_bindex_t bindex) +{ + struct dentry *root; + struct inode *root_inode; + aufs_bindex_t bend, amount; + + root = sb->s_root; + root_inode = root->d_inode; + bend = au_sbend(sb); + amount = bend + 1 - bindex; + au_sbilist_lock(); + au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount); + au_br_do_add_hdp(au_di(root), bindex, bend, amount); + au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount); + au_set_h_dptr(root, bindex, dget(h_dentry)); + au_set_h_iptr(root_inode, bindex, au_igrab(h_dentry->d_inode), + /*flags*/0); + au_sbilist_unlock(); +} + +int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount) +{ + int err; + aufs_bindex_t bend, add_bindex; + struct dentry *root, *h_dentry; + struct inode *root_inode; + struct au_branch *add_branch; + + root = sb->s_root; + root_inode = root->d_inode; + IMustLock(root_inode); + err = test_add(sb, add, remount); + if (unlikely(err < 0)) + goto out; + if (err) { + err = 0; + goto out; /* success */ + } + + bend = au_sbend(sb); + add_branch = au_br_alloc(sb, bend + 2, add->perm); + err = PTR_ERR(add_branch); + if (IS_ERR(add_branch)) + goto out; + + err = au_br_init(add_branch, sb, add); + if (unlikely(err)) { + au_br_do_free(add_branch); + goto out; + } + + add_bindex = add->bindex; + h_dentry = add->path.dentry; + if (!remount) + au_br_do_add(sb, h_dentry, add_branch, add_bindex); + else { + sysaufs_brs_del(sb, add_bindex); + au_br_do_add(sb, h_dentry, add_branch, add_bindex); + sysaufs_brs_add(sb, add_bindex); + } + + if (!add_bindex) { + au_cpup_attr_all(root_inode, /*force*/1); + sb->s_maxbytes = h_dentry->d_sb->s_maxbytes; + } else + au_add_nlink(root_inode, h_dentry->d_inode); + + /* + * this test/set prevents aufs from handling unnecesary notify events + * of xino files, in case of re-adding a writable branch which was + * once detached from aufs. + */ + if (au_xino_brid(sb) < 0 + && au_br_writable(add_branch->br_perm) + && !au_test_fs_bad_xino(h_dentry->d_sb) + && add_branch->br_xino.xi_file + && add_branch->br_xino.xi_file->f_dentry->d_parent == h_dentry) + au_xino_brid_set(sb, add_branch->br_id); + +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * delete a branch + */ + +/* to show the line number, do not make it inlined function */ +#define AuVerbose(do_info, fmt, ...) do { \ + if (do_info) \ + pr_info(fmt, ##__VA_ARGS__); \ +} while (0) + +static int au_test_ibusy(struct inode *inode, aufs_bindex_t bstart, + aufs_bindex_t bend) +{ + return (inode && !S_ISDIR(inode->i_mode)) || bstart == bend; +} + +static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t bstart, + aufs_bindex_t bend) +{ + return au_test_ibusy(dentry->d_inode, bstart, bend); +} + +/* + * test if the branch is deletable or not. + */ +static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex, + unsigned int sigen, const unsigned int verbose) +{ + int err, i, j, ndentry; + aufs_bindex_t bstart, bend; + struct au_dcsub_pages dpages; + struct au_dpage *dpage; + struct dentry *d; + + err = au_dpages_init(&dpages, GFP_NOFS); + if (unlikely(err)) + goto out; + err = au_dcsub_pages(&dpages, root, NULL, NULL); + if (unlikely(err)) + goto out_dpages; + + for (i = 0; !err && i < dpages.ndpage; i++) { + dpage = dpages.dpages + i; + ndentry = dpage->ndentry; + for (j = 0; !err && j < ndentry; j++) { + d = dpage->dentries[j]; + AuDebugOn(!d->d_count); + if (!au_digen_test(d, sigen)) { + di_read_lock_child(d, AuLock_IR); + if (unlikely(au_dbrange_test(d))) { + di_read_unlock(d, AuLock_IR); + continue; + } + } else { + di_write_lock_child(d); + if (unlikely(au_dbrange_test(d))) { + di_write_unlock(d); + continue; + } + err = au_reval_dpath(d, sigen); + if (!err) + di_downgrade_lock(d, AuLock_IR); + else { + di_write_unlock(d); + break; + } + } + + /* AuDbgDentry(d); */ + bstart = au_dbstart(d); + bend = au_dbend(d); + if (bstart <= bindex + && bindex <= bend + && au_h_dptr(d, bindex) + && au_test_dbusy(d, bstart, bend)) { + err = -EBUSY; + AuVerbose(verbose, "busy %.*s\n", AuDLNPair(d)); + AuDbgDentry(d); + } + di_read_unlock(d, AuLock_IR); + } + } + +out_dpages: + au_dpages_free(&dpages); +out: + return err; +} + +static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex, + unsigned int sigen, const unsigned int verbose) +{ + int err; + unsigned long long max, ull; + struct inode *i, **array; + aufs_bindex_t bstart, bend; + + array = au_iarray_alloc(sb, &max); + err = PTR_ERR(array); + if (IS_ERR(array)) + goto out; + + err = 0; + AuDbg("b%d\n", bindex); + for (ull = 0; !err && ull < max; ull++) { + i = array[ull]; + if (i->i_ino == AUFS_ROOT_INO) + continue; + + /* AuDbgInode(i); */ + if (au_iigen(i) == sigen) + ii_read_lock_child(i); + else { + ii_write_lock_child(i); + err = au_refresh_hinode_self(i); + au_iigen_dec(i); + if (!err) + ii_downgrade_lock(i); + else { + ii_write_unlock(i); + break; + } + } + + bstart = au_ibstart(i); + bend = au_ibend(i); + if (bstart <= bindex + && bindex <= bend + && au_h_iptr(i, bindex) + && au_test_ibusy(i, bstart, bend)) { + err = -EBUSY; + AuVerbose(verbose, "busy i%lu\n", i->i_ino); + AuDbgInode(i); + } + ii_read_unlock(i); + } + au_iarray_free(array, max); + +out: + return err; +} + +static int test_children_busy(struct dentry *root, aufs_bindex_t bindex, + const unsigned int verbose) +{ + int err; + unsigned int sigen; + + sigen = au_sigen(root->d_sb); + DiMustNoWaiters(root); + IiMustNoWaiters(root->d_inode); + di_write_unlock(root); + err = test_dentry_busy(root, bindex, sigen, verbose); + if (!err) + err = test_inode_busy(root->d_sb, bindex, sigen, verbose); + di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */ + + return err; +} + +static void au_br_do_del_brp(struct au_sbinfo *sbinfo, + const aufs_bindex_t bindex, + const aufs_bindex_t bend) +{ + struct au_branch **brp, **p; + + AuRwMustWriteLock(&sbinfo->si_rwsem); + + brp = sbinfo->si_branch + bindex; + if (bindex < bend) + memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex)); + sbinfo->si_branch[0 + bend] = NULL; + sbinfo->si_bend--; + + p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, AuGFP_SBILIST); + if (p) + sbinfo->si_branch = p; + /* harmless error */ +} + +static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex, + const aufs_bindex_t bend) +{ + struct au_hdentry *hdp, *p; + + AuRwMustWriteLock(&dinfo->di_rwsem); + + hdp = dinfo->di_hdentry; + if (bindex < bend) + memmove(hdp + bindex, hdp + bindex + 1, + sizeof(*hdp) * (bend - bindex)); + hdp[0 + bend].hd_dentry = NULL; + dinfo->di_bend--; + + p = krealloc(hdp, sizeof(*p) * bend, AuGFP_SBILIST); + if (p) + dinfo->di_hdentry = p; + /* harmless error */ +} + +static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex, + const aufs_bindex_t bend) +{ + struct au_hinode *hip, *p; + + AuRwMustWriteLock(&iinfo->ii_rwsem); + + hip = iinfo->ii_hinode + bindex; + if (bindex < bend) + memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex)); + iinfo->ii_hinode[0 + bend].hi_inode = NULL; + au_hn_init(iinfo->ii_hinode + bend); + iinfo->ii_bend--; + + p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, AuGFP_SBILIST); + if (p) + iinfo->ii_hinode = p; + /* harmless error */ +} + +static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex, + struct au_branch *br) +{ + aufs_bindex_t bend; + struct au_sbinfo *sbinfo; + struct dentry *root, *h_root; + struct inode *inode, *h_inode; + struct au_hinode *hinode; + + SiMustWriteLock(sb); + + root = sb->s_root; + inode = root->d_inode; + sbinfo = au_sbi(sb); + bend = sbinfo->si_bend; + + h_root = au_h_dptr(root, bindex); + hinode = au_hi(inode, bindex); + h_inode = au_igrab(hinode->hi_inode); + au_hiput(hinode); + + au_sbilist_lock(); + au_br_do_del_brp(sbinfo, bindex, bend); + au_br_do_del_hdp(au_di(root), bindex, bend); + au_br_do_del_hip(au_ii(inode), bindex, bend); + au_sbilist_unlock(); + + dput(h_root); + iput(h_inode); + au_br_do_free(br); +} + +int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount) +{ + int err, rerr, i; + unsigned int mnt_flags; + aufs_bindex_t bindex, bend, br_id; + unsigned char do_wh, verbose; + struct au_branch *br; + struct au_wbr *wbr; + + err = 0; + bindex = au_find_dbindex(sb->s_root, del->h_path.dentry); + if (bindex < 0) { + if (remount) + goto out; /* success */ + err = -ENOENT; + pr_err("%s no such branch\n", del->pathname); + goto out; + } + AuDbg("bindex b%d\n", bindex); + + err = -EBUSY; + mnt_flags = au_mntflags(sb); + verbose = !!au_opt_test(mnt_flags, VERBOSE); + bend = au_sbend(sb); + if (unlikely(!bend)) { + AuVerbose(verbose, "no more branches left\n"); + goto out; + } + br = au_sbr(sb, bindex); + i = atomic_read(&br->br_count); + if (unlikely(i)) { + AuVerbose(verbose, "%d file(s) opened\n", i); + goto out; + } + + wbr = br->br_wbr; + do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph); + if (do_wh) { + /* instead of WbrWhMustWriteLock(wbr) */ + SiMustWriteLock(sb); + for (i = 0; i < AuBrWh_Last; i++) { + dput(wbr->wbr_wh[i]); + wbr->wbr_wh[i] = NULL; + } + } + + err = test_children_busy(sb->s_root, bindex, verbose); + if (unlikely(err)) { + if (do_wh) + goto out_wh; + goto out; + } + + err = 0; + br_id = br->br_id; + if (!remount) + au_br_do_del(sb, bindex, br); + else { + sysaufs_brs_del(sb, bindex); + au_br_do_del(sb, bindex, br); + sysaufs_brs_add(sb, bindex); + } + + if (!bindex) { + au_cpup_attr_all(sb->s_root->d_inode, /*force*/1); + sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes; + } else + au_sub_nlink(sb->s_root->d_inode, del->h_path.dentry->d_inode); + if (au_opt_test(mnt_flags, PLINK)) + au_plink_half_refresh(sb, br_id); + + if (au_xino_brid(sb) == br_id) + au_xino_brid_set(sb, -1); + goto out; /* success */ + +out_wh: + /* revert */ + rerr = au_br_init_wh(sb, br, br->br_perm, del->h_path.dentry); + if (rerr) + pr_warning("failed re-creating base whiteout, %s. (%d)\n", + del->pathname, rerr); +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg) +{ + int err; + aufs_bindex_t bstart, bend; + struct aufs_ibusy ibusy; + struct inode *inode, *h_inode; + + err = -EPERM; + if (unlikely(!capable(CAP_SYS_ADMIN))) + goto out; + + err = copy_from_user(&ibusy, arg, sizeof(ibusy)); + if (!err) + err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino)); + if (unlikely(err)) { + err = -EFAULT; + AuTraceErr(err); + goto out; + } + + err = -EINVAL; + si_read_lock(sb, AuLock_FLUSH); + if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbend(sb))) + goto out_unlock; + + err = 0; + ibusy.h_ino = 0; /* invalid */ + inode = ilookup(sb, ibusy.ino); + if (!inode + || inode->i_ino == AUFS_ROOT_INO + || is_bad_inode(inode)) + goto out_unlock; + + ii_read_lock_child(inode); + bstart = au_ibstart(inode); + bend = au_ibend(inode); + if (bstart <= ibusy.bindex && ibusy.bindex <= bend) { + h_inode = au_h_iptr(inode, ibusy.bindex); + if (h_inode && au_test_ibusy(inode, bstart, bend)) + ibusy.h_ino = h_inode->i_ino; + } + ii_read_unlock(inode); + iput(inode); + +out_unlock: + si_read_unlock(sb); + if (!err) { + err = __put_user(ibusy.h_ino, &arg->h_ino); + if (unlikely(err)) { + err = -EFAULT; + AuTraceErr(err); + } + } +out: + return err; +} + +long au_ibusy_ioctl(struct file *file, unsigned long arg) +{ + return au_ibusy(file->f_dentry->d_sb, (void __user *)arg); +} + +#ifdef CONFIG_COMPAT +long au_ibusy_compat_ioctl(struct file *file, unsigned long arg) +{ + return au_ibusy(file->f_dentry->d_sb, compat_ptr(arg)); +} +#endif + +/* ---------------------------------------------------------------------- */ + +/* + * change a branch permission + */ + +static void au_warn_ima(void) +{ +#ifdef CONFIG_IMA + /* since it doesn't support mark_files_ro() */ + AuWarn1("RW -> RO makes IMA to produce wrong message\n"); +#endif +} + +static int do_need_sigen_inc(int a, int b) +{ + return au_br_whable(a) && !au_br_whable(b); +} + +static int need_sigen_inc(int old, int new) +{ + return do_need_sigen_inc(old, new) + || do_need_sigen_inc(new, old); +} + +static unsigned long long au_farray_cb(void *a, + unsigned long long max __maybe_unused, + void *arg) +{ + unsigned long long n; + struct file **p, *f; + struct super_block *sb = arg; + + n = 0; + p = a; + lg_global_lock(files_lglock); + do_file_list_for_each_entry(sb, f) { + if (au_fi(f) + && file_count(f) + && !special_file(f->f_dentry->d_inode->i_mode)) { + get_file(f); + *p++ = f; + n++; + AuDebugOn(n > max); + } + } while_file_list_for_each_entry; + lg_global_unlock(files_lglock); + + return n; +} + +static struct file **au_farray_alloc(struct super_block *sb, + unsigned long long *max) +{ + *max = atomic_long_read(&au_sbi(sb)->si_nfiles); + return au_array_alloc(max, au_farray_cb, sb); +} + +static void au_farray_free(struct file **a, unsigned long long max) +{ + unsigned long long ull; + + for (ull = 0; ull < max; ull++) + if (a[ull]) + fput(a[ull]); + au_array_free(a); +} + +static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex) +{ + int err, do_warn; + unsigned int mnt_flags; + unsigned long long ull, max; + aufs_bindex_t br_id; + unsigned char verbose; + struct file *file, *hf, **array; + struct inode *inode; + struct au_hfile *hfile; + + mnt_flags = au_mntflags(sb); + verbose = !!au_opt_test(mnt_flags, VERBOSE); + + array = au_farray_alloc(sb, &max); + err = PTR_ERR(array); + if (IS_ERR(array)) + goto out; + + do_warn = 0; + br_id = au_sbr_id(sb, bindex); + for (ull = 0; ull < max; ull++) { + file = array[ull]; + + /* AuDbg("%.*s\n", AuDLNPair(file->f_dentry)); */ + fi_read_lock(file); + if (unlikely(au_test_mmapped(file))) { + err = -EBUSY; + AuVerbose(verbose, "mmapped %.*s\n", + AuDLNPair(file->f_dentry)); + AuDbgFile(file); + FiMustNoWaiters(file); + fi_read_unlock(file); + goto out_array; + } + + inode = file->f_dentry->d_inode; + hfile = &au_fi(file)->fi_htop; + hf = hfile->hf_file; + if (!S_ISREG(inode->i_mode) + || !(file->f_mode & FMODE_WRITE) + || hfile->hf_br->br_id != br_id + || !(hf->f_mode & FMODE_WRITE)) + array[ull] = NULL; + else { + do_warn = 1; + get_file(file); + } + + FiMustNoWaiters(file); + fi_read_unlock(file); + fput(file); + } + + err = 0; + if (do_warn) + au_warn_ima(); + + for (ull = 0; ull < max; ull++) { + file = array[ull]; + if (!file) + continue; + + /* todo: already flushed? */ + /* cf. fs/super.c:mark_files_ro() */ + /* fi_read_lock(file); */ + hfile = &au_fi(file)->fi_htop; + hf = hfile->hf_file; + /* fi_read_unlock(file); */ + spin_lock(&hf->f_lock); + hf->f_mode &= ~FMODE_WRITE; + spin_unlock(&hf->f_lock); + if (!file_check_writeable(hf)) { + file_release_write(hf); + mnt_drop_write(hf->f_vfsmnt); + } + } + +out_array: + au_farray_free(array, max); +out: + AuTraceErr(err); + return err; +} + +int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, + int *do_refresh) +{ + int err, rerr; + aufs_bindex_t bindex; + struct path path; + struct dentry *root; + struct au_branch *br; + + root = sb->s_root; + bindex = au_find_dbindex(root, mod->h_root); + if (bindex < 0) { + if (remount) + return 0; /* success */ + err = -ENOENT; + pr_err("%s no such branch\n", mod->path); + goto out; + } + AuDbg("bindex b%d\n", bindex); + + err = test_br(mod->h_root->d_inode, mod->perm, mod->path); + if (unlikely(err)) + goto out; + + br = au_sbr(sb, bindex); + if (br->br_perm == mod->perm) + return 0; /* success */ + + if (au_br_writable(br->br_perm)) { + /* remove whiteout base */ + err = au_br_init_wh(sb, br, mod->perm, mod->h_root); + if (unlikely(err)) + goto out; + + if (!au_br_writable(mod->perm)) { + /* rw --> ro, file might be mmapped */ + DiMustNoWaiters(root); + IiMustNoWaiters(root->d_inode); + di_write_unlock(root); + err = au_br_mod_files_ro(sb, bindex); + /* aufs_write_lock() calls ..._child() */ + di_write_lock_child(root); + + if (unlikely(err)) { + rerr = -ENOMEM; + br->br_wbr = kmalloc(sizeof(*br->br_wbr), + GFP_NOFS); + if (br->br_wbr) { + path.mnt = br->br_mnt; + path.dentry = mod->h_root; + rerr = au_wbr_init(br, sb, br->br_perm, + &path); + } + if (unlikely(rerr)) { + AuIOErr("nested error %d (%d)\n", + rerr, err); + br->br_perm = mod->perm; + } + } + } + } else if (au_br_writable(mod->perm)) { + /* ro --> rw */ + err = -ENOMEM; + br->br_wbr = kmalloc(sizeof(*br->br_wbr), GFP_NOFS); + if (br->br_wbr) { + path.mnt = br->br_mnt; + path.dentry = mod->h_root; + err = au_wbr_init(br, sb, mod->perm, &path); + if (unlikely(err)) { + kfree(br->br_wbr); + br->br_wbr = NULL; + } + } + } + + if (!err) { + *do_refresh |= need_sigen_inc(br->br_perm, mod->perm); + br->br_perm = mod->perm; + } + +out: + AuTraceErr(err); + return err; +} diff --git a/ubuntu/aufs/branch.h b/ubuntu/aufs/branch.h new file mode 100644 index 000000000000..1f39ad6c831f --- /dev/null +++ b/ubuntu/aufs/branch.h @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * branch filesystems and xino for them + */ + +#ifndef __AUFS_BRANCH_H__ +#define __AUFS_BRANCH_H__ + +#ifdef __KERNEL__ + +#include +#include +#include +#include "dynop.h" +#include "rwsem.h" +#include "super.h" + +/* ---------------------------------------------------------------------- */ + +/* a xino file */ +struct au_xino_file { + struct file *xi_file; + struct mutex xi_nondir_mtx; + + /* todo: make xino files an array to support huge inode number */ + +#ifdef CONFIG_DEBUG_FS + struct dentry *xi_dbgaufs; +#endif +}; + +/* members for writable branch only */ +enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last}; +struct au_wbr { + struct au_rwsem wbr_wh_rwsem; + struct dentry *wbr_wh[AuBrWh_Last]; + atomic_t wbr_wh_running; +#define wbr_whbase wbr_wh[AuBrWh_BASE] /* whiteout base */ +#define wbr_plink wbr_wh[AuBrWh_PLINK] /* pseudo-link dir */ +#define wbr_orph wbr_wh[AuBrWh_ORPH] /* dir for orphans */ + + /* mfs mode */ + unsigned long long wbr_bytes; +}; + +/* ext2 has 3 types of operations at least, ext3 has 4 */ +#define AuBrDynOp (AuDyLast * 4) + +/* protected by superblock rwsem */ +struct au_branch { + struct au_xino_file br_xino; + + aufs_bindex_t br_id; + + int br_perm; + struct vfsmount *br_mnt; + spinlock_t br_dykey_lock; + struct au_dykey *br_dykey[AuBrDynOp]; + atomic_t br_count; + + struct au_wbr *br_wbr; + + /* xino truncation */ + blkcnt_t br_xino_upper; /* watermark in blocks */ + atomic_t br_xino_running; + +#ifdef CONFIG_AUFS_HFSNOTIFY + struct fsnotify_group *br_hfsn_group; + struct fsnotify_ops br_hfsn_ops; +#endif + +#ifdef CONFIG_SYSFS + /* an entry under sysfs per mount-point */ + char br_name[8]; + struct attribute br_attr; +#endif +}; + +/* ---------------------------------------------------------------------- */ + +/* branch permissions and attributes */ +#define AuBrPerm_RW 1 /* writable, hardlinkable wh */ +#define AuBrPerm_RO (1 << 1) /* readonly */ +#define AuBrPerm_RR (1 << 2) /* natively readonly */ +#define AuBrPerm_Mask (AuBrPerm_RW | AuBrPerm_RO | AuBrPerm_RR) + +#define AuBrRAttr_WH (1 << 3) /* whiteout-able */ + +#define AuBrWAttr_NoLinkWH (1 << 4) /* un-hardlinkable whiteouts */ + +static inline int au_br_writable(int brperm) +{ + return brperm & AuBrPerm_RW; +} + +static inline int au_br_whable(int brperm) +{ + return brperm & (AuBrPerm_RW | AuBrRAttr_WH); +} + +static inline int au_br_wh_linkable(int brperm) +{ + return !(brperm & AuBrWAttr_NoLinkWH); +} + +static inline int au_br_rdonly(struct au_branch *br) +{ + return ((br->br_mnt->mnt_sb->s_flags & MS_RDONLY) + || !au_br_writable(br->br_perm)) + ? -EROFS : 0; +} + +static inline int au_br_hnotifyable(int brperm __maybe_unused) +{ +#ifdef CONFIG_AUFS_HNOTIFY + return !(brperm & AuBrPerm_RR); +#else + return 0; +#endif +} + +/* ---------------------------------------------------------------------- */ + +/* branch.c */ +struct au_sbinfo; +void au_br_free(struct au_sbinfo *sinfo); +int au_br_index(struct super_block *sb, aufs_bindex_t br_id); +struct au_opt_add; +int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount); +struct au_opt_del; +int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount); +long au_ibusy_ioctl(struct file *file, unsigned long arg); +#ifdef CONFIG_COMPAT +long au_ibusy_compat_ioctl(struct file *file, unsigned long arg); +#endif +struct au_opt_mod; +int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, + int *do_refresh); + +/* xino.c */ +static const loff_t au_loff_max = LLONG_MAX; + +int au_xib_trunc(struct super_block *sb); +ssize_t xino_fread(au_readf_t func, struct file *file, void *buf, size_t size, + loff_t *pos); +ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, + loff_t *pos); +struct file *au_xino_create2(struct file *base_file, struct file *copy_src); +struct file *au_xino_create(struct super_block *sb, char *fname, int silent); +ino_t au_xino_new_ino(struct super_block *sb); +void au_xino_delete_inode(struct inode *inode, const int unlinked); +int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, + ino_t ino); +int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, + ino_t *ino); +int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino, + struct file *base_file, int do_test); +int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex); + +struct au_opt_xino; +int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount); +void au_xino_clr(struct super_block *sb); +struct file *au_xino_def(struct super_block *sb); +int au_xino_path(struct seq_file *seq, struct file *file); + +/* ---------------------------------------------------------------------- */ + +/* Superblock to branch */ +static inline +aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex) +{ + return au_sbr(sb, bindex)->br_id; +} + +static inline +struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex) +{ + return au_sbr(sb, bindex)->br_mnt; +} + +static inline +struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex) +{ + return au_sbr_mnt(sb, bindex)->mnt_sb; +} + +static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex) +{ + atomic_dec(&au_sbr(sb, bindex)->br_count); +} + +static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex) +{ + return au_sbr(sb, bindex)->br_perm; +} + +static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex) +{ + return au_br_whable(au_sbr_perm(sb, bindex)); +} + +/* ---------------------------------------------------------------------- */ + +/* + * wbr_wh_read_lock, wbr_wh_write_lock + * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock + */ +AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem); + +#define WbrWhMustNoWaiters(wbr) AuRwMustNoWaiters(&wbr->wbr_wh_rwsem) +#define WbrWhMustAnyLock(wbr) AuRwMustAnyLock(&wbr->wbr_wh_rwsem) +#define WbrWhMustWriteLock(wbr) AuRwMustWriteLock(&wbr->wbr_wh_rwsem) + +#endif /* __KERNEL__ */ +#endif /* __AUFS_BRANCH_H__ */ diff --git a/ubuntu/aufs/conf.mk b/ubuntu/aufs/conf.mk new file mode 100644 index 000000000000..6c5108de1aaf --- /dev/null +++ b/ubuntu/aufs/conf.mk @@ -0,0 +1,38 @@ + +AuConfStr = CONFIG_AUFS_FS=${CONFIG_AUFS_FS} + +define AuConf +ifdef ${1} +AuConfStr += ${1}=${${1}} +endif +endef + +AuConfAll = BRANCH_MAX_127 BRANCH_MAX_511 BRANCH_MAX_1023 BRANCH_MAX_32767 \ + SBILIST \ + HNOTIFY HFSNOTIFY \ + EXPORT INO_T_64 \ + RDU \ + PROC_MAP \ + SP_IATTR \ + SHWH \ + BR_RAMFS \ + BR_FUSE POLL \ + BR_HFSPLUS \ + BDEV_LOOP \ + DEBUG MAGIC_SYSRQ +$(foreach i, ${AuConfAll}, \ + $(eval $(call AuConf,CONFIG_AUFS_${i}))) + +AuConfName = ${obj}/conf.str +${AuConfName}.tmp: FORCE + @echo ${AuConfStr} | tr ' ' '\n' | sed -e 's/^/"/' -e 's/$$/\\n"/' > $@ +${AuConfName}: ${AuConfName}.tmp + @diff -q $< $@ > /dev/null 2>&1 || { \ + echo ' GEN ' $@; \ + cp -p $< $@; \ + } +FORCE: +clean-files += ${AuConfName} ${AuConfName}.tmp +${obj}/sysfs.o: ${AuConfName} + +-include ${srctree}/${src}/conf_priv.mk diff --git a/ubuntu/aufs/cpup.c b/ubuntu/aufs/cpup.c new file mode 100644 index 000000000000..13fc30490674 --- /dev/null +++ b/ubuntu/aufs/cpup.c @@ -0,0 +1,1081 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * copy-up functions, see wbr_policy.c for copy-down + */ + +#include +#include +#include +#include +#include "aufs.h" + +void au_cpup_attr_flags(struct inode *dst, struct inode *src) +{ + const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE + | S_NOATIME | S_NOCMTIME; + + dst->i_flags |= src->i_flags & ~mask; + if (au_test_fs_notime(dst->i_sb)) + dst->i_flags |= S_NOATIME | S_NOCMTIME; +} + +void au_cpup_attr_timesizes(struct inode *inode) +{ + struct inode *h_inode; + + h_inode = au_h_iptr(inode, au_ibstart(inode)); + fsstack_copy_attr_times(inode, h_inode); + fsstack_copy_inode_size(inode, h_inode); +} + +void au_cpup_attr_nlink(struct inode *inode, int force) +{ + struct inode *h_inode; + struct super_block *sb; + aufs_bindex_t bindex, bend; + + sb = inode->i_sb; + bindex = au_ibstart(inode); + h_inode = au_h_iptr(inode, bindex); + if (!force + && !S_ISDIR(h_inode->i_mode) + && au_opt_test(au_mntflags(sb), PLINK) + && au_plink_test(inode)) + return; + + set_nlink(inode, h_inode->i_nlink); + + /* + * fewer nlink makes find(1) noisy, but larger nlink doesn't. + * it may includes whplink directory. + */ + if (S_ISDIR(h_inode->i_mode)) { + bend = au_ibend(inode); + for (bindex++; bindex <= bend; bindex++) { + h_inode = au_h_iptr(inode, bindex); + if (h_inode) + au_add_nlink(inode, h_inode); + } + } +} + +void au_cpup_attr_changeable(struct inode *inode) +{ + struct inode *h_inode; + + h_inode = au_h_iptr(inode, au_ibstart(inode)); + inode->i_mode = h_inode->i_mode; + inode->i_uid = h_inode->i_uid; + inode->i_gid = h_inode->i_gid; + au_cpup_attr_timesizes(inode); + au_cpup_attr_flags(inode, h_inode); +} + +void au_cpup_igen(struct inode *inode, struct inode *h_inode) +{ + struct au_iinfo *iinfo = au_ii(inode); + + IiMustWriteLock(inode); + + iinfo->ii_higen = h_inode->i_generation; + iinfo->ii_hsb1 = h_inode->i_sb; +} + +void au_cpup_attr_all(struct inode *inode, int force) +{ + struct inode *h_inode; + + h_inode = au_h_iptr(inode, au_ibstart(inode)); + au_cpup_attr_changeable(inode); + if (inode->i_nlink > 0) + au_cpup_attr_nlink(inode, force); + inode->i_rdev = h_inode->i_rdev; + inode->i_blkbits = h_inode->i_blkbits; + au_cpup_igen(inode, h_inode); +} + +/* ---------------------------------------------------------------------- */ + +/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */ + +/* keep the timestamps of the parent dir when cpup */ +void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, + struct path *h_path) +{ + struct inode *h_inode; + + dt->dt_dentry = dentry; + dt->dt_h_path = *h_path; + h_inode = h_path->dentry->d_inode; + dt->dt_atime = h_inode->i_atime; + dt->dt_mtime = h_inode->i_mtime; + /* smp_mb(); */ +} + +void au_dtime_revert(struct au_dtime *dt) +{ + struct iattr attr; + int err; + + attr.ia_atime = dt->dt_atime; + attr.ia_mtime = dt->dt_mtime; + attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET + | ATTR_ATIME | ATTR_ATIME_SET; + + err = vfsub_notify_change(&dt->dt_h_path, &attr); + if (unlikely(err)) + pr_warning("restoring timestamps failed(%d). ignored\n", err); +} + +/* ---------------------------------------------------------------------- */ + +static noinline_for_stack +int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src) +{ + int err, sbits; + struct iattr ia; + struct path h_path; + struct inode *h_isrc, *h_idst; + + h_path.dentry = au_h_dptr(dst, bindex); + h_idst = h_path.dentry->d_inode; + h_path.mnt = au_sbr_mnt(dst->d_sb, bindex); + h_isrc = h_src->d_inode; + ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID + | ATTR_ATIME | ATTR_MTIME + | ATTR_ATIME_SET | ATTR_MTIME_SET; + ia.ia_uid = h_isrc->i_uid; + ia.ia_gid = h_isrc->i_gid; + ia.ia_atime = h_isrc->i_atime; + ia.ia_mtime = h_isrc->i_mtime; + if (h_idst->i_mode != h_isrc->i_mode + && !S_ISLNK(h_idst->i_mode)) { + ia.ia_valid |= ATTR_MODE; + ia.ia_mode = h_isrc->i_mode; + } + sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID)); + au_cpup_attr_flags(h_idst, h_isrc); + err = vfsub_notify_change(&h_path, &ia); + + /* is this nfs only? */ + if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) { + ia.ia_valid = ATTR_FORCE | ATTR_MODE; + ia.ia_mode = h_isrc->i_mode; + err = vfsub_notify_change(&h_path, &ia); + } + + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int au_do_copy_file(struct file *dst, struct file *src, loff_t len, + char *buf, unsigned long blksize) +{ + int err; + size_t sz, rbytes, wbytes; + unsigned char all_zero; + char *p, *zp; + struct mutex *h_mtx; + /* reduce stack usage */ + struct iattr *ia; + + zp = page_address(ZERO_PAGE(0)); + if (unlikely(!zp)) + return -ENOMEM; /* possible? */ + + err = 0; + all_zero = 0; + while (len) { + AuDbg("len %lld\n", len); + sz = blksize; + if (len < blksize) + sz = len; + + rbytes = 0; + /* todo: signal_pending? */ + while (!rbytes || err == -EAGAIN || err == -EINTR) { + rbytes = vfsub_read_k(src, buf, sz, &src->f_pos); + err = rbytes; + } + if (unlikely(err < 0)) + break; + + all_zero = 0; + if (len >= rbytes && rbytes == blksize) + all_zero = !memcmp(buf, zp, rbytes); + if (!all_zero) { + wbytes = rbytes; + p = buf; + while (wbytes) { + size_t b; + + b = vfsub_write_k(dst, p, wbytes, &dst->f_pos); + err = b; + /* todo: signal_pending? */ + if (unlikely(err == -EAGAIN || err == -EINTR)) + continue; + if (unlikely(err < 0)) + break; + wbytes -= b; + p += b; + } + } else { + loff_t res; + + AuLabel(hole); + res = vfsub_llseek(dst, rbytes, SEEK_CUR); + err = res; + if (unlikely(res < 0)) + break; + } + len -= rbytes; + err = 0; + } + + /* the last block may be a hole */ + if (!err && all_zero) { + AuLabel(last hole); + + err = 1; + if (au_test_nfs(dst->f_dentry->d_sb)) { + /* nfs requires this step to make last hole */ + /* is this only nfs? */ + do { + /* todo: signal_pending? */ + err = vfsub_write_k(dst, "\0", 1, &dst->f_pos); + } while (err == -EAGAIN || err == -EINTR); + if (err == 1) + dst->f_pos--; + } + + if (err == 1) { + ia = (void *)buf; + ia->ia_size = dst->f_pos; + ia->ia_valid = ATTR_SIZE | ATTR_FILE; + ia->ia_file = dst; + h_mtx = &dst->f_dentry->d_inode->i_mutex; + mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); + err = vfsub_notify_change(&dst->f_path, ia); + mutex_unlock(h_mtx); + } + } + + return err; +} + +int au_copy_file(struct file *dst, struct file *src, loff_t len) +{ + int err; + unsigned long blksize; + unsigned char do_kfree; + char *buf; + + err = -ENOMEM; + blksize = dst->f_dentry->d_sb->s_blocksize; + if (!blksize || PAGE_SIZE < blksize) + blksize = PAGE_SIZE; + AuDbg("blksize %lu\n", blksize); + do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *)); + if (do_kfree) + buf = kmalloc(blksize, GFP_NOFS); + else + buf = (void *)__get_free_page(GFP_NOFS); + if (unlikely(!buf)) + goto out; + + if (len > (1 << 22)) + AuDbg("copying a large file %lld\n", (long long)len); + + src->f_pos = 0; + dst->f_pos = 0; + err = au_do_copy_file(dst, src, len, buf, blksize); + if (do_kfree) + kfree(buf); + else + free_page((unsigned long)buf); + +out: + return err; +} + +/* + * to support a sparse file which is opened with O_APPEND, + * we need to close the file. + */ +static int au_cp_regular(struct dentry *dentry, aufs_bindex_t bdst, + aufs_bindex_t bsrc, loff_t len) +{ + int err, i; + enum { SRC, DST }; + struct { + aufs_bindex_t bindex; + unsigned int flags; + struct dentry *dentry; + struct file *file; + void *label, *label_file; + } *f, file[] = { + { + .bindex = bsrc, + .flags = O_RDONLY | O_NOATIME | O_LARGEFILE, + .file = NULL, + .label = &&out, + .label_file = &&out_src + }, + { + .bindex = bdst, + .flags = O_WRONLY | O_NOATIME | O_LARGEFILE, + .file = NULL, + .label = &&out_src, + .label_file = &&out_dst + } + }; + struct super_block *sb; + + /* bsrc branch can be ro/rw. */ + sb = dentry->d_sb; + f = file; + for (i = 0; i < 2; i++, f++) { + f->dentry = au_h_dptr(dentry, f->bindex); + f->file = au_h_open(dentry, f->bindex, f->flags, /*file*/NULL); + err = PTR_ERR(f->file); + if (IS_ERR(f->file)) + goto *f->label; + err = -EINVAL; + if (unlikely(!f->file->f_op)) + goto *f->label_file; + } + + /* try stopping to update while we copyup */ + IMustLock(file[SRC].dentry->d_inode); + err = au_copy_file(file[DST].file, file[SRC].file, len); + +out_dst: + fput(file[DST].file); + au_sbr_put(sb, file[DST].bindex); +out_src: + fput(file[SRC].file); + au_sbr_put(sb, file[SRC].bindex); +out: + return err; +} + +static int au_do_cpup_regular(struct dentry *dentry, aufs_bindex_t bdst, + aufs_bindex_t bsrc, loff_t len, + struct inode *h_dir, struct path *h_path) +{ + int err, rerr; + loff_t l; + + err = 0; + l = i_size_read(au_h_iptr(dentry->d_inode, bsrc)); + if (len == -1 || l < len) + len = l; + if (len) + err = au_cp_regular(dentry, bdst, bsrc, len); + if (!err) + goto out; /* success */ + + rerr = vfsub_unlink(h_dir, h_path, /*force*/0); + if (rerr) { + AuIOErr("failed unlinking cpup-ed %.*s(%d, %d)\n", + AuDLNPair(h_path->dentry), err, rerr); + err = -EIO; + } + +out: + return err; +} + +static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src, + struct inode *h_dir) +{ + int err, symlen; + mm_segment_t old_fs; + union { + char *k; + char __user *u; + } sym; + + err = -ENOSYS; + if (unlikely(!h_src->d_inode->i_op->readlink)) + goto out; + + err = -ENOMEM; + sym.k = __getname_gfp(GFP_NOFS); + if (unlikely(!sym.k)) + goto out; + + /* unnecessary to support mmap_sem since symlink is not mmap-able */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + symlen = h_src->d_inode->i_op->readlink(h_src, sym.u, PATH_MAX); + err = symlen; + set_fs(old_fs); + + if (symlen > 0) { + sym.k[symlen] = 0; + err = vfsub_symlink(h_dir, h_path, sym.k); + } + __putname(sym.k); + +out: + return err; +} + +/* return with the lower dst inode is locked */ +static noinline_for_stack +int cpup_entry(struct dentry *dentry, aufs_bindex_t bdst, + aufs_bindex_t bsrc, loff_t len, unsigned int flags, + struct dentry *dst_parent) +{ + int err; + umode_t mode; + unsigned int mnt_flags; + unsigned char isdir; + const unsigned char do_dt = !!au_ftest_cpup(flags, DTIME); + struct au_dtime dt; + struct path h_path; + struct dentry *h_src, *h_dst, *h_parent; + struct inode *h_inode, *h_dir; + struct super_block *sb; + + /* bsrc branch can be ro/rw. */ + h_src = au_h_dptr(dentry, bsrc); + h_inode = h_src->d_inode; + AuDebugOn(h_inode != au_h_iptr(dentry->d_inode, bsrc)); + + /* try stopping to be referenced while we are creating */ + h_dst = au_h_dptr(dentry, bdst); + h_parent = h_dst->d_parent; /* dir inode is locked */ + h_dir = h_parent->d_inode; + IMustLock(h_dir); + AuDebugOn(h_parent != h_dst->d_parent); + + sb = dentry->d_sb; + h_path.mnt = au_sbr_mnt(sb, bdst); + if (do_dt) { + h_path.dentry = h_parent; + au_dtime_store(&dt, dst_parent, &h_path); + } + h_path.dentry = h_dst; + + isdir = 0; + mode = h_inode->i_mode; + switch (mode & S_IFMT) { + case S_IFREG: + /* try stopping to update while we are referencing */ + IMustLock(h_inode); + err = vfsub_create(h_dir, &h_path, mode | S_IWUSR); + if (!err) + err = au_do_cpup_regular + (dentry, bdst, bsrc, len, + au_h_iptr(dst_parent->d_inode, bdst), &h_path); + break; + case S_IFDIR: + isdir = 1; + err = vfsub_mkdir(h_dir, &h_path, mode); + if (!err) { + /* + * strange behaviour from the users view, + * particularry setattr case + */ + if (au_ibstart(dst_parent->d_inode) == bdst) + au_cpup_attr_nlink(dst_parent->d_inode, + /*force*/1); + au_cpup_attr_nlink(dentry->d_inode, /*force*/1); + } + break; + case S_IFLNK: + err = au_do_cpup_symlink(&h_path, h_src, h_dir); + break; + case S_IFCHR: + case S_IFBLK: + AuDebugOn(!capable(CAP_MKNOD)); + /*FALLTHROUGH*/ + case S_IFIFO: + case S_IFSOCK: + err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev); + break; + default: + AuIOErr("Unknown inode type 0%o\n", mode); + err = -EIO; + } + + mnt_flags = au_mntflags(sb); + if (!au_opt_test(mnt_flags, UDBA_NONE) + && !isdir + && au_opt_test(mnt_flags, XINO) + && h_inode->i_nlink == 1 + /* todo: unnecessary? */ + /* && dentry->d_inode->i_nlink == 1 */ + && bdst < bsrc + && !au_ftest_cpup(flags, KEEPLINO)) + au_xino_write(sb, bsrc, h_inode->i_ino, /*ino*/0); + /* ignore this error */ + + if (do_dt) + au_dtime_revert(&dt); + return err; +} + +/* + * copyup the @dentry from @bsrc to @bdst. + * the caller must set the both of lower dentries. + * @len is for truncating when it is -1 copyup the entire file. + * in link/rename cases, @dst_parent may be different from the real one. + */ +static int au_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, + aufs_bindex_t bsrc, loff_t len, unsigned int flags, + struct dentry *dst_parent) +{ + int err, rerr; + aufs_bindex_t old_ibstart; + unsigned char isdir, plink; + struct au_dtime dt; + struct path h_path; + struct dentry *h_src, *h_dst, *h_parent; + struct inode *dst_inode, *h_dir, *inode; + struct super_block *sb; + + AuDebugOn(bsrc <= bdst); + + sb = dentry->d_sb; + h_path.mnt = au_sbr_mnt(sb, bdst); + h_dst = au_h_dptr(dentry, bdst); + h_parent = h_dst->d_parent; /* dir inode is locked */ + h_dir = h_parent->d_inode; + IMustLock(h_dir); + + h_src = au_h_dptr(dentry, bsrc); + inode = dentry->d_inode; + + if (!dst_parent) + dst_parent = dget_parent(dentry); + else + dget(dst_parent); + + plink = !!au_opt_test(au_mntflags(sb), PLINK); + dst_inode = au_h_iptr(inode, bdst); + if (dst_inode) { + if (unlikely(!plink)) { + err = -EIO; + AuIOErr("hi%lu(i%lu) exists on b%d " + "but plink is disabled\n", + dst_inode->i_ino, inode->i_ino, bdst); + goto out; + } + + if (dst_inode->i_nlink) { + const int do_dt = au_ftest_cpup(flags, DTIME); + + h_src = au_plink_lkup(inode, bdst); + err = PTR_ERR(h_src); + if (IS_ERR(h_src)) + goto out; + if (unlikely(!h_src->d_inode)) { + err = -EIO; + AuIOErr("i%lu exists on a upper branch " + "but not pseudo-linked\n", + inode->i_ino); + dput(h_src); + goto out; + } + + if (do_dt) { + h_path.dentry = h_parent; + au_dtime_store(&dt, dst_parent, &h_path); + } + h_path.dentry = h_dst; + err = vfsub_link(h_src, h_dir, &h_path); + if (do_dt) + au_dtime_revert(&dt); + dput(h_src); + goto out; + } else + /* todo: cpup_wh_file? */ + /* udba work */ + au_update_ibrange(inode, /*do_put_zero*/1); + } + + old_ibstart = au_ibstart(inode); + err = cpup_entry(dentry, bdst, bsrc, len, flags, dst_parent); + if (unlikely(err)) + goto out; + dst_inode = h_dst->d_inode; + mutex_lock_nested(&dst_inode->i_mutex, AuLsc_I_CHILD2); + + err = cpup_iattr(dentry, bdst, h_src); + isdir = S_ISDIR(dst_inode->i_mode); + if (!err) { + if (bdst < old_ibstart) { + if (S_ISREG(inode->i_mode)) { + err = au_dy_iaop(inode, bdst, dst_inode); + if (unlikely(err)) + goto out_rev; + } + au_set_ibstart(inode, bdst); + } + au_set_h_iptr(inode, bdst, au_igrab(dst_inode), + au_hi_flags(inode, isdir)); + mutex_unlock(&dst_inode->i_mutex); + if (!isdir + && h_src->d_inode->i_nlink > 1 + && plink) + au_plink_append(inode, bdst, h_dst); + goto out; /* success */ + } + + /* revert */ +out_rev: + h_path.dentry = h_parent; + mutex_unlock(&dst_inode->i_mutex); + au_dtime_store(&dt, dst_parent, &h_path); + h_path.dentry = h_dst; + if (!isdir) + rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); + else + rerr = vfsub_rmdir(h_dir, &h_path); + au_dtime_revert(&dt); + if (rerr) { + AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr); + err = -EIO; + } + +out: + dput(dst_parent); + return err; +} + +struct au_cpup_single_args { + int *errp; + struct dentry *dentry; + aufs_bindex_t bdst, bsrc; + loff_t len; + unsigned int flags; + struct dentry *dst_parent; +}; + +static void au_call_cpup_single(void *args) +{ + struct au_cpup_single_args *a = args; + *a->errp = au_cpup_single(a->dentry, a->bdst, a->bsrc, a->len, + a->flags, a->dst_parent); +} + +/* + * prevent SIGXFSZ in copy-up. + * testing CAP_MKNOD is for generic fs, + * but CAP_FSETID is for xfs only, currently. + */ +static int au_cpup_sio_test(struct super_block *sb, umode_t mode) +{ + int do_sio; + + do_sio = 0; + if (!au_wkq_test() + && (!au_sbi(sb)->si_plink_maint_pid + || au_plink_maint(sb, AuLock_NOPLM))) { + switch (mode & S_IFMT) { + case S_IFREG: + /* no condition about RLIMIT_FSIZE and the file size */ + do_sio = 1; + break; + case S_IFCHR: + case S_IFBLK: + do_sio = !capable(CAP_MKNOD); + break; + } + if (!do_sio) + do_sio = ((mode & (S_ISUID | S_ISGID)) + && !capable(CAP_FSETID)); + } + + return do_sio; +} + +int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, + aufs_bindex_t bsrc, loff_t len, unsigned int flags, + struct dentry *dst_parent) +{ + int err, wkq_err; + struct dentry *h_dentry; + + h_dentry = au_h_dptr(dentry, bsrc); + if (!au_cpup_sio_test(dentry->d_sb, h_dentry->d_inode->i_mode)) + err = au_cpup_single(dentry, bdst, bsrc, len, flags, + dst_parent); + else { + struct au_cpup_single_args args = { + .errp = &err, + .dentry = dentry, + .bdst = bdst, + .bsrc = bsrc, + .len = len, + .flags = flags, + .dst_parent = dst_parent + }; + wkq_err = au_wkq_wait(au_call_cpup_single, &args); + if (unlikely(wkq_err)) + err = wkq_err; + } + + return err; +} + +/* + * copyup the @dentry from the first active lower branch to @bdst, + * using au_cpup_single(). + */ +static int au_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, + unsigned int flags) +{ + int err; + aufs_bindex_t bsrc, bend; + + bend = au_dbend(dentry); + for (bsrc = bdst + 1; bsrc <= bend; bsrc++) + if (au_h_dptr(dentry, bsrc)) + break; + + err = au_lkup_neg(dentry, bdst); + if (!err) { + err = au_cpup_single(dentry, bdst, bsrc, len, flags, NULL); + if (!err) + return 0; /* success */ + + /* revert */ + au_set_h_dptr(dentry, bdst, NULL); + au_set_dbstart(dentry, bsrc); + } + + return err; +} + +struct au_cpup_simple_args { + int *errp; + struct dentry *dentry; + aufs_bindex_t bdst; + loff_t len; + unsigned int flags; +}; + +static void au_call_cpup_simple(void *args) +{ + struct au_cpup_simple_args *a = args; + *a->errp = au_cpup_simple(a->dentry, a->bdst, a->len, a->flags); +} + +int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, + unsigned int flags) +{ + int err, wkq_err; + struct dentry *parent; + struct inode *h_dir; + + parent = dget_parent(dentry); + h_dir = au_h_iptr(parent->d_inode, bdst); + if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE) + && !au_cpup_sio_test(dentry->d_sb, dentry->d_inode->i_mode)) + err = au_cpup_simple(dentry, bdst, len, flags); + else { + struct au_cpup_simple_args args = { + .errp = &err, + .dentry = dentry, + .bdst = bdst, + .len = len, + .flags = flags + }; + wkq_err = au_wkq_wait(au_call_cpup_simple, &args); + if (unlikely(wkq_err)) + err = wkq_err; + } + + dput(parent); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * copyup the deleted file for writing. + */ +static int au_do_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, + struct dentry *wh_dentry, struct file *file, + loff_t len) +{ + int err; + aufs_bindex_t bstart; + struct au_dinfo *dinfo; + struct dentry *h_d_dst, *h_d_start; + struct au_hdentry *hdp; + + dinfo = au_di(dentry); + AuRwMustWriteLock(&dinfo->di_rwsem); + + bstart = dinfo->di_bstart; + hdp = dinfo->di_hdentry; + h_d_dst = hdp[0 + bdst].hd_dentry; + dinfo->di_bstart = bdst; + hdp[0 + bdst].hd_dentry = wh_dentry; + if (file) { + h_d_start = hdp[0 + bstart].hd_dentry; + hdp[0 + bstart].hd_dentry = au_hf_top(file)->f_dentry; + } + err = au_cpup_single(dentry, bdst, bstart, len, !AuCpup_DTIME, + /*h_parent*/NULL); + if (file) { + if (!err) + err = au_reopen_nondir(file); + hdp[0 + bstart].hd_dentry = h_d_start; + } + hdp[0 + bdst].hd_dentry = h_d_dst; + dinfo->di_bstart = bstart; + + return err; +} + +static int au_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, + struct file *file) +{ + int err; + struct au_dtime dt; + struct dentry *parent, *h_parent, *wh_dentry; + struct au_branch *br; + struct path h_path; + + br = au_sbr(dentry->d_sb, bdst); + parent = dget_parent(dentry); + h_parent = au_h_dptr(parent, bdst); + wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); + err = PTR_ERR(wh_dentry); + if (IS_ERR(wh_dentry)) + goto out; + + h_path.dentry = h_parent; + h_path.mnt = br->br_mnt; + au_dtime_store(&dt, parent, &h_path); + err = au_do_cpup_wh(dentry, bdst, wh_dentry, file, len); + if (unlikely(err)) + goto out_wh; + + dget(wh_dentry); + h_path.dentry = wh_dentry; + if (!S_ISDIR(wh_dentry->d_inode->i_mode)) + err = vfsub_unlink(h_parent->d_inode, &h_path, /*force*/0); + else + err = vfsub_rmdir(h_parent->d_inode, &h_path); + if (unlikely(err)) { + AuIOErr("failed remove copied-up tmp file %.*s(%d)\n", + AuDLNPair(wh_dentry), err); + err = -EIO; + } + au_dtime_revert(&dt); + au_set_hi_wh(dentry->d_inode, bdst, wh_dentry); + +out_wh: + dput(wh_dentry); +out: + dput(parent); + return err; +} + +struct au_cpup_wh_args { + int *errp; + struct dentry *dentry; + aufs_bindex_t bdst; + loff_t len; + struct file *file; +}; + +static void au_call_cpup_wh(void *args) +{ + struct au_cpup_wh_args *a = args; + *a->errp = au_cpup_wh(a->dentry, a->bdst, a->len, a->file); +} + +int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, + struct file *file) +{ + int err, wkq_err; + struct dentry *parent, *h_orph, *h_parent, *h_dentry; + struct inode *dir, *h_dir, *h_tmpdir, *h_inode; + struct au_wbr *wbr; + + parent = dget_parent(dentry); + dir = parent->d_inode; + h_orph = NULL; + h_parent = NULL; + h_dir = au_igrab(au_h_iptr(dir, bdst)); + h_tmpdir = h_dir; + if (!h_dir->i_nlink) { + wbr = au_sbr(dentry->d_sb, bdst)->br_wbr; + h_orph = wbr->wbr_orph; + + h_parent = dget(au_h_dptr(parent, bdst)); + au_set_h_dptr(parent, bdst, dget(h_orph)); + h_tmpdir = h_orph->d_inode; + au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0); + + /* this temporary unlock is safe */ + if (file) + h_dentry = au_hf_top(file)->f_dentry; + else + h_dentry = au_h_dptr(dentry, au_dbstart(dentry)); + h_inode = h_dentry->d_inode; + IMustLock(h_inode); + mutex_unlock(&h_inode->i_mutex); + mutex_lock_nested(&h_tmpdir->i_mutex, AuLsc_I_PARENT3); + mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); + /* todo: au_h_open_pre()? */ + } + + if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE) + && !au_cpup_sio_test(dentry->d_sb, dentry->d_inode->i_mode)) + err = au_cpup_wh(dentry, bdst, len, file); + else { + struct au_cpup_wh_args args = { + .errp = &err, + .dentry = dentry, + .bdst = bdst, + .len = len, + .file = file + }; + wkq_err = au_wkq_wait(au_call_cpup_wh, &args); + if (unlikely(wkq_err)) + err = wkq_err; + } + + if (h_orph) { + mutex_unlock(&h_tmpdir->i_mutex); + /* todo: au_h_open_post()? */ + au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0); + au_set_h_dptr(parent, bdst, h_parent); + } + iput(h_dir); + dput(parent); + + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * generic routine for both of copy-up and copy-down. + */ +/* cf. revalidate function in file.c */ +int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, + int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, + struct dentry *h_parent, void *arg), + void *arg) +{ + int err; + struct au_pin pin; + struct dentry *d, *parent, *h_parent, *real_parent; + + err = 0; + parent = dget_parent(dentry); + if (IS_ROOT(parent)) + goto out; + + au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2, + au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE); + + /* do not use au_dpage */ + real_parent = parent; + while (1) { + dput(parent); + parent = dget_parent(dentry); + h_parent = au_h_dptr(parent, bdst); + if (h_parent) + goto out; /* success */ + + /* find top dir which is necessary to cpup */ + do { + d = parent; + dput(parent); + parent = dget_parent(d); + di_read_lock_parent3(parent, !AuLock_IR); + h_parent = au_h_dptr(parent, bdst); + di_read_unlock(parent, !AuLock_IR); + } while (!h_parent); + + if (d != real_parent) + di_write_lock_child3(d); + + /* somebody else might create while we were sleeping */ + if (!au_h_dptr(d, bdst) || !au_h_dptr(d, bdst)->d_inode) { + if (au_h_dptr(d, bdst)) + au_update_dbstart(d); + + au_pin_set_dentry(&pin, d); + err = au_do_pin(&pin); + if (!err) { + err = cp(d, bdst, h_parent, arg); + au_unpin(&pin); + } + } + + if (d != real_parent) + di_write_unlock(d); + if (unlikely(err)) + break; + } + +out: + dput(parent); + return err; +} + +static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst, + struct dentry *h_parent __maybe_unused , + void *arg __maybe_unused) +{ + return au_sio_cpup_simple(dentry, bdst, -1, AuCpup_DTIME); +} + +int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) +{ + return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL); +} + +int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) +{ + int err; + struct dentry *parent; + struct inode *dir; + + parent = dget_parent(dentry); + dir = parent->d_inode; + err = 0; + if (au_h_iptr(dir, bdst)) + goto out; + + di_read_unlock(parent, AuLock_IR); + di_write_lock_parent(parent); + /* someone else might change our inode while we were sleeping */ + if (!au_h_iptr(dir, bdst)) + err = au_cpup_dirs(dentry, bdst); + di_downgrade_lock(parent, AuLock_IR); + +out: + dput(parent); + return err; +} diff --git a/ubuntu/aufs/cpup.h b/ubuntu/aufs/cpup.h new file mode 100644 index 000000000000..8e0aa591c626 --- /dev/null +++ b/ubuntu/aufs/cpup.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * copy-up/down functions + */ + +#ifndef __AUFS_CPUP_H__ +#define __AUFS_CPUP_H__ + +#ifdef __KERNEL__ + +#include +#include +#include + +struct inode; +struct file; + +void au_cpup_attr_flags(struct inode *dst, struct inode *src); +void au_cpup_attr_timesizes(struct inode *inode); +void au_cpup_attr_nlink(struct inode *inode, int force); +void au_cpup_attr_changeable(struct inode *inode); +void au_cpup_igen(struct inode *inode, struct inode *h_inode); +void au_cpup_attr_all(struct inode *inode, int force); + +/* ---------------------------------------------------------------------- */ + +/* cpup flags */ +#define AuCpup_DTIME 1 /* do dtime_store/revert */ +#define AuCpup_KEEPLINO (1 << 1) /* do not clear the lower xino, + for link(2) */ +#define au_ftest_cpup(flags, name) ((flags) & AuCpup_##name) +#define au_fset_cpup(flags, name) \ + do { (flags) |= AuCpup_##name; } while (0) +#define au_fclr_cpup(flags, name) \ + do { (flags) &= ~AuCpup_##name; } while (0) + +int au_copy_file(struct file *dst, struct file *src, loff_t len); +int au_sio_cpup_single(struct dentry *dentry, aufs_bindex_t bdst, + aufs_bindex_t bsrc, loff_t len, unsigned int flags, + struct dentry *dst_parent); +int au_sio_cpup_simple(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, + unsigned int flags); +int au_sio_cpup_wh(struct dentry *dentry, aufs_bindex_t bdst, loff_t len, + struct file *file); + +int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, + int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, + struct dentry *h_parent, void *arg), + void *arg); +int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); +int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); + +/* ---------------------------------------------------------------------- */ + +/* keep timestamps when copyup */ +struct au_dtime { + struct dentry *dt_dentry; + struct path dt_h_path; + struct timespec dt_atime, dt_mtime; +}; +void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, + struct path *h_path); +void au_dtime_revert(struct au_dtime *dt); + +#endif /* __KERNEL__ */ +#endif /* __AUFS_CPUP_H__ */ diff --git a/ubuntu/aufs/dbgaufs.c b/ubuntu/aufs/dbgaufs.c new file mode 100644 index 000000000000..26d1d03ae0e6 --- /dev/null +++ b/ubuntu/aufs/dbgaufs.c @@ -0,0 +1,334 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * debugfs interface + */ + +#include +#include "aufs.h" + +#ifndef CONFIG_SYSFS +#error DEBUG_FS depends upon SYSFS +#endif + +static struct dentry *dbgaufs; +static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH; + +/* 20 is max digits length of ulong 64 */ +struct dbgaufs_arg { + int n; + char a[20 * 4]; +}; + +/* + * common function for all XINO files + */ +static int dbgaufs_xi_release(struct inode *inode __maybe_unused, + struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt) +{ + int err; + struct kstat st; + struct dbgaufs_arg *p; + + err = -ENOMEM; + p = kmalloc(sizeof(*p), GFP_NOFS); + if (unlikely(!p)) + goto out; + + err = 0; + p->n = 0; + file->private_data = p; + if (!xf) + goto out; + + err = vfs_getattr(xf->f_vfsmnt, xf->f_dentry, &st); + if (!err) { + if (do_fcnt) + p->n = snprintf + (p->a, sizeof(p->a), "%ld, %llux%lu %lld\n", + (long)file_count(xf), st.blocks, st.blksize, + (long long)st.size); + else + p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n", + st.blocks, st.blksize, + (long long)st.size); + AuDebugOn(p->n >= sizeof(p->a)); + } else { + p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err); + err = 0; + } + +out: + return err; + +} + +static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct dbgaufs_arg *p; + + p = file->private_data; + return simple_read_from_buffer(buf, count, ppos, p->a, p->n); +} + +/* ---------------------------------------------------------------------- */ + +static int dbgaufs_xib_open(struct inode *inode, struct file *file) +{ + int err; + struct au_sbinfo *sbinfo; + struct super_block *sb; + + sbinfo = inode->i_private; + sb = sbinfo->si_sb; + si_noflush_read_lock(sb); + err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0); + si_read_unlock(sb); + return err; +} + +static const struct file_operations dbgaufs_xib_fop = { + .owner = THIS_MODULE, + .open = dbgaufs_xib_open, + .release = dbgaufs_xi_release, + .read = dbgaufs_xi_read +}; + +/* ---------------------------------------------------------------------- */ + +#define DbgaufsXi_PREFIX "xi" + +static int dbgaufs_xino_open(struct inode *inode, struct file *file) +{ + int err; + long l; + struct au_sbinfo *sbinfo; + struct super_block *sb; + struct file *xf; + struct qstr *name; + + err = -ENOENT; + xf = NULL; + name = &file->f_dentry->d_name; + if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX) + || memcmp(name->name, DbgaufsXi_PREFIX, + sizeof(DbgaufsXi_PREFIX) - 1))) + goto out; + err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l); + if (unlikely(err)) + goto out; + + sbinfo = inode->i_private; + sb = sbinfo->si_sb; + si_noflush_read_lock(sb); + if (l <= au_sbend(sb)) { + xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file; + err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1); + } else + err = -ENOENT; + si_read_unlock(sb); + +out: + return err; +} + +static const struct file_operations dbgaufs_xino_fop = { + .owner = THIS_MODULE, + .open = dbgaufs_xino_open, + .release = dbgaufs_xi_release, + .read = dbgaufs_xi_read +}; + +void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) +{ + aufs_bindex_t bend; + struct au_branch *br; + struct au_xino_file *xi; + + if (!au_sbi(sb)->si_dbgaufs) + return; + + bend = au_sbend(sb); + for (; bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + xi = &br->br_xino; + if (xi->xi_dbgaufs) { + debugfs_remove(xi->xi_dbgaufs); + xi->xi_dbgaufs = NULL; + } + } +} + +void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) +{ + struct au_sbinfo *sbinfo; + struct dentry *parent; + struct au_branch *br; + struct au_xino_file *xi; + aufs_bindex_t bend; + char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NULL */ + + sbinfo = au_sbi(sb); + parent = sbinfo->si_dbgaufs; + if (!parent) + return; + + bend = au_sbend(sb); + for (; bindex <= bend; bindex++) { + snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex); + br = au_sbr(sb, bindex); + xi = &br->br_xino; + AuDebugOn(xi->xi_dbgaufs); + xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent, + sbinfo, &dbgaufs_xino_fop); + /* ignore an error */ + if (unlikely(!xi->xi_dbgaufs)) + AuWarn1("failed %s under debugfs\n", name); + } +} + +/* ---------------------------------------------------------------------- */ + +#ifdef CONFIG_AUFS_EXPORT +static int dbgaufs_xigen_open(struct inode *inode, struct file *file) +{ + int err; + struct au_sbinfo *sbinfo; + struct super_block *sb; + + sbinfo = inode->i_private; + sb = sbinfo->si_sb; + si_noflush_read_lock(sb); + err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0); + si_read_unlock(sb); + return err; +} + +static const struct file_operations dbgaufs_xigen_fop = { + .owner = THIS_MODULE, + .open = dbgaufs_xigen_open, + .release = dbgaufs_xi_release, + .read = dbgaufs_xi_read +}; + +static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) +{ + int err; + + /* + * This function is a dynamic '__init' fucntion actually, + * so the tiny check for si_rwsem is unnecessary. + */ + /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ + + err = -EIO; + sbinfo->si_dbgaufs_xigen = debugfs_create_file + ("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, + &dbgaufs_xigen_fop); + if (sbinfo->si_dbgaufs_xigen) + err = 0; + + return err; +} +#else +static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) +{ + return 0; +} +#endif /* CONFIG_AUFS_EXPORT */ + +/* ---------------------------------------------------------------------- */ + +void dbgaufs_si_fin(struct au_sbinfo *sbinfo) +{ + /* + * This function is a dynamic '__init' fucntion actually, + * so the tiny check for si_rwsem is unnecessary. + */ + /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ + + debugfs_remove_recursive(sbinfo->si_dbgaufs); + sbinfo->si_dbgaufs = NULL; + kobject_put(&sbinfo->si_kobj); +} + +int dbgaufs_si_init(struct au_sbinfo *sbinfo) +{ + int err; + char name[SysaufsSiNameLen]; + + /* + * This function is a dynamic '__init' fucntion actually, + * so the tiny check for si_rwsem is unnecessary. + */ + /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ + + err = -ENOENT; + if (!dbgaufs) { + AuErr1("/debug/aufs is uninitialized\n"); + goto out; + } + + err = -EIO; + sysaufs_name(sbinfo, name); + sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs); + if (unlikely(!sbinfo->si_dbgaufs)) + goto out; + kobject_get(&sbinfo->si_kobj); + + sbinfo->si_dbgaufs_xib = debugfs_create_file + ("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, + &dbgaufs_xib_fop); + if (unlikely(!sbinfo->si_dbgaufs_xib)) + goto out_dir; + + err = dbgaufs_xigen_init(sbinfo); + if (!err) + goto out; /* success */ + +out_dir: + dbgaufs_si_fin(sbinfo); +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +void dbgaufs_fin(void) +{ + debugfs_remove(dbgaufs); +} + +int __init dbgaufs_init(void) +{ + int err; + + err = -EIO; + dbgaufs = debugfs_create_dir(AUFS_NAME, NULL); + if (dbgaufs) + err = 0; + return err; +} diff --git a/ubuntu/aufs/dbgaufs.h b/ubuntu/aufs/dbgaufs.h new file mode 100644 index 000000000000..2eaa3f021bc9 --- /dev/null +++ b/ubuntu/aufs/dbgaufs.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * debugfs interface + */ + +#ifndef __DBGAUFS_H__ +#define __DBGAUFS_H__ + +#ifdef __KERNEL__ + +#include +#include + +struct super_block; +struct au_sbinfo; + +#ifdef CONFIG_DEBUG_FS +/* dbgaufs.c */ +void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); +void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); +void dbgaufs_si_fin(struct au_sbinfo *sbinfo); +int dbgaufs_si_init(struct au_sbinfo *sbinfo); +void dbgaufs_fin(void); +int __init dbgaufs_init(void); +#else +AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) +AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) +AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo) +AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo) +AuStubVoid(dbgaufs_fin, void) +AuStubInt0(__init dbgaufs_init, void) +#endif /* CONFIG_DEBUG_FS */ + +#endif /* __KERNEL__ */ +#endif /* __DBGAUFS_H__ */ diff --git a/ubuntu/aufs/dcsub.c b/ubuntu/aufs/dcsub.c new file mode 100644 index 000000000000..9d8cf10938f7 --- /dev/null +++ b/ubuntu/aufs/dcsub.c @@ -0,0 +1,243 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * sub-routines for dentry cache + */ + +#include "aufs.h" + +static void au_dpage_free(struct au_dpage *dpage) +{ + int i; + struct dentry **p; + + p = dpage->dentries; + for (i = 0; i < dpage->ndentry; i++) + dput(*p++); + free_page((unsigned long)dpage->dentries); +} + +int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp) +{ + int err; + void *p; + + err = -ENOMEM; + dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp); + if (unlikely(!dpages->dpages)) + goto out; + + p = (void *)__get_free_page(gfp); + if (unlikely(!p)) + goto out_dpages; + + dpages->dpages[0].ndentry = 0; + dpages->dpages[0].dentries = p; + dpages->ndpage = 1; + return 0; /* success */ + +out_dpages: + kfree(dpages->dpages); +out: + return err; +} + +void au_dpages_free(struct au_dcsub_pages *dpages) +{ + int i; + struct au_dpage *p; + + p = dpages->dpages; + for (i = 0; i < dpages->ndpage; i++) + au_dpage_free(p++); + kfree(dpages->dpages); +} + +static int au_dpages_append(struct au_dcsub_pages *dpages, + struct dentry *dentry, gfp_t gfp) +{ + int err, sz; + struct au_dpage *dpage; + void *p; + + dpage = dpages->dpages + dpages->ndpage - 1; + sz = PAGE_SIZE / sizeof(dentry); + if (unlikely(dpage->ndentry >= sz)) { + AuLabel(new dpage); + err = -ENOMEM; + sz = dpages->ndpage * sizeof(*dpages->dpages); + p = au_kzrealloc(dpages->dpages, sz, + sz + sizeof(*dpages->dpages), gfp); + if (unlikely(!p)) + goto out; + + dpages->dpages = p; + dpage = dpages->dpages + dpages->ndpage; + p = (void *)__get_free_page(gfp); + if (unlikely(!p)) + goto out; + + dpage->ndentry = 0; + dpage->dentries = p; + dpages->ndpage++; + } + + AuDebugOn(!dentry->d_count); + dpage->dentries[dpage->ndentry++] = dget_dlock(dentry); + return 0; /* success */ + +out: + return err; +} + +int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, + au_dpages_test test, void *arg) +{ + int err; + struct dentry *this_parent; + struct list_head *next; + struct super_block *sb = root->d_sb; + + err = 0; + write_seqlock(&rename_lock); + this_parent = root; + spin_lock(&this_parent->d_lock); +repeat: + next = this_parent->d_subdirs.next; +resume: + if (this_parent->d_sb == sb + && !IS_ROOT(this_parent) + && au_di(this_parent) + && this_parent->d_count + && (!test || test(this_parent, arg))) { + err = au_dpages_append(dpages, this_parent, GFP_ATOMIC); + if (unlikely(err)) + goto out; + } + + while (next != &this_parent->d_subdirs) { + struct list_head *tmp = next; + struct dentry *dentry = list_entry(tmp, struct dentry, + d_u.d_child); + + next = tmp->next; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (dentry->d_count) { + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, + _RET_IP_); + this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, + _RET_IP_); + goto repeat; + } + if (dentry->d_sb == sb + && au_di(dentry) + && (!test || test(dentry, arg))) + err = au_dpages_append(dpages, dentry, + GFP_ATOMIC); + } + spin_unlock(&dentry->d_lock); + if (unlikely(err)) + goto out; + } + + if (this_parent != root) { + struct dentry *tmp; + struct dentry *child; + + tmp = this_parent->d_parent; + rcu_read_lock(); + spin_unlock(&this_parent->d_lock); + child = this_parent; + this_parent = tmp; + spin_lock(&this_parent->d_lock); + rcu_read_unlock(); + next = child->d_u.d_child.next; + goto resume; + } + +out: + spin_unlock(&this_parent->d_lock); + write_sequnlock(&rename_lock); + return err; +} + +int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, + int do_include, au_dpages_test test, void *arg) +{ + int err; + + err = 0; + write_seqlock(&rename_lock); + spin_lock(&dentry->d_lock); + if (do_include + && dentry->d_count + && (!test || test(dentry, arg))) + err = au_dpages_append(dpages, dentry, GFP_ATOMIC); + spin_unlock(&dentry->d_lock); + if (unlikely(err)) + goto out; + + /* + * vfsmount_lock is unnecessary since this is a traverse in a single + * mount + */ + while (!IS_ROOT(dentry)) { + dentry = dentry->d_parent; /* rename_lock is locked */ + spin_lock(&dentry->d_lock); + if (dentry->d_count + && (!test || test(dentry, arg))) + err = au_dpages_append(dpages, dentry, GFP_ATOMIC); + spin_unlock(&dentry->d_lock); + if (unlikely(err)) + break; + } + +out: + write_sequnlock(&rename_lock); + return err; +} + +static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg) +{ + return au_di(dentry) && dentry->d_sb == arg; +} + +int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, + struct dentry *dentry, int do_include) +{ + return au_dcsub_pages_rev(dpages, dentry, do_include, + au_dcsub_dpages_aufs, dentry->d_sb); +} + +int au_test_subdir(struct dentry *d1, struct dentry *d2) +{ + struct path path[2] = { + { + .dentry = d1 + }, + { + .dentry = d2 + } + }; + + return path_is_under(path + 0, path + 1); +} diff --git a/ubuntu/aufs/dcsub.h b/ubuntu/aufs/dcsub.h new file mode 100644 index 000000000000..1211304afe87 --- /dev/null +++ b/ubuntu/aufs/dcsub.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * sub-routines for dentry cache + */ + +#ifndef __AUFS_DCSUB_H__ +#define __AUFS_DCSUB_H__ + +#ifdef __KERNEL__ + +#include +#include +#include + +struct dentry; + +struct au_dpage { + int ndentry; + struct dentry **dentries; +}; + +struct au_dcsub_pages { + int ndpage; + struct au_dpage *dpages; +}; + +/* ---------------------------------------------------------------------- */ + +/* dcsub.c */ +int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp); +void au_dpages_free(struct au_dcsub_pages *dpages); +typedef int (*au_dpages_test)(struct dentry *dentry, void *arg); +int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, + au_dpages_test test, void *arg); +int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, + int do_include, au_dpages_test test, void *arg); +int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, + struct dentry *dentry, int do_include); +int au_test_subdir(struct dentry *d1, struct dentry *d2); + +/* ---------------------------------------------------------------------- */ + +static inline int au_d_hashed_positive(struct dentry *d) +{ + int err; + struct inode *inode = d->d_inode; + err = 0; + if (unlikely(d_unhashed(d) || !inode || !inode->i_nlink)) + err = -ENOENT; + return err; +} + +static inline int au_d_alive(struct dentry *d) +{ + int err; + struct inode *inode; + err = 0; + if (!IS_ROOT(d)) + err = au_d_hashed_positive(d); + else { + inode = d->d_inode; + if (unlikely(d_unlinked(d) || !inode || !inode->i_nlink)) + err = -ENOENT; + } + return err; +} + +static inline int au_alive_dir(struct dentry *d) +{ + int err; + err = au_d_alive(d); + if (unlikely(err || IS_DEADDIR(d->d_inode))) + err = -ENOENT; + return err; +} + +#endif /* __KERNEL__ */ +#endif /* __AUFS_DCSUB_H__ */ diff --git a/ubuntu/aufs/debug.c b/ubuntu/aufs/debug.c new file mode 100644 index 000000000000..040a4722a2ae --- /dev/null +++ b/ubuntu/aufs/debug.c @@ -0,0 +1,490 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * debug print functions + */ + +#include +#include +#include "aufs.h" + +int aufs_debug; +MODULE_PARM_DESC(debug, "debug print"); +module_param_named(debug, aufs_debug, int, S_IRUGO | S_IWUSR | S_IWGRP); + +char *au_plevel = KERN_DEBUG; +#define dpri(fmt, ...) do { \ + if ((au_plevel \ + && strcmp(au_plevel, KERN_DEBUG)) \ + || au_debug_test()) \ + printk("%s" fmt, au_plevel, ##__VA_ARGS__); \ +} while (0) + +/* ---------------------------------------------------------------------- */ + +void au_dpri_whlist(struct au_nhash *whlist) +{ + unsigned long ul, n; + struct hlist_head *head; + struct au_vdir_wh *tpos; + struct hlist_node *pos; + + n = whlist->nh_num; + head = whlist->nh_head; + for (ul = 0; ul < n; ul++) { + hlist_for_each_entry(tpos, pos, head, wh_hash) + dpri("b%d, %.*s, %d\n", + tpos->wh_bindex, + tpos->wh_str.len, tpos->wh_str.name, + tpos->wh_str.len); + head++; + } +} + +void au_dpri_vdir(struct au_vdir *vdir) +{ + unsigned long ul; + union au_vdir_deblk_p p; + unsigned char *o; + + if (!vdir || IS_ERR(vdir)) { + dpri("err %ld\n", PTR_ERR(vdir)); + return; + } + + dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n", + vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk, + vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version); + for (ul = 0; ul < vdir->vd_nblk; ul++) { + p.deblk = vdir->vd_deblk[ul]; + o = p.deblk; + dpri("[%lu]: %p\n", ul, o); + } +} + +static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn, + struct dentry *wh) +{ + char *n = NULL; + int l = 0; + + if (!inode || IS_ERR(inode)) { + dpri("i%d: err %ld\n", bindex, PTR_ERR(inode)); + return -1; + } + + /* the type of i_blocks depends upon CONFIG_LSF */ + BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long) + && sizeof(inode->i_blocks) != sizeof(u64)); + if (wh) { + n = (void *)wh->d_name.name; + l = wh->d_name.len; + } + + dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu," + " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n", + bindex, inode, + inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??", + atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode, + i_size_read(inode), (unsigned long long)inode->i_blocks, + hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff, + inode->i_mapping ? inode->i_mapping->nrpages : 0, + inode->i_state, inode->i_flags, inode->i_version, + inode->i_generation, + l ? ", wh " : "", l, n); + return 0; +} + +void au_dpri_inode(struct inode *inode) +{ + struct au_iinfo *iinfo; + aufs_bindex_t bindex; + int err, hn; + + err = do_pri_inode(-1, inode, -1, NULL); + if (err || !au_test_aufs(inode->i_sb)) + return; + + iinfo = au_ii(inode); + if (!iinfo) + return; + dpri("i-1: bstart %d, bend %d, gen %d\n", + iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode)); + if (iinfo->ii_bstart < 0) + return; + hn = 0; + for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) { + hn = !!au_hn(iinfo->ii_hinode + bindex); + do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, hn, + iinfo->ii_hinode[0 + bindex].hi_whdentry); + } +} + +void au_dpri_dalias(struct inode *inode) +{ + struct dentry *d; + + spin_lock(&inode->i_lock); + list_for_each_entry(d, &inode->i_dentry, d_alias) + au_dpri_dentry(d); + spin_unlock(&inode->i_lock); +} + +static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry) +{ + struct dentry *wh = NULL; + int hn; + + if (!dentry || IS_ERR(dentry)) { + dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry)); + return -1; + } + /* do not call dget_parent() here */ + /* note: access d_xxx without d_lock */ + dpri("d%d: %.*s?/%.*s, %s, cnt %d, flags 0x%x\n", + bindex, + AuDLNPair(dentry->d_parent), AuDLNPair(dentry), + dentry->d_sb ? au_sbtype(dentry->d_sb) : "??", + dentry->d_count, dentry->d_flags); + hn = -1; + if (bindex >= 0 && dentry->d_inode && au_test_aufs(dentry->d_sb)) { + struct au_iinfo *iinfo = au_ii(dentry->d_inode); + if (iinfo) { + hn = !!au_hn(iinfo->ii_hinode + bindex); + wh = iinfo->ii_hinode[0 + bindex].hi_whdentry; + } + } + do_pri_inode(bindex, dentry->d_inode, hn, wh); + return 0; +} + +void au_dpri_dentry(struct dentry *dentry) +{ + struct au_dinfo *dinfo; + aufs_bindex_t bindex; + int err; + struct au_hdentry *hdp; + + err = do_pri_dentry(-1, dentry); + if (err || !au_test_aufs(dentry->d_sb)) + return; + + dinfo = au_di(dentry); + if (!dinfo) + return; + dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d\n", + dinfo->di_bstart, dinfo->di_bend, + dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry)); + if (dinfo->di_bstart < 0) + return; + hdp = dinfo->di_hdentry; + for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++) + do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry); +} + +static int do_pri_file(aufs_bindex_t bindex, struct file *file) +{ + char a[32]; + + if (!file || IS_ERR(file)) { + dpri("f%d: err %ld\n", bindex, PTR_ERR(file)); + return -1; + } + a[0] = 0; + if (bindex < 0 + && file->f_dentry + && au_test_aufs(file->f_dentry->d_sb) + && au_fi(file)) + snprintf(a, sizeof(a), ", gen %d, mmapped %d", + au_figen(file), atomic_read(&au_fi(file)->fi_mmapped)); + dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n", + bindex, file->f_mode, file->f_flags, (long)file_count(file), + file->f_version, file->f_pos, a); + if (file->f_dentry) + do_pri_dentry(bindex, file->f_dentry); + return 0; +} + +void au_dpri_file(struct file *file) +{ + struct au_finfo *finfo; + struct au_fidir *fidir; + struct au_hfile *hfile; + aufs_bindex_t bindex; + int err; + + err = do_pri_file(-1, file); + if (err || !file->f_dentry || !au_test_aufs(file->f_dentry->d_sb)) + return; + + finfo = au_fi(file); + if (!finfo) + return; + if (finfo->fi_btop < 0) + return; + fidir = finfo->fi_hdir; + if (!fidir) + do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file); + else + for (bindex = finfo->fi_btop; + bindex >= 0 && bindex <= fidir->fd_bbot; + bindex++) { + hfile = fidir->fd_hfile + bindex; + do_pri_file(bindex, hfile ? hfile->hf_file : NULL); + } +} + +static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br) +{ + struct vfsmount *mnt; + struct super_block *sb; + + if (!br || IS_ERR(br)) + goto out; + mnt = br->br_mnt; + if (!mnt || IS_ERR(mnt)) + goto out; + sb = mnt->mnt_sb; + if (!sb || IS_ERR(sb)) + goto out; + + dpri("s%d: {perm 0x%x, id %d, cnt %d, wbr %p}, " + "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, " + "xino %d\n", + bindex, br->br_perm, br->br_id, atomic_read(&br->br_count), + br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev), + sb->s_flags, sb->s_count, + atomic_read(&sb->s_active), !!br->br_xino.xi_file); + return 0; + +out: + dpri("s%d: err %ld\n", bindex, PTR_ERR(br)); + return -1; +} + +void au_dpri_sb(struct super_block *sb) +{ + struct au_sbinfo *sbinfo; + aufs_bindex_t bindex; + int err; + /* to reuduce stack size */ + struct { + struct vfsmount mnt; + struct au_branch fake; + } *a; + + /* this function can be called from magic sysrq */ + a = kzalloc(sizeof(*a), GFP_ATOMIC); + if (unlikely(!a)) { + dpri("no memory\n"); + return; + } + + a->mnt.mnt_sb = sb; + a->fake.br_perm = 0; + a->fake.br_mnt = &a->mnt; + a->fake.br_xino.xi_file = NULL; + atomic_set(&a->fake.br_count, 0); + smp_mb(); /* atomic_set */ + err = do_pri_br(-1, &a->fake); + kfree(a); + dpri("dev 0x%x\n", sb->s_dev); + if (err || !au_test_aufs(sb)) + return; + + sbinfo = au_sbi(sb); + if (!sbinfo) + return; + dpri("nw %d, gen %u, kobj %d\n", + atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation, + atomic_read(&sbinfo->si_kobj.kref.refcount)); + for (bindex = 0; bindex <= sbinfo->si_bend; bindex++) + do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); +} + +/* ---------------------------------------------------------------------- */ + +void au_dbg_sleep_jiffy(int jiffy) +{ + while (jiffy) + jiffy = schedule_timeout_uninterruptible(jiffy); +} + +void au_dbg_iattr(struct iattr *ia) +{ +#define AuBit(name) if (ia->ia_valid & ATTR_ ## name) \ + dpri(#name "\n") + AuBit(MODE); + AuBit(UID); + AuBit(GID); + AuBit(SIZE); + AuBit(ATIME); + AuBit(MTIME); + AuBit(CTIME); + AuBit(ATIME_SET); + AuBit(MTIME_SET); + AuBit(FORCE); + AuBit(ATTR_FLAG); + AuBit(KILL_SUID); + AuBit(KILL_SGID); + AuBit(FILE); + AuBit(KILL_PRIV); + AuBit(OPEN); + AuBit(TIMES_SET); +#undef AuBit + dpri("ia_file %p\n", ia->ia_file); +} + +/* ---------------------------------------------------------------------- */ + +void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line) +{ + struct inode *h_inode, *inode = dentry->d_inode; + struct dentry *h_dentry; + aufs_bindex_t bindex, bend, bi; + + if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */) + return; + + bend = au_dbend(dentry); + bi = au_ibend(inode); + if (bi < bend) + bend = bi; + bindex = au_dbstart(dentry); + bi = au_ibstart(inode); + if (bi > bindex) + bindex = bi; + + for (; bindex <= bend; bindex++) { + h_dentry = au_h_dptr(dentry, bindex); + if (!h_dentry) + continue; + h_inode = au_h_iptr(inode, bindex); + if (unlikely(h_inode != h_dentry->d_inode)) { + int old = au_debug_test(); + if (!old) + au_debug(1); + AuDbg("b%d, %s:%d\n", bindex, func, line); + AuDbgDentry(dentry); + AuDbgInode(inode); + if (!old) + au_debug(0); + BUG(); + } + } +} + +void au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen) +{ + struct dentry *parent; + + parent = dget_parent(dentry); + AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode)); + AuDebugOn(IS_ROOT(dentry)); + AuDebugOn(au_digen_test(parent, sigen)); + dput(parent); +} + +void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen) +{ + struct dentry *parent; + struct inode *inode; + + parent = dget_parent(dentry); + inode = dentry->d_inode; + AuDebugOn(inode && S_ISDIR(dentry->d_inode->i_mode)); + AuDebugOn(au_digen_test(parent, sigen)); + dput(parent); +} + +void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) +{ + int err, i, j; + struct au_dcsub_pages dpages; + struct au_dpage *dpage; + struct dentry **dentries; + + err = au_dpages_init(&dpages, GFP_NOFS); + AuDebugOn(err); + err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1); + AuDebugOn(err); + for (i = dpages.ndpage - 1; !err && i >= 0; i--) { + dpage = dpages.dpages + i; + dentries = dpage->dentries; + for (j = dpage->ndentry - 1; !err && j >= 0; j--) + AuDebugOn(au_digen_test(dentries[j], sigen)); + } + au_dpages_free(&dpages); +} + +void au_dbg_verify_kthread(void) +{ + if (au_wkq_test()) { + au_dbg_blocked(); + /* + * It may be recursive, but udba=notify between two aufs mounts, + * where a single ro branch is shared, is not a problem. + */ + /* WARN_ON(1); */ + } +} + +/* ---------------------------------------------------------------------- */ + +void au_debug_sbinfo_init(struct au_sbinfo *sbinfo __maybe_unused) +{ +#ifdef AuForceNoPlink + au_opt_clr(sbinfo->si_mntflags, PLINK); +#endif +#ifdef AuForceNoXino + au_opt_clr(sbinfo->si_mntflags, XINO); +#endif +#ifdef AuForceNoRefrof + au_opt_clr(sbinfo->si_mntflags, REFROF); +#endif +#ifdef AuForceHnotify + au_opt_set_udba(sbinfo->si_mntflags, UDBA_HNOTIFY); +#endif +#ifdef AuForceRd0 + sbinfo->si_rdblk = 0; + sbinfo->si_rdhash = 0; +#endif +} + +int __init au_debug_init(void) +{ + aufs_bindex_t bindex; + struct au_vdir_destr destr; + + bindex = -1; + AuDebugOn(bindex >= 0); + + destr.len = -1; + AuDebugOn(destr.len < NAME_MAX); + +#ifdef CONFIG_4KSTACKS + pr_warning("CONFIG_4KSTACKS is defined.\n"); +#endif + +#ifdef AuForceNoBrs + sysaufs_brs = 0; +#endif + + return 0; +} diff --git a/ubuntu/aufs/debug.h b/ubuntu/aufs/debug.h new file mode 100644 index 000000000000..453459e664d9 --- /dev/null +++ b/ubuntu/aufs/debug.h @@ -0,0 +1,252 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * debug print functions + */ + +#ifndef __AUFS_DEBUG_H__ +#define __AUFS_DEBUG_H__ + +#ifdef __KERNEL__ + +#include +#include +/* #include */ +#include +#include +#include +/* #include */ +#include +/* #include */ +#include +#include + +#include + +#ifdef CONFIG_AUFS_DEBUG +#define AuDebugOn(a) BUG_ON(a) + +/* module parameter */ +extern int aufs_debug; +static inline void au_debug(int n) +{ + aufs_debug = n; + smp_mb(); +} + +static inline int au_debug_test(void) +{ + return aufs_debug; +} +#else +#define AuDebugOn(a) do {} while (0) +AuStubVoid(au_debug, int n) +AuStubInt0(au_debug_test, void) +#endif /* CONFIG_AUFS_DEBUG */ + +/* ---------------------------------------------------------------------- */ + +/* debug print */ + +#define AuDbg(fmt, ...) do { \ + if (au_debug_test()) \ + pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \ +} while (0) +#define AuLabel(l) AuDbg(#l "\n") +#define AuIOErr(fmt, ...) pr_err("I/O Error, " fmt, ##__VA_ARGS__) +#define AuWarn1(fmt, ...) do { \ + static unsigned char _c; \ + if (!_c++) \ + pr_warning(fmt, ##__VA_ARGS__); \ +} while (0) + +#define AuErr1(fmt, ...) do { \ + static unsigned char _c; \ + if (!_c++) \ + pr_err(fmt, ##__VA_ARGS__); \ +} while (0) + +#define AuIOErr1(fmt, ...) do { \ + static unsigned char _c; \ + if (!_c++) \ + AuIOErr(fmt, ##__VA_ARGS__); \ +} while (0) + +#define AuUnsupportMsg "This operation is not supported." \ + " Please report this application to aufs-users ML." +#define AuUnsupport(fmt, ...) do { \ + pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \ + dump_stack(); \ +} while (0) + +#define AuTraceErr(e) do { \ + if (unlikely((e) < 0)) \ + AuDbg("err %d\n", (int)(e)); \ +} while (0) + +#define AuTraceErrPtr(p) do { \ + if (IS_ERR(p)) \ + AuDbg("err %ld\n", PTR_ERR(p)); \ +} while (0) + +/* dirty macros for debug print, use with "%.*s" and caution */ +#define AuLNPair(qstr) (qstr)->len, (qstr)->name +#define AuDLNPair(d) AuLNPair(&(d)->d_name) + +/* ---------------------------------------------------------------------- */ + +struct au_sbinfo; +struct au_finfo; +struct dentry; +#ifdef CONFIG_AUFS_DEBUG +extern char *au_plevel; +struct au_nhash; +void au_dpri_whlist(struct au_nhash *whlist); +struct au_vdir; +void au_dpri_vdir(struct au_vdir *vdir); +struct inode; +void au_dpri_inode(struct inode *inode); +void au_dpri_dalias(struct inode *inode); +void au_dpri_dentry(struct dentry *dentry); +struct file; +void au_dpri_file(struct file *filp); +struct super_block; +void au_dpri_sb(struct super_block *sb); + +void au_dbg_sleep_jiffy(int jiffy); +struct iattr; +void au_dbg_iattr(struct iattr *ia); + +#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__) +void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line); +void au_dbg_verify_dir_parent(struct dentry *dentry, unsigned int sigen); +void au_dbg_verify_nondir_parent(struct dentry *dentry, unsigned int sigen); +void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen); +void au_dbg_verify_kthread(void); + +int __init au_debug_init(void); +void au_debug_sbinfo_init(struct au_sbinfo *sbinfo); +#define AuDbgWhlist(w) do { \ + AuDbg(#w "\n"); \ + au_dpri_whlist(w); \ +} while (0) + +#define AuDbgVdir(v) do { \ + AuDbg(#v "\n"); \ + au_dpri_vdir(v); \ +} while (0) + +#define AuDbgInode(i) do { \ + AuDbg(#i "\n"); \ + au_dpri_inode(i); \ +} while (0) + +#define AuDbgDAlias(i) do { \ + AuDbg(#i "\n"); \ + au_dpri_dalias(i); \ +} while (0) + +#define AuDbgDentry(d) do { \ + AuDbg(#d "\n"); \ + au_dpri_dentry(d); \ +} while (0) + +#define AuDbgFile(f) do { \ + AuDbg(#f "\n"); \ + au_dpri_file(f); \ +} while (0) + +#define AuDbgSb(sb) do { \ + AuDbg(#sb "\n"); \ + au_dpri_sb(sb); \ +} while (0) + +#define AuDbgSleep(sec) do { \ + AuDbg("sleep %d sec\n", sec); \ + ssleep(sec); \ +} while (0) + +#define AuDbgSleepJiffy(jiffy) do { \ + AuDbg("sleep %d jiffies\n", jiffy); \ + au_dbg_sleep_jiffy(jiffy); \ +} while (0) + +#define AuDbgIAttr(ia) do { \ + AuDbg("ia_valid 0x%x\n", (ia)->ia_valid); \ + au_dbg_iattr(ia); \ +} while (0) + +#define AuDbgSym(addr) do { \ + char sym[KSYM_SYMBOL_LEN]; \ + sprint_symbol(sym, (unsigned long)addr); \ + AuDbg("%s\n", sym); \ +} while (0) + +#define AuInfoSym(addr) do { \ + char sym[KSYM_SYMBOL_LEN]; \ + sprint_symbol(sym, (unsigned long)addr); \ + AuInfo("%s\n", sym); \ +} while (0) +#else +AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry) +AuStubVoid(au_dbg_verify_dir_parent, struct dentry *dentry, unsigned int sigen) +AuStubVoid(au_dbg_verify_nondir_parent, struct dentry *dentry, + unsigned int sigen) +AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen) +AuStubVoid(au_dbg_verify_kthread, void) +AuStubInt0(__init au_debug_init, void) +AuStubVoid(au_debug_sbinfo_init, struct au_sbinfo *sbinfo) + +#define AuDbgWhlist(w) do {} while (0) +#define AuDbgVdir(v) do {} while (0) +#define AuDbgInode(i) do {} while (0) +#define AuDbgDAlias(i) do {} while (0) +#define AuDbgDentry(d) do {} while (0) +#define AuDbgFile(f) do {} while (0) +#define AuDbgSb(sb) do {} while (0) +#define AuDbgSleep(sec) do {} while (0) +#define AuDbgSleepJiffy(jiffy) do {} while (0) +#define AuDbgIAttr(ia) do {} while (0) +#define AuDbgSym(addr) do {} while (0) +#define AuInfoSym(addr) do {} while (0) +#endif /* CONFIG_AUFS_DEBUG */ + +/* ---------------------------------------------------------------------- */ + +#ifdef CONFIG_AUFS_MAGIC_SYSRQ +int __init au_sysrq_init(void); +void au_sysrq_fin(void); + +#ifdef CONFIG_HW_CONSOLE +#define au_dbg_blocked() do { \ + WARN_ON(1); \ + handle_sysrq('w'); \ +} while (0) +#else +AuStubVoid(au_dbg_blocked, void) +#endif + +#else +AuStubInt0(__init au_sysrq_init, void) +AuStubVoid(au_sysrq_fin, void) +AuStubVoid(au_dbg_blocked, void) +#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ + +#endif /* __KERNEL__ */ +#endif /* __AUFS_DEBUG_H__ */ diff --git a/ubuntu/aufs/dentry.c b/ubuntu/aufs/dentry.c new file mode 100644 index 000000000000..176fbdcd6ec8 --- /dev/null +++ b/ubuntu/aufs/dentry.c @@ -0,0 +1,1140 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * lookup and dentry operations + */ + +#include +#include "aufs.h" + +static void au_h_nd(struct nameidata *h_nd, struct nameidata *nd) +{ + if (nd) { + *h_nd = *nd; + + /* + * gave up supporting LOOKUP_CREATE/OPEN for lower fs, + * due to whiteout and branch permission. + */ + h_nd->flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE + | LOOKUP_FOLLOW | LOOKUP_EXCL); + /* unnecessary? */ + h_nd->intent.open.file = NULL; + } else + memset(h_nd, 0, sizeof(*h_nd)); +} + +struct au_lkup_one_args { + struct dentry **errp; + struct qstr *name; + struct dentry *h_parent; + struct au_branch *br; + struct nameidata *nd; +}; + +struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent, + struct au_branch *br, struct nameidata *nd) +{ + struct dentry *h_dentry; + int err; + struct nameidata h_nd; + + if (au_test_fs_null_nd(h_parent->d_sb)) + return vfsub_lookup_one_len(name->name, h_parent, name->len); + + au_h_nd(&h_nd, nd); + h_nd.path.dentry = h_parent; + h_nd.path.mnt = br->br_mnt; + + err = vfsub_name_hash(name->name, &h_nd.last, name->len); + h_dentry = ERR_PTR(err); + if (!err) { + path_get(&h_nd.path); + h_dentry = vfsub_lookup_hash(&h_nd); + path_put(&h_nd.path); + } + + AuTraceErrPtr(h_dentry); + return h_dentry; +} + +static void au_call_lkup_one(void *args) +{ + struct au_lkup_one_args *a = args; + *a->errp = au_lkup_one(a->name, a->h_parent, a->br, a->nd); +} + +#define AuLkup_ALLOW_NEG 1 +#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) +#define au_fset_lkup(flags, name) \ + do { (flags) |= AuLkup_##name; } while (0) +#define au_fclr_lkup(flags, name) \ + do { (flags) &= ~AuLkup_##name; } while (0) + +struct au_do_lookup_args { + unsigned int flags; + mode_t type; + struct nameidata *nd; +}; + +/* + * returns positive/negative dentry, NULL or an error. + * NULL means whiteout-ed or not-found. + */ +static struct dentry* +au_do_lookup(struct dentry *h_parent, struct dentry *dentry, + aufs_bindex_t bindex, struct qstr *wh_name, + struct au_do_lookup_args *args) +{ + struct dentry *h_dentry; + struct inode *h_inode, *inode; + struct au_branch *br; + int wh_found, opq; + unsigned char wh_able; + const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG); + + wh_found = 0; + br = au_sbr(dentry->d_sb, bindex); + wh_able = !!au_br_whable(br->br_perm); + if (wh_able) + wh_found = au_wh_test(h_parent, wh_name, br, /*try_sio*/0); + h_dentry = ERR_PTR(wh_found); + if (!wh_found) + goto real_lookup; + if (unlikely(wh_found < 0)) + goto out; + + /* We found a whiteout */ + /* au_set_dbend(dentry, bindex); */ + au_set_dbwh(dentry, bindex); + if (!allow_neg) + return NULL; /* success */ + +real_lookup: + h_dentry = au_lkup_one(&dentry->d_name, h_parent, br, args->nd); + if (IS_ERR(h_dentry)) + goto out; + + h_inode = h_dentry->d_inode; + if (!h_inode) { + if (!allow_neg) + goto out_neg; + } else if (wh_found + || (args->type && args->type != (h_inode->i_mode & S_IFMT))) + goto out_neg; + + if (au_dbend(dentry) <= bindex) + au_set_dbend(dentry, bindex); + if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) + au_set_dbstart(dentry, bindex); + au_set_h_dptr(dentry, bindex, h_dentry); + + inode = dentry->d_inode; + if (!h_inode || !S_ISDIR(h_inode->i_mode) || !wh_able + || (inode && !S_ISDIR(inode->i_mode))) + goto out; /* success */ + + mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); + opq = au_diropq_test(h_dentry, br); + mutex_unlock(&h_inode->i_mutex); + if (opq > 0) + au_set_dbdiropq(dentry, bindex); + else if (unlikely(opq < 0)) { + au_set_h_dptr(dentry, bindex, NULL); + h_dentry = ERR_PTR(opq); + } + goto out; + +out_neg: + dput(h_dentry); + h_dentry = NULL; +out: + return h_dentry; +} + +static int au_test_shwh(struct super_block *sb, const struct qstr *name) +{ + if (unlikely(!au_opt_test(au_mntflags(sb), SHWH) + && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))) + return -EPERM; + return 0; +} + +/* + * returns the number of lower positive dentries, + * otherwise an error. + * can be called at unlinking with @type is zero. + */ +int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type, + struct nameidata *nd) +{ + int npositive, err; + aufs_bindex_t bindex, btail, bdiropq; + unsigned char isdir; + struct qstr whname; + struct au_do_lookup_args args = { + .flags = 0, + .type = type, + .nd = nd + }; + const struct qstr *name = &dentry->d_name; + struct dentry *parent; + struct inode *inode; + + err = au_test_shwh(dentry->d_sb, name); + if (unlikely(err)) + goto out; + + err = au_wh_name_alloc(&whname, name); + if (unlikely(err)) + goto out; + + inode = dentry->d_inode; + isdir = !!(inode && S_ISDIR(inode->i_mode)); + if (!type) + au_fset_lkup(args.flags, ALLOW_NEG); + + npositive = 0; + parent = dget_parent(dentry); + btail = au_dbtaildir(parent); + for (bindex = bstart; bindex <= btail; bindex++) { + struct dentry *h_parent, *h_dentry; + struct inode *h_inode, *h_dir; + + h_dentry = au_h_dptr(dentry, bindex); + if (h_dentry) { + if (h_dentry->d_inode) + npositive++; + if (type != S_IFDIR) + break; + continue; + } + h_parent = au_h_dptr(parent, bindex); + if (!h_parent) + continue; + h_dir = h_parent->d_inode; + if (!h_dir || !S_ISDIR(h_dir->i_mode)) + continue; + + mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); + h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname, + &args); + mutex_unlock(&h_dir->i_mutex); + err = PTR_ERR(h_dentry); + if (IS_ERR(h_dentry)) + goto out_parent; + au_fclr_lkup(args.flags, ALLOW_NEG); + + if (au_dbwh(dentry) >= 0) + break; + if (!h_dentry) + continue; + h_inode = h_dentry->d_inode; + if (!h_inode) + continue; + npositive++; + if (!args.type) + args.type = h_inode->i_mode & S_IFMT; + if (args.type != S_IFDIR) + break; + else if (isdir) { + /* the type of lower may be different */ + bdiropq = au_dbdiropq(dentry); + if (bdiropq >= 0 && bdiropq <= bindex) + break; + } + } + + if (npositive) { + AuLabel(positive); + au_update_dbstart(dentry); + } + err = npositive; + if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) + && au_dbstart(dentry) < 0)) { + err = -EIO; + AuIOErr("both of real entry and whiteout found, %.*s, err %d\n", + AuDLNPair(dentry), err); + } + +out_parent: + dput(parent); + kfree(whname.name); +out: + return err; +} + +struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent, + struct au_branch *br) +{ + struct dentry *dentry; + int wkq_err; + + if (!au_test_h_perm_sio(parent->d_inode, MAY_EXEC)) + dentry = au_lkup_one(name, parent, br, /*nd*/NULL); + else { + struct au_lkup_one_args args = { + .errp = &dentry, + .name = name, + .h_parent = parent, + .br = br, + .nd = NULL + }; + + wkq_err = au_wkq_wait(au_call_lkup_one, &args); + if (unlikely(wkq_err)) + dentry = ERR_PTR(wkq_err); + } + + return dentry; +} + +/* + * lookup @dentry on @bindex which should be negative. + */ +int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex) +{ + int err; + struct dentry *parent, *h_parent, *h_dentry; + + parent = dget_parent(dentry); + h_parent = au_h_dptr(parent, bindex); + h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent, + au_sbr(dentry->d_sb, bindex)); + err = PTR_ERR(h_dentry); + if (IS_ERR(h_dentry)) + goto out; + if (unlikely(h_dentry->d_inode)) { + err = -EIO; + AuIOErr("%.*s should be negative on b%d.\n", + AuDLNPair(h_dentry), bindex); + dput(h_dentry); + goto out; + } + + err = 0; + if (bindex < au_dbstart(dentry)) + au_set_dbstart(dentry, bindex); + if (au_dbend(dentry) < bindex) + au_set_dbend(dentry, bindex); + au_set_h_dptr(dentry, bindex, h_dentry); + +out: + dput(parent); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* subset of struct inode */ +struct au_iattr { + unsigned long i_ino; + /* unsigned int i_nlink; */ + uid_t i_uid; + gid_t i_gid; + u64 i_version; +/* + loff_t i_size; + blkcnt_t i_blocks; +*/ + umode_t i_mode; +}; + +static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode) +{ + ia->i_ino = h_inode->i_ino; + /* ia->i_nlink = h_inode->i_nlink; */ + ia->i_uid = h_inode->i_uid; + ia->i_gid = h_inode->i_gid; + ia->i_version = h_inode->i_version; +/* + ia->i_size = h_inode->i_size; + ia->i_blocks = h_inode->i_blocks; +*/ + ia->i_mode = (h_inode->i_mode & S_IFMT); +} + +static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode) +{ + return ia->i_ino != h_inode->i_ino + /* || ia->i_nlink != h_inode->i_nlink */ + || ia->i_uid != h_inode->i_uid + || ia->i_gid != h_inode->i_gid + || ia->i_version != h_inode->i_version +/* + || ia->i_size != h_inode->i_size + || ia->i_blocks != h_inode->i_blocks +*/ + || ia->i_mode != (h_inode->i_mode & S_IFMT); +} + +static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent, + struct au_branch *br) +{ + int err; + struct au_iattr ia; + struct inode *h_inode; + struct dentry *h_d; + struct super_block *h_sb; + + err = 0; + memset(&ia, -1, sizeof(ia)); + h_sb = h_dentry->d_sb; + h_inode = h_dentry->d_inode; + if (h_inode) + au_iattr_save(&ia, h_inode); + else if (au_test_nfs(h_sb) || au_test_fuse(h_sb)) + /* nfs d_revalidate may return 0 for negative dentry */ + /* fuse d_revalidate always return 0 for negative dentry */ + goto out; + + /* main purpose is namei.c:cached_lookup() and d_revalidate */ + h_d = au_lkup_one(&h_dentry->d_name, h_parent, br, /*nd*/NULL); + err = PTR_ERR(h_d); + if (IS_ERR(h_d)) + goto out; + + err = 0; + if (unlikely(h_d != h_dentry + || h_d->d_inode != h_inode + || (h_inode && au_iattr_test(&ia, h_inode)))) + err = au_busy_or_stale(); + dput(h_d); + +out: + AuTraceErr(err); + return err; +} + +int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, + struct dentry *h_parent, struct au_branch *br) +{ + int err; + + err = 0; + if (udba == AuOpt_UDBA_REVAL + && !au_test_fs_remote(h_dentry->d_sb)) { + IMustLock(h_dir); + err = (h_dentry->d_parent->d_inode != h_dir); + } else if (udba != AuOpt_UDBA_NONE) + err = au_h_verify_dentry(h_dentry, h_parent, br); + + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent) +{ + int err; + aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq; + struct au_hdentry tmp, *p, *q; + struct au_dinfo *dinfo; + struct super_block *sb; + + DiMustWriteLock(dentry); + + sb = dentry->d_sb; + dinfo = au_di(dentry); + bend = dinfo->di_bend; + bwh = dinfo->di_bwh; + bdiropq = dinfo->di_bdiropq; + p = dinfo->di_hdentry + dinfo->di_bstart; + for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) { + if (!p->hd_dentry) + continue; + + new_bindex = au_br_index(sb, p->hd_id); + if (new_bindex == bindex) + continue; + + if (dinfo->di_bwh == bindex) + bwh = new_bindex; + if (dinfo->di_bdiropq == bindex) + bdiropq = new_bindex; + if (new_bindex < 0) { + au_hdput(p); + p->hd_dentry = NULL; + continue; + } + + /* swap two lower dentries, and loop again */ + q = dinfo->di_hdentry + new_bindex; + tmp = *q; + *q = *p; + *p = tmp; + if (tmp.hd_dentry) { + bindex--; + p--; + } + } + + dinfo->di_bwh = -1; + if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh)) + dinfo->di_bwh = bwh; + + dinfo->di_bdiropq = -1; + if (bdiropq >= 0 + && bdiropq <= au_sbend(sb) + && au_sbr_whable(sb, bdiropq)) + dinfo->di_bdiropq = bdiropq; + + err = -EIO; + dinfo->di_bstart = -1; + dinfo->di_bend = -1; + bend = au_dbend(parent); + p = dinfo->di_hdentry; + for (bindex = 0; bindex <= bend; bindex++, p++) + if (p->hd_dentry) { + dinfo->di_bstart = bindex; + break; + } + + if (dinfo->di_bstart >= 0) { + p = dinfo->di_hdentry + bend; + for (bindex = bend; bindex >= 0; bindex--, p--) + if (p->hd_dentry) { + dinfo->di_bend = bindex; + err = 0; + break; + } + } + + return err; +} + +static void au_do_hide(struct dentry *dentry) +{ + struct inode *inode; + + inode = dentry->d_inode; + if (inode) { + if (!S_ISDIR(inode->i_mode)) { + if (inode->i_nlink && !d_unhashed(dentry)) + drop_nlink(inode); + } else { + clear_nlink(inode); + /* stop next lookup */ + inode->i_flags |= S_DEAD; + } + smp_mb(); /* necessary? */ + } + d_drop(dentry); +} + +static int au_hide_children(struct dentry *parent) +{ + int err, i, j, ndentry; + struct au_dcsub_pages dpages; + struct au_dpage *dpage; + struct dentry *dentry; + + err = au_dpages_init(&dpages, GFP_NOFS); + if (unlikely(err)) + goto out; + err = au_dcsub_pages(&dpages, parent, NULL, NULL); + if (unlikely(err)) + goto out_dpages; + + /* in reverse order */ + for (i = dpages.ndpage - 1; i >= 0; i--) { + dpage = dpages.dpages + i; + ndentry = dpage->ndentry; + for (j = ndentry - 1; j >= 0; j--) { + dentry = dpage->dentries[j]; + if (dentry != parent) + au_do_hide(dentry); + } + } + +out_dpages: + au_dpages_free(&dpages); +out: + return err; +} + +static void au_hide(struct dentry *dentry) +{ + int err; + struct inode *inode; + + AuDbgDentry(dentry); + inode = dentry->d_inode; + if (inode && S_ISDIR(inode->i_mode)) { + /* shrink_dcache_parent(dentry); */ + err = au_hide_children(dentry); + if (unlikely(err)) + AuIOErr("%.*s, failed hiding children, ignored %d\n", + AuDLNPair(dentry), err); + } + au_do_hide(dentry); +} + +/* + * By adding a dirty branch, a cached dentry may be affected in various ways. + * + * a dirty branch is added + * - on the top of layers + * - in the middle of layers + * - to the bottom of layers + * + * on the added branch there exists + * - a whiteout + * - a diropq + * - a same named entry + * + exist + * * negative --> positive + * * positive --> positive + * - type is unchanged + * - type is changed + * + doesn't exist + * * negative --> negative + * * positive --> negative (rejected by au_br_del() for non-dir case) + * - none + */ +static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo, + struct au_dinfo *tmp) +{ + int err; + aufs_bindex_t bindex, bend; + struct { + struct dentry *dentry; + struct inode *inode; + mode_t mode; + } orig_h, tmp_h; + struct au_hdentry *hd; + struct inode *inode, *h_inode; + struct dentry *h_dentry; + + err = 0; + AuDebugOn(dinfo->di_bstart < 0); + orig_h.dentry = dinfo->di_hdentry[dinfo->di_bstart].hd_dentry; + orig_h.inode = orig_h.dentry->d_inode; + orig_h.mode = 0; + if (orig_h.inode) + orig_h.mode = orig_h.inode->i_mode & S_IFMT; + memset(&tmp_h, 0, sizeof(tmp_h)); + if (tmp->di_bstart >= 0) { + tmp_h.dentry = tmp->di_hdentry[tmp->di_bstart].hd_dentry; + tmp_h.inode = tmp_h.dentry->d_inode; + if (tmp_h.inode) + tmp_h.mode = tmp_h.inode->i_mode & S_IFMT; + } + + inode = dentry->d_inode; + if (!orig_h.inode) { + AuDbg("nagative originally\n"); + if (inode) { + au_hide(dentry); + goto out; + } + AuDebugOn(inode); + AuDebugOn(dinfo->di_bstart != dinfo->di_bend); + AuDebugOn(dinfo->di_bdiropq != -1); + + if (!tmp_h.inode) { + AuDbg("negative --> negative\n"); + /* should have only one negative lower */ + if (tmp->di_bstart >= 0 + && tmp->di_bstart < dinfo->di_bstart) { + AuDebugOn(tmp->di_bstart != tmp->di_bend); + AuDebugOn(dinfo->di_bstart != dinfo->di_bend); + au_set_h_dptr(dentry, dinfo->di_bstart, NULL); + au_di_cp(dinfo, tmp); + hd = tmp->di_hdentry + tmp->di_bstart; + au_set_h_dptr(dentry, tmp->di_bstart, + dget(hd->hd_dentry)); + } + au_dbg_verify_dinode(dentry); + } else { + AuDbg("negative --> positive\n"); + /* + * similar to the behaviour of creating with bypassing + * aufs. + * unhash it in order to force an error in the + * succeeding create operation. + * we should not set S_DEAD here. + */ + d_drop(dentry); + /* au_di_swap(tmp, dinfo); */ + au_dbg_verify_dinode(dentry); + } + } else { + AuDbg("positive originally\n"); + /* inode may be NULL */ + AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode); + if (!tmp_h.inode) { + AuDbg("positive --> negative\n"); + /* or bypassing aufs */ + au_hide(dentry); + if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_bstart) + dinfo->di_bwh = tmp->di_bwh; + if (inode) + err = au_refresh_hinode_self(inode); + au_dbg_verify_dinode(dentry); + } else if (orig_h.mode == tmp_h.mode) { + AuDbg("positive --> positive, same type\n"); + if (!S_ISDIR(orig_h.mode) + && dinfo->di_bstart > tmp->di_bstart) { + /* + * similar to the behaviour of removing and + * creating. + */ + au_hide(dentry); + if (inode) + err = au_refresh_hinode_self(inode); + au_dbg_verify_dinode(dentry); + } else { + /* fill empty slots */ + if (dinfo->di_bstart > tmp->di_bstart) + dinfo->di_bstart = tmp->di_bstart; + if (dinfo->di_bend < tmp->di_bend) + dinfo->di_bend = tmp->di_bend; + dinfo->di_bwh = tmp->di_bwh; + dinfo->di_bdiropq = tmp->di_bdiropq; + hd = tmp->di_hdentry; + bend = dinfo->di_bend; + for (bindex = tmp->di_bstart; bindex <= bend; + bindex++) { + if (au_h_dptr(dentry, bindex)) + continue; + h_dentry = hd[bindex].hd_dentry; + if (!h_dentry) + continue; + h_inode = h_dentry->d_inode; + AuDebugOn(!h_inode); + AuDebugOn(orig_h.mode + != (h_inode->i_mode + & S_IFMT)); + au_set_h_dptr(dentry, bindex, + dget(h_dentry)); + } + err = au_refresh_hinode(inode, dentry); + au_dbg_verify_dinode(dentry); + } + } else { + AuDbg("positive --> positive, different type\n"); + /* similar to the behaviour of removing and creating */ + au_hide(dentry); + if (inode) + err = au_refresh_hinode_self(inode); + au_dbg_verify_dinode(dentry); + } + } + +out: + return err; +} + +int au_refresh_dentry(struct dentry *dentry, struct dentry *parent) +{ + int err, ebrange; + unsigned int sigen; + struct au_dinfo *dinfo, *tmp; + struct super_block *sb; + struct inode *inode; + + DiMustWriteLock(dentry); + AuDebugOn(IS_ROOT(dentry)); + AuDebugOn(!parent->d_inode); + + sb = dentry->d_sb; + inode = dentry->d_inode; + sigen = au_sigen(sb); + err = au_digen_test(parent, sigen); + if (unlikely(err)) + goto out; + + dinfo = au_di(dentry); + err = au_di_realloc(dinfo, au_sbend(sb) + 1); + if (unlikely(err)) + goto out; + ebrange = au_dbrange_test(dentry); + if (!ebrange) + ebrange = au_do_refresh_hdentry(dentry, parent); + + if (d_unhashed(dentry) || ebrange) { + AuDebugOn(au_dbstart(dentry) < 0 && au_dbend(dentry) >= 0); + if (inode) + err = au_refresh_hinode_self(inode); + au_dbg_verify_dinode(dentry); + if (!err) + goto out_dgen; /* success */ + goto out; + } + + /* temporary dinfo */ + AuDbgDentry(dentry); + err = -ENOMEM; + tmp = au_di_alloc(sb, AuLsc_DI_TMP); + if (unlikely(!tmp)) + goto out; + au_di_swap(tmp, dinfo); + /* returns the number of positive dentries */ + /* + * if current working dir is removed, it returns an error. + * but the dentry is legal. + */ + err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0, /*nd*/NULL); + AuDbgDentry(dentry); + au_di_swap(tmp, dinfo); + if (err == -ENOENT) + err = 0; + if (err >= 0) { + /* compare/refresh by dinfo */ + AuDbgDentry(dentry); + err = au_refresh_by_dinfo(dentry, dinfo, tmp); + au_dbg_verify_dinode(dentry); + AuTraceErr(err); + } + au_rw_write_unlock(&tmp->di_rwsem); + au_di_free(tmp); + if (unlikely(err)) + goto out; + +out_dgen: + au_update_digen(dentry); +out: + if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) { + AuIOErr("failed refreshing %.*s, %d\n", + AuDLNPair(dentry), err); + AuDbgDentry(dentry); + } + AuTraceErr(err); + return err; +} + +static noinline_for_stack +int au_do_h_d_reval(struct dentry *h_dentry, struct nameidata *nd, + struct dentry *dentry, aufs_bindex_t bindex) +{ + int err, valid; + int (*reval)(struct dentry *, struct nameidata *); + + err = 0; + if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE)) + goto out; + reval = h_dentry->d_op->d_revalidate; + + AuDbg("b%d\n", bindex); + if (au_test_fs_null_nd(h_dentry->d_sb)) + /* it may return tri-state */ + valid = reval(h_dentry, NULL); + else { + struct nameidata h_nd; + int locked; + struct dentry *parent; + + au_h_nd(&h_nd, nd); + parent = nd->path.dentry; + locked = (nd && nd->path.dentry != dentry); + if (locked) + di_read_lock_parent(parent, AuLock_IR); + BUG_ON(bindex > au_dbend(parent)); + h_nd.path.dentry = au_h_dptr(parent, bindex); + BUG_ON(!h_nd.path.dentry); + h_nd.path.mnt = au_sbr(parent->d_sb, bindex)->br_mnt; + path_get(&h_nd.path); + valid = reval(h_dentry, &h_nd); + path_put(&h_nd.path); + if (locked) + di_read_unlock(parent, AuLock_IR); + } + + if (unlikely(valid < 0)) + err = valid; + else if (!valid) + err = -EINVAL; + +out: + AuTraceErr(err); + return err; +} + +/* todo: remove this */ +static int h_d_revalidate(struct dentry *dentry, struct inode *inode, + struct nameidata *nd, int do_udba) +{ + int err; + umode_t mode, h_mode; + aufs_bindex_t bindex, btail, bstart, ibs, ibe; + unsigned char plus, unhashed, is_root, h_plus; + struct inode *h_inode, *h_cached_inode; + struct dentry *h_dentry; + struct qstr *name, *h_name; + + err = 0; + plus = 0; + mode = 0; + ibs = -1; + ibe = -1; + unhashed = !!d_unhashed(dentry); + is_root = !!IS_ROOT(dentry); + name = &dentry->d_name; + + /* + * Theoretically, REVAL test should be unnecessary in case of + * {FS,I}NOTIFY. + * But {fs,i}notify doesn't fire some necessary events, + * IN_ATTRIB for atime/nlink/pageio + * IN_DELETE for NFS dentry + * Let's do REVAL test too. + */ + if (do_udba && inode) { + mode = (inode->i_mode & S_IFMT); + plus = (inode->i_nlink > 0); + ibs = au_ibstart(inode); + ibe = au_ibend(inode); + } + + bstart = au_dbstart(dentry); + btail = bstart; + if (inode && S_ISDIR(inode->i_mode)) + btail = au_dbtaildir(dentry); + for (bindex = bstart; bindex <= btail; bindex++) { + h_dentry = au_h_dptr(dentry, bindex); + if (!h_dentry) + continue; + + AuDbg("b%d, %.*s\n", bindex, AuDLNPair(h_dentry)); + spin_lock(&h_dentry->d_lock); + h_name = &h_dentry->d_name; + if (unlikely(do_udba + && !is_root + && (unhashed != !!d_unhashed(h_dentry) + || name->len != h_name->len + || memcmp(name->name, h_name->name, name->len)) + )) { + AuDbg("unhash 0x%x 0x%x, %.*s %.*s\n", + unhashed, d_unhashed(h_dentry), + AuDLNPair(dentry), AuDLNPair(h_dentry)); + spin_unlock(&h_dentry->d_lock); + goto err; + } + spin_unlock(&h_dentry->d_lock); + + err = au_do_h_d_reval(h_dentry, nd, dentry, bindex); + if (unlikely(err)) + /* do not goto err, to keep the errno */ + break; + + /* todo: plink too? */ + if (!do_udba) + continue; + + /* UDBA tests */ + h_inode = h_dentry->d_inode; + if (unlikely(!!inode != !!h_inode)) + goto err; + + h_plus = plus; + h_mode = mode; + h_cached_inode = h_inode; + if (h_inode) { + h_mode = (h_inode->i_mode & S_IFMT); + h_plus = (h_inode->i_nlink > 0); + } + if (inode && ibs <= bindex && bindex <= ibe) + h_cached_inode = au_h_iptr(inode, bindex); + + if (unlikely(plus != h_plus + || mode != h_mode + || h_cached_inode != h_inode)) + goto err; + continue; + + err: + err = -EINVAL; + break; + } + + return err; +} + +/* todo: consolidate with do_refresh() and au_reval_for_attr() */ +static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen) +{ + int err; + struct dentry *parent; + + if (!au_digen_test(dentry, sigen)) + return 0; + + parent = dget_parent(dentry); + di_read_lock_parent(parent, AuLock_IR); + AuDebugOn(au_digen_test(parent, sigen)); + au_dbg_verify_gen(parent, sigen); + err = au_refresh_dentry(dentry, parent); + di_read_unlock(parent, AuLock_IR); + dput(parent); + AuTraceErr(err); + return err; +} + +int au_reval_dpath(struct dentry *dentry, unsigned int sigen) +{ + int err; + struct dentry *d, *parent; + struct inode *inode; + + if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR)) + return simple_reval_dpath(dentry, sigen); + + /* slow loop, keep it simple and stupid */ + /* cf: au_cpup_dirs() */ + err = 0; + parent = NULL; + while (au_digen_test(dentry, sigen)) { + d = dentry; + while (1) { + dput(parent); + parent = dget_parent(d); + if (!au_digen_test(parent, sigen)) + break; + d = parent; + } + + inode = d->d_inode; + if (d != dentry) + di_write_lock_child2(d); + + /* someone might update our dentry while we were sleeping */ + if (au_digen_test(d, sigen)) { + /* + * todo: consolidate with simple_reval_dpath(), + * do_refresh() and au_reval_for_attr(). + */ + di_read_lock_parent(parent, AuLock_IR); + err = au_refresh_dentry(d, parent); + di_read_unlock(parent, AuLock_IR); + } + + if (d != dentry) + di_write_unlock(d); + dput(parent); + if (unlikely(err)) + break; + } + + return err; +} + +/* + * if valid returns 1, otherwise 0. + */ +static int aufs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + int valid, err; + unsigned int sigen; + unsigned char do_udba; + struct super_block *sb; + struct inode *inode; + + /* todo: support rcu-walk? */ + if (nd && (nd->flags & LOOKUP_RCU)) + return -ECHILD; + + valid = 0; + if (unlikely(!au_di(dentry))) + goto out; + + inode = dentry->d_inode; + if (inode && is_bad_inode(inode)) + goto out; + + valid = 1; + sb = dentry->d_sb; + /* + * todo: very ugly + * i_mutex of parent dir may be held, + * but we should not return 'invalid' due to busy. + */ + err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM); + if (unlikely(err)) { + valid = err; + AuTraceErr(err); + goto out; + } + if (unlikely(au_dbrange_test(dentry))) { + err = -EINVAL; + AuTraceErr(err); + goto out_dgrade; + } + + sigen = au_sigen(sb); + if (au_digen_test(dentry, sigen)) { + AuDebugOn(IS_ROOT(dentry)); + err = au_reval_dpath(dentry, sigen); + if (unlikely(err)) { + AuTraceErr(err); + goto out_dgrade; + } + } + di_downgrade_lock(dentry, AuLock_IR); + + err = -EINVAL; + if (inode && (IS_DEADDIR(inode) || !inode->i_nlink)) + goto out_inval; + + do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE); + if (do_udba && inode) { + aufs_bindex_t bstart = au_ibstart(inode); + struct inode *h_inode; + + if (bstart >= 0) { + h_inode = au_h_iptr(inode, bstart); + if (h_inode && au_test_higen(inode, h_inode)) + goto out_inval; + } + } + + err = h_d_revalidate(dentry, inode, nd, do_udba); + if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) { + err = -EIO; + AuDbg("both of real entry and whiteout found, %.*s, err %d\n", + AuDLNPair(dentry), err); + } + goto out_inval; + +out_dgrade: + di_downgrade_lock(dentry, AuLock_IR); +out_inval: + aufs_read_unlock(dentry, AuLock_IR); + AuTraceErr(err); + valid = !err; +out: + if (!valid) { + AuDbg("%.*s invalid, %d\n", AuDLNPair(dentry), valid); + d_drop(dentry); + } + return valid; +} + +static void aufs_d_release(struct dentry *dentry) +{ + if (au_di(dentry)) { + au_di_fin(dentry); + au_hn_di_reinit(dentry); + } +} + +const struct dentry_operations aufs_dop = { + .d_revalidate = aufs_d_revalidate, + .d_release = aufs_d_release +}; diff --git a/ubuntu/aufs/dentry.h b/ubuntu/aufs/dentry.h new file mode 100644 index 000000000000..4f0827583c4a --- /dev/null +++ b/ubuntu/aufs/dentry.h @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * lookup and dentry operations + */ + +#ifndef __AUFS_DENTRY_H__ +#define __AUFS_DENTRY_H__ + +#ifdef __KERNEL__ + +#include +#include +#include "rwsem.h" + +struct au_hdentry { + struct dentry *hd_dentry; + aufs_bindex_t hd_id; +}; + +struct au_dinfo { + atomic_t di_generation; + + struct au_rwsem di_rwsem; + aufs_bindex_t di_bstart, di_bend, di_bwh, di_bdiropq; + struct au_hdentry *di_hdentry; +} ____cacheline_aligned_in_smp; + +/* ---------------------------------------------------------------------- */ + +/* dentry.c */ +extern const struct dentry_operations aufs_dop; +struct au_branch; +struct dentry *au_lkup_one(struct qstr *name, struct dentry *h_parent, + struct au_branch *br, struct nameidata *nd); +struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent, + struct au_branch *br); +int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, + struct dentry *h_parent, struct au_branch *br); + +int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type, + struct nameidata *nd); +int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex); +int au_refresh_dentry(struct dentry *dentry, struct dentry *parent); +int au_reval_dpath(struct dentry *dentry, unsigned int sigen); + +/* dinfo.c */ +void au_di_init_once(void *_di); +struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc); +void au_di_free(struct au_dinfo *dinfo); +void au_di_swap(struct au_dinfo *a, struct au_dinfo *b); +void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src); +int au_di_init(struct dentry *dentry); +void au_di_fin(struct dentry *dentry); +int au_di_realloc(struct au_dinfo *dinfo, int nbr); + +void di_read_lock(struct dentry *d, int flags, unsigned int lsc); +void di_read_unlock(struct dentry *d, int flags); +void di_downgrade_lock(struct dentry *d, int flags); +void di_write_lock(struct dentry *d, unsigned int lsc); +void di_write_unlock(struct dentry *d); +void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir); +void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir); +void di_write_unlock2(struct dentry *d1, struct dentry *d2); + +struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex); +struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex); +aufs_bindex_t au_dbtail(struct dentry *dentry); +aufs_bindex_t au_dbtaildir(struct dentry *dentry); + +void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, + struct dentry *h_dentry); +int au_digen_test(struct dentry *dentry, unsigned int sigen); +int au_dbrange_test(struct dentry *dentry); +void au_update_digen(struct dentry *dentry); +void au_update_dbrange(struct dentry *dentry, int do_put_zero); +void au_update_dbstart(struct dentry *dentry); +void au_update_dbend(struct dentry *dentry); +int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry); + +/* ---------------------------------------------------------------------- */ + +static inline struct au_dinfo *au_di(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +/* ---------------------------------------------------------------------- */ + +/* lock subclass for dinfo */ +enum { + AuLsc_DI_CHILD, /* child first */ + AuLsc_DI_CHILD2, /* rename(2), link(2), and cpup at hnotify */ + AuLsc_DI_CHILD3, /* copyup dirs */ + AuLsc_DI_PARENT, + AuLsc_DI_PARENT2, + AuLsc_DI_PARENT3, + AuLsc_DI_TMP /* temp for replacing dinfo */ +}; + +/* + * di_read_lock_child, di_write_lock_child, + * di_read_lock_child2, di_write_lock_child2, + * di_read_lock_child3, di_write_lock_child3, + * di_read_lock_parent, di_write_lock_parent, + * di_read_lock_parent2, di_write_lock_parent2, + * di_read_lock_parent3, di_write_lock_parent3, + */ +#define AuReadLockFunc(name, lsc) \ +static inline void di_read_lock_##name(struct dentry *d, int flags) \ +{ di_read_lock(d, flags, AuLsc_DI_##lsc); } + +#define AuWriteLockFunc(name, lsc) \ +static inline void di_write_lock_##name(struct dentry *d) \ +{ di_write_lock(d, AuLsc_DI_##lsc); } + +#define AuRWLockFuncs(name, lsc) \ + AuReadLockFunc(name, lsc) \ + AuWriteLockFunc(name, lsc) + +AuRWLockFuncs(child, CHILD); +AuRWLockFuncs(child2, CHILD2); +AuRWLockFuncs(child3, CHILD3); +AuRWLockFuncs(parent, PARENT); +AuRWLockFuncs(parent2, PARENT2); +AuRWLockFuncs(parent3, PARENT3); + +#undef AuReadLockFunc +#undef AuWriteLockFunc +#undef AuRWLockFuncs + +#define DiMustNoWaiters(d) AuRwMustNoWaiters(&au_di(d)->di_rwsem) +#define DiMustAnyLock(d) AuRwMustAnyLock(&au_di(d)->di_rwsem) +#define DiMustWriteLock(d) AuRwMustWriteLock(&au_di(d)->di_rwsem) + +/* ---------------------------------------------------------------------- */ + +/* todo: memory barrier? */ +static inline unsigned int au_digen(struct dentry *d) +{ + return atomic_read(&au_di(d)->di_generation); +} + +static inline void au_h_dentry_init(struct au_hdentry *hdentry) +{ + hdentry->hd_dentry = NULL; +} + +static inline void au_hdput(struct au_hdentry *hd) +{ + if (hd) + dput(hd->hd_dentry); +} + +static inline aufs_bindex_t au_dbstart(struct dentry *dentry) +{ + DiMustAnyLock(dentry); + return au_di(dentry)->di_bstart; +} + +static inline aufs_bindex_t au_dbend(struct dentry *dentry) +{ + DiMustAnyLock(dentry); + return au_di(dentry)->di_bend; +} + +static inline aufs_bindex_t au_dbwh(struct dentry *dentry) +{ + DiMustAnyLock(dentry); + return au_di(dentry)->di_bwh; +} + +static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry) +{ + DiMustAnyLock(dentry); + return au_di(dentry)->di_bdiropq; +} + +/* todo: hard/soft set? */ +static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex) +{ + DiMustWriteLock(dentry); + au_di(dentry)->di_bstart = bindex; +} + +static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex) +{ + DiMustWriteLock(dentry); + au_di(dentry)->di_bend = bindex; +} + +static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex) +{ + DiMustWriteLock(dentry); + /* dbwh can be outside of bstart - bend range */ + au_di(dentry)->di_bwh = bindex; +} + +static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex) +{ + DiMustWriteLock(dentry); + au_di(dentry)->di_bdiropq = bindex; +} + +/* ---------------------------------------------------------------------- */ + +#ifdef CONFIG_AUFS_HNOTIFY +static inline void au_digen_dec(struct dentry *d) +{ + atomic_dec(&au_di(d)->di_generation); +} + +static inline void au_hn_di_reinit(struct dentry *dentry) +{ + dentry->d_fsdata = NULL; +} +#else +AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused) +#endif /* CONFIG_AUFS_HNOTIFY */ + +#endif /* __KERNEL__ */ +#endif /* __AUFS_DENTRY_H__ */ diff --git a/ubuntu/aufs/dinfo.c b/ubuntu/aufs/dinfo.c new file mode 100644 index 000000000000..a2a6809ca7cf --- /dev/null +++ b/ubuntu/aufs/dinfo.c @@ -0,0 +1,543 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * dentry private data + */ + +#include "aufs.h" + +void au_di_init_once(void *_dinfo) +{ + struct au_dinfo *dinfo = _dinfo; + static struct lock_class_key aufs_di; + + au_rw_init(&dinfo->di_rwsem); + au_rw_class(&dinfo->di_rwsem, &aufs_di); +} + +struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc) +{ + struct au_dinfo *dinfo; + int nbr, i; + + dinfo = au_cache_alloc_dinfo(); + if (unlikely(!dinfo)) + goto out; + + nbr = au_sbend(sb) + 1; + if (nbr <= 0) + nbr = 1; + dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS); + if (dinfo->di_hdentry) { + au_rw_write_lock_nested(&dinfo->di_rwsem, lsc); + dinfo->di_bstart = -1; + dinfo->di_bend = -1; + dinfo->di_bwh = -1; + dinfo->di_bdiropq = -1; + for (i = 0; i < nbr; i++) + dinfo->di_hdentry[i].hd_id = -1; + goto out; + } + + au_cache_free_dinfo(dinfo); + dinfo = NULL; + +out: + return dinfo; +} + +void au_di_free(struct au_dinfo *dinfo) +{ + struct au_hdentry *p; + aufs_bindex_t bend, bindex; + + /* dentry may not be revalidated */ + bindex = dinfo->di_bstart; + if (bindex >= 0) { + bend = dinfo->di_bend; + p = dinfo->di_hdentry + bindex; + while (bindex++ <= bend) + au_hdput(p++); + } + kfree(dinfo->di_hdentry); + au_cache_free_dinfo(dinfo); +} + +void au_di_swap(struct au_dinfo *a, struct au_dinfo *b) +{ + struct au_hdentry *p; + aufs_bindex_t bi; + + AuRwMustWriteLock(&a->di_rwsem); + AuRwMustWriteLock(&b->di_rwsem); + +#define DiSwap(v, name) \ + do { \ + v = a->di_##name; \ + a->di_##name = b->di_##name; \ + b->di_##name = v; \ + } while (0) + + DiSwap(p, hdentry); + DiSwap(bi, bstart); + DiSwap(bi, bend); + DiSwap(bi, bwh); + DiSwap(bi, bdiropq); + /* smp_mb(); */ + +#undef DiSwap +} + +void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src) +{ + AuRwMustWriteLock(&dst->di_rwsem); + AuRwMustWriteLock(&src->di_rwsem); + + dst->di_bstart = src->di_bstart; + dst->di_bend = src->di_bend; + dst->di_bwh = src->di_bwh; + dst->di_bdiropq = src->di_bdiropq; + /* smp_mb(); */ +} + +int au_di_init(struct dentry *dentry) +{ + int err; + struct super_block *sb; + struct au_dinfo *dinfo; + + err = 0; + sb = dentry->d_sb; + dinfo = au_di_alloc(sb, AuLsc_DI_CHILD); + if (dinfo) { + atomic_set(&dinfo->di_generation, au_sigen(sb)); + /* smp_mb(); */ /* atomic_set */ + dentry->d_fsdata = dinfo; + } else + err = -ENOMEM; + + return err; +} + +void au_di_fin(struct dentry *dentry) +{ + struct au_dinfo *dinfo; + + dinfo = au_di(dentry); + AuRwDestroy(&dinfo->di_rwsem); + au_di_free(dinfo); +} + +int au_di_realloc(struct au_dinfo *dinfo, int nbr) +{ + int err, sz; + struct au_hdentry *hdp; + + AuRwMustWriteLock(&dinfo->di_rwsem); + + err = -ENOMEM; + sz = sizeof(*hdp) * (dinfo->di_bend + 1); + if (!sz) + sz = sizeof(*hdp); + hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS); + if (hdp) { + dinfo->di_hdentry = hdp; + err = 0; + } + + return err; +} + +/* ---------------------------------------------------------------------- */ + +static void do_ii_write_lock(struct inode *inode, unsigned int lsc) +{ + switch (lsc) { + case AuLsc_DI_CHILD: + ii_write_lock_child(inode); + break; + case AuLsc_DI_CHILD2: + ii_write_lock_child2(inode); + break; + case AuLsc_DI_CHILD3: + ii_write_lock_child3(inode); + break; + case AuLsc_DI_PARENT: + ii_write_lock_parent(inode); + break; + case AuLsc_DI_PARENT2: + ii_write_lock_parent2(inode); + break; + case AuLsc_DI_PARENT3: + ii_write_lock_parent3(inode); + break; + default: + BUG(); + } +} + +static void do_ii_read_lock(struct inode *inode, unsigned int lsc) +{ + switch (lsc) { + case AuLsc_DI_CHILD: + ii_read_lock_child(inode); + break; + case AuLsc_DI_CHILD2: + ii_read_lock_child2(inode); + break; + case AuLsc_DI_CHILD3: + ii_read_lock_child3(inode); + break; + case AuLsc_DI_PARENT: + ii_read_lock_parent(inode); + break; + case AuLsc_DI_PARENT2: + ii_read_lock_parent2(inode); + break; + case AuLsc_DI_PARENT3: + ii_read_lock_parent3(inode); + break; + default: + BUG(); + } +} + +void di_read_lock(struct dentry *d, int flags, unsigned int lsc) +{ + au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc); + if (d->d_inode) { + if (au_ftest_lock(flags, IW)) + do_ii_write_lock(d->d_inode, lsc); + else if (au_ftest_lock(flags, IR)) + do_ii_read_lock(d->d_inode, lsc); + } +} + +void di_read_unlock(struct dentry *d, int flags) +{ + if (d->d_inode) { + if (au_ftest_lock(flags, IW)) { + au_dbg_verify_dinode(d); + ii_write_unlock(d->d_inode); + } else if (au_ftest_lock(flags, IR)) { + au_dbg_verify_dinode(d); + ii_read_unlock(d->d_inode); + } + } + au_rw_read_unlock(&au_di(d)->di_rwsem); +} + +void di_downgrade_lock(struct dentry *d, int flags) +{ + if (d->d_inode && au_ftest_lock(flags, IR)) + ii_downgrade_lock(d->d_inode); + au_rw_dgrade_lock(&au_di(d)->di_rwsem); +} + +void di_write_lock(struct dentry *d, unsigned int lsc) +{ + au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc); + if (d->d_inode) + do_ii_write_lock(d->d_inode, lsc); +} + +void di_write_unlock(struct dentry *d) +{ + au_dbg_verify_dinode(d); + if (d->d_inode) + ii_write_unlock(d->d_inode); + au_rw_write_unlock(&au_di(d)->di_rwsem); +} + +void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir) +{ + AuDebugOn(d1 == d2 + || d1->d_inode == d2->d_inode + || d1->d_sb != d2->d_sb); + + if (isdir && au_test_subdir(d1, d2)) { + di_write_lock_child(d1); + di_write_lock_child2(d2); + } else { + /* there should be no races */ + di_write_lock_child(d2); + di_write_lock_child2(d1); + } +} + +void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir) +{ + AuDebugOn(d1 == d2 + || d1->d_inode == d2->d_inode + || d1->d_sb != d2->d_sb); + + if (isdir && au_test_subdir(d1, d2)) { + di_write_lock_parent(d1); + di_write_lock_parent2(d2); + } else { + /* there should be no races */ + di_write_lock_parent(d2); + di_write_lock_parent2(d1); + } +} + +void di_write_unlock2(struct dentry *d1, struct dentry *d2) +{ + di_write_unlock(d1); + if (d1->d_inode == d2->d_inode) + au_rw_write_unlock(&au_di(d2)->di_rwsem); + else + di_write_unlock(d2); +} + +/* ---------------------------------------------------------------------- */ + +struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex) +{ + struct dentry *d; + + DiMustAnyLock(dentry); + + if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) + return NULL; + AuDebugOn(bindex < 0); + d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry; + AuDebugOn(d && d->d_count <= 0); + return d; +} + +/* + * extended version of au_h_dptr(). + * returns a hashed and positive h_dentry in bindex, NULL, or error. + */ +struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex) +{ + struct dentry *h_dentry; + struct inode *inode, *h_inode; + + inode = dentry->d_inode; + AuDebugOn(!inode); + + h_dentry = NULL; + if (au_dbstart(dentry) <= bindex + && bindex <= au_dbend(dentry)) + h_dentry = au_h_dptr(dentry, bindex); + if (h_dentry && !au_d_hashed_positive(h_dentry)) { + dget(h_dentry); + goto out; /* success */ + } + + AuDebugOn(bindex < au_ibstart(inode)); + AuDebugOn(au_ibend(inode) < bindex); + h_inode = au_h_iptr(inode, bindex); + h_dentry = d_find_alias(h_inode); + if (h_dentry) { + if (!IS_ERR(h_dentry)) { + if (!au_d_hashed_positive(h_dentry)) + goto out; /* success */ + dput(h_dentry); + } else + goto out; + } + + if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) { + h_dentry = au_plink_lkup(inode, bindex); + AuDebugOn(!h_dentry); + if (!IS_ERR(h_dentry)) { + if (!au_d_hashed_positive(h_dentry)) + goto out; /* success */ + dput(h_dentry); + h_dentry = NULL; + } + } + +out: + AuDbgDentry(h_dentry); + return h_dentry; +} + +aufs_bindex_t au_dbtail(struct dentry *dentry) +{ + aufs_bindex_t bend, bwh; + + bend = au_dbend(dentry); + if (0 <= bend) { + bwh = au_dbwh(dentry); + if (!bwh) + return bwh; + if (0 < bwh && bwh < bend) + return bwh - 1; + } + return bend; +} + +aufs_bindex_t au_dbtaildir(struct dentry *dentry) +{ + aufs_bindex_t bend, bopq; + + bend = au_dbtail(dentry); + if (0 <= bend) { + bopq = au_dbdiropq(dentry); + if (0 <= bopq && bopq < bend) + bend = bopq; + } + return bend; +} + +/* ---------------------------------------------------------------------- */ + +void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, + struct dentry *h_dentry) +{ + struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex; + struct au_branch *br; + + DiMustWriteLock(dentry); + + au_hdput(hd); + hd->hd_dentry = h_dentry; + if (h_dentry) { + br = au_sbr(dentry->d_sb, bindex); + hd->hd_id = br->br_id; + } +} + +int au_dbrange_test(struct dentry *dentry) +{ + int err; + aufs_bindex_t bstart, bend; + + err = 0; + bstart = au_dbstart(dentry); + bend = au_dbend(dentry); + if (bstart >= 0) + AuDebugOn(bend < 0 && bstart > bend); + else { + err = -EIO; + AuDebugOn(bend >= 0); + } + + return err; +} + +int au_digen_test(struct dentry *dentry, unsigned int sigen) +{ + int err; + + err = 0; + if (unlikely(au_digen(dentry) != sigen + || au_iigen_test(dentry->d_inode, sigen))) + err = -EIO; + + return err; +} + +void au_update_digen(struct dentry *dentry) +{ + atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb)); + /* smp_mb(); */ /* atomic_set */ +} + +void au_update_dbrange(struct dentry *dentry, int do_put_zero) +{ + struct au_dinfo *dinfo; + struct dentry *h_d; + struct au_hdentry *hdp; + + DiMustWriteLock(dentry); + + dinfo = au_di(dentry); + if (!dinfo || dinfo->di_bstart < 0) + return; + + hdp = dinfo->di_hdentry; + if (do_put_zero) { + aufs_bindex_t bindex, bend; + + bend = dinfo->di_bend; + for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) { + h_d = hdp[0 + bindex].hd_dentry; + if (h_d && !h_d->d_inode) + au_set_h_dptr(dentry, bindex, NULL); + } + } + + dinfo->di_bstart = -1; + while (++dinfo->di_bstart <= dinfo->di_bend) + if (hdp[0 + dinfo->di_bstart].hd_dentry) + break; + if (dinfo->di_bstart > dinfo->di_bend) { + dinfo->di_bstart = -1; + dinfo->di_bend = -1; + return; + } + + dinfo->di_bend++; + while (0 <= --dinfo->di_bend) + if (hdp[0 + dinfo->di_bend].hd_dentry) + break; + AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0); +} + +void au_update_dbstart(struct dentry *dentry) +{ + aufs_bindex_t bindex, bend; + struct dentry *h_dentry; + + bend = au_dbend(dentry); + for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { + h_dentry = au_h_dptr(dentry, bindex); + if (!h_dentry) + continue; + if (h_dentry->d_inode) { + au_set_dbstart(dentry, bindex); + return; + } + au_set_h_dptr(dentry, bindex, NULL); + } +} + +void au_update_dbend(struct dentry *dentry) +{ + aufs_bindex_t bindex, bstart; + struct dentry *h_dentry; + + bstart = au_dbstart(dentry); + for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) { + h_dentry = au_h_dptr(dentry, bindex); + if (!h_dentry) + continue; + if (h_dentry->d_inode) { + au_set_dbend(dentry, bindex); + return; + } + au_set_h_dptr(dentry, bindex, NULL); + } +} + +int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry) +{ + aufs_bindex_t bindex, bend; + + bend = au_dbend(dentry); + for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) + if (au_h_dptr(dentry, bindex) == h_dentry) + return bindex; + return -1; +} diff --git a/ubuntu/aufs/dir.c b/ubuntu/aufs/dir.c new file mode 100644 index 000000000000..2743616d97ad --- /dev/null +++ b/ubuntu/aufs/dir.c @@ -0,0 +1,635 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * directory operations + */ + +#include +#include +#include "aufs.h" + +void au_add_nlink(struct inode *dir, struct inode *h_dir) +{ + unsigned int nlink; + + AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); + + nlink = dir->i_nlink; + nlink += h_dir->i_nlink - 2; + if (h_dir->i_nlink < 2) + nlink += 2; + set_nlink(dir, nlink); +} + +void au_sub_nlink(struct inode *dir, struct inode *h_dir) +{ + unsigned int nlink; + + AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); + + nlink = dir->i_nlink; + nlink -= h_dir->i_nlink - 2; + if (h_dir->i_nlink < 2) + nlink -= 2; + set_nlink(dir, nlink); +} + +loff_t au_dir_size(struct file *file, struct dentry *dentry) +{ + loff_t sz; + aufs_bindex_t bindex, bend; + struct file *h_file; + struct dentry *h_dentry; + + sz = 0; + if (file) { + AuDebugOn(!file->f_dentry); + AuDebugOn(!file->f_dentry->d_inode); + AuDebugOn(!S_ISDIR(file->f_dentry->d_inode->i_mode)); + + bend = au_fbend_dir(file); + for (bindex = au_fbstart(file); + bindex <= bend && sz < KMALLOC_MAX_SIZE; + bindex++) { + h_file = au_hf_dir(file, bindex); + if (h_file + && h_file->f_dentry + && h_file->f_dentry->d_inode) + sz += i_size_read(h_file->f_dentry->d_inode); + } + } else { + AuDebugOn(!dentry); + AuDebugOn(!dentry->d_inode); + AuDebugOn(!S_ISDIR(dentry->d_inode->i_mode)); + + bend = au_dbtaildir(dentry); + for (bindex = au_dbstart(dentry); + bindex <= bend && sz < KMALLOC_MAX_SIZE; + bindex++) { + h_dentry = au_h_dptr(dentry, bindex); + if (h_dentry && h_dentry->d_inode) + sz += i_size_read(h_dentry->d_inode); + } + } + if (sz < KMALLOC_MAX_SIZE) + sz = roundup_pow_of_two(sz); + if (sz > KMALLOC_MAX_SIZE) + sz = KMALLOC_MAX_SIZE; + else if (sz < NAME_MAX) { + BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX); + sz = AUFS_RDBLK_DEF; + } + return sz; +} + +/* ---------------------------------------------------------------------- */ + +static int reopen_dir(struct file *file) +{ + int err; + unsigned int flags; + aufs_bindex_t bindex, btail, bstart; + struct dentry *dentry, *h_dentry; + struct file *h_file; + + /* open all lower dirs */ + dentry = file->f_dentry; + bstart = au_dbstart(dentry); + for (bindex = au_fbstart(file); bindex < bstart; bindex++) + au_set_h_fptr(file, bindex, NULL); + au_set_fbstart(file, bstart); + + btail = au_dbtaildir(dentry); + for (bindex = au_fbend_dir(file); btail < bindex; bindex--) + au_set_h_fptr(file, bindex, NULL); + au_set_fbend_dir(file, btail); + + flags = vfsub_file_flags(file); + for (bindex = bstart; bindex <= btail; bindex++) { + h_dentry = au_h_dptr(dentry, bindex); + if (!h_dentry) + continue; + h_file = au_hf_dir(file, bindex); + if (h_file) + continue; + + h_file = au_h_open(dentry, bindex, flags, file); + err = PTR_ERR(h_file); + if (IS_ERR(h_file)) + goto out; /* close all? */ + au_set_h_fptr(file, bindex, h_file); + } + au_update_figen(file); + /* todo: necessary? */ + /* file->f_ra = h_file->f_ra; */ + err = 0; + +out: + return err; +} + +static int do_open_dir(struct file *file, int flags) +{ + int err; + aufs_bindex_t bindex, btail; + struct dentry *dentry, *h_dentry; + struct file *h_file; + + FiMustWriteLock(file); + + dentry = file->f_dentry; + err = au_alive_dir(dentry); + if (unlikely(err)) + goto out; + + file->f_version = dentry->d_inode->i_version; + bindex = au_dbstart(dentry); + au_set_fbstart(file, bindex); + btail = au_dbtaildir(dentry); + au_set_fbend_dir(file, btail); + for (; !err && bindex <= btail; bindex++) { + h_dentry = au_h_dptr(dentry, bindex); + if (!h_dentry) + continue; + + h_file = au_h_open(dentry, bindex, flags, file); + if (IS_ERR(h_file)) { + err = PTR_ERR(h_file); + break; + } + au_set_h_fptr(file, bindex, h_file); + } + au_update_figen(file); + /* todo: necessary? */ + /* file->f_ra = h_file->f_ra; */ + if (!err) + return 0; /* success */ + + /* close all */ + for (bindex = au_fbstart(file); bindex <= btail; bindex++) + au_set_h_fptr(file, bindex, NULL); + au_set_fbstart(file, -1); + au_set_fbend_dir(file, -1); + +out: + return err; +} + +static int aufs_open_dir(struct inode *inode __maybe_unused, + struct file *file) +{ + int err; + struct super_block *sb; + struct au_fidir *fidir; + + err = -ENOMEM; + sb = file->f_dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH); + fidir = au_fidir_alloc(sb); + if (fidir) { + err = au_do_open(file, do_open_dir, fidir); + if (unlikely(err)) + kfree(fidir); + } + si_read_unlock(sb); + return err; +} + +static int aufs_release_dir(struct inode *inode __maybe_unused, + struct file *file) +{ + struct au_vdir *vdir_cache; + struct au_finfo *finfo; + struct au_fidir *fidir; + aufs_bindex_t bindex, bend; + + finfo = au_fi(file); + fidir = finfo->fi_hdir; + if (fidir) { + /* remove me from sb->s_files */ + file_sb_list_del(file); + + vdir_cache = fidir->fd_vdir_cache; /* lock-free */ + if (vdir_cache) + au_vdir_free(vdir_cache); + + bindex = finfo->fi_btop; + if (bindex >= 0) { + /* + * calls fput() instead of filp_close(), + * since no dnotify or lock for the lower file. + */ + bend = fidir->fd_bbot; + for (; bindex <= bend; bindex++) + au_set_h_fptr(file, bindex, NULL); + } + kfree(fidir); + finfo->fi_hdir = NULL; + } + au_finfo_fin(file); + return 0; +} + +/* ---------------------------------------------------------------------- */ + +static int au_do_flush_dir(struct file *file, fl_owner_t id) +{ + int err; + aufs_bindex_t bindex, bend; + struct file *h_file; + + err = 0; + bend = au_fbend_dir(file); + for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { + h_file = au_hf_dir(file, bindex); + if (h_file) + err = vfsub_flush(h_file, id); + } + return err; +} + +static int aufs_flush_dir(struct file *file, fl_owner_t id) +{ + return au_do_flush(file, id, au_do_flush_dir); +} + +/* ---------------------------------------------------------------------- */ + +static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync) +{ + int err; + aufs_bindex_t bend, bindex; + struct inode *inode; + struct super_block *sb; + + err = 0; + sb = dentry->d_sb; + inode = dentry->d_inode; + IMustLock(inode); + bend = au_dbend(dentry); + for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) { + struct path h_path; + + if (au_test_ro(sb, bindex, inode)) + continue; + h_path.dentry = au_h_dptr(dentry, bindex); + if (!h_path.dentry) + continue; + + h_path.mnt = au_sbr_mnt(sb, bindex); + err = vfsub_fsync(NULL, &h_path, datasync); + } + + return err; +} + +static int au_do_fsync_dir(struct file *file, int datasync) +{ + int err; + aufs_bindex_t bend, bindex; + struct file *h_file; + struct super_block *sb; + struct inode *inode; + + err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); + if (unlikely(err)) + goto out; + + sb = file->f_dentry->d_sb; + inode = file->f_dentry->d_inode; + bend = au_fbend_dir(file); + for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { + h_file = au_hf_dir(file, bindex); + if (!h_file || au_test_ro(sb, bindex, inode)) + continue; + + err = vfsub_fsync(h_file, &h_file->f_path, datasync); + } + +out: + return err; +} + +/* + * @file may be NULL + */ +static int aufs_fsync_dir(struct file *file, loff_t start, loff_t end, + int datasync) +{ + int err; + struct dentry *dentry; + struct super_block *sb; + struct mutex *mtx; + + err = 0; + dentry = file->f_dentry; + mtx = &dentry->d_inode->i_mutex; + mutex_lock(mtx); + sb = dentry->d_sb; + si_noflush_read_lock(sb); + if (file) + err = au_do_fsync_dir(file, datasync); + else { + di_write_lock_child(dentry); + err = au_do_fsync_dir_no_file(dentry, datasync); + } + au_cpup_attr_timesizes(dentry->d_inode); + di_write_unlock(dentry); + if (file) + fi_write_unlock(file); + + si_read_unlock(sb); + mutex_unlock(mtx); + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int aufs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int err; + struct dentry *dentry; + struct inode *inode, *h_inode; + struct super_block *sb; + + dentry = file->f_dentry; + inode = dentry->d_inode; + IMustLock(inode); + + sb = dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH); + err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); + if (unlikely(err)) + goto out; + err = au_alive_dir(dentry); + if (!err) + err = au_vdir_init(file); + di_downgrade_lock(dentry, AuLock_IR); + if (unlikely(err)) + goto out_unlock; + + h_inode = au_h_iptr(inode, au_ibstart(inode)); + if (!au_test_nfsd()) { + err = au_vdir_fill_de(file, dirent, filldir); + fsstack_copy_attr_atime(inode, h_inode); + } else { + /* + * nfsd filldir may call lookup_one_len(), vfs_getattr(), + * encode_fh() and others. + */ + atomic_inc(&h_inode->i_count); + di_read_unlock(dentry, AuLock_IR); + si_read_unlock(sb); + err = au_vdir_fill_de(file, dirent, filldir); + fsstack_copy_attr_atime(inode, h_inode); + fi_write_unlock(file); + iput(h_inode); + + AuTraceErr(err); + return err; + } + +out_unlock: + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); +out: + si_read_unlock(sb); + return err; +} + +/* ---------------------------------------------------------------------- */ + +#define AuTestEmpty_WHONLY 1 +#define AuTestEmpty_CALLED (1 << 1) +#define AuTestEmpty_SHWH (1 << 2) +#define au_ftest_testempty(flags, name) ((flags) & AuTestEmpty_##name) +#define au_fset_testempty(flags, name) \ + do { (flags) |= AuTestEmpty_##name; } while (0) +#define au_fclr_testempty(flags, name) \ + do { (flags) &= ~AuTestEmpty_##name; } while (0) + +#ifndef CONFIG_AUFS_SHWH +#undef AuTestEmpty_SHWH +#define AuTestEmpty_SHWH 0 +#endif + +struct test_empty_arg { + struct au_nhash *whlist; + unsigned int flags; + int err; + aufs_bindex_t bindex; +}; + +static int test_empty_cb(void *__arg, const char *__name, int namelen, + loff_t offset __maybe_unused, u64 ino, + unsigned int d_type) +{ + struct test_empty_arg *arg = __arg; + char *name = (void *)__name; + + arg->err = 0; + au_fset_testempty(arg->flags, CALLED); + /* smp_mb(); */ + if (name[0] == '.' + && (namelen == 1 || (name[1] == '.' && namelen == 2))) + goto out; /* success */ + + if (namelen <= AUFS_WH_PFX_LEN + || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { + if (au_ftest_testempty(arg->flags, WHONLY) + && !au_nhash_test_known_wh(arg->whlist, name, namelen)) + arg->err = -ENOTEMPTY; + goto out; + } + + name += AUFS_WH_PFX_LEN; + namelen -= AUFS_WH_PFX_LEN; + if (!au_nhash_test_known_wh(arg->whlist, name, namelen)) + arg->err = au_nhash_append_wh + (arg->whlist, name, namelen, ino, d_type, arg->bindex, + au_ftest_testempty(arg->flags, SHWH)); + +out: + /* smp_mb(); */ + AuTraceErr(arg->err); + return arg->err; +} + +static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg) +{ + int err; + struct file *h_file; + + h_file = au_h_open(dentry, arg->bindex, + O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE, + /*file*/NULL); + err = PTR_ERR(h_file); + if (IS_ERR(h_file)) + goto out; + + err = 0; + if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) + && !h_file->f_dentry->d_inode->i_nlink) + goto out_put; + + do { + arg->err = 0; + au_fclr_testempty(arg->flags, CALLED); + /* smp_mb(); */ + err = vfsub_readdir(h_file, test_empty_cb, arg); + if (err >= 0) + err = arg->err; + } while (!err && au_ftest_testempty(arg->flags, CALLED)); + +out_put: + fput(h_file); + au_sbr_put(dentry->d_sb, arg->bindex); +out: + return err; +} + +struct do_test_empty_args { + int *errp; + struct dentry *dentry; + struct test_empty_arg *arg; +}; + +static void call_do_test_empty(void *args) +{ + struct do_test_empty_args *a = args; + *a->errp = do_test_empty(a->dentry, a->arg); +} + +static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg) +{ + int err, wkq_err; + struct dentry *h_dentry; + struct inode *h_inode; + + h_dentry = au_h_dptr(dentry, arg->bindex); + h_inode = h_dentry->d_inode; + /* todo: i_mode changes anytime? */ + mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); + err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ); + mutex_unlock(&h_inode->i_mutex); + if (!err) + err = do_test_empty(dentry, arg); + else { + struct do_test_empty_args args = { + .errp = &err, + .dentry = dentry, + .arg = arg + }; + unsigned int flags = arg->flags; + + wkq_err = au_wkq_wait(call_do_test_empty, &args); + if (unlikely(wkq_err)) + err = wkq_err; + arg->flags = flags; + } + + return err; +} + +int au_test_empty_lower(struct dentry *dentry) +{ + int err; + unsigned int rdhash; + aufs_bindex_t bindex, bstart, btail; + struct au_nhash whlist; + struct test_empty_arg arg; + + SiMustAnyLock(dentry->d_sb); + + rdhash = au_sbi(dentry->d_sb)->si_rdhash; + if (!rdhash) + rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry)); + err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); + if (unlikely(err)) + goto out; + + arg.flags = 0; + arg.whlist = &whlist; + bstart = au_dbstart(dentry); + if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) + au_fset_testempty(arg.flags, SHWH); + arg.bindex = bstart; + err = do_test_empty(dentry, &arg); + if (unlikely(err)) + goto out_whlist; + + au_fset_testempty(arg.flags, WHONLY); + btail = au_dbtaildir(dentry); + for (bindex = bstart + 1; !err && bindex <= btail; bindex++) { + struct dentry *h_dentry; + + h_dentry = au_h_dptr(dentry, bindex); + if (h_dentry && h_dentry->d_inode) { + arg.bindex = bindex; + err = do_test_empty(dentry, &arg); + } + } + +out_whlist: + au_nhash_wh_free(&whlist); +out: + return err; +} + +int au_test_empty(struct dentry *dentry, struct au_nhash *whlist) +{ + int err; + struct test_empty_arg arg; + aufs_bindex_t bindex, btail; + + err = 0; + arg.whlist = whlist; + arg.flags = AuTestEmpty_WHONLY; + if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) + au_fset_testempty(arg.flags, SHWH); + btail = au_dbtaildir(dentry); + for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) { + struct dentry *h_dentry; + + h_dentry = au_h_dptr(dentry, bindex); + if (h_dentry && h_dentry->d_inode) { + arg.bindex = bindex; + err = sio_test_empty(dentry, &arg); + } + } + + return err; +} + +/* ---------------------------------------------------------------------- */ + +const struct file_operations aufs_dir_fop = { + .owner = THIS_MODULE, + .llseek = default_llseek, + .read = generic_read_dir, + .readdir = aufs_readdir, + .unlocked_ioctl = aufs_ioctl_dir, +#ifdef CONFIG_COMPAT + .compat_ioctl = aufs_compat_ioctl_dir, +#endif + .open = aufs_open_dir, + .release = aufs_release_dir, + .flush = aufs_flush_dir, + .fsync = aufs_fsync_dir +}; diff --git a/ubuntu/aufs/dir.h b/ubuntu/aufs/dir.h new file mode 100644 index 000000000000..2d0d8d2b1518 --- /dev/null +++ b/ubuntu/aufs/dir.h @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * directory operations + */ + +#ifndef __AUFS_DIR_H__ +#define __AUFS_DIR_H__ + +#ifdef __KERNEL__ + +#include +#include + +/* ---------------------------------------------------------------------- */ + +/* need to be faster and smaller */ + +struct au_nhash { + unsigned int nh_num; + struct hlist_head *nh_head; +}; + +struct au_vdir_destr { + unsigned char len; + unsigned char name[0]; +} __packed; + +struct au_vdir_dehstr { + struct hlist_node hash; + struct au_vdir_destr *str; +} ____cacheline_aligned_in_smp; + +struct au_vdir_de { + ino_t de_ino; + unsigned char de_type; + /* caution: packed */ + struct au_vdir_destr de_str; +} __packed; + +struct au_vdir_wh { + struct hlist_node wh_hash; +#ifdef CONFIG_AUFS_SHWH + ino_t wh_ino; + aufs_bindex_t wh_bindex; + unsigned char wh_type; +#else + aufs_bindex_t wh_bindex; +#endif + /* caution: packed */ + struct au_vdir_destr wh_str; +} __packed; + +union au_vdir_deblk_p { + unsigned char *deblk; + struct au_vdir_de *de; +}; + +struct au_vdir { + unsigned char **vd_deblk; + unsigned long vd_nblk; + struct { + unsigned long ul; + union au_vdir_deblk_p p; + } vd_last; + + unsigned long vd_version; + unsigned int vd_deblk_sz; + unsigned long vd_jiffy; +} ____cacheline_aligned_in_smp; + +/* ---------------------------------------------------------------------- */ + +/* dir.c */ +extern const struct file_operations aufs_dir_fop; +void au_add_nlink(struct inode *dir, struct inode *h_dir); +void au_sub_nlink(struct inode *dir, struct inode *h_dir); +loff_t au_dir_size(struct file *file, struct dentry *dentry); +int au_test_empty_lower(struct dentry *dentry); +int au_test_empty(struct dentry *dentry, struct au_nhash *whlist); + +/* vdir.c */ +unsigned int au_rdhash_est(loff_t sz); +int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp); +void au_nhash_wh_free(struct au_nhash *whlist); +int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, + int limit); +int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen); +int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, + unsigned int d_type, aufs_bindex_t bindex, + unsigned char shwh); +void au_vdir_free(struct au_vdir *vdir); +int au_vdir_init(struct file *file); +int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir); + +/* ioctl.c */ +long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg); + +#ifdef CONFIG_AUFS_RDU +/* rdu.c */ +long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#ifdef CONFIG_COMPAT +long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +#endif +#else +static inline long au_rdu_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return -EINVAL; +} +#ifdef CONFIG_COMPAT +static inline long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return -EINVAL; +} +#endif +#endif + +#endif /* __KERNEL__ */ +#endif /* __AUFS_DIR_H__ */ diff --git a/ubuntu/aufs/dynop.c b/ubuntu/aufs/dynop.c new file mode 100644 index 000000000000..9b874c99c194 --- /dev/null +++ b/ubuntu/aufs/dynop.c @@ -0,0 +1,377 @@ +/* + * Copyright (C) 2010-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * dynamically customizable operations for regular files + */ + +#include "aufs.h" + +#define DyPrSym(key) AuDbgSym(key->dk_op.dy_hop) + +/* + * How large will these lists be? + * Usually just a few elements, 20-30 at most for each, I guess. + */ +static struct au_splhead dynop[AuDyLast]; + +static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op) +{ + struct au_dykey *key, *tmp; + struct list_head *head; + + key = NULL; + head = &spl->head; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, head, dk_list) + if (tmp->dk_op.dy_hop == h_op) { + key = tmp; + kref_get(&key->dk_kref); + break; + } + rcu_read_unlock(); + + return key; +} + +static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key) +{ + struct au_dykey **k, *found; + const void *h_op = key->dk_op.dy_hop; + int i; + + found = NULL; + k = br->br_dykey; + for (i = 0; i < AuBrDynOp; i++) + if (k[i]) { + if (k[i]->dk_op.dy_hop == h_op) { + found = k[i]; + break; + } + } else + break; + if (!found) { + spin_lock(&br->br_dykey_lock); + for (; i < AuBrDynOp; i++) + if (k[i]) { + if (k[i]->dk_op.dy_hop == h_op) { + found = k[i]; + break; + } + } else { + k[i] = key; + break; + } + spin_unlock(&br->br_dykey_lock); + BUG_ON(i == AuBrDynOp); /* expand the array */ + } + + return found; +} + +/* kref_get() if @key is already added */ +static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key) +{ + struct au_dykey *tmp, *found; + struct list_head *head; + const void *h_op = key->dk_op.dy_hop; + + found = NULL; + head = &spl->head; + spin_lock(&spl->spin); + list_for_each_entry(tmp, head, dk_list) + if (tmp->dk_op.dy_hop == h_op) { + kref_get(&tmp->dk_kref); + found = tmp; + break; + } + if (!found) + list_add_rcu(&key->dk_list, head); + spin_unlock(&spl->spin); + + if (!found) + DyPrSym(key); + return found; +} + +static void dy_free_rcu(struct rcu_head *rcu) +{ + struct au_dykey *key; + + key = container_of(rcu, struct au_dykey, dk_rcu); + DyPrSym(key); + kfree(key); +} + +static void dy_free(struct kref *kref) +{ + struct au_dykey *key; + struct au_splhead *spl; + + key = container_of(kref, struct au_dykey, dk_kref); + spl = dynop + key->dk_op.dy_type; + au_spl_del_rcu(&key->dk_list, spl); + call_rcu(&key->dk_rcu, dy_free_rcu); +} + +void au_dy_put(struct au_dykey *key) +{ + kref_put(&key->dk_kref, dy_free); +} + +/* ---------------------------------------------------------------------- */ + +#define DyDbgSize(cnt, op) AuDebugOn(cnt != sizeof(op)/sizeof(void *)) + +#ifdef CONFIG_AUFS_DEBUG +#define DyDbgDeclare(cnt) unsigned int cnt = 0 +#define DyDbgInc(cnt) do { cnt++; } while (0) +#else +#define DyDbgDeclare(cnt) do {} while (0) +#define DyDbgInc(cnt) do {} while (0) +#endif + +#define DySet(func, dst, src, h_op, h_sb) do { \ + DyDbgInc(cnt); \ + if (h_op->func) { \ + if (src.func) \ + dst.func = src.func; \ + else \ + AuDbg("%s %s\n", au_sbtype(h_sb), #func); \ + } \ +} while (0) + +#define DySetForce(func, dst, src) do { \ + AuDebugOn(!src.func); \ + DyDbgInc(cnt); \ + dst.func = src.func; \ +} while (0) + +#define DySetAop(func) \ + DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb) +#define DySetAopForce(func) \ + DySetForce(func, dyaop->da_op, aufs_aop) + +static void dy_aop(struct au_dykey *key, const void *h_op, + struct super_block *h_sb __maybe_unused) +{ + struct au_dyaop *dyaop = (void *)key; + const struct address_space_operations *h_aop = h_op; + DyDbgDeclare(cnt); + + AuDbg("%s\n", au_sbtype(h_sb)); + + DySetAop(writepage); + DySetAopForce(readpage); /* force */ + DySetAop(writepages); + DySetAop(set_page_dirty); + DySetAop(readpages); + DySetAop(write_begin); + DySetAop(write_end); + DySetAop(bmap); + DySetAop(invalidatepage); + DySetAop(releasepage); + DySetAop(freepage); + /* these two will be changed according to an aufs mount option */ + DySetAop(direct_IO); + DySetAop(get_xip_mem); + DySetAop(migratepage); + DySetAop(launder_page); + DySetAop(is_partially_uptodate); + DySetAop(error_remove_page); + + DyDbgSize(cnt, *h_aop); + dyaop->da_get_xip_mem = h_aop->get_xip_mem; +} + +/* ---------------------------------------------------------------------- */ + +static void dy_bug(struct kref *kref) +{ + BUG(); +} + +static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br) +{ + struct au_dykey *key, *old; + struct au_splhead *spl; + struct op { + unsigned int sz; + void (*set)(struct au_dykey *key, const void *h_op, + struct super_block *h_sb __maybe_unused); + }; + static const struct op a[] = { + [AuDy_AOP] = { + .sz = sizeof(struct au_dyaop), + .set = dy_aop + } + }; + const struct op *p; + + spl = dynop + op->dy_type; + key = dy_gfind_get(spl, op->dy_hop); + if (key) + goto out_add; /* success */ + + p = a + op->dy_type; + key = kzalloc(p->sz, GFP_NOFS); + if (unlikely(!key)) { + key = ERR_PTR(-ENOMEM); + goto out; + } + + key->dk_op.dy_hop = op->dy_hop; + kref_init(&key->dk_kref); + p->set(key, op->dy_hop, br->br_mnt->mnt_sb); + old = dy_gadd(spl, key); + if (old) { + kfree(key); + key = old; + } + +out_add: + old = dy_bradd(br, key); + if (old) + /* its ref-count should never be zero here */ + kref_put(&key->dk_kref, dy_bug); +out: + return key; +} + +/* ---------------------------------------------------------------------- */ +/* + * Aufs prohibits O_DIRECT by defaut even if the branch supports it. + * This behaviour is neccessary to return an error from open(O_DIRECT) instead + * of the succeeding I/O. The dio mount option enables O_DIRECT and makes + * open(O_DIRECT) always succeed, but the succeeding I/O may return an error. + * See the aufs manual in detail. + * + * To keep this behaviour, aufs has to set NULL to ->get_xip_mem too, and the + * performance of fadvise() and madvise() may be affected. + */ +static void dy_adx(struct au_dyaop *dyaop, int do_dx) +{ + if (!do_dx) { + dyaop->da_op.direct_IO = NULL; + dyaop->da_op.get_xip_mem = NULL; + } else { + dyaop->da_op.direct_IO = aufs_aop.direct_IO; + dyaop->da_op.get_xip_mem = aufs_aop.get_xip_mem; + if (!dyaop->da_get_xip_mem) + dyaop->da_op.get_xip_mem = NULL; + } +} + +static struct au_dyaop *dy_aget(struct au_branch *br, + const struct address_space_operations *h_aop, + int do_dx) +{ + struct au_dyaop *dyaop; + struct au_dynop op; + + op.dy_type = AuDy_AOP; + op.dy_haop = h_aop; + dyaop = (void *)dy_get(&op, br); + if (IS_ERR(dyaop)) + goto out; + dy_adx(dyaop, do_dx); + +out: + return dyaop; +} + +int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, + struct inode *h_inode) +{ + int err, do_dx; + struct super_block *sb; + struct au_branch *br; + struct au_dyaop *dyaop; + + AuDebugOn(!S_ISREG(h_inode->i_mode)); + IiMustWriteLock(inode); + + sb = inode->i_sb; + br = au_sbr(sb, bindex); + do_dx = !!au_opt_test(au_mntflags(sb), DIO); + dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx); + err = PTR_ERR(dyaop); + if (IS_ERR(dyaop)) + /* unnecessary to call dy_fput() */ + goto out; + + err = 0; + inode->i_mapping->a_ops = &dyaop->da_op; + +out: + return err; +} + +/* + * Is it safe to replace a_ops during the inode/file is in operation? + * Yes, I hope so. + */ +int au_dy_irefresh(struct inode *inode) +{ + int err; + aufs_bindex_t bstart; + struct inode *h_inode; + + err = 0; + if (S_ISREG(inode->i_mode)) { + bstart = au_ibstart(inode); + h_inode = au_h_iptr(inode, bstart); + err = au_dy_iaop(inode, bstart, h_inode); + } + return err; +} + +void au_dy_arefresh(int do_dx) +{ + struct au_splhead *spl; + struct list_head *head; + struct au_dykey *key; + + spl = dynop + AuDy_AOP; + head = &spl->head; + spin_lock(&spl->spin); + list_for_each_entry(key, head, dk_list) + dy_adx((void *)key, do_dx); + spin_unlock(&spl->spin); +} + +/* ---------------------------------------------------------------------- */ + +void __init au_dy_init(void) +{ + int i; + + /* make sure that 'struct au_dykey *' can be any type */ + BUILD_BUG_ON(offsetof(struct au_dyaop, da_key)); + + for (i = 0; i < AuDyLast; i++) + au_spl_init(dynop + i); +} + +void au_dy_fin(void) +{ + int i; + + for (i = 0; i < AuDyLast; i++) + WARN_ON(!list_empty(&dynop[i].head)); +} diff --git a/ubuntu/aufs/dynop.h b/ubuntu/aufs/dynop.h new file mode 100644 index 000000000000..c3217d433153 --- /dev/null +++ b/ubuntu/aufs/dynop.h @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2010-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * dynamically customizable operations (for regular files only) + */ + +#ifndef __AUFS_DYNOP_H__ +#define __AUFS_DYNOP_H__ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include "inode.h" + +enum {AuDy_AOP, AuDyLast}; + +struct au_dynop { + int dy_type; + union { + const void *dy_hop; + const struct address_space_operations *dy_haop; + }; +}; + +struct au_dykey { + union { + struct list_head dk_list; + struct rcu_head dk_rcu; + }; + struct au_dynop dk_op; + + /* + * during I am in the branch local array, kref is gotten. when the + * branch is removed, kref is put. + */ + struct kref dk_kref; +}; + +/* stop unioning since their sizes are very different from each other */ +struct au_dyaop { + struct au_dykey da_key; + struct address_space_operations da_op; /* not const */ + int (*da_get_xip_mem)(struct address_space *, pgoff_t, int, + void **, unsigned long *); +}; + +/* ---------------------------------------------------------------------- */ + +/* dynop.c */ +struct au_branch; +void au_dy_put(struct au_dykey *key); +int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, + struct inode *h_inode); +int au_dy_irefresh(struct inode *inode); +void au_dy_arefresh(int do_dio); + +void __init au_dy_init(void); +void au_dy_fin(void); + +#endif /* __KERNEL__ */ +#endif /* __AUFS_DYNOP_H__ */ diff --git a/ubuntu/aufs/export.c b/ubuntu/aufs/export.c new file mode 100644 index 000000000000..05656fd1cf31 --- /dev/null +++ b/ubuntu/aufs/export.c @@ -0,0 +1,805 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * export via nfs + */ + +#include +#include +#include +#include +#include +#include +#include +#include "aufs.h" + +union conv { +#ifdef CONFIG_AUFS_INO_T_64 + __u32 a[2]; +#else + __u32 a[1]; +#endif + ino_t ino; +}; + +static ino_t decode_ino(__u32 *a) +{ + union conv u; + + BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a)); + u.a[0] = a[0]; +#ifdef CONFIG_AUFS_INO_T_64 + u.a[1] = a[1]; +#endif + return u.ino; +} + +static void encode_ino(__u32 *a, ino_t ino) +{ + union conv u; + + u.ino = ino; + a[0] = u.a[0]; +#ifdef CONFIG_AUFS_INO_T_64 + a[1] = u.a[1]; +#endif +} + +/* NFS file handle */ +enum { + Fh_br_id, + Fh_sigen, +#ifdef CONFIG_AUFS_INO_T_64 + /* support 64bit inode number */ + Fh_ino1, + Fh_ino2, + Fh_dir_ino1, + Fh_dir_ino2, +#else + Fh_ino1, + Fh_dir_ino1, +#endif + Fh_igen, + Fh_h_type, + Fh_tail, + + Fh_ino = Fh_ino1, + Fh_dir_ino = Fh_dir_ino1 +}; + +static int au_test_anon(struct dentry *dentry) +{ + /* note: read d_flags without d_lock */ + return !!(dentry->d_flags & DCACHE_DISCONNECTED); +} + +/* ---------------------------------------------------------------------- */ +/* inode generation external table */ + +void au_xigen_inc(struct inode *inode) +{ + loff_t pos; + ssize_t sz; + __u32 igen; + struct super_block *sb; + struct au_sbinfo *sbinfo; + + sb = inode->i_sb; + AuDebugOn(!au_opt_test(au_mntflags(sb), XINO)); + + sbinfo = au_sbi(sb); + pos = inode->i_ino; + pos *= sizeof(igen); + igen = inode->i_generation + 1; + sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen, + sizeof(igen), &pos); + if (sz == sizeof(igen)) + return; /* success */ + + if (unlikely(sz >= 0)) + AuIOErr("xigen error (%zd)\n", sz); +} + +int au_xigen_new(struct inode *inode) +{ + int err; + loff_t pos; + ssize_t sz; + struct super_block *sb; + struct au_sbinfo *sbinfo; + struct file *file; + + err = 0; + /* todo: dirty, at mount time */ + if (inode->i_ino == AUFS_ROOT_INO) + goto out; + sb = inode->i_sb; + SiMustAnyLock(sb); + if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) + goto out; + + err = -EFBIG; + pos = inode->i_ino; + if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) { + AuIOErr1("too large i%lld\n", pos); + goto out; + } + pos *= sizeof(inode->i_generation); + + err = 0; + sbinfo = au_sbi(sb); + file = sbinfo->si_xigen; + BUG_ON(!file); + + if (i_size_read(file->f_dentry->d_inode) + < pos + sizeof(inode->i_generation)) { + inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next); + sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation, + sizeof(inode->i_generation), &pos); + } else + sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation, + sizeof(inode->i_generation), &pos); + if (sz == sizeof(inode->i_generation)) + goto out; /* success */ + + err = sz; + if (unlikely(sz >= 0)) { + err = -EIO; + AuIOErr("xigen error (%zd)\n", sz); + } + +out: + return err; +} + +int au_xigen_set(struct super_block *sb, struct file *base) +{ + int err; + struct au_sbinfo *sbinfo; + struct file *file; + + SiMustWriteLock(sb); + + sbinfo = au_sbi(sb); + file = au_xino_create2(base, sbinfo->si_xigen); + err = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + err = 0; + if (sbinfo->si_xigen) + fput(sbinfo->si_xigen); + sbinfo->si_xigen = file; + +out: + return err; +} + +void au_xigen_clr(struct super_block *sb) +{ + struct au_sbinfo *sbinfo; + + SiMustWriteLock(sb); + + sbinfo = au_sbi(sb); + if (sbinfo->si_xigen) { + fput(sbinfo->si_xigen); + sbinfo->si_xigen = NULL; + } +} + +/* ---------------------------------------------------------------------- */ + +static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino, + ino_t dir_ino) +{ + struct dentry *dentry, *d; + struct inode *inode; + unsigned int sigen; + + dentry = NULL; + inode = ilookup(sb, ino); + if (!inode) + goto out; + + dentry = ERR_PTR(-ESTALE); + sigen = au_sigen(sb); + if (unlikely(is_bad_inode(inode) + || IS_DEADDIR(inode) + || sigen != au_iigen(inode))) + goto out_iput; + + dentry = NULL; + if (!dir_ino || S_ISDIR(inode->i_mode)) + dentry = d_find_alias(inode); + else { + spin_lock(&inode->i_lock); + list_for_each_entry(d, &inode->i_dentry, d_alias) { + spin_lock(&d->d_lock); + if (!au_test_anon(d) + && d->d_parent->d_inode->i_ino == dir_ino) { + dentry = dget_dlock(d); + spin_unlock(&d->d_lock); + break; + } + spin_unlock(&d->d_lock); + } + spin_unlock(&inode->i_lock); + } + if (unlikely(dentry && au_digen_test(dentry, sigen))) { + /* need to refresh */ + dput(dentry); + dentry = NULL; + } + +out_iput: + iput(inode); +out: + AuTraceErrPtr(dentry); + return dentry; +} + +/* ---------------------------------------------------------------------- */ + +/* todo: dirty? */ +/* if exportfs_decode_fh() passed vfsmount*, we could be happy */ + +struct au_compare_mnt_args { + /* input */ + struct super_block *sb; + + /* output */ + struct vfsmount *mnt; +}; + +static int au_compare_mnt(struct vfsmount *mnt, void *arg) +{ + struct au_compare_mnt_args *a = arg; + + if (mnt->mnt_sb != a->sb) + return 0; + a->mnt = mntget(mnt); + return 1; +} + +static struct vfsmount *au_mnt_get(struct super_block *sb) +{ + int err; + struct au_compare_mnt_args args = { + .sb = sb + }; + struct mnt_namespace *ns; + + br_read_lock(vfsmount_lock); + /* no get/put ?? */ + AuDebugOn(!current->nsproxy); + ns = current->nsproxy->mnt_ns; + AuDebugOn(!ns); + err = iterate_mounts(au_compare_mnt, &args, ns->root); + br_read_unlock(vfsmount_lock); + AuDebugOn(!err); + AuDebugOn(!args.mnt); + return args.mnt; +} + +struct au_nfsd_si_lock { + unsigned int sigen; + aufs_bindex_t bindex, br_id; + unsigned char force_lock; +}; + +static int si_nfsd_read_lock(struct super_block *sb, + struct au_nfsd_si_lock *nsi_lock) +{ + int err; + aufs_bindex_t bindex; + + si_read_lock(sb, AuLock_FLUSH); + + /* branch id may be wrapped around */ + err = 0; + bindex = au_br_index(sb, nsi_lock->br_id); + if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb)) + goto out; /* success */ + + err = -ESTALE; + bindex = -1; + if (!nsi_lock->force_lock) + si_read_unlock(sb); + +out: + nsi_lock->bindex = bindex; + return err; +} + +struct find_name_by_ino { + int called, found; + ino_t ino; + char *name; + int namelen; +}; + +static int +find_name_by_ino(void *arg, const char *name, int namelen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct find_name_by_ino *a = arg; + + a->called++; + if (a->ino != ino) + return 0; + + memcpy(a->name, name, namelen); + a->namelen = namelen; + a->found = 1; + return 1; +} + +static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino, + struct au_nfsd_si_lock *nsi_lock) +{ + struct dentry *dentry, *parent; + struct file *file; + struct inode *dir; + struct find_name_by_ino arg; + int err; + + parent = path->dentry; + if (nsi_lock) + si_read_unlock(parent->d_sb); + file = vfsub_dentry_open(path, au_dir_roflags); + dentry = (void *)file; + if (IS_ERR(file)) + goto out; + + dentry = ERR_PTR(-ENOMEM); + arg.name = __getname_gfp(GFP_NOFS); + if (unlikely(!arg.name)) + goto out_file; + arg.ino = ino; + arg.found = 0; + do { + arg.called = 0; + /* smp_mb(); */ + err = vfsub_readdir(file, find_name_by_ino, &arg); + } while (!err && !arg.found && arg.called); + dentry = ERR_PTR(err); + if (unlikely(err)) + goto out_name; + dentry = ERR_PTR(-ENOENT); + if (!arg.found) + goto out_name; + + /* do not call au_lkup_one() */ + dir = parent->d_inode; + mutex_lock(&dir->i_mutex); + dentry = vfsub_lookup_one_len(arg.name, parent, arg.namelen); + mutex_unlock(&dir->i_mutex); + AuTraceErrPtr(dentry); + if (IS_ERR(dentry)) + goto out_name; + AuDebugOn(au_test_anon(dentry)); + if (unlikely(!dentry->d_inode)) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + } + +out_name: + __putname(arg.name); +out_file: + fput(file); +out: + if (unlikely(nsi_lock + && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0)) + if (!IS_ERR(dentry)) { + dput(dentry); + dentry = ERR_PTR(-ESTALE); + } + AuTraceErrPtr(dentry); + return dentry; +} + +static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino, + ino_t dir_ino, + struct au_nfsd_si_lock *nsi_lock) +{ + struct dentry *dentry; + struct path path; + + if (dir_ino != AUFS_ROOT_INO) { + path.dentry = decode_by_ino(sb, dir_ino, 0); + dentry = path.dentry; + if (!path.dentry || IS_ERR(path.dentry)) + goto out; + AuDebugOn(au_test_anon(path.dentry)); + } else + path.dentry = dget(sb->s_root); + + path.mnt = au_mnt_get(sb); + dentry = au_lkup_by_ino(&path, ino, nsi_lock); + path_put(&path); + +out: + AuTraceErrPtr(dentry); + return dentry; +} + +/* ---------------------------------------------------------------------- */ + +static int h_acceptable(void *expv, struct dentry *dentry) +{ + return 1; +} + +static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath, + char *buf, int len, struct super_block *sb) +{ + char *p; + int n; + struct path path; + + p = d_path(h_rootpath, buf, len); + if (IS_ERR(p)) + goto out; + n = strlen(p); + + path.mnt = h_rootpath->mnt; + path.dentry = h_parent; + p = d_path(&path, buf, len); + if (IS_ERR(p)) + goto out; + if (n != 1) + p += n; + + path.mnt = au_mnt_get(sb); + path.dentry = sb->s_root; + p = d_path(&path, buf, len - strlen(p)); + mntput(path.mnt); + if (IS_ERR(p)) + goto out; + if (n != 1) + p[strlen(p)] = '/'; + +out: + AuTraceErrPtr(p); + return p; +} + +static +struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh, + int fh_len, struct au_nfsd_si_lock *nsi_lock) +{ + struct dentry *dentry, *h_parent, *root; + struct super_block *h_sb; + char *pathname, *p; + struct vfsmount *h_mnt; + struct au_branch *br; + int err; + struct path path; + + br = au_sbr(sb, nsi_lock->bindex); + h_mnt = br->br_mnt; + h_sb = h_mnt->mnt_sb; + /* todo: call lower fh_to_dentry()? fh_to_parent()? */ + h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail), + fh_len - Fh_tail, fh[Fh_h_type], + h_acceptable, /*context*/NULL); + dentry = h_parent; + if (unlikely(!h_parent || IS_ERR(h_parent))) { + AuWarn1("%s decode_fh failed, %ld\n", + au_sbtype(h_sb), PTR_ERR(h_parent)); + goto out; + } + dentry = NULL; + if (unlikely(au_test_anon(h_parent))) { + AuWarn1("%s decode_fh returned a disconnected dentry\n", + au_sbtype(h_sb)); + goto out_h_parent; + } + + dentry = ERR_PTR(-ENOMEM); + pathname = (void *)__get_free_page(GFP_NOFS); + if (unlikely(!pathname)) + goto out_h_parent; + + root = sb->s_root; + path.mnt = h_mnt; + di_read_lock_parent(root, !AuLock_IR); + path.dentry = au_h_dptr(root, nsi_lock->bindex); + di_read_unlock(root, !AuLock_IR); + p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb); + dentry = (void *)p; + if (IS_ERR(p)) + goto out_pathname; + + si_read_unlock(sb); + err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); + dentry = ERR_PTR(err); + if (unlikely(err)) + goto out_relock; + + dentry = ERR_PTR(-ENOENT); + AuDebugOn(au_test_anon(path.dentry)); + if (unlikely(!path.dentry->d_inode)) + goto out_path; + + if (ino != path.dentry->d_inode->i_ino) + dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL); + else + dentry = dget(path.dentry); + +out_path: + path_put(&path); +out_relock: + if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0)) + if (!IS_ERR(dentry)) { + dput(dentry); + dentry = ERR_PTR(-ESTALE); + } +out_pathname: + free_page((unsigned long)pathname); +out_h_parent: + dput(h_parent); +out: + AuTraceErrPtr(dentry); + return dentry; +} + +/* ---------------------------------------------------------------------- */ + +static struct dentry * +aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, + int fh_type) +{ + struct dentry *dentry; + __u32 *fh = fid->raw; + struct au_branch *br; + ino_t ino, dir_ino; + struct au_nfsd_si_lock nsi_lock = { + .force_lock = 0 + }; + + dentry = ERR_PTR(-ESTALE); + /* it should never happen, but the file handle is unreliable */ + if (unlikely(fh_len < Fh_tail)) + goto out; + nsi_lock.sigen = fh[Fh_sigen]; + nsi_lock.br_id = fh[Fh_br_id]; + + /* branch id may be wrapped around */ + br = NULL; + if (unlikely(si_nfsd_read_lock(sb, &nsi_lock))) + goto out; + nsi_lock.force_lock = 1; + + /* is this inode still cached? */ + ino = decode_ino(fh + Fh_ino); + /* it should never happen */ + if (unlikely(ino == AUFS_ROOT_INO)) + goto out; + + dir_ino = decode_ino(fh + Fh_dir_ino); + dentry = decode_by_ino(sb, ino, dir_ino); + if (IS_ERR(dentry)) + goto out_unlock; + if (dentry) + goto accept; + + /* is the parent dir cached? */ + br = au_sbr(sb, nsi_lock.bindex); + atomic_inc(&br->br_count); + dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock); + if (IS_ERR(dentry)) + goto out_unlock; + if (dentry) + goto accept; + + /* lookup path */ + dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock); + if (IS_ERR(dentry)) + goto out_unlock; + if (unlikely(!dentry)) + /* todo?: make it ESTALE */ + goto out_unlock; + +accept: + if (!au_digen_test(dentry, au_sigen(sb)) + && dentry->d_inode->i_generation == fh[Fh_igen]) + goto out_unlock; /* success */ + + dput(dentry); + dentry = ERR_PTR(-ESTALE); +out_unlock: + if (br) + atomic_dec(&br->br_count); + si_read_unlock(sb); +out: + AuTraceErrPtr(dentry); + return dentry; +} + +#if 0 /* reserved for future use */ +/* support subtreecheck option */ +static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct dentry *parent; + __u32 *fh = fid->raw; + ino_t dir_ino; + + dir_ino = decode_ino(fh + Fh_dir_ino); + parent = decode_by_ino(sb, dir_ino, 0); + if (IS_ERR(parent)) + goto out; + if (!parent) + parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]), + dir_ino, fh, fh_len); + +out: + AuTraceErrPtr(parent); + return parent; +} +#endif + +/* ---------------------------------------------------------------------- */ + +static int aufs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, + int connectable) +{ + int err; + aufs_bindex_t bindex, bend; + struct super_block *sb, *h_sb; + struct inode *inode; + struct dentry *parent, *h_parent; + struct au_branch *br; + + AuDebugOn(au_test_anon(dentry)); + + parent = NULL; + err = -ENOSPC; + if (unlikely(*max_len <= Fh_tail)) { + AuWarn1("NFSv2 client (max_len %d)?\n", *max_len); + goto out; + } + + err = FILEID_ROOT; + if (IS_ROOT(dentry)) { + AuDebugOn(dentry->d_inode->i_ino != AUFS_ROOT_INO); + goto out; + } + + h_parent = NULL; + err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_IR | AuLock_GEN); + if (unlikely(err)) + goto out; + + inode = dentry->d_inode; + AuDebugOn(!inode); + sb = dentry->d_sb; +#ifdef CONFIG_AUFS_DEBUG + if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) + AuWarn1("NFS-exporting requires xino\n"); +#endif + err = -EIO; + parent = dget_parent(dentry); + di_read_lock_parent(parent, !AuLock_IR); + bend = au_dbtaildir(parent); + for (bindex = au_dbstart(parent); bindex <= bend; bindex++) { + h_parent = au_h_dptr(parent, bindex); + if (h_parent) { + dget(h_parent); + break; + } + } + if (unlikely(!h_parent)) + goto out_unlock; + + err = -EPERM; + br = au_sbr(sb, bindex); + h_sb = br->br_mnt->mnt_sb; + if (unlikely(!h_sb->s_export_op)) { + AuErr1("%s branch is not exportable\n", au_sbtype(h_sb)); + goto out_dput; + } + + fh[Fh_br_id] = br->br_id; + fh[Fh_sigen] = au_sigen(sb); + encode_ino(fh + Fh_ino, inode->i_ino); + encode_ino(fh + Fh_dir_ino, parent->d_inode->i_ino); + fh[Fh_igen] = inode->i_generation; + + *max_len -= Fh_tail; + fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail), + max_len, + /*connectable or subtreecheck*/0); + err = fh[Fh_h_type]; + *max_len += Fh_tail; + /* todo: macros? */ + if (err != 255) + err = 99; + else + AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb)); + +out_dput: + dput(h_parent); +out_unlock: + di_read_unlock(parent, !AuLock_IR); + dput(parent); + aufs_read_unlock(dentry, AuLock_IR); +out: + if (unlikely(err < 0)) + err = 255; + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int aufs_commit_metadata(struct inode *inode) +{ + int err; + aufs_bindex_t bindex; + struct super_block *sb; + struct inode *h_inode; + int (*f)(struct inode *inode); + + sb = inode->i_sb; + si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); + ii_write_lock_child(inode); + bindex = au_ibstart(inode); + AuDebugOn(bindex < 0); + h_inode = au_h_iptr(inode, bindex); + + f = h_inode->i_sb->s_export_op->commit_metadata; + if (f) + err = f(h_inode); + else { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0 /* metadata only */ + }; + + err = sync_inode(h_inode, &wbc); + } + + au_cpup_attr_timesizes(inode); + ii_write_unlock(inode); + si_read_unlock(sb); + return err; +} + +/* ---------------------------------------------------------------------- */ + +static struct export_operations aufs_export_op = { + .fh_to_dentry = aufs_fh_to_dentry, + /* .fh_to_parent = aufs_fh_to_parent, */ + .encode_fh = aufs_encode_fh, + .commit_metadata = aufs_commit_metadata +}; + +void au_export_init(struct super_block *sb) +{ + struct au_sbinfo *sbinfo; + __u32 u; + + sb->s_export_op = &aufs_export_op; + sbinfo = au_sbi(sb); + sbinfo->si_xigen = NULL; + get_random_bytes(&u, sizeof(u)); + BUILD_BUG_ON(sizeof(u) != sizeof(int)); + atomic_set(&sbinfo->si_xigen_next, u); +} diff --git a/ubuntu/aufs/f_op.c b/ubuntu/aufs/f_op.c new file mode 100644 index 000000000000..43b5bca91887 --- /dev/null +++ b/ubuntu/aufs/f_op.c @@ -0,0 +1,731 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * file and vm operations + */ + +#include +#include +#include +#include +#include +#include "aufs.h" + +int au_do_open_nondir(struct file *file, int flags) +{ + int err; + aufs_bindex_t bindex; + struct file *h_file; + struct dentry *dentry; + struct au_finfo *finfo; + + FiMustWriteLock(file); + + dentry = file->f_dentry; + err = au_d_alive(dentry); + if (unlikely(err)) + goto out; + + finfo = au_fi(file); + memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop)); + atomic_set(&finfo->fi_mmapped, 0); + bindex = au_dbstart(dentry); + h_file = au_h_open(dentry, bindex, flags, file); + if (IS_ERR(h_file)) + err = PTR_ERR(h_file); + else { + au_set_fbstart(file, bindex); + au_set_h_fptr(file, bindex, h_file); + au_update_figen(file); + /* todo: necessary? */ + /* file->f_ra = h_file->f_ra; */ + } + +out: + return err; +} + +static int aufs_open_nondir(struct inode *inode __maybe_unused, + struct file *file) +{ + int err; + struct super_block *sb; + + AuDbg("%.*s, f_flags 0x%x, f_mode 0x%x\n", + AuDLNPair(file->f_dentry), vfsub_file_flags(file), + file->f_mode); + + sb = file->f_dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH); + err = au_do_open(file, au_do_open_nondir, /*fidir*/NULL); + si_read_unlock(sb); + return err; +} + +int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file) +{ + struct au_finfo *finfo; + aufs_bindex_t bindex; + + finfo = au_fi(file); + bindex = finfo->fi_btop; + if (bindex >= 0) { + /* remove me from sb->s_files */ + file_sb_list_del(file); + au_set_h_fptr(file, bindex, NULL); + } + + au_finfo_fin(file); + return 0; +} + +/* ---------------------------------------------------------------------- */ + +static int au_do_flush_nondir(struct file *file, fl_owner_t id) +{ + int err; + struct file *h_file; + + err = 0; + h_file = au_hf_top(file); + if (h_file) + err = vfsub_flush(h_file, id); + return err; +} + +static int aufs_flush_nondir(struct file *file, fl_owner_t id) +{ + return au_do_flush(file, id, au_do_flush_nondir); +} + +/* ---------------------------------------------------------------------- */ +/* + * read and write functions acquire [fdi]_rwsem once, but release before + * mmap_sem. This is because to stop a race condition between mmap(2). + * Releasing these aufs-rwsem should be safe, no branch-mamagement (by keeping + * si_rwsem), no harmful copy-up should happen. Actually copy-up may happen in + * read functions after [fdi]_rwsem are released, but it should be harmless. + */ + +static ssize_t aufs_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + ssize_t err; + struct dentry *dentry; + struct file *h_file; + struct super_block *sb; + + dentry = file->f_dentry; + sb = dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); + if (unlikely(err)) + goto out; + + h_file = au_hf_top(file); + get_file(h_file); + di_read_unlock(dentry, AuLock_IR); + fi_read_unlock(file); + + /* filedata may be obsoleted by concurrent copyup, but no problem */ + err = vfsub_read_u(h_file, buf, count, ppos); + /* todo: necessary? */ + /* file->f_ra = h_file->f_ra; */ + /* update without lock, I don't think it a problem */ + fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); + fput(h_file); + +out: + si_read_unlock(sb); + return err; +} + +/* + * todo: very ugly + * it locks both of i_mutex and si_rwsem for read in safe. + * if the plink maintenance mode continues forever (that is the problem), + * may loop forever. + */ +static void au_mtx_and_read_lock(struct inode *inode) +{ + int err; + struct super_block *sb = inode->i_sb; + + while (1) { + mutex_lock(&inode->i_mutex); + err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); + if (!err) + break; + mutex_unlock(&inode->i_mutex); + si_read_lock(sb, AuLock_NOPLMW); + si_read_unlock(sb); + } +} + +static ssize_t aufs_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + ssize_t err; + struct au_pin pin; + struct dentry *dentry; + struct super_block *sb; + struct inode *inode; + struct file *h_file; + char __user *buf = (char __user *)ubuf; + + dentry = file->f_dentry; + sb = dentry->d_sb; + inode = dentry->d_inode; + au_mtx_and_read_lock(inode); + + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); + if (unlikely(err)) + goto out; + + err = au_ready_to_write(file, -1, &pin); + di_downgrade_lock(dentry, AuLock_IR); + if (unlikely(err)) { + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); + goto out; + } + + h_file = au_hf_top(file); + get_file(h_file); + au_unpin(&pin); + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); + + err = vfsub_write_u(h_file, buf, count, ppos); + ii_write_lock_child(inode); + au_cpup_attr_timesizes(inode); + inode->i_mode = h_file->f_dentry->d_inode->i_mode; + ii_write_unlock(inode); + fput(h_file); + +out: + si_read_unlock(sb); + mutex_unlock(&inode->i_mutex); + return err; +} + +static ssize_t au_do_aio(struct file *h_file, int rw, struct kiocb *kio, + const struct iovec *iov, unsigned long nv, loff_t pos) +{ + ssize_t err; + struct file *file; + ssize_t (*func)(struct kiocb *, const struct iovec *, unsigned long, + loff_t); + + err = security_file_permission(h_file, rw); + if (unlikely(err)) + goto out; + + err = -ENOSYS; + func = NULL; + if (rw == MAY_READ) + func = h_file->f_op->aio_read; + else if (rw == MAY_WRITE) + func = h_file->f_op->aio_write; + if (func) { + file = kio->ki_filp; + kio->ki_filp = h_file; + lockdep_off(); + err = func(kio, iov, nv, pos); + lockdep_on(); + kio->ki_filp = file; + } else + /* currently there is no such fs */ + WARN_ON_ONCE(1); + +out: + return err; +} + +static ssize_t aufs_aio_read(struct kiocb *kio, const struct iovec *iov, + unsigned long nv, loff_t pos) +{ + ssize_t err; + struct file *file, *h_file; + struct dentry *dentry; + struct super_block *sb; + + file = kio->ki_filp; + dentry = file->f_dentry; + sb = dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); + if (unlikely(err)) + goto out; + + h_file = au_hf_top(file); + get_file(h_file); + di_read_unlock(dentry, AuLock_IR); + fi_read_unlock(file); + + err = au_do_aio(h_file, MAY_READ, kio, iov, nv, pos); + /* todo: necessary? */ + /* file->f_ra = h_file->f_ra; */ + /* update without lock, I don't think it a problem */ + fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); + fput(h_file); + +out: + si_read_unlock(sb); + return err; +} + +static ssize_t aufs_aio_write(struct kiocb *kio, const struct iovec *iov, + unsigned long nv, loff_t pos) +{ + ssize_t err; + struct au_pin pin; + struct dentry *dentry; + struct inode *inode; + struct file *file, *h_file; + struct super_block *sb; + + file = kio->ki_filp; + dentry = file->f_dentry; + sb = dentry->d_sb; + inode = dentry->d_inode; + au_mtx_and_read_lock(inode); + + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); + if (unlikely(err)) + goto out; + + err = au_ready_to_write(file, -1, &pin); + di_downgrade_lock(dentry, AuLock_IR); + if (unlikely(err)) { + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); + goto out; + } + + h_file = au_hf_top(file); + get_file(h_file); + au_unpin(&pin); + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); + + err = au_do_aio(h_file, MAY_WRITE, kio, iov, nv, pos); + ii_write_lock_child(inode); + au_cpup_attr_timesizes(inode); + inode->i_mode = h_file->f_dentry->d_inode->i_mode; + ii_write_unlock(inode); + fput(h_file); + +out: + si_read_unlock(sb); + mutex_unlock(&inode->i_mutex); + return err; +} + +static ssize_t aufs_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t err; + struct file *h_file; + struct dentry *dentry; + struct super_block *sb; + + dentry = file->f_dentry; + sb = dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); + if (unlikely(err)) + goto out; + + err = -EINVAL; + h_file = au_hf_top(file); + get_file(h_file); + if (au_test_loopback_kthread()) { + au_warn_loopback(h_file->f_dentry->d_sb); + if (file->f_mapping != h_file->f_mapping) { + file->f_mapping = h_file->f_mapping; + smp_mb(); /* unnecessary? */ + } + } + di_read_unlock(dentry, AuLock_IR); + fi_read_unlock(file); + + err = vfsub_splice_to(h_file, ppos, pipe, len, flags); + /* todo: necessasry? */ + /* file->f_ra = h_file->f_ra; */ + /* update without lock, I don't think it a problem */ + fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode); + fput(h_file); + +out: + si_read_unlock(sb); + return err; +} + +static ssize_t +aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t err; + struct au_pin pin; + struct dentry *dentry; + struct inode *inode; + struct file *h_file; + struct super_block *sb; + + dentry = file->f_dentry; + sb = dentry->d_sb; + inode = dentry->d_inode; + au_mtx_and_read_lock(inode); + + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); + if (unlikely(err)) + goto out; + + err = au_ready_to_write(file, -1, &pin); + di_downgrade_lock(dentry, AuLock_IR); + if (unlikely(err)) { + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); + goto out; + } + + h_file = au_hf_top(file); + get_file(h_file); + au_unpin(&pin); + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); + + err = vfsub_splice_from(pipe, h_file, ppos, len, flags); + ii_write_lock_child(inode); + au_cpup_attr_timesizes(inode); + inode->i_mode = h_file->f_dentry->d_inode->i_mode; + ii_write_unlock(inode); + fput(h_file); + +out: + si_read_unlock(sb); + mutex_unlock(&inode->i_mutex); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * The locking order around current->mmap_sem. + * - in most and regular cases + * file I/O syscall -- aufs_read() or something + * -- si_rwsem for read -- mmap_sem + * (Note that [fdi]i_rwsem are released before mmap_sem). + * - in mmap case + * mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem + * This AB-BA order is definitly bad, but is not a problem since "si_rwsem for + * read" allows muliple processes to acquire it and [fdi]i_rwsem are not held in + * file I/O. Aufs needs to stop lockdep in aufs_mmap() though. + * It means that when aufs acquires si_rwsem for write, the process should never + * acquire mmap_sem. + * + * Actually aufs_readdir() holds [fdi]i_rwsem before mmap_sem, but this is not a + * problem either since any directory is not able to be mmap-ed. + * The similar scenario is applied to aufs_readlink() too. + */ + +/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */ +#define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b) + +static unsigned long au_arch_prot_conv(unsigned long flags) +{ + /* currently ppc64 only */ +#ifdef CONFIG_PPC64 + /* cf. linux/arch/powerpc/include/asm/mman.h */ + AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO); + return AuConv_VM_PROT(flags, SAO); +#else + AuDebugOn(arch_calc_vm_prot_bits(-1)); + return 0; +#endif +} + +static unsigned long au_prot_conv(unsigned long flags) +{ + return AuConv_VM_PROT(flags, READ) + | AuConv_VM_PROT(flags, WRITE) + | AuConv_VM_PROT(flags, EXEC) + | au_arch_prot_conv(flags); +} + +/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */ +#define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b) + +static unsigned long au_flag_conv(unsigned long flags) +{ + return AuConv_VM_MAP(flags, GROWSDOWN) + | AuConv_VM_MAP(flags, DENYWRITE) + | AuConv_VM_MAP(flags, EXECUTABLE) + | AuConv_VM_MAP(flags, LOCKED); +} + +static int aufs_mmap(struct file *file, struct vm_area_struct *vma) +{ + int err; + unsigned long prot; + aufs_bindex_t bstart; + const unsigned char wlock + = (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED); + struct dentry *dentry; + struct super_block *sb; + struct file *h_file; + struct au_branch *br; + struct au_pin pin; + + AuDbgVmRegion(file, vma); + + dentry = file->f_dentry; + sb = dentry->d_sb; + lockdep_off(); + si_read_lock(sb, AuLock_NOPLMW); + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); + if (unlikely(err)) + goto out; + + if (wlock) { + err = au_ready_to_write(file, -1, &pin); + di_write_unlock(dentry); + if (unlikely(err)) { + fi_write_unlock(file); + goto out; + } + au_unpin(&pin); + } else + di_write_unlock(dentry); + + bstart = au_fbstart(file); + br = au_sbr(sb, bstart); + h_file = au_hf_top(file); + get_file(h_file); + au_set_mmapped(file); + fi_write_unlock(file); + lockdep_on(); + + au_vm_file_reset(vma, h_file); + prot = au_prot_conv(vma->vm_flags); + err = security_file_mmap(h_file, /*reqprot*/prot, prot, + au_flag_conv(vma->vm_flags), vma->vm_start, 0); + if (!err) + err = h_file->f_op->mmap(h_file, vma); + if (unlikely(err)) + goto out_reset; + + au_vm_prfile_set(vma, file); + /* update without lock, I don't think it a problem */ + fsstack_copy_attr_atime(file->f_dentry->d_inode, + h_file->f_dentry->d_inode); + goto out_fput; /* success */ + +out_reset: + au_unset_mmapped(file); + au_vm_file_reset(vma, file); +out_fput: + fput(h_file); + lockdep_off(); +out: + si_read_unlock(sb); + lockdep_on(); + AuTraceErr(err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end, + int datasync) +{ + int err; + struct au_pin pin; + struct dentry *dentry; + struct inode *inode; + struct file *h_file; + struct super_block *sb; + + dentry = file->f_dentry; + inode = dentry->d_inode; + sb = dentry->d_sb; + mutex_lock(&inode->i_mutex); + err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); + if (unlikely(err)) + goto out; + + err = 0; /* -EBADF; */ /* posix? */ + if (unlikely(!(file->f_mode & FMODE_WRITE))) + goto out_si; + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); + if (unlikely(err)) + goto out_si; + + err = au_ready_to_write(file, -1, &pin); + di_downgrade_lock(dentry, AuLock_IR); + if (unlikely(err)) + goto out_unlock; + au_unpin(&pin); + + err = -EINVAL; + h_file = au_hf_top(file); + err = vfsub_fsync(h_file, &h_file->f_path, datasync); + au_cpup_attr_timesizes(inode); + +out_unlock: + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); +out_si: + si_read_unlock(sb); +out: + mutex_unlock(&inode->i_mutex); + return err; +} + +/* no one supports this operation, currently */ +#if 0 +static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync) +{ + int err; + struct au_pin pin; + struct dentry *dentry; + struct inode *inode; + struct file *file, *h_file; + + file = kio->ki_filp; + dentry = file->f_dentry; + inode = dentry->d_inode; + au_mtx_and_read_lock(inode); + + err = 0; /* -EBADF; */ /* posix? */ + if (unlikely(!(file->f_mode & FMODE_WRITE))) + goto out; + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); + if (unlikely(err)) + goto out; + + err = au_ready_to_write(file, -1, &pin); + di_downgrade_lock(dentry, AuLock_IR); + if (unlikely(err)) + goto out_unlock; + au_unpin(&pin); + + err = -ENOSYS; + h_file = au_hf_top(file); + if (h_file->f_op && h_file->f_op->aio_fsync) { + struct dentry *h_d; + struct mutex *h_mtx; + + h_d = h_file->f_dentry; + h_mtx = &h_d->d_inode->i_mutex; + if (!is_sync_kiocb(kio)) { + get_file(h_file); + fput(file); + } + kio->ki_filp = h_file; + err = h_file->f_op->aio_fsync(kio, datasync); + mutex_lock_nested(h_mtx, AuLsc_I_CHILD); + if (!err) + vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); + /*ignore*/ + au_cpup_attr_timesizes(inode); + mutex_unlock(h_mtx); + } + +out_unlock: + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); +out: + si_read_unlock(inode->sb); + mutex_unlock(&inode->i_mutex); + return err; +} +#endif + +static int aufs_fasync(int fd, struct file *file, int flag) +{ + int err; + struct file *h_file; + struct dentry *dentry; + struct super_block *sb; + + dentry = file->f_dentry; + sb = dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); + if (unlikely(err)) + goto out; + + h_file = au_hf_top(file); + if (h_file->f_op && h_file->f_op->fasync) + err = h_file->f_op->fasync(fd, h_file, flag); + + di_read_unlock(dentry, AuLock_IR); + fi_read_unlock(file); + +out: + si_read_unlock(sb); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* no one supports this operation, currently */ +#if 0 +static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset, + size_t len, loff_t *pos , int more) +{ +} +#endif + +/* ---------------------------------------------------------------------- */ + +const struct file_operations aufs_file_fop = { + .owner = THIS_MODULE, + + .llseek = default_llseek, + + .read = aufs_read, + .write = aufs_write, + .aio_read = aufs_aio_read, + .aio_write = aufs_aio_write, +#ifdef CONFIG_AUFS_POLL + .poll = aufs_poll, +#endif + .unlocked_ioctl = aufs_ioctl_nondir, +#ifdef CONFIG_COMPAT + .compat_ioctl = aufs_ioctl_nondir, /* same */ +#endif + .mmap = aufs_mmap, + .open = aufs_open_nondir, + .flush = aufs_flush_nondir, + .release = aufs_release_nondir, + .fsync = aufs_fsync_nondir, + /* .aio_fsync = aufs_aio_fsync_nondir, */ + .fasync = aufs_fasync, + /* .sendpage = aufs_sendpage, */ + .splice_write = aufs_splice_write, + .splice_read = aufs_splice_read, +#if 0 + .aio_splice_write = aufs_aio_splice_write, + .aio_splice_read = aufs_aio_splice_read +#endif +}; diff --git a/ubuntu/aufs/f_op_sp.c b/ubuntu/aufs/f_op_sp.c new file mode 100644 index 000000000000..4f5cde800d40 --- /dev/null +++ b/ubuntu/aufs/f_op_sp.c @@ -0,0 +1,299 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * file operations for special files. + * while they exist in aufs virtually, + * their file I/O is handled out of aufs. + */ + +#include +#include "aufs.h" + +static ssize_t aufs_aio_read_sp(struct kiocb *kio, const struct iovec *iov, + unsigned long nv, loff_t pos) +{ + ssize_t err; + aufs_bindex_t bstart; + unsigned char wbr; + struct file *file, *h_file; + struct super_block *sb; + + file = kio->ki_filp; + sb = file->f_dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH); + fi_read_lock(file); + bstart = au_fbstart(file); + h_file = au_hf_top(file); + fi_read_unlock(file); + wbr = !!au_br_writable(au_sbr(sb, bstart)->br_perm); + si_read_unlock(sb); + + /* do not change the file in kio */ + AuDebugOn(!h_file->f_op || !h_file->f_op->aio_read); + err = h_file->f_op->aio_read(kio, iov, nv, pos); + if (err > 0 && wbr) + file_accessed(h_file); + + return err; +} + +static ssize_t aufs_aio_write_sp(struct kiocb *kio, const struct iovec *iov, + unsigned long nv, loff_t pos) +{ + ssize_t err; + aufs_bindex_t bstart; + unsigned char wbr; + struct super_block *sb; + struct file *file, *h_file; + + file = kio->ki_filp; + sb = file->f_dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH); + fi_read_lock(file); + bstart = au_fbstart(file); + h_file = au_hf_top(file); + fi_read_unlock(file); + wbr = !!au_br_writable(au_sbr(sb, bstart)->br_perm); + si_read_unlock(sb); + + /* do not change the file in kio */ + AuDebugOn(!h_file->f_op || !h_file->f_op->aio_write); + err = h_file->f_op->aio_write(kio, iov, nv, pos); + if (err > 0 && wbr) + file_update_time(h_file); + + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int aufs_release_sp(struct inode *inode, struct file *file) +{ + int err; + struct file *h_file; + + fi_read_lock(file); + h_file = au_hf_top(file); + fi_read_unlock(file); + /* close this fifo in aufs */ + err = h_file->f_op->release(inode, file); /* ignore */ + aufs_release_nondir(inode, file); /* ignore */ + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* currently, support only FIFO */ +enum { + AuSp_FIFO, AuSp_FIFO_R, AuSp_FIFO_W, AuSp_FIFO_RW, + /* AuSp_SOCK, AuSp_CHR, AuSp_BLK, */ + AuSp_Last +}; +static int aufs_open_sp(struct inode *inode, struct file *file); +static struct au_sp_fop { + int done; + struct file_operations fop; /* not 'const' */ + spinlock_t spin; +} au_sp_fop[AuSp_Last] = { + [AuSp_FIFO] = { + .fop = { + .owner = THIS_MODULE, + .open = aufs_open_sp + } + } +}; + +static void au_init_fop_sp(struct file *file) +{ + struct au_sp_fop *p; + int i; + struct file *h_file; + + p = au_sp_fop; + if (unlikely(!p->done)) { + /* initialize first time only */ + static DEFINE_SPINLOCK(spin); + + spin_lock(&spin); + if (!p->done) { + BUILD_BUG_ON(sizeof(au_sp_fop)/sizeof(*au_sp_fop) + != AuSp_Last); + for (i = 0; i < AuSp_Last; i++) + spin_lock_init(&p[i].spin); + p->done = 1; + } + spin_unlock(&spin); + } + + switch (file->f_mode & (FMODE_READ | FMODE_WRITE)) { + case FMODE_READ: + i = AuSp_FIFO_R; + break; + case FMODE_WRITE: + i = AuSp_FIFO_W; + break; + case FMODE_READ | FMODE_WRITE: + i = AuSp_FIFO_RW; + break; + default: + BUG(); + } + + p += i; + if (unlikely(!p->done)) { + /* initialize first time only */ + h_file = au_hf_top(file); + spin_lock(&p->spin); + if (!p->done) { + p->fop = *h_file->f_op; + p->fop.owner = THIS_MODULE; + if (p->fop.aio_read) + p->fop.aio_read = aufs_aio_read_sp; + if (p->fop.aio_write) + p->fop.aio_write = aufs_aio_write_sp; + p->fop.release = aufs_release_sp; + p->done = 1; + } + spin_unlock(&p->spin); + } + file->f_op = &p->fop; +} + +static int au_cpup_sp(struct dentry *dentry) +{ + int err; + aufs_bindex_t bcpup; + struct au_pin pin; + struct au_wr_dir_args wr_dir_args = { + .force_btgt = -1, + .flags = 0 + }; + + AuDbg("%.*s\n", AuDLNPair(dentry)); + + di_read_unlock(dentry, AuLock_IR); + di_write_lock_child(dentry); + err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); + if (unlikely(err < 0)) + goto out; + bcpup = err; + err = 0; + if (bcpup == au_dbstart(dentry)) + goto out; /* success */ + + err = au_pin(&pin, dentry, bcpup, au_opt_udba(dentry->d_sb), + AuPin_MNT_WRITE); + if (!err) { + err = au_sio_cpup_simple(dentry, bcpup, -1, AuCpup_DTIME); + au_unpin(&pin); + } + +out: + di_downgrade_lock(dentry, AuLock_IR); + return err; +} + +static int au_do_open_sp(struct file *file, int flags) +{ + int err; + struct dentry *dentry; + struct super_block *sb; + struct file *h_file; + struct inode *h_inode; + + dentry = file->f_dentry; + AuDbg("%.*s\n", AuDLNPair(dentry)); + + /* + * try copying-up. + * operate on the ro branch is not an error. + */ + au_cpup_sp(dentry); /* ignore */ + + /* prepare h_file */ + err = au_do_open_nondir(file, vfsub_file_flags(file)); + if (unlikely(err)) + goto out; + + sb = dentry->d_sb; + h_file = au_hf_top(file); + h_inode = h_file->f_dentry->d_inode; + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); + si_read_unlock(sb); + /* open this fifo in aufs */ + err = h_inode->i_fop->open(file->f_dentry->d_inode, file); + si_noflush_read_lock(sb); + fi_write_lock(file); + di_read_lock_child(dentry, AuLock_IR); + if (!err) + au_init_fop_sp(file); + +out: + return err; +} + +static int aufs_open_sp(struct inode *inode, struct file *file) +{ + int err; + struct super_block *sb; + + sb = file->f_dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH); + err = au_do_open(file, au_do_open_sp, /*fidir*/NULL); + si_read_unlock(sb); + return err; +} + +/* ---------------------------------------------------------------------- */ + +void au_init_special_fop(struct inode *inode, umode_t mode, dev_t rdev) +{ + init_special_inode(inode, mode, rdev); + + switch (mode & S_IFMT) { + case S_IFIFO: + inode->i_fop = &au_sp_fop[AuSp_FIFO].fop; + /*FALLTHROUGH*/ + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + break; + default: + AuDebugOn(1); + } +} + +int au_special_file(umode_t mode) +{ + int ret; + + ret = 0; + switch (mode & S_IFMT) { + case S_IFIFO: +#if 0 + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: +#endif + ret = 1; + } + + return ret; +} diff --git a/ubuntu/aufs/file.c b/ubuntu/aufs/file.c new file mode 100644 index 000000000000..b798891953f6 --- /dev/null +++ b/ubuntu/aufs/file.c @@ -0,0 +1,676 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * handling file/dir, and address_space operation + */ + +#include +#include +#include +#include +#include "aufs.h" + +/* drop flags for writing */ +unsigned int au_file_roflags(unsigned int flags) +{ + flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC); + flags |= O_RDONLY | O_NOATIME; + return flags; +} + +/* common functions to regular file and dir */ +struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, + struct file *file) +{ + struct file *h_file; + struct dentry *h_dentry; + struct inode *h_inode; + struct super_block *sb; + struct au_branch *br; + struct path h_path; + int err, exec_flag; + + /* a race condition can happen between open and unlink/rmdir */ + h_file = ERR_PTR(-ENOENT); + h_dentry = au_h_dptr(dentry, bindex); + if (au_test_nfsd() && !h_dentry) + goto out; + h_inode = h_dentry->d_inode; + if (au_test_nfsd() && !h_inode) + goto out; + spin_lock(&h_dentry->d_lock); + err = (!d_unhashed(dentry) && d_unlinked(h_dentry)) + || !h_inode + /* || !dentry->d_inode->i_nlink */ + ; + spin_unlock(&h_dentry->d_lock); + if (unlikely(err)) + goto out; + + sb = dentry->d_sb; + br = au_sbr(sb, bindex); + h_file = ERR_PTR(-EACCES); + exec_flag = flags & __FMODE_EXEC; + if (exec_flag && (br->br_mnt->mnt_flags & MNT_NOEXEC)) + goto out; + + /* drop flags for writing */ + if (au_test_ro(sb, bindex, dentry->d_inode)) + flags = au_file_roflags(flags); + flags &= ~O_CREAT; + atomic_inc(&br->br_count); + h_path.dentry = h_dentry; + h_path.mnt = br->br_mnt; + if (!au_special_file(h_inode->i_mode)) + h_file = vfsub_dentry_open(&h_path, flags); + else { + /* this block depends upon the configuration */ + di_read_unlock(dentry, AuLock_IR); + fi_write_unlock(file); + si_read_unlock(sb); + h_file = vfsub_dentry_open(&h_path, flags); + si_noflush_read_lock(sb); + fi_write_lock(file); + di_read_lock_child(dentry, AuLock_IR); + } + if (IS_ERR(h_file)) + goto out_br; + + if (exec_flag) { + err = deny_write_access(h_file); + if (unlikely(err)) { + fput(h_file); + h_file = ERR_PTR(err); + goto out_br; + } + } + fsnotify_open(h_file); + goto out; /* success */ + +out_br: + atomic_dec(&br->br_count); +out: + return h_file; +} + +int au_do_open(struct file *file, int (*open)(struct file *file, int flags), + struct au_fidir *fidir) +{ + int err; + struct dentry *dentry; + + err = au_finfo_init(file, fidir); + if (unlikely(err)) + goto out; + + dentry = file->f_dentry; + di_read_lock_child(dentry, AuLock_IR); + err = open(file, vfsub_file_flags(file)); + di_read_unlock(dentry, AuLock_IR); + + fi_write_unlock(file); + if (unlikely(err)) { + au_fi(file)->fi_hdir = NULL; + au_finfo_fin(file); + } + +out: + return err; +} + +int au_reopen_nondir(struct file *file) +{ + int err; + aufs_bindex_t bstart; + struct dentry *dentry; + struct file *h_file, *h_file_tmp; + + dentry = file->f_dentry; + AuDebugOn(au_special_file(dentry->d_inode->i_mode)); + bstart = au_dbstart(dentry); + h_file_tmp = NULL; + if (au_fbstart(file) == bstart) { + h_file = au_hf_top(file); + if (file->f_mode == h_file->f_mode) + return 0; /* success */ + h_file_tmp = h_file; + get_file(h_file_tmp); + au_set_h_fptr(file, bstart, NULL); + } + AuDebugOn(au_fi(file)->fi_hdir); + AuDebugOn(au_fbstart(file) < bstart); + + h_file = au_h_open(dentry, bstart, vfsub_file_flags(file) & ~O_TRUNC, + file); + err = PTR_ERR(h_file); + if (IS_ERR(h_file)) + goto out; /* todo: close all? */ + + err = 0; + au_set_fbstart(file, bstart); + au_set_h_fptr(file, bstart, h_file); + au_update_figen(file); + /* todo: necessary? */ + /* file->f_ra = h_file->f_ra; */ + +out: + if (h_file_tmp) + fput(h_file_tmp); + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int au_reopen_wh(struct file *file, aufs_bindex_t btgt, + struct dentry *hi_wh) +{ + int err; + aufs_bindex_t bstart; + struct au_dinfo *dinfo; + struct dentry *h_dentry; + struct au_hdentry *hdp; + + dinfo = au_di(file->f_dentry); + AuRwMustWriteLock(&dinfo->di_rwsem); + + bstart = dinfo->di_bstart; + dinfo->di_bstart = btgt; + hdp = dinfo->di_hdentry; + h_dentry = hdp[0 + btgt].hd_dentry; + hdp[0 + btgt].hd_dentry = hi_wh; + err = au_reopen_nondir(file); + hdp[0 + btgt].hd_dentry = h_dentry; + dinfo->di_bstart = bstart; + + return err; +} + +static int au_ready_to_write_wh(struct file *file, loff_t len, + aufs_bindex_t bcpup) +{ + int err; + struct inode *inode, *h_inode; + struct dentry *dentry, *h_dentry, *hi_wh; + + dentry = file->f_dentry; + au_update_dbstart(dentry); + inode = dentry->d_inode; + h_inode = NULL; + if (au_dbstart(dentry) <= bcpup && au_dbend(dentry) >= bcpup) { + h_dentry = au_h_dptr(dentry, bcpup); + if (h_dentry) + h_inode = h_dentry->d_inode; + } + hi_wh = au_hi_wh(inode, bcpup); + if (!hi_wh && !h_inode) + err = au_sio_cpup_wh(dentry, bcpup, len, file); + else + /* already copied-up after unlink */ + err = au_reopen_wh(file, bcpup, hi_wh); + + if (!err + && inode->i_nlink > 1 + && au_opt_test(au_mntflags(dentry->d_sb), PLINK)) + au_plink_append(inode, bcpup, au_h_dptr(dentry, bcpup)); + + return err; +} + +/* + * prepare the @file for writing. + */ +int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin) +{ + int err; + aufs_bindex_t bstart, bcpup, dbstart; + struct dentry *dentry, *parent, *h_dentry; + struct inode *h_inode, *inode; + struct super_block *sb; + struct file *h_file; + + dentry = file->f_dentry; + sb = dentry->d_sb; + inode = dentry->d_inode; + AuDebugOn(au_special_file(inode->i_mode)); + bstart = au_fbstart(file); + err = au_test_ro(sb, bstart, inode); + if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) { + err = au_pin(pin, dentry, bstart, AuOpt_UDBA_NONE, /*flags*/0); + goto out; + } + + /* need to cpup or reopen */ + parent = dget_parent(dentry); + di_write_lock_parent(parent); + err = AuWbrCopyup(au_sbi(sb), dentry); + bcpup = err; + if (unlikely(err < 0)) + goto out_dgrade; + err = 0; + + if (!d_unhashed(dentry) && !au_h_dptr(parent, bcpup)) { + err = au_cpup_dirs(dentry, bcpup); + if (unlikely(err)) + goto out_dgrade; + } + + err = au_pin(pin, dentry, bcpup, AuOpt_UDBA_NONE, + AuPin_DI_LOCKED | AuPin_MNT_WRITE); + if (unlikely(err)) + goto out_dgrade; + + h_dentry = au_hf_top(file)->f_dentry; + h_inode = h_dentry->d_inode; + dbstart = au_dbstart(dentry); + if (dbstart <= bcpup) { + h_dentry = au_h_dptr(dentry, bcpup); + AuDebugOn(!h_dentry); + h_inode = h_dentry->d_inode; + AuDebugOn(!h_inode); + bstart = bcpup; + } + + if (dbstart <= bcpup /* just reopen */ + || !d_unhashed(dentry) /* copyup and reopen */ + ) { + mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); + h_file = au_h_open_pre(dentry, bstart); + if (IS_ERR(h_file)) { + err = PTR_ERR(h_file); + h_file = NULL; + } else { + di_downgrade_lock(parent, AuLock_IR); + if (dbstart > bcpup) + err = au_sio_cpup_simple(dentry, bcpup, len, + AuCpup_DTIME); + if (!err) + err = au_reopen_nondir(file); + } + mutex_unlock(&h_inode->i_mutex); + au_h_open_post(dentry, bstart, h_file); + } else { /* copyup as wh and reopen */ + /* + * since writable hfsplus branch is not supported, + * h_open_pre/post() are unnecessary. + */ + mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); + err = au_ready_to_write_wh(file, len, bcpup); + di_downgrade_lock(parent, AuLock_IR); + mutex_unlock(&h_inode->i_mutex); + } + + if (!err) { + au_pin_set_parent_lflag(pin, /*lflag*/0); + goto out_dput; /* success */ + } + au_unpin(pin); + goto out_unlock; + +out_dgrade: + di_downgrade_lock(parent, AuLock_IR); +out_unlock: + di_read_unlock(parent, AuLock_IR); +out_dput: + dput(parent); +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +int au_do_flush(struct file *file, fl_owner_t id, + int (*flush)(struct file *file, fl_owner_t id)) +{ + int err; + struct dentry *dentry; + struct super_block *sb; + struct inode *inode; + + dentry = file->f_dentry; + sb = dentry->d_sb; + inode = dentry->d_inode; + si_noflush_read_lock(sb); + fi_read_lock(file); + ii_read_lock_child(inode); + + err = flush(file, id); + au_cpup_attr_timesizes(inode); + + ii_read_unlock(inode); + fi_read_unlock(file); + si_read_unlock(sb); + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int au_file_refresh_by_inode(struct file *file, int *need_reopen) +{ + int err; + aufs_bindex_t bstart; + struct au_pin pin; + struct au_finfo *finfo; + struct dentry *dentry, *parent, *hi_wh; + struct inode *inode; + struct super_block *sb; + + FiMustWriteLock(file); + + err = 0; + finfo = au_fi(file); + dentry = file->f_dentry; + sb = dentry->d_sb; + inode = dentry->d_inode; + bstart = au_ibstart(inode); + if (bstart == finfo->fi_btop || IS_ROOT(dentry)) + goto out; + + parent = dget_parent(dentry); + if (au_test_ro(sb, bstart, inode)) { + di_read_lock_parent(parent, !AuLock_IR); + err = AuWbrCopyup(au_sbi(sb), dentry); + bstart = err; + di_read_unlock(parent, !AuLock_IR); + if (unlikely(err < 0)) + goto out_parent; + err = 0; + } + + di_read_lock_parent(parent, AuLock_IR); + hi_wh = au_hi_wh(inode, bstart); + if (!S_ISDIR(inode->i_mode) + && au_opt_test(au_mntflags(sb), PLINK) + && au_plink_test(inode) + && !d_unhashed(dentry)) { + err = au_test_and_cpup_dirs(dentry, bstart); + if (unlikely(err)) + goto out_unlock; + + /* always superio. */ + err = au_pin(&pin, dentry, bstart, AuOpt_UDBA_NONE, + AuPin_DI_LOCKED | AuPin_MNT_WRITE); + if (!err) + err = au_sio_cpup_simple(dentry, bstart, -1, + AuCpup_DTIME); + au_unpin(&pin); + } else if (hi_wh) { + /* already copied-up after unlink */ + err = au_reopen_wh(file, bstart, hi_wh); + *need_reopen = 0; + } + +out_unlock: + di_read_unlock(parent, AuLock_IR); +out_parent: + dput(parent); +out: + return err; +} + +static void au_do_refresh_dir(struct file *file) +{ + aufs_bindex_t bindex, bend, new_bindex, brid; + struct au_hfile *p, tmp, *q; + struct au_finfo *finfo; + struct super_block *sb; + struct au_fidir *fidir; + + FiMustWriteLock(file); + + sb = file->f_dentry->d_sb; + finfo = au_fi(file); + fidir = finfo->fi_hdir; + AuDebugOn(!fidir); + p = fidir->fd_hfile + finfo->fi_btop; + brid = p->hf_br->br_id; + bend = fidir->fd_bbot; + for (bindex = finfo->fi_btop; bindex <= bend; bindex++, p++) { + if (!p->hf_file) + continue; + + new_bindex = au_br_index(sb, p->hf_br->br_id); + if (new_bindex == bindex) + continue; + if (new_bindex < 0) { + au_set_h_fptr(file, bindex, NULL); + continue; + } + + /* swap two lower inode, and loop again */ + q = fidir->fd_hfile + new_bindex; + tmp = *q; + *q = *p; + *p = tmp; + if (tmp.hf_file) { + bindex--; + p--; + } + } + + p = fidir->fd_hfile; + if (!au_test_mmapped(file) && !d_unlinked(file->f_dentry)) { + bend = au_sbend(sb); + for (finfo->fi_btop = 0; finfo->fi_btop <= bend; + finfo->fi_btop++, p++) + if (p->hf_file) { + if (p->hf_file->f_dentry + && p->hf_file->f_dentry->d_inode) + break; + else + au_hfput(p, file); + } + } else { + bend = au_br_index(sb, brid); + for (finfo->fi_btop = 0; finfo->fi_btop < bend; + finfo->fi_btop++, p++) + if (p->hf_file) + au_hfput(p, file); + bend = au_sbend(sb); + } + + p = fidir->fd_hfile + bend; + for (fidir->fd_bbot = bend; fidir->fd_bbot >= finfo->fi_btop; + fidir->fd_bbot--, p--) + if (p->hf_file) { + if (p->hf_file->f_dentry + && p->hf_file->f_dentry->d_inode) + break; + else + au_hfput(p, file); + } + AuDebugOn(fidir->fd_bbot < finfo->fi_btop); +} + +/* + * after branch manipulating, refresh the file. + */ +static int refresh_file(struct file *file, int (*reopen)(struct file *file)) +{ + int err, need_reopen; + aufs_bindex_t bend, bindex; + struct dentry *dentry; + struct au_finfo *finfo; + struct au_hfile *hfile; + + dentry = file->f_dentry; + finfo = au_fi(file); + if (!finfo->fi_hdir) { + hfile = &finfo->fi_htop; + AuDebugOn(!hfile->hf_file); + bindex = au_br_index(dentry->d_sb, hfile->hf_br->br_id); + AuDebugOn(bindex < 0); + if (bindex != finfo->fi_btop) + au_set_fbstart(file, bindex); + } else { + err = au_fidir_realloc(finfo, au_sbend(dentry->d_sb) + 1); + if (unlikely(err)) + goto out; + au_do_refresh_dir(file); + } + + err = 0; + need_reopen = 1; + if (!au_test_mmapped(file)) + err = au_file_refresh_by_inode(file, &need_reopen); + if (!err && need_reopen && !d_unlinked(dentry)) + err = reopen(file); + if (!err) { + au_update_figen(file); + goto out; /* success */ + } + + /* error, close all lower files */ + if (finfo->fi_hdir) { + bend = au_fbend_dir(file); + for (bindex = au_fbstart(file); bindex <= bend; bindex++) + au_set_h_fptr(file, bindex, NULL); + } + +out: + return err; +} + +/* common function to regular file and dir */ +int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), + int wlock) +{ + int err; + unsigned int sigen, figen; + aufs_bindex_t bstart; + unsigned char pseudo_link; + struct dentry *dentry; + struct inode *inode; + + err = 0; + dentry = file->f_dentry; + inode = dentry->d_inode; + AuDebugOn(au_special_file(inode->i_mode)); + sigen = au_sigen(dentry->d_sb); + fi_write_lock(file); + figen = au_figen(file); + di_write_lock_child(dentry); + bstart = au_dbstart(dentry); + pseudo_link = (bstart != au_ibstart(inode)); + if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) { + if (!wlock) { + di_downgrade_lock(dentry, AuLock_IR); + fi_downgrade_lock(file); + } + goto out; /* success */ + } + + AuDbg("sigen %d, figen %d\n", sigen, figen); + if (au_digen_test(dentry, sigen)) { + err = au_reval_dpath(dentry, sigen); + AuDebugOn(!err && au_digen_test(dentry, sigen)); + } + + if (!err) + err = refresh_file(file, reopen); + if (!err) { + if (!wlock) { + di_downgrade_lock(dentry, AuLock_IR); + fi_downgrade_lock(file); + } + } else { + di_write_unlock(dentry); + fi_write_unlock(file); + } + +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* cf. aufs_nopage() */ +/* for madvise(2) */ +static int aufs_readpage(struct file *file __maybe_unused, struct page *page) +{ + unlock_page(page); + return 0; +} + +/* it will never be called, but necessary to support O_DIRECT */ +static ssize_t aufs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ BUG(); return 0; } + +/* + * it will never be called, but madvise and fadvise behaves differently + * when get_xip_mem is defined + */ +static int aufs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, + int create, void **kmem, unsigned long *pfn) +{ BUG(); return 0; } + +/* they will never be called. */ +#ifdef CONFIG_AUFS_DEBUG +static int aufs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ AuUnsupport(); return 0; } +static int aufs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ AuUnsupport(); return 0; } +static int aufs_writepage(struct page *page, struct writeback_control *wbc) +{ AuUnsupport(); return 0; } + +static int aufs_set_page_dirty(struct page *page) +{ AuUnsupport(); return 0; } +static void aufs_invalidatepage(struct page *page, unsigned long offset) +{ AuUnsupport(); } +static int aufs_releasepage(struct page *page, gfp_t gfp) +{ AuUnsupport(); return 0; } +static int aufs_migratepage(struct address_space *mapping, struct page *newpage, + struct page *page) +{ AuUnsupport(); return 0; } +static int aufs_launder_page(struct page *page) +{ AuUnsupport(); return 0; } +static int aufs_is_partially_uptodate(struct page *page, + read_descriptor_t *desc, + unsigned long from) +{ AuUnsupport(); return 0; } +static int aufs_error_remove_page(struct address_space *mapping, + struct page *page) +{ AuUnsupport(); return 0; } +#endif /* CONFIG_AUFS_DEBUG */ + +const struct address_space_operations aufs_aop = { + .readpage = aufs_readpage, + .direct_IO = aufs_direct_IO, + .get_xip_mem = aufs_get_xip_mem, +#ifdef CONFIG_AUFS_DEBUG + .writepage = aufs_writepage, + /* no writepages, because of writepage */ + .set_page_dirty = aufs_set_page_dirty, + /* no readpages, because of readpage */ + .write_begin = aufs_write_begin, + .write_end = aufs_write_end, + /* no bmap, no block device */ + .invalidatepage = aufs_invalidatepage, + .releasepage = aufs_releasepage, + .migratepage = aufs_migratepage, + .launder_page = aufs_launder_page, + .is_partially_uptodate = aufs_is_partially_uptodate, + .error_remove_page = aufs_error_remove_page +#endif /* CONFIG_AUFS_DEBUG */ +}; diff --git a/ubuntu/aufs/file.h b/ubuntu/aufs/file.h new file mode 100644 index 000000000000..831dc5d252a0 --- /dev/null +++ b/ubuntu/aufs/file.h @@ -0,0 +1,299 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * file operations + */ + +#ifndef __AUFS_FILE_H__ +#define __AUFS_FILE_H__ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include "rwsem.h" + +struct au_branch; +struct au_hfile { + struct file *hf_file; + struct au_branch *hf_br; +}; + +struct au_vdir; +struct au_fidir { + aufs_bindex_t fd_bbot; + aufs_bindex_t fd_nent; + struct au_vdir *fd_vdir_cache; + struct au_hfile fd_hfile[]; +}; + +static inline int au_fidir_sz(int nent) +{ + AuDebugOn(nent < 0); + return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent; +} + +struct au_finfo { + atomic_t fi_generation; + + struct au_rwsem fi_rwsem; + aufs_bindex_t fi_btop; + + /* do not union them */ + struct { /* for non-dir */ + struct au_hfile fi_htop; + atomic_t fi_mmapped; + }; + struct au_fidir *fi_hdir; /* for dir only */ +} ____cacheline_aligned_in_smp; + +/* ---------------------------------------------------------------------- */ + +/* file.c */ +extern const struct address_space_operations aufs_aop; +unsigned int au_file_roflags(unsigned int flags); +struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, + struct file *file); +int au_do_open(struct file *file, int (*open)(struct file *file, int flags), + struct au_fidir *fidir); +int au_reopen_nondir(struct file *file); +struct au_pin; +int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin); +int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), + int wlock); +int au_do_flush(struct file *file, fl_owner_t id, + int (*flush)(struct file *file, fl_owner_t id)); + +/* poll.c */ +#ifdef CONFIG_AUFS_POLL +unsigned int aufs_poll(struct file *file, poll_table *wait); +#endif + +#ifdef CONFIG_AUFS_BR_HFSPLUS +/* hfsplus.c */ +struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex); +void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, + struct file *h_file); +#else +static inline +struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex) +{ + return NULL; +} + +AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex, + struct file *h_file); +#endif + +/* f_op.c */ +extern const struct file_operations aufs_file_fop; +int au_do_open_nondir(struct file *file, int flags); +int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file); + +#ifdef CONFIG_AUFS_SP_IATTR +/* f_op_sp.c */ +int au_special_file(umode_t mode); +void au_init_special_fop(struct inode *inode, umode_t mode, dev_t rdev); +#else +AuStubInt0(au_special_file, umode_t mode) +static inline void au_init_special_fop(struct inode *inode, umode_t mode, + dev_t rdev) +{ + init_special_inode(inode, mode, rdev); +} +#endif + +/* finfo.c */ +void au_hfput(struct au_hfile *hf, struct file *file); +void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, + struct file *h_file); + +void au_update_figen(struct file *file); +struct au_fidir *au_fidir_alloc(struct super_block *sb); +int au_fidir_realloc(struct au_finfo *finfo, int nbr); + +void au_fi_init_once(void *_fi); +void au_finfo_fin(struct file *file); +int au_finfo_init(struct file *file, struct au_fidir *fidir); + +/* ioctl.c */ +long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg); +#ifdef CONFIG_COMPAT +long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, + unsigned long arg); +#endif + +/* ---------------------------------------------------------------------- */ + +static inline struct au_finfo *au_fi(struct file *file) +{ + return file->private_data; +} + +/* ---------------------------------------------------------------------- */ + +/* + * fi_read_lock, fi_write_lock, + * fi_read_unlock, fi_write_unlock, fi_downgrade_lock + */ +AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem); + +#define FiMustNoWaiters(f) AuRwMustNoWaiters(&au_fi(f)->fi_rwsem) +#define FiMustAnyLock(f) AuRwMustAnyLock(&au_fi(f)->fi_rwsem) +#define FiMustWriteLock(f) AuRwMustWriteLock(&au_fi(f)->fi_rwsem) + +/* ---------------------------------------------------------------------- */ + +/* todo: hard/soft set? */ +static inline aufs_bindex_t au_fbstart(struct file *file) +{ + FiMustAnyLock(file); + return au_fi(file)->fi_btop; +} + +static inline aufs_bindex_t au_fbend_dir(struct file *file) +{ + FiMustAnyLock(file); + AuDebugOn(!au_fi(file)->fi_hdir); + return au_fi(file)->fi_hdir->fd_bbot; +} + +static inline struct au_vdir *au_fvdir_cache(struct file *file) +{ + FiMustAnyLock(file); + AuDebugOn(!au_fi(file)->fi_hdir); + return au_fi(file)->fi_hdir->fd_vdir_cache; +} + +static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex) +{ + FiMustWriteLock(file); + au_fi(file)->fi_btop = bindex; +} + +static inline void au_set_fbend_dir(struct file *file, aufs_bindex_t bindex) +{ + FiMustWriteLock(file); + AuDebugOn(!au_fi(file)->fi_hdir); + au_fi(file)->fi_hdir->fd_bbot = bindex; +} + +static inline void au_set_fvdir_cache(struct file *file, + struct au_vdir *vdir_cache) +{ + FiMustWriteLock(file); + AuDebugOn(!au_fi(file)->fi_hdir); + au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache; +} + +static inline struct file *au_hf_top(struct file *file) +{ + FiMustAnyLock(file); + AuDebugOn(au_fi(file)->fi_hdir); + return au_fi(file)->fi_htop.hf_file; +} + +static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex) +{ + FiMustAnyLock(file); + AuDebugOn(!au_fi(file)->fi_hdir); + return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file; +} + +/* todo: memory barrier? */ +static inline unsigned int au_figen(struct file *f) +{ + return atomic_read(&au_fi(f)->fi_generation); +} + +static inline void au_set_mmapped(struct file *f) +{ + if (atomic_inc_return(&au_fi(f)->fi_mmapped)) + return; + pr_warning("fi_mmapped wrapped around\n"); + while (!atomic_inc_return(&au_fi(f)->fi_mmapped)) + ; +} + +static inline void au_unset_mmapped(struct file *f) +{ + atomic_dec(&au_fi(f)->fi_mmapped); +} + +static inline int au_test_mmapped(struct file *f) +{ + return atomic_read(&au_fi(f)->fi_mmapped); +} + +/* customize vma->vm_file */ + +static inline void au_do_vm_file_reset(struct vm_area_struct *vma, + struct file *file) +{ + struct file *f; + + f = vma->vm_file; + get_file(file); + vma->vm_file = file; + fput(f); +} + +#ifdef CONFIG_MMU +#define AuDbgVmRegion(file, vma) do {} while (0) + +static inline void au_vm_file_reset(struct vm_area_struct *vma, + struct file *file) +{ + au_do_vm_file_reset(vma, file); +} +#else +#define AuDbgVmRegion(file, vma) \ + AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file)) + +static inline void au_vm_file_reset(struct vm_area_struct *vma, + struct file *file) +{ + struct file *f; + + au_do_vm_file_reset(vma, file); + f = vma->vm_region->vm_file; + get_file(file); + vma->vm_region->vm_file = file; + fput(f); +} +#endif /* CONFIG_MMU */ + +/* handle vma->vm_prfile */ +static inline void au_vm_prfile_set(struct vm_area_struct *vma, + struct file *file) +{ +#ifdef CONFIG_AUFS_PROC_MAP + get_file(file); + vma->vm_prfile = file; +#ifndef CONFIG_MMU + get_file(file); + vma->vm_region->vm_prfile = file; +#endif +#endif +} + +#endif /* __KERNEL__ */ +#endif /* __AUFS_FILE_H__ */ diff --git a/ubuntu/aufs/finfo.c b/ubuntu/aufs/finfo.c new file mode 100644 index 000000000000..3a8fa30d4965 --- /dev/null +++ b/ubuntu/aufs/finfo.c @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * file private data + */ + +#include +#include "aufs.h" + +void au_hfput(struct au_hfile *hf, struct file *file) +{ + /* todo: direct access f_flags */ + if (vfsub_file_flags(file) & __FMODE_EXEC) + allow_write_access(hf->hf_file); + fput(hf->hf_file); + hf->hf_file = NULL; + atomic_dec(&hf->hf_br->br_count); + hf->hf_br = NULL; +} + +void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val) +{ + struct au_finfo *finfo = au_fi(file); + struct au_hfile *hf; + struct au_fidir *fidir; + + fidir = finfo->fi_hdir; + if (!fidir) { + AuDebugOn(finfo->fi_btop != bindex); + hf = &finfo->fi_htop; + } else + hf = fidir->fd_hfile + bindex; + + if (hf && hf->hf_file) + au_hfput(hf, file); + if (val) { + FiMustWriteLock(file); + hf->hf_file = val; + hf->hf_br = au_sbr(file->f_dentry->d_sb, bindex); + } +} + +void au_update_figen(struct file *file) +{ + atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_dentry)); + /* smp_mb(); */ /* atomic_set */ +} + +/* ---------------------------------------------------------------------- */ + +struct au_fidir *au_fidir_alloc(struct super_block *sb) +{ + struct au_fidir *fidir; + int nbr; + + nbr = au_sbend(sb) + 1; + if (nbr < 2) + nbr = 2; /* initial allocate for 2 branches */ + fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS); + if (fidir) { + fidir->fd_bbot = -1; + fidir->fd_nent = nbr; + fidir->fd_vdir_cache = NULL; + } + + return fidir; +} + +int au_fidir_realloc(struct au_finfo *finfo, int nbr) +{ + int err; + struct au_fidir *fidir, *p; + + AuRwMustWriteLock(&finfo->fi_rwsem); + fidir = finfo->fi_hdir; + AuDebugOn(!fidir); + + err = -ENOMEM; + p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr), + GFP_NOFS); + if (p) { + p->fd_nent = nbr; + finfo->fi_hdir = p; + err = 0; + } + + return err; +} + +/* ---------------------------------------------------------------------- */ + +void au_finfo_fin(struct file *file) +{ + struct au_finfo *finfo; + + au_nfiles_dec(file->f_dentry->d_sb); + + finfo = au_fi(file); + AuDebugOn(finfo->fi_hdir); + AuRwDestroy(&finfo->fi_rwsem); + au_cache_free_finfo(finfo); +} + +void au_fi_init_once(void *_finfo) +{ + struct au_finfo *finfo = _finfo; + static struct lock_class_key aufs_fi; + + au_rw_init(&finfo->fi_rwsem); + au_rw_class(&finfo->fi_rwsem, &aufs_fi); +} + +int au_finfo_init(struct file *file, struct au_fidir *fidir) +{ + int err, lc_idx; + struct au_finfo *finfo; + struct dentry *dentry; + + err = -ENOMEM; + dentry = file->f_dentry; + finfo = au_cache_alloc_finfo(); + if (unlikely(!finfo)) + goto out; + + err = 0; + au_nfiles_inc(dentry->d_sb); + lc_idx = AuLcNonDir_FIINFO; + if (fidir) + lc_idx = AuLcDir_FIINFO; + au_rw_class(&finfo->fi_rwsem, au_lc_key + lc_idx); + au_rw_write_lock(&finfo->fi_rwsem); + finfo->fi_btop = -1; + finfo->fi_hdir = fidir; + atomic_set(&finfo->fi_generation, au_digen(dentry)); + /* smp_mb(); */ /* atomic_set */ + + file->private_data = finfo; + +out: + return err; +} diff --git a/ubuntu/aufs/fstype.h b/ubuntu/aufs/fstype.h new file mode 100644 index 000000000000..3bdbd143a824 --- /dev/null +++ b/ubuntu/aufs/fstype.h @@ -0,0 +1,497 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * judging filesystem type + */ + +#ifndef __AUFS_FSTYPE_H__ +#define __AUFS_FSTYPE_H__ + +#ifdef __KERNEL__ + +#include +#include +#include +#include + +static inline int au_test_aufs(struct super_block *sb) +{ + return sb->s_magic == AUFS_SUPER_MAGIC; +} + +static inline const char *au_sbtype(struct super_block *sb) +{ + return sb->s_type->name; +} + +static inline int au_test_iso9660(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE) + return sb->s_magic == ROMFS_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_romfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE) + return sb->s_magic == ISOFS_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_cramfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE) + return sb->s_magic == CRAMFS_MAGIC; +#endif + return 0; +} + +static inline int au_test_nfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) + return sb->s_magic == NFS_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_fuse(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) + return sb->s_magic == FUSE_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_xfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE) + return sb->s_magic == XFS_SB_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_tmpfs(struct super_block *sb __maybe_unused) +{ +#ifdef CONFIG_TMPFS + return sb->s_magic == TMPFS_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE) + return !strcmp(au_sbtype(sb), "ecryptfs"); +#else + return 0; +#endif +} + +static inline int au_test_smbfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_SMB_FS) || defined(CONFIG_SMB_FS_MODULE) + return sb->s_magic == SMB_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_ocfs2(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_OCFS2_FS) || defined(CONFIG_OCFS2_FS_MODULE) + return sb->s_magic == OCFS2_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_ocfs2_dlmfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_OCFS2_FS_O2CB) || defined(CONFIG_OCFS2_FS_O2CB_MODULE) + return sb->s_magic == DLMFS_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_coda(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_CODA_FS) || defined(CONFIG_CODA_FS_MODULE) + return sb->s_magic == CODA_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_v9fs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_9P_FS) || defined(CONFIG_9P_FS_MODULE) + return sb->s_magic == V9FS_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_ext4(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_EXT4DEV_FS) || defined(CONFIG_EXT4DEV_FS_MODULE) + return sb->s_magic == EXT4_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_sysv(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_SYSV_FS) || defined(CONFIG_SYSV_FS_MODULE) + return !strcmp(au_sbtype(sb), "sysv"); +#else + return 0; +#endif +} + +static inline int au_test_ramfs(struct super_block *sb) +{ + return sb->s_magic == RAMFS_MAGIC; +} + +static inline int au_test_ubifs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE) + return sb->s_magic == UBIFS_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_procfs(struct super_block *sb __maybe_unused) +{ +#ifdef CONFIG_PROC_FS + return sb->s_magic == PROC_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_sysfs(struct super_block *sb __maybe_unused) +{ +#ifdef CONFIG_SYSFS + return sb->s_magic == SYSFS_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_configfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE) + return sb->s_magic == CONFIGFS_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_minix(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE) + return sb->s_magic == MINIX3_SUPER_MAGIC + || sb->s_magic == MINIX2_SUPER_MAGIC + || sb->s_magic == MINIX2_SUPER_MAGIC2 + || sb->s_magic == MINIX_SUPER_MAGIC + || sb->s_magic == MINIX_SUPER_MAGIC2; +#else + return 0; +#endif +} + +static inline int au_test_cifs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_CIFS_FS) || defined(CONFIGCIFS_FS_MODULE) + return sb->s_magic == CIFS_MAGIC_NUMBER; +#else + return 0; +#endif +} + +static inline int au_test_fat(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE) + return sb->s_magic == MSDOS_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_msdos(struct super_block *sb) +{ + return au_test_fat(sb); +} + +static inline int au_test_vfat(struct super_block *sb) +{ + return au_test_fat(sb); +} + +static inline int au_test_securityfs(struct super_block *sb __maybe_unused) +{ +#ifdef CONFIG_SECURITYFS + return sb->s_magic == SECURITYFS_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_squashfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE) + return sb->s_magic == SQUASHFS_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_btrfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE) + return sb->s_magic == BTRFS_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_xenfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE) + return sb->s_magic == XENFS_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_debugfs(struct super_block *sb __maybe_unused) +{ +#ifdef CONFIG_DEBUG_FS + return sb->s_magic == DEBUGFS_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_nilfs(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE) + return sb->s_magic == NILFS_SUPER_MAGIC; +#else + return 0; +#endif +} + +static inline int au_test_hfsplus(struct super_block *sb __maybe_unused) +{ +#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE) + return sb->s_magic == HFSPLUS_SUPER_MAGIC; +#else + return 0; +#endif +} + +/* ---------------------------------------------------------------------- */ +/* + * they can't be an aufs branch. + */ +static inline int au_test_fs_unsuppoted(struct super_block *sb) +{ + return +#ifndef CONFIG_AUFS_BR_RAMFS + au_test_ramfs(sb) || +#endif + au_test_procfs(sb) + || au_test_sysfs(sb) + || au_test_configfs(sb) + || au_test_debugfs(sb) + || au_test_securityfs(sb) + || au_test_xenfs(sb) + || au_test_ecryptfs(sb) + /* || !strcmp(au_sbtype(sb), "unionfs") */ + || au_test_aufs(sb); /* will be supported in next version */ +} + +/* + * If the filesystem supports NFS-export, then it has to support NULL as + * a nameidata parameter for ->create(), ->lookup() and ->d_revalidate(). + * We can apply this principle when we handle a lower filesystem. + */ +static inline int au_test_fs_null_nd(struct super_block *sb) +{ + return !!sb->s_export_op; +} + +static inline int au_test_fs_remote(struct super_block *sb) +{ + return !au_test_tmpfs(sb) +#ifdef CONFIG_AUFS_BR_RAMFS + && !au_test_ramfs(sb) +#endif + && !(sb->s_type->fs_flags & FS_REQUIRES_DEV); +} + +/* ---------------------------------------------------------------------- */ + +/* + * Note: these functions (below) are created after reading ->getattr() in all + * filesystems under linux/fs. it means we have to do so in every update... + */ + +/* + * some filesystems require getattr to refresh the inode attributes before + * referencing. + * in most cases, we can rely on the inode attribute in NFS (or every remote fs) + * and leave the work for d_revalidate() + */ +static inline int au_test_fs_refresh_iattr(struct super_block *sb) +{ + return au_test_nfs(sb) + || au_test_fuse(sb) + /* || au_test_smbfs(sb) */ /* untested */ + /* || au_test_ocfs2(sb) */ /* untested */ + /* || au_test_btrfs(sb) */ /* untested */ + /* || au_test_coda(sb) */ /* untested */ + /* || au_test_v9fs(sb) */ /* untested */ + ; +} + +/* + * filesystems which don't maintain i_size or i_blocks. + */ +static inline int au_test_fs_bad_iattr_size(struct super_block *sb) +{ + return au_test_xfs(sb) + || au_test_btrfs(sb) + || au_test_ubifs(sb) + || au_test_hfsplus(sb) /* maintained, but incorrect */ + /* || au_test_ext4(sb) */ /* untested */ + /* || au_test_ocfs2(sb) */ /* untested */ + /* || au_test_ocfs2_dlmfs(sb) */ /* untested */ + /* || au_test_sysv(sb) */ /* untested */ + /* || au_test_minix(sb) */ /* untested */ + ; +} + +/* + * filesystems which don't store the correct value in some of their inode + * attributes. + */ +static inline int au_test_fs_bad_iattr(struct super_block *sb) +{ + return au_test_fs_bad_iattr_size(sb) + /* || au_test_cifs(sb) */ /* untested */ + || au_test_fat(sb) + || au_test_msdos(sb) + || au_test_vfat(sb); +} + +/* they don't check i_nlink in link(2) */ +static inline int au_test_fs_no_limit_nlink(struct super_block *sb) +{ + return au_test_tmpfs(sb) +#ifdef CONFIG_AUFS_BR_RAMFS + || au_test_ramfs(sb) +#endif + || au_test_ubifs(sb) + || au_test_btrfs(sb) + || au_test_hfsplus(sb); +} + +/* + * filesystems which sets S_NOATIME and S_NOCMTIME. + */ +static inline int au_test_fs_notime(struct super_block *sb) +{ + return au_test_nfs(sb) + || au_test_fuse(sb) + || au_test_ubifs(sb) + /* || au_test_cifs(sb) */ /* untested */ + ; +} + +/* + * filesystems which requires replacing i_mapping. + */ +static inline int au_test_fs_bad_mapping(struct super_block *sb) +{ + return au_test_fuse(sb) + || au_test_ubifs(sb); +} + +/* temporary support for i#1 in cramfs */ +static inline int au_test_fs_unique_ino(struct inode *inode) +{ + if (au_test_cramfs(inode->i_sb)) + return inode->i_ino != 1; + return 1; +} + +/* ---------------------------------------------------------------------- */ + +/* + * the filesystem where the xino files placed must support i/o after unlink and + * maintain i_size and i_blocks. + */ +static inline int au_test_fs_bad_xino(struct super_block *sb) +{ + return au_test_fs_remote(sb) + || au_test_fs_bad_iattr_size(sb) +#ifdef CONFIG_AUFS_BR_RAMFS + || !(au_test_ramfs(sb) || au_test_fs_null_nd(sb)) +#else + || !au_test_fs_null_nd(sb) /* to keep xino code simple */ +#endif + /* don't want unnecessary work for xino */ + || au_test_aufs(sb) + || au_test_ecryptfs(sb) + || au_test_nilfs(sb); +} + +static inline int au_test_fs_trunc_xino(struct super_block *sb) +{ + return au_test_tmpfs(sb) + || au_test_ramfs(sb); +} + +/* + * test if the @sb is real-readonly. + */ +static inline int au_test_fs_rr(struct super_block *sb) +{ + return au_test_squashfs(sb) + || au_test_iso9660(sb) + || au_test_cramfs(sb) + || au_test_romfs(sb); +} + +#endif /* __KERNEL__ */ +#endif /* __AUFS_FSTYPE_H__ */ diff --git a/ubuntu/aufs/hfsnotify.c b/ubuntu/aufs/hfsnotify.c new file mode 100644 index 000000000000..deb0ab4a0ae6 --- /dev/null +++ b/ubuntu/aufs/hfsnotify.c @@ -0,0 +1,247 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * fsnotify for the lower directories + */ + +#include "aufs.h" + +/* FS_IN_IGNORED is unnecessary */ +static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE + | FS_CREATE | FS_EVENT_ON_CHILD); +static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq); + +static void au_hfsn_free_mark(struct fsnotify_mark *mark) +{ + struct au_hnotify *hn = container_of(mark, struct au_hnotify, + hn_mark); + AuDbg("here\n"); + hn->hn_mark_dead = 1; + smp_mb(); + wake_up_all(&au_hfsn_wq); +} + +static int au_hfsn_alloc(struct au_hinode *hinode) +{ + struct au_hnotify *hn; + struct super_block *sb; + struct au_branch *br; + struct fsnotify_mark *mark; + aufs_bindex_t bindex; + + hn = hinode->hi_notify; + sb = hn->hn_aufs_inode->i_sb; + bindex = au_br_index(sb, hinode->hi_id); + br = au_sbr(sb, bindex); + hn->hn_mark_dead = 0; + mark = &hn->hn_mark; + fsnotify_init_mark(mark, au_hfsn_free_mark); + mark->mask = AuHfsnMask; + /* + * by udba rename or rmdir, aufs assign a new inode to the known + * h_inode, so specify 1 to allow dups. + */ + return fsnotify_add_mark(mark, br->br_hfsn_group, hinode->hi_inode, + /*mnt*/NULL, /*allow_dups*/1); +} + +static void au_hfsn_free(struct au_hinode *hinode) +{ + struct au_hnotify *hn; + struct fsnotify_mark *mark; + + hn = hinode->hi_notify; + mark = &hn->hn_mark; + fsnotify_destroy_mark(mark); + fsnotify_put_mark(mark); + + /* TODO: bad approach */ + wait_event(au_hfsn_wq, hn->hn_mark_dead); +} + +/* ---------------------------------------------------------------------- */ + +static void au_hfsn_ctl(struct au_hinode *hinode, int do_set) +{ + struct fsnotify_mark *mark; + + mark = &hinode->hi_notify->hn_mark; + spin_lock(&mark->lock); + if (do_set) { + AuDebugOn(mark->mask & AuHfsnMask); + mark->mask |= AuHfsnMask; + } else { + AuDebugOn(!(mark->mask & AuHfsnMask)); + mark->mask &= ~AuHfsnMask; + } + spin_unlock(&mark->lock); + /* fsnotify_recalc_inode_mask(hinode->hi_inode); */ +} + +/* ---------------------------------------------------------------------- */ + +/* #define AuDbgHnotify */ +#ifdef AuDbgHnotify +static char *au_hfsn_name(u32 mask) +{ +#ifdef CONFIG_AUFS_DEBUG +#define test_ret(flag) if (mask & flag) \ + return #flag; + test_ret(FS_ACCESS); + test_ret(FS_MODIFY); + test_ret(FS_ATTRIB); + test_ret(FS_CLOSE_WRITE); + test_ret(FS_CLOSE_NOWRITE); + test_ret(FS_OPEN); + test_ret(FS_MOVED_FROM); + test_ret(FS_MOVED_TO); + test_ret(FS_CREATE); + test_ret(FS_DELETE); + test_ret(FS_DELETE_SELF); + test_ret(FS_MOVE_SELF); + test_ret(FS_UNMOUNT); + test_ret(FS_Q_OVERFLOW); + test_ret(FS_IN_IGNORED); + test_ret(FS_IN_ISDIR); + test_ret(FS_IN_ONESHOT); + test_ret(FS_EVENT_ON_CHILD); + return ""; +#undef test_ret +#else + return "??"; +#endif +} +#endif + +/* ---------------------------------------------------------------------- */ + +static int au_hfsn_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) +{ + int err; + struct au_hnotify *hnotify; + struct inode *h_dir, *h_inode; + __u32 mask; + struct qstr h_child_qstr = { + .name = event->file_name, + .len = event->name_len + }; + + AuDebugOn(event->data_type != FSNOTIFY_EVENT_INODE); + + err = 0; + /* if FS_UNMOUNT happens, there must be another bug */ + mask = event->mask; + AuDebugOn(mask & FS_UNMOUNT); + if (mask & (FS_IN_IGNORED | FS_UNMOUNT)) + goto out; + + h_dir = event->to_tell; + h_inode = event->inode; +#ifdef AuDbgHnotify + au_debug(1); + if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1 + || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) { + AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n", + h_dir->i_ino, mask, au_hfsn_name(mask), + AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0); + /* WARN_ON(1); */ + } + au_debug(0); +#endif + + AuDebugOn(!inode_mark); + hnotify = container_of(inode_mark, struct au_hnotify, hn_mark); + err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode); + +out: + return err; +} + +/* isn't it waste to ask every registered 'group'? */ +/* copied from linux/fs/notify/inotify/inotify_fsnotiry.c */ +/* it should be exported to modules */ +static bool au_hfsn_should_send_event(struct fsnotify_group *group, + struct inode *h_inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + mask = (mask & ~FS_EVENT_ON_CHILD); + return inode_mark->mask & mask; +} + +static struct fsnotify_ops au_hfsn_ops = { + .should_send_event = au_hfsn_should_send_event, + .handle_event = au_hfsn_handle_event +}; + +/* ---------------------------------------------------------------------- */ + +static void au_hfsn_fin_br(struct au_branch *br) +{ + if (br->br_hfsn_group) + fsnotify_put_group(br->br_hfsn_group); +} + +static int au_hfsn_init_br(struct au_branch *br, int perm) +{ + br->br_hfsn_group = NULL; + br->br_hfsn_ops = au_hfsn_ops; + return 0; +} + +static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm) +{ + int err; + + err = 0; + if (udba != AuOpt_UDBA_HNOTIFY + || !au_br_hnotifyable(perm)) { + au_hfsn_fin_br(br); + br->br_hfsn_group = NULL; + goto out; + } + + if (br->br_hfsn_group) + goto out; + + br->br_hfsn_group = fsnotify_alloc_group(&br->br_hfsn_ops); + if (IS_ERR(br->br_hfsn_group)) { + err = PTR_ERR(br->br_hfsn_group); + pr_err("fsnotify_alloc_group() failed, %d\n", err); + br->br_hfsn_group = NULL; + } + +out: + AuTraceErr(err); + return err; +} + +const struct au_hnotify_op au_hnotify_op = { + .ctl = au_hfsn_ctl, + .alloc = au_hfsn_alloc, + .free = au_hfsn_free, + + .reset_br = au_hfsn_reset_br, + .fin_br = au_hfsn_fin_br, + .init_br = au_hfsn_init_br +}; diff --git a/ubuntu/aufs/hfsplus.c b/ubuntu/aufs/hfsplus.c new file mode 100644 index 000000000000..678b69c69684 --- /dev/null +++ b/ubuntu/aufs/hfsplus.c @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2010-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * special support for filesystems which aqucires an inode mutex + * at final closing a file, eg, hfsplus. + * + * This trick is very simple and stupid, just to open the file before really + * neceeary open to tell hfsplus that this is not the final closing. + * The caller should call au_h_open_pre() after acquiring the inode mutex, + * and au_h_open_post() after releasing it. + */ + +#include +#include "aufs.h" + +struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex) +{ + struct file *h_file; + struct dentry *h_dentry; + + h_dentry = au_h_dptr(dentry, bindex); + AuDebugOn(!h_dentry); + AuDebugOn(!h_dentry->d_inode); + IMustLock(h_dentry->d_inode); + + h_file = NULL; + if (au_test_hfsplus(h_dentry->d_sb) + && S_ISREG(h_dentry->d_inode->i_mode)) + h_file = au_h_open(dentry, bindex, + O_RDONLY | O_NOATIME | O_LARGEFILE, + /*file*/NULL); + return h_file; +} + +void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, + struct file *h_file) +{ + if (h_file) { + fput(h_file); + au_sbr_put(dentry->d_sb, bindex); + } +} diff --git a/ubuntu/aufs/hnotify.c b/ubuntu/aufs/hnotify.c new file mode 100644 index 000000000000..5ec653c1cf3c --- /dev/null +++ b/ubuntu/aufs/hnotify.c @@ -0,0 +1,712 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * abstraction to notify the direct changes on lower directories + */ + +#include "aufs.h" + +int au_hn_alloc(struct au_hinode *hinode, struct inode *inode) +{ + int err; + struct au_hnotify *hn; + + err = -ENOMEM; + hn = au_cache_alloc_hnotify(); + if (hn) { + hn->hn_aufs_inode = inode; + hinode->hi_notify = hn; + err = au_hnotify_op.alloc(hinode); + AuTraceErr(err); + if (unlikely(err)) { + hinode->hi_notify = NULL; + au_cache_free_hnotify(hn); + /* + * The upper dir was removed by udba, but the same named + * dir left. In this case, aufs assignes a new inode + * number and set the monitor again. + * For the lower dir, the old monitnor is still left. + */ + if (err == -EEXIST) + err = 0; + } + } + + AuTraceErr(err); + return err; +} + +void au_hn_free(struct au_hinode *hinode) +{ + struct au_hnotify *hn; + + hn = hinode->hi_notify; + if (hn) { + au_hnotify_op.free(hinode); + au_cache_free_hnotify(hn); + hinode->hi_notify = NULL; + } +} + +/* ---------------------------------------------------------------------- */ + +void au_hn_ctl(struct au_hinode *hinode, int do_set) +{ + if (hinode->hi_notify) + au_hnotify_op.ctl(hinode, do_set); +} + +void au_hn_reset(struct inode *inode, unsigned int flags) +{ + aufs_bindex_t bindex, bend; + struct inode *hi; + struct dentry *iwhdentry; + + bend = au_ibend(inode); + for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { + hi = au_h_iptr(inode, bindex); + if (!hi) + continue; + + /* mutex_lock_nested(&hi->i_mutex, AuLsc_I_CHILD); */ + iwhdentry = au_hi_wh(inode, bindex); + if (iwhdentry) + dget(iwhdentry); + au_igrab(hi); + au_set_h_iptr(inode, bindex, NULL, 0); + au_set_h_iptr(inode, bindex, au_igrab(hi), + flags & ~AuHi_XINO); + iput(hi); + dput(iwhdentry); + /* mutex_unlock(&hi->i_mutex); */ + } +} + +/* ---------------------------------------------------------------------- */ + +static int hn_xino(struct inode *inode, struct inode *h_inode) +{ + int err; + aufs_bindex_t bindex, bend, bfound, bstart; + struct inode *h_i; + + err = 0; + if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { + pr_warning("branch root dir was changed\n"); + goto out; + } + + bfound = -1; + bend = au_ibend(inode); + bstart = au_ibstart(inode); +#if 0 /* reserved for future use */ + if (bindex == bend) { + /* keep this ino in rename case */ + goto out; + } +#endif + for (bindex = bstart; bindex <= bend; bindex++) + if (au_h_iptr(inode, bindex) == h_inode) { + bfound = bindex; + break; + } + if (bfound < 0) + goto out; + + for (bindex = bstart; bindex <= bend; bindex++) { + h_i = au_h_iptr(inode, bindex); + if (!h_i) + continue; + + err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0); + /* ignore this error */ + /* bad action? */ + } + + /* children inode number will be broken */ + +out: + AuTraceErr(err); + return err; +} + +static int hn_gen_tree(struct dentry *dentry) +{ + int err, i, j, ndentry; + struct au_dcsub_pages dpages; + struct au_dpage *dpage; + struct dentry **dentries; + + err = au_dpages_init(&dpages, GFP_NOFS); + if (unlikely(err)) + goto out; + err = au_dcsub_pages(&dpages, dentry, NULL, NULL); + if (unlikely(err)) + goto out_dpages; + + for (i = 0; i < dpages.ndpage; i++) { + dpage = dpages.dpages + i; + dentries = dpage->dentries; + ndentry = dpage->ndentry; + for (j = 0; j < ndentry; j++) { + struct dentry *d; + + d = dentries[j]; + if (IS_ROOT(d)) + continue; + + au_digen_dec(d); + if (d->d_inode) + /* todo: reset children xino? + cached children only? */ + au_iigen_dec(d->d_inode); + } + } + +out_dpages: + au_dpages_free(&dpages); + +#if 0 + /* discard children */ + dentry_unhash(dentry); + dput(dentry); +#endif +out: + return err; +} + +/* + * return 0 if processed. + */ +static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode, + const unsigned int isdir) +{ + int err; + struct dentry *d; + struct qstr *dname; + + err = 1; + if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { + pr_warning("branch root dir was changed\n"); + err = 0; + goto out; + } + + if (!isdir) { + AuDebugOn(!name); + au_iigen_dec(inode); + spin_lock(&inode->i_lock); + list_for_each_entry(d, &inode->i_dentry, d_alias) { + spin_lock(&d->d_lock); + dname = &d->d_name; + if (dname->len != nlen + && memcmp(dname->name, name, nlen)) { + spin_unlock(&d->d_lock); + continue; + } + err = 0; + au_digen_dec(d); + spin_unlock(&d->d_lock); + break; + } + spin_unlock(&inode->i_lock); + } else { + au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR); + d = d_find_alias(inode); + if (!d) { + au_iigen_dec(inode); + goto out; + } + + spin_lock(&d->d_lock); + dname = &d->d_name; + if (dname->len == nlen && !memcmp(dname->name, name, nlen)) { + spin_unlock(&d->d_lock); + err = hn_gen_tree(d); + spin_lock(&d->d_lock); + } + spin_unlock(&d->d_lock); + dput(d); + } + +out: + AuTraceErr(err); + return err; +} + +static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir) +{ + int err; + struct inode *inode; + + inode = dentry->d_inode; + if (IS_ROOT(dentry) + /* || (inode && inode->i_ino == AUFS_ROOT_INO) */ + ) { + pr_warning("branch root dir was changed\n"); + return 0; + } + + err = 0; + if (!isdir) { + au_digen_dec(dentry); + if (inode) + au_iigen_dec(inode); + } else { + au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR); + if (inode) + err = hn_gen_tree(dentry); + } + + AuTraceErr(err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* hnotify job flags */ +#define AuHnJob_XINO0 1 +#define AuHnJob_GEN (1 << 1) +#define AuHnJob_DIRENT (1 << 2) +#define AuHnJob_ISDIR (1 << 3) +#define AuHnJob_TRYXINO0 (1 << 4) +#define AuHnJob_MNTPNT (1 << 5) +#define au_ftest_hnjob(flags, name) ((flags) & AuHnJob_##name) +#define au_fset_hnjob(flags, name) \ + do { (flags) |= AuHnJob_##name; } while (0) +#define au_fclr_hnjob(flags, name) \ + do { (flags) &= ~AuHnJob_##name; } while (0) + +enum { + AuHn_CHILD, + AuHn_PARENT, + AuHnLast +}; + +struct au_hnotify_args { + struct inode *h_dir, *dir, *h_child_inode; + u32 mask; + unsigned int flags[AuHnLast]; + unsigned int h_child_nlen; + char h_child_name[]; +}; + +struct hn_job_args { + unsigned int flags; + struct inode *inode, *h_inode, *dir, *h_dir; + struct dentry *dentry; + char *h_name; + int h_nlen; +}; + +static int hn_job(struct hn_job_args *a) +{ + const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR); + + /* reset xino */ + if (au_ftest_hnjob(a->flags, XINO0) && a->inode) + hn_xino(a->inode, a->h_inode); /* ignore this error */ + + if (au_ftest_hnjob(a->flags, TRYXINO0) + && a->inode + && a->h_inode) { + mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); + if (!a->h_inode->i_nlink) + hn_xino(a->inode, a->h_inode); /* ignore this error */ + mutex_unlock(&a->h_inode->i_mutex); + } + + /* make the generation obsolete */ + if (au_ftest_hnjob(a->flags, GEN)) { + int err = -1; + if (a->inode) + err = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode, + isdir); + if (err && a->dentry) + hn_gen_by_name(a->dentry, isdir); + /* ignore this error */ + } + + /* make dir entries obsolete */ + if (au_ftest_hnjob(a->flags, DIRENT) && a->inode) { + struct au_vdir *vdir; + + vdir = au_ivdir(a->inode); + if (vdir) + vdir->vd_jiffy = 0; + /* IMustLock(a->inode); */ + /* a->inode->i_version++; */ + } + + /* can do nothing but warn */ + if (au_ftest_hnjob(a->flags, MNTPNT) + && a->dentry + && d_mountpoint(a->dentry)) + pr_warning("mount-point %.*s is removed or renamed\n", + AuDLNPair(a->dentry)); + + return 0; +} + +/* ---------------------------------------------------------------------- */ + +static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen, + struct inode *dir) +{ + struct dentry *dentry, *d, *parent; + struct qstr *dname; + + parent = d_find_alias(dir); + if (!parent) + return NULL; + + dentry = NULL; + spin_lock(&parent->d_lock); + list_for_each_entry(d, &parent->d_subdirs, d_u.d_child) { + /* AuDbg("%.*s\n", AuDLNPair(d)); */ + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + dname = &d->d_name; + if (dname->len != nlen || memcmp(dname->name, name, nlen)) + goto cont_unlock; + if (au_di(d)) + au_digen_dec(d); + else + goto cont_unlock; + if (d->d_count) { + dentry = dget_dlock(d); + spin_unlock(&d->d_lock); + break; + } + + cont_unlock: + spin_unlock(&d->d_lock); + } + spin_unlock(&parent->d_lock); + dput(parent); + + if (dentry) + di_write_lock_child(dentry); + + return dentry; +} + +static struct inode *lookup_wlock_by_ino(struct super_block *sb, + aufs_bindex_t bindex, ino_t h_ino) +{ + struct inode *inode; + ino_t ino; + int err; + + inode = NULL; + err = au_xino_read(sb, bindex, h_ino, &ino); + if (!err && ino) + inode = ilookup(sb, ino); + if (!inode) + goto out; + + if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { + pr_warning("wrong root branch\n"); + iput(inode); + inode = NULL; + goto out; + } + + ii_write_lock_child(inode); + +out: + return inode; +} + +static void au_hn_bh(void *_args) +{ + struct au_hnotify_args *a = _args; + struct super_block *sb; + aufs_bindex_t bindex, bend, bfound; + unsigned char xino, try_iput; + int err; + struct inode *inode; + ino_t h_ino; + struct hn_job_args args; + struct dentry *dentry; + struct au_sbinfo *sbinfo; + + AuDebugOn(!_args); + AuDebugOn(!a->h_dir); + AuDebugOn(!a->dir); + AuDebugOn(!a->mask); + AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n", + a->mask, a->dir->i_ino, a->h_dir->i_ino, + a->h_child_inode ? a->h_child_inode->i_ino : 0); + + inode = NULL; + dentry = NULL; + /* + * do not lock a->dir->i_mutex here + * because of d_revalidate() may cause a deadlock. + */ + sb = a->dir->i_sb; + AuDebugOn(!sb); + sbinfo = au_sbi(sb); + AuDebugOn(!sbinfo); + si_write_lock(sb, AuLock_NOPLMW); + + ii_read_lock_parent(a->dir); + bfound = -1; + bend = au_ibend(a->dir); + for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++) + if (au_h_iptr(a->dir, bindex) == a->h_dir) { + bfound = bindex; + break; + } + ii_read_unlock(a->dir); + if (unlikely(bfound < 0)) + goto out; + + xino = !!au_opt_test(au_mntflags(sb), XINO); + h_ino = 0; + if (a->h_child_inode) + h_ino = a->h_child_inode->i_ino; + + if (a->h_child_nlen + && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN) + || au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT))) + dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen, + a->dir); + try_iput = 0; + if (dentry) + inode = dentry->d_inode; + if (xino && !inode && h_ino + && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0) + || au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0) + || au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) { + inode = lookup_wlock_by_ino(sb, bfound, h_ino); + try_iput = 1; + } + + args.flags = a->flags[AuHn_CHILD]; + args.dentry = dentry; + args.inode = inode; + args.h_inode = a->h_child_inode; + args.dir = a->dir; + args.h_dir = a->h_dir; + args.h_name = a->h_child_name; + args.h_nlen = a->h_child_nlen; + err = hn_job(&args); + if (dentry) { + if (au_di(dentry)) + di_write_unlock(dentry); + dput(dentry); + } + if (inode && try_iput) { + ii_write_unlock(inode); + iput(inode); + } + + ii_write_lock_parent(a->dir); + args.flags = a->flags[AuHn_PARENT]; + args.dentry = NULL; + args.inode = a->dir; + args.h_inode = a->h_dir; + args.dir = NULL; + args.h_dir = NULL; + args.h_name = NULL; + args.h_nlen = 0; + err = hn_job(&args); + ii_write_unlock(a->dir); + +out: + iput(a->h_child_inode); + iput(a->h_dir); + iput(a->dir); + si_write_unlock(sb); + au_nwt_done(&sbinfo->si_nowait); + kfree(a); +} + +/* ---------------------------------------------------------------------- */ + +int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, + struct qstr *h_child_qstr, struct inode *h_child_inode) +{ + int err, len; + unsigned int flags[AuHnLast], f; + unsigned char isdir, isroot, wh; + struct inode *dir; + struct au_hnotify_args *args; + char *p, *h_child_name; + + err = 0; + AuDebugOn(!hnotify || !hnotify->hn_aufs_inode); + dir = igrab(hnotify->hn_aufs_inode); + if (!dir) + goto out; + + isroot = (dir->i_ino == AUFS_ROOT_INO); + wh = 0; + h_child_name = (void *)h_child_qstr->name; + len = h_child_qstr->len; + if (h_child_name) { + if (len > AUFS_WH_PFX_LEN + && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { + h_child_name += AUFS_WH_PFX_LEN; + len -= AUFS_WH_PFX_LEN; + wh = 1; + } + } + + isdir = 0; + if (h_child_inode) + isdir = !!S_ISDIR(h_child_inode->i_mode); + flags[AuHn_PARENT] = AuHnJob_ISDIR; + flags[AuHn_CHILD] = 0; + if (isdir) + flags[AuHn_CHILD] = AuHnJob_ISDIR; + au_fset_hnjob(flags[AuHn_PARENT], DIRENT); + au_fset_hnjob(flags[AuHn_CHILD], GEN); + switch (mask & FS_EVENTS_POSS_ON_CHILD) { + case FS_MOVED_FROM: + case FS_MOVED_TO: + au_fset_hnjob(flags[AuHn_CHILD], XINO0); + au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); + /*FALLTHROUGH*/ + case FS_CREATE: + AuDebugOn(!h_child_name || !h_child_inode); + break; + + case FS_DELETE: + /* + * aufs never be able to get this child inode. + * revalidation should be in d_revalidate() + * by checking i_nlink, i_generation or d_unhashed(). + */ + AuDebugOn(!h_child_name); + au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0); + au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); + break; + + default: + AuDebugOn(1); + } + + if (wh) + h_child_inode = NULL; + + err = -ENOMEM; + /* iput() and kfree() will be called in au_hnotify() */ + args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS); + if (unlikely(!args)) { + AuErr1("no memory\n"); + iput(dir); + goto out; + } + args->flags[AuHn_PARENT] = flags[AuHn_PARENT]; + args->flags[AuHn_CHILD] = flags[AuHn_CHILD]; + args->mask = mask; + args->dir = dir; + args->h_dir = igrab(h_dir); + if (h_child_inode) + h_child_inode = igrab(h_child_inode); /* can be NULL */ + args->h_child_inode = h_child_inode; + args->h_child_nlen = len; + if (len) { + p = (void *)args; + p += sizeof(*args); + memcpy(p, h_child_name, len); + p[len] = 0; + } + + f = 0; + if (!dir->i_nlink) + f = AuWkq_NEST; + err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f); + if (unlikely(err)) { + pr_err("wkq %d\n", err); + iput(args->h_child_inode); + iput(args->h_dir); + iput(args->dir); + kfree(args); + } + +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm) +{ + int err; + + AuDebugOn(!(udba & AuOptMask_UDBA)); + + err = 0; + if (au_hnotify_op.reset_br) + err = au_hnotify_op.reset_br(udba, br, perm); + + return err; +} + +int au_hnotify_init_br(struct au_branch *br, int perm) +{ + int err; + + err = 0; + if (au_hnotify_op.init_br) + err = au_hnotify_op.init_br(br, perm); + + return err; +} + +void au_hnotify_fin_br(struct au_branch *br) +{ + if (au_hnotify_op.fin_br) + au_hnotify_op.fin_br(br); +} + +static void au_hn_destroy_cache(void) +{ + kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]); + au_cachep[AuCache_HNOTIFY] = NULL; +} + +int __init au_hnotify_init(void) +{ + int err; + + err = -ENOMEM; + au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify); + if (au_cachep[AuCache_HNOTIFY]) { + err = 0; + if (au_hnotify_op.init) + err = au_hnotify_op.init(); + if (unlikely(err)) + au_hn_destroy_cache(); + } + AuTraceErr(err); + return err; +} + +void au_hnotify_fin(void) +{ + if (au_hnotify_op.fin) + au_hnotify_op.fin(); + /* cf. au_cache_fin() */ + if (au_cachep[AuCache_HNOTIFY]) + au_hn_destroy_cache(); +} diff --git a/ubuntu/aufs/i_op.c b/ubuntu/aufs/i_op.c new file mode 100644 index 000000000000..b99e7da0da6e --- /dev/null +++ b/ubuntu/aufs/i_op.c @@ -0,0 +1,994 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * inode operations (except add/del/rename) + */ + +#include +#include +#include +#include +#include +#include +#include "aufs.h" + +static int h_permission(struct inode *h_inode, int mask, + struct vfsmount *h_mnt, int brperm) +{ + int err; + const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); + + err = -EACCES; + if ((write_mask && IS_IMMUTABLE(h_inode)) + || ((mask & MAY_EXEC) + && S_ISREG(h_inode->i_mode) + && ((h_mnt->mnt_flags & MNT_NOEXEC) + || !(h_inode->i_mode & S_IXUGO)))) + goto out; + + /* + * - skip the lower fs test in the case of write to ro branch. + * - nfs dir permission write check is optimized, but a policy for + * link/rename requires a real check. + */ + if ((write_mask && !au_br_writable(brperm)) + || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode) + && write_mask && !(mask & MAY_READ)) + || !h_inode->i_op->permission) { + /* AuLabel(generic_permission); */ + err = generic_permission(h_inode, mask); + } else { + /* AuLabel(h_inode->permission); */ + err = h_inode->i_op->permission(h_inode, mask); + AuTraceErr(err); + } + + if (!err) + err = devcgroup_inode_permission(h_inode, mask); + if (!err) + err = security_inode_permission(h_inode, mask); + +#if 0 + if (!err) { + /* todo: do we need to call ima_path_check()? */ + struct path h_path = { + .dentry = + .mnt = h_mnt + }; + err = ima_path_check(&h_path, + mask & (MAY_READ | MAY_WRITE | MAY_EXEC), + IMA_COUNT_LEAVE); + } +#endif + +out: + return err; +} + +static int aufs_permission(struct inode *inode, int mask) +{ + int err; + aufs_bindex_t bindex, bend; + const unsigned char isdir = !!S_ISDIR(inode->i_mode), + write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); + struct inode *h_inode; + struct super_block *sb; + struct au_branch *br; + + /* todo: support rcu-walk? */ + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + sb = inode->i_sb; + si_read_lock(sb, AuLock_FLUSH); + ii_read_lock_child(inode); +#if 0 + err = au_iigen_test(inode, au_sigen(sb)); + if (unlikely(err)) + goto out; +#endif + + if (!isdir || write_mask) { + err = au_busy_or_stale(); + h_inode = au_h_iptr(inode, au_ibstart(inode)); + if (unlikely(!h_inode + || (h_inode->i_mode & S_IFMT) + != (inode->i_mode & S_IFMT))) + goto out; + + err = 0; + bindex = au_ibstart(inode); + br = au_sbr(sb, bindex); + err = h_permission(h_inode, mask, br->br_mnt, br->br_perm); + if (write_mask + && !err + && !special_file(h_inode->i_mode)) { + /* test whether the upper writable branch exists */ + err = -EROFS; + for (; bindex >= 0; bindex--) + if (!au_br_rdonly(au_sbr(sb, bindex))) { + err = 0; + break; + } + } + goto out; + } + + /* non-write to dir */ + err = 0; + bend = au_ibend(inode); + for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) { + h_inode = au_h_iptr(inode, bindex); + if (h_inode) { + err = au_busy_or_stale(); + if (unlikely(!S_ISDIR(h_inode->i_mode))) + break; + + br = au_sbr(sb, bindex); + err = h_permission(h_inode, mask, br->br_mnt, + br->br_perm); + } + } + +out: + ii_read_unlock(inode); + si_read_unlock(sb); + return err; +} + +/* ---------------------------------------------------------------------- */ + +static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct dentry *ret, *parent; + struct inode *inode; + struct super_block *sb; + int err, npositive, lc_idx; + + IMustLock(dir); + + sb = dir->i_sb; + err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); + ret = ERR_PTR(err); + if (unlikely(err)) + goto out; + + ret = ERR_PTR(-ENAMETOOLONG); + if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) + goto out_si; + err = au_di_init(dentry); + ret = ERR_PTR(err); + if (unlikely(err)) + goto out_si; + + inode = NULL; + npositive = 0; /* suppress a warning */ + parent = dentry->d_parent; /* dir inode is locked */ + di_read_lock_parent(parent, AuLock_IR); + err = au_alive_dir(parent); + if (!err) + err = au_digen_test(parent, au_sigen(sb)); + if (!err) { + npositive = au_lkup_dentry(dentry, au_dbstart(parent), + /*type*/0, nd); + err = npositive; + } + di_read_unlock(parent, AuLock_IR); + ret = ERR_PTR(err); + if (unlikely(err < 0)) + goto out_unlock; + + if (npositive) { + inode = au_new_inode(dentry, /*must_new*/0); + ret = (void *)inode; + } + if (IS_ERR(inode)) { + inode = NULL; + goto out_unlock; + } + + ret = d_splice_alias(inode, dentry); + if (unlikely(IS_ERR(ret) && inode)) { + ii_write_unlock(inode); + lc_idx = AuLcNonDir_IIINFO; + if (S_ISLNK(inode->i_mode)) + lc_idx = AuLcSymlink_IIINFO; + else if (S_ISDIR(inode->i_mode)) + lc_idx = AuLcDir_IIINFO; + au_rw_class(&au_ii(inode)->ii_rwsem, au_lc_key + lc_idx); + iput(inode); + } + +out_unlock: + di_write_unlock(dentry); + if (unlikely(IS_ERR(ret) && inode)) { + lc_idx = AuLcNonDir_DIINFO; + if (S_ISLNK(inode->i_mode)) + lc_idx = AuLcSymlink_DIINFO; + else if (S_ISDIR(inode->i_mode)) + lc_idx = AuLcDir_DIINFO; + au_rw_class(&au_di(dentry)->di_rwsem, au_lc_key + lc_idx); + } +out_si: + si_read_unlock(sb); +out: + return ret; +} + +/* ---------------------------------------------------------------------- */ + +static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent, + const unsigned char add_entry, aufs_bindex_t bcpup, + aufs_bindex_t bstart) +{ + int err; + struct dentry *h_parent; + struct inode *h_dir; + + if (add_entry) + IMustLock(parent->d_inode); + else + di_write_lock_parent(parent); + + err = 0; + if (!au_h_dptr(parent, bcpup)) { + if (bstart < bcpup) + err = au_cpdown_dirs(dentry, bcpup); + else + err = au_cpup_dirs(dentry, bcpup); + } + if (!err && add_entry) { + h_parent = au_h_dptr(parent, bcpup); + h_dir = h_parent->d_inode; + mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); + err = au_lkup_neg(dentry, bcpup); + /* todo: no unlock here */ + mutex_unlock(&h_dir->i_mutex); + + AuDbg("bcpup %d\n", bcpup); + if (!err) { + if (!dentry->d_inode) + au_set_h_dptr(dentry, bstart, NULL); + au_update_dbrange(dentry, /*do_put_zero*/0); + } + } + + if (!add_entry) + di_write_unlock(parent); + if (!err) + err = bcpup; /* success */ + + AuTraceErr(err); + return err; +} + +/* + * decide the branch and the parent dir where we will create a new entry. + * returns new bindex or an error. + * copyup the parent dir if needed. + */ +int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, + struct au_wr_dir_args *args) +{ + int err; + aufs_bindex_t bcpup, bstart, src_bstart; + const unsigned char add_entry = !!au_ftest_wrdir(args->flags, + ADD_ENTRY); + struct super_block *sb; + struct dentry *parent; + struct au_sbinfo *sbinfo; + + sb = dentry->d_sb; + sbinfo = au_sbi(sb); + parent = dget_parent(dentry); + bstart = au_dbstart(dentry); + bcpup = bstart; + if (args->force_btgt < 0) { + if (src_dentry) { + src_bstart = au_dbstart(src_dentry); + if (src_bstart < bstart) + bcpup = src_bstart; + } else if (add_entry) { + err = AuWbrCreate(sbinfo, dentry, + au_ftest_wrdir(args->flags, ISDIR)); + bcpup = err; + } + + if (bcpup < 0 || au_test_ro(sb, bcpup, dentry->d_inode)) { + if (add_entry) + err = AuWbrCopyup(sbinfo, dentry); + else { + if (!IS_ROOT(dentry)) { + di_read_lock_parent(parent, !AuLock_IR); + err = AuWbrCopyup(sbinfo, dentry); + di_read_unlock(parent, !AuLock_IR); + } else + err = AuWbrCopyup(sbinfo, dentry); + } + bcpup = err; + if (unlikely(err < 0)) + goto out; + } + } else { + bcpup = args->force_btgt; + AuDebugOn(au_test_ro(sb, bcpup, dentry->d_inode)); + } + + AuDbg("bstart %d, bcpup %d\n", bstart, bcpup); + err = bcpup; + if (bcpup == bstart) + goto out; /* success */ + + /* copyup the new parent into the branch we process */ + err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart); + if (err >= 0) { + if (!dentry->d_inode) { + au_set_h_dptr(dentry, bstart, NULL); + au_set_dbstart(dentry, bcpup); + au_set_dbend(dentry, bcpup); + } + AuDebugOn(add_entry && !au_h_dptr(dentry, bcpup)); + } + +out: + dput(parent); + return err; +} + +/* ---------------------------------------------------------------------- */ + +struct dentry *au_pinned_h_parent(struct au_pin *pin) +{ + if (pin && pin->parent) + return au_h_dptr(pin->parent, pin->bindex); + return NULL; +} + +void au_unpin(struct au_pin *p) +{ + if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE)) + mnt_drop_write(p->h_mnt); + if (!p->hdir) + return; + + au_hn_imtx_unlock(p->hdir); + if (!au_ftest_pin(p->flags, DI_LOCKED)) + di_read_unlock(p->parent, AuLock_IR); + iput(p->hdir->hi_inode); + dput(p->parent); + p->parent = NULL; + p->hdir = NULL; + p->h_mnt = NULL; +} + +int au_do_pin(struct au_pin *p) +{ + int err; + struct super_block *sb; + struct dentry *h_dentry, *h_parent; + struct au_branch *br; + struct inode *h_dir; + + err = 0; + sb = p->dentry->d_sb; + br = au_sbr(sb, p->bindex); + if (IS_ROOT(p->dentry)) { + if (au_ftest_pin(p->flags, MNT_WRITE)) { + p->h_mnt = br->br_mnt; + err = mnt_want_write(p->h_mnt); + if (unlikely(err)) { + au_fclr_pin(p->flags, MNT_WRITE); + goto out_err; + } + } + goto out; + } + + h_dentry = NULL; + if (p->bindex <= au_dbend(p->dentry)) + h_dentry = au_h_dptr(p->dentry, p->bindex); + + p->parent = dget_parent(p->dentry); + if (!au_ftest_pin(p->flags, DI_LOCKED)) + di_read_lock(p->parent, AuLock_IR, p->lsc_di); + + h_dir = NULL; + h_parent = au_h_dptr(p->parent, p->bindex); + p->hdir = au_hi(p->parent->d_inode, p->bindex); + if (p->hdir) + h_dir = p->hdir->hi_inode; + + /* + * udba case, or + * if DI_LOCKED is not set, then p->parent may be different + * and h_parent can be NULL. + */ + if (unlikely(!p->hdir || !h_dir || !h_parent)) { + err = -EBUSY; + if (!au_ftest_pin(p->flags, DI_LOCKED)) + di_read_unlock(p->parent, AuLock_IR); + dput(p->parent); + p->parent = NULL; + goto out_err; + } + + au_igrab(h_dir); + au_hn_imtx_lock_nested(p->hdir, p->lsc_hi); + + if (unlikely(p->hdir->hi_inode != h_parent->d_inode)) { + err = -EBUSY; + goto out_unpin; + } + if (h_dentry) { + err = au_h_verify(h_dentry, p->udba, h_dir, h_parent, br); + if (unlikely(err)) { + au_fclr_pin(p->flags, MNT_WRITE); + goto out_unpin; + } + } + + if (au_ftest_pin(p->flags, MNT_WRITE)) { + p->h_mnt = br->br_mnt; + err = mnt_want_write(p->h_mnt); + if (unlikely(err)) { + au_fclr_pin(p->flags, MNT_WRITE); + goto out_unpin; + } + } + goto out; /* success */ + +out_unpin: + au_unpin(p); +out_err: + pr_err("err %d\n", err); + err = au_busy_or_stale(); +out: + return err; +} + +void au_pin_init(struct au_pin *p, struct dentry *dentry, + aufs_bindex_t bindex, int lsc_di, int lsc_hi, + unsigned int udba, unsigned char flags) +{ + p->dentry = dentry; + p->udba = udba; + p->lsc_di = lsc_di; + p->lsc_hi = lsc_hi; + p->flags = flags; + p->bindex = bindex; + + p->parent = NULL; + p->hdir = NULL; + p->h_mnt = NULL; +} + +int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, + unsigned int udba, unsigned char flags) +{ + au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2, + udba, flags); + return au_do_pin(pin); +} + +/* ---------------------------------------------------------------------- */ + +/* + * ->setattr() and ->getattr() are called in various cases. + * chmod, stat: dentry is revalidated. + * fchmod, fstat: file and dentry are not revalidated, additionally they may be + * unhashed. + * for ->setattr(), ia->ia_file is passed from ftruncate only. + */ +/* todo: consolidate with do_refresh() and simple_reval_dpath() */ +static int au_reval_for_attr(struct dentry *dentry, unsigned int sigen) +{ + int err; + struct inode *inode; + struct dentry *parent; + + err = 0; + inode = dentry->d_inode; + if (au_digen_test(dentry, sigen)) { + parent = dget_parent(dentry); + di_read_lock_parent(parent, AuLock_IR); + err = au_refresh_dentry(dentry, parent); + di_read_unlock(parent, AuLock_IR); + dput(parent); + } + + AuTraceErr(err); + return err; +} + +#define AuIcpup_DID_CPUP 1 +#define au_ftest_icpup(flags, name) ((flags) & AuIcpup_##name) +#define au_fset_icpup(flags, name) \ + do { (flags) |= AuIcpup_##name; } while (0) +#define au_fclr_icpup(flags, name) \ + do { (flags) &= ~AuIcpup_##name; } while (0) + +struct au_icpup_args { + unsigned char flags; + unsigned char pin_flags; + aufs_bindex_t btgt; + unsigned int udba; + struct au_pin pin; + struct path h_path; + struct inode *h_inode; +}; + +static int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia, + struct au_icpup_args *a) +{ + int err; + loff_t sz; + aufs_bindex_t bstart, ibstart; + struct dentry *hi_wh, *parent; + struct inode *inode; + struct file *h_file; + struct au_wr_dir_args wr_dir_args = { + .force_btgt = -1, + .flags = 0 + }; + + bstart = au_dbstart(dentry); + inode = dentry->d_inode; + if (S_ISDIR(inode->i_mode)) + au_fset_wrdir(wr_dir_args.flags, ISDIR); + /* plink or hi_wh() case */ + ibstart = au_ibstart(inode); + if (bstart != ibstart && !au_test_ro(inode->i_sb, ibstart, inode)) + wr_dir_args.force_btgt = ibstart; + err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); + if (unlikely(err < 0)) + goto out; + a->btgt = err; + if (err != bstart) + au_fset_icpup(a->flags, DID_CPUP); + + err = 0; + a->pin_flags = AuPin_MNT_WRITE; + parent = NULL; + if (!IS_ROOT(dentry)) { + au_fset_pin(a->pin_flags, DI_LOCKED); + parent = dget_parent(dentry); + di_write_lock_parent(parent); + } + + err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags); + if (unlikely(err)) + goto out_parent; + + a->h_path.dentry = au_h_dptr(dentry, bstart); + a->h_inode = a->h_path.dentry->d_inode; + mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); + sz = -1; + if ((ia->ia_valid & ATTR_SIZE) && ia->ia_size < i_size_read(a->h_inode)) + sz = ia->ia_size; + + h_file = NULL; + hi_wh = NULL; + if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) { + hi_wh = au_hi_wh(inode, a->btgt); + if (!hi_wh) { + err = au_sio_cpup_wh(dentry, a->btgt, sz, /*file*/NULL); + if (unlikely(err)) + goto out_unlock; + hi_wh = au_hi_wh(inode, a->btgt); + /* todo: revalidate hi_wh? */ + } + } + + if (parent) { + au_pin_set_parent_lflag(&a->pin, /*lflag*/0); + di_downgrade_lock(parent, AuLock_IR); + dput(parent); + parent = NULL; + } + if (!au_ftest_icpup(a->flags, DID_CPUP)) + goto out; /* success */ + + if (!d_unhashed(dentry)) { + h_file = au_h_open_pre(dentry, bstart); + if (IS_ERR(h_file)) { + err = PTR_ERR(h_file); + h_file = NULL; + } else + err = au_sio_cpup_simple(dentry, a->btgt, sz, + AuCpup_DTIME); + if (!err) + a->h_path.dentry = au_h_dptr(dentry, a->btgt); + } else if (!hi_wh) + a->h_path.dentry = au_h_dptr(dentry, a->btgt); + else + a->h_path.dentry = hi_wh; /* do not dget here */ + +out_unlock: + mutex_unlock(&a->h_inode->i_mutex); + au_h_open_post(dentry, bstart, h_file); + a->h_inode = a->h_path.dentry->d_inode; + if (!err) { + mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); + goto out; /* success */ + } + + au_unpin(&a->pin); +out_parent: + if (parent) { + di_write_unlock(parent); + dput(parent); + } +out: + return err; +} + +static int aufs_setattr(struct dentry *dentry, struct iattr *ia) +{ + int err; + struct inode *inode; + struct super_block *sb; + struct file *file; + struct au_icpup_args *a; + + inode = dentry->d_inode; + IMustLock(inode); + + err = -ENOMEM; + a = kzalloc(sizeof(*a), GFP_NOFS); + if (unlikely(!a)) + goto out; + + if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + ia->ia_valid &= ~ATTR_MODE; + + file = NULL; + sb = dentry->d_sb; + err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); + if (unlikely(err)) + goto out_kfree; + + if (ia->ia_valid & ATTR_FILE) { + /* currently ftruncate(2) only */ + AuDebugOn(!S_ISREG(inode->i_mode)); + file = ia->ia_file; + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); + if (unlikely(err)) + goto out_si; + ia->ia_file = au_hf_top(file); + a->udba = AuOpt_UDBA_NONE; + } else { + /* fchmod() doesn't pass ia_file */ + a->udba = au_opt_udba(sb); + di_write_lock_child(dentry); + /* no d_unlinked(), to set UDBA_NONE for root */ + if (d_unhashed(dentry)) + a->udba = AuOpt_UDBA_NONE; + if (a->udba != AuOpt_UDBA_NONE) { + AuDebugOn(IS_ROOT(dentry)); + err = au_reval_for_attr(dentry, au_sigen(sb)); + if (unlikely(err)) + goto out_dentry; + } + } + + err = au_pin_and_icpup(dentry, ia, a); + if (unlikely(err < 0)) + goto out_dentry; + if (au_ftest_icpup(a->flags, DID_CPUP)) { + ia->ia_file = NULL; + ia->ia_valid &= ~ATTR_FILE; + } + + a->h_path.mnt = au_sbr_mnt(sb, a->btgt); + if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME)) + == (ATTR_MODE | ATTR_CTIME)) { + err = security_path_chmod(a->h_path.dentry, a->h_path.mnt, + ia->ia_mode); + if (unlikely(err)) + goto out_unlock; + } else if ((ia->ia_valid & (ATTR_UID | ATTR_GID)) + && (ia->ia_valid & ATTR_CTIME)) { + err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid); + if (unlikely(err)) + goto out_unlock; + } + + if (ia->ia_valid & ATTR_SIZE) { + struct file *f; + + if (ia->ia_size < i_size_read(inode)) + /* unmap only */ + truncate_setsize(inode, ia->ia_size); + + f = NULL; + if (ia->ia_valid & ATTR_FILE) + f = ia->ia_file; + mutex_unlock(&a->h_inode->i_mutex); + err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f); + mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); + } else + err = vfsub_notify_change(&a->h_path, ia); + if (!err) + au_cpup_attr_changeable(inode); + +out_unlock: + mutex_unlock(&a->h_inode->i_mutex); + au_unpin(&a->pin); + if (unlikely(err)) + au_update_dbstart(dentry); +out_dentry: + di_write_unlock(dentry); + if (file) { + fi_write_unlock(file); + ia->ia_file = file; + ia->ia_valid |= ATTR_FILE; + } +out_si: + si_read_unlock(sb); +out_kfree: + kfree(a); +out: + AuTraceErr(err); + return err; +} + +static void au_refresh_iattr(struct inode *inode, struct kstat *st, + unsigned int nlink) +{ + unsigned int n; + + inode->i_mode = st->mode; + inode->i_uid = st->uid; + inode->i_gid = st->gid; + inode->i_atime = st->atime; + inode->i_mtime = st->mtime; + inode->i_ctime = st->ctime; + + au_cpup_attr_nlink(inode, /*force*/0); + if (S_ISDIR(inode->i_mode)) { + n = inode->i_nlink; + n -= nlink; + n += st->nlink; + set_nlink(inode, n); + } + + spin_lock(&inode->i_lock); + inode->i_blocks = st->blocks; + i_size_write(inode, st->size); + spin_unlock(&inode->i_lock); +} + +static int aufs_getattr(struct vfsmount *mnt __maybe_unused, + struct dentry *dentry, struct kstat *st) +{ + int err; + unsigned int mnt_flags; + aufs_bindex_t bindex; + unsigned char udba_none, positive; + struct super_block *sb, *h_sb; + struct inode *inode; + struct vfsmount *h_mnt; + struct dentry *h_dentry; + + sb = dentry->d_sb; + inode = dentry->d_inode; + err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); + if (unlikely(err)) + goto out; + mnt_flags = au_mntflags(sb); + udba_none = !!au_opt_test(mnt_flags, UDBA_NONE); + + /* support fstat(2) */ + if (!d_unlinked(dentry) && !udba_none) { + unsigned int sigen = au_sigen(sb); + err = au_digen_test(dentry, sigen); + if (!err) { + di_read_lock_child(dentry, AuLock_IR); + err = au_dbrange_test(dentry); + if (unlikely(err)) + goto out_unlock; + } else { + AuDebugOn(IS_ROOT(dentry)); + di_write_lock_child(dentry); + err = au_dbrange_test(dentry); + if (!err) + err = au_reval_for_attr(dentry, sigen); + di_downgrade_lock(dentry, AuLock_IR); + if (unlikely(err)) + goto out_unlock; + } + } else + di_read_lock_child(dentry, AuLock_IR); + + bindex = au_ibstart(inode); + h_mnt = au_sbr_mnt(sb, bindex); + h_sb = h_mnt->mnt_sb; + if (!au_test_fs_bad_iattr(h_sb) && udba_none) + goto out_fill; /* success */ + + h_dentry = NULL; + if (au_dbstart(dentry) == bindex) + h_dentry = dget(au_h_dptr(dentry, bindex)); + else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) { + h_dentry = au_plink_lkup(inode, bindex); + if (IS_ERR(h_dentry)) + goto out_fill; /* pretending success */ + } + /* illegally overlapped or something */ + if (unlikely(!h_dentry)) + goto out_fill; /* pretending success */ + + positive = !!h_dentry->d_inode; + if (positive) + err = vfs_getattr(h_mnt, h_dentry, st); + dput(h_dentry); + if (!err) { + if (positive) + au_refresh_iattr(inode, st, h_dentry->d_inode->i_nlink); + goto out_fill; /* success */ + } + AuTraceErr(err); + goto out_unlock; + +out_fill: + generic_fillattr(inode, st); +out_unlock: + di_read_unlock(dentry, AuLock_IR); + si_read_unlock(sb); +out: + AuTraceErr(err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int h_readlink(struct dentry *dentry, int bindex, char __user *buf, + int bufsiz) +{ + int err; + struct super_block *sb; + struct dentry *h_dentry; + + err = -EINVAL; + h_dentry = au_h_dptr(dentry, bindex); + if (unlikely(!h_dentry->d_inode->i_op->readlink)) + goto out; + + err = security_inode_readlink(h_dentry); + if (unlikely(err)) + goto out; + + sb = dentry->d_sb; + if (!au_test_ro(sb, bindex, dentry->d_inode)) { + vfsub_touch_atime(au_sbr_mnt(sb, bindex), h_dentry); + fsstack_copy_attr_atime(dentry->d_inode, h_dentry->d_inode); + } + err = h_dentry->d_inode->i_op->readlink(h_dentry, buf, bufsiz); + +out: + return err; +} + +static int aufs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + int err; + + err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); + if (unlikely(err)) + goto out; + err = au_d_hashed_positive(dentry); + if (!err) + err = h_readlink(dentry, au_dbstart(dentry), buf, bufsiz); + aufs_read_unlock(dentry, AuLock_IR); + +out: + return err; +} + +static void *aufs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int err; + mm_segment_t old_fs; + union { + char *k; + char __user *u; + } buf; + + err = -ENOMEM; + buf.k = __getname_gfp(GFP_NOFS); + if (unlikely(!buf.k)) + goto out; + + err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); + if (unlikely(err)) + goto out_name; + + err = au_d_hashed_positive(dentry); + if (!err) { + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = h_readlink(dentry, au_dbstart(dentry), buf.u, PATH_MAX); + set_fs(old_fs); + } + aufs_read_unlock(dentry, AuLock_IR); + + if (err >= 0) { + buf.k[err] = 0; + /* will be freed by put_link */ + nd_set_link(nd, buf.k); + return NULL; /* success */ + } + +out_name: + __putname(buf.k); +out: + path_put(&nd->path); + AuTraceErr(err); + return ERR_PTR(err); +} + +static void aufs_put_link(struct dentry *dentry __maybe_unused, + struct nameidata *nd, void *cookie __maybe_unused) +{ + __putname(nd_get_link(nd)); +} + +/* ---------------------------------------------------------------------- */ + +static void aufs_truncate_range(struct inode *inode __maybe_unused, + loff_t start __maybe_unused, + loff_t end __maybe_unused) +{ + AuUnsupport(); +} + +/* ---------------------------------------------------------------------- */ + +struct inode_operations aufs_symlink_iop = { + .permission = aufs_permission, + .setattr = aufs_setattr, + .getattr = aufs_getattr, + .readlink = aufs_readlink, + .follow_link = aufs_follow_link, + .put_link = aufs_put_link +}; + +struct inode_operations aufs_dir_iop = { + .create = aufs_create, + .lookup = aufs_lookup, + .link = aufs_link, + .unlink = aufs_unlink, + .symlink = aufs_symlink, + .mkdir = aufs_mkdir, + .rmdir = aufs_rmdir, + .mknod = aufs_mknod, + .rename = aufs_rename, + + .permission = aufs_permission, + .setattr = aufs_setattr, + .getattr = aufs_getattr +}; + +struct inode_operations aufs_iop = { + .permission = aufs_permission, + .setattr = aufs_setattr, + .getattr = aufs_getattr, + .truncate_range = aufs_truncate_range +}; diff --git a/ubuntu/aufs/i_op_add.c b/ubuntu/aufs/i_op_add.c new file mode 100644 index 000000000000..d1aef9386ad9 --- /dev/null +++ b/ubuntu/aufs/i_op_add.c @@ -0,0 +1,711 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * inode operations (add entry) + */ + +#include "aufs.h" + +/* + * final procedure of adding a new entry, except link(2). + * remove whiteout, instantiate, copyup the parent dir's times and size + * and update version. + * if it failed, re-create the removed whiteout. + */ +static int epilog(struct inode *dir, aufs_bindex_t bindex, + struct dentry *wh_dentry, struct dentry *dentry) +{ + int err, rerr; + aufs_bindex_t bwh; + struct path h_path; + struct inode *inode, *h_dir; + struct dentry *wh; + + bwh = -1; + if (wh_dentry) { + h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ + IMustLock(h_dir); + AuDebugOn(au_h_iptr(dir, bindex) != h_dir); + bwh = au_dbwh(dentry); + h_path.dentry = wh_dentry; + h_path.mnt = au_sbr_mnt(dir->i_sb, bindex); + err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, + dentry); + if (unlikely(err)) + goto out; + } + + inode = au_new_inode(dentry, /*must_new*/1); + if (!IS_ERR(inode)) { + d_instantiate(dentry, inode); + dir = dentry->d_parent->d_inode; /* dir inode is locked */ + IMustLock(dir); + if (au_ibstart(dir) == au_dbstart(dentry)) + au_cpup_attr_timesizes(dir); + dir->i_version++; + return 0; /* success */ + } + + err = PTR_ERR(inode); + if (!wh_dentry) + goto out; + + /* revert */ + /* dir inode is locked */ + wh = au_wh_create(dentry, bwh, wh_dentry->d_parent); + rerr = PTR_ERR(wh); + if (IS_ERR(wh)) { + AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", + AuDLNPair(dentry), err, rerr); + err = -EIO; + } else + dput(wh); + +out: + return err; +} + +static int au_d_may_add(struct dentry *dentry) +{ + int err; + + err = 0; + if (unlikely(d_unhashed(dentry))) + err = -ENOENT; + if (unlikely(dentry->d_inode)) + err = -EEXIST; + return err; +} + +/* + * simple tests for the adding inode operations. + * following the checks in vfs, plus the parent-child relationship. + */ +int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, + struct dentry *h_parent, int isdir) +{ + int err; + umode_t h_mode; + struct dentry *h_dentry; + struct inode *h_inode; + + err = -ENAMETOOLONG; + if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) + goto out; + + h_dentry = au_h_dptr(dentry, bindex); + h_inode = h_dentry->d_inode; + if (!dentry->d_inode) { + err = -EEXIST; + if (unlikely(h_inode)) + goto out; + } else { + /* rename(2) case */ + err = -EIO; + if (unlikely(!h_inode || !h_inode->i_nlink)) + goto out; + + h_mode = h_inode->i_mode; + if (!isdir) { + err = -EISDIR; + if (unlikely(S_ISDIR(h_mode))) + goto out; + } else if (unlikely(!S_ISDIR(h_mode))) { + err = -ENOTDIR; + goto out; + } + } + + err = 0; + /* expected parent dir is locked */ + if (unlikely(h_parent != h_dentry->d_parent)) + err = -EIO; + +out: + AuTraceErr(err); + return err; +} + +/* + * initial procedure of adding a new entry. + * prepare writable branch and the parent dir, lock it, + * and lookup whiteout for the new entry. + */ +static struct dentry* +lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt, + struct dentry *src_dentry, struct au_pin *pin, + struct au_wr_dir_args *wr_dir_args) +{ + struct dentry *wh_dentry, *h_parent; + struct super_block *sb; + struct au_branch *br; + int err; + unsigned int udba; + aufs_bindex_t bcpup; + + AuDbg("%.*s\n", AuDLNPair(dentry)); + + err = au_wr_dir(dentry, src_dentry, wr_dir_args); + bcpup = err; + wh_dentry = ERR_PTR(err); + if (unlikely(err < 0)) + goto out; + + sb = dentry->d_sb; + udba = au_opt_udba(sb); + err = au_pin(pin, dentry, bcpup, udba, + AuPin_DI_LOCKED | AuPin_MNT_WRITE); + wh_dentry = ERR_PTR(err); + if (unlikely(err)) + goto out; + + h_parent = au_pinned_h_parent(pin); + if (udba != AuOpt_UDBA_NONE + && au_dbstart(dentry) == bcpup) + err = au_may_add(dentry, bcpup, h_parent, + au_ftest_wrdir(wr_dir_args->flags, ISDIR)); + else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) + err = -ENAMETOOLONG; + wh_dentry = ERR_PTR(err); + if (unlikely(err)) + goto out_unpin; + + br = au_sbr(sb, bcpup); + if (dt) { + struct path tmp = { + .dentry = h_parent, + .mnt = br->br_mnt + }; + au_dtime_store(dt, au_pinned_parent(pin), &tmp); + } + + wh_dentry = NULL; + if (bcpup != au_dbwh(dentry)) + goto out; /* success */ + + wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br); + +out_unpin: + if (IS_ERR(wh_dentry)) + au_unpin(pin); +out: + return wh_dentry; +} + +/* ---------------------------------------------------------------------- */ + +enum { Mknod, Symlink, Creat }; +struct simple_arg { + int type; + union { + struct { + int mode; + struct nameidata *nd; + } c; + struct { + const char *symname; + } s; + struct { + int mode; + dev_t dev; + } m; + } u; +}; + +static int add_simple(struct inode *dir, struct dentry *dentry, + struct simple_arg *arg) +{ + int err; + aufs_bindex_t bstart; + unsigned char created; + struct au_dtime dt; + struct au_pin pin; + struct path h_path; + struct dentry *wh_dentry, *parent; + struct inode *h_dir; + struct au_wr_dir_args wr_dir_args = { + .force_btgt = -1, + .flags = AuWrDir_ADD_ENTRY + }; + + AuDbg("%.*s\n", AuDLNPair(dentry)); + IMustLock(dir); + + parent = dentry->d_parent; /* dir inode is locked */ + err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); + if (unlikely(err)) + goto out; + err = au_d_may_add(dentry); + if (unlikely(err)) + goto out_unlock; + di_write_lock_parent(parent); + wh_dentry = lock_hdir_lkup_wh(dentry, &dt, /*src_dentry*/NULL, &pin, + &wr_dir_args); + err = PTR_ERR(wh_dentry); + if (IS_ERR(wh_dentry)) + goto out_parent; + + bstart = au_dbstart(dentry); + h_path.dentry = au_h_dptr(dentry, bstart); + h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); + h_dir = au_pinned_h_dir(&pin); + switch (arg->type) { + case Creat: + err = vfsub_create(h_dir, &h_path, arg->u.c.mode); + break; + case Symlink: + err = vfsub_symlink(h_dir, &h_path, arg->u.s.symname); + break; + case Mknod: + err = vfsub_mknod(h_dir, &h_path, arg->u.m.mode, arg->u.m.dev); + break; + default: + BUG(); + } + created = !err; + if (!err) + err = epilog(dir, bstart, wh_dentry, dentry); + + /* revert */ + if (unlikely(created && err && h_path.dentry->d_inode)) { + int rerr; + rerr = vfsub_unlink(h_dir, &h_path, /*force*/0); + if (rerr) { + AuIOErr("%.*s revert failure(%d, %d)\n", + AuDLNPair(dentry), err, rerr); + err = -EIO; + } + au_dtime_revert(&dt); + } + + au_unpin(&pin); + dput(wh_dentry); + +out_parent: + di_write_unlock(parent); +out_unlock: + if (unlikely(err)) { + au_update_dbstart(dentry); + d_drop(dentry); + } + aufs_read_unlock(dentry, AuLock_DW); +out: + return err; +} + +int aufs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct simple_arg arg = { + .type = Mknod, + .u.m = { + .mode = mode, + .dev = dev + } + }; + return add_simple(dir, dentry, &arg); +} + +int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + struct simple_arg arg = { + .type = Symlink, + .u.s.symname = symname + }; + return add_simple(dir, dentry, &arg); +} + +int aufs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct simple_arg arg = { + .type = Creat, + .u.c = { + .mode = mode, + .nd = nd + } + }; + return add_simple(dir, dentry, &arg); +} + +/* ---------------------------------------------------------------------- */ + +struct au_link_args { + aufs_bindex_t bdst, bsrc; + struct au_pin pin; + struct path h_path; + struct dentry *src_parent, *parent; +}; + +static int au_cpup_before_link(struct dentry *src_dentry, + struct au_link_args *a) +{ + int err; + struct dentry *h_src_dentry; + struct mutex *h_mtx; + struct file *h_file; + + di_read_lock_parent(a->src_parent, AuLock_IR); + err = au_test_and_cpup_dirs(src_dentry, a->bdst); + if (unlikely(err)) + goto out; + + h_src_dentry = au_h_dptr(src_dentry, a->bsrc); + h_mtx = &h_src_dentry->d_inode->i_mutex; + err = au_pin(&a->pin, src_dentry, a->bdst, + au_opt_udba(src_dentry->d_sb), + AuPin_DI_LOCKED | AuPin_MNT_WRITE); + if (unlikely(err)) + goto out; + mutex_lock_nested(h_mtx, AuLsc_I_CHILD); + h_file = au_h_open_pre(src_dentry, a->bsrc); + if (IS_ERR(h_file)) { + err = PTR_ERR(h_file); + h_file = NULL; + } else + err = au_sio_cpup_simple(src_dentry, a->bdst, -1, + AuCpup_DTIME /* | AuCpup_KEEPLINO */); + mutex_unlock(h_mtx); + au_h_open_post(src_dentry, a->bsrc, h_file); + au_unpin(&a->pin); + +out: + di_read_unlock(a->src_parent, AuLock_IR); + return err; +} + +static int au_cpup_or_link(struct dentry *src_dentry, struct au_link_args *a) +{ + int err; + unsigned char plink; + struct inode *h_inode, *inode; + struct dentry *h_src_dentry; + struct super_block *sb; + struct file *h_file; + + plink = 0; + h_inode = NULL; + sb = src_dentry->d_sb; + inode = src_dentry->d_inode; + if (au_ibstart(inode) <= a->bdst) + h_inode = au_h_iptr(inode, a->bdst); + if (!h_inode || !h_inode->i_nlink) { + /* copyup src_dentry as the name of dentry. */ + au_set_dbstart(src_dentry, a->bdst); + au_set_h_dptr(src_dentry, a->bdst, dget(a->h_path.dentry)); + h_inode = au_h_dptr(src_dentry, a->bsrc)->d_inode; + mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); + h_file = au_h_open_pre(src_dentry, a->bsrc); + if (IS_ERR(h_file)) { + err = PTR_ERR(h_file); + h_file = NULL; + } else + err = au_sio_cpup_single(src_dentry, a->bdst, a->bsrc, + -1, AuCpup_KEEPLINO, + a->parent); + mutex_unlock(&h_inode->i_mutex); + au_h_open_post(src_dentry, a->bsrc, h_file); + au_set_h_dptr(src_dentry, a->bdst, NULL); + au_set_dbstart(src_dentry, a->bsrc); + } else { + /* the inode of src_dentry already exists on a.bdst branch */ + h_src_dentry = d_find_alias(h_inode); + if (!h_src_dentry && au_plink_test(inode)) { + plink = 1; + h_src_dentry = au_plink_lkup(inode, a->bdst); + err = PTR_ERR(h_src_dentry); + if (IS_ERR(h_src_dentry)) + goto out; + + if (unlikely(!h_src_dentry->d_inode)) { + dput(h_src_dentry); + h_src_dentry = NULL; + } + + } + if (h_src_dentry) { + err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), + &a->h_path); + dput(h_src_dentry); + } else { + AuIOErr("no dentry found for hi%lu on b%d\n", + h_inode->i_ino, a->bdst); + err = -EIO; + } + } + + if (!err && !plink) + au_plink_append(inode, a->bdst, a->h_path.dentry); + +out: + AuTraceErr(err); + return err; +} + +int aufs_link(struct dentry *src_dentry, struct inode *dir, + struct dentry *dentry) +{ + int err, rerr; + struct au_dtime dt; + struct au_link_args *a; + struct dentry *wh_dentry, *h_src_dentry; + struct inode *inode; + struct super_block *sb; + struct au_wr_dir_args wr_dir_args = { + /* .force_btgt = -1, */ + .flags = AuWrDir_ADD_ENTRY + }; + + IMustLock(dir); + inode = src_dentry->d_inode; + IMustLock(inode); + + err = -ENOMEM; + a = kzalloc(sizeof(*a), GFP_NOFS); + if (unlikely(!a)) + goto out; + + a->parent = dentry->d_parent; /* dir inode is locked */ + err = aufs_read_and_write_lock2(dentry, src_dentry, + AuLock_NOPLM | AuLock_GEN); + if (unlikely(err)) + goto out_kfree; + err = au_d_hashed_positive(src_dentry); + if (unlikely(err)) + goto out_unlock; + err = au_d_may_add(dentry); + if (unlikely(err)) + goto out_unlock; + + a->src_parent = dget_parent(src_dentry); + wr_dir_args.force_btgt = au_ibstart(inode); + + di_write_lock_parent(a->parent); + wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt); + wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin, + &wr_dir_args); + err = PTR_ERR(wh_dentry); + if (IS_ERR(wh_dentry)) + goto out_parent; + + err = 0; + sb = dentry->d_sb; + a->bdst = au_dbstart(dentry); + a->h_path.dentry = au_h_dptr(dentry, a->bdst); + a->h_path.mnt = au_sbr_mnt(sb, a->bdst); + a->bsrc = au_ibstart(inode); + h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); + if (!h_src_dentry) { + a->bsrc = au_dbstart(src_dentry); + h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); + AuDebugOn(!h_src_dentry); + } else if (IS_ERR(h_src_dentry)) + goto out_parent; + + if (au_opt_test(au_mntflags(sb), PLINK)) { + if (a->bdst < a->bsrc + /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) + err = au_cpup_or_link(src_dentry, a); + else + err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), + &a->h_path); + dput(h_src_dentry); + } else { + /* + * copyup src_dentry to the branch we process, + * and then link(2) to it. + */ + dput(h_src_dentry); + if (a->bdst < a->bsrc + /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) { + au_unpin(&a->pin); + di_write_unlock(a->parent); + err = au_cpup_before_link(src_dentry, a); + di_write_lock_parent(a->parent); + if (!err) + err = au_pin(&a->pin, dentry, a->bdst, + au_opt_udba(sb), + AuPin_DI_LOCKED | AuPin_MNT_WRITE); + if (unlikely(err)) + goto out_wh; + } + if (!err) { + h_src_dentry = au_h_dptr(src_dentry, a->bdst); + err = -ENOENT; + if (h_src_dentry && h_src_dentry->d_inode) + err = vfsub_link(h_src_dentry, + au_pinned_h_dir(&a->pin), + &a->h_path); + } + } + if (unlikely(err)) + goto out_unpin; + + if (wh_dentry) { + a->h_path.dentry = wh_dentry; + err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path, + dentry); + if (unlikely(err)) + goto out_revert; + } + + dir->i_version++; + if (au_ibstart(dir) == au_dbstart(dentry)) + au_cpup_attr_timesizes(dir); + inc_nlink(inode); + inode->i_ctime = dir->i_ctime; + d_instantiate(dentry, au_igrab(inode)); + if (d_unhashed(a->h_path.dentry)) + /* some filesystem calls d_drop() */ + d_drop(dentry); + goto out_unpin; /* success */ + +out_revert: + rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path, /*force*/0); + if (unlikely(rerr)) { + AuIOErr("%.*s reverting failed(%d, %d)\n", + AuDLNPair(dentry), err, rerr); + err = -EIO; + } + au_dtime_revert(&dt); +out_unpin: + au_unpin(&a->pin); +out_wh: + dput(wh_dentry); +out_parent: + di_write_unlock(a->parent); + dput(a->src_parent); +out_unlock: + if (unlikely(err)) { + au_update_dbstart(dentry); + d_drop(dentry); + } + aufs_read_and_write_unlock2(dentry, src_dentry); +out_kfree: + kfree(a); +out: + return err; +} + +int aufs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int err, rerr; + aufs_bindex_t bindex; + unsigned char diropq; + struct path h_path; + struct dentry *wh_dentry, *parent, *opq_dentry; + struct mutex *h_mtx; + struct super_block *sb; + struct { + struct au_pin pin; + struct au_dtime dt; + } *a; /* reduce the stack usage */ + struct au_wr_dir_args wr_dir_args = { + .force_btgt = -1, + .flags = AuWrDir_ADD_ENTRY | AuWrDir_ISDIR + }; + + IMustLock(dir); + + err = -ENOMEM; + a = kmalloc(sizeof(*a), GFP_NOFS); + if (unlikely(!a)) + goto out; + + err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); + if (unlikely(err)) + goto out_free; + err = au_d_may_add(dentry); + if (unlikely(err)) + goto out_unlock; + + parent = dentry->d_parent; /* dir inode is locked */ + di_write_lock_parent(parent); + wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, + &a->pin, &wr_dir_args); + err = PTR_ERR(wh_dentry); + if (IS_ERR(wh_dentry)) + goto out_parent; + + sb = dentry->d_sb; + bindex = au_dbstart(dentry); + h_path.dentry = au_h_dptr(dentry, bindex); + h_path.mnt = au_sbr_mnt(sb, bindex); + err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode); + if (unlikely(err)) + goto out_unpin; + + /* make the dir opaque */ + diropq = 0; + h_mtx = &h_path.dentry->d_inode->i_mutex; + if (wh_dentry + || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) { + mutex_lock_nested(h_mtx, AuLsc_I_CHILD); + opq_dentry = au_diropq_create(dentry, bindex); + mutex_unlock(h_mtx); + err = PTR_ERR(opq_dentry); + if (IS_ERR(opq_dentry)) + goto out_dir; + dput(opq_dentry); + diropq = 1; + } + + err = epilog(dir, bindex, wh_dentry, dentry); + if (!err) { + inc_nlink(dir); + goto out_unpin; /* success */ + } + + /* revert */ + if (diropq) { + AuLabel(revert opq); + mutex_lock_nested(h_mtx, AuLsc_I_CHILD); + rerr = au_diropq_remove(dentry, bindex); + mutex_unlock(h_mtx); + if (rerr) { + AuIOErr("%.*s reverting diropq failed(%d, %d)\n", + AuDLNPair(dentry), err, rerr); + err = -EIO; + } + } + +out_dir: + AuLabel(revert dir); + rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path); + if (rerr) { + AuIOErr("%.*s reverting dir failed(%d, %d)\n", + AuDLNPair(dentry), err, rerr); + err = -EIO; + } + au_dtime_revert(&a->dt); +out_unpin: + au_unpin(&a->pin); + dput(wh_dentry); +out_parent: + di_write_unlock(parent); +out_unlock: + if (unlikely(err)) { + au_update_dbstart(dentry); + d_drop(dentry); + } + aufs_read_unlock(dentry, AuLock_DW); +out_free: + kfree(a); +out: + return err; +} diff --git a/ubuntu/aufs/i_op_del.c b/ubuntu/aufs/i_op_del.c new file mode 100644 index 000000000000..309d936f7432 --- /dev/null +++ b/ubuntu/aufs/i_op_del.c @@ -0,0 +1,478 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * inode operations (del entry) + */ + +#include "aufs.h" + +/* + * decide if a new whiteout for @dentry is necessary or not. + * when it is necessary, prepare the parent dir for the upper branch whose + * branch index is @bcpup for creation. the actual creation of the whiteout will + * be done by caller. + * return value: + * 0: wh is unnecessary + * plus: wh is necessary + * minus: error + */ +int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup) +{ + int need_wh, err; + aufs_bindex_t bstart; + struct super_block *sb; + + sb = dentry->d_sb; + bstart = au_dbstart(dentry); + if (*bcpup < 0) { + *bcpup = bstart; + if (au_test_ro(sb, bstart, dentry->d_inode)) { + err = AuWbrCopyup(au_sbi(sb), dentry); + *bcpup = err; + if (unlikely(err < 0)) + goto out; + } + } else + AuDebugOn(bstart < *bcpup + || au_test_ro(sb, *bcpup, dentry->d_inode)); + AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart); + + if (*bcpup != bstart) { + err = au_cpup_dirs(dentry, *bcpup); + if (unlikely(err)) + goto out; + need_wh = 1; + } else { + struct au_dinfo *dinfo, *tmp; + + need_wh = -ENOMEM; + dinfo = au_di(dentry); + tmp = au_di_alloc(sb, AuLsc_DI_TMP); + if (tmp) { + au_di_cp(tmp, dinfo); + au_di_swap(tmp, dinfo); + /* returns the number of positive dentries */ + need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0, + /*nd*/NULL); + au_di_swap(tmp, dinfo); + au_rw_write_unlock(&tmp->di_rwsem); + au_di_free(tmp); + } + } + AuDbg("need_wh %d\n", need_wh); + err = need_wh; + +out: + return err; +} + +/* + * simple tests for the del-entry operations. + * following the checks in vfs, plus the parent-child relationship. + */ +int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, + struct dentry *h_parent, int isdir) +{ + int err; + umode_t h_mode; + struct dentry *h_dentry, *h_latest; + struct inode *h_inode; + + h_dentry = au_h_dptr(dentry, bindex); + h_inode = h_dentry->d_inode; + if (dentry->d_inode) { + err = -ENOENT; + if (unlikely(!h_inode || !h_inode->i_nlink)) + goto out; + + h_mode = h_inode->i_mode; + if (!isdir) { + err = -EISDIR; + if (unlikely(S_ISDIR(h_mode))) + goto out; + } else if (unlikely(!S_ISDIR(h_mode))) { + err = -ENOTDIR; + goto out; + } + } else { + /* rename(2) case */ + err = -EIO; + if (unlikely(h_inode)) + goto out; + } + + err = -ENOENT; + /* expected parent dir is locked */ + if (unlikely(h_parent != h_dentry->d_parent)) + goto out; + err = 0; + + /* + * rmdir a dir may break the consistency on some filesystem. + * let's try heavy test. + */ + err = -EACCES; + if (unlikely(au_test_h_perm(h_parent->d_inode, MAY_EXEC | MAY_WRITE))) + goto out; + + h_latest = au_sio_lkup_one(&dentry->d_name, h_parent, + au_sbr(dentry->d_sb, bindex)); + err = -EIO; + if (IS_ERR(h_latest)) + goto out; + if (h_latest == h_dentry) + err = 0; + dput(h_latest); + +out: + return err; +} + +/* + * decide the branch where we operate for @dentry. the branch index will be set + * @rbcpup. after diciding it, 'pin' it and store the timestamps of the parent + * dir for reverting. + * when a new whiteout is necessary, create it. + */ +static struct dentry* +lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup, + struct au_dtime *dt, struct au_pin *pin) +{ + struct dentry *wh_dentry; + struct super_block *sb; + struct path h_path; + int err, need_wh; + unsigned int udba; + aufs_bindex_t bcpup; + + need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup); + wh_dentry = ERR_PTR(need_wh); + if (unlikely(need_wh < 0)) + goto out; + + sb = dentry->d_sb; + udba = au_opt_udba(sb); + bcpup = *rbcpup; + err = au_pin(pin, dentry, bcpup, udba, + AuPin_DI_LOCKED | AuPin_MNT_WRITE); + wh_dentry = ERR_PTR(err); + if (unlikely(err)) + goto out; + + h_path.dentry = au_pinned_h_parent(pin); + if (udba != AuOpt_UDBA_NONE + && au_dbstart(dentry) == bcpup) { + err = au_may_del(dentry, bcpup, h_path.dentry, isdir); + wh_dentry = ERR_PTR(err); + if (unlikely(err)) + goto out_unpin; + } + + h_path.mnt = au_sbr_mnt(sb, bcpup); + au_dtime_store(dt, au_pinned_parent(pin), &h_path); + wh_dentry = NULL; + if (!need_wh) + goto out; /* success, no need to create whiteout */ + + wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry); + if (IS_ERR(wh_dentry)) + goto out_unpin; + + /* returns with the parent is locked and wh_dentry is dget-ed */ + goto out; /* success */ + +out_unpin: + au_unpin(pin); +out: + return wh_dentry; +} + +/* + * when removing a dir, rename it to a unique temporary whiteout-ed name first + * in order to be revertible and save time for removing many child whiteouts + * under the dir. + * returns 1 when there are too many child whiteout and caller should remove + * them asynchronously. returns 0 when the number of children is enough small to + * remove now or the branch fs is a remote fs. + * otherwise return an error. + */ +static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex, + struct au_nhash *whlist, struct inode *dir) +{ + int rmdir_later, err, dirwh; + struct dentry *h_dentry; + struct super_block *sb; + + sb = dentry->d_sb; + SiMustAnyLock(sb); + h_dentry = au_h_dptr(dentry, bindex); + err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex)); + if (unlikely(err)) + goto out; + + /* stop monitoring */ + au_hn_free(au_hi(dentry->d_inode, bindex)); + + if (!au_test_fs_remote(h_dentry->d_sb)) { + dirwh = au_sbi(sb)->si_dirwh; + rmdir_later = (dirwh <= 1); + if (!rmdir_later) + rmdir_later = au_nhash_test_longer_wh(whlist, bindex, + dirwh); + if (rmdir_later) + return rmdir_later; + } + + err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist); + if (unlikely(err)) { + AuIOErr("rmdir %.*s, b%d failed, %d. ignored\n", + AuDLNPair(h_dentry), bindex, err); + err = 0; + } + +out: + AuTraceErr(err); + return err; +} + +/* + * final procedure for deleting a entry. + * maintain dentry and iattr. + */ +static void epilog(struct inode *dir, struct dentry *dentry, + aufs_bindex_t bindex) +{ + struct inode *inode; + + inode = dentry->d_inode; + d_drop(dentry); + inode->i_ctime = dir->i_ctime; + + if (au_ibstart(dir) == bindex) + au_cpup_attr_timesizes(dir); + dir->i_version++; +} + +/* + * when an error happened, remove the created whiteout and revert everything. + */ +static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex, + aufs_bindex_t bwh, struct dentry *wh_dentry, + struct dentry *dentry, struct au_dtime *dt) +{ + int rerr; + struct path h_path = { + .dentry = wh_dentry, + .mnt = au_sbr_mnt(dir->i_sb, bindex) + }; + + rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry); + if (!rerr) { + au_set_dbwh(dentry, bwh); + au_dtime_revert(dt); + return 0; + } + + AuIOErr("%.*s reverting whiteout failed(%d, %d)\n", + AuDLNPair(dentry), err, rerr); + return -EIO; +} + +/* ---------------------------------------------------------------------- */ + +int aufs_unlink(struct inode *dir, struct dentry *dentry) +{ + int err; + aufs_bindex_t bwh, bindex, bstart; + struct au_dtime dt; + struct au_pin pin; + struct path h_path; + struct inode *inode, *h_dir; + struct dentry *parent, *wh_dentry; + + IMustLock(dir); + + err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); + if (unlikely(err)) + goto out; + err = au_d_hashed_positive(dentry); + if (unlikely(err)) + goto out_unlock; + inode = dentry->d_inode; + IMustLock(inode); + err = -EISDIR; + if (unlikely(S_ISDIR(inode->i_mode))) + goto out_unlock; /* possible? */ + + bstart = au_dbstart(dentry); + bwh = au_dbwh(dentry); + bindex = -1; + parent = dentry->d_parent; /* dir inode is locked */ + di_write_lock_parent(parent); + wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &dt, &pin); + err = PTR_ERR(wh_dentry); + if (IS_ERR(wh_dentry)) + goto out_parent; + + h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); + h_path.dentry = au_h_dptr(dentry, bstart); + dget(h_path.dentry); + if (bindex == bstart) { + h_dir = au_pinned_h_dir(&pin); + err = vfsub_unlink(h_dir, &h_path, /*force*/0); + } else { + /* dir inode is locked */ + h_dir = wh_dentry->d_parent->d_inode; + IMustLock(h_dir); + err = 0; + } + + if (!err) { + vfsub_drop_nlink(inode); + epilog(dir, dentry, bindex); + + /* update target timestamps */ + if (bindex == bstart) { + vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ + inode->i_ctime = h_path.dentry->d_inode->i_ctime; + } else + /* todo: this timestamp may be reverted later */ + inode->i_ctime = h_dir->i_ctime; + goto out_unpin; /* success */ + } + + /* revert */ + if (wh_dentry) { + int rerr; + + rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, &dt); + if (rerr) + err = rerr; + } + +out_unpin: + au_unpin(&pin); + dput(wh_dentry); + dput(h_path.dentry); +out_parent: + di_write_unlock(parent); +out_unlock: + aufs_read_unlock(dentry, AuLock_DW); +out: + return err; +} + +int aufs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err, rmdir_later; + aufs_bindex_t bwh, bindex, bstart; + struct au_dtime dt; + struct au_pin pin; + struct inode *inode; + struct dentry *parent, *wh_dentry, *h_dentry; + struct au_whtmp_rmdir *args; + + IMustLock(dir); + + err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN); + if (unlikely(err)) + goto out; + err = au_alive_dir(dentry); + if (unlikely(err)) + goto out_unlock; + inode = dentry->d_inode; + IMustLock(inode); + err = -ENOTDIR; + if (unlikely(!S_ISDIR(inode->i_mode))) + goto out_unlock; /* possible? */ + + err = -ENOMEM; + args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS); + if (unlikely(!args)) + goto out_unlock; + + parent = dentry->d_parent; /* dir inode is locked */ + di_write_lock_parent(parent); + err = au_test_empty(dentry, &args->whlist); + if (unlikely(err)) + goto out_parent; + + bstart = au_dbstart(dentry); + bwh = au_dbwh(dentry); + bindex = -1; + wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &dt, &pin); + err = PTR_ERR(wh_dentry); + if (IS_ERR(wh_dentry)) + goto out_parent; + + h_dentry = au_h_dptr(dentry, bstart); + dget(h_dentry); + rmdir_later = 0; + if (bindex == bstart) { + err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir); + if (err > 0) { + rmdir_later = err; + err = 0; + } + } else { + /* stop monitoring */ + au_hn_free(au_hi(inode, bstart)); + + /* dir inode is locked */ + IMustLock(wh_dentry->d_parent->d_inode); + err = 0; + } + + if (!err) { + vfsub_dead_dir(inode); + au_set_dbdiropq(dentry, -1); + epilog(dir, dentry, bindex); + + if (rmdir_later) { + au_whtmp_kick_rmdir(dir, bstart, h_dentry, args); + args = NULL; + } + + goto out_unpin; /* success */ + } + + /* revert */ + AuLabel(revert); + if (wh_dentry) { + int rerr; + + rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, &dt); + if (rerr) + err = rerr; + } + +out_unpin: + au_unpin(&pin); + dput(wh_dentry); + dput(h_dentry); +out_parent: + di_write_unlock(parent); + if (args) + au_whtmp_rmdir_free(args); +out_unlock: + aufs_read_unlock(dentry, AuLock_DW); +out: + AuTraceErr(err); + return err; +} diff --git a/ubuntu/aufs/i_op_ren.c b/ubuntu/aufs/i_op_ren.c new file mode 100644 index 000000000000..e8025baa97f6 --- /dev/null +++ b/ubuntu/aufs/i_op_ren.c @@ -0,0 +1,1017 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * inode operation (rename entry) + * todo: this is crazy monster + */ + +#include "aufs.h" + +enum { AuSRC, AuDST, AuSrcDst }; +enum { AuPARENT, AuCHILD, AuParentChild }; + +#define AuRen_ISDIR 1 +#define AuRen_ISSAMEDIR (1 << 1) +#define AuRen_WHSRC (1 << 2) +#define AuRen_WHDST (1 << 3) +#define AuRen_MNT_WRITE (1 << 4) +#define AuRen_DT_DSTDIR (1 << 5) +#define AuRen_DIROPQ (1 << 6) +#define AuRen_CPUP (1 << 7) +#define au_ftest_ren(flags, name) ((flags) & AuRen_##name) +#define au_fset_ren(flags, name) \ + do { (flags) |= AuRen_##name; } while (0) +#define au_fclr_ren(flags, name) \ + do { (flags) &= ~AuRen_##name; } while (0) + +struct au_ren_args { + struct { + struct dentry *dentry, *h_dentry, *parent, *h_parent, + *wh_dentry; + struct inode *dir, *inode; + struct au_hinode *hdir; + struct au_dtime dt[AuParentChild]; + aufs_bindex_t bstart; + } sd[AuSrcDst]; + +#define src_dentry sd[AuSRC].dentry +#define src_dir sd[AuSRC].dir +#define src_inode sd[AuSRC].inode +#define src_h_dentry sd[AuSRC].h_dentry +#define src_parent sd[AuSRC].parent +#define src_h_parent sd[AuSRC].h_parent +#define src_wh_dentry sd[AuSRC].wh_dentry +#define src_hdir sd[AuSRC].hdir +#define src_h_dir sd[AuSRC].hdir->hi_inode +#define src_dt sd[AuSRC].dt +#define src_bstart sd[AuSRC].bstart + +#define dst_dentry sd[AuDST].dentry +#define dst_dir sd[AuDST].dir +#define dst_inode sd[AuDST].inode +#define dst_h_dentry sd[AuDST].h_dentry +#define dst_parent sd[AuDST].parent +#define dst_h_parent sd[AuDST].h_parent +#define dst_wh_dentry sd[AuDST].wh_dentry +#define dst_hdir sd[AuDST].hdir +#define dst_h_dir sd[AuDST].hdir->hi_inode +#define dst_dt sd[AuDST].dt +#define dst_bstart sd[AuDST].bstart + + struct dentry *h_trap; + struct au_branch *br; + struct au_hinode *src_hinode; + struct path h_path; + struct au_nhash whlist; + aufs_bindex_t btgt, src_bwh, src_bdiropq; + + unsigned int flags; + + struct au_whtmp_rmdir *thargs; + struct dentry *h_dst; +}; + +/* ---------------------------------------------------------------------- */ + +/* + * functions for reverting. + * when an error happened in a single rename systemcall, we should revert + * everything as if nothing happend. + * we don't need to revert the copied-up/down the parent dir since they are + * harmless. + */ + +#define RevertFailure(fmt, ...) do { \ + AuIOErr("revert failure: " fmt " (%d, %d)\n", \ + ##__VA_ARGS__, err, rerr); \ + err = -EIO; \ +} while (0) + +static void au_ren_rev_diropq(int err, struct au_ren_args *a) +{ + int rerr; + + au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); + rerr = au_diropq_remove(a->src_dentry, a->btgt); + au_hn_imtx_unlock(a->src_hinode); + au_set_dbdiropq(a->src_dentry, a->src_bdiropq); + if (rerr) + RevertFailure("remove diropq %.*s", AuDLNPair(a->src_dentry)); +} + +static void au_ren_rev_rename(int err, struct au_ren_args *a) +{ + int rerr; + + a->h_path.dentry = au_lkup_one(&a->src_dentry->d_name, a->src_h_parent, + a->br, /*nd*/NULL); + rerr = PTR_ERR(a->h_path.dentry); + if (IS_ERR(a->h_path.dentry)) { + RevertFailure("au_lkup_one %.*s", AuDLNPair(a->src_dentry)); + return; + } + + rerr = vfsub_rename(a->dst_h_dir, + au_h_dptr(a->src_dentry, a->btgt), + a->src_h_dir, &a->h_path); + d_drop(a->h_path.dentry); + dput(a->h_path.dentry); + /* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */ + if (rerr) + RevertFailure("rename %.*s", AuDLNPair(a->src_dentry)); +} + +static void au_ren_rev_cpup(int err, struct au_ren_args *a) +{ + int rerr; + + a->h_path.dentry = a->dst_h_dentry; + rerr = vfsub_unlink(a->dst_h_dir, &a->h_path, /*force*/0); + au_set_h_dptr(a->src_dentry, a->btgt, NULL); + au_set_dbstart(a->src_dentry, a->src_bstart); + if (rerr) + RevertFailure("unlink %.*s", AuDLNPair(a->dst_h_dentry)); +} + +static void au_ren_rev_whtmp(int err, struct au_ren_args *a) +{ + int rerr; + + a->h_path.dentry = au_lkup_one(&a->dst_dentry->d_name, a->dst_h_parent, + a->br, /*nd*/NULL); + rerr = PTR_ERR(a->h_path.dentry); + if (IS_ERR(a->h_path.dentry)) { + RevertFailure("lookup %.*s", AuDLNPair(a->dst_dentry)); + return; + } + if (a->h_path.dentry->d_inode) { + d_drop(a->h_path.dentry); + dput(a->h_path.dentry); + return; + } + + rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path); + d_drop(a->h_path.dentry); + dput(a->h_path.dentry); + if (!rerr) + au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst)); + else + RevertFailure("rename %.*s", AuDLNPair(a->h_dst)); +} + +static void au_ren_rev_whsrc(int err, struct au_ren_args *a) +{ + int rerr; + + a->h_path.dentry = a->src_wh_dentry; + rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry); + au_set_dbwh(a->src_dentry, a->src_bwh); + if (rerr) + RevertFailure("unlink %.*s", AuDLNPair(a->src_wh_dentry)); +} +#undef RevertFailure + +/* ---------------------------------------------------------------------- */ + +/* + * when we have to copyup the renaming entry, do it with the rename-target name + * in order to minimize the cost (the later actual rename is unnecessary). + * otherwise rename it on the target branch. + */ +static int au_ren_or_cpup(struct au_ren_args *a) +{ + int err; + struct dentry *d; + + d = a->src_dentry; + if (au_dbstart(d) == a->btgt) { + a->h_path.dentry = a->dst_h_dentry; + if (au_ftest_ren(a->flags, DIROPQ) + && au_dbdiropq(d) == a->btgt) + au_fclr_ren(a->flags, DIROPQ); + AuDebugOn(au_dbstart(d) != a->btgt); + err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt), + a->dst_h_dir, &a->h_path); + } else { + struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; + struct file *h_file; + + au_fset_ren(a->flags, CPUP); + mutex_lock_nested(h_mtx, AuLsc_I_CHILD); + au_set_dbstart(d, a->btgt); + au_set_h_dptr(d, a->btgt, dget(a->dst_h_dentry)); + h_file = au_h_open_pre(d, a->src_bstart); + if (IS_ERR(h_file)) { + err = PTR_ERR(h_file); + h_file = NULL; + } else + err = au_sio_cpup_single(d, a->btgt, a->src_bstart, -1, + !AuCpup_DTIME, a->dst_parent); + mutex_unlock(h_mtx); + au_h_open_post(d, a->src_bstart, h_file); + if (!err) { + d = a->dst_dentry; + au_set_h_dptr(d, a->btgt, NULL); + au_update_dbstart(d); + } else { + au_set_h_dptr(d, a->btgt, NULL); + au_set_dbstart(d, a->src_bstart); + } + } + if (!err && a->h_dst) + /* it will be set to dinfo later */ + dget(a->h_dst); + + return err; +} + +/* cf. aufs_rmdir() */ +static int au_ren_del_whtmp(struct au_ren_args *a) +{ + int err; + struct inode *dir; + + dir = a->dst_dir; + SiMustAnyLock(dir->i_sb); + if (!au_nhash_test_longer_wh(&a->whlist, a->btgt, + au_sbi(dir->i_sb)->si_dirwh) + || au_test_fs_remote(a->h_dst->d_sb)) { + err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist); + if (unlikely(err)) + pr_warning("failed removing whtmp dir %.*s (%d), " + "ignored.\n", AuDLNPair(a->h_dst), err); + } else { + au_nhash_wh_free(&a->thargs->whlist); + a->thargs->whlist = a->whlist; + a->whlist.nh_num = 0; + au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs); + dput(a->h_dst); + a->thargs = NULL; + } + + return 0; +} + +/* make it 'opaque' dir. */ +static int au_ren_diropq(struct au_ren_args *a) +{ + int err; + struct dentry *diropq; + + err = 0; + a->src_bdiropq = au_dbdiropq(a->src_dentry); + a->src_hinode = au_hi(a->src_inode, a->btgt); + au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); + diropq = au_diropq_create(a->src_dentry, a->btgt); + au_hn_imtx_unlock(a->src_hinode); + if (IS_ERR(diropq)) + err = PTR_ERR(diropq); + dput(diropq); + + return err; +} + +static int do_rename(struct au_ren_args *a) +{ + int err; + struct dentry *d, *h_d; + + /* prepare workqueue args for asynchronous rmdir */ + h_d = a->dst_h_dentry; + if (au_ftest_ren(a->flags, ISDIR) && h_d->d_inode) { + err = -ENOMEM; + a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS); + if (unlikely(!a->thargs)) + goto out; + a->h_dst = dget(h_d); + } + + /* create whiteout for src_dentry */ + if (au_ftest_ren(a->flags, WHSRC)) { + a->src_bwh = au_dbwh(a->src_dentry); + AuDebugOn(a->src_bwh >= 0); + a->src_wh_dentry + = au_wh_create(a->src_dentry, a->btgt, a->src_h_parent); + err = PTR_ERR(a->src_wh_dentry); + if (IS_ERR(a->src_wh_dentry)) + goto out_thargs; + } + + /* lookup whiteout for dentry */ + if (au_ftest_ren(a->flags, WHDST)) { + h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name, + a->br); + err = PTR_ERR(h_d); + if (IS_ERR(h_d)) + goto out_whsrc; + if (!h_d->d_inode) + dput(h_d); + else + a->dst_wh_dentry = h_d; + } + + /* rename dentry to tmpwh */ + if (a->thargs) { + err = au_whtmp_ren(a->dst_h_dentry, a->br); + if (unlikely(err)) + goto out_whdst; + + d = a->dst_dentry; + au_set_h_dptr(d, a->btgt, NULL); + err = au_lkup_neg(d, a->btgt); + if (unlikely(err)) + goto out_whtmp; + a->dst_h_dentry = au_h_dptr(d, a->btgt); + } + + /* cpup src */ + if (a->dst_h_dentry->d_inode && a->src_bstart != a->btgt) { + struct mutex *h_mtx = &a->src_h_dentry->d_inode->i_mutex; + struct file *h_file; + + mutex_lock_nested(h_mtx, AuLsc_I_CHILD); + AuDebugOn(au_dbstart(a->src_dentry) != a->src_bstart); + h_file = au_h_open_pre(a->src_dentry, a->src_bstart); + if (IS_ERR(h_file)) { + err = PTR_ERR(h_file); + h_file = NULL; + } else + err = au_sio_cpup_simple(a->src_dentry, a->btgt, -1, + !AuCpup_DTIME); + mutex_unlock(h_mtx); + au_h_open_post(a->src_dentry, a->src_bstart, h_file); + if (unlikely(err)) + goto out_whtmp; + } + + /* rename by vfs_rename or cpup */ + d = a->dst_dentry; + if (au_ftest_ren(a->flags, ISDIR) + && (a->dst_wh_dentry + || au_dbdiropq(d) == a->btgt + /* hide the lower to keep xino */ + || a->btgt < au_dbend(d) + || au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ))) + au_fset_ren(a->flags, DIROPQ); + err = au_ren_or_cpup(a); + if (unlikely(err)) + /* leave the copied-up one */ + goto out_whtmp; + + /* make dir opaque */ + if (au_ftest_ren(a->flags, DIROPQ)) { + err = au_ren_diropq(a); + if (unlikely(err)) + goto out_rename; + } + + /* update target timestamps */ + AuDebugOn(au_dbstart(a->src_dentry) != a->btgt); + a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt); + vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/ + a->src_inode->i_ctime = a->h_path.dentry->d_inode->i_ctime; + + /* remove whiteout for dentry */ + if (a->dst_wh_dentry) { + a->h_path.dentry = a->dst_wh_dentry; + err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path, + a->dst_dentry); + if (unlikely(err)) + goto out_diropq; + } + + /* remove whtmp */ + if (a->thargs) + au_ren_del_whtmp(a); /* ignore this error */ + + err = 0; + goto out_success; + +out_diropq: + if (au_ftest_ren(a->flags, DIROPQ)) + au_ren_rev_diropq(err, a); +out_rename: + if (!au_ftest_ren(a->flags, CPUP)) + au_ren_rev_rename(err, a); + else + au_ren_rev_cpup(err, a); + dput(a->h_dst); +out_whtmp: + if (a->thargs) + au_ren_rev_whtmp(err, a); +out_whdst: + dput(a->dst_wh_dentry); + a->dst_wh_dentry = NULL; +out_whsrc: + if (a->src_wh_dentry) + au_ren_rev_whsrc(err, a); +out_success: + dput(a->src_wh_dentry); + dput(a->dst_wh_dentry); +out_thargs: + if (a->thargs) { + dput(a->h_dst); + au_whtmp_rmdir_free(a->thargs); + a->thargs = NULL; + } +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * test if @dentry dir can be rename destination or not. + * success means, it is a logically empty dir. + */ +static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist) +{ + return au_test_empty(dentry, whlist); +} + +/* + * test if @dentry dir can be rename source or not. + * if it can, return 0 and @children is filled. + * success means, + * - it is a logically empty dir. + * - or, it exists on writable branch and has no children including whiteouts + * on the lower branch. + */ +static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt) +{ + int err; + unsigned int rdhash; + aufs_bindex_t bstart; + + bstart = au_dbstart(dentry); + if (bstart != btgt) { + struct au_nhash whlist; + + SiMustAnyLock(dentry->d_sb); + rdhash = au_sbi(dentry->d_sb)->si_rdhash; + if (!rdhash) + rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, + dentry)); + err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); + if (unlikely(err)) + goto out; + err = au_test_empty(dentry, &whlist); + au_nhash_wh_free(&whlist); + goto out; + } + + if (bstart == au_dbtaildir(dentry)) + return 0; /* success */ + + err = au_test_empty_lower(dentry); + +out: + if (err == -ENOTEMPTY) { + AuWarn1("renaming dir who has child(ren) on multiple branches," + " is not supported\n"); + err = -EXDEV; + } + return err; +} + +/* side effect: sets whlist and h_dentry */ +static int au_ren_may_dir(struct au_ren_args *a) +{ + int err; + unsigned int rdhash; + struct dentry *d; + + d = a->dst_dentry; + SiMustAnyLock(d->d_sb); + + err = 0; + if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) { + rdhash = au_sbi(d->d_sb)->si_rdhash; + if (!rdhash) + rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d)); + err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS); + if (unlikely(err)) + goto out; + + au_set_dbstart(d, a->dst_bstart); + err = may_rename_dstdir(d, &a->whlist); + au_set_dbstart(d, a->btgt); + } + a->dst_h_dentry = au_h_dptr(d, au_dbstart(d)); + if (unlikely(err)) + goto out; + + d = a->src_dentry; + a->src_h_dentry = au_h_dptr(d, au_dbstart(d)); + if (au_ftest_ren(a->flags, ISDIR)) { + err = may_rename_srcdir(d, a->btgt); + if (unlikely(err)) { + au_nhash_wh_free(&a->whlist); + a->whlist.nh_num = 0; + } + } +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * simple tests for rename. + * following the checks in vfs, plus the parent-child relationship. + */ +static int au_may_ren(struct au_ren_args *a) +{ + int err, isdir; + struct inode *h_inode; + + if (a->src_bstart == a->btgt) { + err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent, + au_ftest_ren(a->flags, ISDIR)); + if (unlikely(err)) + goto out; + err = -EINVAL; + if (unlikely(a->src_h_dentry == a->h_trap)) + goto out; + } + + err = 0; + if (a->dst_bstart != a->btgt) + goto out; + + err = -ENOTEMPTY; + if (unlikely(a->dst_h_dentry == a->h_trap)) + goto out; + + err = -EIO; + h_inode = a->dst_h_dentry->d_inode; + isdir = !!au_ftest_ren(a->flags, ISDIR); + if (!a->dst_dentry->d_inode) { + if (unlikely(h_inode)) + goto out; + err = au_may_add(a->dst_dentry, a->btgt, a->dst_h_parent, + isdir); + } else { + if (unlikely(!h_inode || !h_inode->i_nlink)) + goto out; + err = au_may_del(a->dst_dentry, a->btgt, a->dst_h_parent, + isdir); + if (unlikely(err)) + goto out; + } + +out: + if (unlikely(err == -ENOENT || err == -EEXIST)) + err = -EIO; + AuTraceErr(err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * locking order + * (VFS) + * - src_dir and dir by lock_rename() + * - inode if exitsts + * (aufs) + * - lock all + * + src_dentry and dentry by aufs_read_and_write_lock2() which calls, + * + si_read_lock + * + di_write_lock2_child() + * + di_write_lock_child() + * + ii_write_lock_child() + * + di_write_lock_child2() + * + ii_write_lock_child2() + * + src_parent and parent + * + di_write_lock_parent() + * + ii_write_lock_parent() + * + di_write_lock_parent2() + * + ii_write_lock_parent2() + * + lower src_dir and dir by vfsub_lock_rename() + * + verify the every relationships between child and parent. if any + * of them failed, unlock all and return -EBUSY. + */ +static void au_ren_unlock(struct au_ren_args *a) +{ + struct super_block *sb; + + sb = a->dst_dentry->d_sb; + if (au_ftest_ren(a->flags, MNT_WRITE)) + mnt_drop_write(a->br->br_mnt); + vfsub_unlock_rename(a->src_h_parent, a->src_hdir, + a->dst_h_parent, a->dst_hdir); +} + +static int au_ren_lock(struct au_ren_args *a) +{ + int err; + unsigned int udba; + + err = 0; + a->src_h_parent = au_h_dptr(a->src_parent, a->btgt); + a->src_hdir = au_hi(a->src_dir, a->btgt); + a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt); + a->dst_hdir = au_hi(a->dst_dir, a->btgt); + a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir, + a->dst_h_parent, a->dst_hdir); + udba = au_opt_udba(a->src_dentry->d_sb); + if (unlikely(a->src_hdir->hi_inode != a->src_h_parent->d_inode + || a->dst_hdir->hi_inode != a->dst_h_parent->d_inode)) + err = au_busy_or_stale(); + if (!err && au_dbstart(a->src_dentry) == a->btgt) + err = au_h_verify(a->src_h_dentry, udba, + a->src_h_parent->d_inode, a->src_h_parent, + a->br); + if (!err && au_dbstart(a->dst_dentry) == a->btgt) + err = au_h_verify(a->dst_h_dentry, udba, + a->dst_h_parent->d_inode, a->dst_h_parent, + a->br); + if (!err) { + err = mnt_want_write(a->br->br_mnt); + if (unlikely(err)) + goto out_unlock; + au_fset_ren(a->flags, MNT_WRITE); + goto out; /* success */ + } + + err = au_busy_or_stale(); + +out_unlock: + au_ren_unlock(a); +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +static void au_ren_refresh_dir(struct au_ren_args *a) +{ + struct inode *dir; + + dir = a->dst_dir; + dir->i_version++; + if (au_ftest_ren(a->flags, ISDIR)) { + /* is this updating defined in POSIX? */ + au_cpup_attr_timesizes(a->src_inode); + au_cpup_attr_nlink(dir, /*force*/1); + } + + if (au_ibstart(dir) == a->btgt) + au_cpup_attr_timesizes(dir); + + if (au_ftest_ren(a->flags, ISSAMEDIR)) + return; + + dir = a->src_dir; + dir->i_version++; + if (au_ftest_ren(a->flags, ISDIR)) + au_cpup_attr_nlink(dir, /*force*/1); + if (au_ibstart(dir) == a->btgt) + au_cpup_attr_timesizes(dir); +} + +static void au_ren_refresh(struct au_ren_args *a) +{ + aufs_bindex_t bend, bindex; + struct dentry *d, *h_d; + struct inode *i, *h_i; + struct super_block *sb; + + d = a->dst_dentry; + d_drop(d); + if (a->h_dst) + /* already dget-ed by au_ren_or_cpup() */ + au_set_h_dptr(d, a->btgt, a->h_dst); + + i = a->dst_inode; + if (i) { + if (!au_ftest_ren(a->flags, ISDIR)) + vfsub_drop_nlink(i); + else { + vfsub_dead_dir(i); + au_cpup_attr_timesizes(i); + } + au_update_dbrange(d, /*do_put_zero*/1); + } else { + bend = a->btgt; + for (bindex = au_dbstart(d); bindex < bend; bindex++) + au_set_h_dptr(d, bindex, NULL); + bend = au_dbend(d); + for (bindex = a->btgt + 1; bindex <= bend; bindex++) + au_set_h_dptr(d, bindex, NULL); + au_update_dbrange(d, /*do_put_zero*/0); + } + + d = a->src_dentry; + au_set_dbwh(d, -1); + bend = au_dbend(d); + for (bindex = a->btgt + 1; bindex <= bend; bindex++) { + h_d = au_h_dptr(d, bindex); + if (h_d) + au_set_h_dptr(d, bindex, NULL); + } + au_set_dbend(d, a->btgt); + + sb = d->d_sb; + i = a->src_inode; + if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i)) + return; /* success */ + + bend = au_ibend(i); + for (bindex = a->btgt + 1; bindex <= bend; bindex++) { + h_i = au_h_iptr(i, bindex); + if (h_i) { + au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0); + /* ignore this error */ + au_set_h_iptr(i, bindex, NULL, 0); + } + } + au_set_ibend(i, a->btgt); +} + +/* ---------------------------------------------------------------------- */ + +/* mainly for link(2) and rename(2) */ +int au_wbr(struct dentry *dentry, aufs_bindex_t btgt) +{ + aufs_bindex_t bdiropq, bwh; + struct dentry *parent; + struct au_branch *br; + + parent = dentry->d_parent; + IMustLock(parent->d_inode); /* dir is locked */ + + bdiropq = au_dbdiropq(parent); + bwh = au_dbwh(dentry); + br = au_sbr(dentry->d_sb, btgt); + if (au_br_rdonly(br) + || (0 <= bdiropq && bdiropq < btgt) + || (0 <= bwh && bwh < btgt)) + btgt = -1; + + AuDbg("btgt %d\n", btgt); + return btgt; +} + +/* sets src_bstart, dst_bstart and btgt */ +static int au_ren_wbr(struct au_ren_args *a) +{ + int err; + struct au_wr_dir_args wr_dir_args = { + /* .force_btgt = -1, */ + .flags = AuWrDir_ADD_ENTRY + }; + + a->src_bstart = au_dbstart(a->src_dentry); + a->dst_bstart = au_dbstart(a->dst_dentry); + if (au_ftest_ren(a->flags, ISDIR)) + au_fset_wrdir(wr_dir_args.flags, ISDIR); + wr_dir_args.force_btgt = a->src_bstart; + if (a->dst_inode && a->dst_bstart < a->src_bstart) + wr_dir_args.force_btgt = a->dst_bstart; + wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt); + err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args); + a->btgt = err; + + return err; +} + +static void au_ren_dt(struct au_ren_args *a) +{ + a->h_path.dentry = a->src_h_parent; + au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path); + if (!au_ftest_ren(a->flags, ISSAMEDIR)) { + a->h_path.dentry = a->dst_h_parent; + au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path); + } + + au_fclr_ren(a->flags, DT_DSTDIR); + if (!au_ftest_ren(a->flags, ISDIR)) + return; + + a->h_path.dentry = a->src_h_dentry; + au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path); + if (a->dst_h_dentry->d_inode) { + au_fset_ren(a->flags, DT_DSTDIR); + a->h_path.dentry = a->dst_h_dentry; + au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path); + } +} + +static void au_ren_rev_dt(int err, struct au_ren_args *a) +{ + struct dentry *h_d; + struct mutex *h_mtx; + + au_dtime_revert(a->src_dt + AuPARENT); + if (!au_ftest_ren(a->flags, ISSAMEDIR)) + au_dtime_revert(a->dst_dt + AuPARENT); + + if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) { + h_d = a->src_dt[AuCHILD].dt_h_path.dentry; + h_mtx = &h_d->d_inode->i_mutex; + mutex_lock_nested(h_mtx, AuLsc_I_CHILD); + au_dtime_revert(a->src_dt + AuCHILD); + mutex_unlock(h_mtx); + + if (au_ftest_ren(a->flags, DT_DSTDIR)) { + h_d = a->dst_dt[AuCHILD].dt_h_path.dentry; + h_mtx = &h_d->d_inode->i_mutex; + mutex_lock_nested(h_mtx, AuLsc_I_CHILD); + au_dtime_revert(a->dst_dt + AuCHILD); + mutex_unlock(h_mtx); + } + } +} + +/* ---------------------------------------------------------------------- */ + +int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry, + struct inode *_dst_dir, struct dentry *_dst_dentry) +{ + int err, flags; + /* reduce stack space */ + struct au_ren_args *a; + + AuDbg("%.*s, %.*s\n", AuDLNPair(_src_dentry), AuDLNPair(_dst_dentry)); + IMustLock(_src_dir); + IMustLock(_dst_dir); + + err = -ENOMEM; + BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE); + a = kzalloc(sizeof(*a), GFP_NOFS); + if (unlikely(!a)) + goto out; + + a->src_dir = _src_dir; + a->src_dentry = _src_dentry; + a->src_inode = a->src_dentry->d_inode; + a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */ + a->dst_dir = _dst_dir; + a->dst_dentry = _dst_dentry; + a->dst_inode = a->dst_dentry->d_inode; + a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */ + if (a->dst_inode) { + IMustLock(a->dst_inode); + au_igrab(a->dst_inode); + } + + err = -ENOTDIR; + flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN; + if (S_ISDIR(a->src_inode->i_mode)) { + au_fset_ren(a->flags, ISDIR); + if (unlikely(a->dst_inode && !S_ISDIR(a->dst_inode->i_mode))) + goto out_free; + err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, + AuLock_DIR | flags); + } else + err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, + flags); + if (unlikely(err)) + goto out_free; + + err = au_d_hashed_positive(a->src_dentry); + if (unlikely(err)) + goto out_unlock; + err = -ENOENT; + if (a->dst_inode) { + /* + * If it is a dir, VFS unhash dst_dentry before this + * function. It means we cannot rely upon d_unhashed(). + */ + if (unlikely(!a->dst_inode->i_nlink)) + goto out_unlock; + if (!S_ISDIR(a->dst_inode->i_mode)) { + err = au_d_hashed_positive(a->dst_dentry); + if (unlikely(err)) + goto out_unlock; + } else if (unlikely(IS_DEADDIR(a->dst_inode))) + goto out_unlock; + } else if (unlikely(d_unhashed(a->dst_dentry))) + goto out_unlock; + + au_fset_ren(a->flags, ISSAMEDIR); /* temporary */ + di_write_lock_parent(a->dst_parent); + + /* which branch we process */ + err = au_ren_wbr(a); + if (unlikely(err < 0)) + goto out_parent; + a->br = au_sbr(a->dst_dentry->d_sb, a->btgt); + a->h_path.mnt = a->br->br_mnt; + + /* are they available to be renamed */ + err = au_ren_may_dir(a); + if (unlikely(err)) + goto out_children; + + /* prepare the writable parent dir on the same branch */ + if (a->dst_bstart == a->btgt) { + au_fset_ren(a->flags, WHDST); + } else { + err = au_cpup_dirs(a->dst_dentry, a->btgt); + if (unlikely(err)) + goto out_children; + } + + if (a->src_dir != a->dst_dir) { + /* + * this temporary unlock is safe, + * because both dir->i_mutex are locked. + */ + di_write_unlock(a->dst_parent); + di_write_lock_parent(a->src_parent); + err = au_wr_dir_need_wh(a->src_dentry, + au_ftest_ren(a->flags, ISDIR), + &a->btgt); + di_write_unlock(a->src_parent); + di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1); + au_fclr_ren(a->flags, ISSAMEDIR); + } else + err = au_wr_dir_need_wh(a->src_dentry, + au_ftest_ren(a->flags, ISDIR), + &a->btgt); + if (unlikely(err < 0)) + goto out_children; + if (err) + au_fset_ren(a->flags, WHSRC); + + /* lock them all */ + err = au_ren_lock(a); + if (unlikely(err)) + goto out_children; + + if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE)) + err = au_may_ren(a); + else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN)) + err = -ENAMETOOLONG; + if (unlikely(err)) + goto out_hdir; + + /* store timestamps to be revertible */ + au_ren_dt(a); + + /* here we go */ + err = do_rename(a); + if (unlikely(err)) + goto out_dt; + + /* update dir attributes */ + au_ren_refresh_dir(a); + + /* dput/iput all lower dentries */ + au_ren_refresh(a); + + goto out_hdir; /* success */ + +out_dt: + au_ren_rev_dt(err, a); +out_hdir: + au_ren_unlock(a); +out_children: + au_nhash_wh_free(&a->whlist); + if (err && a->dst_inode && a->dst_bstart != a->btgt) { + AuDbg("bstart %d, btgt %d\n", a->dst_bstart, a->btgt); + au_set_h_dptr(a->dst_dentry, a->btgt, NULL); + au_set_dbstart(a->dst_dentry, a->dst_bstart); + } +out_parent: + if (!err) + d_move(a->src_dentry, a->dst_dentry); + else { + au_update_dbstart(a->dst_dentry); + if (!a->dst_inode) + d_drop(a->dst_dentry); + } + if (au_ftest_ren(a->flags, ISSAMEDIR)) + di_write_unlock(a->dst_parent); + else + di_write_unlock2(a->src_parent, a->dst_parent); +out_unlock: + aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry); +out_free: + iput(a->dst_inode); + if (a->thargs) + au_whtmp_rmdir_free(a->thargs); + kfree(a); +out: + AuTraceErr(err); + return err; +} diff --git a/ubuntu/aufs/iinfo.c b/ubuntu/aufs/iinfo.c new file mode 100644 index 000000000000..cd1086eb4595 --- /dev/null +++ b/ubuntu/aufs/iinfo.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * inode private data + */ + +#include "aufs.h" + +struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex) +{ + struct inode *h_inode; + + IiMustAnyLock(inode); + + h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode; + AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); + return h_inode; +} + +/* todo: hard/soft set? */ +void au_hiput(struct au_hinode *hinode) +{ + au_hn_free(hinode); + dput(hinode->hi_whdentry); + iput(hinode->hi_inode); +} + +unsigned int au_hi_flags(struct inode *inode, int isdir) +{ + unsigned int flags; + const unsigned int mnt_flags = au_mntflags(inode->i_sb); + + flags = 0; + if (au_opt_test(mnt_flags, XINO)) + au_fset_hi(flags, XINO); + if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY)) + au_fset_hi(flags, HNOTIFY); + return flags; +} + +void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, + struct inode *h_inode, unsigned int flags) +{ + struct au_hinode *hinode; + struct inode *hi; + struct au_iinfo *iinfo = au_ii(inode); + + IiMustWriteLock(inode); + + hinode = iinfo->ii_hinode + bindex; + hi = hinode->hi_inode; + AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); + + if (hi) + au_hiput(hinode); + hinode->hi_inode = h_inode; + if (h_inode) { + int err; + struct super_block *sb = inode->i_sb; + struct au_branch *br; + + AuDebugOn(inode->i_mode + && (h_inode->i_mode & S_IFMT) + != (inode->i_mode & S_IFMT)); + if (bindex == iinfo->ii_bstart) + au_cpup_igen(inode, h_inode); + br = au_sbr(sb, bindex); + hinode->hi_id = br->br_id; + if (au_ftest_hi(flags, XINO)) { + err = au_xino_write(sb, bindex, h_inode->i_ino, + inode->i_ino); + if (unlikely(err)) + AuIOErr1("failed au_xino_write() %d\n", err); + } + + if (au_ftest_hi(flags, HNOTIFY) + && au_br_hnotifyable(br->br_perm)) { + err = au_hn_alloc(hinode, inode); + if (unlikely(err)) + AuIOErr1("au_hn_alloc() %d\n", err); + } + } +} + +void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, + struct dentry *h_wh) +{ + struct au_hinode *hinode; + + IiMustWriteLock(inode); + + hinode = au_ii(inode)->ii_hinode + bindex; + AuDebugOn(hinode->hi_whdentry); + hinode->hi_whdentry = h_wh; +} + +void au_update_iigen(struct inode *inode) +{ + atomic_set(&au_ii(inode)->ii_generation, au_sigen(inode->i_sb)); + /* smp_mb(); */ /* atomic_set */ +} + +/* it may be called at remount time, too */ +void au_update_ibrange(struct inode *inode, int do_put_zero) +{ + struct au_iinfo *iinfo; + aufs_bindex_t bindex, bend; + + iinfo = au_ii(inode); + if (!iinfo) + return; + + IiMustWriteLock(inode); + + if (do_put_zero && iinfo->ii_bstart >= 0) { + for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; + bindex++) { + struct inode *h_i; + + h_i = iinfo->ii_hinode[0 + bindex].hi_inode; + if (h_i && !h_i->i_nlink) + au_set_h_iptr(inode, bindex, NULL, 0); + } + } + + iinfo->ii_bstart = -1; + iinfo->ii_bend = -1; + bend = au_sbend(inode->i_sb); + for (bindex = 0; bindex <= bend; bindex++) + if (iinfo->ii_hinode[0 + bindex].hi_inode) { + iinfo->ii_bstart = bindex; + break; + } + if (iinfo->ii_bstart >= 0) + for (bindex = bend; bindex >= iinfo->ii_bstart; bindex--) + if (iinfo->ii_hinode[0 + bindex].hi_inode) { + iinfo->ii_bend = bindex; + break; + } + AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend); +} + +/* ---------------------------------------------------------------------- */ + +void au_icntnr_init_once(void *_c) +{ + struct au_icntnr *c = _c; + struct au_iinfo *iinfo = &c->iinfo; + static struct lock_class_key aufs_ii; + + au_rw_init(&iinfo->ii_rwsem); + au_rw_class(&iinfo->ii_rwsem, &aufs_ii); + inode_init_once(&c->vfs_inode); +} + +int au_iinfo_init(struct inode *inode) +{ + struct au_iinfo *iinfo; + struct super_block *sb; + int nbr, i; + + sb = inode->i_sb; + iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); + nbr = au_sbend(sb) + 1; + if (unlikely(nbr <= 0)) + nbr = 1; + iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS); + if (iinfo->ii_hinode) { + au_ninodes_inc(sb); + for (i = 0; i < nbr; i++) + iinfo->ii_hinode[i].hi_id = -1; + + atomic_set(&iinfo->ii_generation, au_sigen(sb)); + /* smp_mb(); */ /* atomic_set */ + iinfo->ii_bstart = -1; + iinfo->ii_bend = -1; + iinfo->ii_vdir = NULL; + return 0; + } + return -ENOMEM; +} + +int au_ii_realloc(struct au_iinfo *iinfo, int nbr) +{ + int err, sz; + struct au_hinode *hip; + + AuRwMustWriteLock(&iinfo->ii_rwsem); + + err = -ENOMEM; + sz = sizeof(*hip) * (iinfo->ii_bend + 1); + if (!sz) + sz = sizeof(*hip); + hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS); + if (hip) { + iinfo->ii_hinode = hip; + err = 0; + } + + return err; +} + +void au_iinfo_fin(struct inode *inode) +{ + struct au_iinfo *iinfo; + struct au_hinode *hi; + struct super_block *sb; + aufs_bindex_t bindex, bend; + const unsigned char unlinked = !inode->i_nlink; + + iinfo = au_ii(inode); + /* bad_inode case */ + if (!iinfo) + return; + + sb = inode->i_sb; + au_ninodes_dec(sb); + if (si_pid_test(sb)) + au_xino_delete_inode(inode, unlinked); + else { + /* + * it is safe to hide the dependency between sbinfo and + * sb->s_umount. + */ + lockdep_off(); + si_noflush_read_lock(sb); + au_xino_delete_inode(inode, unlinked); + si_read_unlock(sb); + lockdep_on(); + } + + if (iinfo->ii_vdir) + au_vdir_free(iinfo->ii_vdir); + + bindex = iinfo->ii_bstart; + if (bindex >= 0) { + hi = iinfo->ii_hinode + bindex; + bend = iinfo->ii_bend; + while (bindex++ <= bend) { + if (hi->hi_inode) + au_hiput(hi); + hi++; + } + } + kfree(iinfo->ii_hinode); + iinfo->ii_hinode = NULL; + AuRwDestroy(&iinfo->ii_rwsem); +} diff --git a/ubuntu/aufs/inode.c b/ubuntu/aufs/inode.c new file mode 100644 index 000000000000..00ae8877b7d2 --- /dev/null +++ b/ubuntu/aufs/inode.c @@ -0,0 +1,471 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * inode functions + */ + +#include "aufs.h" + +struct inode *au_igrab(struct inode *inode) +{ + if (inode) { + AuDebugOn(!atomic_read(&inode->i_count)); + ihold(inode); + } + return inode; +} + +static void au_refresh_hinode_attr(struct inode *inode, int do_version) +{ + au_cpup_attr_all(inode, /*force*/0); + au_update_iigen(inode); + if (do_version) + inode->i_version++; +} + +static int au_ii_refresh(struct inode *inode, int *update) +{ + int err, e; + umode_t type; + aufs_bindex_t bindex, new_bindex; + struct super_block *sb; + struct au_iinfo *iinfo; + struct au_hinode *p, *q, tmp; + + IiMustWriteLock(inode); + + *update = 0; + sb = inode->i_sb; + type = inode->i_mode & S_IFMT; + iinfo = au_ii(inode); + err = au_ii_realloc(iinfo, au_sbend(sb) + 1); + if (unlikely(err)) + goto out; + + AuDebugOn(iinfo->ii_bstart < 0); + p = iinfo->ii_hinode + iinfo->ii_bstart; + for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; + bindex++, p++) { + if (!p->hi_inode) + continue; + + AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT)); + new_bindex = au_br_index(sb, p->hi_id); + if (new_bindex == bindex) + continue; + + if (new_bindex < 0) { + *update = 1; + au_hiput(p); + p->hi_inode = NULL; + continue; + } + + if (new_bindex < iinfo->ii_bstart) + iinfo->ii_bstart = new_bindex; + if (iinfo->ii_bend < new_bindex) + iinfo->ii_bend = new_bindex; + /* swap two lower inode, and loop again */ + q = iinfo->ii_hinode + new_bindex; + tmp = *q; + *q = *p; + *p = tmp; + if (tmp.hi_inode) { + bindex--; + p--; + } + } + au_update_ibrange(inode, /*do_put_zero*/0); + e = au_dy_irefresh(inode); + if (unlikely(e && !err)) + err = e; + +out: + AuTraceErr(err); + return err; +} + +int au_refresh_hinode_self(struct inode *inode) +{ + int err, update; + + err = au_ii_refresh(inode, &update); + if (!err) + au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode)); + + AuTraceErr(err); + return err; +} + +int au_refresh_hinode(struct inode *inode, struct dentry *dentry) +{ + int err, e, update; + unsigned int flags; + umode_t mode; + aufs_bindex_t bindex, bend; + unsigned char isdir; + struct au_hinode *p; + struct au_iinfo *iinfo; + + err = au_ii_refresh(inode, &update); + if (unlikely(err)) + goto out; + + update = 0; + iinfo = au_ii(inode); + p = iinfo->ii_hinode + iinfo->ii_bstart; + mode = (inode->i_mode & S_IFMT); + isdir = S_ISDIR(mode); + flags = au_hi_flags(inode, isdir); + bend = au_dbend(dentry); + for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { + struct inode *h_i; + struct dentry *h_d; + + h_d = au_h_dptr(dentry, bindex); + if (!h_d || !h_d->d_inode) + continue; + + AuDebugOn(mode != (h_d->d_inode->i_mode & S_IFMT)); + if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) { + h_i = au_h_iptr(inode, bindex); + if (h_i) { + if (h_i == h_d->d_inode) + continue; + err = -EIO; + break; + } + } + if (bindex < iinfo->ii_bstart) + iinfo->ii_bstart = bindex; + if (iinfo->ii_bend < bindex) + iinfo->ii_bend = bindex; + au_set_h_iptr(inode, bindex, au_igrab(h_d->d_inode), flags); + update = 1; + } + au_update_ibrange(inode, /*do_put_zero*/0); + e = au_dy_irefresh(inode); + if (unlikely(e && !err)) + err = e; + if (!err) + au_refresh_hinode_attr(inode, update && isdir); + +out: + AuTraceErr(err); + return err; +} + +static int set_inode(struct inode *inode, struct dentry *dentry) +{ + int err; + unsigned int flags; + umode_t mode; + aufs_bindex_t bindex, bstart, btail; + unsigned char isdir; + struct dentry *h_dentry; + struct inode *h_inode; + struct au_iinfo *iinfo; + + IiMustWriteLock(inode); + + err = 0; + isdir = 0; + bstart = au_dbstart(dentry); + h_inode = au_h_dptr(dentry, bstart)->d_inode; + mode = h_inode->i_mode; + switch (mode & S_IFMT) { + case S_IFREG: + btail = au_dbtail(dentry); + inode->i_op = &aufs_iop; + inode->i_fop = &aufs_file_fop; + err = au_dy_iaop(inode, bstart, h_inode); + if (unlikely(err)) + goto out; + break; + case S_IFDIR: + isdir = 1; + btail = au_dbtaildir(dentry); + inode->i_op = &aufs_dir_iop; + inode->i_fop = &aufs_dir_fop; + break; + case S_IFLNK: + btail = au_dbtail(dentry); + inode->i_op = &aufs_symlink_iop; + break; + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + case S_IFSOCK: + btail = au_dbtail(dentry); + inode->i_op = &aufs_iop; + au_init_special_fop(inode, mode, h_inode->i_rdev); + break; + default: + AuIOErr("Unknown file type 0%o\n", mode); + err = -EIO; + goto out; + } + + /* do not set hnotify for whiteouted dirs (SHWH mode) */ + flags = au_hi_flags(inode, isdir); + if (au_opt_test(au_mntflags(dentry->d_sb), SHWH) + && au_ftest_hi(flags, HNOTIFY) + && dentry->d_name.len > AUFS_WH_PFX_LEN + && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) + au_fclr_hi(flags, HNOTIFY); + iinfo = au_ii(inode); + iinfo->ii_bstart = bstart; + iinfo->ii_bend = btail; + for (bindex = bstart; bindex <= btail; bindex++) { + h_dentry = au_h_dptr(dentry, bindex); + if (h_dentry) + au_set_h_iptr(inode, bindex, + au_igrab(h_dentry->d_inode), flags); + } + au_cpup_attr_all(inode, /*force*/1); + +out: + return err; +} + +/* + * successful returns with iinfo write_locked + * minus: errno + * zero: success, matched + * plus: no error, but unmatched + */ +static int reval_inode(struct inode *inode, struct dentry *dentry) +{ + int err; + aufs_bindex_t bindex, bend; + struct inode *h_inode, *h_dinode; + + /* + * before this function, if aufs got any iinfo lock, it must be only + * one, the parent dir. + * it can happen by UDBA and the obsoleted inode number. + */ + err = -EIO; + if (unlikely(inode->i_ino == parent_ino(dentry))) + goto out; + + err = 1; + ii_write_lock_new_child(inode); + h_dinode = au_h_dptr(dentry, au_dbstart(dentry))->d_inode; + bend = au_ibend(inode); + for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { + h_inode = au_h_iptr(inode, bindex); + if (h_inode && h_inode == h_dinode) { + err = 0; + if (au_iigen_test(inode, au_digen(dentry))) + err = au_refresh_hinode(inode, dentry); + break; + } + } + + if (unlikely(err)) + ii_write_unlock(inode); +out: + return err; +} + +int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, + unsigned int d_type, ino_t *ino) +{ + int err; + struct mutex *mtx; + + /* prevent hardlinked inode number from race condition */ + mtx = NULL; + if (d_type != DT_DIR) { + mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx; + mutex_lock(mtx); + } + err = au_xino_read(sb, bindex, h_ino, ino); + if (unlikely(err)) + goto out; + + if (!*ino) { + err = -EIO; + *ino = au_xino_new_ino(sb); + if (unlikely(!*ino)) + goto out; + err = au_xino_write(sb, bindex, h_ino, *ino); + if (unlikely(err)) + goto out; + } + +out: + if (mtx) + mutex_unlock(mtx); + return err; +} + +/* successful returns with iinfo write_locked */ +/* todo: return with unlocked? */ +struct inode *au_new_inode(struct dentry *dentry, int must_new) +{ + struct inode *inode, *h_inode; + struct dentry *h_dentry; + struct super_block *sb; + struct mutex *mtx; + ino_t h_ino, ino; + int err; + aufs_bindex_t bstart; + + sb = dentry->d_sb; + bstart = au_dbstart(dentry); + h_dentry = au_h_dptr(dentry, bstart); + h_inode = h_dentry->d_inode; + h_ino = h_inode->i_ino; + + /* + * stop 'race'-ing between hardlinks under different + * parents. + */ + mtx = NULL; + if (!S_ISDIR(h_inode->i_mode)) + mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx; + +new_ino: + if (mtx) + mutex_lock(mtx); + err = au_xino_read(sb, bstart, h_ino, &ino); + inode = ERR_PTR(err); + if (unlikely(err)) + goto out; + + if (!ino) { + ino = au_xino_new_ino(sb); + if (unlikely(!ino)) { + inode = ERR_PTR(-EIO); + goto out; + } + } + + AuDbg("i%lu\n", (unsigned long)ino); + inode = au_iget_locked(sb, ino); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW)); + if (inode->i_state & I_NEW) { + ii_write_lock_new_child(inode); + err = set_inode(inode, dentry); + if (!err) { + unlock_new_inode(inode); + goto out; /* success */ + } + + /* + * iget_failed() calls iput(), but we need to call + * ii_write_unlock() after iget_failed(). so dirty hack for + * i_count. + */ + atomic_inc(&inode->i_count); + iget_failed(inode); + ii_write_unlock(inode); + au_xino_write(sb, bstart, h_ino, /*ino*/0); + /* ignore this error */ + goto out_iput; + } else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) { + /* + * horrible race condition between lookup, readdir and copyup + * (or something). + */ + if (mtx) + mutex_unlock(mtx); + err = reval_inode(inode, dentry); + if (unlikely(err < 0)) { + mtx = NULL; + goto out_iput; + } + + if (!err) { + mtx = NULL; + goto out; /* success */ + } else if (mtx) + mutex_lock(mtx); + } + + if (unlikely(au_test_fs_unique_ino(h_dentry->d_inode))) + AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir," + " b%d, %s, %.*s, hi%lu, i%lu.\n", + bstart, au_sbtype(h_dentry->d_sb), AuDLNPair(dentry), + (unsigned long)h_ino, (unsigned long)ino); + ino = 0; + err = au_xino_write(sb, bstart, h_ino, /*ino*/0); + if (!err) { + iput(inode); + if (mtx) + mutex_unlock(mtx); + goto new_ino; + } + +out_iput: + iput(inode); + inode = ERR_PTR(err); +out: + if (mtx) + mutex_unlock(mtx); + return inode; +} + +/* ---------------------------------------------------------------------- */ + +int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, + struct inode *inode) +{ + int err; + + err = au_br_rdonly(au_sbr(sb, bindex)); + + /* pseudo-link after flushed may happen out of bounds */ + if (!err + && inode + && au_ibstart(inode) <= bindex + && bindex <= au_ibend(inode)) { + /* + * permission check is unnecessary since vfsub routine + * will be called later + */ + struct inode *hi = au_h_iptr(inode, bindex); + if (hi) + err = IS_IMMUTABLE(hi) ? -EROFS : 0; + } + + return err; +} + +int au_test_h_perm(struct inode *h_inode, int mask) +{ + if (!current_fsuid()) + return 0; + return inode_permission(h_inode, mask); +} + +int au_test_h_perm_sio(struct inode *h_inode, int mask) +{ + if (au_test_nfs(h_inode->i_sb) + && (mask & MAY_WRITE) + && S_ISDIR(h_inode->i_mode)) + mask |= MAY_READ; /* force permission check */ + return au_test_h_perm(h_inode, mask); +} diff --git a/ubuntu/aufs/inode.h b/ubuntu/aufs/inode.h new file mode 100644 index 000000000000..0c500474c963 --- /dev/null +++ b/ubuntu/aufs/inode.h @@ -0,0 +1,556 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * inode operations + */ + +#ifndef __AUFS_INODE_H__ +#define __AUFS_INODE_H__ + +#ifdef __KERNEL__ + +#include +#include +#include +#include "rwsem.h" + +struct vfsmount; + +struct au_hnotify { +#ifdef CONFIG_AUFS_HNOTIFY +#ifdef CONFIG_AUFS_HFSNOTIFY + /* never use fsnotify_add_vfsmount_mark() */ + struct fsnotify_mark hn_mark; + int hn_mark_dead; +#endif + struct inode *hn_aufs_inode; /* no get/put */ +#endif +} ____cacheline_aligned_in_smp; + +struct au_hinode { + struct inode *hi_inode; + aufs_bindex_t hi_id; +#ifdef CONFIG_AUFS_HNOTIFY + struct au_hnotify *hi_notify; +#endif + + /* reference to the copied-up whiteout with get/put */ + struct dentry *hi_whdentry; +}; + +struct au_vdir; +struct au_iinfo { + atomic_t ii_generation; + struct super_block *ii_hsb1; /* no get/put */ + + struct au_rwsem ii_rwsem; + aufs_bindex_t ii_bstart, ii_bend; + __u32 ii_higen; + struct au_hinode *ii_hinode; + struct au_vdir *ii_vdir; +}; + +struct au_icntnr { + struct au_iinfo iinfo; + struct inode vfs_inode; +} ____cacheline_aligned_in_smp; + +/* au_pin flags */ +#define AuPin_DI_LOCKED 1 +#define AuPin_MNT_WRITE (1 << 1) +#define au_ftest_pin(flags, name) ((flags) & AuPin_##name) +#define au_fset_pin(flags, name) \ + do { (flags) |= AuPin_##name; } while (0) +#define au_fclr_pin(flags, name) \ + do { (flags) &= ~AuPin_##name; } while (0) + +struct au_pin { + /* input */ + struct dentry *dentry; + unsigned int udba; + unsigned char lsc_di, lsc_hi, flags; + aufs_bindex_t bindex; + + /* output */ + struct dentry *parent; + struct au_hinode *hdir; + struct vfsmount *h_mnt; +}; + +/* ---------------------------------------------------------------------- */ + +static inline struct au_iinfo *au_ii(struct inode *inode) +{ + struct au_iinfo *iinfo; + + iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); + if (iinfo->ii_hinode) + return iinfo; + return NULL; /* debugging bad_inode case */ +} + +/* ---------------------------------------------------------------------- */ + +/* inode.c */ +struct inode *au_igrab(struct inode *inode); +int au_refresh_hinode_self(struct inode *inode); +int au_refresh_hinode(struct inode *inode, struct dentry *dentry); +int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, + unsigned int d_type, ino_t *ino); +struct inode *au_new_inode(struct dentry *dentry, int must_new); +int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, + struct inode *inode); +int au_test_h_perm(struct inode *h_inode, int mask); +int au_test_h_perm_sio(struct inode *h_inode, int mask); + +static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex, + ino_t h_ino, unsigned int d_type, ino_t *ino) +{ +#ifdef CONFIG_AUFS_SHWH + return au_ino(sb, bindex, h_ino, d_type, ino); +#else + return 0; +#endif +} + +/* i_op.c */ +extern struct inode_operations aufs_iop, aufs_symlink_iop, aufs_dir_iop; + +/* au_wr_dir flags */ +#define AuWrDir_ADD_ENTRY 1 +#define AuWrDir_ISDIR (1 << 1) +#define au_ftest_wrdir(flags, name) ((flags) & AuWrDir_##name) +#define au_fset_wrdir(flags, name) \ + do { (flags) |= AuWrDir_##name; } while (0) +#define au_fclr_wrdir(flags, name) \ + do { (flags) &= ~AuWrDir_##name; } while (0) + +struct au_wr_dir_args { + aufs_bindex_t force_btgt; + unsigned char flags; +}; +int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, + struct au_wr_dir_args *args); + +struct dentry *au_pinned_h_parent(struct au_pin *pin); +void au_pin_init(struct au_pin *pin, struct dentry *dentry, + aufs_bindex_t bindex, int lsc_di, int lsc_hi, + unsigned int udba, unsigned char flags); +int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, + unsigned int udba, unsigned char flags) __must_check; +int au_do_pin(struct au_pin *pin) __must_check; +void au_unpin(struct au_pin *pin); + +/* i_op_add.c */ +int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, + struct dentry *h_parent, int isdir); +int aufs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev); +int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); +int aufs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd); +int aufs_link(struct dentry *src_dentry, struct inode *dir, + struct dentry *dentry); +int aufs_mkdir(struct inode *dir, struct dentry *dentry, int mode); + +/* i_op_del.c */ +int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup); +int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, + struct dentry *h_parent, int isdir); +int aufs_unlink(struct inode *dir, struct dentry *dentry); +int aufs_rmdir(struct inode *dir, struct dentry *dentry); + +/* i_op_ren.c */ +int au_wbr(struct dentry *dentry, aufs_bindex_t btgt); +int aufs_rename(struct inode *src_dir, struct dentry *src_dentry, + struct inode *dir, struct dentry *dentry); + +/* iinfo.c */ +struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex); +void au_hiput(struct au_hinode *hinode); +void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, + struct dentry *h_wh); +unsigned int au_hi_flags(struct inode *inode, int isdir); + +/* hinode flags */ +#define AuHi_XINO 1 +#define AuHi_HNOTIFY (1 << 1) +#define au_ftest_hi(flags, name) ((flags) & AuHi_##name) +#define au_fset_hi(flags, name) \ + do { (flags) |= AuHi_##name; } while (0) +#define au_fclr_hi(flags, name) \ + do { (flags) &= ~AuHi_##name; } while (0) + +#ifndef CONFIG_AUFS_HNOTIFY +#undef AuHi_HNOTIFY +#define AuHi_HNOTIFY 0 +#endif + +void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, + struct inode *h_inode, unsigned int flags); + +void au_update_iigen(struct inode *inode); +void au_update_ibrange(struct inode *inode, int do_put_zero); + +void au_icntnr_init_once(void *_c); +int au_iinfo_init(struct inode *inode); +void au_iinfo_fin(struct inode *inode); +int au_ii_realloc(struct au_iinfo *iinfo, int nbr); + +#ifdef CONFIG_PROC_FS +/* plink.c */ +int au_plink_maint(struct super_block *sb, int flags); +void au_plink_maint_leave(struct au_sbinfo *sbinfo); +int au_plink_maint_enter(struct super_block *sb); +#ifdef CONFIG_AUFS_DEBUG +void au_plink_list(struct super_block *sb); +#else +AuStubVoid(au_plink_list, struct super_block *sb) +#endif +int au_plink_test(struct inode *inode); +struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex); +void au_plink_append(struct inode *inode, aufs_bindex_t bindex, + struct dentry *h_dentry); +void au_plink_put(struct super_block *sb, int verbose); +void au_plink_clean(struct super_block *sb, int verbose); +void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id); +#else +AuStubInt0(au_plink_maint, struct super_block *sb, int flags); +AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo); +AuStubInt0(au_plink_maint_enter, struct super_block *sb); +AuStubVoid(au_plink_list, struct super_block *sb); +AuStubInt0(au_plink_test, struct inode *inode); +AuStub(struct dentry *, au_plink_lkup, return NULL, + struct inode *inode, aufs_bindex_t bindex); +AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex, + struct dentry *h_dentry); +AuStubVoid(au_plink_put, struct super_block *sb, int verbose); +AuStubVoid(au_plink_clean, struct super_block *sb, int verbose); +AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id); +#endif /* CONFIG_PROC_FS */ + +/* ---------------------------------------------------------------------- */ + +/* lock subclass for iinfo */ +enum { + AuLsc_II_CHILD, /* child first */ + AuLsc_II_CHILD2, /* rename(2), link(2), and cpup at hnotify */ + AuLsc_II_CHILD3, /* copyup dirs */ + AuLsc_II_PARENT, /* see AuLsc_I_PARENT in vfsub.h */ + AuLsc_II_PARENT2, + AuLsc_II_PARENT3, /* copyup dirs */ + AuLsc_II_NEW_CHILD +}; + +/* + * ii_read_lock_child, ii_write_lock_child, + * ii_read_lock_child2, ii_write_lock_child2, + * ii_read_lock_child3, ii_write_lock_child3, + * ii_read_lock_parent, ii_write_lock_parent, + * ii_read_lock_parent2, ii_write_lock_parent2, + * ii_read_lock_parent3, ii_write_lock_parent3, + * ii_read_lock_new_child, ii_write_lock_new_child, + */ +#define AuReadLockFunc(name, lsc) \ +static inline void ii_read_lock_##name(struct inode *i) \ +{ \ + au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ +} + +#define AuWriteLockFunc(name, lsc) \ +static inline void ii_write_lock_##name(struct inode *i) \ +{ \ + au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ +} + +#define AuRWLockFuncs(name, lsc) \ + AuReadLockFunc(name, lsc) \ + AuWriteLockFunc(name, lsc) + +AuRWLockFuncs(child, CHILD); +AuRWLockFuncs(child2, CHILD2); +AuRWLockFuncs(child3, CHILD3); +AuRWLockFuncs(parent, PARENT); +AuRWLockFuncs(parent2, PARENT2); +AuRWLockFuncs(parent3, PARENT3); +AuRWLockFuncs(new_child, NEW_CHILD); + +#undef AuReadLockFunc +#undef AuWriteLockFunc +#undef AuRWLockFuncs + +/* + * ii_read_unlock, ii_write_unlock, ii_downgrade_lock + */ +AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem); + +#define IiMustNoWaiters(i) AuRwMustNoWaiters(&au_ii(i)->ii_rwsem) +#define IiMustAnyLock(i) AuRwMustAnyLock(&au_ii(i)->ii_rwsem) +#define IiMustWriteLock(i) AuRwMustWriteLock(&au_ii(i)->ii_rwsem) + +/* ---------------------------------------------------------------------- */ + +static inline void au_icntnr_init(struct au_icntnr *c) +{ +#ifdef CONFIG_AUFS_DEBUG + c->vfs_inode.i_mode = 0; +#endif +} + +static inline unsigned int au_iigen(struct inode *inode) +{ + return atomic_read(&au_ii(inode)->ii_generation); +} + +/* tiny test for inode number */ +/* tmpfs generation is too rough */ +static inline int au_test_higen(struct inode *inode, struct inode *h_inode) +{ + struct au_iinfo *iinfo; + + iinfo = au_ii(inode); + AuRwMustAnyLock(&iinfo->ii_rwsem); + return !(iinfo->ii_hsb1 == h_inode->i_sb + && iinfo->ii_higen == h_inode->i_generation); +} + +static inline void au_iigen_dec(struct inode *inode) +{ + atomic_dec(&au_ii(inode)->ii_generation); +} + +static inline int au_iigen_test(struct inode *inode, unsigned int sigen) +{ + int err; + + err = 0; + if (unlikely(inode && au_iigen(inode) != sigen)) + err = -EIO; + + return err; +} + +/* ---------------------------------------------------------------------- */ + +static inline aufs_bindex_t au_ii_br_id(struct inode *inode, + aufs_bindex_t bindex) +{ + IiMustAnyLock(inode); + return au_ii(inode)->ii_hinode[0 + bindex].hi_id; +} + +static inline aufs_bindex_t au_ibstart(struct inode *inode) +{ + IiMustAnyLock(inode); + return au_ii(inode)->ii_bstart; +} + +static inline aufs_bindex_t au_ibend(struct inode *inode) +{ + IiMustAnyLock(inode); + return au_ii(inode)->ii_bend; +} + +static inline struct au_vdir *au_ivdir(struct inode *inode) +{ + IiMustAnyLock(inode); + return au_ii(inode)->ii_vdir; +} + +static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex) +{ + IiMustAnyLock(inode); + return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry; +} + +static inline void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex) +{ + IiMustWriteLock(inode); + au_ii(inode)->ii_bstart = bindex; +} + +static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex) +{ + IiMustWriteLock(inode); + au_ii(inode)->ii_bend = bindex; +} + +static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir) +{ + IiMustWriteLock(inode); + au_ii(inode)->ii_vdir = vdir; +} + +static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex) +{ + IiMustAnyLock(inode); + return au_ii(inode)->ii_hinode + bindex; +} + +/* ---------------------------------------------------------------------- */ + +static inline struct dentry *au_pinned_parent(struct au_pin *pin) +{ + if (pin) + return pin->parent; + return NULL; +} + +static inline struct inode *au_pinned_h_dir(struct au_pin *pin) +{ + if (pin && pin->hdir) + return pin->hdir->hi_inode; + return NULL; +} + +static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin) +{ + if (pin) + return pin->hdir; + return NULL; +} + +static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry) +{ + if (pin) + pin->dentry = dentry; +} + +static inline void au_pin_set_parent_lflag(struct au_pin *pin, + unsigned char lflag) +{ + if (pin) { + if (lflag) + au_fset_pin(pin->flags, DI_LOCKED); + else + au_fclr_pin(pin->flags, DI_LOCKED); + } +} + +static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent) +{ + if (pin) { + dput(pin->parent); + pin->parent = dget(parent); + } +} + +/* ---------------------------------------------------------------------- */ + +struct au_branch; +#ifdef CONFIG_AUFS_HNOTIFY +struct au_hnotify_op { + void (*ctl)(struct au_hinode *hinode, int do_set); + int (*alloc)(struct au_hinode *hinode); + void (*free)(struct au_hinode *hinode); + + void (*fin)(void); + int (*init)(void); + + int (*reset_br)(unsigned int udba, struct au_branch *br, int perm); + void (*fin_br)(struct au_branch *br); + int (*init_br)(struct au_branch *br, int perm); +}; + +/* hnotify.c */ +int au_hn_alloc(struct au_hinode *hinode, struct inode *inode); +void au_hn_free(struct au_hinode *hinode); +void au_hn_ctl(struct au_hinode *hinode, int do_set); +void au_hn_reset(struct inode *inode, unsigned int flags); +int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, + struct qstr *h_child_qstr, struct inode *h_child_inode); +int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm); +int au_hnotify_init_br(struct au_branch *br, int perm); +void au_hnotify_fin_br(struct au_branch *br); +int __init au_hnotify_init(void); +void au_hnotify_fin(void); + +/* hfsnotify.c */ +extern const struct au_hnotify_op au_hnotify_op; + +static inline +void au_hn_init(struct au_hinode *hinode) +{ + hinode->hi_notify = NULL; +} + +static inline struct au_hnotify *au_hn(struct au_hinode *hinode) +{ + return hinode->hi_notify; +} + +#else +static inline +int au_hn_alloc(struct au_hinode *hinode __maybe_unused, + struct inode *inode __maybe_unused) +{ + return -EOPNOTSUPP; +} + +static inline struct au_hnotify *au_hn(struct au_hinode *hinode) +{ + return NULL; +} + +AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused) +AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused, + int do_set __maybe_unused) +AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused, + unsigned int flags __maybe_unused) +AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused, + struct au_branch *br __maybe_unused, + int perm __maybe_unused) +AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused, + int perm __maybe_unused) +AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused) +AuStubInt0(__init au_hnotify_init, void) +AuStubVoid(au_hnotify_fin, void) +AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused) +#endif /* CONFIG_AUFS_HNOTIFY */ + +static inline void au_hn_suspend(struct au_hinode *hdir) +{ + au_hn_ctl(hdir, /*do_set*/0); +} + +static inline void au_hn_resume(struct au_hinode *hdir) +{ + au_hn_ctl(hdir, /*do_set*/1); +} + +static inline void au_hn_imtx_lock(struct au_hinode *hdir) +{ + mutex_lock(&hdir->hi_inode->i_mutex); + au_hn_suspend(hdir); +} + +static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir, + unsigned int sc __maybe_unused) +{ + mutex_lock_nested(&hdir->hi_inode->i_mutex, sc); + au_hn_suspend(hdir); +} + +static inline void au_hn_imtx_unlock(struct au_hinode *hdir) +{ + au_hn_resume(hdir); + mutex_unlock(&hdir->hi_inode->i_mutex); +} + +#endif /* __KERNEL__ */ +#endif /* __AUFS_INODE_H__ */ diff --git a/ubuntu/aufs/ioctl.c b/ubuntu/aufs/ioctl.c new file mode 100644 index 000000000000..93151b350ee3 --- /dev/null +++ b/ubuntu/aufs/ioctl.c @@ -0,0 +1,197 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * ioctl + * plink-management and readdir in userspace. + * assist the pathconf(3) wrapper library. + */ + +#include +#include "aufs.h" + +static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg) +{ + int err, fd; + aufs_bindex_t wbi, bindex, bend; + struct file *h_file; + struct super_block *sb; + struct dentry *root; + struct au_branch *br; + struct aufs_wbr_fd wbrfd = { + .oflags = au_dir_roflags, + .brid = -1 + }; + const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY + | O_NOATIME | O_CLOEXEC; + + AuDebugOn(wbrfd.oflags & ~valid); + + if (arg) { + err = copy_from_user(&wbrfd, arg, sizeof(wbrfd)); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + + err = -EINVAL; + AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid); + wbrfd.oflags |= au_dir_roflags; + AuDbg("0%o\n", wbrfd.oflags); + if (unlikely(wbrfd.oflags & ~valid)) + goto out; + } + + fd = get_unused_fd(); + err = fd; + if (unlikely(fd < 0)) + goto out; + + h_file = ERR_PTR(-EINVAL); + wbi = 0; + br = NULL; + sb = path->dentry->d_sb; + root = sb->s_root; + aufs_read_lock(root, AuLock_IR); + bend = au_sbend(sb); + if (wbrfd.brid >= 0) { + wbi = au_br_index(sb, wbrfd.brid); + if (unlikely(wbi < 0 || wbi > bend)) + goto out_unlock; + } + + h_file = ERR_PTR(-ENOENT); + br = au_sbr(sb, wbi); + if (!au_br_writable(br->br_perm)) { + if (arg) + goto out_unlock; + + bindex = wbi + 1; + wbi = -1; + for (; bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + if (au_br_writable(br->br_perm)) { + wbi = bindex; + br = au_sbr(sb, wbi); + break; + } + } + } + AuDbg("wbi %d\n", wbi); + if (wbi >= 0) + h_file = au_h_open(root, wbi, wbrfd.oflags, NULL); + +out_unlock: + aufs_read_unlock(root, AuLock_IR); + err = PTR_ERR(h_file); + if (IS_ERR(h_file)) + goto out_fd; + + atomic_dec(&br->br_count); /* cf. au_h_open() */ + fd_install(fd, h_file); + err = fd; + goto out; /* success */ + +out_fd: + put_unused_fd(fd); +out: + AuTraceErr(err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg) +{ + long err; + + switch (cmd) { + case AUFS_CTL_RDU: + case AUFS_CTL_RDU_INO: + err = au_rdu_ioctl(file, cmd, arg); + break; + + case AUFS_CTL_WBR_FD: + err = au_wbr_fd(&file->f_path, (void __user *)arg); + break; + + case AUFS_CTL_IBUSY: + err = au_ibusy_ioctl(file, arg); + break; + + default: + /* do not call the lower */ + AuDbg("0x%x\n", cmd); + err = -ENOTTY; + } + + AuTraceErr(err); + return err; +} + +long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg) +{ + long err; + + switch (cmd) { + case AUFS_CTL_WBR_FD: + err = au_wbr_fd(&file->f_path, (void __user *)arg); + break; + + default: + /* do not call the lower */ + AuDbg("0x%x\n", cmd); + err = -ENOTTY; + } + + AuTraceErr(err); + return err; +} + +#ifdef CONFIG_COMPAT +long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, + unsigned long arg) +{ + long err; + + switch (cmd) { + case AUFS_CTL_RDU: + case AUFS_CTL_RDU_INO: + err = au_rdu_compat_ioctl(file, cmd, arg); + break; + + case AUFS_CTL_IBUSY: + err = au_ibusy_compat_ioctl(file, arg); + break; + + default: + err = aufs_ioctl_dir(file, cmd, arg); + } + + AuTraceErr(err); + return err; +} + +#if 0 /* unused yet */ +long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif +#endif diff --git a/ubuntu/aufs/loop.c b/ubuntu/aufs/loop.c new file mode 100644 index 000000000000..4998192e6f17 --- /dev/null +++ b/ubuntu/aufs/loop.c @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * support for loopback block device as a branch + */ + +#include +#include "aufs.h" + +/* + * test if two lower dentries have overlapping branches. + */ +int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding) +{ + struct super_block *h_sb; + struct loop_device *l; + + h_sb = h_adding->d_sb; + if (MAJOR(h_sb->s_dev) != LOOP_MAJOR) + return 0; + + l = h_sb->s_bdev->bd_disk->private_data; + h_adding = l->lo_backing_file->f_dentry; + /* + * h_adding can be local NFS. + * in this case aufs cannot detect the loop. + */ + if (unlikely(h_adding->d_sb == sb)) + return 1; + return !!au_test_subdir(h_adding, sb->s_root); +} + +/* true if a kernel thread named 'loop[0-9].*' accesses a file */ +int au_test_loopback_kthread(void) +{ + int ret; + struct task_struct *tsk = current; + + ret = 0; + if (tsk->flags & PF_KTHREAD) { + const char c = tsk->comm[4]; + ret = ('0' <= c && c <= '9' + && !strncmp(tsk->comm, "loop", 4)); + } + + return ret; +} + +/* ---------------------------------------------------------------------- */ + +#define au_warn_loopback_step 16 +static int au_warn_loopback_nelem = au_warn_loopback_step; +static unsigned long *au_warn_loopback_array; + +void au_warn_loopback(struct super_block *h_sb) +{ + int i, new_nelem; + unsigned long *a, magic; + static DEFINE_SPINLOCK(spin); + + magic = h_sb->s_magic; + spin_lock(&spin); + a = au_warn_loopback_array; + for (i = 0; i < au_warn_loopback_nelem && *a; i++) + if (a[i] == magic) { + spin_unlock(&spin); + return; + } + + /* h_sb is new to us, print it */ + if (i < au_warn_loopback_nelem) { + a[i] = magic; + goto pr; + } + + /* expand the array */ + new_nelem = au_warn_loopback_nelem + au_warn_loopback_step; + a = au_kzrealloc(au_warn_loopback_array, + au_warn_loopback_nelem * sizeof(unsigned long), + new_nelem * sizeof(unsigned long), GFP_ATOMIC); + if (a) { + au_warn_loopback_nelem = new_nelem; + au_warn_loopback_array = a; + a[i] = magic; + goto pr; + } + + spin_unlock(&spin); + AuWarn1("realloc failed, ignored\n"); + return; + +pr: + spin_unlock(&spin); + pr_warning("you may want to try another patch for loopback file " + "on %s(0x%lx) branch\n", au_sbtype(h_sb), magic); +} + +int au_loopback_init(void) +{ + int err; + struct super_block *sb __maybe_unused; + + AuDebugOn(sizeof(sb->s_magic) != sizeof(unsigned long)); + + err = 0; + au_warn_loopback_array = kcalloc(au_warn_loopback_step, + sizeof(unsigned long), GFP_NOFS); + if (unlikely(!au_warn_loopback_array)) + err = -ENOMEM; + + return err; +} + +void au_loopback_fin(void) +{ + kfree(au_warn_loopback_array); +} diff --git a/ubuntu/aufs/loop.h b/ubuntu/aufs/loop.h new file mode 100644 index 000000000000..fc156e804530 --- /dev/null +++ b/ubuntu/aufs/loop.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * support for loopback mount as a branch + */ + +#ifndef __AUFS_LOOP_H__ +#define __AUFS_LOOP_H__ + +#ifdef __KERNEL__ + +struct dentry; +struct super_block; + +#ifdef CONFIG_AUFS_BDEV_LOOP +/* loop.c */ +int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding); +int au_test_loopback_kthread(void); +void au_warn_loopback(struct super_block *h_sb); + +int au_loopback_init(void); +void au_loopback_fin(void); +#else +AuStubInt0(au_test_loopback_overlap, struct super_block *sb, + struct dentry *h_adding) +AuStubInt0(au_test_loopback_kthread, void) +AuStubVoid(au_warn_loopback, struct super_block *h_sb) + +AuStubInt0(au_loopback_init, void) +AuStubVoid(au_loopback_fin, void) +#endif /* BLK_DEV_LOOP */ + +#endif /* __KERNEL__ */ +#endif /* __AUFS_LOOP_H__ */ diff --git a/ubuntu/aufs/magic.mk b/ubuntu/aufs/magic.mk new file mode 100644 index 000000000000..3e6387b01b49 --- /dev/null +++ b/ubuntu/aufs/magic.mk @@ -0,0 +1,54 @@ + +# defined in ${srctree}/fs/fuse/inode.c +# tristate +ifdef CONFIG_FUSE_FS +ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546 +endif + +# defined in ${srctree}/fs/ocfs2/ocfs2_fs.h +# tristate +ifdef CONFIG_OCFS2_FS +ccflags-y += -DOCFS2_SUPER_MAGIC=0x7461636f +endif + +# defined in ${srctree}/fs/ocfs2/dlm/userdlm.h +# tristate +ifdef CONFIG_OCFS2_FS_O2CB +ccflags-y += -DDLMFS_MAGIC=0x76a9f425 +endif + +# defined in ${srctree}/fs/cifs/cifsfs.c +# tristate +ifdef CONFIG_CIFS_FS +ccflags-y += -DCIFS_MAGIC_NUMBER=0xFF534D42 +endif + +# defined in ${srctree}/fs/xfs/xfs_sb.h +# tristate +ifdef CONFIG_XFS_FS +ccflags-y += -DXFS_SB_MAGIC=0x58465342 +endif + +# defined in ${srctree}/fs/configfs/mount.c +# tristate +ifdef CONFIG_CONFIGFS_FS +ccflags-y += -DCONFIGFS_MAGIC=0x62656570 +endif + +# defined in ${srctree}/fs/9p/v9fs.h +# tristate +ifdef CONFIG_9P_FS +ccflags-y += -DV9FS_MAGIC=0x01021997 +endif + +# defined in ${srctree}/fs/ubifs/ubifs.h +# tristate +ifdef CONFIG_UBIFS_FS +ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905 +endif + +# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h +# tristate +ifdef CONFIG_HFSPLUS_FS +ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b +endif diff --git a/ubuntu/aufs/module.c b/ubuntu/aufs/module.c new file mode 100644 index 000000000000..175f03475776 --- /dev/null +++ b/ubuntu/aufs/module.c @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * module global variables and operations + */ + +#include +#include +#include "aufs.h" + +void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp) +{ + if (new_sz <= nused) + return p; + + p = krealloc(p, new_sz, gfp); + if (p) + memset(p + nused, 0, new_sz - nused); + return p; +} + +/* ---------------------------------------------------------------------- */ + +/* + * aufs caches + */ +struct kmem_cache *au_cachep[AuCache_Last]; +static int __init au_cache_init(void) +{ + au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once); + if (au_cachep[AuCache_DINFO]) + /* SLAB_DESTROY_BY_RCU */ + au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr, + au_icntnr_init_once); + if (au_cachep[AuCache_ICNTNR]) + au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo, + au_fi_init_once); + if (au_cachep[AuCache_FINFO]) + au_cachep[AuCache_VDIR] = AuCache(au_vdir); + if (au_cachep[AuCache_VDIR]) + au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr); + if (au_cachep[AuCache_DEHSTR]) + return 0; + + return -ENOMEM; +} + +static void au_cache_fin(void) +{ + int i; + + /* including AuCache_HNOTIFY */ + for (i = 0; i < AuCache_Last; i++) + if (au_cachep[i]) { + kmem_cache_destroy(au_cachep[i]); + au_cachep[i] = NULL; + } +} + +/* ---------------------------------------------------------------------- */ + +int au_dir_roflags; + +#ifdef CONFIG_AUFS_SBILIST +/* + * iterate_supers_type() doesn't protect us from + * remounting (branch management) + */ +struct au_splhead au_sbilist; +#endif + +struct lock_class_key au_lc_key[AuLcKey_Last]; + +/* + * functions for module interface. + */ +MODULE_LICENSE("GPL"); +/* MODULE_LICENSE("GPL v2"); */ +MODULE_AUTHOR("Junjiro R. Okajima "); +MODULE_DESCRIPTION(AUFS_NAME + " -- Advanced multi layered unification filesystem"); +MODULE_VERSION(AUFS_VERSION); + +/* this module parameter has no meaning when SYSFS is disabled */ +int sysaufs_brs = 1; +MODULE_PARM_DESC(brs, "use /fs/aufs/si_*/brN"); +module_param_named(brs, sysaufs_brs, int, S_IRUGO); + +/* ---------------------------------------------------------------------- */ + +static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */ + +int au_seq_path(struct seq_file *seq, struct path *path) +{ + return seq_path(seq, path, au_esc_chars); +} + +/* ---------------------------------------------------------------------- */ + +static int __init aufs_init(void) +{ + int err, i; + char *p; + + p = au_esc_chars; + for (i = 1; i <= ' '; i++) + *p++ = i; + *p++ = '\\'; + *p++ = '\x7f'; + *p = 0; + + au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE); + + au_sbilist_init(); + sysaufs_brs_init(); + au_debug_init(); + au_dy_init(); + err = sysaufs_init(); + if (unlikely(err)) + goto out; + err = au_procfs_init(); + if (unlikely(err)) + goto out_sysaufs; + err = au_wkq_init(); + if (unlikely(err)) + goto out_procfs; + err = au_loopback_init(); + if (unlikely(err)) + goto out_wkq; + err = au_hnotify_init(); + if (unlikely(err)) + goto out_loopback; + err = au_sysrq_init(); + if (unlikely(err)) + goto out_hin; + err = au_cache_init(); + if (unlikely(err)) + goto out_sysrq; + err = register_filesystem(&aufs_fs_type); + if (unlikely(err)) + goto out_cache; + /* since we define pr_fmt, call printk directly */ + printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n"); + goto out; /* success */ + +out_cache: + au_cache_fin(); +out_sysrq: + au_sysrq_fin(); +out_hin: + au_hnotify_fin(); +out_loopback: + au_loopback_fin(); +out_wkq: + au_wkq_fin(); +out_procfs: + au_procfs_fin(); +out_sysaufs: + sysaufs_fin(); + au_dy_fin(); +out: + return err; +} + +static void __exit aufs_exit(void) +{ + unregister_filesystem(&aufs_fs_type); + au_cache_fin(); + au_sysrq_fin(); + au_hnotify_fin(); + au_loopback_fin(); + au_wkq_fin(); + au_procfs_fin(); + sysaufs_fin(); + au_dy_fin(); +} + +module_init(aufs_init); +module_exit(aufs_exit); diff --git a/ubuntu/aufs/module.h b/ubuntu/aufs/module.h new file mode 100644 index 000000000000..e61fa54768dc --- /dev/null +++ b/ubuntu/aufs/module.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * module initialization and module-global + */ + +#ifndef __AUFS_MODULE_H__ +#define __AUFS_MODULE_H__ + +#ifdef __KERNEL__ + +#include + +struct path; +struct seq_file; + +/* module parameters */ +extern int sysaufs_brs; + +/* ---------------------------------------------------------------------- */ + +extern int au_dir_roflags; + +enum { + AuLcNonDir_FIINFO, + AuLcNonDir_DIINFO, + AuLcNonDir_IIINFO, + + AuLcDir_FIINFO, + AuLcDir_DIINFO, + AuLcDir_IIINFO, + + AuLcSymlink_DIINFO, + AuLcSymlink_IIINFO, + + AuLcKey_Last +}; +extern struct lock_class_key au_lc_key[AuLcKey_Last]; + +void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp); +int au_seq_path(struct seq_file *seq, struct path *path); + +#ifdef CONFIG_PROC_FS +/* procfs.c */ +int __init au_procfs_init(void); +void au_procfs_fin(void); +#else +AuStubInt0(au_procfs_init, void); +AuStubVoid(au_procfs_fin, void); +#endif + +/* ---------------------------------------------------------------------- */ + +/* kmem cache */ +enum { + AuCache_DINFO, + AuCache_ICNTNR, + AuCache_FINFO, + AuCache_VDIR, + AuCache_DEHSTR, +#ifdef CONFIG_AUFS_HNOTIFY + AuCache_HNOTIFY, +#endif + AuCache_Last +}; + +#define AuCacheFlags (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD) +#define AuCache(type) KMEM_CACHE(type, AuCacheFlags) +#define AuCacheCtor(type, ctor) \ + kmem_cache_create(#type, sizeof(struct type), \ + __alignof__(struct type), AuCacheFlags, ctor) + +extern struct kmem_cache *au_cachep[]; + +#define AuCacheFuncs(name, index) \ +static inline struct au_##name *au_cache_alloc_##name(void) \ +{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \ +static inline void au_cache_free_##name(struct au_##name *p) \ +{ kmem_cache_free(au_cachep[AuCache_##index], p); } + +AuCacheFuncs(dinfo, DINFO); +AuCacheFuncs(icntnr, ICNTNR); +AuCacheFuncs(finfo, FINFO); +AuCacheFuncs(vdir, VDIR); +AuCacheFuncs(vdir_dehstr, DEHSTR); +#ifdef CONFIG_AUFS_HNOTIFY +AuCacheFuncs(hnotify, HNOTIFY); +#endif + +#endif /* __KERNEL__ */ +#endif /* __AUFS_MODULE_H__ */ diff --git a/ubuntu/aufs/opts.c b/ubuntu/aufs/opts.c new file mode 100644 index 000000000000..4e868a6de6c7 --- /dev/null +++ b/ubuntu/aufs/opts.c @@ -0,0 +1,1679 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * mount options/flags + */ + +#include +#include +#include +#include /* a distribution requires */ +#include +#include "aufs.h" + +/* ---------------------------------------------------------------------- */ + +enum { + Opt_br, + Opt_add, Opt_del, Opt_mod, Opt_reorder, Opt_append, Opt_prepend, + Opt_idel, Opt_imod, Opt_ireorder, + Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash, Opt_rendir, + Opt_rdblk_def, Opt_rdhash_def, + Opt_xino, Opt_zxino, Opt_noxino, + Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino, + Opt_trunc_xino_path, Opt_itrunc_xino, + Opt_trunc_xib, Opt_notrunc_xib, + Opt_shwh, Opt_noshwh, + Opt_plink, Opt_noplink, Opt_list_plink, + Opt_udba, + Opt_dio, Opt_nodio, + /* Opt_lock, Opt_unlock, */ + Opt_cmd, Opt_cmd_args, + Opt_diropq_a, Opt_diropq_w, + Opt_warn_perm, Opt_nowarn_perm, + Opt_wbr_copyup, Opt_wbr_create, + Opt_refrof, Opt_norefrof, + Opt_verbose, Opt_noverbose, + Opt_sum, Opt_nosum, Opt_wsum, + Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err +}; + +static match_table_t options = { + {Opt_br, "br=%s"}, + {Opt_br, "br:%s"}, + + {Opt_add, "add=%d:%s"}, + {Opt_add, "add:%d:%s"}, + {Opt_add, "ins=%d:%s"}, + {Opt_add, "ins:%d:%s"}, + {Opt_append, "append=%s"}, + {Opt_append, "append:%s"}, + {Opt_prepend, "prepend=%s"}, + {Opt_prepend, "prepend:%s"}, + + {Opt_del, "del=%s"}, + {Opt_del, "del:%s"}, + /* {Opt_idel, "idel:%d"}, */ + {Opt_mod, "mod=%s"}, + {Opt_mod, "mod:%s"}, + /* {Opt_imod, "imod:%d:%s"}, */ + + {Opt_dirwh, "dirwh=%d"}, + + {Opt_xino, "xino=%s"}, + {Opt_noxino, "noxino"}, + {Opt_trunc_xino, "trunc_xino"}, + {Opt_trunc_xino_v, "trunc_xino_v=%d:%d"}, + {Opt_notrunc_xino, "notrunc_xino"}, + {Opt_trunc_xino_path, "trunc_xino=%s"}, + {Opt_itrunc_xino, "itrunc_xino=%d"}, + /* {Opt_zxino, "zxino=%s"}, */ + {Opt_trunc_xib, "trunc_xib"}, + {Opt_notrunc_xib, "notrunc_xib"}, + +#ifdef CONFIG_PROC_FS + {Opt_plink, "plink"}, +#else + {Opt_ignore_silent, "plink"}, +#endif + + {Opt_noplink, "noplink"}, + +#ifdef CONFIG_AUFS_DEBUG + {Opt_list_plink, "list_plink"}, +#endif + + {Opt_udba, "udba=%s"}, + + {Opt_dio, "dio"}, + {Opt_nodio, "nodio"}, + + {Opt_diropq_a, "diropq=always"}, + {Opt_diropq_a, "diropq=a"}, + {Opt_diropq_w, "diropq=whiteouted"}, + {Opt_diropq_w, "diropq=w"}, + + {Opt_warn_perm, "warn_perm"}, + {Opt_nowarn_perm, "nowarn_perm"}, + + /* keep them temporary */ + {Opt_ignore_silent, "coo=%s"}, + {Opt_ignore_silent, "nodlgt"}, + {Opt_ignore_silent, "nodirperm1"}, + {Opt_ignore_silent, "clean_plink"}, + +#ifdef CONFIG_AUFS_SHWH + {Opt_shwh, "shwh"}, +#endif + {Opt_noshwh, "noshwh"}, + + {Opt_rendir, "rendir=%d"}, + + {Opt_refrof, "refrof"}, + {Opt_norefrof, "norefrof"}, + + {Opt_verbose, "verbose"}, + {Opt_verbose, "v"}, + {Opt_noverbose, "noverbose"}, + {Opt_noverbose, "quiet"}, + {Opt_noverbose, "q"}, + {Opt_noverbose, "silent"}, + + {Opt_sum, "sum"}, + {Opt_nosum, "nosum"}, + {Opt_wsum, "wsum"}, + + {Opt_rdcache, "rdcache=%d"}, + {Opt_rdblk, "rdblk=%d"}, + {Opt_rdblk_def, "rdblk=def"}, + {Opt_rdhash, "rdhash=%d"}, + {Opt_rdhash_def, "rdhash=def"}, + + {Opt_wbr_create, "create=%s"}, + {Opt_wbr_create, "create_policy=%s"}, + {Opt_wbr_copyup, "cpup=%s"}, + {Opt_wbr_copyup, "copyup=%s"}, + {Opt_wbr_copyup, "copyup_policy=%s"}, + + /* internal use for the scripts */ + {Opt_ignore_silent, "si=%s"}, + + {Opt_br, "dirs=%s"}, + {Opt_ignore, "debug=%d"}, + {Opt_ignore, "delete=whiteout"}, + {Opt_ignore, "delete=all"}, + {Opt_ignore, "imap=%s"}, + + /* temporary workaround, due to old mount(8)? */ + {Opt_ignore_silent, "relatime"}, + + {Opt_err, NULL} +}; + +/* ---------------------------------------------------------------------- */ + +static const char *au_parser_pattern(int val, struct match_token *token) +{ + while (token->pattern) { + if (token->token == val) + return token->pattern; + token++; + } + BUG(); + return "??"; +} + +/* ---------------------------------------------------------------------- */ + +static match_table_t brperm = { + {AuBrPerm_RO, AUFS_BRPERM_RO}, + {AuBrPerm_RR, AUFS_BRPERM_RR}, + {AuBrPerm_RW, AUFS_BRPERM_RW}, + {0, NULL} +}; + +static match_table_t brrattr = { + {AuBrRAttr_WH, AUFS_BRRATTR_WH}, + {0, NULL} +}; + +static match_table_t brwattr = { + {AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH}, + {0, NULL} +}; + +#define AuBrStr_LONGEST AUFS_BRPERM_RW "+" AUFS_BRWATTR_NLWH + +static int br_attr_val(char *str, match_table_t table, substring_t args[]) +{ + int attr, v; + char *p; + + attr = 0; + do { + p = strchr(str, '+'); + if (p) + *p = 0; + v = match_token(str, table, args); + if (v) + attr |= v; + else { + if (p) + *p = '+'; + pr_warning("ignored branch attribute %s\n", str); + break; + } + if (p) + str = p + 1; + } while (p); + + return attr; +} + +static int noinline_for_stack br_perm_val(char *perm) +{ + int val; + char *p; + substring_t args[MAX_OPT_ARGS]; + + p = strchr(perm, '+'); + if (p) + *p = 0; + val = match_token(perm, brperm, args); + if (!val) { + if (p) + *p = '+'; + pr_warning("ignored branch permission %s\n", perm); + val = AuBrPerm_RO; + goto out; + } + if (!p) + goto out; + + switch (val) { + case AuBrPerm_RO: + case AuBrPerm_RR: + val |= br_attr_val(p + 1, brrattr, args); + break; + case AuBrPerm_RW: + val |= br_attr_val(p + 1, brwattr, args); + break; + } + +out: + return val; +} + +/* Caller should free the return value */ +char *au_optstr_br_perm(int brperm) +{ + char *p, a[sizeof(AuBrStr_LONGEST)]; + int sz; + +#define SetPerm(str) do { \ + sz = sizeof(str); \ + memcpy(a, str, sz); \ + p = a + sz - 1; \ + } while (0) + +#define AppendAttr(flag, str) do { \ + if (brperm & flag) { \ + sz = sizeof(str); \ + *p++ = '+'; \ + memcpy(p, str, sz); \ + p += sz - 1; \ + } \ + } while (0) + + switch (brperm & AuBrPerm_Mask) { + case AuBrPerm_RO: + SetPerm(AUFS_BRPERM_RO); + break; + case AuBrPerm_RR: + SetPerm(AUFS_BRPERM_RR); + break; + case AuBrPerm_RW: + SetPerm(AUFS_BRPERM_RW); + break; + default: + AuDebugOn(1); + } + + AppendAttr(AuBrRAttr_WH, AUFS_BRRATTR_WH); + AppendAttr(AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH); + + AuDebugOn(strlen(a) >= sizeof(a)); + return kstrdup(a, GFP_NOFS); +#undef SetPerm +#undef AppendAttr +} + +/* ---------------------------------------------------------------------- */ + +static match_table_t udbalevel = { + {AuOpt_UDBA_REVAL, "reval"}, + {AuOpt_UDBA_NONE, "none"}, +#ifdef CONFIG_AUFS_HNOTIFY + {AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */ +#ifdef CONFIG_AUFS_HFSNOTIFY + {AuOpt_UDBA_HNOTIFY, "fsnotify"}, +#endif +#endif + {-1, NULL} +}; + +static int noinline_for_stack udba_val(char *str) +{ + substring_t args[MAX_OPT_ARGS]; + + return match_token(str, udbalevel, args); +} + +const char *au_optstr_udba(int udba) +{ + return au_parser_pattern(udba, (void *)udbalevel); +} + +/* ---------------------------------------------------------------------- */ + +static match_table_t au_wbr_create_policy = { + {AuWbrCreate_TDP, "tdp"}, + {AuWbrCreate_TDP, "top-down-parent"}, + {AuWbrCreate_RR, "rr"}, + {AuWbrCreate_RR, "round-robin"}, + {AuWbrCreate_MFS, "mfs"}, + {AuWbrCreate_MFS, "most-free-space"}, + {AuWbrCreate_MFSV, "mfs:%d"}, + {AuWbrCreate_MFSV, "most-free-space:%d"}, + + {AuWbrCreate_MFSRR, "mfsrr:%d"}, + {AuWbrCreate_MFSRRV, "mfsrr:%d:%d"}, + {AuWbrCreate_PMFS, "pmfs"}, + {AuWbrCreate_PMFSV, "pmfs:%d"}, + + {-1, NULL} +}; + +/* + * cf. linux/lib/parser.c and cmdline.c + * gave up calling memparse() since it uses simple_strtoull() instead of + * kstrto...(). + */ +static int noinline_for_stack +au_match_ull(substring_t *s, unsigned long long *result) +{ + int err; + unsigned int len; + char a[32]; + + err = -ERANGE; + len = s->to - s->from; + if (len + 1 <= sizeof(a)) { + memcpy(a, s->from, len); + a[len] = '\0'; + err = kstrtoull(a, 0, result); + } + return err; +} + +static int au_wbr_mfs_wmark(substring_t *arg, char *str, + struct au_opt_wbr_create *create) +{ + int err; + unsigned long long ull; + + err = 0; + if (!au_match_ull(arg, &ull)) + create->mfsrr_watermark = ull; + else { + pr_err("bad integer in %s\n", str); + err = -EINVAL; + } + + return err; +} + +static int au_wbr_mfs_sec(substring_t *arg, char *str, + struct au_opt_wbr_create *create) +{ + int n, err; + + err = 0; + if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC) + create->mfs_second = n; + else { + pr_err("bad integer in %s\n", str); + err = -EINVAL; + } + + return err; +} + +static int noinline_for_stack +au_wbr_create_val(char *str, struct au_opt_wbr_create *create) +{ + int err, e; + substring_t args[MAX_OPT_ARGS]; + + err = match_token(str, au_wbr_create_policy, args); + create->wbr_create = err; + switch (err) { + case AuWbrCreate_MFSRRV: + e = au_wbr_mfs_wmark(&args[0], str, create); + if (!e) + e = au_wbr_mfs_sec(&args[1], str, create); + if (unlikely(e)) + err = e; + break; + case AuWbrCreate_MFSRR: + e = au_wbr_mfs_wmark(&args[0], str, create); + if (unlikely(e)) { + err = e; + break; + } + /*FALLTHROUGH*/ + case AuWbrCreate_MFS: + case AuWbrCreate_PMFS: + create->mfs_second = AUFS_MFS_DEF_SEC; + break; + case AuWbrCreate_MFSV: + case AuWbrCreate_PMFSV: + e = au_wbr_mfs_sec(&args[0], str, create); + if (unlikely(e)) + err = e; + break; + } + + return err; +} + +const char *au_optstr_wbr_create(int wbr_create) +{ + return au_parser_pattern(wbr_create, (void *)au_wbr_create_policy); +} + +static match_table_t au_wbr_copyup_policy = { + {AuWbrCopyup_TDP, "tdp"}, + {AuWbrCopyup_TDP, "top-down-parent"}, + {AuWbrCopyup_BUP, "bup"}, + {AuWbrCopyup_BUP, "bottom-up-parent"}, + {AuWbrCopyup_BU, "bu"}, + {AuWbrCopyup_BU, "bottom-up"}, + {-1, NULL} +}; + +static int noinline_for_stack au_wbr_copyup_val(char *str) +{ + substring_t args[MAX_OPT_ARGS]; + + return match_token(str, au_wbr_copyup_policy, args); +} + +const char *au_optstr_wbr_copyup(int wbr_copyup) +{ + return au_parser_pattern(wbr_copyup, (void *)au_wbr_copyup_policy); +} + +/* ---------------------------------------------------------------------- */ + +static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + +static void dump_opts(struct au_opts *opts) +{ +#ifdef CONFIG_AUFS_DEBUG + /* reduce stack space */ + union { + struct au_opt_add *add; + struct au_opt_del *del; + struct au_opt_mod *mod; + struct au_opt_xino *xino; + struct au_opt_xino_itrunc *xino_itrunc; + struct au_opt_wbr_create *create; + } u; + struct au_opt *opt; + + opt = opts->opt; + while (opt->type != Opt_tail) { + switch (opt->type) { + case Opt_add: + u.add = &opt->add; + AuDbg("add {b%d, %s, 0x%x, %p}\n", + u.add->bindex, u.add->pathname, u.add->perm, + u.add->path.dentry); + break; + case Opt_del: + case Opt_idel: + u.del = &opt->del; + AuDbg("del {%s, %p}\n", + u.del->pathname, u.del->h_path.dentry); + break; + case Opt_mod: + case Opt_imod: + u.mod = &opt->mod; + AuDbg("mod {%s, 0x%x, %p}\n", + u.mod->path, u.mod->perm, u.mod->h_root); + break; + case Opt_append: + u.add = &opt->add; + AuDbg("append {b%d, %s, 0x%x, %p}\n", + u.add->bindex, u.add->pathname, u.add->perm, + u.add->path.dentry); + break; + case Opt_prepend: + u.add = &opt->add; + AuDbg("prepend {b%d, %s, 0x%x, %p}\n", + u.add->bindex, u.add->pathname, u.add->perm, + u.add->path.dentry); + break; + case Opt_dirwh: + AuDbg("dirwh %d\n", opt->dirwh); + break; + case Opt_rdcache: + AuDbg("rdcache %d\n", opt->rdcache); + break; + case Opt_rdblk: + AuDbg("rdblk %u\n", opt->rdblk); + break; + case Opt_rdblk_def: + AuDbg("rdblk_def\n"); + break; + case Opt_rdhash: + AuDbg("rdhash %u\n", opt->rdhash); + break; + case Opt_rdhash_def: + AuDbg("rdhash_def\n"); + break; + case Opt_xino: + u.xino = &opt->xino; + AuDbg("xino {%s %.*s}\n", + u.xino->path, + AuDLNPair(u.xino->file->f_dentry)); + break; + case Opt_trunc_xino: + AuLabel(trunc_xino); + break; + case Opt_notrunc_xino: + AuLabel(notrunc_xino); + break; + case Opt_trunc_xino_path: + case Opt_itrunc_xino: + u.xino_itrunc = &opt->xino_itrunc; + AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex); + break; + + case Opt_noxino: + AuLabel(noxino); + break; + case Opt_trunc_xib: + AuLabel(trunc_xib); + break; + case Opt_notrunc_xib: + AuLabel(notrunc_xib); + break; + case Opt_shwh: + AuLabel(shwh); + break; + case Opt_noshwh: + AuLabel(noshwh); + break; + case Opt_plink: + AuLabel(plink); + break; + case Opt_noplink: + AuLabel(noplink); + break; + case Opt_list_plink: + AuLabel(list_plink); + break; + case Opt_udba: + AuDbg("udba %d, %s\n", + opt->udba, au_optstr_udba(opt->udba)); + break; + case Opt_dio: + AuLabel(dio); + break; + case Opt_nodio: + AuLabel(nodio); + break; + case Opt_diropq_a: + AuLabel(diropq_a); + break; + case Opt_diropq_w: + AuLabel(diropq_w); + break; + case Opt_warn_perm: + AuLabel(warn_perm); + break; + case Opt_nowarn_perm: + AuLabel(nowarn_perm); + break; + case Opt_refrof: + AuLabel(refrof); + break; + case Opt_norefrof: + AuLabel(norefrof); + break; + case Opt_verbose: + AuLabel(verbose); + break; + case Opt_noverbose: + AuLabel(noverbose); + break; + case Opt_sum: + AuLabel(sum); + break; + case Opt_nosum: + AuLabel(nosum); + break; + case Opt_wsum: + AuLabel(wsum); + break; + case Opt_wbr_create: + u.create = &opt->wbr_create; + AuDbg("create %d, %s\n", u.create->wbr_create, + au_optstr_wbr_create(u.create->wbr_create)); + switch (u.create->wbr_create) { + case AuWbrCreate_MFSV: + case AuWbrCreate_PMFSV: + AuDbg("%d sec\n", u.create->mfs_second); + break; + case AuWbrCreate_MFSRR: + AuDbg("%llu watermark\n", + u.create->mfsrr_watermark); + break; + case AuWbrCreate_MFSRRV: + AuDbg("%llu watermark, %d sec\n", + u.create->mfsrr_watermark, + u.create->mfs_second); + break; + } + break; + case Opt_wbr_copyup: + AuDbg("copyup %d, %s\n", opt->wbr_copyup, + au_optstr_wbr_copyup(opt->wbr_copyup)); + break; + default: + BUG(); + } + opt++; + } +#endif +} + +void au_opts_free(struct au_opts *opts) +{ + struct au_opt *opt; + + opt = opts->opt; + while (opt->type != Opt_tail) { + switch (opt->type) { + case Opt_add: + case Opt_append: + case Opt_prepend: + path_put(&opt->add.path); + break; + case Opt_del: + case Opt_idel: + path_put(&opt->del.h_path); + break; + case Opt_mod: + case Opt_imod: + dput(opt->mod.h_root); + break; + case Opt_xino: + fput(opt->xino.file); + break; + } + opt++; + } +} + +static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags, + aufs_bindex_t bindex) +{ + int err; + struct au_opt_add *add = &opt->add; + char *p; + + add->bindex = bindex; + add->perm = AuBrPerm_RO; + add->pathname = opt_str; + p = strchr(opt_str, '='); + if (p) { + *p++ = 0; + if (*p) + add->perm = br_perm_val(p); + } + + err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path); + if (!err) { + if (!p) { + add->perm = AuBrPerm_RO; + if (au_test_fs_rr(add->path.dentry->d_sb)) + add->perm = AuBrPerm_RR; + else if (!bindex && !(sb_flags & MS_RDONLY)) + add->perm = AuBrPerm_RW; + } + opt->type = Opt_add; + goto out; + } + pr_err("lookup failed %s (%d)\n", add->pathname, err); + err = -EINVAL; + +out: + return err; +} + +static int au_opts_parse_del(struct au_opt_del *del, substring_t args[]) +{ + int err; + + del->pathname = args[0].from; + AuDbg("del path %s\n", del->pathname); + + err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path); + if (unlikely(err)) + pr_err("lookup failed %s (%d)\n", del->pathname, err); + + return err; +} + +#if 0 /* reserved for future use */ +static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex, + struct au_opt_del *del, substring_t args[]) +{ + int err; + struct dentry *root; + + err = -EINVAL; + root = sb->s_root; + aufs_read_lock(root, AuLock_FLUSH); + if (bindex < 0 || au_sbend(sb) < bindex) { + pr_err("out of bounds, %d\n", bindex); + goto out; + } + + err = 0; + del->h_path.dentry = dget(au_h_dptr(root, bindex)); + del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex)); + +out: + aufs_read_unlock(root, !AuLock_IR); + return err; +} +#endif + +static int noinline_for_stack +au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[]) +{ + int err; + struct path path; + char *p; + + err = -EINVAL; + mod->path = args[0].from; + p = strchr(mod->path, '='); + if (unlikely(!p)) { + pr_err("no permssion %s\n", args[0].from); + goto out; + } + + *p++ = 0; + err = vfsub_kern_path(mod->path, lkup_dirflags, &path); + if (unlikely(err)) { + pr_err("lookup failed %s (%d)\n", mod->path, err); + goto out; + } + + mod->perm = br_perm_val(p); + AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p); + mod->h_root = dget(path.dentry); + path_put(&path); + +out: + return err; +} + +#if 0 /* reserved for future use */ +static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex, + struct au_opt_mod *mod, substring_t args[]) +{ + int err; + struct dentry *root; + + err = -EINVAL; + root = sb->s_root; + aufs_read_lock(root, AuLock_FLUSH); + if (bindex < 0 || au_sbend(sb) < bindex) { + pr_err("out of bounds, %d\n", bindex); + goto out; + } + + err = 0; + mod->perm = br_perm_val(args[1].from); + AuDbg("mod path %s, perm 0x%x, %s\n", + mod->path, mod->perm, args[1].from); + mod->h_root = dget(au_h_dptr(root, bindex)); + +out: + aufs_read_unlock(root, !AuLock_IR); + return err; +} +#endif + +static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino, + substring_t args[]) +{ + int err; + struct file *file; + + file = au_xino_create(sb, args[0].from, /*silent*/0); + err = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + + err = -EINVAL; + if (unlikely(file->f_dentry->d_sb == sb)) { + fput(file); + pr_err("%s must be outside\n", args[0].from); + goto out; + } + + err = 0; + xino->file = file; + xino->path = args[0].from; + +out: + return err; +} + +static int noinline_for_stack +au_opts_parse_xino_itrunc_path(struct super_block *sb, + struct au_opt_xino_itrunc *xino_itrunc, + substring_t args[]) +{ + int err; + aufs_bindex_t bend, bindex; + struct path path; + struct dentry *root; + + err = vfsub_kern_path(args[0].from, lkup_dirflags, &path); + if (unlikely(err)) { + pr_err("lookup failed %s (%d)\n", args[0].from, err); + goto out; + } + + xino_itrunc->bindex = -1; + root = sb->s_root; + aufs_read_lock(root, AuLock_FLUSH); + bend = au_sbend(sb); + for (bindex = 0; bindex <= bend; bindex++) { + if (au_h_dptr(root, bindex) == path.dentry) { + xino_itrunc->bindex = bindex; + break; + } + } + aufs_read_unlock(root, !AuLock_IR); + path_put(&path); + + if (unlikely(xino_itrunc->bindex < 0)) { + pr_err("no such branch %s\n", args[0].from); + err = -EINVAL; + } + +out: + return err; +} + +/* called without aufs lock */ +int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts) +{ + int err, n, token; + aufs_bindex_t bindex; + unsigned char skipped; + struct dentry *root; + struct au_opt *opt, *opt_tail; + char *opt_str; + /* reduce the stack space */ + union { + struct au_opt_xino_itrunc *xino_itrunc; + struct au_opt_wbr_create *create; + } u; + struct { + substring_t args[MAX_OPT_ARGS]; + } *a; + + err = -ENOMEM; + a = kmalloc(sizeof(*a), GFP_NOFS); + if (unlikely(!a)) + goto out; + + root = sb->s_root; + err = 0; + bindex = 0; + opt = opts->opt; + opt_tail = opt + opts->max_opt - 1; + opt->type = Opt_tail; + while (!err && (opt_str = strsep(&str, ",")) && *opt_str) { + err = -EINVAL; + skipped = 0; + token = match_token(opt_str, options, a->args); + switch (token) { + case Opt_br: + err = 0; + while (!err && (opt_str = strsep(&a->args[0].from, ":")) + && *opt_str) { + err = opt_add(opt, opt_str, opts->sb_flags, + bindex++); + if (unlikely(!err && ++opt > opt_tail)) { + err = -E2BIG; + break; + } + opt->type = Opt_tail; + skipped = 1; + } + break; + case Opt_add: + if (unlikely(match_int(&a->args[0], &n))) { + pr_err("bad integer in %s\n", opt_str); + break; + } + bindex = n; + err = opt_add(opt, a->args[1].from, opts->sb_flags, + bindex); + if (!err) + opt->type = token; + break; + case Opt_append: + err = opt_add(opt, a->args[0].from, opts->sb_flags, + /*dummy bindex*/1); + if (!err) + opt->type = token; + break; + case Opt_prepend: + err = opt_add(opt, a->args[0].from, opts->sb_flags, + /*bindex*/0); + if (!err) + opt->type = token; + break; + case Opt_del: + err = au_opts_parse_del(&opt->del, a->args); + if (!err) + opt->type = token; + break; +#if 0 /* reserved for future use */ + case Opt_idel: + del->pathname = "(indexed)"; + if (unlikely(match_int(&args[0], &n))) { + pr_err("bad integer in %s\n", opt_str); + break; + } + err = au_opts_parse_idel(sb, n, &opt->del, a->args); + if (!err) + opt->type = token; + break; +#endif + case Opt_mod: + err = au_opts_parse_mod(&opt->mod, a->args); + if (!err) + opt->type = token; + break; +#ifdef IMOD /* reserved for future use */ + case Opt_imod: + u.mod->path = "(indexed)"; + if (unlikely(match_int(&a->args[0], &n))) { + pr_err("bad integer in %s\n", opt_str); + break; + } + err = au_opts_parse_imod(sb, n, &opt->mod, a->args); + if (!err) + opt->type = token; + break; +#endif + case Opt_xino: + err = au_opts_parse_xino(sb, &opt->xino, a->args); + if (!err) + opt->type = token; + break; + + case Opt_trunc_xino_path: + err = au_opts_parse_xino_itrunc_path + (sb, &opt->xino_itrunc, a->args); + if (!err) + opt->type = token; + break; + + case Opt_itrunc_xino: + u.xino_itrunc = &opt->xino_itrunc; + if (unlikely(match_int(&a->args[0], &n))) { + pr_err("bad integer in %s\n", opt_str); + break; + } + u.xino_itrunc->bindex = n; + aufs_read_lock(root, AuLock_FLUSH); + if (n < 0 || au_sbend(sb) < n) { + pr_err("out of bounds, %d\n", n); + aufs_read_unlock(root, !AuLock_IR); + break; + } + aufs_read_unlock(root, !AuLock_IR); + err = 0; + opt->type = token; + break; + + case Opt_dirwh: + if (unlikely(match_int(&a->args[0], &opt->dirwh))) + break; + err = 0; + opt->type = token; + break; + + case Opt_rdcache: + if (unlikely(match_int(&a->args[0], &n))) { + pr_err("bad integer in %s\n", opt_str); + break; + } + if (unlikely(n > AUFS_RDCACHE_MAX)) { + pr_err("rdcache must be smaller than %d\n", + AUFS_RDCACHE_MAX); + break; + } + opt->rdcache = n; + err = 0; + opt->type = token; + break; + case Opt_rdblk: + if (unlikely(match_int(&a->args[0], &n) + || n < 0 + || n > KMALLOC_MAX_SIZE)) { + pr_err("bad integer in %s\n", opt_str); + break; + } + if (unlikely(n && n < NAME_MAX)) { + pr_err("rdblk must be larger than %d\n", + NAME_MAX); + break; + } + opt->rdblk = n; + err = 0; + opt->type = token; + break; + case Opt_rdhash: + if (unlikely(match_int(&a->args[0], &n) + || n < 0 + || n * sizeof(struct hlist_head) + > KMALLOC_MAX_SIZE)) { + pr_err("bad integer in %s\n", opt_str); + break; + } + opt->rdhash = n; + err = 0; + opt->type = token; + break; + + case Opt_trunc_xino: + case Opt_notrunc_xino: + case Opt_noxino: + case Opt_trunc_xib: + case Opt_notrunc_xib: + case Opt_shwh: + case Opt_noshwh: + case Opt_plink: + case Opt_noplink: + case Opt_list_plink: + case Opt_dio: + case Opt_nodio: + case Opt_diropq_a: + case Opt_diropq_w: + case Opt_warn_perm: + case Opt_nowarn_perm: + case Opt_refrof: + case Opt_norefrof: + case Opt_verbose: + case Opt_noverbose: + case Opt_sum: + case Opt_nosum: + case Opt_wsum: + case Opt_rdblk_def: + case Opt_rdhash_def: + err = 0; + opt->type = token; + break; + + case Opt_udba: + opt->udba = udba_val(a->args[0].from); + if (opt->udba >= 0) { + err = 0; + opt->type = token; + } else + pr_err("wrong value, %s\n", opt_str); + break; + + case Opt_wbr_create: + u.create = &opt->wbr_create; + u.create->wbr_create + = au_wbr_create_val(a->args[0].from, u.create); + if (u.create->wbr_create >= 0) { + err = 0; + opt->type = token; + } else + pr_err("wrong value, %s\n", opt_str); + break; + case Opt_wbr_copyup: + opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from); + if (opt->wbr_copyup >= 0) { + err = 0; + opt->type = token; + } else + pr_err("wrong value, %s\n", opt_str); + break; + + case Opt_ignore: + pr_warning("ignored %s\n", opt_str); + /*FALLTHROUGH*/ + case Opt_ignore_silent: + skipped = 1; + err = 0; + break; + case Opt_err: + pr_err("unknown option %s\n", opt_str); + break; + } + + if (!err && !skipped) { + if (unlikely(++opt > opt_tail)) { + err = -E2BIG; + opt--; + opt->type = Opt_tail; + break; + } + opt->type = Opt_tail; + } + } + + kfree(a); + dump_opts(opts); + if (unlikely(err)) + au_opts_free(opts); + +out: + return err; +} + +static int au_opt_wbr_create(struct super_block *sb, + struct au_opt_wbr_create *create) +{ + int err; + struct au_sbinfo *sbinfo; + + SiMustWriteLock(sb); + + err = 1; /* handled */ + sbinfo = au_sbi(sb); + if (sbinfo->si_wbr_create_ops->fin) { + err = sbinfo->si_wbr_create_ops->fin(sb); + if (!err) + err = 1; + } + + sbinfo->si_wbr_create = create->wbr_create; + sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create; + switch (create->wbr_create) { + case AuWbrCreate_MFSRRV: + case AuWbrCreate_MFSRR: + sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark; + /*FALLTHROUGH*/ + case AuWbrCreate_MFS: + case AuWbrCreate_MFSV: + case AuWbrCreate_PMFS: + case AuWbrCreate_PMFSV: + sbinfo->si_wbr_mfs.mfs_expire + = msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC); + break; + } + + if (sbinfo->si_wbr_create_ops->init) + sbinfo->si_wbr_create_ops->init(sb); /* ignore */ + + return err; +} + +/* + * returns, + * plus: processed without an error + * zero: unprocessed + */ +static int au_opt_simple(struct super_block *sb, struct au_opt *opt, + struct au_opts *opts) +{ + int err; + struct au_sbinfo *sbinfo; + + SiMustWriteLock(sb); + + err = 1; /* handled */ + sbinfo = au_sbi(sb); + switch (opt->type) { + case Opt_udba: + sbinfo->si_mntflags &= ~AuOptMask_UDBA; + sbinfo->si_mntflags |= opt->udba; + opts->given_udba |= opt->udba; + break; + + case Opt_plink: + au_opt_set(sbinfo->si_mntflags, PLINK); + break; + case Opt_noplink: + if (au_opt_test(sbinfo->si_mntflags, PLINK)) + au_plink_put(sb, /*verbose*/1); + au_opt_clr(sbinfo->si_mntflags, PLINK); + break; + case Opt_list_plink: + if (au_opt_test(sbinfo->si_mntflags, PLINK)) + au_plink_list(sb); + break; + + case Opt_dio: + au_opt_set(sbinfo->si_mntflags, DIO); + au_fset_opts(opts->flags, REFRESH_DYAOP); + break; + case Opt_nodio: + au_opt_clr(sbinfo->si_mntflags, DIO); + au_fset_opts(opts->flags, REFRESH_DYAOP); + break; + + case Opt_diropq_a: + au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ); + break; + case Opt_diropq_w: + au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ); + break; + + case Opt_warn_perm: + au_opt_set(sbinfo->si_mntflags, WARN_PERM); + break; + case Opt_nowarn_perm: + au_opt_clr(sbinfo->si_mntflags, WARN_PERM); + break; + + case Opt_refrof: + au_opt_set(sbinfo->si_mntflags, REFROF); + break; + case Opt_norefrof: + au_opt_clr(sbinfo->si_mntflags, REFROF); + break; + + case Opt_verbose: + au_opt_set(sbinfo->si_mntflags, VERBOSE); + break; + case Opt_noverbose: + au_opt_clr(sbinfo->si_mntflags, VERBOSE); + break; + + case Opt_sum: + au_opt_set(sbinfo->si_mntflags, SUM); + break; + case Opt_wsum: + au_opt_clr(sbinfo->si_mntflags, SUM); + au_opt_set(sbinfo->si_mntflags, SUM_W); + case Opt_nosum: + au_opt_clr(sbinfo->si_mntflags, SUM); + au_opt_clr(sbinfo->si_mntflags, SUM_W); + break; + + case Opt_wbr_create: + err = au_opt_wbr_create(sb, &opt->wbr_create); + break; + case Opt_wbr_copyup: + sbinfo->si_wbr_copyup = opt->wbr_copyup; + sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup; + break; + + case Opt_dirwh: + sbinfo->si_dirwh = opt->dirwh; + break; + + case Opt_rdcache: + sbinfo->si_rdcache + = msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC); + break; + case Opt_rdblk: + sbinfo->si_rdblk = opt->rdblk; + break; + case Opt_rdblk_def: + sbinfo->si_rdblk = AUFS_RDBLK_DEF; + break; + case Opt_rdhash: + sbinfo->si_rdhash = opt->rdhash; + break; + case Opt_rdhash_def: + sbinfo->si_rdhash = AUFS_RDHASH_DEF; + break; + + case Opt_shwh: + au_opt_set(sbinfo->si_mntflags, SHWH); + break; + case Opt_noshwh: + au_opt_clr(sbinfo->si_mntflags, SHWH); + break; + + case Opt_trunc_xino: + au_opt_set(sbinfo->si_mntflags, TRUNC_XINO); + break; + case Opt_notrunc_xino: + au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO); + break; + + case Opt_trunc_xino_path: + case Opt_itrunc_xino: + err = au_xino_trunc(sb, opt->xino_itrunc.bindex); + if (!err) + err = 1; + break; + + case Opt_trunc_xib: + au_fset_opts(opts->flags, TRUNC_XIB); + break; + case Opt_notrunc_xib: + au_fclr_opts(opts->flags, TRUNC_XIB); + break; + + default: + err = 0; + break; + } + + return err; +} + +/* + * returns tri-state. + * plus: processed without an error + * zero: unprocessed + * minus: error + */ +static int au_opt_br(struct super_block *sb, struct au_opt *opt, + struct au_opts *opts) +{ + int err, do_refresh; + + err = 0; + switch (opt->type) { + case Opt_append: + opt->add.bindex = au_sbend(sb) + 1; + if (opt->add.bindex < 0) + opt->add.bindex = 0; + goto add; + case Opt_prepend: + opt->add.bindex = 0; + add: + case Opt_add: + err = au_br_add(sb, &opt->add, + au_ftest_opts(opts->flags, REMOUNT)); + if (!err) { + err = 1; + au_fset_opts(opts->flags, REFRESH); + } + break; + + case Opt_del: + case Opt_idel: + err = au_br_del(sb, &opt->del, + au_ftest_opts(opts->flags, REMOUNT)); + if (!err) { + err = 1; + au_fset_opts(opts->flags, TRUNC_XIB); + au_fset_opts(opts->flags, REFRESH); + } + break; + + case Opt_mod: + case Opt_imod: + err = au_br_mod(sb, &opt->mod, + au_ftest_opts(opts->flags, REMOUNT), + &do_refresh); + if (!err) { + err = 1; + if (do_refresh) + au_fset_opts(opts->flags, REFRESH); + } + break; + } + + return err; +} + +static int au_opt_xino(struct super_block *sb, struct au_opt *opt, + struct au_opt_xino **opt_xino, + struct au_opts *opts) +{ + int err; + aufs_bindex_t bend, bindex; + struct dentry *root, *parent, *h_root; + + err = 0; + switch (opt->type) { + case Opt_xino: + err = au_xino_set(sb, &opt->xino, + !!au_ftest_opts(opts->flags, REMOUNT)); + if (unlikely(err)) + break; + + *opt_xino = &opt->xino; + au_xino_brid_set(sb, -1); + + /* safe d_parent access */ + parent = opt->xino.file->f_dentry->d_parent; + root = sb->s_root; + bend = au_sbend(sb); + for (bindex = 0; bindex <= bend; bindex++) { + h_root = au_h_dptr(root, bindex); + if (h_root == parent) { + au_xino_brid_set(sb, au_sbr_id(sb, bindex)); + break; + } + } + break; + + case Opt_noxino: + au_xino_clr(sb); + au_xino_brid_set(sb, -1); + *opt_xino = (void *)-1; + break; + } + + return err; +} + +int au_opts_verify(struct super_block *sb, unsigned long sb_flags, + unsigned int pending) +{ + int err; + aufs_bindex_t bindex, bend; + unsigned char do_plink, skip, do_free; + struct au_branch *br; + struct au_wbr *wbr; + struct dentry *root; + struct inode *dir, *h_dir; + struct au_sbinfo *sbinfo; + struct au_hinode *hdir; + + SiMustAnyLock(sb); + + sbinfo = au_sbi(sb); + AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA)); + + if (!(sb_flags & MS_RDONLY)) { + if (unlikely(!au_br_writable(au_sbr_perm(sb, 0)))) + pr_warning("first branch should be rw\n"); + if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH))) + pr_warning("shwh should be used with ro\n"); + } + + if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY) + && !au_opt_test(sbinfo->si_mntflags, XINO)) + pr_warning("udba=*notify requires xino\n"); + + err = 0; + root = sb->s_root; + dir = root->d_inode; + do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK); + bend = au_sbend(sb); + for (bindex = 0; !err && bindex <= bend; bindex++) { + skip = 0; + h_dir = au_h_iptr(dir, bindex); + br = au_sbr(sb, bindex); + do_free = 0; + + wbr = br->br_wbr; + if (wbr) + wbr_wh_read_lock(wbr); + + if (!au_br_writable(br->br_perm)) { + do_free = !!wbr; + skip = (!wbr + || (!wbr->wbr_whbase + && !wbr->wbr_plink + && !wbr->wbr_orph)); + } else if (!au_br_wh_linkable(br->br_perm)) { + /* skip = (!br->br_whbase && !br->br_orph); */ + skip = (!wbr || !wbr->wbr_whbase); + if (skip && wbr) { + if (do_plink) + skip = !!wbr->wbr_plink; + else + skip = !wbr->wbr_plink; + } + } else { + /* skip = (br->br_whbase && br->br_ohph); */ + skip = (wbr && wbr->wbr_whbase); + if (skip) { + if (do_plink) + skip = !!wbr->wbr_plink; + else + skip = !wbr->wbr_plink; + } + } + if (wbr) + wbr_wh_read_unlock(wbr); + + if (skip) + continue; + + hdir = au_hi(dir, bindex); + au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); + if (wbr) + wbr_wh_write_lock(wbr); + err = au_wh_init(au_h_dptr(root, bindex), br, sb); + if (wbr) + wbr_wh_write_unlock(wbr); + au_hn_imtx_unlock(hdir); + + if (!err && do_free) { + kfree(wbr); + br->br_wbr = NULL; + } + } + + return err; +} + +int au_opts_mount(struct super_block *sb, struct au_opts *opts) +{ + int err; + unsigned int tmp; + aufs_bindex_t bindex, bend; + struct au_opt *opt; + struct au_opt_xino *opt_xino, xino; + struct au_sbinfo *sbinfo; + struct au_branch *br; + + SiMustWriteLock(sb); + + err = 0; + opt_xino = NULL; + opt = opts->opt; + while (err >= 0 && opt->type != Opt_tail) + err = au_opt_simple(sb, opt++, opts); + if (err > 0) + err = 0; + else if (unlikely(err < 0)) + goto out; + + /* disable xino and udba temporary */ + sbinfo = au_sbi(sb); + tmp = sbinfo->si_mntflags; + au_opt_clr(sbinfo->si_mntflags, XINO); + au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL); + + opt = opts->opt; + while (err >= 0 && opt->type != Opt_tail) + err = au_opt_br(sb, opt++, opts); + if (err > 0) + err = 0; + else if (unlikely(err < 0)) + goto out; + + bend = au_sbend(sb); + if (unlikely(bend < 0)) { + err = -EINVAL; + pr_err("no branches\n"); + goto out; + } + + if (au_opt_test(tmp, XINO)) + au_opt_set(sbinfo->si_mntflags, XINO); + opt = opts->opt; + while (!err && opt->type != Opt_tail) + err = au_opt_xino(sb, opt++, &opt_xino, opts); + if (unlikely(err)) + goto out; + + err = au_opts_verify(sb, sb->s_flags, tmp); + if (unlikely(err)) + goto out; + + /* restore xino */ + if (au_opt_test(tmp, XINO) && !opt_xino) { + xino.file = au_xino_def(sb); + err = PTR_ERR(xino.file); + if (IS_ERR(xino.file)) + goto out; + + err = au_xino_set(sb, &xino, /*remount*/0); + fput(xino.file); + if (unlikely(err)) + goto out; + } + + /* restore udba */ + tmp &= AuOptMask_UDBA; + sbinfo->si_mntflags &= ~AuOptMask_UDBA; + sbinfo->si_mntflags |= tmp; + bend = au_sbend(sb); + for (bindex = 0; bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + err = au_hnotify_reset_br(tmp, br, br->br_perm); + if (unlikely(err)) + AuIOErr("hnotify failed on br %d, %d, ignored\n", + bindex, err); + /* go on even if err */ + } + if (au_opt_test(tmp, UDBA_HNOTIFY)) { + struct inode *dir = sb->s_root->d_inode; + au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO); + } + +out: + return err; +} + +int au_opts_remount(struct super_block *sb, struct au_opts *opts) +{ + int err, rerr; + struct inode *dir; + struct au_opt_xino *opt_xino; + struct au_opt *opt; + struct au_sbinfo *sbinfo; + + SiMustWriteLock(sb); + + dir = sb->s_root->d_inode; + sbinfo = au_sbi(sb); + err = 0; + opt_xino = NULL; + opt = opts->opt; + while (err >= 0 && opt->type != Opt_tail) { + err = au_opt_simple(sb, opt, opts); + if (!err) + err = au_opt_br(sb, opt, opts); + if (!err) + err = au_opt_xino(sb, opt, &opt_xino, opts); + opt++; + } + if (err > 0) + err = 0; + AuTraceErr(err); + /* go on even err */ + + rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0); + if (unlikely(rerr && !err)) + err = rerr; + + if (au_ftest_opts(opts->flags, TRUNC_XIB)) { + rerr = au_xib_trunc(sb); + if (unlikely(rerr && !err)) + err = rerr; + } + + /* will be handled by the caller */ + if (!au_ftest_opts(opts->flags, REFRESH) + && (opts->given_udba || au_opt_test(sbinfo->si_mntflags, XINO))) + au_fset_opts(opts->flags, REFRESH); + + AuDbg("status 0x%x\n", opts->flags); + return err; +} + +/* ---------------------------------------------------------------------- */ + +unsigned int au_opt_udba(struct super_block *sb) +{ + return au_mntflags(sb) & AuOptMask_UDBA; +} diff --git a/ubuntu/aufs/opts.h b/ubuntu/aufs/opts.h new file mode 100644 index 000000000000..1ef371020d64 --- /dev/null +++ b/ubuntu/aufs/opts.h @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * mount options/flags + */ + +#ifndef __AUFS_OPTS_H__ +#define __AUFS_OPTS_H__ + +#ifdef __KERNEL__ + +#include +#include + +struct file; +struct super_block; + +/* ---------------------------------------------------------------------- */ + +/* mount flags */ +#define AuOpt_XINO 1 /* external inode number bitmap + and translation table */ +#define AuOpt_TRUNC_XINO (1 << 1) /* truncate xino files */ +#define AuOpt_UDBA_NONE (1 << 2) /* users direct branch access */ +#define AuOpt_UDBA_REVAL (1 << 3) +#define AuOpt_UDBA_HNOTIFY (1 << 4) +#define AuOpt_SHWH (1 << 5) /* show whiteout */ +#define AuOpt_PLINK (1 << 6) /* pseudo-link */ +#define AuOpt_DIRPERM1 (1 << 7) /* unimplemented */ +#define AuOpt_REFROF (1 << 8) /* unimplemented */ +#define AuOpt_ALWAYS_DIROPQ (1 << 9) /* policy to creating diropq */ +#define AuOpt_SUM (1 << 10) /* summation for statfs(2) */ +#define AuOpt_SUM_W (1 << 11) /* unimplemented */ +#define AuOpt_WARN_PERM (1 << 12) /* warn when add-branch */ +#define AuOpt_VERBOSE (1 << 13) /* busy inode when del-branch */ +#define AuOpt_DIO (1 << 14) /* direct io */ + +#ifndef CONFIG_AUFS_HNOTIFY +#undef AuOpt_UDBA_HNOTIFY +#define AuOpt_UDBA_HNOTIFY 0 +#endif +#ifndef CONFIG_AUFS_SHWH +#undef AuOpt_SHWH +#define AuOpt_SHWH 0 +#endif + +#define AuOpt_Def (AuOpt_XINO \ + | AuOpt_UDBA_REVAL \ + | AuOpt_PLINK \ + /* | AuOpt_DIRPERM1 */ \ + | AuOpt_WARN_PERM) +#define AuOptMask_UDBA (AuOpt_UDBA_NONE \ + | AuOpt_UDBA_REVAL \ + | AuOpt_UDBA_HNOTIFY) + +#define au_opt_test(flags, name) (flags & AuOpt_##name) +#define au_opt_set(flags, name) do { \ + BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \ + ((flags) |= AuOpt_##name); \ +} while (0) +#define au_opt_set_udba(flags, name) do { \ + (flags) &= ~AuOptMask_UDBA; \ + ((flags) |= AuOpt_##name); \ +} while (0) +#define au_opt_clr(flags, name) do { \ + ((flags) &= ~AuOpt_##name); \ +} while (0) + +static inline unsigned int au_opts_plink(unsigned int mntflags) +{ +#ifdef CONFIG_PROC_FS + return mntflags; +#else + return mntflags & ~AuOpt_PLINK; +#endif +} + +/* ---------------------------------------------------------------------- */ + +/* policies to select one among multiple writable branches */ +enum { + AuWbrCreate_TDP, /* top down parent */ + AuWbrCreate_RR, /* round robin */ + AuWbrCreate_MFS, /* most free space */ + AuWbrCreate_MFSV, /* mfs with seconds */ + AuWbrCreate_MFSRR, /* mfs then rr */ + AuWbrCreate_MFSRRV, /* mfs then rr with seconds */ + AuWbrCreate_PMFS, /* parent and mfs */ + AuWbrCreate_PMFSV, /* parent and mfs with seconds */ + + AuWbrCreate_Def = AuWbrCreate_TDP +}; + +enum { + AuWbrCopyup_TDP, /* top down parent */ + AuWbrCopyup_BUP, /* bottom up parent */ + AuWbrCopyup_BU, /* bottom up */ + + AuWbrCopyup_Def = AuWbrCopyup_TDP +}; + +/* ---------------------------------------------------------------------- */ + +struct au_opt_add { + aufs_bindex_t bindex; + char *pathname; + int perm; + struct path path; +}; + +struct au_opt_del { + char *pathname; + struct path h_path; +}; + +struct au_opt_mod { + char *path; + int perm; + struct dentry *h_root; +}; + +struct au_opt_xino { + char *path; + struct file *file; +}; + +struct au_opt_xino_itrunc { + aufs_bindex_t bindex; +}; + +struct au_opt_wbr_create { + int wbr_create; + int mfs_second; + unsigned long long mfsrr_watermark; +}; + +struct au_opt { + int type; + union { + struct au_opt_xino xino; + struct au_opt_xino_itrunc xino_itrunc; + struct au_opt_add add; + struct au_opt_del del; + struct au_opt_mod mod; + int dirwh; + int rdcache; + unsigned int rdblk; + unsigned int rdhash; + int udba; + struct au_opt_wbr_create wbr_create; + int wbr_copyup; + }; +}; + +/* opts flags */ +#define AuOpts_REMOUNT 1 +#define AuOpts_REFRESH (1 << 1) +#define AuOpts_TRUNC_XIB (1 << 2) +#define AuOpts_REFRESH_DYAOP (1 << 3) +#define au_ftest_opts(flags, name) ((flags) & AuOpts_##name) +#define au_fset_opts(flags, name) \ + do { (flags) |= AuOpts_##name; } while (0) +#define au_fclr_opts(flags, name) \ + do { (flags) &= ~AuOpts_##name; } while (0) + +struct au_opts { + struct au_opt *opt; + int max_opt; + + unsigned int given_udba; + unsigned int flags; + unsigned long sb_flags; +}; + +/* ---------------------------------------------------------------------- */ + +char *au_optstr_br_perm(int brperm); +const char *au_optstr_udba(int udba); +const char *au_optstr_wbr_copyup(int wbr_copyup); +const char *au_optstr_wbr_create(int wbr_create); + +void au_opts_free(struct au_opts *opts); +int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts); +int au_opts_verify(struct super_block *sb, unsigned long sb_flags, + unsigned int pending); +int au_opts_mount(struct super_block *sb, struct au_opts *opts); +int au_opts_remount(struct super_block *sb, struct au_opts *opts); + +unsigned int au_opt_udba(struct super_block *sb); + +/* ---------------------------------------------------------------------- */ + +#endif /* __KERNEL__ */ +#endif /* __AUFS_OPTS_H__ */ diff --git a/ubuntu/aufs/plink.c b/ubuntu/aufs/plink.c new file mode 100644 index 000000000000..b1888f30c719 --- /dev/null +++ b/ubuntu/aufs/plink.c @@ -0,0 +1,515 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * pseudo-link + */ + +#include "aufs.h" + +/* + * the pseudo-link maintenance mode. + * during a user process maintains the pseudo-links, + * prohibit adding a new plink and branch manipulation. + * + * Flags + * NOPLM: + * For entry functions which will handle plink, and i_mutex is already held + * in VFS. + * They cannot wait and should return an error at once. + * Callers has to check the error. + * NOPLMW: + * For entry functions which will handle plink, but i_mutex is not held + * in VFS. + * They can wait the plink maintenance mode to finish. + * + * They behave like F_SETLK and F_SETLKW. + * If the caller never handle plink, then both flags are unnecessary. + */ + +int au_plink_maint(struct super_block *sb, int flags) +{ + int err; + pid_t pid, ppid; + struct au_sbinfo *sbi; + + SiMustAnyLock(sb); + + err = 0; + if (!au_opt_test(au_mntflags(sb), PLINK)) + goto out; + + sbi = au_sbi(sb); + pid = sbi->si_plink_maint_pid; + if (!pid || pid == current->pid) + goto out; + + /* todo: it highly depends upon /sbin/mount.aufs */ + rcu_read_lock(); + ppid = task_pid_vnr(rcu_dereference(current->real_parent)); + rcu_read_unlock(); + if (pid == ppid) + goto out; + + if (au_ftest_lock(flags, NOPLMW)) { + /* if there is no i_mutex lock in VFS, we don't need to wait */ + /* AuDebugOn(!lockdep_depth(current)); */ + while (sbi->si_plink_maint_pid) { + si_read_unlock(sb); + /* gave up wake_up_bit() */ + wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid); + + if (au_ftest_lock(flags, FLUSH)) + au_nwt_flush(&sbi->si_nowait); + si_noflush_read_lock(sb); + } + } else if (au_ftest_lock(flags, NOPLM)) { + AuDbg("ppid %d, pid %d\n", ppid, pid); + err = -EAGAIN; + } + +out: + return err; +} + +void au_plink_maint_leave(struct au_sbinfo *sbinfo) +{ + spin_lock(&sbinfo->si_plink_maint_lock); + sbinfo->si_plink_maint_pid = 0; + spin_unlock(&sbinfo->si_plink_maint_lock); + wake_up_all(&sbinfo->si_plink_wq); +} + +int au_plink_maint_enter(struct super_block *sb) +{ + int err; + struct au_sbinfo *sbinfo; + + err = 0; + sbinfo = au_sbi(sb); + /* make sure i am the only one in this fs */ + si_write_lock(sb, AuLock_FLUSH); + if (au_opt_test(au_mntflags(sb), PLINK)) { + spin_lock(&sbinfo->si_plink_maint_lock); + if (!sbinfo->si_plink_maint_pid) + sbinfo->si_plink_maint_pid = current->pid; + else + err = -EBUSY; + spin_unlock(&sbinfo->si_plink_maint_lock); + } + si_write_unlock(sb); + + return err; +} + +/* ---------------------------------------------------------------------- */ + +struct pseudo_link { + union { + struct list_head list; + struct rcu_head rcu; + }; + struct inode *inode; +}; + +#ifdef CONFIG_AUFS_DEBUG +void au_plink_list(struct super_block *sb) +{ + struct au_sbinfo *sbinfo; + struct list_head *plink_list; + struct pseudo_link *plink; + + SiMustAnyLock(sb); + + sbinfo = au_sbi(sb); + AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); + AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); + + plink_list = &sbinfo->si_plink.head; + rcu_read_lock(); + list_for_each_entry_rcu(plink, plink_list, list) + AuDbg("%lu\n", plink->inode->i_ino); + rcu_read_unlock(); +} +#endif + +/* is the inode pseudo-linked? */ +int au_plink_test(struct inode *inode) +{ + int found; + struct au_sbinfo *sbinfo; + struct list_head *plink_list; + struct pseudo_link *plink; + + sbinfo = au_sbi(inode->i_sb); + AuRwMustAnyLock(&sbinfo->si_rwsem); + AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK)); + AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); + + found = 0; + plink_list = &sbinfo->si_plink.head; + rcu_read_lock(); + list_for_each_entry_rcu(plink, plink_list, list) + if (plink->inode == inode) { + found = 1; + break; + } + rcu_read_unlock(); + return found; +} + +/* ---------------------------------------------------------------------- */ + +/* + * generate a name for plink. + * the file will be stored under AUFS_WH_PLINKDIR. + */ +/* 20 is max digits length of ulong 64 */ +#define PLINK_NAME_LEN ((20 + 1) * 2) + +static int plink_name(char *name, int len, struct inode *inode, + aufs_bindex_t bindex) +{ + int rlen; + struct inode *h_inode; + + h_inode = au_h_iptr(inode, bindex); + rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino); + return rlen; +} + +struct au_do_plink_lkup_args { + struct dentry **errp; + struct qstr *tgtname; + struct dentry *h_parent; + struct au_branch *br; +}; + +static struct dentry *au_do_plink_lkup(struct qstr *tgtname, + struct dentry *h_parent, + struct au_branch *br) +{ + struct dentry *h_dentry; + struct mutex *h_mtx; + + h_mtx = &h_parent->d_inode->i_mutex; + mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); + h_dentry = au_lkup_one(tgtname, h_parent, br, /*nd*/NULL); + mutex_unlock(h_mtx); + return h_dentry; +} + +static void au_call_do_plink_lkup(void *args) +{ + struct au_do_plink_lkup_args *a = args; + *a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br); +} + +/* lookup the plink-ed @inode under the branch at @bindex */ +struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex) +{ + struct dentry *h_dentry, *h_parent; + struct au_branch *br; + struct inode *h_dir; + int wkq_err; + char a[PLINK_NAME_LEN]; + struct qstr tgtname = { + .name = a + }; + + AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); + + br = au_sbr(inode->i_sb, bindex); + h_parent = br->br_wbr->wbr_plink; + h_dir = h_parent->d_inode; + tgtname.len = plink_name(a, sizeof(a), inode, bindex); + + if (current_fsuid()) { + struct au_do_plink_lkup_args args = { + .errp = &h_dentry, + .tgtname = &tgtname, + .h_parent = h_parent, + .br = br + }; + + wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args); + if (unlikely(wkq_err)) + h_dentry = ERR_PTR(wkq_err); + } else + h_dentry = au_do_plink_lkup(&tgtname, h_parent, br); + + return h_dentry; +} + +/* create a pseudo-link */ +static int do_whplink(struct qstr *tgt, struct dentry *h_parent, + struct dentry *h_dentry, struct au_branch *br) +{ + int err; + struct path h_path = { + .mnt = br->br_mnt + }; + struct inode *h_dir; + + h_dir = h_parent->d_inode; + mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2); +again: + h_path.dentry = au_lkup_one(tgt, h_parent, br, /*nd*/NULL); + err = PTR_ERR(h_path.dentry); + if (IS_ERR(h_path.dentry)) + goto out; + + err = 0; + /* wh.plink dir is not monitored */ + /* todo: is it really safe? */ + if (h_path.dentry->d_inode + && h_path.dentry->d_inode != h_dentry->d_inode) { + err = vfsub_unlink(h_dir, &h_path, /*force*/0); + dput(h_path.dentry); + h_path.dentry = NULL; + if (!err) + goto again; + } + if (!err && !h_path.dentry->d_inode) + err = vfsub_link(h_dentry, h_dir, &h_path); + dput(h_path.dentry); + +out: + mutex_unlock(&h_dir->i_mutex); + return err; +} + +struct do_whplink_args { + int *errp; + struct qstr *tgt; + struct dentry *h_parent; + struct dentry *h_dentry; + struct au_branch *br; +}; + +static void call_do_whplink(void *args) +{ + struct do_whplink_args *a = args; + *a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br); +} + +static int whplink(struct dentry *h_dentry, struct inode *inode, + aufs_bindex_t bindex, struct au_branch *br) +{ + int err, wkq_err; + struct au_wbr *wbr; + struct dentry *h_parent; + struct inode *h_dir; + char a[PLINK_NAME_LEN]; + struct qstr tgtname = { + .name = a + }; + + wbr = au_sbr(inode->i_sb, bindex)->br_wbr; + h_parent = wbr->wbr_plink; + h_dir = h_parent->d_inode; + tgtname.len = plink_name(a, sizeof(a), inode, bindex); + + /* always superio. */ + if (current_fsuid()) { + struct do_whplink_args args = { + .errp = &err, + .tgt = &tgtname, + .h_parent = h_parent, + .h_dentry = h_dentry, + .br = br + }; + wkq_err = au_wkq_wait(call_do_whplink, &args); + if (unlikely(wkq_err)) + err = wkq_err; + } else + err = do_whplink(&tgtname, h_parent, h_dentry, br); + + return err; +} + +/* free a single plink */ +static void do_put_plink(struct pseudo_link *plink, int do_del) +{ + if (do_del) + list_del(&plink->list); + iput(plink->inode); + kfree(plink); +} + +static void do_put_plink_rcu(struct rcu_head *rcu) +{ + struct pseudo_link *plink; + + plink = container_of(rcu, struct pseudo_link, rcu); + iput(plink->inode); + kfree(plink); +} + +/* + * create a new pseudo-link for @h_dentry on @bindex. + * the linked inode is held in aufs @inode. + */ +void au_plink_append(struct inode *inode, aufs_bindex_t bindex, + struct dentry *h_dentry) +{ + struct super_block *sb; + struct au_sbinfo *sbinfo; + struct list_head *plink_list; + struct pseudo_link *plink, *tmp; + int found, err, cnt; + + sb = inode->i_sb; + sbinfo = au_sbi(sb); + AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); + AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); + + cnt = 0; + found = 0; + plink_list = &sbinfo->si_plink.head; + rcu_read_lock(); + list_for_each_entry_rcu(plink, plink_list, list) { + cnt++; + if (plink->inode == inode) { + found = 1; + break; + } + } + rcu_read_unlock(); + if (found) + return; + + tmp = kmalloc(sizeof(*plink), GFP_NOFS); + if (tmp) + tmp->inode = au_igrab(inode); + else { + err = -ENOMEM; + goto out; + } + + spin_lock(&sbinfo->si_plink.spin); + list_for_each_entry(plink, plink_list, list) { + if (plink->inode == inode) { + found = 1; + break; + } + } + if (!found) + list_add_rcu(&tmp->list, plink_list); + spin_unlock(&sbinfo->si_plink.spin); + if (!found) { + cnt++; + WARN_ONCE(cnt > AUFS_PLINK_WARN, + "unexpectedly many pseudo links, %d\n", cnt); + err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex)); + } else { + do_put_plink(tmp, 0); + return; + } + +out: + if (unlikely(err)) { + pr_warning("err %d, damaged pseudo link.\n", err); + if (tmp) { + au_spl_del_rcu(&tmp->list, &sbinfo->si_plink); + call_rcu(&tmp->rcu, do_put_plink_rcu); + } + } +} + +/* free all plinks */ +void au_plink_put(struct super_block *sb, int verbose) +{ + struct au_sbinfo *sbinfo; + struct list_head *plink_list; + struct pseudo_link *plink, *tmp; + + SiMustWriteLock(sb); + + sbinfo = au_sbi(sb); + AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); + AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); + + plink_list = &sbinfo->si_plink.head; + /* no spin_lock since sbinfo is write-locked */ + WARN(verbose && !list_empty(plink_list), "pseudo-link is not flushed"); + list_for_each_entry_safe(plink, tmp, plink_list, list) + do_put_plink(plink, 0); + INIT_LIST_HEAD(plink_list); +} + +void au_plink_clean(struct super_block *sb, int verbose) +{ + struct dentry *root; + + root = sb->s_root; + aufs_write_lock(root); + if (au_opt_test(au_mntflags(sb), PLINK)) + au_plink_put(sb, verbose); + aufs_write_unlock(root); +} + +/* free the plinks on a branch specified by @br_id */ +void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id) +{ + struct au_sbinfo *sbinfo; + struct list_head *plink_list; + struct pseudo_link *plink, *tmp; + struct inode *inode; + aufs_bindex_t bstart, bend, bindex; + unsigned char do_put; + + SiMustWriteLock(sb); + + sbinfo = au_sbi(sb); + AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); + AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); + + plink_list = &sbinfo->si_plink.head; + /* no spin_lock since sbinfo is write-locked */ + list_for_each_entry_safe(plink, tmp, plink_list, list) { + do_put = 0; + inode = au_igrab(plink->inode); + ii_write_lock_child(inode); + bstart = au_ibstart(inode); + bend = au_ibend(inode); + if (bstart >= 0) { + for (bindex = bstart; bindex <= bend; bindex++) { + if (!au_h_iptr(inode, bindex) + || au_ii_br_id(inode, bindex) != br_id) + continue; + au_set_h_iptr(inode, bindex, NULL, 0); + do_put = 1; + break; + } + } else + do_put_plink(plink, 1); + + if (do_put) { + for (bindex = bstart; bindex <= bend; bindex++) + if (au_h_iptr(inode, bindex)) { + do_put = 0; + break; + } + if (do_put) + do_put_plink(plink, 1); + } + ii_write_unlock(inode); + iput(inode); + } +} diff --git a/ubuntu/aufs/poll.c b/ubuntu/aufs/poll.c new file mode 100644 index 000000000000..873f3bb5e6be --- /dev/null +++ b/ubuntu/aufs/poll.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * poll operation + * There is only one filesystem which implements ->poll operation, currently. + */ + +#include "aufs.h" + +unsigned int aufs_poll(struct file *file, poll_table *wait) +{ + unsigned int mask; + int err; + struct file *h_file; + struct dentry *dentry; + struct super_block *sb; + + /* We should pretend an error happened. */ + mask = POLLERR /* | POLLIN | POLLOUT */; + dentry = file->f_dentry; + sb = dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); + err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); + if (unlikely(err)) + goto out; + + /* it is not an error if h_file has no operation */ + mask = DEFAULT_POLLMASK; + h_file = au_hf_top(file); + if (h_file->f_op && h_file->f_op->poll) + mask = h_file->f_op->poll(h_file, wait); + + di_read_unlock(dentry, AuLock_IR); + fi_read_unlock(file); + +out: + si_read_unlock(sb); + AuTraceErr((int)mask); + return mask; +} diff --git a/ubuntu/aufs/procfs.c b/ubuntu/aufs/procfs.c new file mode 100644 index 000000000000..25463319bad7 --- /dev/null +++ b/ubuntu/aufs/procfs.c @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2010-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * procfs interfaces + */ + +#include +#include "aufs.h" + +static int au_procfs_plm_release(struct inode *inode, struct file *file) +{ + struct au_sbinfo *sbinfo; + + sbinfo = file->private_data; + if (sbinfo) { + au_plink_maint_leave(sbinfo); + kobject_put(&sbinfo->si_kobj); + } + + return 0; +} + +static void au_procfs_plm_write_clean(struct file *file) +{ + struct au_sbinfo *sbinfo; + + sbinfo = file->private_data; + if (sbinfo) + au_plink_clean(sbinfo->si_sb, /*verbose*/0); +} + +static int au_procfs_plm_write_si(struct file *file, unsigned long id) +{ + int err; + struct super_block *sb; + struct au_sbinfo *sbinfo; + + err = -EBUSY; + if (unlikely(file->private_data)) + goto out; + + sb = NULL; + /* don't use au_sbilist_lock() here */ + spin_lock(&au_sbilist.spin); + list_for_each_entry(sbinfo, &au_sbilist.head, si_list) + if (id == sysaufs_si_id(sbinfo)) { + kobject_get(&sbinfo->si_kobj); + sb = sbinfo->si_sb; + break; + } + spin_unlock(&au_sbilist.spin); + + err = -EINVAL; + if (unlikely(!sb)) + goto out; + + err = au_plink_maint_enter(sb); + if (!err) + /* keep kobject_get() */ + file->private_data = sbinfo; + else + kobject_put(&sbinfo->si_kobj); +out: + return err; +} + +/* + * Accept a valid "si=xxxx" only. + * Once it is accepted successfully, accept "clean" too. + */ +static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + ssize_t err; + unsigned long id; + /* last newline is allowed */ + char buf[3 + sizeof(unsigned long) * 2 + 1]; + + err = -EACCES; + if (unlikely(!capable(CAP_SYS_ADMIN))) + goto out; + + err = -EINVAL; + if (unlikely(count > sizeof(buf))) + goto out; + + err = copy_from_user(buf, ubuf, count); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + buf[count] = 0; + + err = -EINVAL; + if (!strcmp("clean", buf)) { + au_procfs_plm_write_clean(file); + goto out_success; + } else if (unlikely(strncmp("si=", buf, 3))) + goto out; + + err = kstrtoul(buf + 3, 16, &id); + if (unlikely(err)) + goto out; + + err = au_procfs_plm_write_si(file, id); + if (unlikely(err)) + goto out; + +out_success: + err = count; /* success */ +out: + return err; +} + +static const struct file_operations au_procfs_plm_fop = { + .write = au_procfs_plm_write, + .release = au_procfs_plm_release, + .owner = THIS_MODULE +}; + +/* ---------------------------------------------------------------------- */ + +static struct proc_dir_entry *au_procfs_dir; + +void au_procfs_fin(void) +{ + remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir); + remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); +} + +int __init au_procfs_init(void) +{ + int err; + struct proc_dir_entry *entry; + + err = -ENOMEM; + au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL); + if (unlikely(!au_procfs_dir)) + goto out; + + entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR, + au_procfs_dir, &au_procfs_plm_fop); + if (unlikely(!entry)) + goto out_dir; + + err = 0; + goto out; /* success */ + + +out_dir: + remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); +out: + return err; +} diff --git a/ubuntu/aufs/rdu.c b/ubuntu/aufs/rdu.c new file mode 100644 index 000000000000..4e0756dc312b --- /dev/null +++ b/ubuntu/aufs/rdu.c @@ -0,0 +1,385 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * readdir in userspace. + */ + +#include +#include +#include +#include +#include +#include "aufs.h" + +/* bits for struct aufs_rdu.flags */ +#define AuRdu_CALLED 1 +#define AuRdu_CONT (1 << 1) +#define AuRdu_FULL (1 << 2) +#define au_ftest_rdu(flags, name) ((flags) & AuRdu_##name) +#define au_fset_rdu(flags, name) \ + do { (flags) |= AuRdu_##name; } while (0) +#define au_fclr_rdu(flags, name) \ + do { (flags) &= ~AuRdu_##name; } while (0) + +struct au_rdu_arg { + struct aufs_rdu *rdu; + union au_rdu_ent_ul ent; + unsigned long end; + + struct super_block *sb; + int err; +}; + +static int au_rdu_fill(void *__arg, const char *name, int nlen, + loff_t offset, u64 h_ino, unsigned int d_type) +{ + int err, len; + struct au_rdu_arg *arg = __arg; + struct aufs_rdu *rdu = arg->rdu; + struct au_rdu_ent ent; + + err = 0; + arg->err = 0; + au_fset_rdu(rdu->cookie.flags, CALLED); + len = au_rdu_len(nlen); + if (arg->ent.ul + len < arg->end) { + ent.ino = h_ino; + ent.bindex = rdu->cookie.bindex; + ent.type = d_type; + ent.nlen = nlen; + if (unlikely(nlen > AUFS_MAX_NAMELEN)) + ent.type = DT_UNKNOWN; + + /* unnecessary to support mmap_sem since this is a dir */ + err = -EFAULT; + if (copy_to_user(arg->ent.e, &ent, sizeof(ent))) + goto out; + if (copy_to_user(arg->ent.e->name, name, nlen)) + goto out; + /* the terminating NULL */ + if (__put_user(0, arg->ent.e->name + nlen)) + goto out; + err = 0; + /* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */ + arg->ent.ul += len; + rdu->rent++; + } else { + err = -EFAULT; + au_fset_rdu(rdu->cookie.flags, FULL); + rdu->full = 1; + rdu->tail = arg->ent; + } + +out: + /* AuTraceErr(err); */ + return err; +} + +static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg) +{ + int err; + loff_t offset; + struct au_rdu_cookie *cookie = &arg->rdu->cookie; + + offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET); + err = offset; + if (unlikely(offset != cookie->h_pos)) + goto out; + + err = 0; + do { + arg->err = 0; + au_fclr_rdu(cookie->flags, CALLED); + /* smp_mb(); */ + err = vfsub_readdir(h_file, au_rdu_fill, arg); + if (err >= 0) + err = arg->err; + } while (!err + && au_ftest_rdu(cookie->flags, CALLED) + && !au_ftest_rdu(cookie->flags, FULL)); + cookie->h_pos = h_file->f_pos; + +out: + AuTraceErr(err); + return err; +} + +static int au_rdu(struct file *file, struct aufs_rdu *rdu) +{ + int err; + aufs_bindex_t bend; + struct au_rdu_arg arg; + struct dentry *dentry; + struct inode *inode; + struct file *h_file; + struct au_rdu_cookie *cookie = &rdu->cookie; + + err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz); + if (unlikely(err)) { + err = -EFAULT; + AuTraceErr(err); + goto out; + } + rdu->rent = 0; + rdu->tail = rdu->ent; + rdu->full = 0; + arg.rdu = rdu; + arg.ent = rdu->ent; + arg.end = arg.ent.ul; + arg.end += rdu->sz; + + err = -ENOTDIR; + if (unlikely(!file->f_op || !file->f_op->readdir)) + goto out; + + err = security_file_permission(file, MAY_READ); + AuTraceErr(err); + if (unlikely(err)) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; +#if 1 + mutex_lock(&inode->i_mutex); +#else + err = mutex_lock_killable(&inode->i_mutex); + AuTraceErr(err); + if (unlikely(err)) + goto out; +#endif + + arg.sb = inode->i_sb; + err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM); + if (unlikely(err)) + goto out_mtx; + err = au_alive_dir(dentry); + if (unlikely(err)) + goto out_si; + /* todo: reval? */ + fi_read_lock(file); + + err = -EAGAIN; + if (unlikely(au_ftest_rdu(cookie->flags, CONT) + && cookie->generation != au_figen(file))) + goto out_unlock; + + err = 0; + if (!rdu->blk) { + rdu->blk = au_sbi(arg.sb)->si_rdblk; + if (!rdu->blk) + rdu->blk = au_dir_size(file, /*dentry*/NULL); + } + bend = au_fbstart(file); + if (cookie->bindex < bend) + cookie->bindex = bend; + bend = au_fbend_dir(file); + /* AuDbg("b%d, b%d\n", cookie->bindex, bend); */ + for (; !err && cookie->bindex <= bend; + cookie->bindex++, cookie->h_pos = 0) { + h_file = au_hf_dir(file, cookie->bindex); + if (!h_file) + continue; + + au_fclr_rdu(cookie->flags, FULL); + err = au_rdu_do(h_file, &arg); + AuTraceErr(err); + if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err)) + break; + } + AuDbg("rent %llu\n", rdu->rent); + + if (!err && !au_ftest_rdu(cookie->flags, CONT)) { + rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH); + au_fset_rdu(cookie->flags, CONT); + cookie->generation = au_figen(file); + } + + ii_read_lock_child(inode); + fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode))); + ii_read_unlock(inode); + +out_unlock: + fi_read_unlock(file); +out_si: + si_read_unlock(arg.sb); +out_mtx: + mutex_unlock(&inode->i_mutex); +out: + AuTraceErr(err); + return err; +} + +static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu) +{ + int err; + ino_t ino; + unsigned long long nent; + union au_rdu_ent_ul *u; + struct au_rdu_ent ent; + struct super_block *sb; + + err = 0; + nent = rdu->nent; + u = &rdu->ent; + sb = file->f_dentry->d_sb; + si_read_lock(sb, AuLock_FLUSH); + while (nent-- > 0) { + /* unnecessary to support mmap_sem since this is a dir */ + err = copy_from_user(&ent, u->e, sizeof(ent)); + if (!err) + err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino)); + if (unlikely(err)) { + err = -EFAULT; + AuTraceErr(err); + break; + } + + /* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */ + if (!ent.wh) + err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino); + else + err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type, + &ino); + if (unlikely(err)) { + AuTraceErr(err); + break; + } + + err = __put_user(ino, &u->e->ino); + if (unlikely(err)) { + err = -EFAULT; + AuTraceErr(err); + break; + } + u->ul += au_rdu_len(ent.nlen); + } + si_read_unlock(sb); + + return err; +} + +/* ---------------------------------------------------------------------- */ + +static int au_rdu_verify(struct aufs_rdu *rdu) +{ + AuDbg("rdu{%llu, %p, %u | %u | %llu, %u, %u | " + "%llu, b%d, 0x%x, g%u}\n", + rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ], + rdu->blk, + rdu->rent, rdu->shwh, rdu->full, + rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags, + rdu->cookie.generation); + + if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu)) + return 0; + + AuDbg("%u:%u\n", + rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu)); + return -EINVAL; +} + +long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long err, e; + struct aufs_rdu rdu; + void __user *p = (void __user *)arg; + + err = copy_from_user(&rdu, p, sizeof(rdu)); + if (unlikely(err)) { + err = -EFAULT; + AuTraceErr(err); + goto out; + } + err = au_rdu_verify(&rdu); + if (unlikely(err)) + goto out; + + switch (cmd) { + case AUFS_CTL_RDU: + err = au_rdu(file, &rdu); + if (unlikely(err)) + break; + + e = copy_to_user(p, &rdu, sizeof(rdu)); + if (unlikely(e)) { + err = -EFAULT; + AuTraceErr(err); + } + break; + case AUFS_CTL_RDU_INO: + err = au_rdu_ino(file, &rdu); + break; + + default: + /* err = -ENOTTY; */ + err = -EINVAL; + } + +out: + AuTraceErr(err); + return err; +} + +#ifdef CONFIG_COMPAT +long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long err, e; + struct aufs_rdu rdu; + void __user *p = compat_ptr(arg); + + /* todo: get_user()? */ + err = copy_from_user(&rdu, p, sizeof(rdu)); + if (unlikely(err)) { + err = -EFAULT; + AuTraceErr(err); + goto out; + } + rdu.ent.e = compat_ptr(rdu.ent.ul); + err = au_rdu_verify(&rdu); + if (unlikely(err)) + goto out; + + switch (cmd) { + case AUFS_CTL_RDU: + err = au_rdu(file, &rdu); + if (unlikely(err)) + break; + + rdu.ent.ul = ptr_to_compat(rdu.ent.e); + rdu.tail.ul = ptr_to_compat(rdu.tail.e); + e = copy_to_user(p, &rdu, sizeof(rdu)); + if (unlikely(e)) { + err = -EFAULT; + AuTraceErr(err); + } + break; + case AUFS_CTL_RDU_INO: + err = au_rdu_ino(file, &rdu); + break; + + default: + /* err = -ENOTTY; */ + err = -EINVAL; + } + +out: + AuTraceErr(err); + return err; +} +#endif diff --git a/ubuntu/aufs/rwsem.h b/ubuntu/aufs/rwsem.h new file mode 100644 index 000000000000..dcd1def9eed8 --- /dev/null +++ b/ubuntu/aufs/rwsem.h @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * simple read-write semaphore wrappers + */ + +#ifndef __AUFS_RWSEM_H__ +#define __AUFS_RWSEM_H__ + +#ifdef __KERNEL__ + +#include +#include "debug.h" + +struct au_rwsem { + struct rw_semaphore rwsem; +#ifdef CONFIG_AUFS_DEBUG + /* just for debugging, not almighty counter */ + atomic_t rcnt, wcnt; +#endif +}; + +#ifdef CONFIG_AUFS_DEBUG +#define AuDbgCntInit(rw) do { \ + atomic_set(&(rw)->rcnt, 0); \ + atomic_set(&(rw)->wcnt, 0); \ + smp_mb(); /* atomic set */ \ +} while (0) + +#define AuDbgRcntInc(rw) atomic_inc(&(rw)->rcnt) +#define AuDbgRcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0) +#define AuDbgWcntInc(rw) atomic_inc(&(rw)->wcnt) +#define AuDbgWcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0) +#else +#define AuDbgCntInit(rw) do {} while (0) +#define AuDbgRcntInc(rw) do {} while (0) +#define AuDbgRcntDec(rw) do {} while (0) +#define AuDbgWcntInc(rw) do {} while (0) +#define AuDbgWcntDec(rw) do {} while (0) +#endif /* CONFIG_AUFS_DEBUG */ + +/* to debug easier, do not make them inlined functions */ +#define AuRwMustNoWaiters(rw) AuDebugOn(!list_empty(&(rw)->rwsem.wait_list)) +/* rwsem_is_locked() is unusable */ +#define AuRwMustReadLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0) +#define AuRwMustWriteLock(rw) AuDebugOn(atomic_read(&(rw)->wcnt) <= 0) +#define AuRwMustAnyLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \ + && atomic_read(&(rw)->wcnt) <= 0) +#define AuRwDestroy(rw) AuDebugOn(atomic_read(&(rw)->rcnt) \ + || atomic_read(&(rw)->wcnt)) + +#define au_rw_class(rw, key) lockdep_set_class(&(rw)->rwsem, key) + +static inline void au_rw_init(struct au_rwsem *rw) +{ + AuDbgCntInit(rw); + init_rwsem(&rw->rwsem); +} + +static inline void au_rw_init_wlock(struct au_rwsem *rw) +{ + au_rw_init(rw); + down_write(&rw->rwsem); + AuDbgWcntInc(rw); +} + +static inline void au_rw_init_wlock_nested(struct au_rwsem *rw, + unsigned int lsc) +{ + au_rw_init(rw); + down_write_nested(&rw->rwsem, lsc); + AuDbgWcntInc(rw); +} + +static inline void au_rw_read_lock(struct au_rwsem *rw) +{ + down_read(&rw->rwsem); + AuDbgRcntInc(rw); +} + +static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc) +{ + down_read_nested(&rw->rwsem, lsc); + AuDbgRcntInc(rw); +} + +static inline void au_rw_read_unlock(struct au_rwsem *rw) +{ + AuRwMustReadLock(rw); + AuDbgRcntDec(rw); + up_read(&rw->rwsem); +} + +static inline void au_rw_dgrade_lock(struct au_rwsem *rw) +{ + AuRwMustWriteLock(rw); + AuDbgRcntInc(rw); + AuDbgWcntDec(rw); + downgrade_write(&rw->rwsem); +} + +static inline void au_rw_write_lock(struct au_rwsem *rw) +{ + down_write(&rw->rwsem); + AuDbgWcntInc(rw); +} + +static inline void au_rw_write_lock_nested(struct au_rwsem *rw, + unsigned int lsc) +{ + down_write_nested(&rw->rwsem, lsc); + AuDbgWcntInc(rw); +} + +static inline void au_rw_write_unlock(struct au_rwsem *rw) +{ + AuRwMustWriteLock(rw); + AuDbgWcntDec(rw); + up_write(&rw->rwsem); +} + +/* why is not _nested version defined */ +static inline int au_rw_read_trylock(struct au_rwsem *rw) +{ + int ret = down_read_trylock(&rw->rwsem); + if (ret) + AuDbgRcntInc(rw); + return ret; +} + +static inline int au_rw_write_trylock(struct au_rwsem *rw) +{ + int ret = down_write_trylock(&rw->rwsem); + if (ret) + AuDbgWcntInc(rw); + return ret; +} + +#undef AuDbgCntInit +#undef AuDbgRcntInc +#undef AuDbgRcntDec +#undef AuDbgWcntInc +#undef AuDbgWcntDec + +#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ +static inline void prefix##_read_lock(param) \ +{ au_rw_read_lock(rwsem); } \ +static inline void prefix##_write_lock(param) \ +{ au_rw_write_lock(rwsem); } \ +static inline int prefix##_read_trylock(param) \ +{ return au_rw_read_trylock(rwsem); } \ +static inline int prefix##_write_trylock(param) \ +{ return au_rw_write_trylock(rwsem); } +/* why is not _nested version defined */ +/* static inline void prefix##_read_trylock_nested(param, lsc) +{ au_rw_read_trylock_nested(rwsem, lsc)); } +static inline void prefix##_write_trylock_nestd(param, lsc) +{ au_rw_write_trylock_nested(rwsem, lsc); } */ + +#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \ +static inline void prefix##_read_unlock(param) \ +{ au_rw_read_unlock(rwsem); } \ +static inline void prefix##_write_unlock(param) \ +{ au_rw_write_unlock(rwsem); } \ +static inline void prefix##_downgrade_lock(param) \ +{ au_rw_dgrade_lock(rwsem); } + +#define AuSimpleRwsemFuncs(prefix, param, rwsem) \ + AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ + AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) + +#endif /* __KERNEL__ */ +#endif /* __AUFS_RWSEM_H__ */ diff --git a/ubuntu/aufs/sbinfo.c b/ubuntu/aufs/sbinfo.c new file mode 100644 index 000000000000..d9141f8cc48c --- /dev/null +++ b/ubuntu/aufs/sbinfo.c @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * superblock private data + */ + +#include +#include "aufs.h" + +/* + * they are necessary regardless sysfs is disabled. + */ +void au_si_free(struct kobject *kobj) +{ + struct au_sbinfo *sbinfo; + char *locked __maybe_unused; /* debug only */ + + sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); + AuDebugOn(!list_empty(&sbinfo->si_plink.head)); + AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len)); + + au_rw_write_lock(&sbinfo->si_rwsem); + au_br_free(sbinfo); + au_rw_write_unlock(&sbinfo->si_rwsem); + + AuDebugOn(radix_tree_gang_lookup + (&sbinfo->au_si_pid.tree, (void **)&locked, + /*first_index*/PID_MAX_DEFAULT - 1, + /*max_items*/sizeof(locked)/sizeof(*locked))); + + kfree(sbinfo->si_branch); + kfree(sbinfo->au_si_pid.bitmap); + mutex_destroy(&sbinfo->si_xib_mtx); + AuRwDestroy(&sbinfo->si_rwsem); + + kfree(sbinfo); +} + +int au_si_alloc(struct super_block *sb) +{ + int err; + struct au_sbinfo *sbinfo; + static struct lock_class_key aufs_si; + + err = -ENOMEM; + sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS); + if (unlikely(!sbinfo)) + goto out; + + BUILD_BUG_ON(sizeof(unsigned long) != + sizeof(*sbinfo->au_si_pid.bitmap)); + sbinfo->au_si_pid.bitmap = kcalloc(BITS_TO_LONGS(PID_MAX_DEFAULT), + sizeof(*sbinfo->au_si_pid.bitmap), + GFP_NOFS); + if (unlikely(!sbinfo->au_si_pid.bitmap)) + goto out_sbinfo; + + /* will be reallocated separately */ + sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS); + if (unlikely(!sbinfo->si_branch)) + goto out_pidmap; + + err = sysaufs_si_init(sbinfo); + if (unlikely(err)) + goto out_br; + + au_nwt_init(&sbinfo->si_nowait); + au_rw_init_wlock(&sbinfo->si_rwsem); + au_rw_class(&sbinfo->si_rwsem, &aufs_si); + spin_lock_init(&sbinfo->au_si_pid.tree_lock); + INIT_RADIX_TREE(&sbinfo->au_si_pid.tree, GFP_ATOMIC | __GFP_NOFAIL); + + atomic_long_set(&sbinfo->si_ninodes, 0); + atomic_long_set(&sbinfo->si_nfiles, 0); + + sbinfo->si_bend = -1; + + sbinfo->si_wbr_copyup = AuWbrCopyup_Def; + sbinfo->si_wbr_create = AuWbrCreate_Def; + sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup; + sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create; + + sbinfo->si_mntflags = au_opts_plink(AuOpt_Def); + + mutex_init(&sbinfo->si_xib_mtx); + sbinfo->si_xino_brid = -1; + /* leave si_xib_last_pindex and si_xib_next_bit */ + + sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC); + sbinfo->si_rdblk = AUFS_RDBLK_DEF; + sbinfo->si_rdhash = AUFS_RDHASH_DEF; + sbinfo->si_dirwh = AUFS_DIRWH_DEF; + + au_spl_init(&sbinfo->si_plink); + init_waitqueue_head(&sbinfo->si_plink_wq); + spin_lock_init(&sbinfo->si_plink_maint_lock); + + /* leave other members for sysaufs and si_mnt. */ + sbinfo->si_sb = sb; + sb->s_fs_info = sbinfo; + si_pid_set(sb); + au_debug_sbinfo_init(sbinfo); + return 0; /* success */ + +out_br: + kfree(sbinfo->si_branch); +out_pidmap: + kfree(sbinfo->au_si_pid.bitmap); +out_sbinfo: + kfree(sbinfo); +out: + return err; +} + +int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr) +{ + int err, sz; + struct au_branch **brp; + + AuRwMustWriteLock(&sbinfo->si_rwsem); + + err = -ENOMEM; + sz = sizeof(*brp) * (sbinfo->si_bend + 1); + if (unlikely(!sz)) + sz = sizeof(*brp); + brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS); + if (brp) { + sbinfo->si_branch = brp; + err = 0; + } + + return err; +} + +/* ---------------------------------------------------------------------- */ + +unsigned int au_sigen_inc(struct super_block *sb) +{ + unsigned int gen; + + SiMustWriteLock(sb); + + gen = ++au_sbi(sb)->si_generation; + au_update_digen(sb->s_root); + au_update_iigen(sb->s_root->d_inode); + sb->s_root->d_inode->i_version++; + return gen; +} + +aufs_bindex_t au_new_br_id(struct super_block *sb) +{ + aufs_bindex_t br_id; + int i; + struct au_sbinfo *sbinfo; + + SiMustWriteLock(sb); + + sbinfo = au_sbi(sb); + for (i = 0; i <= AUFS_BRANCH_MAX; i++) { + br_id = ++sbinfo->si_last_br_id; + AuDebugOn(br_id < 0); + if (br_id && au_br_index(sb, br_id) < 0) + return br_id; + } + + return -1; +} + +/* ---------------------------------------------------------------------- */ + +/* it is ok that new 'nwt' tasks are appended while we are sleeping */ +int si_read_lock(struct super_block *sb, int flags) +{ + int err; + + err = 0; + if (au_ftest_lock(flags, FLUSH)) + au_nwt_flush(&au_sbi(sb)->si_nowait); + + si_noflush_read_lock(sb); + err = au_plink_maint(sb, flags); + if (unlikely(err)) + si_read_unlock(sb); + + return err; +} + +int si_write_lock(struct super_block *sb, int flags) +{ + int err; + + if (au_ftest_lock(flags, FLUSH)) + au_nwt_flush(&au_sbi(sb)->si_nowait); + + si_noflush_write_lock(sb); + err = au_plink_maint(sb, flags); + if (unlikely(err)) + si_write_unlock(sb); + + return err; +} + +/* dentry and super_block lock. call at entry point */ +int aufs_read_lock(struct dentry *dentry, int flags) +{ + int err; + struct super_block *sb; + + sb = dentry->d_sb; + err = si_read_lock(sb, flags); + if (unlikely(err)) + goto out; + + if (au_ftest_lock(flags, DW)) + di_write_lock_child(dentry); + else + di_read_lock_child(dentry, flags); + + if (au_ftest_lock(flags, GEN)) { + err = au_digen_test(dentry, au_sigen(sb)); + AuDebugOn(!err && au_dbrange_test(dentry)); + if (unlikely(err)) + aufs_read_unlock(dentry, flags); + } + +out: + return err; +} + +void aufs_read_unlock(struct dentry *dentry, int flags) +{ + if (au_ftest_lock(flags, DW)) + di_write_unlock(dentry); + else + di_read_unlock(dentry, flags); + si_read_unlock(dentry->d_sb); +} + +void aufs_write_lock(struct dentry *dentry) +{ + si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW); + di_write_lock_child(dentry); +} + +void aufs_write_unlock(struct dentry *dentry) +{ + di_write_unlock(dentry); + si_write_unlock(dentry->d_sb); +} + +int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags) +{ + int err; + unsigned int sigen; + struct super_block *sb; + + sb = d1->d_sb; + err = si_read_lock(sb, flags); + if (unlikely(err)) + goto out; + + di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIR)); + + if (au_ftest_lock(flags, GEN)) { + sigen = au_sigen(sb); + err = au_digen_test(d1, sigen); + AuDebugOn(!err && au_dbrange_test(d1)); + if (!err) { + err = au_digen_test(d2, sigen); + AuDebugOn(!err && au_dbrange_test(d2)); + } + if (unlikely(err)) + aufs_read_and_write_unlock2(d1, d2); + } + +out: + return err; +} + +void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2) +{ + di_write_unlock2(d1, d2); + si_read_unlock(d1->d_sb); +} + +/* ---------------------------------------------------------------------- */ + +int si_pid_test_slow(struct super_block *sb) +{ + void *p; + + rcu_read_lock(); + p = radix_tree_lookup(&au_sbi(sb)->au_si_pid.tree, current->pid); + rcu_read_unlock(); + + return (long)!!p; +} + +void si_pid_set_slow(struct super_block *sb) +{ + int err; + struct au_sbinfo *sbinfo; + + AuDebugOn(si_pid_test_slow(sb)); + + sbinfo = au_sbi(sb); + err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + AuDebugOn(err); + spin_lock(&sbinfo->au_si_pid.tree_lock); + err = radix_tree_insert(&sbinfo->au_si_pid.tree, current->pid, + /*any valid ptr*/sb); + spin_unlock(&sbinfo->au_si_pid.tree_lock); + AuDebugOn(err); + radix_tree_preload_end(); +} + +void si_pid_clr_slow(struct super_block *sb) +{ + void *p; + struct au_sbinfo *sbinfo; + + AuDebugOn(!si_pid_test_slow(sb)); + + sbinfo = au_sbi(sb); + spin_lock(&sbinfo->au_si_pid.tree_lock); + p = radix_tree_delete(&sbinfo->au_si_pid.tree, current->pid); + spin_unlock(&sbinfo->au_si_pid.tree_lock); +} diff --git a/ubuntu/aufs/spl.h b/ubuntu/aufs/spl.h new file mode 100644 index 000000000000..37a032907a32 --- /dev/null +++ b/ubuntu/aufs/spl.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * simple list protected by a spinlock + */ + +#ifndef __AUFS_SPL_H__ +#define __AUFS_SPL_H__ + +#ifdef __KERNEL__ + +#include +#include +#include + +struct au_splhead { + spinlock_t spin; + struct list_head head; +}; + +static inline void au_spl_init(struct au_splhead *spl) +{ + spin_lock_init(&spl->spin); + INIT_LIST_HEAD(&spl->head); +} + +static inline void au_spl_add(struct list_head *list, struct au_splhead *spl) +{ + spin_lock(&spl->spin); + list_add(list, &spl->head); + spin_unlock(&spl->spin); +} + +static inline void au_spl_del(struct list_head *list, struct au_splhead *spl) +{ + spin_lock(&spl->spin); + list_del(list); + spin_unlock(&spl->spin); +} + +static inline void au_spl_del_rcu(struct list_head *list, + struct au_splhead *spl) +{ + spin_lock(&spl->spin); + list_del_rcu(list); + spin_unlock(&spl->spin); +} + +#endif /* __KERNEL__ */ +#endif /* __AUFS_SPL_H__ */ diff --git a/ubuntu/aufs/super.c b/ubuntu/aufs/super.c new file mode 100644 index 000000000000..595282e8700c --- /dev/null +++ b/ubuntu/aufs/super.c @@ -0,0 +1,939 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * mount and super_block operations + */ + +#include +#include +#include +#include +#include +#include +#include +#include "aufs.h" + +/* + * super_operations + */ +static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused) +{ + struct au_icntnr *c; + + c = au_cache_alloc_icntnr(); + if (c) { + au_icntnr_init(c); + c->vfs_inode.i_version = 1; /* sigen(sb); */ + c->iinfo.ii_hinode = NULL; + return &c->vfs_inode; + } + return NULL; +} + +static void aufs_destroy_inode_cb(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + INIT_LIST_HEAD(&inode->i_dentry); + au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode)); +} + +static void aufs_destroy_inode(struct inode *inode) +{ + au_iinfo_fin(inode); + call_rcu(&inode->i_rcu, aufs_destroy_inode_cb); +} + +struct inode *au_iget_locked(struct super_block *sb, ino_t ino) +{ + struct inode *inode; + int err; + + inode = iget_locked(sb, ino); + if (unlikely(!inode)) { + inode = ERR_PTR(-ENOMEM); + goto out; + } + if (!(inode->i_state & I_NEW)) + goto out; + + err = au_xigen_new(inode); + if (!err) + err = au_iinfo_init(inode); + if (!err) + inode->i_version++; + else { + iget_failed(inode); + inode = ERR_PTR(err); + } + +out: + /* never return NULL */ + AuDebugOn(!inode); + AuTraceErrPtr(inode); + return inode; +} + +/* lock free root dinfo */ +static int au_show_brs(struct seq_file *seq, struct super_block *sb) +{ + int err; + aufs_bindex_t bindex, bend; + struct path path; + struct au_hdentry *hdp; + struct au_branch *br; + char *perm; + + err = 0; + bend = au_sbend(sb); + hdp = au_di(sb->s_root)->di_hdentry; + for (bindex = 0; !err && bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + path.mnt = br->br_mnt; + path.dentry = hdp[bindex].hd_dentry; + err = au_seq_path(seq, &path); + if (err > 0) { + perm = au_optstr_br_perm(br->br_perm); + if (perm) { + err = seq_printf(seq, "=%s", perm); + kfree(perm); + if (err == -1) + err = -E2BIG; + } else + err = -ENOMEM; + } + if (!err && bindex != bend) + err = seq_putc(seq, ':'); + } + + return err; +} + +static void au_show_wbr_create(struct seq_file *m, int v, + struct au_sbinfo *sbinfo) +{ + const char *pat; + + AuRwMustAnyLock(&sbinfo->si_rwsem); + + seq_printf(m, ",create="); + pat = au_optstr_wbr_create(v); + switch (v) { + case AuWbrCreate_TDP: + case AuWbrCreate_RR: + case AuWbrCreate_MFS: + case AuWbrCreate_PMFS: + seq_printf(m, pat); + break; + case AuWbrCreate_MFSV: + seq_printf(m, /*pat*/"mfs:%lu", + jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) + / MSEC_PER_SEC); + break; + case AuWbrCreate_PMFSV: + seq_printf(m, /*pat*/"pmfs:%lu", + jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) + / MSEC_PER_SEC); + break; + case AuWbrCreate_MFSRR: + seq_printf(m, /*pat*/"mfsrr:%llu", + sbinfo->si_wbr_mfs.mfsrr_watermark); + break; + case AuWbrCreate_MFSRRV: + seq_printf(m, /*pat*/"mfsrr:%llu:%lu", + sbinfo->si_wbr_mfs.mfsrr_watermark, + jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) + / MSEC_PER_SEC); + break; + } +} + +static int au_show_xino(struct seq_file *seq, struct vfsmount *mnt) +{ +#ifdef CONFIG_SYSFS + return 0; +#else + int err; + const int len = sizeof(AUFS_XINO_FNAME) - 1; + aufs_bindex_t bindex, brid; + struct super_block *sb; + struct qstr *name; + struct file *f; + struct dentry *d, *h_root; + struct au_hdentry *hdp; + + AuRwMustAnyLock(&sbinfo->si_rwsem); + + err = 0; + sb = mnt->mnt_sb; + f = au_sbi(sb)->si_xib; + if (!f) + goto out; + + /* stop printing the default xino path on the first writable branch */ + h_root = NULL; + brid = au_xino_brid(sb); + if (brid >= 0) { + bindex = au_br_index(sb, brid); + hdp = au_di(sb->s_root)->di_hdentry; + h_root = hdp[0 + bindex].hd_dentry; + } + d = f->f_dentry; + name = &d->d_name; + /* safe ->d_parent because the file is unlinked */ + if (d->d_parent == h_root + && name->len == len + && !memcmp(name->name, AUFS_XINO_FNAME, len)) + goto out; + + seq_puts(seq, ",xino="); + err = au_xino_path(seq, f); + +out: + return err; +#endif +} + +/* seq_file will re-call me in case of too long string */ +static int aufs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + int err; + unsigned int mnt_flags, v; + struct super_block *sb; + struct au_sbinfo *sbinfo; + +#define AuBool(name, str) do { \ + v = au_opt_test(mnt_flags, name); \ + if (v != au_opt_test(AuOpt_Def, name)) \ + seq_printf(m, ",%s" #str, v ? "" : "no"); \ +} while (0) + +#define AuStr(name, str) do { \ + v = mnt_flags & AuOptMask_##name; \ + if (v != (AuOpt_Def & AuOptMask_##name)) \ + seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \ +} while (0) + +#define AuUInt(name, str, val) do { \ + if (val != AUFS_##name##_DEF) \ + seq_printf(m, "," #str "=%u", val); \ +} while (0) + + /* lock free root dinfo */ + sb = mnt->mnt_sb; + si_noflush_read_lock(sb); + sbinfo = au_sbi(sb); + seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo)); + + mnt_flags = au_mntflags(sb); + if (au_opt_test(mnt_flags, XINO)) { + err = au_show_xino(m, mnt); + if (unlikely(err)) + goto out; + } else + seq_puts(m, ",noxino"); + + AuBool(TRUNC_XINO, trunc_xino); + AuStr(UDBA, udba); + AuBool(SHWH, shwh); + AuBool(PLINK, plink); + AuBool(DIO, dio); + /* AuBool(DIRPERM1, dirperm1); */ + /* AuBool(REFROF, refrof); */ + + v = sbinfo->si_wbr_create; + if (v != AuWbrCreate_Def) + au_show_wbr_create(m, v, sbinfo); + + v = sbinfo->si_wbr_copyup; + if (v != AuWbrCopyup_Def) + seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v)); + + v = au_opt_test(mnt_flags, ALWAYS_DIROPQ); + if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ)) + seq_printf(m, ",diropq=%c", v ? 'a' : 'w'); + + AuUInt(DIRWH, dirwh, sbinfo->si_dirwh); + + v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC; + AuUInt(RDCACHE, rdcache, v); + + AuUInt(RDBLK, rdblk, sbinfo->si_rdblk); + AuUInt(RDHASH, rdhash, sbinfo->si_rdhash); + + AuBool(SUM, sum); + /* AuBool(SUM_W, wsum); */ + AuBool(WARN_PERM, warn_perm); + AuBool(VERBOSE, verbose); + +out: + /* be sure to print "br:" last */ + if (!sysaufs_brs) { + seq_puts(m, ",br:"); + au_show_brs(m, sb); + } + si_read_unlock(sb); + return 0; + +#undef AuBool +#undef AuStr +#undef AuUInt +} + +/* ---------------------------------------------------------------------- */ + +/* sum mode which returns the summation for statfs(2) */ + +static u64 au_add_till_max(u64 a, u64 b) +{ + u64 old; + + old = a; + a += b; + if (old < a) + return a; + return ULLONG_MAX; +} + +static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf) +{ + int err; + u64 blocks, bfree, bavail, files, ffree; + aufs_bindex_t bend, bindex, i; + unsigned char shared; + struct path h_path; + struct super_block *h_sb; + + blocks = 0; + bfree = 0; + bavail = 0; + files = 0; + ffree = 0; + + err = 0; + bend = au_sbend(sb); + for (bindex = bend; bindex >= 0; bindex--) { + h_path.mnt = au_sbr_mnt(sb, bindex); + h_sb = h_path.mnt->mnt_sb; + shared = 0; + for (i = bindex + 1; !shared && i <= bend; i++) + shared = (au_sbr_sb(sb, i) == h_sb); + if (shared) + continue; + + /* sb->s_root for NFS is unreliable */ + h_path.dentry = h_path.mnt->mnt_root; + err = vfs_statfs(&h_path, buf); + if (unlikely(err)) + goto out; + + blocks = au_add_till_max(blocks, buf->f_blocks); + bfree = au_add_till_max(bfree, buf->f_bfree); + bavail = au_add_till_max(bavail, buf->f_bavail); + files = au_add_till_max(files, buf->f_files); + ffree = au_add_till_max(ffree, buf->f_ffree); + } + + buf->f_blocks = blocks; + buf->f_bfree = bfree; + buf->f_bavail = bavail; + buf->f_files = files; + buf->f_ffree = ffree; + +out: + return err; +} + +static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int err; + struct path h_path; + struct super_block *sb; + + /* lock free root dinfo */ + sb = dentry->d_sb; + si_noflush_read_lock(sb); + if (!au_opt_test(au_mntflags(sb), SUM)) { + /* sb->s_root for NFS is unreliable */ + h_path.mnt = au_sbr_mnt(sb, 0); + h_path.dentry = h_path.mnt->mnt_root; + err = vfs_statfs(&h_path, buf); + } else + err = au_statfs_sum(sb, buf); + si_read_unlock(sb); + + if (!err) { + buf->f_type = AUFS_SUPER_MAGIC; + buf->f_namelen = AUFS_MAX_NAMELEN; + memset(&buf->f_fsid, 0, sizeof(buf->f_fsid)); + } + /* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */ + + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* final actions when unmounting a file system */ +static void aufs_put_super(struct super_block *sb) +{ + struct au_sbinfo *sbinfo; + + sbinfo = au_sbi(sb); + if (!sbinfo) + return; + + dbgaufs_si_fin(sbinfo); + kobject_put(&sbinfo->si_kobj); +} + +/* ---------------------------------------------------------------------- */ + +void au_array_free(void *array) +{ + if (array) { + if (!is_vmalloc_addr(array)) + kfree(array); + else + vfree(array); + } +} + +void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg) +{ + void *array; + unsigned long long n; + + array = NULL; + n = 0; + if (!*hint) + goto out; + + if (*hint > ULLONG_MAX / sizeof(array)) { + array = ERR_PTR(-EMFILE); + pr_err("hint %llu\n", *hint); + goto out; + } + + array = kmalloc(sizeof(array) * *hint, GFP_NOFS); + if (unlikely(!array)) + array = vmalloc(sizeof(array) * *hint); + if (unlikely(!array)) { + array = ERR_PTR(-ENOMEM); + goto out; + } + + n = cb(array, *hint, arg); + AuDebugOn(n > *hint); + +out: + *hint = n; + return array; +} + +static unsigned long long au_iarray_cb(void *a, + unsigned long long max __maybe_unused, + void *arg) +{ + unsigned long long n; + struct inode **p, *inode; + struct list_head *head; + + n = 0; + p = a; + head = arg; + spin_lock(&inode_sb_list_lock); + list_for_each_entry(inode, head, i_sb_list) { + if (!is_bad_inode(inode) + && au_ii(inode)->ii_bstart >= 0) { + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_count)) { + au_igrab(inode); + *p++ = inode; + n++; + AuDebugOn(n > max); + } + spin_unlock(&inode->i_lock); + } + } + spin_unlock(&inode_sb_list_lock); + + return n; +} + +struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max) +{ + *max = atomic_long_read(&au_sbi(sb)->si_ninodes); + return au_array_alloc(max, au_iarray_cb, &sb->s_inodes); +} + +void au_iarray_free(struct inode **a, unsigned long long max) +{ + unsigned long long ull; + + for (ull = 0; ull < max; ull++) + iput(a[ull]); + au_array_free(a); +} + +/* ---------------------------------------------------------------------- */ + +/* + * refresh dentry and inode at remount time. + */ +/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */ +static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags, + struct dentry *parent) +{ + int err; + + di_write_lock_child(dentry); + di_read_lock_parent(parent, AuLock_IR); + err = au_refresh_dentry(dentry, parent); + if (!err && dir_flags) + au_hn_reset(dentry->d_inode, dir_flags); + di_read_unlock(parent, AuLock_IR); + di_write_unlock(dentry); + + return err; +} + +static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen, + struct au_sbinfo *sbinfo, + const unsigned int dir_flags) +{ + int err; + struct dentry *parent; + struct inode *inode; + + err = 0; + parent = dget_parent(dentry); + if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) { + inode = dentry->d_inode; + if (inode) { + if (!S_ISDIR(inode->i_mode)) + err = au_do_refresh(dentry, /*dir_flags*/0, + parent); + else { + err = au_do_refresh(dentry, dir_flags, parent); + if (unlikely(err)) + au_fset_si(sbinfo, FAILED_REFRESH_DIR); + } + } else + err = au_do_refresh(dentry, /*dir_flags*/0, parent); + AuDbgDentry(dentry); + } + dput(parent); + + AuTraceErr(err); + return err; +} + +static int au_refresh_d(struct super_block *sb) +{ + int err, i, j, ndentry, e; + unsigned int sigen; + struct au_dcsub_pages dpages; + struct au_dpage *dpage; + struct dentry **dentries, *d; + struct au_sbinfo *sbinfo; + struct dentry *root = sb->s_root; + const unsigned int dir_flags = au_hi_flags(root->d_inode, /*isdir*/1); + + err = au_dpages_init(&dpages, GFP_NOFS); + if (unlikely(err)) + goto out; + err = au_dcsub_pages(&dpages, root, NULL, NULL); + if (unlikely(err)) + goto out_dpages; + + sigen = au_sigen(sb); + sbinfo = au_sbi(sb); + for (i = 0; i < dpages.ndpage; i++) { + dpage = dpages.dpages + i; + dentries = dpage->dentries; + ndentry = dpage->ndentry; + for (j = 0; j < ndentry; j++) { + d = dentries[j]; + e = au_do_refresh_d(d, sigen, sbinfo, dir_flags); + if (unlikely(e && !err)) + err = e; + /* go on even err */ + } + } + +out_dpages: + au_dpages_free(&dpages); +out: + return err; +} + +static int au_refresh_i(struct super_block *sb) +{ + int err, e; + unsigned int sigen; + unsigned long long max, ull; + struct inode *inode, **array; + + array = au_iarray_alloc(sb, &max); + err = PTR_ERR(array); + if (IS_ERR(array)) + goto out; + + err = 0; + sigen = au_sigen(sb); + for (ull = 0; ull < max; ull++) { + inode = array[ull]; + if (au_iigen(inode) != sigen) { + ii_write_lock_child(inode); + e = au_refresh_hinode_self(inode); + ii_write_unlock(inode); + if (unlikely(e)) { + pr_err("error %d, i%lu\n", e, inode->i_ino); + if (!err) + err = e; + /* go on even if err */ + } + } + } + + au_iarray_free(array, max); + +out: + return err; +} + +static void au_remount_refresh(struct super_block *sb) +{ + int err, e; + unsigned int udba; + aufs_bindex_t bindex, bend; + struct dentry *root; + struct inode *inode; + struct au_branch *br; + + au_sigen_inc(sb); + au_fclr_si(au_sbi(sb), FAILED_REFRESH_DIR); + + root = sb->s_root; + DiMustNoWaiters(root); + inode = root->d_inode; + IiMustNoWaiters(inode); + + udba = au_opt_udba(sb); + bend = au_sbend(sb); + for (bindex = 0; bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + err = au_hnotify_reset_br(udba, br, br->br_perm); + if (unlikely(err)) + AuIOErr("hnotify failed on br %d, %d, ignored\n", + bindex, err); + /* go on even if err */ + } + au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1)); + + di_write_unlock(root); + err = au_refresh_d(sb); + e = au_refresh_i(sb); + if (unlikely(e && !err)) + err = e; + /* aufs_write_lock() calls ..._child() */ + di_write_lock_child(root); + + au_cpup_attr_all(inode, /*force*/1); + + if (unlikely(err)) + AuIOErr("refresh failed, ignored, %d\n", err); +} + +/* stop extra interpretation of errno in mount(8), and strange error messages */ +static int cvt_err(int err) +{ + AuTraceErr(err); + + switch (err) { + case -ENOENT: + case -ENOTDIR: + case -EEXIST: + case -EIO: + err = -EINVAL; + } + return err; +} + +static int aufs_remount_fs(struct super_block *sb, int *flags, char *data) +{ + int err, do_dx; + unsigned int mntflags; + struct au_opts opts; + struct dentry *root; + struct inode *inode; + struct au_sbinfo *sbinfo; + + err = 0; + root = sb->s_root; + if (!data || !*data) { + err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); + if (!err) { + di_write_lock_child(root); + err = au_opts_verify(sb, *flags, /*pending*/0); + aufs_write_unlock(root); + } + goto out; + } + + err = -ENOMEM; + memset(&opts, 0, sizeof(opts)); + opts.opt = (void *)__get_free_page(GFP_NOFS); + if (unlikely(!opts.opt)) + goto out; + opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); + opts.flags = AuOpts_REMOUNT; + opts.sb_flags = *flags; + + /* parse it before aufs lock */ + err = au_opts_parse(sb, data, &opts); + if (unlikely(err)) + goto out_opts; + + sbinfo = au_sbi(sb); + inode = root->d_inode; + mutex_lock(&inode->i_mutex); + err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); + if (unlikely(err)) + goto out_mtx; + di_write_lock_child(root); + + /* au_opts_remount() may return an error */ + err = au_opts_remount(sb, &opts); + au_opts_free(&opts); + + if (au_ftest_opts(opts.flags, REFRESH)) + au_remount_refresh(sb); + + if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) { + mntflags = au_mntflags(sb); + do_dx = !!au_opt_test(mntflags, DIO); + au_dy_arefresh(do_dx); + } + + aufs_write_unlock(root); + +out_mtx: + mutex_unlock(&inode->i_mutex); +out_opts: + free_page((unsigned long)opts.opt); +out: + err = cvt_err(err); + AuTraceErr(err); + return err; +} + +static const struct super_operations aufs_sop = { + .alloc_inode = aufs_alloc_inode, + .destroy_inode = aufs_destroy_inode, + /* always deleting, no clearing */ + .drop_inode = generic_delete_inode, + .show_options = aufs_show_options, + .statfs = aufs_statfs, + .put_super = aufs_put_super, + .remount_fs = aufs_remount_fs +}; + +/* ---------------------------------------------------------------------- */ + +static int alloc_root(struct super_block *sb) +{ + int err; + struct inode *inode; + struct dentry *root; + + err = -ENOMEM; + inode = au_iget_locked(sb, AUFS_ROOT_INO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + inode->i_op = &aufs_dir_iop; + inode->i_fop = &aufs_dir_fop; + inode->i_mode = S_IFDIR; + set_nlink(inode, 2); + unlock_new_inode(inode); + + root = d_alloc_root(inode); + if (unlikely(!root)) + goto out_iput; + err = PTR_ERR(root); + if (IS_ERR(root)) + goto out_iput; + + err = au_di_init(root); + if (!err) { + sb->s_root = root; + return 0; /* success */ + } + dput(root); + goto out; /* do not iput */ + +out_iput: + iget_failed(inode); +out: + return err; + +} + +static int aufs_fill_super(struct super_block *sb, void *raw_data, + int silent __maybe_unused) +{ + int err; + struct au_opts opts; + struct dentry *root; + struct inode *inode; + char *arg = raw_data; + + if (unlikely(!arg || !*arg)) { + err = -EINVAL; + pr_err("no arg\n"); + goto out; + } + + err = -ENOMEM; + memset(&opts, 0, sizeof(opts)); + opts.opt = (void *)__get_free_page(GFP_NOFS); + if (unlikely(!opts.opt)) + goto out; + opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); + opts.sb_flags = sb->s_flags; + + err = au_si_alloc(sb); + if (unlikely(err)) + goto out_opts; + + /* all timestamps always follow the ones on the branch */ + sb->s_flags |= MS_NOATIME | MS_NODIRATIME; + sb->s_op = &aufs_sop; + sb->s_d_op = &aufs_dop; + sb->s_magic = AUFS_SUPER_MAGIC; + sb->s_maxbytes = 0; + au_export_init(sb); + + err = alloc_root(sb); + if (unlikely(err)) { + si_write_unlock(sb); + goto out_info; + } + root = sb->s_root; + inode = root->d_inode; + + /* + * actually we can parse options regardless aufs lock here. + * but at remount time, parsing must be done before aufs lock. + * so we follow the same rule. + */ + ii_write_lock_parent(inode); + aufs_write_unlock(root); + err = au_opts_parse(sb, arg, &opts); + if (unlikely(err)) + goto out_root; + + /* lock vfs_inode first, then aufs. */ + mutex_lock(&inode->i_mutex); + aufs_write_lock(root); + err = au_opts_mount(sb, &opts); + au_opts_free(&opts); + aufs_write_unlock(root); + mutex_unlock(&inode->i_mutex); + if (!err) + goto out_opts; /* success */ + +out_root: + dput(root); + sb->s_root = NULL; +out_info: + dbgaufs_si_fin(au_sbi(sb)); + kobject_put(&au_sbi(sb)->si_kobj); + sb->s_fs_info = NULL; +out_opts: + free_page((unsigned long)opts.opt); +out: + AuTraceErr(err); + err = cvt_err(err); + AuTraceErr(err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name __maybe_unused, + void *raw_data) +{ + struct dentry *root; + struct super_block *sb; + + /* all timestamps always follow the ones on the branch */ + /* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */ + root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super); + if (IS_ERR(root)) + goto out; + + sb = root->d_sb; + si_write_lock(sb, !AuLock_FLUSH); + sysaufs_brs_add(sb, 0); + si_write_unlock(sb); + au_sbilist_add(sb); + +out: + return root; +} + +static void aufs_kill_sb(struct super_block *sb) +{ + struct au_sbinfo *sbinfo; + + sbinfo = au_sbi(sb); + if (sbinfo) { + au_sbilist_del(sb); + aufs_write_lock(sb->s_root); + if (sbinfo->si_wbr_create_ops->fin) + sbinfo->si_wbr_create_ops->fin(sb); + if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) { + au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE); + au_remount_refresh(sb); + } + if (au_opt_test(sbinfo->si_mntflags, PLINK)) + au_plink_put(sb, /*verbose*/1); + au_xino_clr(sb); + sbinfo->si_sb = NULL; + aufs_write_unlock(sb->s_root); + au_nwt_flush(&sbinfo->si_nowait); + } + generic_shutdown_super(sb); +} + +struct file_system_type aufs_fs_type = { + .name = AUFS_FSTYPE, + .fs_flags = + FS_RENAME_DOES_D_MOVE /* a race between rename and others */ + | FS_REVAL_DOT, /* for NFS branch and udba */ + .mount = aufs_mount, + .kill_sb = aufs_kill_sb, + /* no need to __module_get() and module_put(). */ + .owner = THIS_MODULE, +}; diff --git a/ubuntu/aufs/super.h b/ubuntu/aufs/super.h new file mode 100644 index 000000000000..f490b4af4c86 --- /dev/null +++ b/ubuntu/aufs/super.h @@ -0,0 +1,547 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * super_block operations + */ + +#ifndef __AUFS_SUPER_H__ +#define __AUFS_SUPER_H__ + +#ifdef __KERNEL__ + +#include +#include +#include "rwsem.h" +#include "spl.h" +#include "wkq.h" + +typedef ssize_t (*au_readf_t)(struct file *, char __user *, size_t, loff_t *); +typedef ssize_t (*au_writef_t)(struct file *, const char __user *, size_t, + loff_t *); + +/* policies to select one among multiple writable branches */ +struct au_wbr_copyup_operations { + int (*copyup)(struct dentry *dentry); +}; + +struct au_wbr_create_operations { + int (*create)(struct dentry *dentry, int isdir); + int (*init)(struct super_block *sb); + int (*fin)(struct super_block *sb); +}; + +struct au_wbr_mfs { + struct mutex mfs_lock; /* protect this structure */ + unsigned long mfs_jiffy; + unsigned long mfs_expire; + aufs_bindex_t mfs_bindex; + + unsigned long long mfsrr_bytes; + unsigned long long mfsrr_watermark; +}; + +struct au_branch; +struct au_sbinfo { + /* nowait tasks in the system-wide workqueue */ + struct au_nowait_tasks si_nowait; + + /* + * tried sb->s_umount, but failed due to the dependecy between i_mutex. + * rwsem for au_sbinfo is necessary. + */ + struct au_rwsem si_rwsem; + + /* prevent recursive locking in deleting inode */ + struct { + unsigned long *bitmap; + spinlock_t tree_lock; + struct radix_tree_root tree; + } au_si_pid; + + /* + * dirty approach to protect sb->sb_inodes and ->s_files from remount. + */ + atomic_long_t si_ninodes, si_nfiles; + + /* branch management */ + unsigned int si_generation; + + /* see above flags */ + unsigned char au_si_status; + + aufs_bindex_t si_bend; + + /* dirty trick to keep br_id plus */ + unsigned int si_last_br_id : + sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1; + struct au_branch **si_branch; + + /* policy to select a writable branch */ + unsigned char si_wbr_copyup; + unsigned char si_wbr_create; + struct au_wbr_copyup_operations *si_wbr_copyup_ops; + struct au_wbr_create_operations *si_wbr_create_ops; + + /* round robin */ + atomic_t si_wbr_rr_next; + + /* most free space */ + struct au_wbr_mfs si_wbr_mfs; + + /* mount flags */ + /* include/asm-ia64/siginfo.h defines a macro named si_flags */ + unsigned int si_mntflags; + + /* external inode number (bitmap and translation table) */ + au_readf_t si_xread; + au_writef_t si_xwrite; + struct file *si_xib; + struct mutex si_xib_mtx; /* protect xib members */ + unsigned long *si_xib_buf; + unsigned long si_xib_last_pindex; + int si_xib_next_bit; + aufs_bindex_t si_xino_brid; + /* reserved for future use */ + /* unsigned long long si_xib_limit; */ /* Max xib file size */ + +#ifdef CONFIG_AUFS_EXPORT + /* i_generation */ + struct file *si_xigen; + atomic_t si_xigen_next; +#endif + + /* vdir parameters */ + unsigned long si_rdcache; /* max cache time in jiffies */ + unsigned int si_rdblk; /* deblk size */ + unsigned int si_rdhash; /* hash size */ + + /* + * If the number of whiteouts are larger than si_dirwh, leave all of + * them after au_whtmp_ren to reduce the cost of rmdir(2). + * future fsck.aufs or kernel thread will remove them later. + * Otherwise, remove all whiteouts and the dir in rmdir(2). + */ + unsigned int si_dirwh; + + /* + * rename(2) a directory with all children. + */ + /* reserved for future use */ + /* int si_rendir; */ + + /* pseudo_link list */ + struct au_splhead si_plink; + wait_queue_head_t si_plink_wq; + spinlock_t si_plink_maint_lock; + pid_t si_plink_maint_pid; + + /* + * sysfs and lifetime management. + * this is not a small structure and it may be a waste of memory in case + * of sysfs is disabled, particulary when many aufs-es are mounted. + * but using sysfs is majority. + */ + struct kobject si_kobj; +#ifdef CONFIG_DEBUG_FS + struct dentry *si_dbgaufs, *si_dbgaufs_xib; +#ifdef CONFIG_AUFS_EXPORT + struct dentry *si_dbgaufs_xigen; +#endif +#endif + +#ifdef CONFIG_AUFS_SBILIST + struct list_head si_list; +#endif + + /* dirty, necessary for unmounting, sysfs and sysrq */ + struct super_block *si_sb; +}; + +/* sbinfo status flags */ +/* + * set true when refresh_dirs() failed at remount time. + * then try refreshing dirs at access time again. + * if it is false, refreshing dirs at access time is unnecesary + */ +#define AuSi_FAILED_REFRESH_DIR 1 +static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi, + unsigned int flag) +{ + AuRwMustAnyLock(&sbi->si_rwsem); + return sbi->au_si_status & flag; +} +#define au_ftest_si(sbinfo, name) au_do_ftest_si(sbinfo, AuSi_##name) +#define au_fset_si(sbinfo, name) do { \ + AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ + (sbinfo)->au_si_status |= AuSi_##name; \ +} while (0) +#define au_fclr_si(sbinfo, name) do { \ + AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ + (sbinfo)->au_si_status &= ~AuSi_##name; \ +} while (0) + +/* ---------------------------------------------------------------------- */ + +/* policy to select one among writable branches */ +#define AuWbrCopyup(sbinfo, ...) \ + ((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__)) +#define AuWbrCreate(sbinfo, ...) \ + ((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__)) + +/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */ +#define AuLock_DW 1 /* write-lock dentry */ +#define AuLock_IR (1 << 1) /* read-lock inode */ +#define AuLock_IW (1 << 2) /* write-lock inode */ +#define AuLock_FLUSH (1 << 3) /* wait for 'nowait' tasks */ +#define AuLock_DIR (1 << 4) /* target is a dir */ +#define AuLock_NOPLM (1 << 5) /* return err in plm mode */ +#define AuLock_NOPLMW (1 << 6) /* wait for plm mode ends */ +#define AuLock_GEN (1 << 7) /* test digen/iigen */ +#define au_ftest_lock(flags, name) ((flags) & AuLock_##name) +#define au_fset_lock(flags, name) \ + do { (flags) |= AuLock_##name; } while (0) +#define au_fclr_lock(flags, name) \ + do { (flags) &= ~AuLock_##name; } while (0) + +/* ---------------------------------------------------------------------- */ + +/* super.c */ +extern struct file_system_type aufs_fs_type; +struct inode *au_iget_locked(struct super_block *sb, ino_t ino); +typedef unsigned long long (*au_arraycb_t)(void *array, unsigned long long max, + void *arg); +void au_array_free(void *array); +void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg); +struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max); +void au_iarray_free(struct inode **a, unsigned long long max); + +/* sbinfo.c */ +void au_si_free(struct kobject *kobj); +int au_si_alloc(struct super_block *sb); +int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr); + +unsigned int au_sigen_inc(struct super_block *sb); +aufs_bindex_t au_new_br_id(struct super_block *sb); + +int si_read_lock(struct super_block *sb, int flags); +int si_write_lock(struct super_block *sb, int flags); +int aufs_read_lock(struct dentry *dentry, int flags); +void aufs_read_unlock(struct dentry *dentry, int flags); +void aufs_write_lock(struct dentry *dentry); +void aufs_write_unlock(struct dentry *dentry); +int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags); +void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2); + +int si_pid_test_slow(struct super_block *sb); +void si_pid_set_slow(struct super_block *sb); +void si_pid_clr_slow(struct super_block *sb); + +/* wbr_policy.c */ +extern struct au_wbr_copyup_operations au_wbr_copyup_ops[]; +extern struct au_wbr_create_operations au_wbr_create_ops[]; +int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst); + +/* ---------------------------------------------------------------------- */ + +static inline struct au_sbinfo *au_sbi(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* ---------------------------------------------------------------------- */ + +#ifdef CONFIG_AUFS_EXPORT +void au_export_init(struct super_block *sb); + +static inline int au_test_nfsd(void) +{ + struct task_struct *tsk = current; + + return (tsk->flags & PF_KTHREAD) + && !strcmp(tsk->comm, "nfsd"); +} + +void au_xigen_inc(struct inode *inode); +int au_xigen_new(struct inode *inode); +int au_xigen_set(struct super_block *sb, struct file *base); +void au_xigen_clr(struct super_block *sb); + +static inline int au_busy_or_stale(void) +{ + if (!au_test_nfsd()) + return -EBUSY; + return -ESTALE; +} +#else +AuStubVoid(au_export_init, struct super_block *sb) +AuStubInt0(au_test_nfsd, void) +AuStubVoid(au_xigen_inc, struct inode *inode) +AuStubInt0(au_xigen_new, struct inode *inode) +AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base) +AuStubVoid(au_xigen_clr, struct super_block *sb) +static inline int au_busy_or_stale(void) +{ + return -EBUSY; +} +#endif /* CONFIG_AUFS_EXPORT */ + +/* ---------------------------------------------------------------------- */ + +#ifdef CONFIG_AUFS_SBILIST +/* module.c */ +extern struct au_splhead au_sbilist; + +static inline void au_sbilist_init(void) +{ + au_spl_init(&au_sbilist); +} + +static inline void au_sbilist_add(struct super_block *sb) +{ + au_spl_add(&au_sbi(sb)->si_list, &au_sbilist); +} + +static inline void au_sbilist_del(struct super_block *sb) +{ + au_spl_del(&au_sbi(sb)->si_list, &au_sbilist); +} + +#ifdef CONFIG_AUFS_MAGIC_SYSRQ +static inline void au_sbilist_lock(void) +{ + spin_lock(&au_sbilist.spin); +} + +static inline void au_sbilist_unlock(void) +{ + spin_unlock(&au_sbilist.spin); +} +#define AuGFP_SBILIST GFP_ATOMIC +#else +AuStubVoid(au_sbilist_lock, void) +AuStubVoid(au_sbilist_unlock, void) +#define AuGFP_SBILIST GFP_NOFS +#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ +#else +AuStubVoid(au_sbilist_init, void) +AuStubVoid(au_sbilist_add, struct super_block*) +AuStubVoid(au_sbilist_del, struct super_block*) +AuStubVoid(au_sbilist_lock, void) +AuStubVoid(au_sbilist_unlock, void) +#define AuGFP_SBILIST GFP_NOFS +#endif + +/* ---------------------------------------------------------------------- */ + +static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo) +{ + /* + * This function is a dynamic '__init' fucntion actually, + * so the tiny check for si_rwsem is unnecessary. + */ + /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ +#ifdef CONFIG_DEBUG_FS + sbinfo->si_dbgaufs = NULL; + sbinfo->si_dbgaufs_xib = NULL; +#ifdef CONFIG_AUFS_EXPORT + sbinfo->si_dbgaufs_xigen = NULL; +#endif +#endif +} + +/* ---------------------------------------------------------------------- */ + +static inline pid_t si_pid_bit(void) +{ + /* the origin of pid is 1, but the bitmap's is 0 */ + return current->pid - 1; +} + +static inline int si_pid_test(struct super_block *sb) +{ + pid_t bit = si_pid_bit(); + if (bit < PID_MAX_DEFAULT) + return test_bit(bit, au_sbi(sb)->au_si_pid.bitmap); + else + return si_pid_test_slow(sb); +} + +static inline void si_pid_set(struct super_block *sb) +{ + pid_t bit = si_pid_bit(); + if (bit < PID_MAX_DEFAULT) { + AuDebugOn(test_bit(bit, au_sbi(sb)->au_si_pid.bitmap)); + set_bit(bit, au_sbi(sb)->au_si_pid.bitmap); + /* smp_mb(); */ + } else + si_pid_set_slow(sb); +} + +static inline void si_pid_clr(struct super_block *sb) +{ + pid_t bit = si_pid_bit(); + if (bit < PID_MAX_DEFAULT) { + AuDebugOn(!test_bit(bit, au_sbi(sb)->au_si_pid.bitmap)); + clear_bit(bit, au_sbi(sb)->au_si_pid.bitmap); + /* smp_mb(); */ + } else + si_pid_clr_slow(sb); +} + +/* ---------------------------------------------------------------------- */ + +/* lock superblock. mainly for entry point functions */ +/* + * __si_read_lock, __si_write_lock, + * __si_read_unlock, __si_write_unlock, __si_downgrade_lock + */ +AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem); + +#define SiMustNoWaiters(sb) AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem) +#define SiMustAnyLock(sb) AuRwMustAnyLock(&au_sbi(sb)->si_rwsem) +#define SiMustWriteLock(sb) AuRwMustWriteLock(&au_sbi(sb)->si_rwsem) + +static inline void si_noflush_read_lock(struct super_block *sb) +{ + __si_read_lock(sb); + si_pid_set(sb); +} + +static inline int si_noflush_read_trylock(struct super_block *sb) +{ + int locked = __si_read_trylock(sb); + if (locked) + si_pid_set(sb); + return locked; +} + +static inline void si_noflush_write_lock(struct super_block *sb) +{ + __si_write_lock(sb); + si_pid_set(sb); +} + +static inline int si_noflush_write_trylock(struct super_block *sb) +{ + int locked = __si_write_trylock(sb); + if (locked) + si_pid_set(sb); + return locked; +} + +#if 0 /* unused */ +static inline int si_read_trylock(struct super_block *sb, int flags) +{ + if (au_ftest_lock(flags, FLUSH)) + au_nwt_flush(&au_sbi(sb)->si_nowait); + return si_noflush_read_trylock(sb); +} +#endif + +static inline void si_read_unlock(struct super_block *sb) +{ + si_pid_clr(sb); + __si_read_unlock(sb); +} + +#if 0 /* unused */ +static inline int si_write_trylock(struct super_block *sb, int flags) +{ + if (au_ftest_lock(flags, FLUSH)) + au_nwt_flush(&au_sbi(sb)->si_nowait); + return si_noflush_write_trylock(sb); +} +#endif + +static inline void si_write_unlock(struct super_block *sb) +{ + si_pid_clr(sb); + __si_write_unlock(sb); +} + +#if 0 /* unused */ +static inline void si_downgrade_lock(struct super_block *sb) +{ + __si_downgrade_lock(sb); +} +#endif + +/* ---------------------------------------------------------------------- */ + +static inline aufs_bindex_t au_sbend(struct super_block *sb) +{ + SiMustAnyLock(sb); + return au_sbi(sb)->si_bend; +} + +static inline unsigned int au_mntflags(struct super_block *sb) +{ + SiMustAnyLock(sb); + return au_sbi(sb)->si_mntflags; +} + +static inline unsigned int au_sigen(struct super_block *sb) +{ + SiMustAnyLock(sb); + return au_sbi(sb)->si_generation; +} + +static inline void au_ninodes_inc(struct super_block *sb) +{ + atomic_long_inc(&au_sbi(sb)->si_ninodes); +} + +static inline void au_ninodes_dec(struct super_block *sb) +{ + AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_ninodes)); + atomic_long_dec(&au_sbi(sb)->si_ninodes); +} + +static inline void au_nfiles_inc(struct super_block *sb) +{ + atomic_long_inc(&au_sbi(sb)->si_nfiles); +} + +static inline void au_nfiles_dec(struct super_block *sb) +{ + AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_nfiles)); + atomic_long_dec(&au_sbi(sb)->si_nfiles); +} + +static inline struct au_branch *au_sbr(struct super_block *sb, + aufs_bindex_t bindex) +{ + SiMustAnyLock(sb); + return au_sbi(sb)->si_branch[0 + bindex]; +} + +static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid) +{ + SiMustWriteLock(sb); + au_sbi(sb)->si_xino_brid = brid; +} + +static inline aufs_bindex_t au_xino_brid(struct super_block *sb) +{ + SiMustAnyLock(sb); + return au_sbi(sb)->si_xino_brid; +} + +#endif /* __KERNEL__ */ +#endif /* __AUFS_SUPER_H__ */ diff --git a/ubuntu/aufs/sysaufs.c b/ubuntu/aufs/sysaufs.c new file mode 100644 index 000000000000..ab4036e18a67 --- /dev/null +++ b/ubuntu/aufs/sysaufs.c @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * sysfs interface and lifetime management + * they are necessary regardless sysfs is disabled. + */ + +#include +#include +#include +#include "aufs.h" + +unsigned long sysaufs_si_mask; +struct kset *sysaufs_kset; + +#define AuSiAttr(_name) { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = sysaufs_si_##_name, \ +} + +static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path); +struct attribute *sysaufs_si_attrs[] = { + &sysaufs_si_attr_xi_path.attr, + NULL, +}; + +static const struct sysfs_ops au_sbi_ops = { + .show = sysaufs_si_show +}; + +static struct kobj_type au_sbi_ktype = { + .release = au_si_free, + .sysfs_ops = &au_sbi_ops, + .default_attrs = sysaufs_si_attrs +}; + +/* ---------------------------------------------------------------------- */ + +int sysaufs_si_init(struct au_sbinfo *sbinfo) +{ + int err; + + sbinfo->si_kobj.kset = sysaufs_kset; + /* cf. sysaufs_name() */ + err = kobject_init_and_add + (&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL, + SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo)); + + dbgaufs_si_null(sbinfo); + if (!err) { + err = dbgaufs_si_init(sbinfo); + if (unlikely(err)) + kobject_put(&sbinfo->si_kobj); + } + return err; +} + +void sysaufs_fin(void) +{ + dbgaufs_fin(); + sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group); + kset_unregister(sysaufs_kset); +} + +int __init sysaufs_init(void) +{ + int err; + + do { + get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask)); + } while (!sysaufs_si_mask); + + err = -EINVAL; + sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj); + if (unlikely(!sysaufs_kset)) + goto out; + err = PTR_ERR(sysaufs_kset); + if (IS_ERR(sysaufs_kset)) + goto out; + err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group); + if (unlikely(err)) { + kset_unregister(sysaufs_kset); + goto out; + } + + err = dbgaufs_init(); + if (unlikely(err)) + sysaufs_fin(); +out: + return err; +} diff --git a/ubuntu/aufs/sysaufs.h b/ubuntu/aufs/sysaufs.h new file mode 100644 index 000000000000..5b6b8d0c5a85 --- /dev/null +++ b/ubuntu/aufs/sysaufs.h @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * sysfs interface and mount lifetime management + */ + +#ifndef __SYSAUFS_H__ +#define __SYSAUFS_H__ + +#ifdef __KERNEL__ + +#include +#include +#include "module.h" + +struct super_block; +struct au_sbinfo; + +struct sysaufs_si_attr { + struct attribute attr; + int (*show)(struct seq_file *seq, struct super_block *sb); +}; + +/* ---------------------------------------------------------------------- */ + +/* sysaufs.c */ +extern unsigned long sysaufs_si_mask; +extern struct kset *sysaufs_kset; +extern struct attribute *sysaufs_si_attrs[]; +int sysaufs_si_init(struct au_sbinfo *sbinfo); +int __init sysaufs_init(void); +void sysaufs_fin(void); + +/* ---------------------------------------------------------------------- */ + +/* some people doesn't like to show a pointer in kernel */ +static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo) +{ + return sysaufs_si_mask ^ (unsigned long)sbinfo; +} + +#define SysaufsSiNamePrefix "si_" +#define SysaufsSiNameLen (sizeof(SysaufsSiNamePrefix) + 16) +static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name) +{ + snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx", + sysaufs_si_id(sbinfo)); +} + +struct au_branch; +#ifdef CONFIG_SYSFS +/* sysfs.c */ +extern struct attribute_group *sysaufs_attr_group; + +int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb); +ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, + char *buf); + +void sysaufs_br_init(struct au_branch *br); +void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); +void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); + +#define sysaufs_brs_init() do {} while (0) + +#else +#define sysaufs_attr_group NULL + +AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb) + +static inline +ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return 0; +} + +AuStubVoid(sysaufs_br_init, struct au_branch *br) +AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) +AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) + +static inline void sysaufs_brs_init(void) +{ + sysaufs_brs = 0; +} + +#endif /* CONFIG_SYSFS */ + +#endif /* __KERNEL__ */ +#endif /* __SYSAUFS_H__ */ diff --git a/ubuntu/aufs/sysfs.c b/ubuntu/aufs/sysfs.c new file mode 100644 index 000000000000..215cd6e5a2fb --- /dev/null +++ b/ubuntu/aufs/sysfs.c @@ -0,0 +1,260 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * sysfs interface + */ + +#include +#include +#include +#include +#include "aufs.h" + +#ifdef CONFIG_AUFS_FS_MODULE +/* this entry violates the "one line per file" policy of sysfs */ +static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + ssize_t err; + static char *conf = +/* this file is generated at compiling */ +#include "conf.str" + ; + + err = snprintf(buf, PAGE_SIZE, conf); + if (unlikely(err >= PAGE_SIZE)) + err = -EFBIG; + return err; +} + +static struct kobj_attribute au_config_attr = __ATTR_RO(config); +#endif + +static struct attribute *au_attr[] = { +#ifdef CONFIG_AUFS_FS_MODULE + &au_config_attr.attr, +#endif + NULL, /* need to NULL terminate the list of attributes */ +}; + +static struct attribute_group sysaufs_attr_group_body = { + .attrs = au_attr +}; + +struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body; + +/* ---------------------------------------------------------------------- */ + +int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) +{ + int err; + + SiMustAnyLock(sb); + + err = 0; + if (au_opt_test(au_mntflags(sb), XINO)) { + err = au_xino_path(seq, au_sbi(sb)->si_xib); + seq_putc(seq, '\n'); + } + return err; +} + +/* + * the lifetime of branch is independent from the entry under sysfs. + * sysfs handles the lifetime of the entry, and never call ->show() after it is + * unlinked. + */ +static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb, + aufs_bindex_t bindex) +{ + int err; + struct path path; + struct dentry *root; + struct au_branch *br; + char *perm; + + AuDbg("b%d\n", bindex); + + err = 0; + root = sb->s_root; + di_read_lock_parent(root, !AuLock_IR); + br = au_sbr(sb, bindex); + path.mnt = br->br_mnt; + path.dentry = au_h_dptr(root, bindex); + au_seq_path(seq, &path); + di_read_unlock(root, !AuLock_IR); + perm = au_optstr_br_perm(br->br_perm); + if (perm) { + err = seq_printf(seq, "=%s\n", perm); + kfree(perm); + if (err == -1) + err = -E2BIG; + } else + err = -ENOMEM; + return err; +} + +/* ---------------------------------------------------------------------- */ + +static struct seq_file *au_seq(char *p, ssize_t len) +{ + struct seq_file *seq; + + seq = kzalloc(sizeof(*seq), GFP_NOFS); + if (seq) { + /* mutex_init(&seq.lock); */ + seq->buf = p; + seq->size = len; + return seq; /* success */ + } + + seq = ERR_PTR(-ENOMEM); + return seq; +} + +#define SysaufsBr_PREFIX "br" + +/* todo: file size may exceed PAGE_SIZE */ +ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + ssize_t err; + long l; + aufs_bindex_t bend; + struct au_sbinfo *sbinfo; + struct super_block *sb; + struct seq_file *seq; + char *name; + struct attribute **cattr; + + sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); + sb = sbinfo->si_sb; + + /* + * prevent a race condition between sysfs and aufs. + * for instance, sysfs_file_read() calls sysfs_get_active_two() which + * prohibits maintaining the sysfs entries. + * hew we acquire read lock after sysfs_get_active_two(). + * on the other hand, the remount process may maintain the sysfs/aufs + * entries after acquiring write lock. + * it can cause a deadlock. + * simply we gave up processing read here. + */ + err = -EBUSY; + if (unlikely(!si_noflush_read_trylock(sb))) + goto out; + + seq = au_seq(buf, PAGE_SIZE); + err = PTR_ERR(seq); + if (IS_ERR(seq)) + goto out_unlock; + + name = (void *)attr->name; + cattr = sysaufs_si_attrs; + while (*cattr) { + if (!strcmp(name, (*cattr)->name)) { + err = container_of(*cattr, struct sysaufs_si_attr, attr) + ->show(seq, sb); + goto out_seq; + } + cattr++; + } + + bend = au_sbend(sb); + if (!strncmp(name, SysaufsBr_PREFIX, sizeof(SysaufsBr_PREFIX) - 1)) { + name += sizeof(SysaufsBr_PREFIX) - 1; + err = kstrtol(name, 10, &l); + if (!err) { + if (l <= bend) + err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l); + else + err = -ENOENT; + } + goto out_seq; + } + BUG(); + +out_seq: + if (!err) { + err = seq->count; + /* sysfs limit */ + if (unlikely(err == PAGE_SIZE)) + err = -EFBIG; + } + kfree(seq); +out_unlock: + si_read_unlock(sb); +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +void sysaufs_br_init(struct au_branch *br) +{ + struct attribute *attr = &br->br_attr; + + sysfs_attr_init(attr); + attr->name = br->br_name; + attr->mode = S_IRUGO; +} + +void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) +{ + struct au_branch *br; + struct kobject *kobj; + aufs_bindex_t bend; + + dbgaufs_brs_del(sb, bindex); + + if (!sysaufs_brs) + return; + + kobj = &au_sbi(sb)->si_kobj; + bend = au_sbend(sb); + for (; bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + sysfs_remove_file(kobj, &br->br_attr); + } +} + +void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) +{ + int err; + aufs_bindex_t bend; + struct kobject *kobj; + struct au_branch *br; + + dbgaufs_brs_add(sb, bindex); + + if (!sysaufs_brs) + return; + + kobj = &au_sbi(sb)->si_kobj; + bend = au_sbend(sb); + for (; bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + snprintf(br->br_name, sizeof(br->br_name), SysaufsBr_PREFIX + "%d", bindex); + err = sysfs_create_file(kobj, &br->br_attr); + if (unlikely(err)) + pr_warning("failed %s under sysfs(%d)\n", + br->br_name, err); + } +} diff --git a/ubuntu/aufs/sysrq.c b/ubuntu/aufs/sysrq.c new file mode 100644 index 000000000000..59d3b521b22a --- /dev/null +++ b/ubuntu/aufs/sysrq.c @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * magic sysrq hanlder + */ + +#include +#include +#include +/* #include */ +#include +#include "aufs.h" + +/* ---------------------------------------------------------------------- */ + +static void sysrq_sb(struct super_block *sb) +{ + char *plevel; + struct au_sbinfo *sbinfo; + struct file *file; + + plevel = au_plevel; + au_plevel = KERN_WARNING; + + sbinfo = au_sbi(sb); + /* since we define pr_fmt, call printk directly */ + printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo)); + printk(KERN_WARNING AUFS_NAME ": superblock\n"); + au_dpri_sb(sb); + +#if 0 + printk(KERN_WARNING AUFS_NAME ": root dentry\n"); + au_dpri_dentry(sb->s_root); + printk(KERN_WARNING AUFS_NAME ": root inode\n"); + au_dpri_inode(sb->s_root->d_inode); +#endif + +#if 0 + do { + int err, i, j, ndentry; + struct au_dcsub_pages dpages; + struct au_dpage *dpage; + + err = au_dpages_init(&dpages, GFP_ATOMIC); + if (unlikely(err)) + break; + err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL); + if (!err) + for (i = 0; i < dpages.ndpage; i++) { + dpage = dpages.dpages + i; + ndentry = dpage->ndentry; + for (j = 0; j < ndentry; j++) + au_dpri_dentry(dpage->dentries[j]); + } + au_dpages_free(&dpages); + } while (0); +#endif + +#if 1 + { + struct inode *i; + printk(KERN_WARNING AUFS_NAME ": isolated inode\n"); + spin_lock(&inode_sb_list_lock); + list_for_each_entry(i, &sb->s_inodes, i_sb_list) { + spin_lock(&i->i_lock); + if (1 || list_empty(&i->i_dentry)) + au_dpri_inode(i); + spin_unlock(&i->i_lock); + } + spin_unlock(&inode_sb_list_lock); + } +#endif + printk(KERN_WARNING AUFS_NAME ": files\n"); + lg_global_lock(files_lglock); + do_file_list_for_each_entry(sb, file) { + umode_t mode; + mode = file->f_dentry->d_inode->i_mode; + if (!special_file(mode) || au_special_file(mode)) + au_dpri_file(file); + } while_file_list_for_each_entry; + lg_global_unlock(files_lglock); + printk(KERN_WARNING AUFS_NAME ": done\n"); + + au_plevel = plevel; +} + +/* ---------------------------------------------------------------------- */ + +/* module parameter */ +static char *aufs_sysrq_key = "a"; +module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO); +MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME); + +static void au_sysrq(int key __maybe_unused) +{ + struct au_sbinfo *sbinfo; + + lockdep_off(); + au_sbilist_lock(); + list_for_each_entry(sbinfo, &au_sbilist.head, si_list) + sysrq_sb(sbinfo->si_sb); + au_sbilist_unlock(); + lockdep_on(); +} + +static struct sysrq_key_op au_sysrq_op = { + .handler = au_sysrq, + .help_msg = "Aufs", + .action_msg = "Aufs", + .enable_mask = SYSRQ_ENABLE_DUMP +}; + +/* ---------------------------------------------------------------------- */ + +int __init au_sysrq_init(void) +{ + int err; + char key; + + err = -1; + key = *aufs_sysrq_key; + if ('a' <= key && key <= 'z') + err = register_sysrq_key(key, &au_sysrq_op); + if (unlikely(err)) + pr_err("err %d, sysrq=%c\n", err, key); + return err; +} + +void au_sysrq_fin(void) +{ + int err; + err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op); + if (unlikely(err)) + pr_err("err %d (ignored)\n", err); +} diff --git a/ubuntu/aufs/vdir.c b/ubuntu/aufs/vdir.c new file mode 100644 index 000000000000..5e4237686650 --- /dev/null +++ b/ubuntu/aufs/vdir.c @@ -0,0 +1,886 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * virtual or vertical directory + */ + +#include +#include "aufs.h" + +static unsigned int calc_size(int nlen) +{ + return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t)); +} + +static int set_deblk_end(union au_vdir_deblk_p *p, + union au_vdir_deblk_p *deblk_end) +{ + if (calc_size(0) <= deblk_end->deblk - p->deblk) { + p->de->de_str.len = 0; + /* smp_mb(); */ + return 0; + } + return -1; /* error */ +} + +/* returns true or false */ +static int is_deblk_end(union au_vdir_deblk_p *p, + union au_vdir_deblk_p *deblk_end) +{ + if (calc_size(0) <= deblk_end->deblk - p->deblk) + return !p->de->de_str.len; + return 1; +} + +static unsigned char *last_deblk(struct au_vdir *vdir) +{ + return vdir->vd_deblk[vdir->vd_nblk - 1]; +} + +/* ---------------------------------------------------------------------- */ + +/* estimate the apropriate size for name hash table */ +unsigned int au_rdhash_est(loff_t sz) +{ + unsigned int n; + + n = UINT_MAX; + sz >>= 10; + if (sz < n) + n = sz; + if (sz < AUFS_RDHASH_DEF) + n = AUFS_RDHASH_DEF; + /* pr_info("n %u\n", n); */ + return n; +} + +/* + * the allocated memory has to be freed by + * au_nhash_wh_free() or au_nhash_de_free(). + */ +int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp) +{ + struct hlist_head *head; + unsigned int u; + + head = kmalloc(sizeof(*nhash->nh_head) * num_hash, gfp); + if (head) { + nhash->nh_num = num_hash; + nhash->nh_head = head; + for (u = 0; u < num_hash; u++) + INIT_HLIST_HEAD(head++); + return 0; /* success */ + } + + return -ENOMEM; +} + +static void nhash_count(struct hlist_head *head) +{ +#if 0 + unsigned long n; + struct hlist_node *pos; + + n = 0; + hlist_for_each(pos, head) + n++; + pr_info("%lu\n", n); +#endif +} + +static void au_nhash_wh_do_free(struct hlist_head *head) +{ + struct au_vdir_wh *tpos; + struct hlist_node *pos, *node; + + hlist_for_each_entry_safe(tpos, pos, node, head, wh_hash) { + /* hlist_del(pos); */ + kfree(tpos); + } +} + +static void au_nhash_de_do_free(struct hlist_head *head) +{ + struct au_vdir_dehstr *tpos; + struct hlist_node *pos, *node; + + hlist_for_each_entry_safe(tpos, pos, node, head, hash) { + /* hlist_del(pos); */ + au_cache_free_vdir_dehstr(tpos); + } +} + +static void au_nhash_do_free(struct au_nhash *nhash, + void (*free)(struct hlist_head *head)) +{ + unsigned int n; + struct hlist_head *head; + + n = nhash->nh_num; + if (!n) + return; + + head = nhash->nh_head; + while (n-- > 0) { + nhash_count(head); + free(head++); + } + kfree(nhash->nh_head); +} + +void au_nhash_wh_free(struct au_nhash *whlist) +{ + au_nhash_do_free(whlist, au_nhash_wh_do_free); +} + +static void au_nhash_de_free(struct au_nhash *delist) +{ + au_nhash_do_free(delist, au_nhash_de_do_free); +} + +/* ---------------------------------------------------------------------- */ + +int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, + int limit) +{ + int num; + unsigned int u, n; + struct hlist_head *head; + struct au_vdir_wh *tpos; + struct hlist_node *pos; + + num = 0; + n = whlist->nh_num; + head = whlist->nh_head; + for (u = 0; u < n; u++, head++) + hlist_for_each_entry(tpos, pos, head, wh_hash) + if (tpos->wh_bindex == btgt && ++num > limit) + return 1; + return 0; +} + +static struct hlist_head *au_name_hash(struct au_nhash *nhash, + unsigned char *name, + unsigned int len) +{ + unsigned int v; + /* const unsigned int magic_bit = 12; */ + + AuDebugOn(!nhash->nh_num || !nhash->nh_head); + + v = 0; + while (len--) + v += *name++; + /* v = hash_long(v, magic_bit); */ + v %= nhash->nh_num; + return nhash->nh_head + v; +} + +static int au_nhash_test_name(struct au_vdir_destr *str, const char *name, + int nlen) +{ + return str->len == nlen && !memcmp(str->name, name, nlen); +} + +/* returns found or not */ +int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen) +{ + struct hlist_head *head; + struct au_vdir_wh *tpos; + struct hlist_node *pos; + struct au_vdir_destr *str; + + head = au_name_hash(whlist, name, nlen); + hlist_for_each_entry(tpos, pos, head, wh_hash) { + str = &tpos->wh_str; + AuDbg("%.*s\n", str->len, str->name); + if (au_nhash_test_name(str, name, nlen)) + return 1; + } + return 0; +} + +/* returns found(true) or not */ +static int test_known(struct au_nhash *delist, char *name, int nlen) +{ + struct hlist_head *head; + struct au_vdir_dehstr *tpos; + struct hlist_node *pos; + struct au_vdir_destr *str; + + head = au_name_hash(delist, name, nlen); + hlist_for_each_entry(tpos, pos, head, hash) { + str = tpos->str; + AuDbg("%.*s\n", str->len, str->name); + if (au_nhash_test_name(str, name, nlen)) + return 1; + } + return 0; +} + +static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino, + unsigned char d_type) +{ +#ifdef CONFIG_AUFS_SHWH + wh->wh_ino = ino; + wh->wh_type = d_type; +#endif +} + +/* ---------------------------------------------------------------------- */ + +int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, + unsigned int d_type, aufs_bindex_t bindex, + unsigned char shwh) +{ + int err; + struct au_vdir_destr *str; + struct au_vdir_wh *wh; + + AuDbg("%.*s\n", nlen, name); + AuDebugOn(!whlist->nh_num || !whlist->nh_head); + + err = -ENOMEM; + wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS); + if (unlikely(!wh)) + goto out; + + err = 0; + wh->wh_bindex = bindex; + if (shwh) + au_shwh_init_wh(wh, ino, d_type); + str = &wh->wh_str; + str->len = nlen; + memcpy(str->name, name, nlen); + hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen)); + /* smp_mb(); */ + +out: + return err; +} + +static int append_deblk(struct au_vdir *vdir) +{ + int err; + unsigned long ul; + const unsigned int deblk_sz = vdir->vd_deblk_sz; + union au_vdir_deblk_p p, deblk_end; + unsigned char **o; + + err = -ENOMEM; + o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1), + GFP_NOFS); + if (unlikely(!o)) + goto out; + + vdir->vd_deblk = o; + p.deblk = kmalloc(deblk_sz, GFP_NOFS); + if (p.deblk) { + ul = vdir->vd_nblk++; + vdir->vd_deblk[ul] = p.deblk; + vdir->vd_last.ul = ul; + vdir->vd_last.p.deblk = p.deblk; + deblk_end.deblk = p.deblk + deblk_sz; + err = set_deblk_end(&p, &deblk_end); + } + +out: + return err; +} + +static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino, + unsigned int d_type, struct au_nhash *delist) +{ + int err; + unsigned int sz; + const unsigned int deblk_sz = vdir->vd_deblk_sz; + union au_vdir_deblk_p p, *room, deblk_end; + struct au_vdir_dehstr *dehstr; + + p.deblk = last_deblk(vdir); + deblk_end.deblk = p.deblk + deblk_sz; + room = &vdir->vd_last.p; + AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk + || !is_deblk_end(room, &deblk_end)); + + sz = calc_size(nlen); + if (unlikely(sz > deblk_end.deblk - room->deblk)) { + err = append_deblk(vdir); + if (unlikely(err)) + goto out; + + p.deblk = last_deblk(vdir); + deblk_end.deblk = p.deblk + deblk_sz; + /* smp_mb(); */ + AuDebugOn(room->deblk != p.deblk); + } + + err = -ENOMEM; + dehstr = au_cache_alloc_vdir_dehstr(); + if (unlikely(!dehstr)) + goto out; + + dehstr->str = &room->de->de_str; + hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen)); + room->de->de_ino = ino; + room->de->de_type = d_type; + room->de->de_str.len = nlen; + memcpy(room->de->de_str.name, name, nlen); + + err = 0; + room->deblk += sz; + if (unlikely(set_deblk_end(room, &deblk_end))) + err = append_deblk(vdir); + /* smp_mb(); */ + +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +void au_vdir_free(struct au_vdir *vdir) +{ + unsigned char **deblk; + + deblk = vdir->vd_deblk; + while (vdir->vd_nblk--) + kfree(*deblk++); + kfree(vdir->vd_deblk); + au_cache_free_vdir(vdir); +} + +static struct au_vdir *alloc_vdir(struct file *file) +{ + struct au_vdir *vdir; + struct super_block *sb; + int err; + + sb = file->f_dentry->d_sb; + SiMustAnyLock(sb); + + err = -ENOMEM; + vdir = au_cache_alloc_vdir(); + if (unlikely(!vdir)) + goto out; + + vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS); + if (unlikely(!vdir->vd_deblk)) + goto out_free; + + vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk; + if (!vdir->vd_deblk_sz) { + /* estimate the apropriate size for deblk */ + vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL); + /* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */ + } + vdir->vd_nblk = 0; + vdir->vd_version = 0; + vdir->vd_jiffy = 0; + err = append_deblk(vdir); + if (!err) + return vdir; /* success */ + + kfree(vdir->vd_deblk); + +out_free: + au_cache_free_vdir(vdir); +out: + vdir = ERR_PTR(err); + return vdir; +} + +static int reinit_vdir(struct au_vdir *vdir) +{ + int err; + union au_vdir_deblk_p p, deblk_end; + + while (vdir->vd_nblk > 1) { + kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); + /* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */ + vdir->vd_nblk--; + } + p.deblk = vdir->vd_deblk[0]; + deblk_end.deblk = p.deblk + vdir->vd_deblk_sz; + err = set_deblk_end(&p, &deblk_end); + /* keep vd_dblk_sz */ + vdir->vd_last.ul = 0; + vdir->vd_last.p.deblk = vdir->vd_deblk[0]; + vdir->vd_version = 0; + vdir->vd_jiffy = 0; + /* smp_mb(); */ + return err; +} + +/* ---------------------------------------------------------------------- */ + +#define AuFillVdir_CALLED 1 +#define AuFillVdir_WHABLE (1 << 1) +#define AuFillVdir_SHWH (1 << 2) +#define au_ftest_fillvdir(flags, name) ((flags) & AuFillVdir_##name) +#define au_fset_fillvdir(flags, name) \ + do { (flags) |= AuFillVdir_##name; } while (0) +#define au_fclr_fillvdir(flags, name) \ + do { (flags) &= ~AuFillVdir_##name; } while (0) + +#ifndef CONFIG_AUFS_SHWH +#undef AuFillVdir_SHWH +#define AuFillVdir_SHWH 0 +#endif + +struct fillvdir_arg { + struct file *file; + struct au_vdir *vdir; + struct au_nhash delist; + struct au_nhash whlist; + aufs_bindex_t bindex; + unsigned int flags; + int err; +}; + +static int fillvdir(void *__arg, const char *__name, int nlen, + loff_t offset __maybe_unused, u64 h_ino, + unsigned int d_type) +{ + struct fillvdir_arg *arg = __arg; + char *name = (void *)__name; + struct super_block *sb; + ino_t ino; + const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH); + + arg->err = 0; + sb = arg->file->f_dentry->d_sb; + au_fset_fillvdir(arg->flags, CALLED); + /* smp_mb(); */ + if (nlen <= AUFS_WH_PFX_LEN + || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { + if (test_known(&arg->delist, name, nlen) + || au_nhash_test_known_wh(&arg->whlist, name, nlen)) + goto out; /* already exists or whiteouted */ + + sb = arg->file->f_dentry->d_sb; + arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino); + if (!arg->err) { + if (unlikely(nlen > AUFS_MAX_NAMELEN)) + d_type = DT_UNKNOWN; + arg->err = append_de(arg->vdir, name, nlen, ino, + d_type, &arg->delist); + } + } else if (au_ftest_fillvdir(arg->flags, WHABLE)) { + name += AUFS_WH_PFX_LEN; + nlen -= AUFS_WH_PFX_LEN; + if (au_nhash_test_known_wh(&arg->whlist, name, nlen)) + goto out; /* already whiteouted */ + + if (shwh) + arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type, + &ino); + if (!arg->err) { + if (nlen <= AUFS_MAX_NAMELEN + AUFS_WH_PFX_LEN) + d_type = DT_UNKNOWN; + arg->err = au_nhash_append_wh + (&arg->whlist, name, nlen, ino, d_type, + arg->bindex, shwh); + } + } + +out: + if (!arg->err) + arg->vdir->vd_jiffy = jiffies; + /* smp_mb(); */ + AuTraceErr(arg->err); + return arg->err; +} + +static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir, + struct au_nhash *whlist, struct au_nhash *delist) +{ +#ifdef CONFIG_AUFS_SHWH + int err; + unsigned int nh, u; + struct hlist_head *head; + struct au_vdir_wh *tpos; + struct hlist_node *pos, *n; + char *p, *o; + struct au_vdir_destr *destr; + + AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH)); + + err = -ENOMEM; + o = p = __getname_gfp(GFP_NOFS); + if (unlikely(!p)) + goto out; + + err = 0; + nh = whlist->nh_num; + memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); + p += AUFS_WH_PFX_LEN; + for (u = 0; u < nh; u++) { + head = whlist->nh_head + u; + hlist_for_each_entry_safe(tpos, pos, n, head, wh_hash) { + destr = &tpos->wh_str; + memcpy(p, destr->name, destr->len); + err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN, + tpos->wh_ino, tpos->wh_type, delist); + if (unlikely(err)) + break; + } + } + + __putname(o); + +out: + AuTraceErr(err); + return err; +#else + return 0; +#endif +} + +static int au_do_read_vdir(struct fillvdir_arg *arg) +{ + int err; + unsigned int rdhash; + loff_t offset; + aufs_bindex_t bend, bindex, bstart; + unsigned char shwh; + struct file *hf, *file; + struct super_block *sb; + + file = arg->file; + sb = file->f_dentry->d_sb; + SiMustAnyLock(sb); + + rdhash = au_sbi(sb)->si_rdhash; + if (!rdhash) + rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL)); + err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS); + if (unlikely(err)) + goto out; + err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS); + if (unlikely(err)) + goto out_delist; + + err = 0; + arg->flags = 0; + shwh = 0; + if (au_opt_test(au_mntflags(sb), SHWH)) { + shwh = 1; + au_fset_fillvdir(arg->flags, SHWH); + } + bstart = au_fbstart(file); + bend = au_fbend_dir(file); + for (bindex = bstart; !err && bindex <= bend; bindex++) { + hf = au_hf_dir(file, bindex); + if (!hf) + continue; + + offset = vfsub_llseek(hf, 0, SEEK_SET); + err = offset; + if (unlikely(offset)) + break; + + arg->bindex = bindex; + au_fclr_fillvdir(arg->flags, WHABLE); + if (shwh + || (bindex != bend + && au_br_whable(au_sbr_perm(sb, bindex)))) + au_fset_fillvdir(arg->flags, WHABLE); + do { + arg->err = 0; + au_fclr_fillvdir(arg->flags, CALLED); + /* smp_mb(); */ + err = vfsub_readdir(hf, fillvdir, arg); + if (err >= 0) + err = arg->err; + } while (!err && au_ftest_fillvdir(arg->flags, CALLED)); + } + + if (!err && shwh) + err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist); + + au_nhash_wh_free(&arg->whlist); + +out_delist: + au_nhash_de_free(&arg->delist); +out: + return err; +} + +static int read_vdir(struct file *file, int may_read) +{ + int err; + unsigned long expire; + unsigned char do_read; + struct fillvdir_arg arg; + struct inode *inode; + struct au_vdir *vdir, *allocated; + + err = 0; + inode = file->f_dentry->d_inode; + IMustLock(inode); + SiMustAnyLock(inode->i_sb); + + allocated = NULL; + do_read = 0; + expire = au_sbi(inode->i_sb)->si_rdcache; + vdir = au_ivdir(inode); + if (!vdir) { + do_read = 1; + vdir = alloc_vdir(file); + err = PTR_ERR(vdir); + if (IS_ERR(vdir)) + goto out; + err = 0; + allocated = vdir; + } else if (may_read + && (inode->i_version != vdir->vd_version + || time_after(jiffies, vdir->vd_jiffy + expire))) { + do_read = 1; + err = reinit_vdir(vdir); + if (unlikely(err)) + goto out; + } + + if (!do_read) + return 0; /* success */ + + arg.file = file; + arg.vdir = vdir; + err = au_do_read_vdir(&arg); + if (!err) { + /* file->f_pos = 0; */ + vdir->vd_version = inode->i_version; + vdir->vd_last.ul = 0; + vdir->vd_last.p.deblk = vdir->vd_deblk[0]; + if (allocated) + au_set_ivdir(inode, allocated); + } else if (allocated) + au_vdir_free(allocated); + +out: + return err; +} + +static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src) +{ + int err, rerr; + unsigned long ul, n; + const unsigned int deblk_sz = src->vd_deblk_sz; + + AuDebugOn(tgt->vd_nblk != 1); + + err = -ENOMEM; + if (tgt->vd_nblk < src->vd_nblk) { + unsigned char **p; + + p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk, + GFP_NOFS); + if (unlikely(!p)) + goto out; + tgt->vd_deblk = p; + } + + if (tgt->vd_deblk_sz != deblk_sz) { + unsigned char *p; + + tgt->vd_deblk_sz = deblk_sz; + p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS); + if (unlikely(!p)) + goto out; + tgt->vd_deblk[0] = p; + } + memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz); + tgt->vd_version = src->vd_version; + tgt->vd_jiffy = src->vd_jiffy; + + n = src->vd_nblk; + for (ul = 1; ul < n; ul++) { + tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz, + GFP_NOFS); + if (unlikely(!tgt->vd_deblk[ul])) + goto out; + tgt->vd_nblk++; + } + tgt->vd_nblk = n; + tgt->vd_last.ul = tgt->vd_last.ul; + tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul]; + tgt->vd_last.p.deblk += src->vd_last.p.deblk + - src->vd_deblk[src->vd_last.ul]; + /* smp_mb(); */ + return 0; /* success */ + +out: + rerr = reinit_vdir(tgt); + BUG_ON(rerr); + return err; +} + +int au_vdir_init(struct file *file) +{ + int err; + struct inode *inode; + struct au_vdir *vdir_cache, *allocated; + + err = read_vdir(file, !file->f_pos); + if (unlikely(err)) + goto out; + + allocated = NULL; + vdir_cache = au_fvdir_cache(file); + if (!vdir_cache) { + vdir_cache = alloc_vdir(file); + err = PTR_ERR(vdir_cache); + if (IS_ERR(vdir_cache)) + goto out; + allocated = vdir_cache; + } else if (!file->f_pos && vdir_cache->vd_version != file->f_version) { + err = reinit_vdir(vdir_cache); + if (unlikely(err)) + goto out; + } else + return 0; /* success */ + + inode = file->f_dentry->d_inode; + err = copy_vdir(vdir_cache, au_ivdir(inode)); + if (!err) { + file->f_version = inode->i_version; + if (allocated) + au_set_fvdir_cache(file, allocated); + } else if (allocated) + au_vdir_free(allocated); + +out: + return err; +} + +static loff_t calc_offset(struct au_vdir *vdir) +{ + loff_t offset; + union au_vdir_deblk_p p; + + p.deblk = vdir->vd_deblk[vdir->vd_last.ul]; + offset = vdir->vd_last.p.deblk - p.deblk; + offset += vdir->vd_deblk_sz * vdir->vd_last.ul; + return offset; +} + +/* returns true or false */ +static int seek_vdir(struct file *file) +{ + int valid; + unsigned int deblk_sz; + unsigned long ul, n; + loff_t offset; + union au_vdir_deblk_p p, deblk_end; + struct au_vdir *vdir_cache; + + valid = 1; + vdir_cache = au_fvdir_cache(file); + offset = calc_offset(vdir_cache); + AuDbg("offset %lld\n", offset); + if (file->f_pos == offset) + goto out; + + vdir_cache->vd_last.ul = 0; + vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0]; + if (!file->f_pos) + goto out; + + valid = 0; + deblk_sz = vdir_cache->vd_deblk_sz; + ul = div64_u64(file->f_pos, deblk_sz); + AuDbg("ul %lu\n", ul); + if (ul >= vdir_cache->vd_nblk) + goto out; + + n = vdir_cache->vd_nblk; + for (; ul < n; ul++) { + p.deblk = vdir_cache->vd_deblk[ul]; + deblk_end.deblk = p.deblk + deblk_sz; + offset = ul; + offset *= deblk_sz; + while (!is_deblk_end(&p, &deblk_end) && offset < file->f_pos) { + unsigned int l; + + l = calc_size(p.de->de_str.len); + offset += l; + p.deblk += l; + } + if (!is_deblk_end(&p, &deblk_end)) { + valid = 1; + vdir_cache->vd_last.ul = ul; + vdir_cache->vd_last.p = p; + break; + } + } + +out: + /* smp_mb(); */ + AuTraceErr(!valid); + return valid; +} + +int au_vdir_fill_de(struct file *file, void *dirent, filldir_t filldir) +{ + int err; + unsigned int l, deblk_sz; + union au_vdir_deblk_p deblk_end; + struct au_vdir *vdir_cache; + struct au_vdir_de *de; + + vdir_cache = au_fvdir_cache(file); + if (!seek_vdir(file)) + return 0; + + deblk_sz = vdir_cache->vd_deblk_sz; + while (1) { + deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; + deblk_end.deblk += deblk_sz; + while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) { + de = vdir_cache->vd_last.p.de; + AuDbg("%.*s, off%lld, i%lu, dt%d\n", + de->de_str.len, de->de_str.name, file->f_pos, + (unsigned long)de->de_ino, de->de_type); + err = filldir(dirent, de->de_str.name, de->de_str.len, + file->f_pos, de->de_ino, de->de_type); + if (unlikely(err)) { + AuTraceErr(err); + /* todo: ignore the error caused by udba? */ + /* return err; */ + return 0; + } + + l = calc_size(de->de_str.len); + vdir_cache->vd_last.p.deblk += l; + file->f_pos += l; + } + if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) { + vdir_cache->vd_last.ul++; + vdir_cache->vd_last.p.deblk + = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; + file->f_pos = deblk_sz * vdir_cache->vd_last.ul; + continue; + } + break; + } + + /* smp_mb(); */ + return 0; +} diff --git a/ubuntu/aufs/vfsub.c b/ubuntu/aufs/vfsub.c new file mode 100644 index 000000000000..301e34c50100 --- /dev/null +++ b/ubuntu/aufs/vfsub.c @@ -0,0 +1,837 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * sub-routines for VFS + */ + +#include +#include +#include +#include +#include +#include +#include "aufs.h" + +int vfsub_update_h_iattr(struct path *h_path, int *did) +{ + int err; + struct kstat st; + struct super_block *h_sb; + + /* for remote fs, leave work for its getattr or d_revalidate */ + /* for bad i_attr fs, handle them in aufs_getattr() */ + /* still some fs may acquire i_mutex. we need to skip them */ + err = 0; + if (!did) + did = &err; + h_sb = h_path->dentry->d_sb; + *did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb)); + if (*did) + err = vfs_getattr(h_path->mnt, h_path->dentry, &st); + + return err; +} + +/* ---------------------------------------------------------------------- */ + +struct file *vfsub_dentry_open(struct path *path, int flags) +{ + struct file *file; + + path_get(path); + file = dentry_open(path->dentry, path->mnt, + flags /* | __FMODE_NONOTIFY */, + current_cred()); + if (!IS_ERR_OR_NULL(file) + && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(path->dentry->d_inode); + + return file; +} + +struct file *vfsub_filp_open(const char *path, int oflags, int mode) +{ + struct file *file; + + lockdep_off(); + file = filp_open(path, + oflags /* | __FMODE_NONOTIFY */, + mode); + lockdep_on(); + if (IS_ERR(file)) + goto out; + vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ + +out: + return file; +} + +int vfsub_kern_path(const char *name, unsigned int flags, struct path *path) +{ + int err; + + err = kern_path(name, flags, path); + if (!err && path->dentry->d_inode) + vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ + return err; +} + +struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, + int len) +{ + struct path path = { + .mnt = NULL + }; + + /* VFS checks it too, but by WARN_ON_ONCE() */ + IMustLock(parent->d_inode); + + path.dentry = lookup_one_len(name, parent, len); + if (IS_ERR(path.dentry)) + goto out; + if (path.dentry->d_inode) + vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ + +out: + AuTraceErrPtr(path.dentry); + return path.dentry; +} + +struct dentry *vfsub_lookup_hash(struct nameidata *nd) +{ + struct path path = { + .mnt = nd->path.mnt + }; + + IMustLock(nd->path.dentry->d_inode); + + path.dentry = lookup_hash(nd); + if (IS_ERR(path.dentry)) + goto out; + if (path.dentry->d_inode) + vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ + +out: + AuTraceErrPtr(path.dentry); + return path.dentry; +} + +/* + * this is "VFS:__lookup_one_len()" which was removed and merged into + * VFS:lookup_one_len() by the commit. + * 6a96ba5 2011-03-14 kill __lookup_one_len() + * this function should always be equivalent to the corresponding part in + * VFS:lookup_one_len(). + */ +int vfsub_name_hash(const char *name, struct qstr *this, int len) +{ + unsigned long hash; + unsigned int c; + + this->name = name; + this->len = len; + if (!len) + return -EACCES; + + hash = init_name_hash(); + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return -EACCES; + hash = partial_name_hash(c, hash); + } + this->hash = end_name_hash(hash); + return 0; +} + +/* ---------------------------------------------------------------------- */ + +struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, + struct dentry *d2, struct au_hinode *hdir2) +{ + struct dentry *d; + + lockdep_off(); + d = lock_rename(d1, d2); + lockdep_on(); + au_hn_suspend(hdir1); + if (hdir1 != hdir2) + au_hn_suspend(hdir2); + + return d; +} + +void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, + struct dentry *d2, struct au_hinode *hdir2) +{ + au_hn_resume(hdir1); + if (hdir1 != hdir2) + au_hn_resume(hdir2); + lockdep_off(); + unlock_rename(d1, d2); + lockdep_on(); +} + +/* ---------------------------------------------------------------------- */ + +int vfsub_create(struct inode *dir, struct path *path, int mode) +{ + int err; + struct dentry *d; + + IMustLock(dir); + + d = path->dentry; + path->dentry = d->d_parent; + err = security_path_mknod(path, d, mode, 0); + path->dentry = d; + if (unlikely(err)) + goto out; + + if (au_test_fs_null_nd(dir->i_sb)) + err = vfs_create(dir, path->dentry, mode, NULL); + else { + struct nameidata h_nd; + + memset(&h_nd, 0, sizeof(h_nd)); + h_nd.flags = LOOKUP_CREATE; + h_nd.intent.open.flags = O_CREAT + | vfsub_fmode_to_uint(FMODE_READ); + h_nd.intent.open.create_mode = mode; + h_nd.path.dentry = path->dentry->d_parent; + h_nd.path.mnt = path->mnt; + path_get(&h_nd.path); + err = vfs_create(dir, path->dentry, mode, &h_nd); + path_put(&h_nd.path); + } + + if (!err) { + struct path tmp = *path; + int did; + + vfsub_update_h_iattr(&tmp, &did); + if (did) { + tmp.dentry = path->dentry->d_parent; + vfsub_update_h_iattr(&tmp, /*did*/NULL); + } + /*ignore*/ + } + +out: + return err; +} + +int vfsub_symlink(struct inode *dir, struct path *path, const char *symname) +{ + int err; + struct dentry *d; + + IMustLock(dir); + + d = path->dentry; + path->dentry = d->d_parent; + err = security_path_symlink(path, d, symname); + path->dentry = d; + if (unlikely(err)) + goto out; + + err = vfs_symlink(dir, path->dentry, symname); + if (!err) { + struct path tmp = *path; + int did; + + vfsub_update_h_iattr(&tmp, &did); + if (did) { + tmp.dentry = path->dentry->d_parent; + vfsub_update_h_iattr(&tmp, /*did*/NULL); + } + /*ignore*/ + } + +out: + return err; +} + +int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev) +{ + int err; + struct dentry *d; + + IMustLock(dir); + + d = path->dentry; + path->dentry = d->d_parent; + err = security_path_mknod(path, d, mode, new_encode_dev(dev)); + path->dentry = d; + if (unlikely(err)) + goto out; + + err = vfs_mknod(dir, path->dentry, mode, dev); + if (!err) { + struct path tmp = *path; + int did; + + vfsub_update_h_iattr(&tmp, &did); + if (did) { + tmp.dentry = path->dentry->d_parent; + vfsub_update_h_iattr(&tmp, /*did*/NULL); + } + /*ignore*/ + } + +out: + return err; +} + +static int au_test_nlink(struct inode *inode) +{ + const unsigned int link_max = UINT_MAX >> 1; /* rough margin */ + + if (!au_test_fs_no_limit_nlink(inode->i_sb) + || inode->i_nlink < link_max) + return 0; + return -EMLINK; +} + +int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path) +{ + int err; + struct dentry *d; + + IMustLock(dir); + + err = au_test_nlink(src_dentry->d_inode); + if (unlikely(err)) + return err; + + d = path->dentry; + path->dentry = d->d_parent; + err = security_path_link(src_dentry, path, d); + path->dentry = d; + if (unlikely(err)) + goto out; + + lockdep_off(); + err = vfs_link(src_dentry, dir, path->dentry); + lockdep_on(); + if (!err) { + struct path tmp = *path; + int did; + + /* fuse has different memory inode for the same inumber */ + vfsub_update_h_iattr(&tmp, &did); + if (did) { + tmp.dentry = path->dentry->d_parent; + vfsub_update_h_iattr(&tmp, /*did*/NULL); + tmp.dentry = src_dentry; + vfsub_update_h_iattr(&tmp, /*did*/NULL); + } + /*ignore*/ + } + +out: + return err; +} + +int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry, + struct inode *dir, struct path *path) +{ + int err; + struct path tmp = { + .mnt = path->mnt + }; + struct dentry *d; + + IMustLock(dir); + IMustLock(src_dir); + + d = path->dentry; + path->dentry = d->d_parent; + tmp.dentry = src_dentry->d_parent; + err = security_path_rename(&tmp, src_dentry, path, d); + path->dentry = d; + if (unlikely(err)) + goto out; + + lockdep_off(); + err = vfs_rename(src_dir, src_dentry, dir, path->dentry); + lockdep_on(); + if (!err) { + int did; + + tmp.dentry = d->d_parent; + vfsub_update_h_iattr(&tmp, &did); + if (did) { + tmp.dentry = src_dentry; + vfsub_update_h_iattr(&tmp, /*did*/NULL); + tmp.dentry = src_dentry->d_parent; + vfsub_update_h_iattr(&tmp, /*did*/NULL); + } + /*ignore*/ + } + +out: + return err; +} + +int vfsub_mkdir(struct inode *dir, struct path *path, int mode) +{ + int err; + struct dentry *d; + + IMustLock(dir); + + d = path->dentry; + path->dentry = d->d_parent; + err = security_path_mkdir(path, d, mode); + path->dentry = d; + if (unlikely(err)) + goto out; + + err = vfs_mkdir(dir, path->dentry, mode); + if (!err) { + struct path tmp = *path; + int did; + + vfsub_update_h_iattr(&tmp, &did); + if (did) { + tmp.dentry = path->dentry->d_parent; + vfsub_update_h_iattr(&tmp, /*did*/NULL); + } + /*ignore*/ + } + +out: + return err; +} + +int vfsub_rmdir(struct inode *dir, struct path *path) +{ + int err; + struct dentry *d; + + IMustLock(dir); + + d = path->dentry; + path->dentry = d->d_parent; + err = security_path_rmdir(path, d); + path->dentry = d; + if (unlikely(err)) + goto out; + + lockdep_off(); + err = vfs_rmdir(dir, path->dentry); + lockdep_on(); + if (!err) { + struct path tmp = { + .dentry = path->dentry->d_parent, + .mnt = path->mnt + }; + + vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ + } + +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* todo: support mmap_sem? */ +ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, + loff_t *ppos) +{ + ssize_t err; + + lockdep_off(); + err = vfs_read(file, ubuf, count, ppos); + lockdep_on(); + if (err >= 0) + vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ + return err; +} + +/* todo: kernel_read()? */ +ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, + loff_t *ppos) +{ + ssize_t err; + mm_segment_t oldfs; + union { + void *k; + char __user *u; + } buf; + + buf.k = kbuf; + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = vfsub_read_u(file, buf.u, count, ppos); + set_fs(oldfs); + return err; +} + +ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, + loff_t *ppos) +{ + ssize_t err; + + lockdep_off(); + err = vfs_write(file, ubuf, count, ppos); + lockdep_on(); + if (err >= 0) + vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ + return err; +} + +ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos) +{ + ssize_t err; + mm_segment_t oldfs; + union { + void *k; + const char __user *u; + } buf; + + buf.k = kbuf; + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = vfsub_write_u(file, buf.u, count, ppos); + set_fs(oldfs); + return err; +} + +int vfsub_flush(struct file *file, fl_owner_t id) +{ + int err; + + err = 0; + if (file->f_op && file->f_op->flush) { + if (!au_test_nfs(file->f_dentry->d_sb)) + err = file->f_op->flush(file, id); + else { + lockdep_off(); + err = file->f_op->flush(file, id); + lockdep_on(); + } + if (!err) + vfsub_update_h_iattr(&file->f_path, /*did*/NULL); + /*ignore*/ + } + return err; +} + +int vfsub_readdir(struct file *file, filldir_t filldir, void *arg) +{ + int err; + + lockdep_off(); + err = vfs_readdir(file, filldir, arg); + lockdep_on(); + if (err >= 0) + vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ + return err; +} + +long vfsub_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + long err; + + lockdep_off(); + err = do_splice_to(in, ppos, pipe, len, flags); + lockdep_on(); + file_accessed(in); + if (err >= 0) + vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/ + return err; +} + +long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + long err; + + lockdep_off(); + err = do_splice_from(pipe, out, ppos, len, flags); + lockdep_on(); + if (err >= 0) + vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/ + return err; +} + +int vfsub_fsync(struct file *file, struct path *path, int datasync) +{ + int err; + + /* file can be NULL */ + lockdep_off(); + err = vfs_fsync(file, datasync); + lockdep_on(); + if (!err) { + if (!path) { + AuDebugOn(!file); + path = &file->f_path; + } + vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ + } + return err; +} + +/* cf. open.c:do_sys_truncate() and do_sys_ftruncate() */ +int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, + struct file *h_file) +{ + int err; + struct inode *h_inode; + + h_inode = h_path->dentry->d_inode; + if (!h_file) { + err = mnt_want_write(h_path->mnt); + if (err) + goto out; + err = inode_permission(h_inode, MAY_WRITE); + if (err) + goto out_mnt; + err = get_write_access(h_inode); + if (err) + goto out_mnt; + err = break_lease(h_inode, O_WRONLY); + if (err) + goto out_inode; + } + + err = locks_verify_truncate(h_inode, h_file, length); + if (!err) + err = security_path_truncate(h_path); + if (!err) { + lockdep_off(); + err = do_truncate(h_path->dentry, length, attr, h_file); + lockdep_on(); + } + +out_inode: + if (!h_file) + put_write_access(h_inode); +out_mnt: + if (!h_file) + mnt_drop_write(h_path->mnt); +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +struct au_vfsub_mkdir_args { + int *errp; + struct inode *dir; + struct path *path; + int mode; +}; + +static void au_call_vfsub_mkdir(void *args) +{ + struct au_vfsub_mkdir_args *a = args; + *a->errp = vfsub_mkdir(a->dir, a->path, a->mode); +} + +int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode) +{ + int err, do_sio, wkq_err; + + do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); + if (!do_sio) + err = vfsub_mkdir(dir, path, mode); + else { + struct au_vfsub_mkdir_args args = { + .errp = &err, + .dir = dir, + .path = path, + .mode = mode + }; + wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args); + if (unlikely(wkq_err)) + err = wkq_err; + } + + return err; +} + +struct au_vfsub_rmdir_args { + int *errp; + struct inode *dir; + struct path *path; +}; + +static void au_call_vfsub_rmdir(void *args) +{ + struct au_vfsub_rmdir_args *a = args; + *a->errp = vfsub_rmdir(a->dir, a->path); +} + +int vfsub_sio_rmdir(struct inode *dir, struct path *path) +{ + int err, do_sio, wkq_err; + + do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); + if (!do_sio) + err = vfsub_rmdir(dir, path); + else { + struct au_vfsub_rmdir_args args = { + .errp = &err, + .dir = dir, + .path = path + }; + wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args); + if (unlikely(wkq_err)) + err = wkq_err; + } + + return err; +} + +/* ---------------------------------------------------------------------- */ + +struct notify_change_args { + int *errp; + struct path *path; + struct iattr *ia; +}; + +static void call_notify_change(void *args) +{ + struct notify_change_args *a = args; + struct inode *h_inode; + + h_inode = a->path->dentry->d_inode; + IMustLock(h_inode); + + *a->errp = -EPERM; + if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) { + *a->errp = notify_change(a->path->dentry, a->ia); + if (!*a->errp) + vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/ + } + AuTraceErr(*a->errp); +} + +int vfsub_notify_change(struct path *path, struct iattr *ia) +{ + int err; + struct notify_change_args args = { + .errp = &err, + .path = path, + .ia = ia + }; + + call_notify_change(&args); + + return err; +} + +int vfsub_sio_notify_change(struct path *path, struct iattr *ia) +{ + int err, wkq_err; + struct notify_change_args args = { + .errp = &err, + .path = path, + .ia = ia + }; + + wkq_err = au_wkq_wait(call_notify_change, &args); + if (unlikely(wkq_err)) + err = wkq_err; + + return err; +} + +/* ---------------------------------------------------------------------- */ + +struct unlink_args { + int *errp; + struct inode *dir; + struct path *path; +}; + +static void call_unlink(void *args) +{ + struct unlink_args *a = args; + struct dentry *d = a->path->dentry; + struct inode *h_inode; + const int stop_sillyrename = (au_test_nfs(d->d_sb) + && d->d_count == 1); + + IMustLock(a->dir); + + a->path->dentry = d->d_parent; + *a->errp = security_path_unlink(a->path, d); + a->path->dentry = d; + if (unlikely(*a->errp)) + return; + + if (!stop_sillyrename) + dget(d); + h_inode = d->d_inode; + if (h_inode) + ihold(h_inode); + + lockdep_off(); + *a->errp = vfs_unlink(a->dir, d); + lockdep_on(); + if (!*a->errp) { + struct path tmp = { + .dentry = d->d_parent, + .mnt = a->path->mnt + }; + vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ + } + + if (!stop_sillyrename) + dput(d); + if (h_inode) + iput(h_inode); + + AuTraceErr(*a->errp); +} + +/* + * @dir: must be locked. + * @dentry: target dentry. + */ +int vfsub_unlink(struct inode *dir, struct path *path, int force) +{ + int err; + struct unlink_args args = { + .errp = &err, + .dir = dir, + .path = path + }; + + if (!force) + call_unlink(&args); + else { + int wkq_err; + + wkq_err = au_wkq_wait(call_unlink, &args); + if (unlikely(wkq_err)) + err = wkq_err; + } + + return err; +} diff --git a/ubuntu/aufs/vfsub.h b/ubuntu/aufs/vfsub.h new file mode 100644 index 000000000000..fe0751811bbf --- /dev/null +++ b/ubuntu/aufs/vfsub.h @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * sub-routines for VFS + */ + +#ifndef __AUFS_VFSUB_H__ +#define __AUFS_VFSUB_H__ + +#ifdef __KERNEL__ + +#include +#include +#include "debug.h" + +/* copied from linux/fs/internal.h */ +/* todo: BAD approach!! */ +DECLARE_BRLOCK(vfsmount_lock); +extern void file_sb_list_del(struct file *f); +extern spinlock_t inode_sb_list_lock; + +/* copied from linux/fs/file_table.c */ +DECLARE_LGLOCK(files_lglock); +#ifdef CONFIG_SMP +/* + * These macros iterate all files on all CPUs for a given superblock. + * files_lglock must be held globally. + */ +#define do_file_list_for_each_entry(__sb, __file) \ +{ \ + int i; \ + for_each_possible_cpu(i) { \ + struct list_head *list; \ + list = per_cpu_ptr((__sb)->s_files, i); \ + list_for_each_entry((__file), list, f_u.fu_list) + +#define while_file_list_for_each_entry \ + } \ +} + +#else + +#define do_file_list_for_each_entry(__sb, __file) \ +{ \ + struct list_head *list; \ + list = &(sb)->s_files; \ + list_for_each_entry((__file), list, f_u.fu_list) + +#define while_file_list_for_each_entry \ +} +#endif + +/* ---------------------------------------------------------------------- */ + +/* lock subclass for lower inode */ +/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */ +/* reduce? gave up. */ +enum { + AuLsc_I_Begin = I_MUTEX_QUOTA, /* 4 */ + AuLsc_I_PARENT, /* lower inode, parent first */ + AuLsc_I_PARENT2, /* copyup dirs */ + AuLsc_I_PARENT3, /* copyup wh */ + AuLsc_I_CHILD, + AuLsc_I_CHILD2, + AuLsc_I_End +}; + +/* to debug easier, do not make them inlined functions */ +#define MtxMustLock(mtx) AuDebugOn(!mutex_is_locked(mtx)) +#define IMustLock(i) MtxMustLock(&(i)->i_mutex) + +/* ---------------------------------------------------------------------- */ + +static inline void vfsub_drop_nlink(struct inode *inode) +{ + AuDebugOn(!inode->i_nlink); + drop_nlink(inode); +} + +static inline void vfsub_dead_dir(struct inode *inode) +{ + AuDebugOn(!S_ISDIR(inode->i_mode)); + inode->i_flags |= S_DEAD; + clear_nlink(inode); +} + +/* ---------------------------------------------------------------------- */ + +int vfsub_update_h_iattr(struct path *h_path, int *did); +struct file *vfsub_dentry_open(struct path *path, int flags); +struct file *vfsub_filp_open(const char *path, int oflags, int mode); +int vfsub_kern_path(const char *name, unsigned int flags, struct path *path); +struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, + int len); +struct dentry *vfsub_lookup_hash(struct nameidata *nd); +int vfsub_name_hash(const char *name, struct qstr *this, int len); + +/* ---------------------------------------------------------------------- */ + +struct au_hinode; +struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, + struct dentry *d2, struct au_hinode *hdir2); +void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, + struct dentry *d2, struct au_hinode *hdir2); + +int vfsub_create(struct inode *dir, struct path *path, int mode); +int vfsub_symlink(struct inode *dir, struct path *path, + const char *symname); +int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev); +int vfsub_link(struct dentry *src_dentry, struct inode *dir, + struct path *path); +int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry, + struct inode *hdir, struct path *path); +int vfsub_mkdir(struct inode *dir, struct path *path, int mode); +int vfsub_rmdir(struct inode *dir, struct path *path); + +/* ---------------------------------------------------------------------- */ + +ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, + loff_t *ppos); +ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, + loff_t *ppos); +ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, + loff_t *ppos); +ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, + loff_t *ppos); +int vfsub_flush(struct file *file, fl_owner_t id); +int vfsub_readdir(struct file *file, filldir_t filldir, void *arg); + +static inline unsigned int vfsub_file_flags(struct file *file) +{ + unsigned int flags; + + spin_lock(&file->f_lock); + flags = file->f_flags; + spin_unlock(&file->f_lock); + + return flags; +} + +static inline void vfsub_file_accessed(struct file *h_file) +{ + file_accessed(h_file); + vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/ +} + +static inline void vfsub_touch_atime(struct vfsmount *h_mnt, + struct dentry *h_dentry) +{ + struct path h_path = { + .dentry = h_dentry, + .mnt = h_mnt + }; + touch_atime(h_mnt, h_dentry); + vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ +} + +long vfsub_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); +long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags); +int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, + struct file *h_file); +int vfsub_fsync(struct file *file, struct path *path, int datasync); + +/* ---------------------------------------------------------------------- */ + +static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t err; + + lockdep_off(); + err = vfs_llseek(file, offset, origin); + lockdep_on(); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* dirty workaround for strict type of fmode_t */ +union vfsub_fmu { + fmode_t fm; + unsigned int ui; +}; + +static inline unsigned int vfsub_fmode_to_uint(fmode_t fm) +{ + union vfsub_fmu u = { + .fm = fm + }; + + BUILD_BUG_ON(sizeof(u.fm) != sizeof(u.ui)); + + return u.ui; +} + +static inline fmode_t vfsub_uint_to_fmode(unsigned int ui) +{ + union vfsub_fmu u = { + .ui = ui + }; + + return u.fm; +} + +/* ---------------------------------------------------------------------- */ + +int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode); +int vfsub_sio_rmdir(struct inode *dir, struct path *path); +int vfsub_sio_notify_change(struct path *path, struct iattr *ia); +int vfsub_notify_change(struct path *path, struct iattr *ia); +int vfsub_unlink(struct inode *dir, struct path *path, int force); + +#endif /* __KERNEL__ */ +#endif /* __AUFS_VFSUB_H__ */ diff --git a/ubuntu/aufs/wbr_policy.c b/ubuntu/aufs/wbr_policy.c new file mode 100644 index 000000000000..dec340a4ef0d --- /dev/null +++ b/ubuntu/aufs/wbr_policy.c @@ -0,0 +1,700 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * policies for selecting one among multiple writable branches + */ + +#include +#include "aufs.h" + +/* subset of cpup_attr() */ +static noinline_for_stack +int au_cpdown_attr(struct path *h_path, struct dentry *h_src) +{ + int err, sbits; + struct iattr ia; + struct inode *h_isrc; + + h_isrc = h_src->d_inode; + ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID; + ia.ia_mode = h_isrc->i_mode; + ia.ia_uid = h_isrc->i_uid; + ia.ia_gid = h_isrc->i_gid; + sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID)); + au_cpup_attr_flags(h_path->dentry->d_inode, h_isrc); + err = vfsub_sio_notify_change(h_path, &ia); + + /* is this nfs only? */ + if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) { + ia.ia_valid = ATTR_FORCE | ATTR_MODE; + ia.ia_mode = h_isrc->i_mode; + err = vfsub_sio_notify_change(h_path, &ia); + } + + return err; +} + +#define AuCpdown_PARENT_OPQ 1 +#define AuCpdown_WHED (1 << 1) +#define AuCpdown_MADE_DIR (1 << 2) +#define AuCpdown_DIROPQ (1 << 3) +#define au_ftest_cpdown(flags, name) ((flags) & AuCpdown_##name) +#define au_fset_cpdown(flags, name) \ + do { (flags) |= AuCpdown_##name; } while (0) +#define au_fclr_cpdown(flags, name) \ + do { (flags) &= ~AuCpdown_##name; } while (0) + +struct au_cpdown_dir_args { + struct dentry *parent; + unsigned int flags; +}; + +static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst, + struct au_cpdown_dir_args *a) +{ + int err; + struct dentry *opq_dentry; + + opq_dentry = au_diropq_create(dentry, bdst); + err = PTR_ERR(opq_dentry); + if (IS_ERR(opq_dentry)) + goto out; + dput(opq_dentry); + au_fset_cpdown(a->flags, DIROPQ); + +out: + return err; +} + +static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent, + struct inode *dir, aufs_bindex_t bdst) +{ + int err; + struct path h_path; + struct au_branch *br; + + br = au_sbr(dentry->d_sb, bdst); + h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br); + err = PTR_ERR(h_path.dentry); + if (IS_ERR(h_path.dentry)) + goto out; + + err = 0; + if (h_path.dentry->d_inode) { + h_path.mnt = br->br_mnt; + err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path, + dentry); + } + dput(h_path.dentry); + +out: + return err; +} + +static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst, + struct dentry *h_parent, void *arg) +{ + int err, rerr; + aufs_bindex_t bopq, bstart; + struct path h_path; + struct dentry *parent; + struct inode *h_dir, *h_inode, *inode, *dir; + struct au_cpdown_dir_args *args = arg; + + bstart = au_dbstart(dentry); + /* dentry is di-locked */ + parent = dget_parent(dentry); + dir = parent->d_inode; + h_dir = h_parent->d_inode; + AuDebugOn(h_dir != au_h_iptr(dir, bdst)); + IMustLock(h_dir); + + err = au_lkup_neg(dentry, bdst); + if (unlikely(err < 0)) + goto out; + h_path.dentry = au_h_dptr(dentry, bdst); + h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst); + err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path, + S_IRWXU | S_IRUGO | S_IXUGO); + if (unlikely(err)) + goto out_put; + au_fset_cpdown(args->flags, MADE_DIR); + + bopq = au_dbdiropq(dentry); + au_fclr_cpdown(args->flags, WHED); + au_fclr_cpdown(args->flags, DIROPQ); + if (au_dbwh(dentry) == bdst) + au_fset_cpdown(args->flags, WHED); + if (!au_ftest_cpdown(args->flags, PARENT_OPQ) && bopq <= bdst) + au_fset_cpdown(args->flags, PARENT_OPQ); + h_inode = h_path.dentry->d_inode; + mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); + if (au_ftest_cpdown(args->flags, WHED)) { + err = au_cpdown_dir_opq(dentry, bdst, args); + if (unlikely(err)) { + mutex_unlock(&h_inode->i_mutex); + goto out_dir; + } + } + + err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart)); + mutex_unlock(&h_inode->i_mutex); + if (unlikely(err)) + goto out_opq; + + if (au_ftest_cpdown(args->flags, WHED)) { + err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst); + if (unlikely(err)) + goto out_opq; + } + + inode = dentry->d_inode; + if (au_ibend(inode) < bdst) + au_set_ibend(inode, bdst); + au_set_h_iptr(inode, bdst, au_igrab(h_inode), + au_hi_flags(inode, /*isdir*/1)); + goto out; /* success */ + + /* revert */ +out_opq: + if (au_ftest_cpdown(args->flags, DIROPQ)) { + mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); + rerr = au_diropq_remove(dentry, bdst); + mutex_unlock(&h_inode->i_mutex); + if (unlikely(rerr)) { + AuIOErr("failed removing diropq for %.*s b%d (%d)\n", + AuDLNPair(dentry), bdst, rerr); + err = -EIO; + goto out; + } + } +out_dir: + if (au_ftest_cpdown(args->flags, MADE_DIR)) { + rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path); + if (unlikely(rerr)) { + AuIOErr("failed removing %.*s b%d (%d)\n", + AuDLNPair(dentry), bdst, rerr); + err = -EIO; + } + } +out_put: + au_set_h_dptr(dentry, bdst, NULL); + if (au_dbend(dentry) == bdst) + au_update_dbend(dentry); +out: + dput(parent); + return err; +} + +int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst) +{ + int err; + struct au_cpdown_dir_args args = { + .parent = dget_parent(dentry), + .flags = 0 + }; + + err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &args); + dput(args.parent); + + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* policies for create */ + +static int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex) +{ + int err, i, j, ndentry; + aufs_bindex_t bopq; + struct au_dcsub_pages dpages; + struct au_dpage *dpage; + struct dentry **dentries, *parent, *d; + + err = au_dpages_init(&dpages, GFP_NOFS); + if (unlikely(err)) + goto out; + parent = dget_parent(dentry); + err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0); + if (unlikely(err)) + goto out_free; + + err = bindex; + for (i = 0; i < dpages.ndpage; i++) { + dpage = dpages.dpages + i; + dentries = dpage->dentries; + ndentry = dpage->ndentry; + for (j = 0; j < ndentry; j++) { + d = dentries[j]; + di_read_lock_parent2(d, !AuLock_IR); + bopq = au_dbdiropq(d); + di_read_unlock(d, !AuLock_IR); + if (bopq >= 0 && bopq < err) + err = bopq; + } + } + +out_free: + dput(parent); + au_dpages_free(&dpages); +out: + return err; +} + +static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex) +{ + for (; bindex >= 0; bindex--) + if (!au_br_rdonly(au_sbr(sb, bindex))) + return bindex; + return -EROFS; +} + +/* top down parent */ +static int au_wbr_create_tdp(struct dentry *dentry, int isdir __maybe_unused) +{ + int err; + aufs_bindex_t bstart, bindex; + struct super_block *sb; + struct dentry *parent, *h_parent; + + sb = dentry->d_sb; + bstart = au_dbstart(dentry); + err = bstart; + if (!au_br_rdonly(au_sbr(sb, bstart))) + goto out; + + err = -EROFS; + parent = dget_parent(dentry); + for (bindex = au_dbstart(parent); bindex < bstart; bindex++) { + h_parent = au_h_dptr(parent, bindex); + if (!h_parent || !h_parent->d_inode) + continue; + + if (!au_br_rdonly(au_sbr(sb, bindex))) { + err = bindex; + break; + } + } + dput(parent); + + /* bottom up here */ + if (unlikely(err < 0)) { + err = au_wbr_bu(sb, bstart - 1); + if (err >= 0) + err = au_wbr_nonopq(dentry, err); + } + +out: + AuDbg("b%d\n", err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* an exception for the policy other than tdp */ +static int au_wbr_create_exp(struct dentry *dentry) +{ + int err; + aufs_bindex_t bwh, bdiropq; + struct dentry *parent; + + err = -1; + bwh = au_dbwh(dentry); + parent = dget_parent(dentry); + bdiropq = au_dbdiropq(parent); + if (bwh >= 0) { + if (bdiropq >= 0) + err = min(bdiropq, bwh); + else + err = bwh; + AuDbg("%d\n", err); + } else if (bdiropq >= 0) { + err = bdiropq; + AuDbg("%d\n", err); + } + dput(parent); + + if (err >= 0) + err = au_wbr_nonopq(dentry, err); + + if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err))) + err = -1; + + AuDbg("%d\n", err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* round robin */ +static int au_wbr_create_init_rr(struct super_block *sb) +{ + int err; + + err = au_wbr_bu(sb, au_sbend(sb)); + atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */ + /* smp_mb(); */ + + AuDbg("b%d\n", err); + return err; +} + +static int au_wbr_create_rr(struct dentry *dentry, int isdir) +{ + int err, nbr; + unsigned int u; + aufs_bindex_t bindex, bend; + struct super_block *sb; + atomic_t *next; + + err = au_wbr_create_exp(dentry); + if (err >= 0) + goto out; + + sb = dentry->d_sb; + next = &au_sbi(sb)->si_wbr_rr_next; + bend = au_sbend(sb); + nbr = bend + 1; + for (bindex = 0; bindex <= bend; bindex++) { + if (!isdir) { + err = atomic_dec_return(next) + 1; + /* modulo for 0 is meaningless */ + if (unlikely(!err)) + err = atomic_dec_return(next) + 1; + } else + err = atomic_read(next); + AuDbg("%d\n", err); + u = err; + err = u % nbr; + AuDbg("%d\n", err); + if (!au_br_rdonly(au_sbr(sb, err))) + break; + err = -EROFS; + } + + if (err >= 0) + err = au_wbr_nonopq(dentry, err); + +out: + AuDbg("%d\n", err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* most free space */ +static void au_mfs(struct dentry *dentry) +{ + struct super_block *sb; + struct au_branch *br; + struct au_wbr_mfs *mfs; + aufs_bindex_t bindex, bend; + int err; + unsigned long long b, bavail; + struct path h_path; + /* reduce the stack usage */ + struct kstatfs *st; + + st = kmalloc(sizeof(*st), GFP_NOFS); + if (unlikely(!st)) { + AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM); + return; + } + + bavail = 0; + sb = dentry->d_sb; + mfs = &au_sbi(sb)->si_wbr_mfs; + MtxMustLock(&mfs->mfs_lock); + mfs->mfs_bindex = -EROFS; + mfs->mfsrr_bytes = 0; + bend = au_sbend(sb); + for (bindex = 0; bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + if (au_br_rdonly(br)) + continue; + + /* sb->s_root for NFS is unreliable */ + h_path.mnt = br->br_mnt; + h_path.dentry = h_path.mnt->mnt_root; + err = vfs_statfs(&h_path, st); + if (unlikely(err)) { + AuWarn1("failed statfs, b%d, %d\n", bindex, err); + continue; + } + + /* when the available size is equal, select the lower one */ + BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail) + || sizeof(b) < sizeof(st->f_bsize)); + b = st->f_bavail * st->f_bsize; + br->br_wbr->wbr_bytes = b; + if (b >= bavail) { + bavail = b; + mfs->mfs_bindex = bindex; + mfs->mfs_jiffy = jiffies; + } + } + + mfs->mfsrr_bytes = bavail; + AuDbg("b%d\n", mfs->mfs_bindex); + kfree(st); +} + +static int au_wbr_create_mfs(struct dentry *dentry, int isdir __maybe_unused) +{ + int err; + struct super_block *sb; + struct au_wbr_mfs *mfs; + + err = au_wbr_create_exp(dentry); + if (err >= 0) + goto out; + + sb = dentry->d_sb; + mfs = &au_sbi(sb)->si_wbr_mfs; + mutex_lock(&mfs->mfs_lock); + if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire) + || mfs->mfs_bindex < 0 + || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex))) + au_mfs(dentry); + mutex_unlock(&mfs->mfs_lock); + err = mfs->mfs_bindex; + + if (err >= 0) + err = au_wbr_nonopq(dentry, err); + +out: + AuDbg("b%d\n", err); + return err; +} + +static int au_wbr_create_init_mfs(struct super_block *sb) +{ + struct au_wbr_mfs *mfs; + + mfs = &au_sbi(sb)->si_wbr_mfs; + mutex_init(&mfs->mfs_lock); + mfs->mfs_jiffy = 0; + mfs->mfs_bindex = -EROFS; + + return 0; +} + +static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused) +{ + mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock); + return 0; +} + +/* ---------------------------------------------------------------------- */ + +/* most free space and then round robin */ +static int au_wbr_create_mfsrr(struct dentry *dentry, int isdir) +{ + int err; + struct au_wbr_mfs *mfs; + + err = au_wbr_create_mfs(dentry, isdir); + if (err >= 0) { + mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs; + mutex_lock(&mfs->mfs_lock); + if (mfs->mfsrr_bytes < mfs->mfsrr_watermark) + err = au_wbr_create_rr(dentry, isdir); + mutex_unlock(&mfs->mfs_lock); + } + + AuDbg("b%d\n", err); + return err; +} + +static int au_wbr_create_init_mfsrr(struct super_block *sb) +{ + int err; + + au_wbr_create_init_mfs(sb); /* ignore */ + err = au_wbr_create_init_rr(sb); + + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* top down parent and most free space */ +static int au_wbr_create_pmfs(struct dentry *dentry, int isdir) +{ + int err, e2; + unsigned long long b; + aufs_bindex_t bindex, bstart, bend; + struct super_block *sb; + struct dentry *parent, *h_parent; + struct au_branch *br; + + err = au_wbr_create_tdp(dentry, isdir); + if (unlikely(err < 0)) + goto out; + parent = dget_parent(dentry); + bstart = au_dbstart(parent); + bend = au_dbtaildir(parent); + if (bstart == bend) + goto out_parent; /* success */ + + e2 = au_wbr_create_mfs(dentry, isdir); + if (e2 < 0) + goto out_parent; /* success */ + + /* when the available size is equal, select upper one */ + sb = dentry->d_sb; + br = au_sbr(sb, err); + b = br->br_wbr->wbr_bytes; + AuDbg("b%d, %llu\n", err, b); + + for (bindex = bstart; bindex <= bend; bindex++) { + h_parent = au_h_dptr(parent, bindex); + if (!h_parent || !h_parent->d_inode) + continue; + + br = au_sbr(sb, bindex); + if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) { + b = br->br_wbr->wbr_bytes; + err = bindex; + AuDbg("b%d, %llu\n", err, b); + } + } + + if (err >= 0) + err = au_wbr_nonopq(dentry, err); + +out_parent: + dput(parent); +out: + AuDbg("b%d\n", err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* policies for copyup */ + +/* top down parent */ +static int au_wbr_copyup_tdp(struct dentry *dentry) +{ + return au_wbr_create_tdp(dentry, /*isdir, anything is ok*/0); +} + +/* bottom up parent */ +static int au_wbr_copyup_bup(struct dentry *dentry) +{ + int err; + aufs_bindex_t bindex, bstart; + struct dentry *parent, *h_parent; + struct super_block *sb; + + err = -EROFS; + sb = dentry->d_sb; + parent = dget_parent(dentry); + bstart = au_dbstart(parent); + for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) { + h_parent = au_h_dptr(parent, bindex); + if (!h_parent || !h_parent->d_inode) + continue; + + if (!au_br_rdonly(au_sbr(sb, bindex))) { + err = bindex; + break; + } + } + dput(parent); + + /* bottom up here */ + if (unlikely(err < 0)) + err = au_wbr_bu(sb, bstart - 1); + + AuDbg("b%d\n", err); + return err; +} + +/* bottom up */ +static int au_wbr_copyup_bu(struct dentry *dentry) +{ + int err; + aufs_bindex_t bstart; + + bstart = au_dbstart(dentry); + err = au_wbr_bu(dentry->d_sb, bstart); + AuDbg("b%d\n", err); + if (err > bstart) + err = au_wbr_nonopq(dentry, err); + + AuDbg("b%d\n", err); + return err; +} + +/* ---------------------------------------------------------------------- */ + +struct au_wbr_copyup_operations au_wbr_copyup_ops[] = { + [AuWbrCopyup_TDP] = { + .copyup = au_wbr_copyup_tdp + }, + [AuWbrCopyup_BUP] = { + .copyup = au_wbr_copyup_bup + }, + [AuWbrCopyup_BU] = { + .copyup = au_wbr_copyup_bu + } +}; + +struct au_wbr_create_operations au_wbr_create_ops[] = { + [AuWbrCreate_TDP] = { + .create = au_wbr_create_tdp + }, + [AuWbrCreate_RR] = { + .create = au_wbr_create_rr, + .init = au_wbr_create_init_rr + }, + [AuWbrCreate_MFS] = { + .create = au_wbr_create_mfs, + .init = au_wbr_create_init_mfs, + .fin = au_wbr_create_fin_mfs + }, + [AuWbrCreate_MFSV] = { + .create = au_wbr_create_mfs, + .init = au_wbr_create_init_mfs, + .fin = au_wbr_create_fin_mfs + }, + [AuWbrCreate_MFSRR] = { + .create = au_wbr_create_mfsrr, + .init = au_wbr_create_init_mfsrr, + .fin = au_wbr_create_fin_mfs + }, + [AuWbrCreate_MFSRRV] = { + .create = au_wbr_create_mfsrr, + .init = au_wbr_create_init_mfsrr, + .fin = au_wbr_create_fin_mfs + }, + [AuWbrCreate_PMFS] = { + .create = au_wbr_create_pmfs, + .init = au_wbr_create_init_mfs, + .fin = au_wbr_create_fin_mfs + }, + [AuWbrCreate_PMFSV] = { + .create = au_wbr_create_pmfs, + .init = au_wbr_create_init_mfs, + .fin = au_wbr_create_fin_mfs + } +}; diff --git a/ubuntu/aufs/whout.c b/ubuntu/aufs/whout.c new file mode 100644 index 000000000000..de3a9f5bee2c --- /dev/null +++ b/ubuntu/aufs/whout.c @@ -0,0 +1,1050 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * whiteout for logical deletion and opaque directory + */ + +#include +#include "aufs.h" + +#define WH_MASK S_IRUGO + +/* + * If a directory contains this file, then it is opaque. We start with the + * .wh. flag so that it is blocked by lookup. + */ +static struct qstr diropq_name = { + .name = AUFS_WH_DIROPQ, + .len = sizeof(AUFS_WH_DIROPQ) - 1 +}; + +/* + * generate whiteout name, which is NOT terminated by NULL. + * @name: original d_name.name + * @len: original d_name.len + * @wh: whiteout qstr + * returns zero when succeeds, otherwise error. + * succeeded value as wh->name should be freed by kfree(). + */ +int au_wh_name_alloc(struct qstr *wh, const struct qstr *name) +{ + char *p; + + if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN)) + return -ENAMETOOLONG; + + wh->len = name->len + AUFS_WH_PFX_LEN; + p = kmalloc(wh->len, GFP_NOFS); + wh->name = p; + if (p) { + memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); + memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len); + /* smp_mb(); */ + return 0; + } + return -ENOMEM; +} + +/* ---------------------------------------------------------------------- */ + +/* + * test if the @wh_name exists under @h_parent. + * @try_sio specifies the necessary of super-io. + */ +int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, + struct au_branch *br, int try_sio) +{ + int err; + struct dentry *wh_dentry; + + if (!try_sio) + wh_dentry = au_lkup_one(wh_name, h_parent, br, /*nd*/NULL); + else + wh_dentry = au_sio_lkup_one(wh_name, h_parent, br); + err = PTR_ERR(wh_dentry); + if (IS_ERR(wh_dentry)) + goto out; + + err = 0; + if (!wh_dentry->d_inode) + goto out_wh; /* success */ + + err = 1; + if (S_ISREG(wh_dentry->d_inode->i_mode)) + goto out_wh; /* success */ + + err = -EIO; + AuIOErr("%.*s Invalid whiteout entry type 0%o.\n", + AuDLNPair(wh_dentry), wh_dentry->d_inode->i_mode); + +out_wh: + dput(wh_dentry); +out: + return err; +} + +/* + * test if the @h_dentry sets opaque or not. + */ +int au_diropq_test(struct dentry *h_dentry, struct au_branch *br) +{ + int err; + struct inode *h_dir; + + h_dir = h_dentry->d_inode; + err = au_wh_test(h_dentry, &diropq_name, br, + au_test_h_perm_sio(h_dir, MAY_EXEC)); + return err; +} + +/* + * returns a negative dentry whose name is unique and temporary. + */ +struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, + struct qstr *prefix) +{ + struct dentry *dentry; + int i; + char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1], + *name, *p; + /* strict atomic_t is unnecessary here */ + static unsigned short cnt; + struct qstr qs; + + BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN); + + name = defname; + qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1; + if (unlikely(prefix->len > DNAME_INLINE_LEN)) { + dentry = ERR_PTR(-ENAMETOOLONG); + if (unlikely(qs.len > NAME_MAX)) + goto out; + dentry = ERR_PTR(-ENOMEM); + name = kmalloc(qs.len + 1, GFP_NOFS); + if (unlikely(!name)) + goto out; + } + + /* doubly whiteout-ed */ + memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2); + p = name + AUFS_WH_PFX_LEN * 2; + memcpy(p, prefix->name, prefix->len); + p += prefix->len; + *p++ = '.'; + AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN); + + qs.name = name; + for (i = 0; i < 3; i++) { + sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++); + dentry = au_sio_lkup_one(&qs, h_parent, br); + if (IS_ERR(dentry) || !dentry->d_inode) + goto out_name; + dput(dentry); + } + /* pr_warning("could not get random name\n"); */ + dentry = ERR_PTR(-EEXIST); + AuDbg("%.*s\n", AuLNPair(&qs)); + BUG(); + +out_name: + if (name != defname) + kfree(name); +out: + AuTraceErrPtr(dentry); + return dentry; +} + +/* + * rename the @h_dentry on @br to the whiteouted temporary name. + */ +int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br) +{ + int err; + struct path h_path = { + .mnt = br->br_mnt + }; + struct inode *h_dir; + struct dentry *h_parent; + + h_parent = h_dentry->d_parent; /* dir inode is locked */ + h_dir = h_parent->d_inode; + IMustLock(h_dir); + + h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name); + err = PTR_ERR(h_path.dentry); + if (IS_ERR(h_path.dentry)) + goto out; + + /* under the same dir, no need to lock_rename() */ + err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path); + AuTraceErr(err); + dput(h_path.dentry); + +out: + AuTraceErr(err); + return err; +} + +/* ---------------------------------------------------------------------- */ +/* + * functions for removing a whiteout + */ + +static int do_unlink_wh(struct inode *h_dir, struct path *h_path) +{ + int force; + + /* + * forces superio when the dir has a sticky bit. + * this may be a violation of unix fs semantics. + */ + force = (h_dir->i_mode & S_ISVTX) + && h_path->dentry->d_inode->i_uid != current_fsuid(); + return vfsub_unlink(h_dir, h_path, force); +} + +int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, + struct dentry *dentry) +{ + int err; + + err = do_unlink_wh(h_dir, h_path); + if (!err && dentry) + au_set_dbwh(dentry, -1); + + return err; +} + +static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh, + struct au_branch *br) +{ + int err; + struct path h_path = { + .mnt = br->br_mnt + }; + + err = 0; + h_path.dentry = au_lkup_one(wh, h_parent, br, /*nd*/NULL); + if (IS_ERR(h_path.dentry)) + err = PTR_ERR(h_path.dentry); + else { + if (h_path.dentry->d_inode + && S_ISREG(h_path.dentry->d_inode->i_mode)) + err = do_unlink_wh(h_parent->d_inode, &h_path); + dput(h_path.dentry); + } + + return err; +} + +/* ---------------------------------------------------------------------- */ +/* + * initialize/clean whiteout for a branch + */ + +static void au_wh_clean(struct inode *h_dir, struct path *whpath, + const int isdir) +{ + int err; + + if (!whpath->dentry->d_inode) + return; + + err = mnt_want_write(whpath->mnt); + if (!err) { + if (isdir) + err = vfsub_rmdir(h_dir, whpath); + else + err = vfsub_unlink(h_dir, whpath, /*force*/0); + mnt_drop_write(whpath->mnt); + } + if (unlikely(err)) + pr_warning("failed removing %.*s (%d), ignored.\n", + AuDLNPair(whpath->dentry), err); +} + +static int test_linkable(struct dentry *h_root) +{ + struct inode *h_dir = h_root->d_inode; + + if (h_dir->i_op->link) + return 0; + + pr_err("%.*s (%s) doesn't support link(2), use noplink and rw+nolwh\n", + AuDLNPair(h_root), au_sbtype(h_root->d_sb)); + return -ENOSYS; +} + +/* todo: should this mkdir be done in /sbin/mount.aufs helper? */ +static int au_whdir(struct inode *h_dir, struct path *path) +{ + int err; + + err = -EEXIST; + if (!path->dentry->d_inode) { + int mode = S_IRWXU; + + if (au_test_nfs(path->dentry->d_sb)) + mode |= S_IXUGO; + err = mnt_want_write(path->mnt); + if (!err) { + err = vfsub_mkdir(h_dir, path, mode); + mnt_drop_write(path->mnt); + } + } else if (S_ISDIR(path->dentry->d_inode->i_mode)) + err = 0; + else + pr_err("unknown %.*s exists\n", AuDLNPair(path->dentry)); + + return err; +} + +struct au_wh_base { + const struct qstr *name; + struct dentry *dentry; +}; + +static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[], + struct path *h_path) +{ + h_path->dentry = base[AuBrWh_BASE].dentry; + au_wh_clean(h_dir, h_path, /*isdir*/0); + h_path->dentry = base[AuBrWh_PLINK].dentry; + au_wh_clean(h_dir, h_path, /*isdir*/1); + h_path->dentry = base[AuBrWh_ORPH].dentry; + au_wh_clean(h_dir, h_path, /*isdir*/1); +} + +/* + * returns tri-state, + * minus: error, caller should print the mesage + * zero: succuess + * plus: error, caller should NOT print the mesage + */ +static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr, + int do_plink, struct au_wh_base base[], + struct path *h_path) +{ + int err; + struct inode *h_dir; + + h_dir = h_root->d_inode; + h_path->dentry = base[AuBrWh_BASE].dentry; + au_wh_clean(h_dir, h_path, /*isdir*/0); + h_path->dentry = base[AuBrWh_PLINK].dentry; + if (do_plink) { + err = test_linkable(h_root); + if (unlikely(err)) { + err = 1; + goto out; + } + + err = au_whdir(h_dir, h_path); + if (unlikely(err)) + goto out; + wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); + } else + au_wh_clean(h_dir, h_path, /*isdir*/1); + h_path->dentry = base[AuBrWh_ORPH].dentry; + err = au_whdir(h_dir, h_path); + if (unlikely(err)) + goto out; + wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); + +out: + return err; +} + +/* + * for the moment, aufs supports the branch filesystem which does not support + * link(2). testing on FAT which does not support i_op->setattr() fully either, + * copyup failed. finally, such filesystem will not be used as the writable + * branch. + * + * returns tri-state, see above. + */ +static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr, + int do_plink, struct au_wh_base base[], + struct path *h_path) +{ + int err; + struct inode *h_dir; + + WbrWhMustWriteLock(wbr); + + err = test_linkable(h_root); + if (unlikely(err)) { + err = 1; + goto out; + } + + /* + * todo: should this create be done in /sbin/mount.aufs helper? + */ + err = -EEXIST; + h_dir = h_root->d_inode; + if (!base[AuBrWh_BASE].dentry->d_inode) { + err = mnt_want_write(h_path->mnt); + if (!err) { + h_path->dentry = base[AuBrWh_BASE].dentry; + err = vfsub_create(h_dir, h_path, WH_MASK); + mnt_drop_write(h_path->mnt); + } + } else if (S_ISREG(base[AuBrWh_BASE].dentry->d_inode->i_mode)) + err = 0; + else + pr_err("unknown %.*s/%.*s exists\n", + AuDLNPair(h_root), AuDLNPair(base[AuBrWh_BASE].dentry)); + if (unlikely(err)) + goto out; + + h_path->dentry = base[AuBrWh_PLINK].dentry; + if (do_plink) { + err = au_whdir(h_dir, h_path); + if (unlikely(err)) + goto out; + wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); + } else + au_wh_clean(h_dir, h_path, /*isdir*/1); + wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry); + + h_path->dentry = base[AuBrWh_ORPH].dentry; + err = au_whdir(h_dir, h_path); + if (unlikely(err)) + goto out; + wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); + +out: + return err; +} + +/* + * initialize the whiteout base file/dir for @br. + */ +int au_wh_init(struct dentry *h_root, struct au_branch *br, + struct super_block *sb) +{ + int err, i; + const unsigned char do_plink + = !!au_opt_test(au_mntflags(sb), PLINK); + struct path path = { + .mnt = br->br_mnt + }; + struct inode *h_dir; + struct au_wbr *wbr = br->br_wbr; + static const struct qstr base_name[] = { + [AuBrWh_BASE] = { + .name = AUFS_BASE_NAME, + .len = sizeof(AUFS_BASE_NAME) - 1 + }, + [AuBrWh_PLINK] = { + .name = AUFS_PLINKDIR_NAME, + .len = sizeof(AUFS_PLINKDIR_NAME) - 1 + }, + [AuBrWh_ORPH] = { + .name = AUFS_ORPHDIR_NAME, + .len = sizeof(AUFS_ORPHDIR_NAME) - 1 + } + }; + struct au_wh_base base[] = { + [AuBrWh_BASE] = { + .name = base_name + AuBrWh_BASE, + .dentry = NULL + }, + [AuBrWh_PLINK] = { + .name = base_name + AuBrWh_PLINK, + .dentry = NULL + }, + [AuBrWh_ORPH] = { + .name = base_name + AuBrWh_ORPH, + .dentry = NULL + } + }; + + if (wbr) + WbrWhMustWriteLock(wbr); + + for (i = 0; i < AuBrWh_Last; i++) { + /* doubly whiteouted */ + struct dentry *d; + + d = au_wh_lkup(h_root, (void *)base[i].name, br); + err = PTR_ERR(d); + if (IS_ERR(d)) + goto out; + + base[i].dentry = d; + AuDebugOn(wbr + && wbr->wbr_wh[i] + && wbr->wbr_wh[i] != base[i].dentry); + } + + if (wbr) + for (i = 0; i < AuBrWh_Last; i++) { + dput(wbr->wbr_wh[i]); + wbr->wbr_wh[i] = NULL; + } + + err = 0; + if (!au_br_writable(br->br_perm)) { + h_dir = h_root->d_inode; + au_wh_init_ro(h_dir, base, &path); + } else if (!au_br_wh_linkable(br->br_perm)) { + err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path); + if (err > 0) + goto out; + else if (err) + goto out_err; + } else { + err = au_wh_init_rw(h_root, wbr, do_plink, base, &path); + if (err > 0) + goto out; + else if (err) + goto out_err; + } + goto out; /* success */ + +out_err: + pr_err("an error(%d) on the writable branch %.*s(%s)\n", + err, AuDLNPair(h_root), au_sbtype(h_root->d_sb)); +out: + for (i = 0; i < AuBrWh_Last; i++) + dput(base[i].dentry); + return err; +} + +/* ---------------------------------------------------------------------- */ +/* + * whiteouts are all hard-linked usually. + * when its link count reaches a ceiling, we create a new whiteout base + * asynchronously. + */ + +struct reinit_br_wh { + struct super_block *sb; + struct au_branch *br; +}; + +static void reinit_br_wh(void *arg) +{ + int err; + aufs_bindex_t bindex; + struct path h_path; + struct reinit_br_wh *a = arg; + struct au_wbr *wbr; + struct inode *dir; + struct dentry *h_root; + struct au_hinode *hdir; + + err = 0; + wbr = a->br->br_wbr; + /* big aufs lock */ + si_noflush_write_lock(a->sb); + if (!au_br_writable(a->br->br_perm)) + goto out; + bindex = au_br_index(a->sb, a->br->br_id); + if (unlikely(bindex < 0)) + goto out; + + di_read_lock_parent(a->sb->s_root, AuLock_IR); + dir = a->sb->s_root->d_inode; + hdir = au_hi(dir, bindex); + h_root = au_h_dptr(a->sb->s_root, bindex); + + au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); + wbr_wh_write_lock(wbr); + err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode, + h_root, a->br); + if (!err) { + err = mnt_want_write(a->br->br_mnt); + if (!err) { + h_path.dentry = wbr->wbr_whbase; + h_path.mnt = a->br->br_mnt; + err = vfsub_unlink(hdir->hi_inode, &h_path, /*force*/0); + mnt_drop_write(a->br->br_mnt); + } + } else { + pr_warning("%.*s is moved, ignored\n", + AuDLNPair(wbr->wbr_whbase)); + err = 0; + } + dput(wbr->wbr_whbase); + wbr->wbr_whbase = NULL; + if (!err) + err = au_wh_init(h_root, a->br, a->sb); + wbr_wh_write_unlock(wbr); + au_hn_imtx_unlock(hdir); + di_read_unlock(a->sb->s_root, AuLock_IR); + +out: + if (wbr) + atomic_dec(&wbr->wbr_wh_running); + atomic_dec(&a->br->br_count); + si_write_unlock(a->sb); + au_nwt_done(&au_sbi(a->sb)->si_nowait); + kfree(arg); + if (unlikely(err)) + AuIOErr("err %d\n", err); +} + +static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br) +{ + int do_dec, wkq_err; + struct reinit_br_wh *arg; + + do_dec = 1; + if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1) + goto out; + + /* ignore ENOMEM */ + arg = kmalloc(sizeof(*arg), GFP_NOFS); + if (arg) { + /* + * dec(wh_running), kfree(arg) and dec(br_count) + * in reinit function + */ + arg->sb = sb; + arg->br = br; + atomic_inc(&br->br_count); + wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0); + if (unlikely(wkq_err)) { + atomic_dec(&br->br_wbr->wbr_wh_running); + atomic_dec(&br->br_count); + kfree(arg); + } + do_dec = 0; + } + +out: + if (do_dec) + atomic_dec(&br->br_wbr->wbr_wh_running); +} + +/* ---------------------------------------------------------------------- */ + +/* + * create the whiteout @wh. + */ +static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex, + struct dentry *wh) +{ + int err; + struct path h_path = { + .dentry = wh + }; + struct au_branch *br; + struct au_wbr *wbr; + struct dentry *h_parent; + struct inode *h_dir; + + h_parent = wh->d_parent; /* dir inode is locked */ + h_dir = h_parent->d_inode; + IMustLock(h_dir); + + br = au_sbr(sb, bindex); + h_path.mnt = br->br_mnt; + wbr = br->br_wbr; + wbr_wh_read_lock(wbr); + if (wbr->wbr_whbase) { + err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path); + if (!err || err != -EMLINK) + goto out; + + /* link count full. re-initialize br_whbase. */ + kick_reinit_br_wh(sb, br); + } + + /* return this error in this context */ + err = vfsub_create(h_dir, &h_path, WH_MASK); + +out: + wbr_wh_read_unlock(wbr); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * create or remove the diropq. + */ +static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex, + unsigned int flags) +{ + struct dentry *opq_dentry, *h_dentry; + struct super_block *sb; + struct au_branch *br; + int err; + + sb = dentry->d_sb; + br = au_sbr(sb, bindex); + h_dentry = au_h_dptr(dentry, bindex); + opq_dentry = au_lkup_one(&diropq_name, h_dentry, br, /*nd*/NULL); + if (IS_ERR(opq_dentry)) + goto out; + + if (au_ftest_diropq(flags, CREATE)) { + err = link_or_create_wh(sb, bindex, opq_dentry); + if (!err) { + au_set_dbdiropq(dentry, bindex); + goto out; /* success */ + } + } else { + struct path tmp = { + .dentry = opq_dentry, + .mnt = br->br_mnt + }; + err = do_unlink_wh(au_h_iptr(dentry->d_inode, bindex), &tmp); + if (!err) + au_set_dbdiropq(dentry, -1); + } + dput(opq_dentry); + opq_dentry = ERR_PTR(err); + +out: + return opq_dentry; +} + +struct do_diropq_args { + struct dentry **errp; + struct dentry *dentry; + aufs_bindex_t bindex; + unsigned int flags; +}; + +static void call_do_diropq(void *args) +{ + struct do_diropq_args *a = args; + *a->errp = do_diropq(a->dentry, a->bindex, a->flags); +} + +struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, + unsigned int flags) +{ + struct dentry *diropq, *h_dentry; + + h_dentry = au_h_dptr(dentry, bindex); + if (!au_test_h_perm_sio(h_dentry->d_inode, MAY_EXEC | MAY_WRITE)) + diropq = do_diropq(dentry, bindex, flags); + else { + int wkq_err; + struct do_diropq_args args = { + .errp = &diropq, + .dentry = dentry, + .bindex = bindex, + .flags = flags + }; + + wkq_err = au_wkq_wait(call_do_diropq, &args); + if (unlikely(wkq_err)) + diropq = ERR_PTR(wkq_err); + } + + return diropq; +} + +/* ---------------------------------------------------------------------- */ + +/* + * lookup whiteout dentry. + * @h_parent: lower parent dentry which must exist and be locked + * @base_name: name of dentry which will be whiteouted + * returns dentry for whiteout. + */ +struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, + struct au_branch *br) +{ + int err; + struct qstr wh_name; + struct dentry *wh_dentry; + + err = au_wh_name_alloc(&wh_name, base_name); + wh_dentry = ERR_PTR(err); + if (!err) { + wh_dentry = au_lkup_one(&wh_name, h_parent, br, /*nd*/NULL); + kfree(wh_name.name); + } + return wh_dentry; +} + +/* + * link/create a whiteout for @dentry on @bindex. + */ +struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, + struct dentry *h_parent) +{ + struct dentry *wh_dentry; + struct super_block *sb; + int err; + + sb = dentry->d_sb; + wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex)); + if (!IS_ERR(wh_dentry) && !wh_dentry->d_inode) { + err = link_or_create_wh(sb, bindex, wh_dentry); + if (!err) + au_set_dbwh(dentry, bindex); + else { + dput(wh_dentry); + wh_dentry = ERR_PTR(err); + } + } + + return wh_dentry; +} + +/* ---------------------------------------------------------------------- */ + +/* Delete all whiteouts in this directory on branch bindex. */ +static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist, + aufs_bindex_t bindex, struct au_branch *br) +{ + int err; + unsigned long ul, n; + struct qstr wh_name; + char *p; + struct hlist_head *head; + struct au_vdir_wh *tpos; + struct hlist_node *pos; + struct au_vdir_destr *str; + + err = -ENOMEM; + p = __getname_gfp(GFP_NOFS); + wh_name.name = p; + if (unlikely(!wh_name.name)) + goto out; + + err = 0; + memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); + p += AUFS_WH_PFX_LEN; + n = whlist->nh_num; + head = whlist->nh_head; + for (ul = 0; !err && ul < n; ul++, head++) { + hlist_for_each_entry(tpos, pos, head, wh_hash) { + if (tpos->wh_bindex != bindex) + continue; + + str = &tpos->wh_str; + if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) { + memcpy(p, str->name, str->len); + wh_name.len = AUFS_WH_PFX_LEN + str->len; + err = unlink_wh_name(h_dentry, &wh_name, br); + if (!err) + continue; + break; + } + AuIOErr("whiteout name too long %.*s\n", + str->len, str->name); + err = -EIO; + break; + } + } + __putname(wh_name.name); + +out: + return err; +} + +struct del_wh_children_args { + int *errp; + struct dentry *h_dentry; + struct au_nhash *whlist; + aufs_bindex_t bindex; + struct au_branch *br; +}; + +static void call_del_wh_children(void *args) +{ + struct del_wh_children_args *a = args; + *a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br); +} + +/* ---------------------------------------------------------------------- */ + +struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp) +{ + struct au_whtmp_rmdir *whtmp; + int err; + unsigned int rdhash; + + SiMustAnyLock(sb); + + whtmp = kmalloc(sizeof(*whtmp), gfp); + if (unlikely(!whtmp)) { + whtmp = ERR_PTR(-ENOMEM); + goto out; + } + + whtmp->dir = NULL; + whtmp->br = NULL; + whtmp->wh_dentry = NULL; + /* no estimation for dir size */ + rdhash = au_sbi(sb)->si_rdhash; + if (!rdhash) + rdhash = AUFS_RDHASH_DEF; + err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp); + if (unlikely(err)) { + kfree(whtmp); + whtmp = ERR_PTR(err); + } + +out: + return whtmp; +} + +void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp) +{ + if (whtmp->br) + atomic_dec(&whtmp->br->br_count); + dput(whtmp->wh_dentry); + iput(whtmp->dir); + au_nhash_wh_free(&whtmp->whlist); + kfree(whtmp); +} + +/* + * rmdir the whiteouted temporary named dir @h_dentry. + * @whlist: whiteouted children. + */ +int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, + struct dentry *wh_dentry, struct au_nhash *whlist) +{ + int err; + struct path h_tmp; + struct inode *wh_inode, *h_dir; + struct au_branch *br; + + h_dir = wh_dentry->d_parent->d_inode; /* dir inode is locked */ + IMustLock(h_dir); + + br = au_sbr(dir->i_sb, bindex); + wh_inode = wh_dentry->d_inode; + mutex_lock_nested(&wh_inode->i_mutex, AuLsc_I_CHILD); + + /* + * someone else might change some whiteouts while we were sleeping. + * it means this whlist may have an obsoleted entry. + */ + if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE)) + err = del_wh_children(wh_dentry, whlist, bindex, br); + else { + int wkq_err; + struct del_wh_children_args args = { + .errp = &err, + .h_dentry = wh_dentry, + .whlist = whlist, + .bindex = bindex, + .br = br + }; + + wkq_err = au_wkq_wait(call_del_wh_children, &args); + if (unlikely(wkq_err)) + err = wkq_err; + } + mutex_unlock(&wh_inode->i_mutex); + + if (!err) { + h_tmp.dentry = wh_dentry; + h_tmp.mnt = br->br_mnt; + err = vfsub_rmdir(h_dir, &h_tmp); + } + + if (!err) { + if (au_ibstart(dir) == bindex) { + /* todo: dir->i_mutex is necessary */ + au_cpup_attr_timesizes(dir); + vfsub_drop_nlink(dir); + } + return 0; /* success */ + } + + pr_warning("failed removing %.*s(%d), ignored\n", + AuDLNPair(wh_dentry), err); + return err; +} + +static void call_rmdir_whtmp(void *args) +{ + int err; + aufs_bindex_t bindex; + struct au_whtmp_rmdir *a = args; + struct super_block *sb; + struct dentry *h_parent; + struct inode *h_dir; + struct au_hinode *hdir; + + /* rmdir by nfsd may cause deadlock with this i_mutex */ + /* mutex_lock(&a->dir->i_mutex); */ + err = -EROFS; + sb = a->dir->i_sb; + si_read_lock(sb, !AuLock_FLUSH); + if (!au_br_writable(a->br->br_perm)) + goto out; + bindex = au_br_index(sb, a->br->br_id); + if (unlikely(bindex < 0)) + goto out; + + err = -EIO; + ii_write_lock_parent(a->dir); + h_parent = dget_parent(a->wh_dentry); + h_dir = h_parent->d_inode; + hdir = au_hi(a->dir, bindex); + au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); + err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent, + a->br); + if (!err) { + err = mnt_want_write(a->br->br_mnt); + if (!err) { + err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, + &a->whlist); + mnt_drop_write(a->br->br_mnt); + } + } + au_hn_imtx_unlock(hdir); + dput(h_parent); + ii_write_unlock(a->dir); + +out: + /* mutex_unlock(&a->dir->i_mutex); */ + au_whtmp_rmdir_free(a); + si_read_unlock(sb); + au_nwt_done(&au_sbi(sb)->si_nowait); + if (unlikely(err)) + AuIOErr("err %d\n", err); +} + +void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, + struct dentry *wh_dentry, struct au_whtmp_rmdir *args) +{ + int wkq_err; + struct super_block *sb; + + IMustLock(dir); + + /* all post-process will be done in do_rmdir_whtmp(). */ + sb = dir->i_sb; + args->dir = au_igrab(dir); + args->br = au_sbr(sb, bindex); + atomic_inc(&args->br->br_count); + args->wh_dentry = dget(wh_dentry); + wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0); + if (unlikely(wkq_err)) { + pr_warning("rmdir error %.*s (%d), ignored\n", + AuDLNPair(wh_dentry), wkq_err); + au_whtmp_rmdir_free(args); + } +} diff --git a/ubuntu/aufs/whout.h b/ubuntu/aufs/whout.h new file mode 100644 index 000000000000..11137f40049d --- /dev/null +++ b/ubuntu/aufs/whout.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * whiteout for logical deletion and opaque directory + */ + +#ifndef __AUFS_WHOUT_H__ +#define __AUFS_WHOUT_H__ + +#ifdef __KERNEL__ + +#include +#include "dir.h" + +/* whout.c */ +int au_wh_name_alloc(struct qstr *wh, const struct qstr *name); +struct au_branch; +int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, + struct au_branch *br, int try_sio); +int au_diropq_test(struct dentry *h_dentry, struct au_branch *br); +struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, + struct qstr *prefix); +int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br); +int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, + struct dentry *dentry); +int au_wh_init(struct dentry *h_parent, struct au_branch *br, + struct super_block *sb); + +/* diropq flags */ +#define AuDiropq_CREATE 1 +#define au_ftest_diropq(flags, name) ((flags) & AuDiropq_##name) +#define au_fset_diropq(flags, name) \ + do { (flags) |= AuDiropq_##name; } while (0) +#define au_fclr_diropq(flags, name) \ + do { (flags) &= ~AuDiropq_##name; } while (0) + +struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, + unsigned int flags); +struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, + struct au_branch *br); +struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, + struct dentry *h_parent); + +/* real rmdir for the whiteout-ed dir */ +struct au_whtmp_rmdir { + struct inode *dir; + struct au_branch *br; + struct dentry *wh_dentry; + struct au_nhash whlist; +}; + +struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp); +void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp); +int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, + struct dentry *wh_dentry, struct au_nhash *whlist); +void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, + struct dentry *wh_dentry, struct au_whtmp_rmdir *args); + +/* ---------------------------------------------------------------------- */ + +static inline struct dentry *au_diropq_create(struct dentry *dentry, + aufs_bindex_t bindex) +{ + return au_diropq_sio(dentry, bindex, AuDiropq_CREATE); +} + +static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex) +{ + return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE)); +} + +#endif /* __KERNEL__ */ +#endif /* __AUFS_WHOUT_H__ */ diff --git a/ubuntu/aufs/wkq.c b/ubuntu/aufs/wkq.c new file mode 100644 index 000000000000..d75808a1a7e3 --- /dev/null +++ b/ubuntu/aufs/wkq.c @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * workqueue for asynchronous/super-io operations + * todo: try new dredential scheme + */ + +#include +#include "aufs.h" + +/* internal workqueue named AUFS_WKQ_NAME */ + +static struct workqueue_struct *au_wkq; + +struct au_wkinfo { + struct work_struct wk; + struct kobject *kobj; + + unsigned int flags; /* see wkq.h */ + + au_wkq_func_t func; + void *args; + + struct completion *comp; +}; + +/* ---------------------------------------------------------------------- */ + +static void wkq_func(struct work_struct *wk) +{ + struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk); + + AuDebugOn(current_fsuid()); + AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY); + + wkinfo->func(wkinfo->args); + if (au_ftest_wkq(wkinfo->flags, WAIT)) + complete(wkinfo->comp); + else { + kobject_put(wkinfo->kobj); + module_put(THIS_MODULE); /* todo: ?? */ + kfree(wkinfo); + } +} + +/* + * Since struct completion is large, try allocating it dynamically. + */ +#if defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) +#define AuWkqCompDeclare(name) struct completion *comp = NULL + +static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) +{ + *comp = kmalloc(sizeof(**comp), GFP_NOFS); + if (*comp) { + init_completion(*comp); + wkinfo->comp = *comp; + return 0; + } + return -ENOMEM; +} + +static void au_wkq_comp_free(struct completion *comp) +{ + kfree(comp); +} + +#else + +/* no braces */ +#define AuWkqCompDeclare(name) \ + DECLARE_COMPLETION_ONSTACK(_ ## name); \ + struct completion *comp = &_ ## name + +static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) +{ + wkinfo->comp = *comp; + return 0; +} + +static void au_wkq_comp_free(struct completion *comp __maybe_unused) +{ + /* empty */ +} +#endif /* 4KSTACKS */ + +static void au_wkq_run(struct au_wkinfo *wkinfo) +{ + if (au_ftest_wkq(wkinfo->flags, NEST)) { + if (au_wkq_test()) { + AuWarn1("wkq from wkq, due to a dead dir by UDBA?\n"); + AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT)); + } + } else + au_dbg_verify_kthread(); + + if (au_ftest_wkq(wkinfo->flags, WAIT)) { + INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func); + queue_work(au_wkq, &wkinfo->wk); + } else { + INIT_WORK(&wkinfo->wk, wkq_func); + schedule_work(&wkinfo->wk); + } +} + +/* + * Be careful. It is easy to make deadlock happen. + * processA: lock, wkq and wait + * processB: wkq and wait, lock in wkq + * --> deadlock + */ +int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args) +{ + int err; + AuWkqCompDeclare(comp); + struct au_wkinfo wkinfo = { + .flags = flags, + .func = func, + .args = args + }; + + err = au_wkq_comp_alloc(&wkinfo, &comp); + if (!err) { + au_wkq_run(&wkinfo); + /* no timeout, no interrupt */ + wait_for_completion(wkinfo.comp); + au_wkq_comp_free(comp); + destroy_work_on_stack(&wkinfo.wk); + } + + return err; + +} + +/* + * Note: dget/dput() in func for aufs dentries are not supported. It will be a + * problem in a concurrent umounting. + */ +int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, + unsigned int flags) +{ + int err; + struct au_wkinfo *wkinfo; + + atomic_inc(&au_sbi(sb)->si_nowait.nw_len); + + /* + * wkq_func() must free this wkinfo. + * it highly depends upon the implementation of workqueue. + */ + err = 0; + wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS); + if (wkinfo) { + wkinfo->kobj = &au_sbi(sb)->si_kobj; + wkinfo->flags = flags & ~AuWkq_WAIT; + wkinfo->func = func; + wkinfo->args = args; + wkinfo->comp = NULL; + kobject_get(wkinfo->kobj); + __module_get(THIS_MODULE); /* todo: ?? */ + + au_wkq_run(wkinfo); + } else { + err = -ENOMEM; + au_nwt_done(&au_sbi(sb)->si_nowait); + } + + return err; +} + +/* ---------------------------------------------------------------------- */ + +void au_nwt_init(struct au_nowait_tasks *nwt) +{ + atomic_set(&nwt->nw_len, 0); + /* smp_mb(); */ /* atomic_set */ + init_waitqueue_head(&nwt->nw_wq); +} + +void au_wkq_fin(void) +{ + destroy_workqueue(au_wkq); +} + +int __init au_wkq_init(void) +{ + int err; + + err = 0; + BUILD_BUG_ON(!WQ_RESCUER); + au_wkq = alloc_workqueue(AUFS_WKQ_NAME, !WQ_RESCUER, WQ_DFL_ACTIVE); + if (IS_ERR(au_wkq)) + err = PTR_ERR(au_wkq); + else if (!au_wkq) + err = -ENOMEM; + + return err; +} diff --git a/ubuntu/aufs/wkq.h b/ubuntu/aufs/wkq.h new file mode 100644 index 000000000000..c6352aeeac89 --- /dev/null +++ b/ubuntu/aufs/wkq.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * workqueue for asynchronous/super-io operations + * todo: try new credentials management scheme + */ + +#ifndef __AUFS_WKQ_H__ +#define __AUFS_WKQ_H__ + +#ifdef __KERNEL__ + +#include +#include +#include + +struct super_block; + +/* ---------------------------------------------------------------------- */ + +/* + * in the next operation, wait for the 'nowait' tasks in system-wide workqueue + */ +struct au_nowait_tasks { + atomic_t nw_len; + wait_queue_head_t nw_wq; +}; + +/* ---------------------------------------------------------------------- */ + +typedef void (*au_wkq_func_t)(void *args); + +/* wkq flags */ +#define AuWkq_WAIT 1 +#define AuWkq_NEST (1 << 1) +#define au_ftest_wkq(flags, name) ((flags) & AuWkq_##name) +#define au_fset_wkq(flags, name) \ + do { (flags) |= AuWkq_##name; } while (0) +#define au_fclr_wkq(flags, name) \ + do { (flags) &= ~AuWkq_##name; } while (0) + +#ifndef CONFIG_AUFS_HNOTIFY +#undef AuWkq_NEST +#define AuWkq_NEST 0 +#endif + +/* wkq.c */ +int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args); +int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, + unsigned int flags); +void au_nwt_init(struct au_nowait_tasks *nwt); +int __init au_wkq_init(void); +void au_wkq_fin(void); + +/* ---------------------------------------------------------------------- */ + +static inline int au_wkq_test(void) +{ + return current->flags & PF_WQ_WORKER; +} + +static inline int au_wkq_wait(au_wkq_func_t func, void *args) +{ + return au_wkq_do_wait(AuWkq_WAIT, func, args); +} + +static inline void au_nwt_done(struct au_nowait_tasks *nwt) +{ + if (atomic_dec_and_test(&nwt->nw_len)) + wake_up_all(&nwt->nw_wq); +} + +static inline int au_nwt_flush(struct au_nowait_tasks *nwt) +{ + wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len)); + return 0; +} + +#endif /* __KERNEL__ */ +#endif /* __AUFS_WKQ_H__ */ diff --git a/ubuntu/aufs/xino.c b/ubuntu/aufs/xino.c new file mode 100644 index 000000000000..835fe5ca81f5 --- /dev/null +++ b/ubuntu/aufs/xino.c @@ -0,0 +1,1266 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * external inode number translation table and bitmap + */ + +#include +#include +#include +#include "aufs.h" + +/* todo: unnecessary to support mmap_sem since kernel-space? */ +ssize_t xino_fread(au_readf_t func, struct file *file, void *kbuf, size_t size, + loff_t *pos) +{ + ssize_t err; + mm_segment_t oldfs; + union { + void *k; + char __user *u; + } buf; + + buf.k = kbuf; + oldfs = get_fs(); + set_fs(KERNEL_DS); + do { + /* todo: signal_pending? */ + err = func(file, buf.u, size, pos); + } while (err == -EAGAIN || err == -EINTR); + set_fs(oldfs); + +#if 0 /* reserved for future use */ + if (err > 0) + fsnotify_access(file->f_dentry); +#endif + + return err; +} + +/* ---------------------------------------------------------------------- */ + +static ssize_t do_xino_fwrite(au_writef_t func, struct file *file, void *kbuf, + size_t size, loff_t *pos) +{ + ssize_t err; + mm_segment_t oldfs; + union { + void *k; + const char __user *u; + } buf; + + buf.k = kbuf; + oldfs = get_fs(); + set_fs(KERNEL_DS); + do { + /* todo: signal_pending? */ + err = func(file, buf.u, size, pos); + } while (err == -EAGAIN || err == -EINTR); + set_fs(oldfs); + +#if 0 /* reserved for future use */ + if (err > 0) + fsnotify_modify(file->f_dentry); +#endif + + return err; +} + +struct do_xino_fwrite_args { + ssize_t *errp; + au_writef_t func; + struct file *file; + void *buf; + size_t size; + loff_t *pos; +}; + +static void call_do_xino_fwrite(void *args) +{ + struct do_xino_fwrite_args *a = args; + *a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos); +} + +ssize_t xino_fwrite(au_writef_t func, struct file *file, void *buf, size_t size, + loff_t *pos) +{ + ssize_t err; + + /* todo: signal block and no wkq? */ + if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) { + lockdep_off(); + err = do_xino_fwrite(func, file, buf, size, pos); + lockdep_on(); + } else { + /* + * it breaks RLIMIT_FSIZE and normal user's limit, + * users should care about quota and real 'filesystem full.' + */ + int wkq_err; + struct do_xino_fwrite_args args = { + .errp = &err, + .func = func, + .file = file, + .buf = buf, + .size = size, + .pos = pos + }; + + wkq_err = au_wkq_wait(call_do_xino_fwrite, &args); + if (unlikely(wkq_err)) + err = wkq_err; + } + + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * create a new xinofile at the same place/path as @base_file. + */ +struct file *au_xino_create2(struct file *base_file, struct file *copy_src) +{ + struct file *file; + struct dentry *base, *parent; + struct inode *dir; + struct qstr *name; + struct path path; + int err; + + base = base_file->f_dentry; + parent = base->d_parent; /* dir inode is locked */ + dir = parent->d_inode; + IMustLock(dir); + + file = ERR_PTR(-EINVAL); + name = &base->d_name; + path.dentry = vfsub_lookup_one_len(name->name, parent, name->len); + if (IS_ERR(path.dentry)) { + file = (void *)path.dentry; + pr_err("%.*s lookup err %ld\n", + AuLNPair(name), PTR_ERR(path.dentry)); + goto out; + } + + /* no need to mnt_want_write() since we call dentry_open() later */ + err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL); + if (unlikely(err)) { + file = ERR_PTR(err); + pr_err("%.*s create err %d\n", AuLNPair(name), err); + goto out_dput; + } + + path.mnt = base_file->f_vfsmnt; + file = vfsub_dentry_open(&path, + O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE + /* | __FMODE_NONOTIFY */); + if (IS_ERR(file)) { + pr_err("%.*s open err %ld\n", AuLNPair(name), PTR_ERR(file)); + goto out_dput; + } + + err = vfsub_unlink(dir, &file->f_path, /*force*/0); + if (unlikely(err)) { + pr_err("%.*s unlink err %d\n", AuLNPair(name), err); + goto out_fput; + } + + if (copy_src) { + /* no one can touch copy_src xino */ + err = au_copy_file(file, copy_src, + i_size_read(copy_src->f_dentry->d_inode)); + if (unlikely(err)) { + pr_err("%.*s copy err %d\n", AuLNPair(name), err); + goto out_fput; + } + } + goto out_dput; /* success */ + +out_fput: + fput(file); + file = ERR_PTR(err); +out_dput: + dput(path.dentry); +out: + return file; +} + +struct au_xino_lock_dir { + struct au_hinode *hdir; + struct dentry *parent; + struct mutex *mtx; +}; + +static void au_xino_lock_dir(struct super_block *sb, struct file *xino, + struct au_xino_lock_dir *ldir) +{ + aufs_bindex_t brid, bindex; + + ldir->hdir = NULL; + bindex = -1; + brid = au_xino_brid(sb); + if (brid >= 0) + bindex = au_br_index(sb, brid); + if (bindex >= 0) { + ldir->hdir = au_hi(sb->s_root->d_inode, bindex); + au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT); + } else { + ldir->parent = dget_parent(xino->f_dentry); + ldir->mtx = &ldir->parent->d_inode->i_mutex; + mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT); + } +} + +static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir) +{ + if (ldir->hdir) + au_hn_imtx_unlock(ldir->hdir); + else { + mutex_unlock(ldir->mtx); + dput(ldir->parent); + } +} + +/* ---------------------------------------------------------------------- */ + +/* trucate xino files asynchronously */ + +int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex) +{ + int err; + aufs_bindex_t bi, bend; + struct au_branch *br; + struct file *new_xino, *file; + struct super_block *h_sb; + struct au_xino_lock_dir ldir; + + err = -EINVAL; + bend = au_sbend(sb); + if (unlikely(bindex < 0 || bend < bindex)) + goto out; + br = au_sbr(sb, bindex); + file = br->br_xino.xi_file; + if (!file) + goto out; + + au_xino_lock_dir(sb, file, &ldir); + /* mnt_want_write() is unnecessary here */ + new_xino = au_xino_create2(file, file); + au_xino_unlock_dir(&ldir); + err = PTR_ERR(new_xino); + if (IS_ERR(new_xino)) + goto out; + err = 0; + fput(file); + br->br_xino.xi_file = new_xino; + + h_sb = br->br_mnt->mnt_sb; + for (bi = 0; bi <= bend; bi++) { + if (unlikely(bi == bindex)) + continue; + br = au_sbr(sb, bi); + if (br->br_mnt->mnt_sb != h_sb) + continue; + + fput(br->br_xino.xi_file); + br->br_xino.xi_file = new_xino; + get_file(new_xino); + } + +out: + return err; +} + +struct xino_do_trunc_args { + struct super_block *sb; + struct au_branch *br; +}; + +static void xino_do_trunc(void *_args) +{ + struct xino_do_trunc_args *args = _args; + struct super_block *sb; + struct au_branch *br; + struct inode *dir; + int err; + aufs_bindex_t bindex; + + err = 0; + sb = args->sb; + dir = sb->s_root->d_inode; + br = args->br; + + si_noflush_write_lock(sb); + ii_read_lock_parent(dir); + bindex = au_br_index(sb, br->br_id); + err = au_xino_trunc(sb, bindex); + if (!err + && br->br_xino.xi_file->f_dentry->d_inode->i_blocks + >= br->br_xino_upper) + br->br_xino_upper += AUFS_XINO_TRUNC_STEP; + + ii_read_unlock(dir); + if (unlikely(err)) + pr_warning("err b%d, (%d)\n", bindex, err); + atomic_dec(&br->br_xino_running); + atomic_dec(&br->br_count); + si_write_unlock(sb); + au_nwt_done(&au_sbi(sb)->si_nowait); + kfree(args); +} + +static void xino_try_trunc(struct super_block *sb, struct au_branch *br) +{ + struct xino_do_trunc_args *args; + int wkq_err; + + if (br->br_xino.xi_file->f_dentry->d_inode->i_blocks + < br->br_xino_upper) + return; + + if (atomic_inc_return(&br->br_xino_running) > 1) + goto out; + + /* lock and kfree() will be called in trunc_xino() */ + args = kmalloc(sizeof(*args), GFP_NOFS); + if (unlikely(!args)) { + AuErr1("no memory\n"); + goto out_args; + } + + atomic_inc(&br->br_count); + args->sb = sb; + args->br = br; + wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0); + if (!wkq_err) + return; /* success */ + + pr_err("wkq %d\n", wkq_err); + atomic_dec(&br->br_count); + +out_args: + kfree(args); +out: + atomic_dec(&br->br_xino_running); +} + +/* ---------------------------------------------------------------------- */ + +static int au_xino_do_write(au_writef_t write, struct file *file, + ino_t h_ino, ino_t ino) +{ + loff_t pos; + ssize_t sz; + + pos = h_ino; + if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) { + AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); + return -EFBIG; + } + pos *= sizeof(ino); + sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos); + if (sz == sizeof(ino)) + return 0; /* success */ + + AuIOErr("write failed (%zd)\n", sz); + return -EIO; +} + +/* + * write @ino to the xinofile for the specified branch{@sb, @bindex} + * at the position of @h_ino. + * even if @ino is zero, it is written to the xinofile and means no entry. + * if the size of the xino file on a specific filesystem exceeds the watermark, + * try truncating it. + */ +int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, + ino_t ino) +{ + int err; + unsigned int mnt_flags; + struct au_branch *br; + + BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max) + || ((loff_t)-1) > 0); + SiMustAnyLock(sb); + + mnt_flags = au_mntflags(sb); + if (!au_opt_test(mnt_flags, XINO)) + return 0; + + br = au_sbr(sb, bindex); + err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, + h_ino, ino); + if (!err) { + if (au_opt_test(mnt_flags, TRUNC_XINO) + && au_test_fs_trunc_xino(br->br_mnt->mnt_sb)) + xino_try_trunc(sb, br); + return 0; /* success */ + } + + AuIOErr("write failed (%d)\n", err); + return -EIO; +} + +/* ---------------------------------------------------------------------- */ + +/* aufs inode number bitmap */ + +static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE; +static ino_t xib_calc_ino(unsigned long pindex, int bit) +{ + ino_t ino; + + AuDebugOn(bit < 0 || page_bits <= bit); + ino = AUFS_FIRST_INO + pindex * page_bits + bit; + return ino; +} + +static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit) +{ + AuDebugOn(ino < AUFS_FIRST_INO); + ino -= AUFS_FIRST_INO; + *pindex = ino / page_bits; + *bit = ino % page_bits; +} + +static int xib_pindex(struct super_block *sb, unsigned long pindex) +{ + int err; + loff_t pos; + ssize_t sz; + struct au_sbinfo *sbinfo; + struct file *xib; + unsigned long *p; + + sbinfo = au_sbi(sb); + MtxMustLock(&sbinfo->si_xib_mtx); + AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE + || !au_opt_test(sbinfo->si_mntflags, XINO)); + + if (pindex == sbinfo->si_xib_last_pindex) + return 0; + + xib = sbinfo->si_xib; + p = sbinfo->si_xib_buf; + pos = sbinfo->si_xib_last_pindex; + pos *= PAGE_SIZE; + sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); + if (unlikely(sz != PAGE_SIZE)) + goto out; + + pos = pindex; + pos *= PAGE_SIZE; + if (i_size_read(xib->f_dentry->d_inode) >= pos + PAGE_SIZE) + sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos); + else { + memset(p, 0, PAGE_SIZE); + sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); + } + if (sz == PAGE_SIZE) { + sbinfo->si_xib_last_pindex = pindex; + return 0; /* success */ + } + +out: + AuIOErr1("write failed (%zd)\n", sz); + err = sz; + if (sz >= 0) + err = -EIO; + return err; +} + +/* ---------------------------------------------------------------------- */ + +static void au_xib_clear_bit(struct inode *inode) +{ + int err, bit; + unsigned long pindex; + struct super_block *sb; + struct au_sbinfo *sbinfo; + + AuDebugOn(inode->i_nlink); + + sb = inode->i_sb; + xib_calc_bit(inode->i_ino, &pindex, &bit); + AuDebugOn(page_bits <= bit); + sbinfo = au_sbi(sb); + mutex_lock(&sbinfo->si_xib_mtx); + err = xib_pindex(sb, pindex); + if (!err) { + clear_bit(bit, sbinfo->si_xib_buf); + sbinfo->si_xib_next_bit = bit; + } + mutex_unlock(&sbinfo->si_xib_mtx); +} + +/* for s_op->delete_inode() */ +void au_xino_delete_inode(struct inode *inode, const int unlinked) +{ + int err; + unsigned int mnt_flags; + aufs_bindex_t bindex, bend, bi; + unsigned char try_trunc; + struct au_iinfo *iinfo; + struct super_block *sb; + struct au_hinode *hi; + struct inode *h_inode; + struct au_branch *br; + au_writef_t xwrite; + + sb = inode->i_sb; + mnt_flags = au_mntflags(sb); + if (!au_opt_test(mnt_flags, XINO) + || inode->i_ino == AUFS_ROOT_INO) + return; + + if (unlinked) { + au_xigen_inc(inode); + au_xib_clear_bit(inode); + } + + iinfo = au_ii(inode); + if (!iinfo) + return; + + bindex = iinfo->ii_bstart; + if (bindex < 0) + return; + + xwrite = au_sbi(sb)->si_xwrite; + try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO); + hi = iinfo->ii_hinode + bindex; + bend = iinfo->ii_bend; + for (; bindex <= bend; bindex++, hi++) { + h_inode = hi->hi_inode; + if (!h_inode + || (!unlinked && h_inode->i_nlink)) + continue; + + /* inode may not be revalidated */ + bi = au_br_index(sb, hi->hi_id); + if (bi < 0) + continue; + + br = au_sbr(sb, bi); + err = au_xino_do_write(xwrite, br->br_xino.xi_file, + h_inode->i_ino, /*ino*/0); + if (!err && try_trunc + && au_test_fs_trunc_xino(br->br_mnt->mnt_sb)) + xino_try_trunc(sb, br); + } +} + +/* get an unused inode number from bitmap */ +ino_t au_xino_new_ino(struct super_block *sb) +{ + ino_t ino; + unsigned long *p, pindex, ul, pend; + struct au_sbinfo *sbinfo; + struct file *file; + int free_bit, err; + + if (!au_opt_test(au_mntflags(sb), XINO)) + return iunique(sb, AUFS_FIRST_INO); + + sbinfo = au_sbi(sb); + mutex_lock(&sbinfo->si_xib_mtx); + p = sbinfo->si_xib_buf; + free_bit = sbinfo->si_xib_next_bit; + if (free_bit < page_bits && !test_bit(free_bit, p)) + goto out; /* success */ + free_bit = find_first_zero_bit(p, page_bits); + if (free_bit < page_bits) + goto out; /* success */ + + pindex = sbinfo->si_xib_last_pindex; + for (ul = pindex - 1; ul < ULONG_MAX; ul--) { + err = xib_pindex(sb, ul); + if (unlikely(err)) + goto out_err; + free_bit = find_first_zero_bit(p, page_bits); + if (free_bit < page_bits) + goto out; /* success */ + } + + file = sbinfo->si_xib; + pend = i_size_read(file->f_dentry->d_inode) / PAGE_SIZE; + for (ul = pindex + 1; ul <= pend; ul++) { + err = xib_pindex(sb, ul); + if (unlikely(err)) + goto out_err; + free_bit = find_first_zero_bit(p, page_bits); + if (free_bit < page_bits) + goto out; /* success */ + } + BUG(); + +out: + set_bit(free_bit, p); + sbinfo->si_xib_next_bit = free_bit + 1; + pindex = sbinfo->si_xib_last_pindex; + mutex_unlock(&sbinfo->si_xib_mtx); + ino = xib_calc_ino(pindex, free_bit); + AuDbg("i%lu\n", (unsigned long)ino); + return ino; +out_err: + mutex_unlock(&sbinfo->si_xib_mtx); + AuDbg("i0\n"); + return 0; +} + +/* + * read @ino from xinofile for the specified branch{@sb, @bindex} + * at the position of @h_ino. + * if @ino does not exist and @do_new is true, get new one. + */ +int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, + ino_t *ino) +{ + int err; + ssize_t sz; + loff_t pos; + struct file *file; + struct au_sbinfo *sbinfo; + + *ino = 0; + if (!au_opt_test(au_mntflags(sb), XINO)) + return 0; /* no xino */ + + err = 0; + sbinfo = au_sbi(sb); + pos = h_ino; + if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) { + AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); + return -EFBIG; + } + pos *= sizeof(*ino); + + file = au_sbr(sb, bindex)->br_xino.xi_file; + if (i_size_read(file->f_dentry->d_inode) < pos + sizeof(*ino)) + return 0; /* no ino */ + + sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos); + if (sz == sizeof(*ino)) + return 0; /* success */ + + err = sz; + if (unlikely(sz >= 0)) { + err = -EIO; + AuIOErr("xino read error (%zd)\n", sz); + } + + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* create and set a new xino file */ + +struct file *au_xino_create(struct super_block *sb, char *fname, int silent) +{ + struct file *file; + struct dentry *h_parent, *d; + struct inode *h_dir; + int err; + + /* + * at mount-time, and the xino file is the default path, + * hnotify is disabled so we have no notify events to ignore. + * when a user specified the xino, we cannot get au_hdir to be ignored. + */ + file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE + /* | __FMODE_NONOTIFY */, + S_IRUGO | S_IWUGO); + if (IS_ERR(file)) { + if (!silent) + pr_err("open %s(%ld)\n", fname, PTR_ERR(file)); + return file; + } + + /* keep file count */ + h_parent = dget_parent(file->f_dentry); + h_dir = h_parent->d_inode; + mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); + /* mnt_want_write() is unnecessary here */ + err = vfsub_unlink(h_dir, &file->f_path, /*force*/0); + mutex_unlock(&h_dir->i_mutex); + dput(h_parent); + if (unlikely(err)) { + if (!silent) + pr_err("unlink %s(%d)\n", fname, err); + goto out; + } + + err = -EINVAL; + d = file->f_dentry; + if (unlikely(sb == d->d_sb)) { + if (!silent) + pr_err("%s must be outside\n", fname); + goto out; + } + if (unlikely(au_test_fs_bad_xino(d->d_sb))) { + if (!silent) + pr_err("xino doesn't support %s(%s)\n", + fname, au_sbtype(d->d_sb)); + goto out; + } + return file; /* success */ + +out: + fput(file); + file = ERR_PTR(err); + return file; +} + +/* + * find another branch who is on the same filesystem of the specified + * branch{@btgt}. search until @bend. + */ +static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt, + aufs_bindex_t bend) +{ + aufs_bindex_t bindex; + struct super_block *tgt_sb = au_sbr_sb(sb, btgt); + + for (bindex = 0; bindex < btgt; bindex++) + if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) + return bindex; + for (bindex++; bindex <= bend; bindex++) + if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) + return bindex; + return -1; +} + +/* ---------------------------------------------------------------------- */ + +/* + * initialize the xinofile for the specified branch @br + * at the place/path where @base_file indicates. + * test whether another branch is on the same filesystem or not, + * if @do_test is true. + */ +int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino, + struct file *base_file, int do_test) +{ + int err; + ino_t ino; + aufs_bindex_t bend, bindex; + struct au_branch *shared_br, *b; + struct file *file; + struct super_block *tgt_sb; + + shared_br = NULL; + bend = au_sbend(sb); + if (do_test) { + tgt_sb = br->br_mnt->mnt_sb; + for (bindex = 0; bindex <= bend; bindex++) { + b = au_sbr(sb, bindex); + if (tgt_sb == b->br_mnt->mnt_sb) { + shared_br = b; + break; + } + } + } + + if (!shared_br || !shared_br->br_xino.xi_file) { + struct au_xino_lock_dir ldir; + + au_xino_lock_dir(sb, base_file, &ldir); + /* mnt_want_write() is unnecessary here */ + file = au_xino_create2(base_file, NULL); + au_xino_unlock_dir(&ldir); + err = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + br->br_xino.xi_file = file; + } else { + br->br_xino.xi_file = shared_br->br_xino.xi_file; + get_file(br->br_xino.xi_file); + } + + ino = AUFS_ROOT_INO; + err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, + h_ino, ino); + if (unlikely(err)) { + fput(br->br_xino.xi_file); + br->br_xino.xi_file = NULL; + } + +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* trucate a xino bitmap file */ + +/* todo: slow */ +static int do_xib_restore(struct super_block *sb, struct file *file, void *page) +{ + int err, bit; + ssize_t sz; + unsigned long pindex; + loff_t pos, pend; + struct au_sbinfo *sbinfo; + au_readf_t func; + ino_t *ino; + unsigned long *p; + + err = 0; + sbinfo = au_sbi(sb); + MtxMustLock(&sbinfo->si_xib_mtx); + p = sbinfo->si_xib_buf; + func = sbinfo->si_xread; + pend = i_size_read(file->f_dentry->d_inode); + pos = 0; + while (pos < pend) { + sz = xino_fread(func, file, page, PAGE_SIZE, &pos); + err = sz; + if (unlikely(sz <= 0)) + goto out; + + err = 0; + for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) { + if (unlikely(*ino < AUFS_FIRST_INO)) + continue; + + xib_calc_bit(*ino, &pindex, &bit); + AuDebugOn(page_bits <= bit); + err = xib_pindex(sb, pindex); + if (!err) + set_bit(bit, p); + else + goto out; + } + } + +out: + return err; +} + +static int xib_restore(struct super_block *sb) +{ + int err; + aufs_bindex_t bindex, bend; + void *page; + + err = -ENOMEM; + page = (void *)__get_free_page(GFP_NOFS); + if (unlikely(!page)) + goto out; + + err = 0; + bend = au_sbend(sb); + for (bindex = 0; !err && bindex <= bend; bindex++) + if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0) + err = do_xib_restore + (sb, au_sbr(sb, bindex)->br_xino.xi_file, page); + else + AuDbg("b%d\n", bindex); + free_page((unsigned long)page); + +out: + return err; +} + +int au_xib_trunc(struct super_block *sb) +{ + int err; + ssize_t sz; + loff_t pos; + struct au_xino_lock_dir ldir; + struct au_sbinfo *sbinfo; + unsigned long *p; + struct file *file; + + SiMustWriteLock(sb); + + err = 0; + sbinfo = au_sbi(sb); + if (!au_opt_test(sbinfo->si_mntflags, XINO)) + goto out; + + file = sbinfo->si_xib; + if (i_size_read(file->f_dentry->d_inode) <= PAGE_SIZE) + goto out; + + au_xino_lock_dir(sb, file, &ldir); + /* mnt_want_write() is unnecessary here */ + file = au_xino_create2(sbinfo->si_xib, NULL); + au_xino_unlock_dir(&ldir); + err = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + fput(sbinfo->si_xib); + sbinfo->si_xib = file; + + p = sbinfo->si_xib_buf; + memset(p, 0, PAGE_SIZE); + pos = 0; + sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos); + if (unlikely(sz != PAGE_SIZE)) { + err = sz; + AuIOErr("err %d\n", err); + if (sz >= 0) + err = -EIO; + goto out; + } + + mutex_lock(&sbinfo->si_xib_mtx); + /* mnt_want_write() is unnecessary here */ + err = xib_restore(sb); + mutex_unlock(&sbinfo->si_xib_mtx); + +out: + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * xino mount option handlers + */ +static au_readf_t find_readf(struct file *h_file) +{ + const struct file_operations *fop = h_file->f_op; + + if (fop) { + if (fop->read) + return fop->read; + if (fop->aio_read) + return do_sync_read; + } + return ERR_PTR(-ENOSYS); +} + +static au_writef_t find_writef(struct file *h_file) +{ + const struct file_operations *fop = h_file->f_op; + + if (fop) { + if (fop->write) + return fop->write; + if (fop->aio_write) + return do_sync_write; + } + return ERR_PTR(-ENOSYS); +} + +/* xino bitmap */ +static void xino_clear_xib(struct super_block *sb) +{ + struct au_sbinfo *sbinfo; + + SiMustWriteLock(sb); + + sbinfo = au_sbi(sb); + sbinfo->si_xread = NULL; + sbinfo->si_xwrite = NULL; + if (sbinfo->si_xib) + fput(sbinfo->si_xib); + sbinfo->si_xib = NULL; + free_page((unsigned long)sbinfo->si_xib_buf); + sbinfo->si_xib_buf = NULL; +} + +static int au_xino_set_xib(struct super_block *sb, struct file *base) +{ + int err; + loff_t pos; + struct au_sbinfo *sbinfo; + struct file *file; + + SiMustWriteLock(sb); + + sbinfo = au_sbi(sb); + file = au_xino_create2(base, sbinfo->si_xib); + err = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + if (sbinfo->si_xib) + fput(sbinfo->si_xib); + sbinfo->si_xib = file; + sbinfo->si_xread = find_readf(file); + sbinfo->si_xwrite = find_writef(file); + + err = -ENOMEM; + if (!sbinfo->si_xib_buf) + sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS); + if (unlikely(!sbinfo->si_xib_buf)) + goto out_unset; + + sbinfo->si_xib_last_pindex = 0; + sbinfo->si_xib_next_bit = 0; + if (i_size_read(file->f_dentry->d_inode) < PAGE_SIZE) { + pos = 0; + err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf, + PAGE_SIZE, &pos); + if (unlikely(err != PAGE_SIZE)) + goto out_free; + } + err = 0; + goto out; /* success */ + +out_free: + free_page((unsigned long)sbinfo->si_xib_buf); + sbinfo->si_xib_buf = NULL; + if (err >= 0) + err = -EIO; +out_unset: + fput(sbinfo->si_xib); + sbinfo->si_xib = NULL; + sbinfo->si_xread = NULL; + sbinfo->si_xwrite = NULL; +out: + return err; +} + +/* xino for each branch */ +static void xino_clear_br(struct super_block *sb) +{ + aufs_bindex_t bindex, bend; + struct au_branch *br; + + bend = au_sbend(sb); + for (bindex = 0; bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + if (!br || !br->br_xino.xi_file) + continue; + + fput(br->br_xino.xi_file); + br->br_xino.xi_file = NULL; + } +} + +static int au_xino_set_br(struct super_block *sb, struct file *base) +{ + int err; + ino_t ino; + aufs_bindex_t bindex, bend, bshared; + struct { + struct file *old, *new; + } *fpair, *p; + struct au_branch *br; + struct inode *inode; + au_writef_t writef; + + SiMustWriteLock(sb); + + err = -ENOMEM; + bend = au_sbend(sb); + fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS); + if (unlikely(!fpair)) + goto out; + + inode = sb->s_root->d_inode; + ino = AUFS_ROOT_INO; + writef = au_sbi(sb)->si_xwrite; + for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { + br = au_sbr(sb, bindex); + bshared = is_sb_shared(sb, bindex, bindex - 1); + if (bshared >= 0) { + /* shared xino */ + *p = fpair[bshared]; + get_file(p->new); + } + + if (!p->new) { + /* new xino */ + p->old = br->br_xino.xi_file; + p->new = au_xino_create2(base, br->br_xino.xi_file); + err = PTR_ERR(p->new); + if (IS_ERR(p->new)) { + p->new = NULL; + goto out_pair; + } + } + + err = au_xino_do_write(writef, p->new, + au_h_iptr(inode, bindex)->i_ino, ino); + if (unlikely(err)) + goto out_pair; + } + + for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { + br = au_sbr(sb, bindex); + if (br->br_xino.xi_file) + fput(br->br_xino.xi_file); + get_file(p->new); + br->br_xino.xi_file = p->new; + } + +out_pair: + for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) + if (p->new) + fput(p->new); + else + break; + kfree(fpair); +out: + return err; +} + +void au_xino_clr(struct super_block *sb) +{ + struct au_sbinfo *sbinfo; + + au_xigen_clr(sb); + xino_clear_xib(sb); + xino_clear_br(sb); + sbinfo = au_sbi(sb); + /* lvalue, do not call au_mntflags() */ + au_opt_clr(sbinfo->si_mntflags, XINO); +} + +int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount) +{ + int err, skip; + struct dentry *parent, *cur_parent; + struct qstr *dname, *cur_name; + struct file *cur_xino; + struct inode *dir; + struct au_sbinfo *sbinfo; + + SiMustWriteLock(sb); + + err = 0; + sbinfo = au_sbi(sb); + parent = dget_parent(xino->file->f_dentry); + if (remount) { + skip = 0; + dname = &xino->file->f_dentry->d_name; + cur_xino = sbinfo->si_xib; + if (cur_xino) { + cur_parent = dget_parent(cur_xino->f_dentry); + cur_name = &cur_xino->f_dentry->d_name; + skip = (cur_parent == parent + && dname->len == cur_name->len + && !memcmp(dname->name, cur_name->name, + dname->len)); + dput(cur_parent); + } + if (skip) + goto out; + } + + au_opt_set(sbinfo->si_mntflags, XINO); + dir = parent->d_inode; + mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT); + /* mnt_want_write() is unnecessary here */ + err = au_xino_set_xib(sb, xino->file); + if (!err) + err = au_xigen_set(sb, xino->file); + if (!err) + err = au_xino_set_br(sb, xino->file); + mutex_unlock(&dir->i_mutex); + if (!err) + goto out; /* success */ + + /* reset all */ + AuIOErr("failed creating xino(%d).\n", err); + +out: + dput(parent); + return err; +} + +/* ---------------------------------------------------------------------- */ + +/* + * create a xinofile at the default place/path. + */ +struct file *au_xino_def(struct super_block *sb) +{ + struct file *file; + char *page, *p; + struct au_branch *br; + struct super_block *h_sb; + struct path path; + aufs_bindex_t bend, bindex, bwr; + + br = NULL; + bend = au_sbend(sb); + bwr = -1; + for (bindex = 0; bindex <= bend; bindex++) { + br = au_sbr(sb, bindex); + if (au_br_writable(br->br_perm) + && !au_test_fs_bad_xino(br->br_mnt->mnt_sb)) { + bwr = bindex; + break; + } + } + + if (bwr >= 0) { + file = ERR_PTR(-ENOMEM); + page = __getname_gfp(GFP_NOFS); + if (unlikely(!page)) + goto out; + path.mnt = br->br_mnt; + path.dentry = au_h_dptr(sb->s_root, bwr); + p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME)); + file = (void *)p; + if (!IS_ERR(p)) { + strcat(p, "/" AUFS_XINO_FNAME); + AuDbg("%s\n", p); + file = au_xino_create(sb, p, /*silent*/0); + if (!IS_ERR(file)) + au_xino_brid_set(sb, br->br_id); + } + __putname(page); + } else { + file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0); + if (IS_ERR(file)) + goto out; + h_sb = file->f_dentry->d_sb; + if (unlikely(au_test_fs_bad_xino(h_sb))) { + pr_err("xino doesn't support %s(%s)\n", + AUFS_XINO_DEFPATH, au_sbtype(h_sb)); + fput(file); + file = ERR_PTR(-EINVAL); + } + if (!IS_ERR(file)) + au_xino_brid_set(sb, -1); + } + +out: + return file; +} + +/* ---------------------------------------------------------------------- */ + +int au_xino_path(struct seq_file *seq, struct file *file) +{ + int err; + + err = au_seq_path(seq, &file->f_path); + if (unlikely(err < 0)) + goto out; + + err = 0; +#define Deleted "\\040(deleted)" + seq->count -= sizeof(Deleted) - 1; + AuDebugOn(memcmp(seq->buf + seq->count, Deleted, + sizeof(Deleted) - 1)); +#undef Deleted + +out: + return err; +} diff --git a/ubuntu/include/linux/aufs_type.h b/ubuntu/include/linux/aufs_type.h new file mode 100644 index 000000000000..6738afb001f0 --- /dev/null +++ b/ubuntu/include/linux/aufs_type.h @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2005-2011 Junjiro R. Okajima + * + * This program, aufs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __AUFS_TYPE_H__ +#define __AUFS_TYPE_H__ + +#include +#include +#include +#ifdef __KERNEL__ +#include +#else +#include +#include +#endif + +#define AUFS_VERSION "3.x-rcN-20111205" + +/* todo? move this to linux-2.6.19/include/magic.h */ +#define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') + +/* ---------------------------------------------------------------------- */ + +#ifdef CONFIG_AUFS_BRANCH_MAX_127 +typedef int8_t aufs_bindex_t; +#define AUFS_BRANCH_MAX 127 +#else +typedef int16_t aufs_bindex_t; +#ifdef CONFIG_AUFS_BRANCH_MAX_511 +#define AUFS_BRANCH_MAX 511 +#elif defined(CONFIG_AUFS_BRANCH_MAX_1023) +#define AUFS_BRANCH_MAX 1023 +#elif defined(CONFIG_AUFS_BRANCH_MAX_32767) +#define AUFS_BRANCH_MAX 32767 +#endif +#endif + +#ifdef __KERNEL__ +#ifndef AUFS_BRANCH_MAX +#error unknown CONFIG_AUFS_BRANCH_MAX value +#endif +#endif /* __KERNEL__ */ + +/* ---------------------------------------------------------------------- */ + +#define AUFS_NAME "aufs" +#define AUFS_FSTYPE AUFS_NAME + +#define AUFS_ROOT_INO 2 +#define AUFS_FIRST_INO 11 + +#define AUFS_WH_PFX ".wh." +#define AUFS_WH_PFX_LEN ((int)sizeof(AUFS_WH_PFX) - 1) +#define AUFS_WH_TMP_LEN 4 +/* a limit for rmdir/rename a dir */ +#define AUFS_MAX_NAMELEN (NAME_MAX \ + - AUFS_WH_PFX_LEN * 2 /* doubly whiteouted */\ + - 1 /* dot */\ + - AUFS_WH_TMP_LEN) /* hex */ +#define AUFS_XINO_FNAME "." AUFS_NAME ".xino" +#define AUFS_XINO_DEFPATH "/tmp/" AUFS_XINO_FNAME +#define AUFS_XINO_TRUNC_INIT 64 /* blocks */ +#define AUFS_XINO_TRUNC_STEP 4 /* blocks */ +#define AUFS_DIRWH_DEF 3 +#define AUFS_RDCACHE_DEF 10 /* seconds */ +#define AUFS_RDCACHE_MAX 3600 /* seconds */ +#define AUFS_RDBLK_DEF 512 /* bytes */ +#define AUFS_RDHASH_DEF 32 +#define AUFS_WKQ_NAME AUFS_NAME "d" +#define AUFS_MFS_DEF_SEC 30 /* seconds */ +#define AUFS_MFS_MAX_SEC 3600 /* seconds */ +#define AUFS_PLINK_WARN 100 /* number of plinks */ + +/* pseudo-link maintenace under /proc */ +#define AUFS_PLINK_MAINT_NAME "plink_maint" +#define AUFS_PLINK_MAINT_DIR "fs/" AUFS_NAME +#define AUFS_PLINK_MAINT_PATH AUFS_PLINK_MAINT_DIR "/" AUFS_PLINK_MAINT_NAME + +#define AUFS_DIROPQ_NAME AUFS_WH_PFX ".opq" /* whiteouted doubly */ +#define AUFS_WH_DIROPQ AUFS_WH_PFX AUFS_DIROPQ_NAME + +#define AUFS_BASE_NAME AUFS_WH_PFX AUFS_NAME +#define AUFS_PLINKDIR_NAME AUFS_WH_PFX "plnk" +#define AUFS_ORPHDIR_NAME AUFS_WH_PFX "orph" + +/* doubly whiteouted */ +#define AUFS_WH_BASE AUFS_WH_PFX AUFS_BASE_NAME +#define AUFS_WH_PLINKDIR AUFS_WH_PFX AUFS_PLINKDIR_NAME +#define AUFS_WH_ORPHDIR AUFS_WH_PFX AUFS_ORPHDIR_NAME + +/* branch permissions and attributes */ +#define AUFS_BRPERM_RW "rw" +#define AUFS_BRPERM_RO "ro" +#define AUFS_BRPERM_RR "rr" +#define AUFS_BRRATTR_WH "wh" +#define AUFS_BRWATTR_NLWH "nolwh" + +/* ---------------------------------------------------------------------- */ + +/* ioctl */ +enum { + /* readdir in userspace */ + AuCtl_RDU, + AuCtl_RDU_INO, + + /* pathconf wrapper */ + AuCtl_WBR_FD, + + /* busy inode */ + AuCtl_IBUSY +}; + +/* borrowed from linux/include/linux/kernel.h */ +#ifndef ALIGN +#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) +#define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) +#endif + +/* borrowed from linux/include/linux/compiler-gcc3.h */ +#ifndef __aligned +#define __aligned(x) __attribute__((aligned(x))) +#endif + +#ifdef __KERNEL__ +#ifndef __packed +#define __packed __attribute__((packed)) +#endif +#endif + +struct au_rdu_cookie { + uint64_t h_pos; + int16_t bindex; + uint8_t flags; + uint8_t pad; + uint32_t generation; +} __aligned(8); + +struct au_rdu_ent { + uint64_t ino; + int16_t bindex; + uint8_t type; + uint8_t nlen; + uint8_t wh; + char name[0]; +} __aligned(8); + +static inline int au_rdu_len(int nlen) +{ + /* include the terminating NULL */ + return ALIGN(sizeof(struct au_rdu_ent) + nlen + 1, + sizeof(uint64_t)); +} + +union au_rdu_ent_ul { + struct au_rdu_ent __user *e; + uint64_t ul; +}; + +enum { + AufsCtlRduV_SZ, + AufsCtlRduV_End +}; + +struct aufs_rdu { + /* input */ + union { + uint64_t sz; /* AuCtl_RDU */ + uint64_t nent; /* AuCtl_RDU_INO */ + }; + union au_rdu_ent_ul ent; + uint16_t verify[AufsCtlRduV_End]; + + /* input/output */ + uint32_t blk; + + /* output */ + union au_rdu_ent_ul tail; + /* number of entries which were added in a single call */ + uint64_t rent; + uint8_t full; + uint8_t shwh; + + struct au_rdu_cookie cookie; +} __aligned(8); + +/* ---------------------------------------------------------------------- */ + +struct aufs_wbr_fd { + uint32_t oflags; + int16_t brid; +} __aligned(8); + +/* ---------------------------------------------------------------------- */ + +struct aufs_ibusy { + uint64_t ino, h_ino; + int16_t bindex; +} __aligned(8); + +/* ---------------------------------------------------------------------- */ + +#define AuCtlType 'A' +#define AUFS_CTL_RDU _IOWR(AuCtlType, AuCtl_RDU, struct aufs_rdu) +#define AUFS_CTL_RDU_INO _IOWR(AuCtlType, AuCtl_RDU_INO, struct aufs_rdu) +#define AUFS_CTL_WBR_FD _IOW(AuCtlType, AuCtl_WBR_FD, \ + struct aufs_wbr_fd) +#define AUFS_CTL_IBUSY _IOWR(AuCtlType, AuCtl_IBUSY, struct aufs_ibusy) + +#endif /* __AUFS_TYPE_H__ */ -- cgit v1.2.3 From f40b43d55aeb02157f64cb9e5b9a10fb97b0f31f Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 21 Jan 2011 14:41:48 +0000 Subject: UBUNTU: ubuntu: AUFS -- suppress benign plink warning messages We are getting a lot of bug reports for unexpectedly high plink counts. This message is benign and not worth reporting as a bug. Suppress. BugLink: http://bugs.launchpad.net/bugs/621195 Signed-off-by: Andy Whitcroft --- ubuntu/aufs/plink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ubuntu/aufs/plink.c b/ubuntu/aufs/plink.c index b1888f30c719..1824e6df635e 100644 --- a/ubuntu/aufs/plink.c +++ b/ubuntu/aufs/plink.c @@ -414,8 +414,8 @@ void au_plink_append(struct inode *inode, aufs_bindex_t bindex, spin_unlock(&sbinfo->si_plink.spin); if (!found) { cnt++; - WARN_ONCE(cnt > AUFS_PLINK_WARN, - "unexpectedly many pseudo links, %d\n", cnt); +// WARN_ONCE(cnt > AUFS_PLINK_WARN, +// "unexpectedly many pseudo links, %d\n", cnt); err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex)); } else { do_put_plink(tmp, 0); -- cgit v1.2.3 From a55289e8bdcc5d9d3f162f4f20e72ed85ed1c294 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 1 Jun 2010 14:10:55 +0100 Subject: UBUNTU: ubuntu: AUFS -- enable in config and makefile Signed-off-by: Andy Whitcroft Signed-off-by: Leann Ogasawara --- ubuntu/Kconfig | 1 + ubuntu/Makefile | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/ubuntu/Kconfig b/ubuntu/Kconfig index 042f8ad518b3..faedd0bbc16b 100644 --- a/ubuntu/Kconfig +++ b/ubuntu/Kconfig @@ -16,6 +16,7 @@ source "ubuntu/dm-raid4-5/Kconfig" ## ## ## +source "ubuntu/aufs/Kconfig" ## endmenu diff --git a/ubuntu/Makefile b/ubuntu/Makefile index bad1866141f7..54550aa97286 100644 --- a/ubuntu/Makefile +++ b/ubuntu/Makefile @@ -19,6 +19,10 @@ ## ## ## +obj-$(CONFIG_AUFS_FS) += aufs/ +## +## +## # This is a stupid trick to get kbuild to create ubuntu/built-in.o obj- += foo.o -- cgit v1.2.3 From d97e6fc8993e055bb1fb7e20e6d9f8e6015fe2b2 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 15 Aug 2011 12:01:19 -0700 Subject: UBUNTU: ubuntu: AUFS -- disable in favor of overlayfs Remain disabled while we acertain whether there are any hard requirements for aufs that overlayfs cannot handle. Signed-off-by: Leann Ogasawara --- ubuntu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/Makefile b/ubuntu/Makefile index 54550aa97286..66bbdf69d6f4 100644 --- a/ubuntu/Makefile +++ b/ubuntu/Makefile @@ -19,7 +19,7 @@ ## ## ## -obj-$(CONFIG_AUFS_FS) += aufs/ +#obj-$(CONFIG_AUFS_FS) += aufs/ ## ## ## -- cgit v1.2.3 From 976a2b8e106a871bb94b2235b4ae0ace77c2518b Mon Sep 17 00:00:00 2001 From: Mathieu Trudel-Lapierre Date: Tue, 10 Jan 2012 14:54:34 +0100 Subject: UBUNTU: SAUCE: ipv6: make the net.ipv6.conf.all.use_tempaddr sysctl propagate to interface settings The description for IPV6_PRIVACY mentions using .../all/use_tempaddr to enable IPv6 Privacy Extensions, and IP sysctl documentation mentions 'all' as setting all interface-specific settings. We make sure at least use_tempaddr actually works as documented. Signed-off-by: Mathieu Trudel-Lapierre Signed-off-by: Tim Gardner --- net/ipv6/addrconf.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8f6411c97189..246b533ad5a7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4360,6 +4360,84 @@ int addrconf_sysctl_disable(ctl_table *ctl, int write, return ret; } +#ifdef CONFIG_IPV6_PRIVACY +static void dev_tempaddr_change(struct inet6_dev *idev) +{ + if (!idev || !idev->dev) + return; + + if (!idev->cnf.disable_ipv6) { + /* If ipv6 is enabled, try to bring down and back up the + * interface to get new temporary addresses created + */ + addrconf_notify(NULL, NETDEV_DOWN, idev->dev); + addrconf_notify(NULL, NETDEV_UP, idev->dev); + } +} + +static void addrconf_tempaddr_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.use_tempaddr) ^ (!newf); + idev->cnf.use_tempaddr = newf; + if (changed) + dev_tempaddr_change(idev); + } + } + rcu_read_unlock(); +} + +static int addrconf_use_tempaddr(struct ctl_table *table, int *p, int old) +{ + struct net *net; + + net = (struct net *)table->extra2; + + if (p == &net->ipv6.devconf_dflt->use_tempaddr) + return 0; + + if (!rtnl_trylock()) { + /* Restore the original values before restarting */ + *p = old; + return restart_syscall(); + } + + if (p == &net->ipv6.devconf_all->use_tempaddr) { + __s32 newf = net->ipv6.devconf_all->use_tempaddr; + net->ipv6.devconf_dflt->use_tempaddr = newf; + addrconf_tempaddr_change(net, newf); + } else if ((!*p) ^ (!old)) + dev_tempaddr_change((struct inet6_dev *)table->extra1); + + rtnl_unlock(); + return 0; +} + +static +int addrconf_sysctl_tempaddr(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + int ret; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + if (write) + ret = addrconf_use_tempaddr(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} +#endif + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; @@ -4450,7 +4528,7 @@ static struct addrconf_sysctl_table .data = &ipv6_devconf.use_tempaddr, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = addrconf_sysctl_tempaddr, }, { .procname = "temp_valid_lft", -- cgit v1.2.3 From 3dfa0885e1f71469ea369a6a7f2acf8439e15662 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 16 Jan 2012 10:17:56 +0000 Subject: UBUNTU: ensure debian/ is not excluded from git by default Signed-off-by: Andy Whitcroft --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 57af07cf7e68..268fbfedc80b 100644 --- a/.gitignore +++ b/.gitignore @@ -48,7 +48,7 @@ modules.builtin # # Debian directory (make deb-pkg) # -/debian/ +#/debian/ # # git files that we don't want to ignore even it they are dot-files -- cgit v1.2.3 From f67e69ddf0c09e81e87b7010e71853921bed41de Mon Sep 17 00:00:00 2001 From: Manoj Iyer Date: Mon, 30 Jan 2012 08:54:09 -0800 Subject: UBUNTU: SAUCE: Bluetooth: Add support for BCM20702A0 [0a5c:21e1] Add vendor specific ID for BCM20702A0. usb-devices: T: Bus=02 Lev=02 Prnt=02 Port=05 Cnt=01 Dev#= 4 Spd=12 MxCh= 0 D: Ver= 2.00 Cls=ff(vend.) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0a5c ProdID=21e1 Rev=01.12 S: Manufacturer=Broadcom Corp S: Product=BCM20702A0 S: SerialNumber=60D819F03A6D C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=0mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=01 Prot=01 Driver=btusb I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=01 Prot=01 Driver=btusb I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#= 3 Alt= 0 #EPs= 0 Cls=fe(app. ) Sub=01 Prot=01 Driver=(none) BugLink: http://bugs.launchpad.net/bugs/906832 Signed-off-by: Manoj Iyer Signed-off-by: James M. Leddy Acked-by: Tim Gardner Acked-by: Seth Forshee Signed-off-by: Leann Ogasawara --- drivers/bluetooth/btusb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index c9463af8e564..d97afc12ccd2 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -102,6 +102,7 @@ static struct usb_device_id btusb_table[] = { /* Broadcom BCM20702A0 */ { USB_DEVICE(0x0489, 0xe042) }, + { USB_DEVICE(0x0a5c, 0x21e1) }, { USB_DEVICE(0x0a5c, 0x21e3) }, { USB_DEVICE(0x0a5c, 0x21e6) }, { USB_DEVICE(0x0a5c, 0x21e8) }, -- cgit v1.2.3 From 46d1d6f09f028f4333a24bef413b1d672858e0fb Mon Sep 17 00:00:00 2001 From: "James M. Leddy" Date: Fri, 27 Jan 2012 18:42:38 -0500 Subject: UBUNTU: SAUCE: Bluetooth: Add support for BCM20702A0 [0a5c:21e6] Add another vendor specific ID for BCM20702A0. output of usb-devices: T: Bus=01 Lev=02 Prnt=02 Port=03 Cnt=04 Dev#= 6 Spd=12 MxCh= 0 D: Ver= 2.00 Cls=ff(vend.) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0a5c ProdID=21e6 Rev=01.12 S: Manufacturer=Broadcom Corp S: Product=BCM20702A0 S: SerialNumber=D0DF9AFB227B C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=0mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=01 Prot=01 Driver=(none) I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=01 Prot=01 Driver=(none) I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#= 3 Alt= 0 #EPs= 0 Cls=fe(app. ) Sub=01 Prot=01 Driver=(none) BugLink: http://bugs.launchpad.net/bugs/906832 Signed-off-by: James M. Leddy Acked-by: Tim Gardner Acked-by: Seth Forshee Signed-off-by: Leann Ogasawara --- drivers/bluetooth/btusb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index d97afc12ccd2..88bcbade13be 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -107,6 +107,7 @@ static struct usb_device_id btusb_table[] = { { USB_DEVICE(0x0a5c, 0x21e6) }, { USB_DEVICE(0x0a5c, 0x21e8) }, { USB_DEVICE(0x0a5c, 0x21f3) }, + { USB_DEVICE(0x0a5c, 0x21e6) }, { USB_DEVICE(0x413c, 0x8197) }, /* Foxconn - Hon Hai */ -- cgit v1.2.3 From 5d3f922c7329a04f8b55100d179d9542a9ddc897 Mon Sep 17 00:00:00 2001 From: Manoj Iyer Date: Thu, 2 Feb 2012 10:42:28 -0600 Subject: UBUNTU: SAUCE: Add vendor specific ID (0a5c 21f3) for BCM20702A0. https://lkml.org/lkml/2012/2/2/220 T: Bus=01 Lev=02 Prnt=02 Port=03 Cnt=03 Dev#= 5 Spd=12 MxCh= 0 D: Ver= 2.00 Cls=ff(vend.) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0a5c ProdID=21f3 Rev=01.12 S: Manufacturer=Broadcom Corp S: Product=BCM20702A0 S: SerialNumber=74DE2B344A7B C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=0mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=01 Prot=01 Driver=(none) I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=01 Prot=01 Driver=(none) I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#= 3 Alt= 0 #EPs= 0 Cls=fe(app. ) Sub=01 Prot=01 Driver=(none) BugLink: http://bugs.launchpad.net/bugs/925552 Signed-off-by: Manoj Iyer Tested-by: Dennis Chua Signed-off-by: Leann Ogasawara --- drivers/bluetooth/btusb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 88bcbade13be..e9611d2cb440 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -108,6 +108,7 @@ static struct usb_device_id btusb_table[] = { { USB_DEVICE(0x0a5c, 0x21e8) }, { USB_DEVICE(0x0a5c, 0x21f3) }, { USB_DEVICE(0x0a5c, 0x21e6) }, + { USB_DEVICE(0x0a5c, 0x21f3) }, { USB_DEVICE(0x413c, 0x8197) }, /* Foxconn - Hon Hai */ -- cgit v1.2.3 From e8412dbd7aa1e409c401650b1f31082d2f29cdc0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 30 Nov 2011 14:20:13 -0800 Subject: UBUNTU: SAUCE: Yama: add link restrictions Add symlink and hardlink restrictions that have shown real-world security benefits, along with sysctl knobs to control them. Signed-off-by: Kees Cook Signed-off-by: Tim Gardner --- Documentation/security/Yama.txt | 44 +++++++++++++ security/yama/Kconfig | 5 +- security/yama/yama_lsm.c | 138 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 185 insertions(+), 2 deletions(-) diff --git a/Documentation/security/Yama.txt b/Documentation/security/Yama.txt index e369de2d48cd..b72884a0a76f 100644 --- a/Documentation/security/Yama.txt +++ b/Documentation/security/Yama.txt @@ -5,10 +5,54 @@ any other LSM). Yama is controlled through sysctl in /proc/sys/kernel/yama: +- protected_sticky_symlinks +- protected_nonaccess_hardlinks - ptrace_scope ============================================================== +protected_sticky_symlinks: + +A long-standing class of security issues is the symlink-based +time-of-check-time-of-use race, most commonly seen in world-writable +directories like /tmp. The common method of exploitation of this flaw +is to cross privilege boundaries when following a given symlink (i.e. a +root process follows a symlink belonging to another user). For a likely +incomplete list of hundreds of examples across the years, please see: +http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp + +When set to "0", symlink following behavior is unrestricted. + +When set to "1" symlinks are permitted to be followed only when outside +a sticky world-writable directory, or when the uid of the symlink and +follower match, or when the directory owner matches the symlink's owner. + +This protection is based on the restrictions in Openwall and grsecurity. + +============================================================== + +protected_nonaccess_hardlinks: + +Hardlinks can be abused in a similar fashion to symlinks in sticky +world-writable directories, but their weakness is not limited to +just that scenario. For example, if /etc and /home are on the same +partition, a regular user can create a hardlink to /etc/shadow in their +home directory. While it retains the original owner and permissions, +it is possible for privileged programs that are otherwise symlink-safe +to mistakenly access the file through its hardlink. Additionally, a very +minor untraceable quota-bypassing local denial of service is possible by +an attacker exhausting disk space by filling a world-writable directory +with hardlinks. + +When set to "0", hardlink creation behavior is unrestricted. + +When set to "1", hardlinks cannot be created to files that a given user +would be unable to read and write originally, or are otherwise sensitive. + +This protection is based on the restrictions in Openwall and grsecurity. + +============================================================== + ptrace_scope: As Linux grows in popularity, it will become a larger target for diff --git a/security/yama/Kconfig b/security/yama/Kconfig index 51d6709d8bbd..1a5d1c1a7db4 100644 --- a/security/yama/Kconfig +++ b/security/yama/Kconfig @@ -7,7 +7,8 @@ config SECURITY_YAMA help This selects Yama, which extends DAC support with additional system-wide security settings beyond regular Linux discretionary - access controls. Currently available is ptrace scope restriction. - Further information can be found in Documentation/security/Yama.txt. + access controls. Currently available are symlink, hardlink, and + ptrace scope restrictions. Further information can be found in + Documentation/security/Yama.txt. If you are unsure how to answer this question, answer N. diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 83554ee8a587..50c72221d131 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 @@ -24,6 +27,8 @@ #define YAMA_SCOPE_NO_ATTACH 3 static int ptrace_scope = YAMA_SCOPE_RELATIONAL; +static int protected_sticky_symlinks = 1; +static int protected_nonaccess_hardlinks = 1; /* describe a ptrace relationship for potential exception */ struct ptrace_relation { @@ -290,10 +295,125 @@ static int yama_ptrace_access_check(struct task_struct *child, return rc; } +/** + * yama_inode_follow_link - check for symlinks in sticky world-writeable dirs + * @dentry: The inode/dentry of the symlink + * @nameidata: The path data of the symlink + * + * In the case of the protected_sticky_symlinks sysctl being enabled, + * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is + * in a sticky world-writable directory. This is to protect privileged + * processes from failing races against path names that may change out + * from under them by way of other users creating malicious symlinks. + * It will permit symlinks to only be followed when outside a sticky + * world-writable directory, or when the uid of the symlink and follower + * match, or when the directory owner matches the symlink's owner. + * + * Returns 0 if following the symlink is allowed, -ve on error. + */ +static int yama_inode_follow_link(struct dentry *dentry, + struct nameidata *nameidata) +{ + int rc = 0; + const struct inode *parent; + const struct inode *inode; + const struct cred *cred; + + if (!protected_sticky_symlinks) + return 0; + + /* if inode isn't a symlink, don't try to evaluate blocking it */ + inode = dentry->d_inode; + if (!S_ISLNK(inode->i_mode)) + return 0; + + /* owner and follower match? */ + cred = current_cred(); + if (cred->fsuid == inode->i_uid) + return 0; + + /* check parent directory mode and owner */ + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + if ((parent->i_mode & (S_ISVTX|S_IWOTH)) == (S_ISVTX|S_IWOTH) && + parent->i_uid != inode->i_uid) { + rc = -EACCES; + } + spin_unlock(&dentry->d_lock); + + if (rc) { + char name[sizeof(current->comm)]; + printk_ratelimited(KERN_NOTICE "non-matching-uid symlink " + "following attempted in sticky world-writable " + "directory by %s (fsuid %d != %d)\n", + get_task_comm(name, current), + cred->fsuid, inode->i_uid); + } + + return rc; +} + +static int yama_generic_permission(struct inode *inode, int mask) +{ + int retval; + + if (inode->i_op->permission) + retval = inode->i_op->permission(inode, mask); + else + retval = generic_permission(inode, mask); + return retval; +} + +/** + * yama_path_link - verify that hardlinking is allowed + * @old_dentry: the source inode/dentry to hardlink from + * @new_dir: target directory + * @new_dentry: the target inode/dentry to hardlink to + * + * Block hardlink when all of: + * - fsuid does not match inode + * - not CAP_FOWNER + * - and at least one of: + * - inode is not a regular file + * - inode is setuid + * - inode is setgid and group-exec + * - access failure for read and write + * + * Returns 0 if successful, -ve on error. + */ +static int yama_path_link(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry) +{ + int rc = 0; + struct inode *inode = old_dentry->d_inode; + const int mode = inode->i_mode; + const struct cred *cred = current_cred(); + + if (!protected_nonaccess_hardlinks) + return 0; + + if (cred->fsuid != inode->i_uid && + (!S_ISREG(mode) || (mode & S_ISUID) || + ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) || + (yama_generic_permission(inode, MAY_READ | MAY_WRITE))) && + !capable(CAP_FOWNER)) { + char name[sizeof(current->comm)]; + printk_ratelimited(KERN_NOTICE "non-accessible hardlink" + " creation was attempted by: %s (fsuid %d)\n", + get_task_comm(name, current), + cred->fsuid); + rc = -EPERM; + } + + return rc; +} + static struct security_operations yama_ops = { .name = "yama", .ptrace_access_check = yama_ptrace_access_check, + .inode_follow_link = yama_inode_follow_link, + .path_link = yama_path_link, .task_prctl = yama_task_prctl, .task_free = yama_task_free, }; @@ -328,6 +448,24 @@ struct ctl_path yama_sysctl_path[] = { }; static struct ctl_table yama_sysctl_table[] = { + { + .procname = "protected_sticky_symlinks", + .data = &protected_sticky_symlinks, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &max_scope, + }, + { + .procname = "protected_nonaccess_hardlinks", + .data = &protected_nonaccess_hardlinks, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &max_scope, + }, { .procname = "ptrace_scope", .data = &ptrace_scope, -- cgit v1.2.3 From b435a06a8ba4a35d4c166fd257fba32535f36909 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 29 Jun 2010 11:07:44 -0700 Subject: UBUNTU: SAUCE: security: unconditionally chain to Yama LSM This patch forces the LSM to always chain through the Yama LSM regardless of which LSM is selected as the primary LSM. This is not intended for upstream. This is, however, what Ubuntu and ChromeOS are doing. Signed-off-by: Kees Cook Signed-off-by: Tim Gardner --- include/linux/security.h | 42 ++++++++++++++++++++++++++++++++++++++++++ security/security.c | 17 +++++++++++++++++ security/yama/yama_lsm.c | 26 +++++--------------------- 3 files changed, 64 insertions(+), 21 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 4e5a73cdbbef..6ae2a7f42cac 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -3023,5 +3023,47 @@ static inline void free_secdata(void *secdata) { } #endif /* CONFIG_SECURITY */ +#ifdef CONFIG_SECURITY_YAMA +extern int yama_ptrace_access_check(struct task_struct *child, + unsigned int mode); +extern int yama_path_link(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry); +extern int yama_inode_follow_link(struct dentry *dentry, + struct nameidata *nameidata); +extern void yama_task_free(struct task_struct *task); +extern int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5); +#else +static inline int yama_ptrace_access_check(struct task_struct *child, + unsigned int mode) +{ + return 0; +} + +static inline int yama_path_link(struct dentry *old_dentry, + struct path *new_dir, + struct dentry *new_dentry) +{ + return 0; +} + +static inline int yama_inode_follow_link(struct dentry *dentry, + struct nameidata *nameidata) +{ + return 0; +} + +static inline void yama_task_free(struct task_struct *task) +{ +} + +static inline int yama_task_prctl(int option, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5) +{ + return -ENOSYS; +} +#endif /* CONFIG_SECURITY_YAMA */ + #endif /* ! __LINUX_SECURITY_H */ diff --git a/security/security.c b/security/security.c index c7afd35aa2b0..7fb3683fdad9 100644 --- a/security/security.c +++ b/security/security.c @@ -135,6 +135,10 @@ int __init register_security(struct security_operations *ops) int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { + int rc; + rc = yama_ptrace_access_check(child, mode); + if (rc) + return rc; return security_ops->ptrace_access_check(child, mode); } @@ -405,8 +409,12 @@ EXPORT_SYMBOL(security_path_symlink); int security_path_link(struct dentry *old_dentry, struct path *new_dir, struct dentry *new_dentry) { + int rc; if (unlikely(IS_PRIVATE(old_dentry->d_inode))) return 0; + rc = yama_path_link(old_dentry, new_dir, new_dentry); + if (rc) + return rc; return security_ops->path_link(old_dentry, new_dir, new_dentry); } EXPORT_SYMBOL(security_path_link); @@ -525,8 +533,12 @@ EXPORT_SYMBOL(security_inode_readlink); int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd) { + int rc; if (unlikely(IS_PRIVATE(dentry->d_inode))) return 0; + rc = yama_inode_follow_link(dentry, nd); + if (rc) + return rc; return security_ops->inode_follow_link(dentry, nd); } @@ -770,6 +782,7 @@ int security_task_create(unsigned long clone_flags) void security_task_free(struct task_struct *task) { + yama_task_free(task); security_ops->task_free(task); } @@ -885,6 +898,10 @@ int security_task_wait(struct task_struct *p) int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { + int rc; + rc = yama_task_prctl(option, arg2, arg3, arg4, arg5); + if (rc != -ENOSYS) + return rc; return security_ops->task_prctl(option, arg2, arg3, arg4, arg5); } diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 50c72221d131..8694b0e3f8e1 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -105,7 +105,7 @@ static void yama_ptracer_del(struct task_struct *tracer, * yama_task_free - check for task_pid to remove from exception list * @task: task being removed */ -static void yama_task_free(struct task_struct *task) +void yama_task_free(struct task_struct *task) { yama_ptracer_del(task, task); } @@ -121,7 +121,7 @@ static void yama_task_free(struct task_struct *task) * Return 0 on success, -ve on error. -ENOSYS is returned when Yama * does not handle the given option. */ -static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, +int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int rc; @@ -248,7 +248,7 @@ static int ptracer_exception_found(struct task_struct *tracer, * * Returns 0 if following the ptrace is allowed, -ve on error. */ -static int yama_ptrace_access_check(struct task_struct *child, +int yama_ptrace_access_check(struct task_struct *child, unsigned int mode) { int rc; @@ -311,7 +311,7 @@ static int yama_ptrace_access_check(struct task_struct *child, * * Returns 0 if following the symlink is allowed, -ve on error. */ -static int yama_inode_follow_link(struct dentry *dentry, +int yama_inode_follow_link(struct dentry *dentry, struct nameidata *nameidata) { int rc = 0; @@ -381,7 +381,7 @@ static int yama_generic_permission(struct inode *inode, int mask) * * Returns 0 if successful, -ve on error. */ -static int yama_path_link(struct dentry *old_dentry, struct path *new_dir, +int yama_path_link(struct dentry *old_dentry, struct path *new_dir, struct dentry *new_dentry) { int rc = 0; @@ -408,16 +408,6 @@ static int yama_path_link(struct dentry *old_dentry, struct path *new_dir, return rc; } -static struct security_operations yama_ops = { - .name = "yama", - - .ptrace_access_check = yama_ptrace_access_check, - .inode_follow_link = yama_inode_follow_link, - .path_link = yama_path_link, - .task_prctl = yama_task_prctl, - .task_free = yama_task_free, -}; - #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -481,14 +471,8 @@ static struct ctl_table yama_sysctl_table[] = { static __init int yama_init(void) { - if (!security_module_enable(&yama_ops)) - return 0; - printk(KERN_INFO "Yama: becoming mindful.\n"); - if (register_security(&yama_ops)) - panic("Yama: kernel registration failed.\n"); - #ifdef CONFIG_SYSCTL if (!register_sysctl_paths(yama_sysctl_path, yama_sysctl_table)) panic("Yama: sysctl registration failed.\n"); -- cgit v1.2.3 From c773515e6c07e8585e4d99130609857a84bfc614 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 16 Feb 2012 18:27:25 +0000 Subject: UBUNTU: ubuntu: AUFS -- adapt to the new changelog handling Signed-off-by: Andy Whitcroft --- ubuntu/aufs-update | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) mode change 100644 => 100755 ubuntu/aufs-update diff --git a/ubuntu/aufs-update b/ubuntu/aufs-update old mode 100644 new mode 100755 index 5b4b5a6daef0..796ae1d471a9 --- a/ubuntu/aufs-update +++ b/ubuntu/aufs-update @@ -12,8 +12,8 @@ aufs="$1" # Get the current tip name { read x url - read x o_tip_sha1 read x o_log_sha1 + read x o_tip_sha1 } Date: Thu, 16 Feb 2012 20:07:29 +0000 Subject: UBUNTU: ubuntu: AUFS -- sort out the relative header paths Signed-off-by: Andy Whitcroft --- ubuntu/aufs-update | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ubuntu/aufs-update b/ubuntu/aufs-update index 796ae1d471a9..a31cf944d7ff 100755 --- a/ubuntu/aufs-update +++ b/ubuntu/aufs-update @@ -29,12 +29,16 @@ git checkout -f HEAD -- aufs/BOM aufs/BOM.UPDATING # Reinsert the include update. #sed -i -e '1iEXTRA_CFLAGS += -I$(src)/include' aufs/Makefile +# Fix the relative patch references ... +sed -i -e 's@/include/@/ubuntu/include/@g;' aufs/Makefile + ## # Find the latest commit in the ChangeLog. ## read x n_log_sha1 <"$aufs/ChangeLog" # Insert the new commit ID and commit the result. sed -i -e "s/^COMMIT: .*/COMMIT: $n_tip_sha1/" aufs/BOM sed -i -e "s/^CHANGELOG: .*/CHANGELOG: $n_log_sha1/" aufs/BOM + git add aufs include/linux { echo "UBUNTU: ubuntu: AUFS -- update to $n_tip_sha1" -- cgit v1.2.3 From 6a53f22279171a8d5b7c5da1d2ef17df176771eb Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 16 Feb 2012 20:08:20 +0000 Subject: UBUNTU: ubuntu: AUFS -- update to d266b0c5d0693d6383976ee54b9e2c0fa9a3f5b0 Andy Whitcroft (1): UBUNTU: ubuntu: AUFS -- suppress benign plink warning messages J. R. Okajima (10): aufs: headers 1/2, bugfix, where the pr_fmt macro definition aufs: headers 2/2, simply refined aufs: tiny, update the year aufs: update the donator aufs stdalone: include path in Makefile aufs: tiny, update the year aufs: tiny, remove a duplicated header by accident aufs: tiny, restore the removed header files for 2.6.38 make aufs-version 3.2 aufs3.2 20120109 Signed-off-by: Andy Whitcroft --- ubuntu/aufs/BOM | 2 +- ubuntu/aufs/Makefile | 8 ++++++-- ubuntu/aufs/aufs.h | 2 +- ubuntu/aufs/branch.c | 3 +-- ubuntu/aufs/branch.h | 4 +--- ubuntu/aufs/cpup.c | 4 +--- ubuntu/aufs/cpup.h | 4 +--- ubuntu/aufs/dbgaufs.c | 2 +- ubuntu/aufs/dbgaufs.h | 5 +---- ubuntu/aufs/dcsub.c | 2 +- ubuntu/aufs/dcsub.h | 3 +-- ubuntu/aufs/debug.c | 3 +-- ubuntu/aufs/debug.h | 11 +---------- ubuntu/aufs/dentry.c | 2 +- ubuntu/aufs/dentry.h | 3 +-- ubuntu/aufs/dinfo.c | 2 +- ubuntu/aufs/dir.c | 3 +-- ubuntu/aufs/dir.h | 3 +-- ubuntu/aufs/dynop.c | 2 +- ubuntu/aufs/dynop.h | 6 +----- ubuntu/aufs/export.c | 3 +-- ubuntu/aufs/f_op.c | 4 +--- ubuntu/aufs/f_op_sp.c | 3 +-- ubuntu/aufs/file.c | 5 +---- ubuntu/aufs/file.h | 3 +-- ubuntu/aufs/finfo.c | 3 +-- ubuntu/aufs/fstype.h | 3 +-- ubuntu/aufs/hfsnotify.c | 2 +- ubuntu/aufs/hfsplus.c | 3 +-- ubuntu/aufs/hnotify.c | 2 +- ubuntu/aufs/i_op.c | 4 +--- ubuntu/aufs/i_op_add.c | 2 +- ubuntu/aufs/i_op_del.c | 2 +- ubuntu/aufs/i_op_ren.c | 2 +- ubuntu/aufs/iinfo.c | 2 +- ubuntu/aufs/inode.c | 2 +- ubuntu/aufs/inode.h | 4 +--- ubuntu/aufs/ioctl.c | 3 +-- ubuntu/aufs/loop.c | 2 +- ubuntu/aufs/loop.h | 2 +- ubuntu/aufs/module.c | 2 +- ubuntu/aufs/module.h | 2 +- ubuntu/aufs/opts.c | 4 +--- ubuntu/aufs/opts.h | 3 +-- ubuntu/aufs/plink.c | 2 +- ubuntu/aufs/poll.c | 2 +- ubuntu/aufs/procfs.c | 2 +- ubuntu/aufs/rdu.c | 4 +--- ubuntu/aufs/rwsem.h | 3 +-- ubuntu/aufs/sbinfo.c | 3 +-- ubuntu/aufs/spl.h | 6 +----- ubuntu/aufs/super.c | 5 ++--- ubuntu/aufs/super.h | 3 +-- ubuntu/aufs/sysaufs.c | 4 +--- ubuntu/aufs/sysaufs.h | 3 +-- ubuntu/aufs/sysfs.c | 5 +---- ubuntu/aufs/sysrq.c | 5 +---- ubuntu/aufs/vdir.c | 3 +-- ubuntu/aufs/vfsub.c | 4 +--- ubuntu/aufs/vfsub.h | 2 +- ubuntu/aufs/wbr_policy.c | 2 +- ubuntu/aufs/whout.c | 3 +-- ubuntu/aufs/whout.h | 3 +-- ubuntu/aufs/wkq.c | 2 +- ubuntu/aufs/wkq.h | 6 +----- ubuntu/aufs/xino.c | 4 +--- ubuntu/include/linux/aufs_type.h | 25 +++++++++++++++++-------- 67 files changed, 89 insertions(+), 153 deletions(-) diff --git a/ubuntu/aufs/BOM b/ubuntu/aufs/BOM index 04dc856855fb..f549f2e3ea1b 100644 --- a/ubuntu/aufs/BOM +++ b/ubuntu/aufs/BOM @@ -1,3 +1,3 @@ URL: git://aufs.git.sourceforge.net/gitroot/aufs/aufs3-standalone.git CHANGELOG: -COMMIT: 4cf5db36bcd9748e8e7270022f295f84d1fc2245 +COMMIT: d266b0c5d0693d6383976ee54b9e2c0fa9a3f5b0 diff --git a/ubuntu/aufs/Makefile b/ubuntu/aufs/Makefile index 46615fe113e6..7e60943ffd4b 100644 --- a/ubuntu/aufs/Makefile +++ b/ubuntu/aufs/Makefile @@ -8,8 +8,12 @@ endif # cf. include/linux/kernel.h # enable pr_debug ccflags-y += -DDEBUG -# sparse doesn't allow spaces -ccflags-y += -D'pr_fmt(fmt)=AUFS_NAME"\040%s:%d:%s[%d]:\040"fmt,__func__,__LINE__,current->comm,current->pid' +# sparse requires the full pathname +ifdef M +ccflags-y += -include ${M}/../../ubuntu/include/linux/aufs_type.h +else +ccflags-y += -include ${srctree}/ubuntu/include/linux/aufs_type.h +endif obj-$(CONFIG_AUFS_FS) += aufs.o aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \ diff --git a/ubuntu/aufs/aufs.h b/ubuntu/aufs/aufs.h index 3fd15c26ff41..33963a85a68c 100644 --- a/ubuntu/aufs/aufs.h +++ b/ubuntu/aufs/aufs.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/branch.c b/ubuntu/aufs/branch.c index dccc7409931d..4cfe9e28e2a7 100644 --- a/ubuntu/aufs/branch.c +++ b/ubuntu/aufs/branch.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,7 +21,6 @@ */ #include -#include #include #include "aufs.h" diff --git a/ubuntu/aufs/branch.h b/ubuntu/aufs/branch.h index 1f39ad6c831f..3870bc3f4bd6 100644 --- a/ubuntu/aufs/branch.h +++ b/ubuntu/aufs/branch.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,9 +25,7 @@ #ifdef __KERNEL__ -#include #include -#include #include "dynop.h" #include "rwsem.h" #include "super.h" diff --git a/ubuntu/aufs/cpup.c b/ubuntu/aufs/cpup.c index 13fc30490674..7bf2672a9cbc 100644 --- a/ubuntu/aufs/cpup.c +++ b/ubuntu/aufs/cpup.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,10 +20,8 @@ * copy-up functions, see wbr_policy.c for copy-down */ -#include #include #include -#include #include "aufs.h" void au_cpup_attr_flags(struct inode *dst, struct inode *src) diff --git a/ubuntu/aufs/cpup.h b/ubuntu/aufs/cpup.h index 8e0aa591c626..2e4fbe079a25 100644 --- a/ubuntu/aufs/cpup.h +++ b/ubuntu/aufs/cpup.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,8 +26,6 @@ #ifdef __KERNEL__ #include -#include -#include struct inode; struct file; diff --git a/ubuntu/aufs/dbgaufs.c b/ubuntu/aufs/dbgaufs.c index 26d1d03ae0e6..d1dbfd2655ec 100644 --- a/ubuntu/aufs/dbgaufs.c +++ b/ubuntu/aufs/dbgaufs.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/dbgaufs.h b/ubuntu/aufs/dbgaufs.h index 2eaa3f021bc9..8fe49742e558 100644 --- a/ubuntu/aufs/dbgaufs.h +++ b/ubuntu/aufs/dbgaufs.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,9 +25,6 @@ #ifdef __KERNEL__ -#include -#include - struct super_block; struct au_sbinfo; diff --git a/ubuntu/aufs/dcsub.c b/ubuntu/aufs/dcsub.c index 9d8cf10938f7..5e8321eca3d4 100644 --- a/ubuntu/aufs/dcsub.c +++ b/ubuntu/aufs/dcsub.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/dcsub.h b/ubuntu/aufs/dcsub.h index 1211304afe87..3d10731e2769 100644 --- a/ubuntu/aufs/dcsub.h +++ b/ubuntu/aufs/dcsub.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -27,7 +27,6 @@ #include #include -#include struct dentry; diff --git a/ubuntu/aufs/debug.c b/ubuntu/aufs/debug.c index 040a4722a2ae..9c9091d290a3 100644 --- a/ubuntu/aufs/debug.c +++ b/ubuntu/aufs/debug.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,7 +20,6 @@ * debug print functions */ -#include #include #include "aufs.h" diff --git a/ubuntu/aufs/debug.h b/ubuntu/aufs/debug.h index 453459e664d9..c4d5610a7f5c 100644 --- a/ubuntu/aufs/debug.h +++ b/ubuntu/aufs/debug.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,18 +26,9 @@ #ifdef __KERNEL__ #include -#include -/* #include */ -#include #include #include -/* #include */ -#include -/* #include */ #include -#include - -#include #ifdef CONFIG_AUFS_DEBUG #define AuDebugOn(a) BUG_ON(a) diff --git a/ubuntu/aufs/dentry.c b/ubuntu/aufs/dentry.c index 176fbdcd6ec8..11ed8c25faf1 100644 --- a/ubuntu/aufs/dentry.c +++ b/ubuntu/aufs/dentry.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/dentry.h b/ubuntu/aufs/dentry.h index 4f0827583c4a..bbd9dc643efd 100644 --- a/ubuntu/aufs/dentry.h +++ b/ubuntu/aufs/dentry.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,7 +26,6 @@ #ifdef __KERNEL__ #include -#include #include "rwsem.h" struct au_hdentry { diff --git a/ubuntu/aufs/dinfo.c b/ubuntu/aufs/dinfo.c index a2a6809ca7cf..24f22a91d21e 100644 --- a/ubuntu/aufs/dinfo.c +++ b/ubuntu/aufs/dinfo.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/dir.c b/ubuntu/aufs/dir.c index 2743616d97ad..ec5a236b46d2 100644 --- a/ubuntu/aufs/dir.c +++ b/ubuntu/aufs/dir.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,7 +20,6 @@ * directory operations */ -#include #include #include "aufs.h" diff --git a/ubuntu/aufs/dir.h b/ubuntu/aufs/dir.h index 2d0d8d2b1518..dc52a6325c33 100644 --- a/ubuntu/aufs/dir.h +++ b/ubuntu/aufs/dir.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,7 +26,6 @@ #ifdef __KERNEL__ #include -#include /* ---------------------------------------------------------------------- */ diff --git a/ubuntu/aufs/dynop.c b/ubuntu/aufs/dynop.c index 9b874c99c194..2aedc11fa079 100644 --- a/ubuntu/aufs/dynop.c +++ b/ubuntu/aufs/dynop.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2011 Junjiro R. Okajima + * Copyright (C) 2010-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/dynop.h b/ubuntu/aufs/dynop.h index c3217d433153..029b08cff025 100644 --- a/ubuntu/aufs/dynop.h +++ b/ubuntu/aufs/dynop.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2011 Junjiro R. Okajima + * Copyright (C) 2010-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,10 +25,6 @@ #ifdef __KERNEL__ -#include -#include -#include -#include #include "inode.h" enum {AuDy_AOP, AuDyLast}; diff --git a/ubuntu/aufs/export.c b/ubuntu/aufs/export.c index 05656fd1cf31..962d8e04f9e0 100644 --- a/ubuntu/aufs/export.c +++ b/ubuntu/aufs/export.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,7 +21,6 @@ */ #include -#include #include #include #include diff --git a/ubuntu/aufs/f_op.c b/ubuntu/aufs/f_op.c index 43b5bca91887..942e71a7781c 100644 --- a/ubuntu/aufs/f_op.c +++ b/ubuntu/aufs/f_op.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,10 +20,8 @@ * file and vm operations */ -#include #include #include -#include #include #include "aufs.h" diff --git a/ubuntu/aufs/f_op_sp.c b/ubuntu/aufs/f_op_sp.c index 4f5cde800d40..48b8aa18af27 100644 --- a/ubuntu/aufs/f_op_sp.c +++ b/ubuntu/aufs/f_op_sp.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,7 +22,6 @@ * their file I/O is handled out of aufs. */ -#include #include "aufs.h" static ssize_t aufs_aio_read_sp(struct kiocb *kio, const struct iovec *iov, diff --git a/ubuntu/aufs/file.c b/ubuntu/aufs/file.c index b798891953f6..0b9aa487f012 100644 --- a/ubuntu/aufs/file.c +++ b/ubuntu/aufs/file.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,9 +20,6 @@ * handling file/dir, and address_space operation */ -#include -#include -#include #include #include "aufs.h" diff --git a/ubuntu/aufs/file.h b/ubuntu/aufs/file.h index 831dc5d252a0..8bd6e3bab1a0 100644 --- a/ubuntu/aufs/file.h +++ b/ubuntu/aufs/file.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -28,7 +28,6 @@ #include #include #include -#include #include "rwsem.h" struct au_branch; diff --git a/ubuntu/aufs/finfo.c b/ubuntu/aufs/finfo.c index 3a8fa30d4965..3208510c5ee9 100644 --- a/ubuntu/aufs/finfo.c +++ b/ubuntu/aufs/finfo.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,7 +20,6 @@ * file private data */ -#include #include "aufs.h" void au_hfput(struct au_hfile *hf, struct file *file) diff --git a/ubuntu/aufs/fstype.h b/ubuntu/aufs/fstype.h index 3bdbd143a824..a6b1037eeeeb 100644 --- a/ubuntu/aufs/fstype.h +++ b/ubuntu/aufs/fstype.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -28,7 +28,6 @@ #include #include #include -#include static inline int au_test_aufs(struct super_block *sb) { diff --git a/ubuntu/aufs/hfsnotify.c b/ubuntu/aufs/hfsnotify.c index deb0ab4a0ae6..c48c315ac3e2 100644 --- a/ubuntu/aufs/hfsnotify.c +++ b/ubuntu/aufs/hfsnotify.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/hfsplus.c b/ubuntu/aufs/hfsplus.c index 678b69c69684..9941f76083c5 100644 --- a/ubuntu/aufs/hfsplus.c +++ b/ubuntu/aufs/hfsplus.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2011 Junjiro R. Okajima + * Copyright (C) 2010-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,7 +26,6 @@ * and au_h_open_post() after releasing it. */ -#include #include "aufs.h" struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex) diff --git a/ubuntu/aufs/hnotify.c b/ubuntu/aufs/hnotify.c index 5ec653c1cf3c..251588c4c8e5 100644 --- a/ubuntu/aufs/hnotify.c +++ b/ubuntu/aufs/hnotify.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/i_op.c b/ubuntu/aufs/i_op.c index b99e7da0da6e..fb0e646474e7 100644 --- a/ubuntu/aufs/i_op.c +++ b/ubuntu/aufs/i_op.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,10 +22,8 @@ #include #include -#include #include #include -#include #include "aufs.h" static int h_permission(struct inode *h_inode, int mask, diff --git a/ubuntu/aufs/i_op_add.c b/ubuntu/aufs/i_op_add.c index d1aef9386ad9..c5255c6a7b9d 100644 --- a/ubuntu/aufs/i_op_add.c +++ b/ubuntu/aufs/i_op_add.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/i_op_del.c b/ubuntu/aufs/i_op_del.c index 309d936f7432..e69c151ce75e 100644 --- a/ubuntu/aufs/i_op_del.c +++ b/ubuntu/aufs/i_op_del.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/i_op_ren.c b/ubuntu/aufs/i_op_ren.c index e8025baa97f6..f253bec7efa9 100644 --- a/ubuntu/aufs/i_op_ren.c +++ b/ubuntu/aufs/i_op_ren.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/iinfo.c b/ubuntu/aufs/iinfo.c index cd1086eb4595..d9571c47609f 100644 --- a/ubuntu/aufs/iinfo.c +++ b/ubuntu/aufs/iinfo.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/inode.c b/ubuntu/aufs/inode.c index 00ae8877b7d2..df70edee95c3 100644 --- a/ubuntu/aufs/inode.c +++ b/ubuntu/aufs/inode.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/inode.h b/ubuntu/aufs/inode.h index 0c500474c963..1fe58f632534 100644 --- a/ubuntu/aufs/inode.h +++ b/ubuntu/aufs/inode.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,9 +25,7 @@ #ifdef __KERNEL__ -#include #include -#include #include "rwsem.h" struct vfsmount; diff --git a/ubuntu/aufs/ioctl.c b/ubuntu/aufs/ioctl.c index 93151b350ee3..30a49dff8214 100644 --- a/ubuntu/aufs/ioctl.c +++ b/ubuntu/aufs/ioctl.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,7 +22,6 @@ * assist the pathconf(3) wrapper library. */ -#include #include "aufs.h" static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg) diff --git a/ubuntu/aufs/loop.c b/ubuntu/aufs/loop.c index 4998192e6f17..317f0e46de33 100644 --- a/ubuntu/aufs/loop.c +++ b/ubuntu/aufs/loop.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/loop.h b/ubuntu/aufs/loop.h index fc156e804530..b7af6a73cad8 100644 --- a/ubuntu/aufs/loop.h +++ b/ubuntu/aufs/loop.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/module.c b/ubuntu/aufs/module.c index 175f03475776..ef288b867244 100644 --- a/ubuntu/aufs/module.c +++ b/ubuntu/aufs/module.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/module.h b/ubuntu/aufs/module.h index e61fa54768dc..242f4ea3e99c 100644 --- a/ubuntu/aufs/module.h +++ b/ubuntu/aufs/module.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/opts.c b/ubuntu/aufs/opts.c index 4e868a6de6c7..35c18ef307e9 100644 --- a/ubuntu/aufs/opts.c +++ b/ubuntu/aufs/opts.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,8 +20,6 @@ * mount options/flags */ -#include -#include #include #include /* a distribution requires */ #include diff --git a/ubuntu/aufs/opts.h b/ubuntu/aufs/opts.h index 1ef371020d64..958f5119b01e 100644 --- a/ubuntu/aufs/opts.h +++ b/ubuntu/aufs/opts.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,7 +26,6 @@ #ifdef __KERNEL__ #include -#include struct file; struct super_block; diff --git a/ubuntu/aufs/plink.c b/ubuntu/aufs/plink.c index 1824e6df635e..6338461902b6 100644 --- a/ubuntu/aufs/plink.c +++ b/ubuntu/aufs/plink.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/poll.c b/ubuntu/aufs/poll.c index 873f3bb5e6be..59d1a4b285ea 100644 --- a/ubuntu/aufs/poll.c +++ b/ubuntu/aufs/poll.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/procfs.c b/ubuntu/aufs/procfs.c index 25463319bad7..8169782d5e3a 100644 --- a/ubuntu/aufs/procfs.c +++ b/ubuntu/aufs/procfs.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010-2011 Junjiro R. Okajima + * Copyright (C) 2010-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/rdu.c b/ubuntu/aufs/rdu.c index 4e0756dc312b..765650379404 100644 --- a/ubuntu/aufs/rdu.c +++ b/ubuntu/aufs/rdu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -23,8 +23,6 @@ #include #include #include -#include -#include #include "aufs.h" /* bits for struct aufs_rdu.flags */ diff --git a/ubuntu/aufs/rwsem.h b/ubuntu/aufs/rwsem.h index dcd1def9eed8..d6c1b3767308 100644 --- a/ubuntu/aufs/rwsem.h +++ b/ubuntu/aufs/rwsem.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,7 +25,6 @@ #ifdef __KERNEL__ -#include #include "debug.h" struct au_rwsem { diff --git a/ubuntu/aufs/sbinfo.c b/ubuntu/aufs/sbinfo.c index d9141f8cc48c..2448efe08014 100644 --- a/ubuntu/aufs/sbinfo.c +++ b/ubuntu/aufs/sbinfo.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,7 +20,6 @@ * superblock private data */ -#include #include "aufs.h" /* diff --git a/ubuntu/aufs/spl.h b/ubuntu/aufs/spl.h index 37a032907a32..743a3076207c 100644 --- a/ubuntu/aufs/spl.h +++ b/ubuntu/aufs/spl.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,10 +25,6 @@ #ifdef __KERNEL__ -#include -#include -#include - struct au_splhead { spinlock_t spin; struct list_head head; diff --git a/ubuntu/aufs/super.c b/ubuntu/aufs/super.c index 595282e8700c..c798792b74e9 100644 --- a/ubuntu/aufs/super.c +++ b/ubuntu/aufs/super.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,8 +20,7 @@ * mount and super_block operations */ -#include -#include +#include #include #include #include diff --git a/ubuntu/aufs/super.h b/ubuntu/aufs/super.h index f490b4af4c86..9967dee0a50e 100644 --- a/ubuntu/aufs/super.h +++ b/ubuntu/aufs/super.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,7 +26,6 @@ #ifdef __KERNEL__ #include -#include #include "rwsem.h" #include "spl.h" #include "wkq.h" diff --git a/ubuntu/aufs/sysaufs.c b/ubuntu/aufs/sysaufs.c index ab4036e18a67..a675324ca6f2 100644 --- a/ubuntu/aufs/sysaufs.c +++ b/ubuntu/aufs/sysaufs.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,9 +21,7 @@ * they are necessary regardless sysfs is disabled. */ -#include #include -#include #include "aufs.h" unsigned long sysaufs_si_mask; diff --git a/ubuntu/aufs/sysaufs.h b/ubuntu/aufs/sysaufs.h index 5b6b8d0c5a85..5aaff3bb608b 100644 --- a/ubuntu/aufs/sysaufs.h +++ b/ubuntu/aufs/sysaufs.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,7 +26,6 @@ #ifdef __KERNEL__ #include -#include #include "module.h" struct super_block; diff --git a/ubuntu/aufs/sysfs.c b/ubuntu/aufs/sysfs.c index 215cd6e5a2fb..f72519dac361 100644 --- a/ubuntu/aufs/sysfs.c +++ b/ubuntu/aufs/sysfs.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,10 +20,7 @@ * sysfs interface */ -#include -#include #include -#include #include "aufs.h" #ifdef CONFIG_AUFS_FS_MODULE diff --git a/ubuntu/aufs/sysrq.c b/ubuntu/aufs/sysrq.c index 59d3b521b22a..c481d2b6dd58 100644 --- a/ubuntu/aufs/sysrq.c +++ b/ubuntu/aufs/sysrq.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,9 +20,6 @@ * magic sysrq hanlder */ -#include -#include -#include /* #include */ #include #include "aufs.h" diff --git a/ubuntu/aufs/vdir.c b/ubuntu/aufs/vdir.c index 5e4237686650..3f16d5153774 100644 --- a/ubuntu/aufs/vdir.c +++ b/ubuntu/aufs/vdir.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,7 +20,6 @@ * virtual or vertical directory */ -#include #include "aufs.h" static unsigned int calc_size(int nlen) diff --git a/ubuntu/aufs/vfsub.c b/ubuntu/aufs/vfsub.c index 301e34c50100..331ab17fb5c1 100644 --- a/ubuntu/aufs/vfsub.c +++ b/ubuntu/aufs/vfsub.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,12 +20,10 @@ * sub-routines for VFS */ -#include #include #include #include #include -#include #include "aufs.h" int vfsub_update_h_iattr(struct path *h_path, int *did) diff --git a/ubuntu/aufs/vfsub.h b/ubuntu/aufs/vfsub.h index fe0751811bbf..ce4d5b683eb0 100644 --- a/ubuntu/aufs/vfsub.h +++ b/ubuntu/aufs/vfsub.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/wbr_policy.c b/ubuntu/aufs/wbr_policy.c index dec340a4ef0d..2bf2efaa33b4 100644 --- a/ubuntu/aufs/wbr_policy.c +++ b/ubuntu/aufs/wbr_policy.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/whout.c b/ubuntu/aufs/whout.c index de3a9f5bee2c..676486362d54 100644 --- a/ubuntu/aufs/whout.c +++ b/ubuntu/aufs/whout.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,7 +20,6 @@ * whiteout for logical deletion and opaque directory */ -#include #include "aufs.h" #define WH_MASK S_IRUGO diff --git a/ubuntu/aufs/whout.h b/ubuntu/aufs/whout.h index 11137f40049d..f7c45f0998fa 100644 --- a/ubuntu/aufs/whout.h +++ b/ubuntu/aufs/whout.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,7 +25,6 @@ #ifdef __KERNEL__ -#include #include "dir.h" /* whout.c */ diff --git a/ubuntu/aufs/wkq.c b/ubuntu/aufs/wkq.c index d75808a1a7e3..91c739f476f3 100644 --- a/ubuntu/aufs/wkq.c +++ b/ubuntu/aufs/wkq.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/ubuntu/aufs/wkq.h b/ubuntu/aufs/wkq.h index c6352aeeac89..0e44f73c2a06 100644 --- a/ubuntu/aufs/wkq.h +++ b/ubuntu/aufs/wkq.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,10 +26,6 @@ #ifdef __KERNEL__ -#include -#include -#include - struct super_block; /* ---------------------------------------------------------------------- */ diff --git a/ubuntu/aufs/xino.c b/ubuntu/aufs/xino.c index 835fe5ca81f5..2285299293ef 100644 --- a/ubuntu/aufs/xino.c +++ b/ubuntu/aufs/xino.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,9 +20,7 @@ * external inode number translation table and bitmap */ -#include #include -#include #include "aufs.h" /* todo: unnecessary to support mmap_sem since kernel-space? */ diff --git a/ubuntu/include/linux/aufs_type.h b/ubuntu/include/linux/aufs_type.h index 6738afb001f0..2f2fbcdbcce5 100644 --- a/ubuntu/include/linux/aufs_type.h +++ b/ubuntu/include/linux/aufs_type.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2005-2011 Junjiro R. Okajima + * Copyright (C) 2005-2012 Junjiro R. Okajima * * This program, aufs is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,17 +19,27 @@ #ifndef __AUFS_TYPE_H__ #define __AUFS_TYPE_H__ -#include -#include -#include +#define AUFS_NAME "aufs" + #ifdef __KERNEL__ -#include +/* + * define it before including all other headers. + * sched.h may use pr_* macros before defining "current", so define the + * no-current version first, and re-define later. + */ +#define pr_fmt(fmt) AUFS_NAME " %s:%d: " fmt, __func__, __LINE__ +#include +#undef pr_fmt +#define pr_fmt(fmt) AUFS_NAME " %s:%d:%s[%d]: " fmt, \ + __func__, __LINE__, current->comm, current->pid #else #include #include -#endif +#endif /* __KERNEL__ */ + +#include -#define AUFS_VERSION "3.x-rcN-20111205" +#define AUFS_VERSION "3.2-20120109" /* todo? move this to linux-2.6.19/include/magic.h */ #define AUFS_SUPER_MAGIC ('a' << 24 | 'u' << 16 | 'f' << 8 | 's') @@ -58,7 +68,6 @@ typedef int16_t aufs_bindex_t; /* ---------------------------------------------------------------------- */ -#define AUFS_NAME "aufs" #define AUFS_FSTYPE AUFS_NAME #define AUFS_ROOT_INO 2 -- cgit v1.2.3 From 919360866c9e78a1bc0e8b3ca6d0b1a87ad45d91 Mon Sep 17 00:00:00 2001 From: Chase Douglas Date: Fri, 24 Feb 2012 15:05:50 -0800 Subject: UBUNTU: SAUCE: Input: synaptics - add second variant of two-button clickpad This is necessary for clickpad detection of Synaptics trackpads in Dell Mini 10 series of laptops. Signed-off-by: Chase Douglas Acked-by: Seth Forshee Acked-by: Andy Whitcroft Signed-off-by: Tim Gardner --- drivers/input/mouse/synaptics.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h index fd26ccca13d7..816d7bd07d85 100644 --- a/drivers/input/mouse/synaptics.h +++ b/drivers/input/mouse/synaptics.h @@ -80,6 +80,7 @@ */ #define SYN_CAP_CLICKPAD(ex0c) ((ex0c) & 0x100000) /* 1-button ClickPad */ #define SYN_CAP_CLICKPAD2BTN(ex0c) ((ex0c) & 0x000100) /* 2-button ClickPad */ +#define SYN_CAP_CLICKPAD2BTN2(ex0c) ((ex0c) & 0x200000) /* 2-button ClickPad */ #define SYN_CAP_MAX_DIMENSIONS(ex0c) ((ex0c) & 0x020000) #define SYN_CAP_MIN_DIMENSIONS(ex0c) ((ex0c) & 0x002000) #define SYN_CAP_ADV_GESTURE(ex0c) ((ex0c) & 0x080000) -- cgit v1.2.3 From 23a2c333616de450d4875f802ff163acd47e678c Mon Sep 17 00:00:00 2001 From: Chase Douglas Date: Fri, 24 Feb 2012 15:05:51 -0800 Subject: UBUNTU: SAUCE: Input: synapticss - Set buttonpad property for all clickpads Signed-off-by: Chase Douglas Acked-by: Seth Forshee Acked-by: Andy Whitcroft Signed-off-by: Tim Gardner --- drivers/input/mouse/synaptics.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index c703d53be3a0..c6d986963992 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -1223,7 +1223,9 @@ static void set_input_params(struct input_dev *dev, struct synaptics_data *priv) /* Clickpads report only left button */ __clear_bit(BTN_RIGHT, dev->keybit); __clear_bit(BTN_MIDDLE, dev->keybit); - } + } else if (SYN_CAP_CLICKPAD2BTN(priv->ext_cap_0c) || + SYN_CAP_CLICKPAD2BTN2(priv->ext_cap_0c)) + __set_bit(INPUT_PROP_BUTTONPAD, dev->propbit); } static ssize_t synaptics_show_disable_gesture(struct psmouse *psmouse, -- cgit v1.2.3 From d261c57183425db9927f4280fd381fea24d9b8a7 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Fri, 2 Mar 2012 06:06:13 -0700 Subject: UBUNTU: [Config] Enable aufs BugLink: http://bugs.launchpad.net/bugs/943119 https://lists.ubuntu.com/archives/ubuntu-devel/2012-March/034869.html Signed-off-by: Tim Gardner --- ubuntu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/Makefile b/ubuntu/Makefile index 66bbdf69d6f4..54550aa97286 100644 --- a/ubuntu/Makefile +++ b/ubuntu/Makefile @@ -19,7 +19,7 @@ ## ## ## -#obj-$(CONFIG_AUFS_FS) += aufs/ +obj-$(CONFIG_AUFS_FS) += aufs/ ## ## ## -- cgit v1.2.3 From 075ce3c9c1a69a281c8084e0b0496687eb80f0d0 Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Mon, 19 Mar 2012 10:06:18 -0700 Subject: UBUNTU: [Config] Disable AUFS Build failure: ubuntu/aufs/i_op.c:701:8: error: too many arguments to function 'security_path_chmod' Signed-off-by: Leann Ogasawara --- ubuntu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/Makefile b/ubuntu/Makefile index 54550aa97286..66bbdf69d6f4 100644 --- a/ubuntu/Makefile +++ b/ubuntu/Makefile @@ -19,7 +19,7 @@ ## ## ## -obj-$(CONFIG_AUFS_FS) += aufs/ +#obj-$(CONFIG_AUFS_FS) += aufs/ ## ## ## -- cgit v1.2.3 From fc15b3db597e21a2a1feb402ee07539be1b389b9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 30 Jan 2012 08:17:26 -0800 Subject: UBUNTU: SAUCE: AppArmor: Disable Add PR_{GET,SET}_NO_NEW_PRIVS to prevent execve from granting privs With this set, a lot of dangerous operations (chroot, unshare, etc) become a lot less dangerous because there is no possibility of subverting privileged binaries. This patch completely breaks apparmor. Someone who understands (and uses) apparmor should fix it or at least give me a hint. Signed-off-by: Andy Lutomirski Signed-off-by: Kees Cook --- security/apparmor/domain.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index b81ea10a17a3..507dd8583b05 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -360,6 +360,10 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm) if (bprm->cred_prepared) return 0; + /* XXX: no_new_privs is not usable with AppArmor yet */ + if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) + return -EPERM; + cxt = bprm->cred->security; BUG_ON(!cxt); -- cgit v1.2.3 From 3b358582a4fbbd81a56eeab5c6e7d76d99be23e4 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 22 Jul 2010 02:32:02 -0700 Subject: UBUNTU: SAUCE: AppArmor: Add profile introspection file to interface Add the dynamic profiles file to the interace, to allow load policy introspection. Signed-off-by: John Johansen Acked-by: Kees Cook Signed-off-by: Tim Gardner --- security/apparmor/apparmorfs.c | 227 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 16c15ec6f670..89bdc62ddd24 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -182,6 +182,232 @@ const struct file_operations aa_fs_seq_file_ops = { .release = single_release, }; +/** + * __next_namespace - find the next namespace to list + * @root: root namespace to stop search at (NOT NULL) + * @ns: current ns position (NOT NULL) + * + * Find the next namespace from @ns under @root and handle all locking needed + * while switching current namespace. + * + * Returns: next namespace or NULL if at last namespace under @root + * NOTE: will not unlock root->lock + */ +static struct aa_namespace *__next_namespace(struct aa_namespace *root, + struct aa_namespace *ns) +{ + struct aa_namespace *parent; + + /* is next namespace a child */ + if (!list_empty(&ns->sub_ns)) { + struct aa_namespace *next; + next = list_first_entry(&ns->sub_ns, typeof(*ns), base.list); + read_lock(&next->lock); + return next; + } + + /* check if the next ns is a sibling, parent, gp, .. */ + parent = ns->parent; + while (parent) { + read_unlock(&ns->lock); + list_for_each_entry_continue(ns, &parent->sub_ns, base.list) { + read_lock(&ns->lock); + return ns; + } + if (parent == root) + return NULL; + ns = parent; + parent = parent->parent; + } + + return NULL; +} + +/** + * __first_profile - find the first profile in a namespace + * @root: namespace that is root of profiles being displayed (NOT NULL) + * @ns: namespace to start in (NOT NULL) + * + * Returns: unrefcounted profile or NULL if no profile + */ +static struct aa_profile *__first_profile(struct aa_namespace *root, + struct aa_namespace *ns) +{ + for ( ; ns; ns = __next_namespace(root, ns)) { + if (!list_empty(&ns->base.profiles)) + return list_first_entry(&ns->base.profiles, + struct aa_profile, base.list); + } + return NULL; +} + +/** + * __next_profile - step to the next profile in a profile tree + * @profile: current profile in tree (NOT NULL) + * + * Perform a depth first taversal on the profile tree in a namespace + * + * Returns: next profile or NULL if done + * Requires: profile->ns.lock to be held + */ +static struct aa_profile *__next_profile(struct aa_profile *p) +{ + struct aa_profile *parent; + struct aa_namespace *ns = p->ns; + + /* is next profile a child */ + if (!list_empty(&p->base.profiles)) + return list_first_entry(&p->base.profiles, typeof(*p), + base.list); + + /* is next profile a sibling, parent sibling, gp, subling, .. */ + parent = p->parent; + while (parent) { + list_for_each_entry_continue(p, &parent->base.profiles, + base.list) + return p; + p = parent; + parent = parent->parent; + } + + /* is next another profile in the namespace */ + list_for_each_entry_continue(p, &ns->base.profiles, base.list) + return p; + + return NULL; +} + +/** + * next_profile - step to the next profile in where ever it may be + * @root: root namespace (NOT NULL) + * @profile: current profile (NOT NULL) + * + * Returns: next profile or NULL if there isn't one + */ +static struct aa_profile *next_profile(struct aa_namespace *root, + struct aa_profile *profile) +{ + struct aa_profile *next = __next_profile(profile); + if (next) + return next; + + /* finished all profiles in namespace move to next namespace */ + return __first_profile(root, __next_namespace(root, profile->ns)); +} + +/** + * p_start - start a depth first traversal of profile tree + * @f: seq_file to fill + * @pos: current position + * + * Returns: first profile under current namespace or NULL if none found + * + * acquires first ns->lock + */ +static void *p_start(struct seq_file *f, loff_t *pos) + __acquires(root->lock) +{ + struct aa_profile *profile = NULL; + struct aa_namespace *root = aa_current_profile()->ns; + loff_t l = *pos; + f->private = aa_get_namespace(root); + + + /* find the first profile */ + read_lock(&root->lock); + profile = __first_profile(root, root); + + /* skip to position */ + for (; profile && l > 0; l--) + profile = next_profile(root, profile); + + return profile; +} + +/** + * p_next - read the next profile entry + * @f: seq_file to fill + * @p: profile previously returned + * @pos: current position + * + * Returns: next profile after @p or NULL if none + * + * may acquire/release locks in namespace tree as necessary + */ +static void *p_next(struct seq_file *f, void *p, loff_t *pos) +{ + struct aa_profile *profile = p; + struct aa_namespace *root = f->private; + (*pos)++; + + return next_profile(root, profile); +} + +/** + * p_stop - stop depth first traversal + * @f: seq_file we are filling + * @p: the last profile writen + * + * Release all locking done by p_start/p_next on namespace tree + */ +static void p_stop(struct seq_file *f, void *p) + __releases(root->lock) +{ + struct aa_profile *profile = p; + struct aa_namespace *root = f->private, *ns; + + if (profile) { + for (ns = profile->ns; ns && ns != root; ns = ns->parent) + read_unlock(&ns->lock); + } + read_unlock(&root->lock); + aa_put_namespace(root); +} + +/** + * seq_show_profile - show a profile entry + * @f: seq_file to file + * @p: current position (profile) (NOT NULL) + * + * Returns: error on failure + */ +static int seq_show_profile(struct seq_file *f, void *p) +{ + struct aa_profile *profile = (struct aa_profile *)p; + struct aa_namespace *root = f->private; + + if (profile->ns != root) + seq_printf(f, ":%s://", aa_ns_name(root, profile->ns)); + seq_printf(f, "%s (%s)\n", profile->base.hname, + COMPLAIN_MODE(profile) ? "complain" : "enforce"); + + return 0; +} + +static const struct seq_operations aa_fs_profiles_op = { + .start = p_start, + .next = p_next, + .stop = p_stop, + .show = seq_show_profile, +}; + +static int profiles_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &aa_fs_profiles_op); +} + +static int profiles_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} + +const struct file_operations aa_fs_profiles_fops = { + .open = profiles_open, + .read = seq_read, + .llseek = seq_lseek, + .release = profiles_release, +}; + /** Base file system setup **/ static struct aa_fs_entry aa_fs_entry_file[] = { @@ -210,6 +436,7 @@ static struct aa_fs_entry aa_fs_entry_apparmor[] = { AA_FS_FILE_FOPS(".load", 0640, &aa_fs_profile_load), AA_FS_FILE_FOPS(".replace", 0640, &aa_fs_profile_replace), AA_FS_FILE_FOPS(".remove", 0640, &aa_fs_profile_remove), + AA_FS_FILE_FOPS("profiles", 0640, &aa_fs_profiles_fops), AA_FS_DIR("features", aa_fs_entry_features), { } }; -- cgit v1.2.3 From 87ff1d64552fc3e1eb713481938ac4a8f606ea1c Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 22 Mar 2012 10:13:52 -0700 Subject: UBUNTU: SAUCE: Update aufs for build failure caused by apparmor backport Fix build failure in aufs introduced by commit 9cd98c046b57cd1bdbd53c3669f6cdd75edffd61 which has been backported from 3.4 as part of the AppArmor 3.4 backport Signed-off-by: John Johansen Signed-off-by: Tim Gardner --- ubuntu/aufs/i_op.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ubuntu/aufs/i_op.c b/ubuntu/aufs/i_op.c index fb0e646474e7..ec50afd99ef7 100644 --- a/ubuntu/aufs/i_op.c +++ b/ubuntu/aufs/i_op.c @@ -697,8 +697,7 @@ static int aufs_setattr(struct dentry *dentry, struct iattr *ia) a->h_path.mnt = au_sbr_mnt(sb, a->btgt); if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME)) == (ATTR_MODE | ATTR_CTIME)) { - err = security_path_chmod(a->h_path.dentry, a->h_path.mnt, - ia->ia_mode); + err = security_path_chmod(&a->h_path, ia->ia_mode); if (unlikely(err)) goto out_unlock; } else if ((ia->ia_valid & (ATTR_UID | ATTR_GID)) -- cgit v1.2.3 From 70dded051c939a6a519f72f75494a5c2dc1312eb Mon Sep 17 00:00:00 2001 From: Brad Figg Date: Thu, 29 Mar 2012 08:53:11 -0700 Subject: UBUNTU: SAUCE (no-up) Provide a param for allowing the BIOS to handle changing the brightness on AC/battery status changes. BugLink: http://bugs.launchpad.net/bugs/949311 We currently carry a SAUCE patch which lets the OS handle the brightness levels automatically when connecting/disconnecting AC. There are some laptops (MSI Wind) for which this doesn't work. Provide a driver param which allows this behaviour to be overriden. Signed-off-by: Brad Figg Acked-by: Colin King Signed-off-by: Leann Ogasawara --- drivers/acpi/video.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 9006942eacbd..e58b53c86ae6 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -72,6 +72,14 @@ MODULE_LICENSE("GPL"); static bool brightness_switch_enabled = 1; module_param(brightness_switch_enabled, bool, 0644); +/* + * The Default is to let the OS handle brightness autoswitching due to + * AC/battery status changes. On some laptops (MSI Wind) this doesn't + * work so we need a workaround. + */ +static int brightness_autoswitch_via_bios = 0; +module_param(brightness_autoswitch_via_bios, bool, 0644); + /* * By default, we don't allow duplicate ACPI video bus devices * under the same VGA controller @@ -1423,7 +1431,7 @@ static int acpi_video_bus_put_devices(struct acpi_video_bus *video) static int acpi_video_bus_start_devices(struct acpi_video_bus *video) { - return acpi_video_bus_DOS(video, 0, 1); + return acpi_video_bus_DOS(video, 0, !brightness_autoswitch_via_bios); } static int acpi_video_bus_stop_devices(struct acpi_video_bus *video) -- cgit v1.2.3 From f00a708affc82649b423957e476b9b4d4cdb8a43 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 30 Mar 2012 11:13:54 +0100 Subject: UBUNTU: SAUCE: PCI: Allow pcie_aspm=force to work even when FADT indicates it is unsupported Submitted upstream. BugLink: http://bugs.launchpad.net/bugs/962038 Right now using pcie_aspm=force will not enable ASPM if the FADT indicates ASPM is unsupported. However, the semantics of force should probably allow for this, especially as they did before the ASPM disable rework with commit 3c076351c4027a56d5005a39a0b518a4ba393ce2 This patch just skips the clearing of any ASPM setup that the firmware has carried out on this bus if pcie_aspm=force is being used. Signed-off-by: Colin Ian King Acked-by: Stefan Bader Signed-off-by: Tim Gardner --- drivers/pci/pcie/aspm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index b500840a143b..474f22f304e4 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -798,6 +798,9 @@ void pcie_clear_aspm(struct pci_bus *bus) { struct pci_dev *child; + if (aspm_force) + return; + /* * Clear any ASPM setup that the firmware has carried out on this bus */ -- cgit v1.2.3 From 287e5f93e2948d746c1656d1bf652ad5ca840d4c Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Fri, 30 Mar 2012 09:16:21 -0600 Subject: UBUNTU: SAUCE: remove __initdata from vesafb_fix BugLink: http://bugs.launchpad.net/bugs/969309 OK. Then, I think we also want to fix these warnings probably introduced by commit a6021559 "UBUNTU: SAUCE: (no-up) Modularize vesafb". WARNING: drivers/video/vesafb.o(.exit.text+0x42): Section mismatch in reference from the function vesafb_remove() to the (unknown reference) .init.data:(unknown) The function __exit vesafb_remove() references a (unknown reference) __initdata (unknown). This is often seen when error handling in the exit function uses functionality in the init path. The fix is often to remove the __initdata annotation of (unknown) so it may be used outside an init section. WARNING: drivers/video/vesafb.o(.exit.text+0x4a): Section mismatch in reference from the function vesafb_remove() to the variable .init.data:vesafb_fix The function __exit vesafb_remove() references a variable __initdata vesafb_fix. This is often seen when error handling in the exit function uses functionality in the init path. The fix is often to remove the __initdata annotation of vesafb_fix so it may be used outside an init section. Reported-by: Tetsuo Honda Signed-off-by: Tim Gardner --- drivers/video/vesafb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index 9d6995d2039f..eb78a5986c26 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -46,7 +46,7 @@ static struct fb_var_screeninfo vesafb_defined __initdata = { .vmode = FB_VMODE_NONINTERLACED, }; -static struct fb_fix_screeninfo vesafb_fix __initdata = { +static struct fb_fix_screeninfo vesafb_fix = { .id = "VESA VGA", .type = FB_TYPE_PACKED_PIXELS, .accel = FB_ACCEL_NONE, -- cgit v1.2.3 From 10dc9c130950e60995d7ae7a6a0eb2abe53dd24e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 30 Mar 2012 13:19:08 -0700 Subject: UBUNTU: SAUCE: SECCOMP: audit: always report seccomp violations Violations of seccomp filters should always be reported, regardless of audit context. This the minimal change version of what has been proposed upstream: https://lkml.org/lkml/2012/3/23/332 Signed-off-by: Kees Cook Signed-off-by: Leann Ogasawara --- include/linux/audit.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index 22f292a917a3..4a2f0d6acfb4 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -510,8 +510,7 @@ void audit_core_dumps(long signr); static inline void audit_seccomp(unsigned long syscall, long signr, int code) { - if (unlikely(!audit_dummy_context())) - __audit_seccomp(syscall, signr, code); + __audit_seccomp(syscall, signr, code); } static inline void audit_ptrace(struct task_struct *t) -- cgit v1.2.3 From af9c9cd36df6154031a29259e41aeeb4cb84bf06 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 3 Apr 2012 11:42:41 +0100 Subject: UBUNTU: SAUCE: (no-up) elide some ioctl warnings which are known benign BugLink: http://bugs.launchpad.net/bugs/972355 We have been seeing increasing reports of scarey ioctl messages in dmesg, such as the below often in bulk: mdadm: sending ioctl 1261 to a partition! mdadm: sending ioctl 800c0910 to a partition! Looking at the upstream discussions these are all benign and can be safely suppressed. This patch is based on some discussions at the link below, on some work SUSE did in this area. This is not suitable for upstreaming as we need some refactoring to fix the 32bit compat ioctl mess. Link: http://www.spinics.net/lists/raid/msg37770.html Signed-off-by: Andy Whitcroft Signed-off-by: Tim Gardner --- block/scsi_ioctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 260fa80ef575..becbc6f8619a 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -27,6 +27,9 @@ #include #include #include +#include +#include +#include #include #include @@ -710,8 +713,17 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) case SG_GET_RESERVED_SIZE: case SG_SET_RESERVED_SIZE: case SG_EMULATED_HOST: + case BLKFLSBUF: + case BLKROSET: return 0; case CDROM_GET_CAPABILITY: + case CDROM_DRIVE_STATUS: + case FDGETPRM: + case RAID_VERSION: + case MTIOCGET: +#ifdef CONFIG_COMPAT + case 0x801c6d02: /* MTIOCGET32 */ +#endif /* Keep this until we remove the printk below. udev sends it * and we do not want to spam dmesg about it. CD-ROMs do * not have partitions, so we get here only for disks. -- cgit v1.2.3 From 36f843fe54f0722955c0ffb5ce65ab785dbfa959 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 6 Apr 2012 18:52:41 +0100 Subject: UBUNTU: tools/hv: add basic Makefile BugLink: http://bugs.launchpad.net/bugs/977246 Signed-off-by: Andy Whitcroft Acked-by: Leann Ogasawara Acked-by: Brad Figg Signed-off-by: Tim Gardner --- tools/hv/Makefile | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tools/hv/Makefile diff --git a/tools/hv/Makefile b/tools/hv/Makefile new file mode 100644 index 000000000000..ef87f85849dc --- /dev/null +++ b/tools/hv/Makefile @@ -0,0 +1,7 @@ +hv_kvp_daemon: hv_kvp_daemon.c + +clean: + rm -f hv_kvp_daemon + +install: + install hv_kvp_daemon /usr/sbin/ -- cgit v1.2.3 From 9ae47e88b404184a2af4034c92f776fc87524c73 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 6 Apr 2012 18:52:42 +0100 Subject: UBUNTU: tools/hv: add basic manual pages BugLink: http://bugs.launchpad.net/bugs/977246 Signed-off-by: Andy Whitcroft Acked-by: Leann Ogasawara Acked-by: Brad Figg Signed-off-by: Tim Gardner --- tools/hv/hv_kvp_daemon.8 | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tools/hv/hv_kvp_daemon.8 diff --git a/tools/hv/hv_kvp_daemon.8 b/tools/hv/hv_kvp_daemon.8 new file mode 100644 index 000000000000..0fb4577a1f07 --- /dev/null +++ b/tools/hv/hv_kvp_daemon.8 @@ -0,0 +1,26 @@ +.\" This page Copyright (C) 2012 Andy Whitcroft +.\" Distributed under the GPL v2 or later. +.TH HV_KVP_DAEMON 8 +.SH NAME +hv_kvp_daemon \- Hyper-V Key Value Pair daemon +.SH SYNOPSIS +.ft B +.B hv_kvp_daemon +.br +.SH DESCRIPTION +\fBhv_kvp_daemon\fP +is the userspace component of the Hyper-V key value pair functionality, +communicating via a netlink socket with the kernel HV-KVP driver. +This pairing allows the Hyper-V host to pass configuration information +(such as IP addresses) to the guest and allows the host to obtain guest +version information. + +.SH FILES +.ta +.nf +/var/opt/hyperv/.kvp_pool_* +.fi + +.SH AUTHORS +.nf +Written by K. Y. Srinivasan -- cgit v1.2.3 From 4cb23ca36970435a491e38a058ccbc39ae965df4 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Tue, 24 Apr 2012 11:47:18 -0600 Subject: UBUNTU: SAUCE: Allow filtering of cpufreq drivers BugLink: http://bugs.launchpad.net/bugs/984288 Acked-by: Stefan Bader Acked-by: Herton Krzesinski Signed-off-by: Tim Gardner --- Documentation/kernel-parameters.txt | 4 ++++ drivers/cpufreq/cpufreq.c | 22 +++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index d79efa337357..ef9af9a052c0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -588,6 +588,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. /proc//coredump_filter. See also Documentation/filesystems/proc.txt. + cpufreq_driver= [X86] Allow only the named cpu frequency scaling driver + to register. Example: cpufreq_driver=powernow-k8 + Format: { none | STRING } + cpuidle.off=1 [CPU_IDLE] disable the cpuidle sub-system diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7f2f149ae40f..222966f34432 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1817,6 +1817,20 @@ static struct notifier_block __refdata cpufreq_cpu_notifier = { * REGISTER / UNREGISTER CPUFREQ DRIVER * *********************************************************************/ +static char cpufreq_driver_name[CPUFREQ_NAME_LEN]; + +static int __init cpufreq_driver_setup(char *str) +{ + strlcpy(cpufreq_driver_name, str, CPUFREQ_NAME_LEN); + return 1; +} + +/* + * Set this name to only allow one specific cpu freq driver, e.g., + * cpufreq_driver=powernow-k8 + */ +__setup("cpufreq_driver=", cpufreq_driver_setup); + /** * cpufreq_register_driver - register a CPU Frequency driver * @driver_data: A struct cpufreq_driver containing the values# @@ -1839,7 +1853,13 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) ((!driver_data->setpolicy) && (!driver_data->target))) return -EINVAL; - pr_debug("trying to register driver %s\n", driver_data->name); + pr_debug("trying to register driver %s, cpufreq_driver=%s\n", + driver_data->name, cpufreq_driver_name); + + if (cpufreq_driver_name[0]) + if (!driver_data->name || + strcmp(cpufreq_driver_name, driver_data->name)) + return -EINVAL; if (driver_data->setpolicy) driver_data->flags |= CPUFREQ_CONST_LOOPS; -- cgit v1.2.3 From 074b4612723963d847d52145b83a60547a3a7024 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 3 May 2012 16:01:39 +0100 Subject: kconfig: in debug mode some 0 length message prints occur When we enable the zconfdump() debugging we see assertion failures attempting to print the config. Convert this into a noop. Signed-off-by: Andy Whitcroft --- scripts/kconfig/lkc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/kconfig/lkc.h b/scripts/kconfig/lkc.h index c18f2bd9c095..608cf6558bdb 100644 --- a/scripts/kconfig/lkc.h +++ b/scripts/kconfig/lkc.h @@ -90,7 +90,9 @@ struct conf_printer { /* confdata.c and expr.c */ static inline void xfwrite(const void *str, size_t len, size_t count, FILE *out) { - assert(len != 0); + //assert(len != 0); + if (len == 0) + return; if (fwrite(str, len, count, out) != count) fprintf(stderr, "Error in writing or end of file.\n"); -- cgit v1.2.3 From 121d2320e39fb1ab4391288e027f4f986e270732 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 May 2012 13:06:13 +0200 Subject: UBUNTU: ubuntu: overlayfs -- vfs: pass struct path to __dentry_open() Make __dentry_open() take a struct path instead of separate vfsmount and dentry arguments. Change semantics as well, so that __dentry_open() acquires a reference to path instead of transferring it to the open file. Signed-off-by: Miklos Szeredi --- fs/open.c | 49 ++++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/fs/open.c b/fs/open.c index 1637b209fa53..b7983d520ab0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -671,24 +671,24 @@ int open_check_o_direct(struct file *f) return 0; } -static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt, - struct file *f, - int (*open)(struct inode *, struct file *), - const struct cred *cred) +static struct file *do_dentry_open(struct path *path, struct file *f, + int (*open)(struct inode *, struct file *), + const struct cred *cred) { static const struct file_operations empty_fops = {}; struct inode *inode; int error; + path_get(path); f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (unlikely(f->f_flags & O_PATH)) f->f_mode = FMODE_PATH; - inode = dentry->d_inode; + inode = path->dentry->d_inode; if (f->f_mode & FMODE_WRITE) { - error = __get_file_write_access(inode, mnt); + error = __get_file_write_access(inode, path->mnt); if (error) goto cleanup_file; if (!special_file(inode->i_mode)) @@ -696,8 +696,7 @@ static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt, } f->f_mapping = inode->i_mapping; - f->f_path.dentry = dentry; - f->f_path.mnt = mnt; + f->f_path = *path; f->f_pos = 0; file_sb_list_add(f, inode->i_sb); @@ -744,24 +743,22 @@ cleanup_all: * here, so just reset the state. */ file_reset_write(f); - mnt_drop_write(mnt); + mnt_drop_write(path->mnt); } } file_sb_list_del(f); f->f_path.dentry = NULL; f->f_path.mnt = NULL; cleanup_file: - dput(dentry); - mntput(mnt); + path_put(path); return ERR_PTR(error); } -static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, - struct file *f, +static struct file *__dentry_open(struct path *path, struct file *f, int (*open)(struct inode *, struct file *), const struct cred *cred) { - struct file *res = do_dentry_open(dentry, mnt, f, open, cred); + struct file *res = do_dentry_open(path, f, open, cred); if (!IS_ERR(res)) { int error = open_check_o_direct(f); if (error) { @@ -796,14 +793,14 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, int (*open)(struct inode *, struct file *)) { + struct path path = { .dentry = dentry, .mnt = nd->path.mnt }; const struct cred *cred = current_cred(); if (IS_ERR(nd->intent.open.file)) goto out; if (IS_ERR(dentry)) goto out_err; - nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), - nd->intent.open.file, + nd->intent.open.file = __dentry_open(&path, nd->intent.open.file, open, cred); out: return nd->intent.open.file; @@ -835,9 +832,7 @@ struct file *nameidata_to_filp(struct nameidata *nd) } else { struct file *res; - path_get(&nd->path); - res = do_dentry_open(nd->path.dentry, nd->path.mnt, - filp, NULL, cred); + res = do_dentry_open(&nd->path, filp, NULL, cred); if (!IS_ERR(res)) { int error; @@ -864,24 +859,24 @@ struct file *nameidata_to_filp(struct nameidata *nd) struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, const struct cred *cred) { - int error; struct file *f; + struct file *ret; + struct path path = { .dentry = dentry, .mnt = mnt }; validate_creds(cred); /* We must always pass in a valid mount pointer. */ BUG_ON(!mnt); - error = -ENFILE; + ret = ERR_PTR(-ENFILE); f = get_empty_filp(); - if (f == NULL) { - dput(dentry); - mntput(mnt); - return ERR_PTR(error); + if (f != NULL) { + f->f_flags = flags; + ret = __dentry_open(&path, f, NULL, cred); } + path_put(&path); - f->f_flags = flags; - return __dentry_open(dentry, mnt, f, NULL, cred); + return ret; } EXPORT_SYMBOL(dentry_open); -- cgit v1.2.3 From 258bd2c2d70aa88da1b8400658bd8161054e7dbc Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 May 2012 13:06:13 +0200 Subject: UBUNTU: ubuntu: overlayfs -- vfs: add i_op->open() Add a new inode operation i_op->open(). This is for stacked filesystems that want to return a struct file from a different filesystem. Signed-off-by: Miklos Szeredi --- Documentation/filesystems/Locking | 3 ++- Documentation/filesystems/vfs.txt | 8 ++++++++ fs/open.c | 25 +++++++++++++++++++++++-- include/linux/fs.h | 3 +++ 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 8e2da1e06e3b..d88588112b7e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -62,6 +62,7 @@ ata *); int (*removexattr) (struct dentry *, const char *); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); void (*update_time)(struct inode *, struct timespec *, int); + struct file *(*open)(struct dentry *,struct file *,const struct cred *); locking rules: all may block @@ -89,7 +90,7 @@ listxattr: no removexattr: yes fiemap: no update_time: no - +open: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index efd23f481704..f586bc616391 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -364,6 +364,8 @@ struct inode_operations { ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); void (*update_time)(struct inode *, struct timespec *, int); + struct file *(*open) (struct dentry *, struct file *, + const struct cred *); }; Again, all methods are called without any locks being held, unless @@ -476,6 +478,12 @@ otherwise noted. an inode. If this is not defined the VFS will update the inode itself and call mark_inode_dirty_sync. + open: this is an alternative to f_op->open(), the difference is that this + method may return any open file, not necessarily originating from the + same filesystem as the one i_op->open() was called on. It may be useful + for stacking filesystems which want to allow native I/O directly on + underlying files. + The Address Space Object ======================== diff --git a/fs/open.c b/fs/open.c index b7983d520ab0..21078e846219 100644 --- a/fs/open.c +++ b/fs/open.c @@ -832,7 +832,7 @@ struct file *nameidata_to_filp(struct nameidata *nd) } else { struct file *res; - res = do_dentry_open(&nd->path, filp, NULL, cred); + res = vfs_open(&nd->path, filp, cred); if (!IS_ERR(res)) { int error; @@ -872,7 +872,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, f = get_empty_filp(); if (f != NULL) { f->f_flags = flags; - ret = __dentry_open(&path, f, NULL, cred); + ret = vfs_open(&path, f, cred); } path_put(&path); @@ -880,6 +880,27 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * vfs_open - open the file at the given path + * @path: path to open + * @filp: newly allocated file with f_flag initialized + * @cred: credentials to use + * + * Open the file. If successful, the returned file will have acquired + * an additional reference for path. + */ +struct file *vfs_open(struct path *path, struct file *filp, + const struct cred *cred) +{ + struct inode *inode = path->dentry->d_inode; + + if (inode->i_op->open) + return inode->i_op->open(path->dentry, filp, cred); + else + return __dentry_open(path, filp, NULL, cred); +} +EXPORT_SYMBOL(vfs_open); + static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); diff --git a/include/linux/fs.h b/include/linux/fs.h index 17fd887c798f..05d046f75e90 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1693,6 +1693,8 @@ struct inode_operations { int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, struct timespec *, int); + struct file *(*open) (struct dentry *, struct file *, + const struct cred *); } ____cacheline_aligned; struct seq_file; @@ -2057,6 +2059,7 @@ extern long do_sys_open(int dfd, const char __user *filename, int flags, extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int); +extern struct file *vfs_open(struct path *, struct file *, const struct cred *); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); -- cgit v1.2.3 From 628ac0930b07e4df73e08f49d1315d225b7071de Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 May 2012 13:06:13 +0200 Subject: UBUNTU: ubuntu: overlayfs -- vfs: export do_splice_direct() to modules Export do_splice_direct() to modules. Needed by overlay filesystem. Signed-off-by: Miklos Szeredi --- fs/splice.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/splice.c b/fs/splice.c index 701d34a9c8eb..ce525f34e1d6 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1303,6 +1303,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +EXPORT_SYMBOL(do_splice_direct); static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, -- cgit v1.2.3 From bf014f424140f54b5a1b7fb18bfcd1536ae6903c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 May 2012 13:06:13 +0200 Subject: UBUNTU: ubuntu: overlayfs -- vfs: introduce clone_private_mount() Overlayfs needs a private clone of the mount, so create a function for this and export to modules. Signed-off-by: Miklos Szeredi --- fs/namespace.c | 18 ++++++++++++++++++ include/linux/mount.h | 3 +++ 2 files changed, 21 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 7db5f2b15274..6731fc258b3f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1327,6 +1327,24 @@ void drop_collected_mounts(struct vfsmount *mnt) release_mounts(&umount_list); } +struct vfsmount *clone_private_mount(struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + if (IS_MNT_UNBINDABLE(old_mnt)) + return ERR_PTR(-EINVAL); + + down_read(&namespace_sem); + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (!new_mnt) + return ERR_PTR(-ENOMEM); + + return &new_mnt->mnt; +} +EXPORT_SYMBOL_GPL(clone_private_mount); + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { diff --git a/include/linux/mount.h b/include/linux/mount.h index d7029f4a191a..344a2623eb2a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -66,6 +66,9 @@ extern void mnt_pin(struct vfsmount *mnt); extern void mnt_unpin(struct vfsmount *mnt); extern int __mnt_is_readonly(struct vfsmount *mnt); +struct path; +extern struct vfsmount *clone_private_mount(struct path *path); + struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, -- cgit v1.2.3 From ac58305ecc08b7d28d1a9d93f5f367253dcbea45 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 May 2012 13:06:13 +0200 Subject: UBUNTU: ubuntu: overlayfs -- overlay filesystem Overlayfs allows one, usually read-write, directory tree to be overlaid onto another, read-only directory tree. All modifications go to the upper, writable layer. This type of mechanism is most often used for live CDs but there's a wide variety of other uses. The implementation differs from other "union filesystem" implementations in that after a file is opened all operations go directly to the underlying, lower or upper, filesystems. This simplifies the implementation and allows native performance in these cases. The dentry tree is duplicated from the underlying filesystems, this enables fast cached lookups without adding special support into the VFS. This uses slightly more memory than union mounts, but dentries are relatively small. Currently inodes are duplicated as well, but it is a possible optimization to share inodes for non-directories. Opening non directories results in the open forwarded to the underlying filesystem. This makes the behavior very similar to union mounts (with the same limitations vs. fchmod/fchown on O_RDONLY file descriptors). Usage: mount -t overlay -olowerdir=/lower,upperdir=/upper overlay /mnt Supported: - all operations Missing: - Currently a crash in the middle of copy-up, rename, unlink, rmdir or create over a whiteout may result in filesystem corruption on the overlay level. IOW these operations need to become atomic or at least the corruption needs to be detected. The following cotributions have been folded into this patch: Neil Brown : - minimal remount support - use correct seek function for directories - initialise is_real before use - rename ovl_fill_cache to ovl_dir_read Felix Fietkau : - fix a deadlock in ovl_dir_read_merged - fix a deadlock in ovl_remove_whiteouts Erez Zadok - fix cleanup after WARN_ON Sedat Dilek - fix up permission to confirm to new API Also thanks to the following people for testing and reporting bugs: Jordi Pujol Andy Whitcroft Michal Suchanek Felix Fietkau Erez Zadok Randy Dunlap Signed-off-by: Miklos Szeredi --- fs/Kconfig | 1 + fs/Makefile | 1 + fs/overlayfs/Kconfig | 4 + fs/overlayfs/Makefile | 7 + fs/overlayfs/copy_up.c | 385 +++++++++++++++++++++++++++++ fs/overlayfs/dir.c | 597 +++++++++++++++++++++++++++++++++++++++++++++ fs/overlayfs/inode.c | 384 +++++++++++++++++++++++++++++ fs/overlayfs/overlayfs.h | 64 +++++ fs/overlayfs/readdir.c | 566 +++++++++++++++++++++++++++++++++++++++++++ fs/overlayfs/super.c | 611 +++++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 2620 insertions(+) create mode 100644 fs/overlayfs/Kconfig create mode 100644 fs/overlayfs/Makefile create mode 100644 fs/overlayfs/copy_up.c create mode 100644 fs/overlayfs/dir.c create mode 100644 fs/overlayfs/inode.c create mode 100644 fs/overlayfs/overlayfs.h create mode 100644 fs/overlayfs/readdir.c create mode 100644 fs/overlayfs/super.c diff --git a/fs/Kconfig b/fs/Kconfig index 1dd49481854d..390a2852c35c 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -67,6 +67,7 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" +source "fs/overlayfs/Kconfig" config CUSE tristate "Character device in Userspace support" diff --git a/fs/Makefile b/fs/Makefile index 95cf9de6ae02..73cc7c116b3c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -106,6 +106,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig new file mode 100644 index 000000000000..c4517da01fa9 --- /dev/null +++ b/fs/overlayfs/Kconfig @@ -0,0 +1,4 @@ +config OVERLAYFS_FS + tristate "Overlay filesystem support" + help + Add support for overlay filesystem. diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile new file mode 100644 index 000000000000..8f91889480d0 --- /dev/null +++ b/fs/overlayfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the overlay filesystem. +# + +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o + +overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c new file mode 100644 index 000000000000..87dbeee0a14e --- /dev/null +++ b/fs/overlayfs/copy_up.c @@ -0,0 +1,385 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) + +static int ovl_copy_up_xattr(struct dentry *old, struct dentry *new) +{ + ssize_t list_size, size; + char *buf, *name, *value; + int error; + + if (!old->d_inode->i_op->getxattr || + !new->d_inode->i_op->getxattr) + return 0; + + list_size = vfs_listxattr(old, NULL, 0); + if (list_size <= 0) { + if (list_size == -EOPNOTSUPP) + return 0; + return list_size; + } + + buf = kzalloc(list_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + error = -ENOMEM; + value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL); + if (!value) + goto out; + + list_size = vfs_listxattr(old, buf, list_size); + if (list_size <= 0) { + error = list_size; + goto out_free_value; + } + + for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX); + if (size <= 0) { + error = size; + goto out_free_value; + } + error = vfs_setxattr(new, name, value, size, 0); + if (error) + goto out_free_value; + } + +out_free_value: + kfree(value); +out: + kfree(buf); + return error; +} + +static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +{ + struct file *old_file; + struct file *new_file; + int error = 0; + + if (len == 0) + return 0; + + old_file = ovl_path_open(old, O_RDONLY); + if (IS_ERR(old_file)) + return PTR_ERR(old_file); + + new_file = ovl_path_open(new, O_WRONLY); + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out_fput; + } + + /* FIXME: copy up sparse files efficiently */ + while (len) { + loff_t offset = new_file->f_pos; + size_t this_len = OVL_COPY_UP_CHUNK_SIZE; + long bytes; + + if (len < this_len) + this_len = len; + + if (signal_pending_state(TASK_KILLABLE, current)) { + error = -EINTR; + break; + } + + bytes = do_splice_direct(old_file, &offset, new_file, this_len, + SPLICE_F_MOVE); + if (bytes <= 0) { + error = bytes; + break; + } + + len -= bytes; + } + + fput(new_file); +out_fput: + fput(old_file); + return error; +} + +static char *ovl_read_symlink(struct dentry *realdentry) +{ + int res; + char *buf; + struct inode *inode = realdentry->d_inode; + mm_segment_t old_fs; + + res = -EINVAL; + if (!inode->i_op->readlink) + goto err; + + res = -ENOMEM; + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + goto err; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = inode->i_op->readlink(realdentry, + (char __user *)buf, PAGE_SIZE - 1); + set_fs(old_fs); + if (res < 0) { + free_page((unsigned long) buf); + goto err; + } + buf[res] = '\0'; + + return buf; + +err: + return ERR_PTR(res); +} + +static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + .ia_atime = stat->atime, + .ia_mtime = stat->mtime, + }; + + return notify_change(upperdentry, &attr); +} + +static int ovl_set_mode(struct dentry *upperdentry, umode_t mode) +{ + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = mode, + }; + + return notify_change(upperdentry, &attr); +} + +static int ovl_copy_up_locked(struct dentry *upperdir, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + const char *link) +{ + int err; + struct path newpath; + umode_t mode = stat->mode; + + /* Can't properly set mode on creation because of the umask */ + stat->mode &= S_IFMT; + + ovl_path_upper(dentry, &newpath); + WARN_ON(newpath.dentry); + newpath.dentry = ovl_upper_create(upperdir, dentry, stat, link); + if (IS_ERR(newpath.dentry)) + return PTR_ERR(newpath.dentry); + + if (S_ISREG(stat->mode)) { + err = ovl_copy_up_data(lowerpath, &newpath, stat->size); + if (err) + goto err_remove; + } + + err = ovl_copy_up_xattr(lowerpath->dentry, newpath.dentry); + if (err) + goto err_remove; + + mutex_lock(&newpath.dentry->d_inode->i_mutex); + if (!S_ISLNK(stat->mode)) + err = ovl_set_mode(newpath.dentry, mode); + if (!err) + err = ovl_set_timestamps(newpath.dentry, stat); + mutex_unlock(&newpath.dentry->d_inode->i_mutex); + if (err) + goto err_remove; + + ovl_dentry_update(dentry, newpath.dentry); + + /* + * Easiest way to get rid of the lower dentry reference is to + * drop this dentry. This is neither needed nor possible for + * directories. + */ + if (!S_ISDIR(stat->mode)) + d_drop(dentry); + + return 0; + +err_remove: + if (S_ISDIR(stat->mode)) + vfs_rmdir(upperdir->d_inode, newpath.dentry); + else + vfs_unlink(upperdir->d_inode, newpath.dentry); + + dput(newpath.dentry); + + return err; +} + +/* + * Copy up a single dentry + * + * Directory renames only allowed on "pure upper" (already created on + * upper filesystem, never copied up). Directories which are on lower or + * are merged may not be renamed. For these -EXDEV is returned and + * userspace has to deal with it. This means, when copying up a + * directory we can rely on it and ancestors being stable. + * + * Non-directory renames start with copy up of source if necessary. The + * actual rename will only proceed once the copy up was successful. Copy + * up uses upper parent i_mutex for exclusion. Since rename can change + * d_parent it is possible that the copy up will lock the old parent. At + * that point the file will have already been copied up anyway. + */ +static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat) +{ + int err; + struct kstat pstat; + struct path parentpath; + struct dentry *upperdir; + const struct cred *old_cred; + struct cred *override_cred; + char *link = NULL; + + ovl_path_upper(parent, &parentpath); + upperdir = parentpath.dentry; + + err = vfs_getattr(parentpath.mnt, parentpath.dentry, &pstat); + if (err) + return err; + + if (S_ISLNK(stat->mode)) { + link = ovl_read_symlink(lowerpath->dentry); + if (IS_ERR(link)) + return PTR_ERR(link); + } + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_free_link; + + override_cred->fsuid = stat->uid; + override_cred->fsgid = stat->gid; + /* + * CAP_SYS_ADMIN for copying up extended attributes + * CAP_DAC_OVERRIDE for create + * CAP_FOWNER for chmod, timestamp update + * CAP_FSETID for chmod + * CAP_MKNOD for mknod + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_MKNOD); + old_cred = override_creds(override_cred); + + mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); + if (ovl_path_type(dentry) != OVL_PATH_LOWER) { + err = 0; + } else { + err = ovl_copy_up_locked(upperdir, dentry, lowerpath, + stat, link); + if (!err) { + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(upperdir, &pstat); + } + } + + mutex_unlock(&upperdir->d_inode->i_mutex); + + revert_creds(old_cred); + put_cred(override_cred); + +out_free_link: + if (link) + free_page((unsigned long) link); + + return err; +} + +int ovl_copy_up(struct dentry *dentry) +{ + int err; + + err = 0; + while (!err) { + struct dentry *next; + struct dentry *parent; + struct path lowerpath; + struct kstat stat; + enum ovl_path_type type = ovl_path_type(dentry); + + if (type != OVL_PATH_LOWER) + break; + + next = dget(dentry); + /* find the topmost dentry not yet copied up */ + for (;;) { + parent = dget_parent(next); + + type = ovl_path_type(parent); + if (type != OVL_PATH_LOWER) + break; + + dput(next); + next = parent; + } + + ovl_path_lower(next, &lowerpath); + err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat); + if (!err) + err = ovl_copy_up_one(parent, next, &lowerpath, &stat); + + dput(parent); + dput(next); + } + + return err; +} + +/* Optimize by not copying up the file first and truncating later */ +int ovl_copy_up_truncate(struct dentry *dentry, loff_t size) +{ + int err; + struct kstat stat; + struct path lowerpath; + struct dentry *parent = dget_parent(dentry); + + err = ovl_copy_up(parent); + if (err) + goto out_dput_parent; + + ovl_path_lower(dentry, &lowerpath); + err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat); + if (err) + goto out_dput_parent; + + if (size < stat.size) + stat.size = size; + + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + +out_dput_parent: + dput(parent); + return err; +} diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c new file mode 100644 index 000000000000..00aa6d99cb66 --- /dev/null +++ b/fs/overlayfs/dir.c @@ -0,0 +1,597 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include "overlayfs.h" + +static const char *ovl_whiteout_symlink = "(overlay-whiteout)"; + +static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry) +{ + int err; + struct dentry *newdentry; + const struct cred *old_cred; + struct cred *override_cred; + + /* FIXME: recheck lower dentry to see if whiteout is really needed */ + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out; + + /* + * CAP_SYS_ADMIN for setxattr + * CAP_DAC_OVERRIDE for symlink creation + * CAP_FOWNER for unlink in sticky directory + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + override_cred->fsuid = 0; + override_cred->fsgid = 0; + old_cred = override_creds(override_cred); + + newdentry = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_put_cred; + + /* Just been removed within the same locked region */ + WARN_ON(newdentry->d_inode); + + err = vfs_symlink(upperdir->d_inode, newdentry, ovl_whiteout_symlink); + if (err) + goto out_dput; + + ovl_dentry_version_inc(dentry->d_parent); + + err = vfs_setxattr(newdentry, ovl_whiteout_xattr, "y", 1, 0); + if (err) + vfs_unlink(upperdir->d_inode, newdentry); + +out_dput: + dput(newdentry); +out_put_cred: + revert_creds(old_cred); + put_cred(override_cred); +out: + if (err) { + /* + * There's no way to recover from failure to whiteout. + * What should we do? Log a big fat error and... ? + */ + printk(KERN_ERR "overlayfs: ERROR - failed to whiteout '%s'\n", + dentry->d_name.name); + } + + return err; +} + +static struct dentry *ovl_lookup_create(struct dentry *upperdir, + struct dentry *template) +{ + int err; + struct dentry *newdentry; + struct qstr *name = &template->d_name; + + newdentry = lookup_one_len(name->name, upperdir, name->len); + if (IS_ERR(newdentry)) + return newdentry; + + if (newdentry->d_inode) { + const struct cred *old_cred; + struct cred *override_cred; + + /* No need to check whiteout if lower parent is non-existent */ + err = -EEXIST; + if (!ovl_dentry_lower(template->d_parent)) + goto out_dput; + + if (!S_ISLNK(newdentry->d_inode->i_mode)) + goto out_dput; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_dput; + + /* + * CAP_SYS_ADMIN for getxattr + * CAP_FOWNER for unlink in sticky directory + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + old_cred = override_creds(override_cred); + + err = -EEXIST; + if (ovl_is_whiteout(newdentry)) + err = vfs_unlink(upperdir->d_inode, newdentry); + + revert_creds(old_cred); + put_cred(override_cred); + if (err) + goto out_dput; + + dput(newdentry); + newdentry = lookup_one_len(name->name, upperdir, name->len); + if (IS_ERR(newdentry)) { + ovl_whiteout(upperdir, template); + return newdentry; + } + + /* + * Whiteout just been successfully removed, parent + * i_mutex is still held, there's no way the lookup + * could return positive. + */ + WARN_ON(newdentry->d_inode); + } + + return newdentry; + +out_dput: + dput(newdentry); + return ERR_PTR(err); +} + +struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, + struct kstat *stat, const char *link) +{ + int err; + struct dentry *newdentry; + struct inode *dir = upperdir->d_inode; + + newdentry = ovl_lookup_create(upperdir, dentry); + if (IS_ERR(newdentry)) + goto out; + + switch (stat->mode & S_IFMT) { + case S_IFREG: + err = vfs_create(dir, newdentry, stat->mode, NULL); + break; + + case S_IFDIR: + err = vfs_mkdir(dir, newdentry, stat->mode); + break; + + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = vfs_mknod(dir, newdentry, stat->mode, stat->rdev); + break; + + case S_IFLNK: + err = vfs_symlink(dir, newdentry, link); + break; + + default: + err = -EPERM; + } + if (err) { + if (ovl_dentry_is_opaque(dentry)) + ovl_whiteout(upperdir, dentry); + dput(newdentry); + newdentry = ERR_PTR(err); + } else if (WARN_ON(!newdentry->d_inode)) { + /* + * Not quite sure if non-instantiated dentry is legal or not. + * VFS doesn't seem to care so check and warn here. + */ + dput(newdentry); + newdentry = ERR_PTR(-ENOENT); + } + +out: + return newdentry; + +} + +static int ovl_set_opaque(struct dentry *upperdentry) +{ + int err; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + /* CAP_SYS_ADMIN for setxattr of "trusted" namespace */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + old_cred = override_creds(override_cred); + err = vfs_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); + revert_creds(old_cred); + put_cred(override_cred); + + return err; +} + +static int ovl_remove_opaque(struct dentry *upperdentry) +{ + int err; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + /* CAP_SYS_ADMIN for removexattr of "trusted" namespace */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + old_cred = override_creds(override_cred); + err = vfs_removexattr(upperdentry, ovl_opaque_xattr); + revert_creds(old_cred); + put_cred(override_cred); + + return err; +} + +static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + enum ovl_path_type type; + struct path realpath; + + type = ovl_path_real(dentry, &realpath); + err = vfs_getattr(realpath.mnt, realpath.dentry, stat); + if (err) + return err; + + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (type == OVL_PATH_MERGE) + stat->nlink = 1; + + return 0; +} + +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, + const char *link) +{ + int err; + struct dentry *newdentry; + struct dentry *upperdir; + struct inode *inode; + struct kstat stat = { + .mode = mode, + .rdev = rdev, + }; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata); + if (!inode) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_iput; + + upperdir = ovl_dentry_upper(dentry->d_parent); + mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); + + newdentry = ovl_upper_create(upperdir, dentry, &stat, link); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + ovl_dentry_version_inc(dentry->d_parent); + if (ovl_dentry_is_opaque(dentry) && S_ISDIR(mode)) { + err = ovl_set_opaque(newdentry); + if (err) { + vfs_rmdir(upperdir->d_inode, newdentry); + ovl_whiteout(upperdir, dentry); + goto out_dput; + } + } + ovl_dentry_update(dentry, newdentry); + d_instantiate(dentry, inode); + inode = NULL; + newdentry = NULL; + err = 0; + +out_dput: + dput(newdentry); +out_unlock: + mutex_unlock(&upperdir->d_inode->i_mutex); +out_iput: + iput(inode); +out: + return err; +} + +static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, + struct nameidata *nd) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); +} + +static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); +} + +static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + return ovl_create_object(dentry, mode, rdev, NULL); +} + +static int ovl_symlink(struct inode *dir, struct dentry *dentry, + const char *link) +{ + return ovl_create_object(dentry, S_IFLNK, 0, link); +} + +static int ovl_do_remove(struct dentry *dentry, bool is_dir) +{ + int err; + enum ovl_path_type type; + struct path realpath; + struct dentry *upperdir; + + err = ovl_copy_up(dentry->d_parent); + if (err) + return err; + + upperdir = ovl_dentry_upper(dentry->d_parent); + mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); + type = ovl_path_real(dentry, &realpath); + if (type != OVL_PATH_LOWER) { + err = -ESTALE; + if (realpath.dentry->d_parent != upperdir) + goto out_d_drop; + + /* FIXME: create whiteout up front and rename to target */ + + if (is_dir) + err = vfs_rmdir(upperdir->d_inode, realpath.dentry); + else + err = vfs_unlink(upperdir->d_inode, realpath.dentry); + if (err) + goto out_d_drop; + + ovl_dentry_version_inc(dentry->d_parent); + } + + if (type != OVL_PATH_UPPER || ovl_dentry_is_opaque(dentry)) + err = ovl_whiteout(upperdir, dentry); + + /* + * Keeping this dentry hashed would mean having to release + * upperpath/lowerpath, which could only be done if we are the + * sole user of this dentry. Too tricky... Just unhash for + * now. + */ +out_d_drop: + d_drop(dentry); + mutex_unlock(&upperdir->d_inode->i_mutex); + + return err; +} + +static int ovl_unlink(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, false); +} + + +static int ovl_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err; + enum ovl_path_type type; + + type = ovl_path_type(dentry); + if (type != OVL_PATH_UPPER) { + err = ovl_check_empty_and_clear(dentry, type); + if (err) + return err; + } + + return ovl_do_remove(dentry, true); +} + +static int ovl_link(struct dentry *old, struct inode *newdir, + struct dentry *new) +{ + int err; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *upperdir; + + err = ovl_copy_up(old); + if (err) + goto out; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out; + + upperdir = ovl_dentry_upper(new->d_parent); + mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); + newdentry = ovl_lookup_create(upperdir, new); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + olddentry = ovl_dentry_upper(old); + err = vfs_link(olddentry, upperdir->d_inode, newdentry); + if (!err) { + if (WARN_ON(!newdentry->d_inode)) { + dput(newdentry); + err = -ENOENT; + goto out_unlock; + } + + ovl_dentry_version_inc(new->d_parent); + ovl_dentry_update(new, newdentry); + + ihold(old->d_inode); + d_instantiate(new, old->d_inode); + } else { + if (ovl_dentry_is_opaque(new)) + ovl_whiteout(upperdir, new); + dput(newdentry); + } +out_unlock: + mutex_unlock(&upperdir->d_inode->i_mutex); +out: + return err; + +} + +static int ovl_rename(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new) +{ + int err; + enum ovl_path_type old_type; + enum ovl_path_type new_type; + struct dentry *old_upperdir; + struct dentry *new_upperdir; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *trap; + bool old_opaque; + bool new_opaque; + bool new_create = false; + bool is_dir = S_ISDIR(old->d_inode->i_mode); + + /* Don't copy up directory trees */ + old_type = ovl_path_type(old); + if (old_type != OVL_PATH_UPPER && is_dir) + return -EXDEV; + + if (new->d_inode) { + new_type = ovl_path_type(new); + + if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { + if (ovl_dentry_lower(old)->d_inode == + ovl_dentry_lower(new)->d_inode) + return 0; + } + if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) { + if (ovl_dentry_upper(old)->d_inode == + ovl_dentry_upper(new)->d_inode) + return 0; + } + + if (new_type != OVL_PATH_UPPER && + S_ISDIR(new->d_inode->i_mode)) { + err = ovl_check_empty_and_clear(new, new_type); + if (err) + return err; + } + } else { + new_type = OVL_PATH_UPPER; + } + + err = ovl_copy_up(old); + if (err) + return err; + + err = ovl_copy_up(new->d_parent); + if (err) + return err; + + old_upperdir = ovl_dentry_upper(old->d_parent); + new_upperdir = ovl_dentry_upper(new->d_parent); + + trap = lock_rename(new_upperdir, old_upperdir); + + olddentry = ovl_dentry_upper(old); + newdentry = ovl_dentry_upper(new); + if (newdentry) { + dget(newdentry); + } else { + new_create = true; + newdentry = ovl_lookup_create(new_upperdir, new); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + } + + err = -ESTALE; + if (olddentry->d_parent != old_upperdir) + goto out_dput; + if (newdentry->d_parent != new_upperdir) + goto out_dput; + if (olddentry == trap) + goto out_dput; + if (newdentry == trap) + goto out_dput; + + old_opaque = ovl_dentry_is_opaque(old); + new_opaque = ovl_dentry_is_opaque(new) || new_type != OVL_PATH_UPPER; + + if (is_dir && !old_opaque && new_opaque) { + err = ovl_set_opaque(olddentry); + if (err) + goto out_dput; + } + + err = vfs_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry); + + if (err) { + if (new_create && ovl_dentry_is_opaque(new)) + ovl_whiteout(new_upperdir, new); + if (is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(olddentry); + goto out_dput; + } + + if (old_type != OVL_PATH_UPPER || old_opaque) + err = ovl_whiteout(old_upperdir, old); + if (is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(olddentry); + + if (old_opaque != new_opaque) + ovl_dentry_set_opaque(old, new_opaque); + + ovl_dentry_version_inc(old->d_parent); + ovl_dentry_version_inc(new->d_parent); + +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(new_upperdir, old_upperdir); + return err; +} + +const struct inode_operations ovl_dir_inode_operations = { + .lookup = ovl_lookup, + .mkdir = ovl_mkdir, + .symlink = ovl_symlink, + .unlink = ovl_unlink, + .rmdir = ovl_rmdir, + .rename = ovl_rename, + .link = ovl_link, + .setattr = ovl_setattr, + .create = ovl_create, + .mknod = ovl_mknod, + .permission = ovl_permission, + .getattr = ovl_dir_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c new file mode 100644 index 000000000000..c220ea745f24 --- /dev/null +++ b/fs/overlayfs/inode.c @@ -0,0 +1,384 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include "overlayfs.h" + +int ovl_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct dentry *upperdentry; + int err; + + if ((attr->ia_valid & ATTR_SIZE) && !ovl_dentry_upper(dentry)) + err = ovl_copy_up_truncate(dentry, attr->ia_size); + else + err = ovl_copy_up(dentry); + if (err) + return err; + + upperdentry = ovl_dentry_upper(dentry); + + if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr); + mutex_unlock(&upperdentry->d_inode->i_mutex); + + return err; +} + +static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct path realpath; + + ovl_path_real(dentry, &realpath); + return vfs_getattr(realpath.mnt, realpath.dentry, stat); +} + +int ovl_permission(struct inode *inode, int mask) +{ + struct ovl_entry *oe; + struct dentry *alias = NULL; + struct inode *realinode; + struct dentry *realdentry; + bool is_upper; + int err; + + if (S_ISDIR(inode->i_mode)) { + oe = inode->i_private; + } else if (mask & MAY_NOT_BLOCK) { + return -ECHILD; + } else { + /* + * For non-directories find an alias and get the info + * from there. + */ + spin_lock(&inode->i_lock); + if (WARN_ON(list_empty(&inode->i_dentry))) { + spin_unlock(&inode->i_lock); + return -ENOENT; + } + alias = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + dget(alias); + spin_unlock(&inode->i_lock); + oe = alias->d_fsdata; + } + + realdentry = ovl_entry_real(oe, &is_upper); + + /* Careful in RCU walk mode */ + realinode = ACCESS_ONCE(realdentry->d_inode); + if (!realinode) { + WARN_ON(!(mask & MAY_NOT_BLOCK)); + err = -ENOENT; + goto out_dput; + } + + if (mask & MAY_WRITE) { + umode_t mode = realinode->i_mode; + + /* + * Writes will always be redirected to upper layer, so + * ignore lower layer being read-only. + * + * If the overlay itself is read-only then proceed + * with the permission check, don't return EROFS. + * This will only happen if this is the lower layer of + * another overlayfs. + * + * If upper fs becomes read-only after the overlay was + * constructed return EROFS to prevent modification of + * upper layer. + */ + err = -EROFS; + if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + goto out_dput; + + /* + * Nobody gets write access to an immutable file. + */ + err = -EACCES; + if (IS_IMMUTABLE(realinode)) + goto out_dput; + } + + if (realinode->i_op->permission) + err = realinode->i_op->permission(realinode, mask); + else + err = generic_permission(realinode, mask); +out_dput: + dput(alias); + return err; +} + + +struct ovl_link_data { + struct dentry *realdentry; + void *cookie; +}; + +static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + void *ret; + struct dentry *realdentry; + struct inode *realinode; + + realdentry = ovl_dentry_real(dentry); + realinode = realdentry->d_inode; + + if (WARN_ON(!realinode->i_op->follow_link)) + return ERR_PTR(-EPERM); + + ret = realinode->i_op->follow_link(realdentry, nd); + if (IS_ERR(ret)) + return ret; + + if (realinode->i_op->put_link) { + struct ovl_link_data *data; + + data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); + if (!data) { + realinode->i_op->put_link(realdentry, nd, ret); + return ERR_PTR(-ENOMEM); + } + data->realdentry = realdentry; + data->cookie = ret; + + return data; + } else { + return NULL; + } +} + +static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + struct inode *realinode; + struct ovl_link_data *data = c; + + if (!data) + return; + + realinode = data->realdentry->d_inode; + realinode->i_op->put_link(data->realdentry, nd, data->cookie); + kfree(data); +} + +static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + struct path realpath; + struct inode *realinode; + + ovl_path_real(dentry, &realpath); + realinode = realpath.dentry->d_inode; + + if (!realinode->i_op->readlink) + return -EINVAL; + + touch_atime(&realpath); + + return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); +} + + +static bool ovl_is_private_xattr(const char *name) +{ + return strncmp(name, "trusted.overlay.", 14) == 0; +} + +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct dentry *upperdentry; + + if (ovl_is_private_xattr(name)) + return -EPERM; + + err = ovl_copy_up(dentry); + if (err) + return err; + + upperdentry = ovl_dentry_upper(dentry); + return vfs_setxattr(upperdentry, name, value, size, flags); +} + +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + return -ENODATA; + + return vfs_getxattr(ovl_dentry_real(dentry), name, value, size); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + ssize_t res; + int off; + + res = vfs_listxattr(ovl_dentry_real(dentry), list, size); + if (res <= 0 || size == 0) + return res; + + if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE) + return res; + + /* filter out private xattrs */ + for (off = 0; off < res;) { + char *s = list + off; + size_t slen = strlen(s) + 1; + + BUG_ON(off + slen > res); + + if (ovl_is_private_xattr(s)) { + res -= slen; + memmove(s, s + slen, res - off); + } else { + off += slen; + } + } + + return res; +} + +int ovl_removexattr(struct dentry *dentry, const char *name) +{ + int err; + struct path realpath; + enum ovl_path_type type; + + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + return -ENODATA; + + type = ovl_path_real(dentry, &realpath); + if (type == OVL_PATH_LOWER) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + return err; + + err = ovl_copy_up(dentry); + if (err) + return err; + + ovl_path_upper(dentry, &realpath); + } + + return vfs_removexattr(realpath.dentry, name); +} + +static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, + struct dentry *realdentry) +{ + if (type != OVL_PATH_LOWER) + return false; + + if (special_file(realdentry->d_inode->i_mode)) + return false; + + if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC)) + return false; + + return true; +} + +static struct file *ovl_open(struct dentry *dentry, struct file *file, + const struct cred *cred) +{ + int err; + struct path realpath; + enum ovl_path_type type; + + type = ovl_path_real(dentry, &realpath); + if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { + if (file->f_flags & O_TRUNC) + err = ovl_copy_up_truncate(dentry, 0); + else + err = ovl_copy_up(dentry); + if (err) + return ERR_PTR(err); + + ovl_path_upper(dentry, &realpath); + } + + return vfs_open(&realpath, file, cred); +} + +static const struct inode_operations ovl_file_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, + .open = ovl_open, +}; + +static const struct inode_operations ovl_symlink_inode_operations = { + .setattr = ovl_setattr, + .follow_link = ovl_follow_link, + .put_link = ovl_put_link, + .readlink = ovl_readlink, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return NULL; + + mode &= S_IFMT; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_flags |= S_NOATIME | S_NOCMTIME; + + switch (mode) { + case S_IFDIR: + inode->i_private = oe; + inode->i_op = &ovl_dir_inode_operations; + inode->i_fop = &ovl_dir_operations; + break; + + case S_IFLNK: + inode->i_op = &ovl_symlink_inode_operations; + break; + + case S_IFREG: + case S_IFSOCK: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + inode->i_op = &ovl_file_inode_operations; + break; + + default: + WARN(1, "illegal file type: %i\n", mode); + inode = NULL; + } + + return inode; + +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h new file mode 100644 index 000000000000..1dd05f76604b --- /dev/null +++ b/fs/overlayfs/overlayfs.h @@ -0,0 +1,64 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +struct ovl_entry; + +enum ovl_path_type { + OVL_PATH_UPPER, + OVL_PATH_MERGE, + OVL_PATH_LOWER, +}; + +extern const char *ovl_opaque_xattr; +extern const char *ovl_whiteout_xattr; +extern const struct dentry_operations ovl_dentry_operations; + +enum ovl_path_type ovl_path_type(struct dentry *dentry); +u64 ovl_dentry_version_get(struct dentry *dentry); +void ovl_dentry_version_inc(struct dentry *dentry); +void ovl_path_upper(struct dentry *dentry, struct path *path); +void ovl_path_lower(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +struct dentry *ovl_dentry_upper(struct dentry *dentry); +struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); +bool ovl_dentry_is_opaque(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); +bool ovl_is_whiteout(struct dentry *dentry); +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd); +struct file *ovl_path_open(struct path *path, int flags); + +struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, + struct kstat *stat, const char *link); + +/* readdir.c */ +extern const struct file_operations ovl_dir_operations; +int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type); + +/* inode.c */ +int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_permission(struct inode *inode, int mask); +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); +int ovl_removexattr(struct dentry *dentry, const char *name); + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe); +/* dir.c */ +extern const struct inode_operations ovl_dir_inode_operations; + +/* copy_up.c */ +int ovl_copy_up(struct dentry *dentry); +int ovl_copy_up_truncate(struct dentry *dentry, loff_t size); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c new file mode 100644 index 000000000000..0797efbc7be4 --- /dev/null +++ b/fs/overlayfs/readdir.c @@ -0,0 +1,566 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +struct ovl_cache_entry { + const char *name; + unsigned int len; + unsigned int type; + u64 ino; + bool is_whiteout; + struct list_head l_node; + struct rb_node node; +}; + +struct ovl_readdir_data { + struct rb_root *root; + struct list_head *list; + struct list_head *middle; + struct dentry *dir; + int count; + int err; +}; + +struct ovl_dir_file { + bool is_real; + bool is_cached; + struct list_head cursor; + u64 cache_version; + struct list_head cache; + struct file *realfile; +}; + +static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) +{ + return container_of(n, struct ovl_cache_entry, node); +} + +static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, + const char *name, int len) +{ + struct rb_node *node = root->rb_node; + int cmp; + + while (node) { + struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); + + cmp = strncmp(name, p->name, len); + if (cmp > 0) + node = p->node.rb_right; + else if (cmp < 0 || len < p->len) + node = p->node.rb_left; + else + return p; + } + + return NULL; +} + +static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, + u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL); + if (p) { + char *name_copy = (char *) (p + 1); + memcpy(name_copy, name, len); + name_copy[len] = '\0'; + p->name = name_copy; + p->len = len; + p->type = d_type; + p->ino = ino; + p->is_whiteout = false; + } + + return p; +} + +static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, u64 ino, + unsigned int d_type) +{ + struct rb_node **newp = &rdd->root->rb_node; + struct rb_node *parent = NULL; + struct ovl_cache_entry *p; + + while (*newp) { + int cmp; + struct ovl_cache_entry *tmp; + + parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + return 0; + } + + p = ovl_cache_entry_new(name, len, ino, d_type); + if (p == NULL) + return -ENOMEM; + + list_add_tail(&p->l_node, rdd->list); + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, rdd->root); + + return 0; +} + +static int ovl_fill_lower(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_readdir_data *rdd = buf; + struct ovl_cache_entry *p; + + rdd->count++; + p = ovl_cache_entry_find(rdd->root, name, namelen); + if (p) { + list_move_tail(&p->l_node, rdd->middle); + } else { + p = ovl_cache_entry_new(name, namelen, ino, d_type); + if (p == NULL) + rdd->err = -ENOMEM; + else + list_add_tail(&p->l_node, rdd->middle); + } + + return rdd->err; +} + +static void ovl_cache_free(struct list_head *list) +{ + struct ovl_cache_entry *p; + struct ovl_cache_entry *n; + + list_for_each_entry_safe(p, n, list, l_node) + kfree(p); + + INIT_LIST_HEAD(list); +} + +static int ovl_fill_upper(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_readdir_data *rdd = buf; + + rdd->count++; + return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); +} + +static inline int ovl_dir_read(struct path *realpath, + struct ovl_readdir_data *rdd, filldir_t filler) +{ + struct file *realfile; + int err; + + realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + do { + rdd->count = 0; + rdd->err = 0; + err = vfs_readdir(realfile, filler, rdd); + if (err >= 0) + err = rdd->err; + } while (!err && rdd->count); + fput(realfile); + + return 0; +} + +static void ovl_dir_reset(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + enum ovl_path_type type = ovl_path_type(file->f_path.dentry); + + if (ovl_dentry_version_get(file->f_path.dentry) != od->cache_version) { + list_del_init(&od->cursor); + ovl_cache_free(&od->cache); + od->is_cached = false; + } + WARN_ON(!od->is_real && type != OVL_PATH_MERGE); + if (od->is_real && type == OVL_PATH_MERGE) { + fput(od->realfile); + od->realfile = NULL; + od->is_real = false; + } +} + +static int ovl_dir_mark_whiteouts(struct ovl_readdir_data *rdd) +{ + struct ovl_cache_entry *p; + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + ovl_cache_free(rdd->list); + return -ENOMEM; + } + + /* + * CAP_SYS_ADMIN for getxattr + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + mutex_lock(&rdd->dir->d_inode->i_mutex); + list_for_each_entry(p, rdd->list, l_node) { + if (p->type != DT_LNK) + continue; + + dentry = lookup_one_len(p->name, rdd->dir, p->len); + if (IS_ERR(dentry)) + continue; + + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + mutex_unlock(&rdd->dir->d_inode->i_mutex); + + revert_creds(old_cred); + put_cred(override_cred); + + return 0; +} + +static inline int ovl_dir_read_merged(struct path *upperpath, + struct path *lowerpath, + struct ovl_readdir_data *rdd) +{ + int err; + struct rb_root root = RB_ROOT; + struct list_head middle; + + rdd->root = &root; + if (upperpath->dentry) { + rdd->dir = upperpath->dentry; + err = ovl_dir_read(upperpath, rdd, ovl_fill_upper); + if (err) + goto out; + + err = ovl_dir_mark_whiteouts(rdd); + if (err) + goto out; + } + /* + * Insert lowerpath entries before upperpath ones, this allows + * offsets to be reasonably constant + */ + list_add(&middle, rdd->list); + rdd->middle = &middle; + err = ovl_dir_read(lowerpath, rdd, ovl_fill_lower); + list_del(&middle); +out: + rdd->root = NULL; + + return err; +} + +static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) +{ + struct list_head *l; + loff_t off; + + l = od->cache.next; + for (off = 0; off < pos; off++) { + if (l == &od->cache) + break; + l = l->next; + } + list_move_tail(&od->cursor, l); +} + +static int ovl_readdir(struct file *file, void *buf, filldir_t filler) +{ + struct ovl_dir_file *od = file->private_data; + int res; + + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_readdir(od->realfile, filler, buf); + file->f_pos = od->realfile->f_pos; + + return res; + } + + if (!od->is_cached) { + struct path lowerpath; + struct path upperpath; + struct ovl_readdir_data rdd = { .list = &od->cache }; + + ovl_path_lower(file->f_path.dentry, &lowerpath); + ovl_path_upper(file->f_path.dentry, &upperpath); + + res = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd); + if (res) { + ovl_cache_free(rdd.list); + return res; + } + + od->cache_version = ovl_dentry_version_get(file->f_path.dentry); + od->is_cached = true; + + ovl_seek_cursor(od, file->f_pos); + } + + while (od->cursor.next != &od->cache) { + int over; + loff_t off; + struct ovl_cache_entry *p; + + p = list_entry(od->cursor.next, struct ovl_cache_entry, l_node); + off = file->f_pos; + if (!p->is_whiteout) { + over = filler(buf, p->name, p->len, off, p->ino, + p->type); + if (over) + break; + } + file->f_pos++; + list_move(&od->cursor, &p->l_node); + } + + return 0; +} + +static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t res; + struct ovl_dir_file *od = file->private_data; + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_llseek(od->realfile, offset, origin); + file->f_pos = od->realfile->f_pos; + } else { + res = -EINVAL; + + switch (origin) { + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_SET: + break; + default: + goto out_unlock; + } + if (offset < 0) + goto out_unlock; + + if (offset != file->f_pos) { + file->f_pos = offset; + if (od->is_cached) + ovl_seek_cursor(od, offset); + } + res = offset; + } +out_unlock: + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + + return res; +} + +static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct ovl_dir_file *od = file->private_data; + + /* May need to reopen directory if it got copied up */ + if (!od->realfile) { + struct path upperpath; + + ovl_path_upper(file->f_path.dentry, &upperpath); + od->realfile = ovl_path_open(&upperpath, O_RDONLY); + if (IS_ERR(od->realfile)) + return PTR_ERR(od->realfile); + } + + return vfs_fsync_range(od->realfile, start, end, datasync); +} + +static int ovl_dir_release(struct inode *inode, struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + + list_del(&od->cursor); + ovl_cache_free(&od->cache); + if (od->realfile) + fput(od->realfile); + kfree(od); + + return 0; +} + +static int ovl_dir_open(struct inode *inode, struct file *file) +{ + struct path realpath; + struct file *realfile; + struct ovl_dir_file *od; + enum ovl_path_type type; + + od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); + if (!od) + return -ENOMEM; + + type = ovl_path_real(file->f_path.dentry, &realpath); + realfile = ovl_path_open(&realpath, file->f_flags); + if (IS_ERR(realfile)) { + kfree(od); + return PTR_ERR(realfile); + } + INIT_LIST_HEAD(&od->cache); + INIT_LIST_HEAD(&od->cursor); + od->is_cached = false; + od->realfile = realfile; + od->is_real = (type != OVL_PATH_MERGE); + file->private_data = od; + + return 0; +} + +const struct file_operations ovl_dir_operations = { + .read = generic_read_dir, + .open = ovl_dir_open, + .readdir = ovl_readdir, + .llseek = ovl_dir_llseek, + .fsync = ovl_dir_fsync, + .release = ovl_dir_release, +}; + +static int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) +{ + int err; + struct path lowerpath; + struct path upperpath; + struct ovl_cache_entry *p; + struct ovl_readdir_data rdd = { .list = list }; + + ovl_path_upper(dentry, &upperpath); + ovl_path_lower(dentry, &lowerpath); + + err = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd); + if (err) + return err; + + err = 0; + + list_for_each_entry(p, list, l_node) { + if (p->is_whiteout) + continue; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + err = -ENOTEMPTY; + break; + } + + return err; +} + +static int ovl_remove_whiteouts(struct dentry *dir, struct list_head *list) +{ + struct path upperpath; + struct dentry *upperdir; + struct ovl_cache_entry *p; + const struct cred *old_cred; + struct cred *override_cred; + int err; + + ovl_path_upper(dir, &upperpath); + upperdir = upperpath.dentry; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + /* + * CAP_DAC_OVERRIDE for lookup and unlink + * CAP_SYS_ADMIN for setxattr of "trusted" namespace + * CAP_FOWNER for unlink in sticky directory + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + old_cred = override_creds(override_cred); + + err = vfs_setxattr(upperdir, ovl_opaque_xattr, "y", 1, 0); + if (err) + goto out_revert_creds; + + mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT); + list_for_each_entry(p, list, l_node) { + struct dentry *dentry; + int ret; + + if (!p->is_whiteout) + continue; + + dentry = lookup_one_len(p->name, upperdir, p->len); + if (IS_ERR(dentry)) { + printk(KERN_WARNING + "overlayfs: failed to lookup whiteout %.*s: %li\n", + p->len, p->name, PTR_ERR(dentry)); + continue; + } + ret = vfs_unlink(upperdir->d_inode, dentry); + dput(dentry); + if (ret) + printk(KERN_WARNING + "overlayfs: failed to unlink whiteout %.*s: %i\n", + p->len, p->name, ret); + } + mutex_unlock(&upperdir->d_inode->i_mutex); + +out_revert_creds: + revert_creds(old_cred); + put_cred(override_cred); + + return err; +} + +int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type) +{ + int err; + LIST_HEAD(list); + + err = ovl_check_empty_dir(dentry, &list); + if (!err && type == OVL_PATH_MERGE) + err = ovl_remove_whiteouts(dentry, &list); + + ovl_cache_free(&list); + + return err; +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c new file mode 100644 index 000000000000..0aaa4c29546f --- /dev/null +++ b/fs/overlayfs/super.c @@ -0,0 +1,611 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +MODULE_AUTHOR("Miklos Szeredi "); +MODULE_DESCRIPTION("Overlay filesystem"); +MODULE_LICENSE("GPL"); + +struct ovl_fs { + struct vfsmount *upper_mnt; + struct vfsmount *lower_mnt; +}; + +struct ovl_entry { + /* + * Keep "double reference" on upper dentries, so that + * d_delete() doesn't think it's OK to reset d_inode to NULL. + */ + struct dentry *__upperdentry; + struct dentry *lowerdentry; + union { + struct { + u64 version; + bool opaque; + }; + struct rcu_head rcu; + }; +}; + +const char *ovl_whiteout_xattr = "trusted.overlay.whiteout"; +const char *ovl_opaque_xattr = "trusted.overlay.opaque"; + + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->__upperdentry) { + if (oe->lowerdentry && S_ISDIR(dentry->d_inode->i_mode)) + return OVL_PATH_MERGE; + else + return OVL_PATH_UPPER; + } else { + return OVL_PATH_LOWER; + } +} + +static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) +{ + struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); + smp_read_barrier_depends(); + return upperdentry; +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_upperdentry_dereference(oe); +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->lower_mnt; + path->dentry = oe->lowerdentry; +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + + enum ovl_path_type type = ovl_path_type(dentry); + + if (type == OVL_PATH_LOWER) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return ovl_upperdentry_dereference(oe); +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->lowerdentry; +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (!realdentry) + realdentry = oe->lowerdentry; + + return realdentry; +} + +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) +{ + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (realdentry) { + *is_upper = true; + } else { + realdentry = oe->lowerdentry; + *is_upper = false; + } + return realdentry; +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + return oe->opaque; +} + +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) +{ + struct ovl_entry *oe = dentry->d_fsdata; + oe->opaque = opaque; +} + +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(oe->__upperdentry); + BUG_ON(!upperdentry->d_inode); + smp_wmb(); + oe->__upperdentry = dget(upperdentry); +} + +void ovl_dentry_version_inc(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + oe->version++; +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + return oe->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + int res; + char val; + + if (!dentry) + return false; + if (!dentry->d_inode) + return false; + if (!S_ISLNK(dentry->d_inode->i_mode)) + return false; + + res = vfs_getxattr(dentry, ovl_whiteout_xattr, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + int res; + char val; + + if (!S_ISDIR(dentry->d_inode->i_mode)) + return false; + + res = vfs_getxattr(dentry, ovl_opaque_xattr, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static void ovl_entry_free(struct rcu_head *head) +{ + struct ovl_entry *oe = container_of(head, struct ovl_entry, rcu); + kfree(oe); +} + +static void ovl_dentry_release(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe) { + dput(oe->__upperdentry); + dput(oe->__upperdentry); + dput(oe->lowerdentry); + call_rcu(&oe->rcu, ovl_entry_free); + } +} + +const struct dentry_operations ovl_dentry_operations = { + .d_release = ovl_dentry_release, +}; + +static struct ovl_entry *ovl_alloc_entry(void) +{ + return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); +} + +static inline struct dentry *ovl_lookup_real(struct dentry *dir, + struct qstr *name) +{ + struct dentry *dentry; + + mutex_lock(&dir->d_inode->i_mutex); + dentry = lookup_one_len(name->name, dir, name->len); + mutex_unlock(&dir->d_inode->i_mutex); + + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) == -ENOENT) + dentry = NULL; + } else if (!dentry->d_inode) { + dput(dentry); + dentry = NULL; + } + return dentry; +} + +static int ovl_do_lookup(struct dentry *dentry) +{ + struct ovl_entry *oe; + struct dentry *upperdir; + struct dentry *lowerdir; + struct dentry *upperdentry = NULL; + struct dentry *lowerdentry = NULL; + struct inode *inode = NULL; + int err; + + err = -ENOMEM; + oe = ovl_alloc_entry(); + if (!oe) + goto out; + + upperdir = ovl_dentry_upper(dentry->d_parent); + lowerdir = ovl_dentry_lower(dentry->d_parent); + + if (upperdir) { + upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto out_put_dir; + + if (lowerdir && upperdentry && + (S_ISLNK(upperdentry->d_inode->i_mode) || + S_ISDIR(upperdentry->d_inode->i_mode))) { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_dput_upper; + + /* CAP_SYS_ADMIN needed for getxattr */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + old_cred = override_creds(override_cred); + + if (ovl_is_opaquedir(upperdentry)) { + oe->opaque = true; + } else if (ovl_is_whiteout(upperdentry)) { + dput(upperdentry); + upperdentry = NULL; + oe->opaque = true; + } + revert_creds(old_cred); + put_cred(override_cred); + } + } + if (lowerdir && !oe->opaque) { + lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); + err = PTR_ERR(lowerdentry); + if (IS_ERR(lowerdentry)) + goto out_dput_upper; + } + + if (lowerdentry && upperdentry && + (!S_ISDIR(upperdentry->d_inode->i_mode) || + !S_ISDIR(lowerdentry->d_inode->i_mode))) { + dput(lowerdentry); + lowerdentry = NULL; + oe->opaque = true; + } + + if (lowerdentry || upperdentry) { + struct dentry *realdentry; + + realdentry = upperdentry ? upperdentry : lowerdentry; + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, + oe); + if (!inode) + goto out_dput; + } + + if (upperdentry) + oe->__upperdentry = dget(upperdentry); + + if (lowerdentry) + oe->lowerdentry = lowerdentry; + + dentry->d_fsdata = oe; + dentry->d_op = &ovl_dentry_operations; + d_add(dentry, inode); + + return 0; + +out_dput: + dput(lowerdentry); +out_dput_upper: + dput(upperdentry); +out_put_dir: + kfree(oe); +out: + return err; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int err = ovl_do_lookup(dentry); + + if (err) + return ERR_PTR(err); + + return NULL; +} + +struct file *ovl_path_open(struct path *path, int flags) +{ + path_get(path); + return dentry_open(path->dentry, path->mnt, flags, current_cred()); +} + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + if (!(sb->s_flags & MS_RDONLY)) + mnt_drop_write(ufs->upper_mnt); + + mntput(ufs->upper_mnt); + mntput(ufs->lower_mnt); + + kfree(ufs); +} + +static int ovl_remount_fs(struct super_block *sb, int *flagsp, char *data) +{ + int flags = *flagsp; + struct ovl_fs *ufs = sb->s_fs_info; + + /* When remounting rw or ro, we need to adjust the write access to the + * upper fs. + */ + if (((flags ^ sb->s_flags) & MS_RDONLY) == 0) + /* No change to readonly status */ + return 0; + + if (flags & MS_RDONLY) { + mnt_drop_write(ufs->upper_mnt); + return 0; + } else + return mnt_want_write(ufs->upper_mnt); +} + +static const struct super_operations ovl_super_operations = { + .put_super = ovl_put_super, + .remount_fs = ovl_remount_fs, +}; + +struct ovl_config { + char *lowerdir; + char *upperdir; +}; + +enum { + Opt_lowerdir, + Opt_upperdir, + Opt_err, +}; + +static const match_table_t ovl_tokens = { + {Opt_lowerdir, "lowerdir=%s"}, + {Opt_upperdir, "upperdir=%s"}, + {Opt_err, NULL} +}; + +static int ovl_parse_opt(char *opt, struct ovl_config *config) +{ + char *p; + + config->upperdir = NULL; + config->lowerdir = NULL; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, ovl_tokens, args); + switch (token) { + case Opt_upperdir: + kfree(config->upperdir); + config->upperdir = match_strdup(&args[0]); + if (!config->upperdir) + return -ENOMEM; + break; + + case Opt_lowerdir: + kfree(config->lowerdir); + config->lowerdir = match_strdup(&args[0]); + if (!config->lowerdir) + return -ENOMEM; + break; + + default: + return -EINVAL; + } + } + return 0; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path lowerpath; + struct path upperpath; + struct inode *root_inode; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ufs; + struct ovl_config config; + int err; + + err = ovl_parse_opt((char *) data, &config); + if (err) + goto out; + + err = -EINVAL; + if (!config.upperdir || !config.lowerdir) { + printk(KERN_ERR "overlayfs: missing upperdir or lowerdir\n"); + goto out_free_config; + } + + err = -ENOMEM; + ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) + goto out_free_config; + + oe = ovl_alloc_entry(); + if (oe == NULL) + goto out_free_ufs; + + err = kern_path(config.upperdir, LOOKUP_FOLLOW, &upperpath); + if (err) + goto out_free_oe; + + err = kern_path(config.lowerdir, LOOKUP_FOLLOW, &lowerpath); + if (err) + goto out_put_upperpath; + + err = -ENOTDIR; + if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || + !S_ISDIR(lowerpath.dentry->d_inode->i_mode)) + goto out_put_lowerpath; + + ufs->upper_mnt = clone_private_mount(&upperpath); + err = PTR_ERR(ufs->upper_mnt); + if (IS_ERR(ufs->upper_mnt)) { + printk(KERN_ERR "overlayfs: failed to clone upperpath\n"); + goto out_put_lowerpath; + } + + ufs->lower_mnt = clone_private_mount(&lowerpath); + err = PTR_ERR(ufs->lower_mnt); + if (IS_ERR(ufs->lower_mnt)) { + printk(KERN_ERR "overlayfs: failed to clone lowerpath\n"); + goto out_put_upper_mnt; + } + + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + ufs->lower_mnt->mnt_flags |= MNT_READONLY; + + /* If the upper fs is r/o, we mark overlayfs r/o too */ + if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) + sb->s_flags |= MS_RDONLY; + + if (!(sb->s_flags & MS_RDONLY)) { + err = mnt_want_write(ufs->upper_mnt); + if (err) + goto out_put_lower_mnt; + } + + err = -ENOMEM; + root_inode = ovl_new_inode(sb, S_IFDIR, oe); + if (!root_inode) + goto out_drop_write; + + root_dentry = d_make_root(root_inode); + if (!root_dentry) + goto out_drop_write; + + mntput(upperpath.mnt); + mntput(lowerpath.mnt); + + oe->__upperdentry = dget(upperpath.dentry); + oe->lowerdentry = lowerpath.dentry; + + root_dentry->d_fsdata = oe; + root_dentry->d_op = &ovl_dentry_operations; + + sb->s_op = &ovl_super_operations; + sb->s_root = root_dentry; + sb->s_fs_info = ufs; + + return 0; + +out_drop_write: + if (!(sb->s_flags & MS_RDONLY)) + mnt_drop_write(ufs->upper_mnt); +out_put_lower_mnt: + mntput(ufs->lower_mnt); +out_put_upper_mnt: + mntput(ufs->upper_mnt); +out_put_lowerpath: + path_put(&lowerpath); +out_put_upperpath: + path_put(&upperpath); +out_free_oe: + kfree(oe); +out_free_ufs: + kfree(ufs); +out_free_config: + kfree(config.lowerdir); + kfree(config.upperdir); +out: + return err; +} + +static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); +} + +static struct file_system_type ovl_fs_type = { + .owner = THIS_MODULE, + .name = "overlayfs", + .mount = ovl_mount, + .kill_sb = kill_anon_super, +}; + +static int __init ovl_init(void) +{ + return register_filesystem(&ovl_fs_type); +} + +static void __exit ovl_exit(void) +{ + unregister_filesystem(&ovl_fs_type); +} + +module_init(ovl_init); +module_exit(ovl_exit); -- cgit v1.2.3 From 6e3de5bb9f8fc243852dca4a88989b6afa8df719 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 1 Oct 2010 18:48:02 +0100 Subject: UBUNTU: ubuntu: overlayfs -- overlayfs: add statfs support Add support for statfs to the overlayfs filesystem. As the upper layer is the target of all write operations assume that the space in that filesystem is the space in the overlayfs. There will be some inaccuracy as overwriting a file will copy it up and consume space we were not expecting, but it is better than nothing. Use the upper layer dentry and mount from the overlayfs root inode, passing the statfs call to that filesystem. Signed-off-by: Andy Whitcroft Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 0aaa4c29546f..50cbef5e4000 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -406,9 +406,29 @@ static int ovl_remount_fs(struct super_block *sb, int *flagsp, char *data) return mnt_want_write(ufs->upper_mnt); } +/** + * ovl_statfs + * @sb: The overlayfs super block + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. As writes always target the upper layer + * filesystem pass the statfs to the same filesystem. + */ +static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct dentry *root_dentry = dentry->d_sb->s_root; + struct path path; + ovl_path_upper(root_dentry, &path); + + if (!path.dentry->d_sb->s_op->statfs) + return -ENOSYS; + return path.dentry->d_sb->s_op->statfs(path.dentry, buf); +} + static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, .remount_fs = ovl_remount_fs, + .statfs = ovl_statfs, }; struct ovl_config { -- cgit v1.2.3 From 4bc5d3cf995626b5fe123aa6d58080100547fab7 Mon Sep 17 00:00:00 2001 From: Erez Zadok Date: Mon, 23 May 2011 20:59:20 -0400 Subject: UBUNTU: ubuntu: overlayfs -- overlayfs: implement show_options This is useful because of the stacking nature of overlayfs. Users like to find out (via /proc/mounts) which lower/upper directory were used at mount time. Signed-off-by: Erez Zadok Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 63 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 50cbef5e4000..b8f8fa456302 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -17,17 +17,27 @@ #include #include #include +#include #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); +struct ovl_config { + char *lowerdir; + char *upperdir; +}; + +/* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; struct vfsmount *lower_mnt; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; }; +/* private information held for every overlayfs dentry */ struct ovl_entry { /* * Keep "double reference" on upper dentries, so that @@ -384,6 +394,8 @@ static void ovl_put_super(struct super_block *sb) mntput(ufs->upper_mnt); mntput(ufs->lower_mnt); + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); kfree(ufs); } @@ -425,15 +437,27 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) return path.dentry->d_sb->s_op->statfs(path.dentry, buf); } +/** + * ovl_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +static int ovl_show_options(struct seq_file *m, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ufs = sb->s_fs_info; + + seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + seq_printf(m, ",upperdir=%s", ufs->config.upperdir); + return 0; +} + static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, .remount_fs = ovl_remount_fs, .statfs = ovl_statfs, -}; - -struct ovl_config { - char *lowerdir; - char *upperdir; + .show_options = ovl_show_options, }; enum { @@ -493,33 +517,32 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ufs; - struct ovl_config config; int err; - err = ovl_parse_opt((char *) data, &config); - if (err) + err = -ENOMEM; + ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) goto out; + err = ovl_parse_opt((char *) data, &ufs->config); + if (err) + goto out_free_ufs; + err = -EINVAL; - if (!config.upperdir || !config.lowerdir) { + if (!ufs->config.upperdir || !ufs->config.lowerdir) { printk(KERN_ERR "overlayfs: missing upperdir or lowerdir\n"); goto out_free_config; } - err = -ENOMEM; - ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); - if (!ufs) - goto out_free_config; - oe = ovl_alloc_entry(); if (oe == NULL) - goto out_free_ufs; + goto out_free_config; - err = kern_path(config.upperdir, LOOKUP_FOLLOW, &upperpath); + err = kern_path(ufs->config.upperdir, LOOKUP_FOLLOW, &upperpath); if (err) goto out_free_oe; - err = kern_path(config.lowerdir, LOOKUP_FOLLOW, &lowerpath); + err = kern_path(ufs->config.lowerdir, LOOKUP_FOLLOW, &lowerpath); if (err) goto out_put_upperpath; @@ -595,11 +618,11 @@ out_put_upperpath: path_put(&upperpath); out_free_oe: kfree(oe); +out_free_config: + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); out_free_ufs: kfree(ufs); -out_free_config: - kfree(config.lowerdir); - kfree(config.upperdir); out: return err; } -- cgit v1.2.3 From e4d0f22b95510fcaea972a44b22004efdf390836 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Mon, 14 May 2012 13:06:14 +0200 Subject: UBUNTU: ubuntu: overlayfs -- overlay: overlay filesystem documentation Document the overlay filesystem. Signed-off-by: Miklos Szeredi --- Documentation/filesystems/overlayfs.txt | 199 ++++++++++++++++++++++++++++++++ MAINTAINERS | 7 ++ 2 files changed, 206 insertions(+) create mode 100644 Documentation/filesystems/overlayfs.txt diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt new file mode 100644 index 000000000000..7161dc391fa9 --- /dev/null +++ b/Documentation/filesystems/overlayfs.txt @@ -0,0 +1,199 @@ +Written by: Neil Brown + +Overlay Filesystem +================== + +This document describes a prototype for a new approach to providing +overlay-filesystem functionality in Linux (sometimes referred to as +union-filesystems). An overlay-filesystem tries to present a +filesystem which is the result over overlaying one filesystem on top +of the other. + +The result will inevitably fail to look exactly like a normal +filesystem for various technical reasons. The expectation is that +many use cases will be able to ignore these differences. + +This approach is 'hybrid' because the objects that appear in the +filesystem do not all appear to belong to that filesystem. In many +cases an object accessed in the union will be indistinguishable +from accessing the corresponding object from the original filesystem. +This is most obvious from the 'st_dev' field returned by stat(2). + +While directories will report an st_dev from the overlay-filesystem, +all non-directory objects will report an st_dev from the lower or +upper filesystem that is providing the object. Similarly st_ino will +only be unique when combined with st_dev, and both of these can change +over the lifetime of a non-directory object. Many applications and +tools ignore these values and will not be affected. + +Upper and Lower +--------------- + +An overlay filesystem combines two filesystems - an 'upper' filesystem +and a 'lower' filesystem. When a name exists in both filesystems, the +object in the 'upper' filesystem is visible while the object in the +'lower' filesystem is either hidden or, in the case of directories, +merged with the 'upper' object. + +It would be more correct to refer to an upper and lower 'directory +tree' rather than 'filesystem' as it is quite possible for both +directory trees to be in the same filesystem and there is no +requirement that the root of a filesystem be given for either upper or +lower. + +The lower filesystem can be any filesystem supported by Linux and does +not need to be writable. The lower filesystem can even be another +overlayfs. The upper filesystem will normally be writable and if it +is it must support the creation of trusted.* extended attributes, and +must provide valid d_type in readdir responses, at least for symbolic +links - so NFS is not suitable. + +A read-only overlay of two read-only filesystems may use any +filesystem type. + +Directories +----------- + +Overlaying mainly involved directories. If a given name appears in both +upper and lower filesystems and refers to a non-directory in either, +then the lower object is hidden - the name refers only to the upper +object. + +Where both upper and lower objects are directories, a merged directory +is formed. + +At mount time, the two directories given as mount options are combined +into a merged directory: + + mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper /overlay + +Then whenever a lookup is requested in such a merged directory, the +lookup is performed in each actual directory and the combined result +is cached in the dentry belonging to the overlay filesystem. If both +actual lookups find directories, both are stored and a merged +directory is created, otherwise only one is stored: the upper if it +exists, else the lower. + +Only the lists of names from directories are merged. Other content +such as metadata and extended attributes are reported for the upper +directory only. These attributes of the lower directory are hidden. + +whiteouts and opaque directories +-------------------------------- + +In order to support rm and rmdir without changing the lower +filesystem, an overlay filesystem needs to record in the upper filesystem +that files have been removed. This is done using whiteouts and opaque +directories (non-directories are always opaque). + +The overlay filesystem uses extended attributes with a +"trusted.overlay." prefix to record these details. + +A whiteout is created as a symbolic link with target +"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y". +When a whiteout is found in the upper level of a merged directory, any +matching name in the lower level is ignored, and the whiteout itself +is also hidden. + +A directory is made opaque by setting the xattr "trusted.overlay.opaque" +to "y". Where the upper filesystem contains an opaque directory, any +directory in the lower filesystem with the same name is ignored. + +readdir +------- + +When a 'readdir' request is made on a merged directory, the upper and +lower directories are each read and the name lists merged in the +obvious way (upper is read first, then lower - entries that already +exist are not re-added). This merged name list is cached in the +'struct file' and so remains as long as the file is kept open. If the +directory is opened and read by two processes at the same time, they +will each have separate caches. A seekdir to the start of the +directory (offset 0) followed by a readdir will cause the cache to be +discarded and rebuilt. + +This means that changes to the merged directory do not appear while a +directory is being read. This is unlikely to be noticed by many +programs. + +seek offsets are assigned sequentially when the directories are read. +Thus if + - read part of a directory + - remember an offset, and close the directory + - re-open the directory some time later + - seek to the remembered offset + +there may be little correlation between the old and new locations in +the list of filenames, particularly if anything has changed in the +directory. + +Readdir on directories that are not merged is simply handled by the +underlying directory (upper or lower). + + +Non-directories +--------------- + +Objects that are not directories (files, symlinks, device-special +files etc.) are presented either from the upper or lower filesystem as +appropriate. When a file in the lower filesystem is accessed in a way +the requires write-access, such as opening for write access, changing +some metadata etc., the file is first copied from the lower filesystem +to the upper filesystem (copy_up). Note that creating a hard-link +also requires copy_up, though of course creation of a symlink does +not. + +The copy_up may turn out to be unnecessary, for example if the file is +opened for read-write but the data is not modified. + +The copy_up process first makes sure that the containing directory +exists in the upper filesystem - creating it and any parents as +necessary. It then creates the object with the same metadata (owner, +mode, mtime, symlink-target etc.) and then if the object is a file, the +data is copied from the lower to the upper filesystem. Finally any +extended attributes are copied up. + +Once the copy_up is complete, the overlay filesystem simply +provides direct access to the newly created file in the upper +filesystem - future operations on the file are barely noticed by the +overlay filesystem (though an operation on the name of the file such as +rename or unlink will of course be noticed and handled). + + +Non-standard behavior +--------------------- + +The copy_up operation essentially creates a new, identical file and +moves it over to the old name. The new file may be on a different +filesystem, so both st_dev and st_ino of the file may change. + +Any open files referring to this inode will access the old data and +metadata. Similarly any file locks obtained before copy_up will not +apply to the copied up file. + +On a file is opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) +and fsetxattr(2) will fail with EROFS. + +If a file with multiple hard links is copied up, then this will +"break" the link. Changes will not be propagated to other names +referring to the same inode. + +Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory +object in overlayfs will not contain vaid absolute paths, only +relative paths leading up to the filesystem's root. This will be +fixed in the future. + +Some operations are not atomic, for example a crash during copy_up or +rename will leave the filesystem in an inconsitent state. This will +be addressed in the future. + +Changes to underlying filesystems +--------------------------------- + +Offline changes, when the overlay is not mounted, are allowed to either +the upper or the lower trees. + +Changes to the underlying filesystems while part of a mounted overlay +filesystem are not allowed. If the underlying filesystem is changed, +the behavior of the overlay is undefined, though it will not result in +a crash or deadlock. diff --git a/MAINTAINERS b/MAINTAINERS index 14bc7071f9df..9c1da0d12f22 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5059,6 +5059,13 @@ F: drivers/scsi/osd/ F: include/scsi/osd_* F: fs/exofs/ +OVERLAYFS FILESYSTEM +M: Miklos Szeredi +L: linux-fsdevel@vger.kernel.org +S: Supported +F: fs/overlayfs/* +F: Documentation/filesystems/overlayfs.txt + P54 WIRELESS DRIVER M: Christian Lamparter L: linux-wireless@vger.kernel.org -- cgit v1.2.3 From 2d017ba0073ac4b69f39eef96bf69fa4b8b01e62 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 May 2012 13:06:14 +0200 Subject: UBUNTU: ubuntu: overlayfs -- fs: limit filesystem stacking depth Add a simple read-only counter to super_block that indicates deep this is in the stack of filesystems. Previously ecryptfs was the only stackable filesystem and it explicitly disallowed multiple layers of itself. Overlayfs, however, can be stacked recursively and also may be stacked on top of ecryptfs or vice versa. To limit the kernel stack usage we must limit the depth of the filesystem stack. Initially the limit is set to 2. Signed-off-by: Miklos Szeredi --- fs/ecryptfs/main.c | 7 +++++++ fs/overlayfs/super.c | 10 ++++++++++ include/linux/fs.h | 11 +++++++++++ 3 files changed, 28 insertions(+) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 68954937a071..c54ea903a169 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -544,6 +544,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; + s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; + + rc = -EINVAL; + if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + printk(KERN_ERR "eCryptfs: maximum fs stacking depth exceeded\n"); + goto out_free; + } inode = ecryptfs_get_inode(path.dentry->d_inode, s); rc = PTR_ERR(inode); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b8f8fa456302..1d2d1e273696 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -551,6 +551,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) !S_ISDIR(lowerpath.dentry->d_inode->i_mode)) goto out_put_lowerpath; + sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, + lowerpath.mnt->mnt_sb->s_stack_depth) + 1; + + err = -EINVAL; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + printk(KERN_ERR "overlayfs: maximum fs stacking depth exceeded\n"); + goto out_put_lowerpath; + } + + ufs->upper_mnt = clone_private_mount(&upperpath); err = PTR_ERR(ufs->upper_mnt); if (IS_ERR(ufs->upper_mnt)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 05d046f75e90..49b86cf0564a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -499,6 +499,12 @@ struct iattr { */ #include +/* + * Maximum number of layers of fs stack. Needs to be limited to + * prevent kernel stack overflow + */ +#define FILESYSTEM_MAX_STACK_DEPTH 2 + /** * enum positive_aop_returns - aop return codes with specific semantics * @@ -1542,6 +1548,11 @@ struct super_block { /* Being remounted read-only */ int s_readonly_remount; + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; }; /* superblock cache pruning functions */ -- cgit v1.2.3 From 5a8d9deff6d1c74131d410473328722ab6afb363 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 12 Mar 2012 13:44:58 +0800 Subject: UBUNTU: ubuntu: overlayfs -- overlayfs: fix possible leak in ovl_new_inode After allocating a new inode, if the mode of inode is incorrect, we should release it by iput(). Signed-off-by: Robin Dong Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c220ea745f24..86bf66346ad8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -376,6 +376,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, default: WARN(1, "illegal file type: %i\n", mode); + iput(inode); inode = NULL; } -- cgit v1.2.3 From f4737f464a1d0a851efc371e9fab72eb9eb0b323 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 12 Mar 2012 13:44:59 +0800 Subject: UBUNTU: ubuntu: overlayfs -- overlayfs: create new inode in ovl_link Imaging using ext4 as upperdir which has a file "hello" and lowdir is totally empty. 1. mount -t overlayfs overlayfs -o lowerdir=/lower,upperdir=/upper /overlay 2. cd /overlay 3. ln hello bye then the overlayfs code will call vfs_link to create a real ext4 dentry for "bye" and create a new overlayfs dentry point to overlayfs inode (which standed for "hello"). That means: two overlayfs dentries and only one overlayfs inode. and then 4. umount /overlay 5. mount -t overlayfs overlayfs -o lowerdir=/lower,upperdir=/upper /overlay (again) 6. cd /overlay 7. ls hello bye the overlayfs will create two inodes(one for the "hello", another for the "bye") and two dentries (each point a inode).That means: two dentries and two inodes. As above, with different order of "create link" and "mount", the result is not the same. In order to make the behavior coherent, we need to create inode in ovl_link. Signed-off-by: Robin Dong Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 00aa6d99cb66..c914c9770ca8 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -417,6 +417,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir, struct dentry *olddentry; struct dentry *newdentry; struct dentry *upperdir; + struct inode *newinode; err = ovl_copy_up(old); if (err) @@ -441,13 +442,17 @@ static int ovl_link(struct dentry *old, struct inode *newdir, err = -ENOENT; goto out_unlock; } + newinode = ovl_new_inode(old->d_sb, newdentry->d_inode->i_mode, + new->d_fsdata); + if (!newinode) + goto link_fail; ovl_dentry_version_inc(new->d_parent); ovl_dentry_update(new, newdentry); - ihold(old->d_inode); - d_instantiate(new, old->d_inode); + d_instantiate(new, newinode); } else { +link_fail: if (ovl_dentry_is_opaque(new)) ovl_whiteout(upperdir, new); dput(newdentry); -- cgit v1.2.3 From f80b4b08630d217810bda1680995508feb6ab212 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 1 May 2012 16:17:51 +0100 Subject: UBUNTU: ubuntu: overlayfs -- inode_only_permission: export inode level permissions checks We need to be able to check inode permissions (but not filesystem implied permissions) for stackable filesystems. Now that permissions involve checking with the security LSM, cgroups and basic inode permissions it is easy to miss a key permission check and introduce a security vunerability. Expose a new interface for these checks. Signed-off-by: Andy Whitcroft Signed-off-by: Miklos Szeredi --- fs/namei.c | 48 +++++++++++++++++++++++++++++++----------------- include/linux/fs.h | 1 + 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index f09edf3eeef2..caf090c48629 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -314,6 +314,36 @@ static inline int do_inode_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } +/** + * inode_only_permission - check access rights to a given inode only + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) + * + * Uses to check read/write/execute permissions on an inode directly, we do + * not check filesystem permissions. + */ +int inode_only_permission(struct inode *inode, int mask) +{ + int retval; + + /* + * Nobody gets write access to an immutable file. + */ + if (unlikely(mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + return -EACCES; + + retval = do_inode_permission(inode, mask); + if (retval) + return retval; + + retval = devcgroup_inode_permission(inode, mask); + if (retval) + return retval; + + return security_inode_permission(inode, mask); +} +EXPORT_SYMBOL(inode_only_permission); + /** * inode_permission - check for access rights to a given inode * @inode: inode to check permission on @@ -328,8 +358,6 @@ static inline int do_inode_permission(struct inode *inode, int mask) */ int inode_permission(struct inode *inode, int mask) { - int retval; - if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; @@ -339,23 +367,9 @@ int inode_permission(struct inode *inode, int mask) if (IS_RDONLY(inode) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; - - /* - * Nobody gets write access to an immutable file. - */ - if (IS_IMMUTABLE(inode)) - return -EACCES; } - retval = do_inode_permission(inode, mask); - if (retval) - return retval; - - retval = devcgroup_inode_permission(inode, mask); - if (retval) - return retval; - - return security_inode_permission(inode, mask); + return inode_only_permission(inode, mask); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 49b86cf0564a..e99bb48293d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2263,6 +2263,7 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); extern int inode_permission(struct inode *, int); +extern int inode_only_permission(struct inode *, int); extern int generic_permission(struct inode *, int); static inline bool execute_ok(struct inode *inode) -- cgit v1.2.3 From 1867aff3b8906b1f5083070a8dc137ecfcaabc71 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 1 May 2012 16:17:52 +0100 Subject: UBUNTU: ubuntu: overlayfs -- overlayfs: switch to use inode_only_permissions When checking permissions on an overlayfs inode we do not take into account either device cgroup restrictions nor security permissions. This allows a user to mount an overlayfs layer over a restricted device directory and by pass those permissions to open otherwise restricted files. Switch over to the newly introduced inode_only_permissions. Signed-off-by: Andy Whitcroft Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 86bf66346ad8..1a8e232e2c6e 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -105,19 +105,9 @@ int ovl_permission(struct inode *inode, int mask) if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) goto out_dput; - - /* - * Nobody gets write access to an immutable file. - */ - err = -EACCES; - if (IS_IMMUTABLE(realinode)) - goto out_dput; } - if (realinode->i_op->permission) - err = realinode->i_op->permission(realinode, mask); - else - err = generic_permission(realinode, mask); + err = inode_only_permission(realinode, mask); out_dput: dput(alias); return err; -- cgit v1.2.3 From 0aacd94c86ed8c70cfb9b30cd68612e1a88dd187 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 22 May 2012 08:22:18 -0700 Subject: UBUNTU: SAUCE: apparmor: Add the ability to mediate mount Add the ability for apparmor to do mediation of mount operations. Mount rules require an updated apparmor_parser (2.8 series) for policy compilation. The basic form of the rules are. [audit] [deny] mount [conds]* [device] [ -> [conds] path], [audit] [deny] remount [conds]* [path], [audit] [deny] umount [conds]* [path], [audit] [deny] pivotroot [oldroot=] remount is just a short cut for mount options=remount where [conds] can be fstype= options= Example mount commands mount, # allow all mounts, but not umount or pivotroot mount fstype=procfs, # allow mounting procfs anywhere mount options=(bind, ro) /foo -> /bar, # readonly bind mount mount /dev/sda -> /mnt, mount /dev/sd** -> /mnt/**, mount fstype=overlayfs options=(rw,upperdir=/tmp/upper/,lowerdir=/) -> /mnt/ umount, umount /m*, See the apparmor userspace for full documentation Signed-off-by: John Johansen Acked-by: Kees Cook Signed-off-by: Tim Gardner --- security/apparmor/Makefile | 3 +- security/apparmor/apparmorfs.c | 13 + security/apparmor/audit.c | 4 + security/apparmor/domain.c | 2 +- security/apparmor/include/apparmor.h | 3 +- security/apparmor/include/audit.h | 11 + security/apparmor/include/domain.h | 2 + security/apparmor/include/mount.h | 54 +++ security/apparmor/lsm.c | 59 ++++ security/apparmor/mount.c | 620 +++++++++++++++++++++++++++++++++++ 10 files changed, 767 insertions(+), 4 deletions(-) create mode 100644 security/apparmor/include/mount.h create mode 100644 security/apparmor/mount.c diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile index 806bd19af7f2..f6c51c03df08 100644 --- a/security/apparmor/Makefile +++ b/security/apparmor/Makefile @@ -4,11 +4,10 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ - resource.o sid.o file.o + resource.o sid.o file.o mount.o clean-files := capability_names.h rlim_names.h - # Build a lower case string table of capability names # Transforms lines from # #define CAP_DAC_OVERRIDE 1 diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 89bdc62ddd24..fbedacb942e1 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -424,9 +424,22 @@ static struct aa_fs_entry aa_fs_entry_domain[] = { { } }; +static struct aa_fs_entry aa_fs_entry_mount[] = { + AA_FS_FILE_STRING("mask", "mount umount"), + { } +}; + +static struct aa_fs_entry aa_fs_entry_namespaces[] = { + AA_FS_FILE_BOOLEAN("profile", 1), + AA_FS_FILE_BOOLEAN("pivot_root", 1), + { } +}; + static struct aa_fs_entry aa_fs_entry_features[] = { AA_FS_DIR("domain", aa_fs_entry_domain), AA_FS_DIR("file", aa_fs_entry_file), + AA_FS_DIR("mount", aa_fs_entry_mount), + AA_FS_DIR("namespaces", aa_fs_entry_namespaces), AA_FS_FILE_U64("capability", VFS_CAP_FLAGS_MASK), AA_FS_DIR("rlimit", aa_fs_entry_rlimit), { } diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index 3ae28db5a64f..e26796312ec2 100644 --- a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -44,6 +44,10 @@ const char *const op_table[] = { "file_mmap", "file_mprotect", + "pivotroot", + "mount", + "umount", + "create", "post_create", "bind", diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 507dd8583b05..31a3f5284a76 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -242,7 +242,7 @@ static const char *next_name(int xtype, const char *name) * * Returns: refcounted profile, or NULL on failure (MAYBE NULL) */ -static struct aa_profile *x_table_lookup(struct aa_profile *profile, u32 xindex) +struct aa_profile *x_table_lookup(struct aa_profile *profile, u32 xindex) { struct aa_profile *new_profile = NULL; struct aa_namespace *ns = profile->ns; diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index 40aedd9f73ea..e243d96ddbd9 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -29,8 +29,9 @@ #define AA_CLASS_NET 4 #define AA_CLASS_RLIMITS 5 #define AA_CLASS_DOMAIN 6 +#define AA_CLASS_MOUNT 7 -#define AA_CLASS_LAST AA_CLASS_DOMAIN +#define AA_CLASS_LAST AA_CLASS_MOUNT /* Control parameters settable through module/boot flags */ extern enum audit_mode aa_g_audit; diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index 4b7e18951aea..72c19dbc0598 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -73,6 +73,10 @@ enum aa_ops { OP_FMMAP, OP_FMPROT, + OP_PIVOTROOT, + OP_MOUNT, + OP_UMOUNT, + OP_CREATE, OP_POST_CREATE, OP_BIND, @@ -121,6 +125,13 @@ struct apparmor_audit_data { int rlim; unsigned long max; } rlim; + struct { + const char *src_name; + const char *type; + const char *trans; + const char *data; + unsigned long flags; + } mnt; struct { const char *target; u32 request; diff --git a/security/apparmor/include/domain.h b/security/apparmor/include/domain.h index de04464f0a3f..a3f70c58ef3d 100644 --- a/security/apparmor/include/domain.h +++ b/security/apparmor/include/domain.h @@ -23,6 +23,8 @@ struct aa_domain { char **table; }; +struct aa_profile *x_table_lookup(struct aa_profile *profile, u32 xindex); + int apparmor_bprm_set_creds(struct linux_binprm *bprm); int apparmor_bprm_secureexec(struct linux_binprm *bprm); void apparmor_bprm_committing_creds(struct linux_binprm *bprm); diff --git a/security/apparmor/include/mount.h b/security/apparmor/include/mount.h new file mode 100644 index 000000000000..bc17a5388ae9 --- /dev/null +++ b/security/apparmor/include/mount.h @@ -0,0 +1,54 @@ +/* + * AppArmor security module + * + * This file contains AppArmor file mediation function definitions. + * + * Copyright 2012 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#ifndef __AA_MOUNT_H +#define __AA_MOUNT_H + +#include +#include + +#include "domain.h" +#include "policy.h" + +/* mount perms */ +#define AA_MAY_PIVOTROOT 0x01 +#define AA_MAY_MOUNT 0x02 +#define AA_MAY_UMOUNT 0x04 +#define AA_AUDIT_DATA 0x40 +#define AA_CONT_MATCH 0x40 + +#define AA_MS_IGNORE_MASK (MS_KERNMOUNT | MS_NOSEC | MS_ACTIVE | MS_BORN) + +int aa_remount(struct aa_profile *profile, struct path *path, + unsigned long flags, void *data); + +int aa_bind_mount(struct aa_profile *profile, struct path *path, + const char *old_name, unsigned long flags); + + +int aa_mount_change_type(struct aa_profile *profile, struct path *path, + unsigned long flags); + +int aa_move_mount(struct aa_profile *profile, struct path *path, + const char *old_name); + +int aa_new_mount(struct aa_profile *profile, const char *dev_name, + struct path *path, const char *type, unsigned long flags, + void *data); + +int aa_umount(struct aa_profile *profile, struct vfsmount *mnt, int flags); + +int aa_pivotroot(struct aa_profile *profile, struct path *old_path, + struct path *new_path); + +#endif /* __AA_MOUNT_H */ diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8ea39aabe948..43c818dfe4ff 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -35,6 +35,7 @@ #include "include/path.h" #include "include/policy.h" #include "include/procattr.h" +#include "include/mount.h" /* Flag indicating whether initialization completed */ int apparmor_initialized __initdata; @@ -503,6 +504,60 @@ static int apparmor_file_mprotect(struct vm_area_struct *vma, !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0); } +static int apparmor_sb_mount(char *dev_name, struct path *path, char *type, + unsigned long flags, void *data) +{ + struct aa_profile *profile; + int error = 0; + + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; + + flags &= ~AA_MS_IGNORE_MASK; + + profile = __aa_current_profile(); + if (!unconfined(profile)) { + if (flags & MS_REMOUNT) + error = aa_remount(profile, path, flags, data); + else if (flags & MS_BIND) + error = aa_bind_mount(profile, path, dev_name, flags); + else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | + MS_UNBINDABLE)) + error = aa_mount_change_type(profile, path, flags); + else if (flags & MS_MOVE) + error = aa_move_mount(profile, path, dev_name); + else + error = aa_new_mount(profile, dev_name, path, type, + flags, data); + } + return error; +} + +static int apparmor_sb_umount(struct vfsmount *mnt, int flags) +{ + struct aa_profile *profile; + int error = 0; + + profile = __aa_current_profile(); + if (!unconfined(profile)) + error = aa_umount(profile, mnt, flags); + + return error; +} + +static int apparmor_sb_pivotroot(struct path *old_path, struct path *new_path) +{ + struct aa_profile *profile; + int error = 0; + + profile = __aa_current_profile(); + if (!unconfined(profile)) + error = aa_pivotroot(profile, old_path, new_path); + + return error; +} + static int apparmor_getprocattr(struct task_struct *task, char *name, char **value) { @@ -622,6 +677,10 @@ static struct security_operations apparmor_ops = { .capget = apparmor_capget, .capable = apparmor_capable, + .sb_mount = apparmor_sb_mount, + .sb_umount = apparmor_sb_umount, + .sb_pivotroot = apparmor_sb_pivotroot, + .path_link = apparmor_path_link, .path_unlink = apparmor_path_unlink, .path_symlink = apparmor_path_symlink, diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c new file mode 100644 index 000000000000..315979b269c4 --- /dev/null +++ b/security/apparmor/mount.c @@ -0,0 +1,620 @@ +/* + * AppArmor security module + * + * This file contains AppArmor mediation of files + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2012 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include +#include +#include + +#include "include/apparmor.h" +#include "include/audit.h" +#include "include/context.h" +#include "include/domain.h" +#include "include/file.h" +#include "include/match.h" +#include "include/mount.h" +#include "include/path.h" +#include "include/policy.h" + + +static void audit_mnt_flags(struct audit_buffer *ab, unsigned long flags) +{ + if (flags & MS_RDONLY) + audit_log_format(ab, "ro"); + else + audit_log_format(ab, "rw"); + if (flags & MS_NOSUID) + audit_log_format(ab, ", nosuid"); + if (flags & MS_NODEV) + audit_log_format(ab, ", nodev"); + if (flags & MS_NOEXEC) + audit_log_format(ab, ", noexec"); + if (flags & MS_SYNCHRONOUS) + audit_log_format(ab, ", sync"); + if (flags & MS_REMOUNT) + audit_log_format(ab, ", remount"); + if (flags & MS_MANDLOCK) + audit_log_format(ab, ", mand"); + if (flags & MS_DIRSYNC) + audit_log_format(ab, ", dirsync"); + if (flags & MS_NOATIME) + audit_log_format(ab, ", noatime"); + if (flags & MS_NODIRATIME) + audit_log_format(ab, ", nodiratime"); + if (flags & MS_BIND) + audit_log_format(ab, flags & MS_REC ? ", rbind" : ", bind"); + if (flags & MS_MOVE) + audit_log_format(ab, ", move"); + if (flags & MS_SILENT) + audit_log_format(ab, ", silent"); + if (flags & MS_POSIXACL) + audit_log_format(ab, ", acl"); + if (flags & MS_UNBINDABLE) + audit_log_format(ab, flags & MS_REC ? ", runbindable" : + ", unbindable"); + if (flags & MS_PRIVATE) + audit_log_format(ab, flags & MS_REC ? ", rprivate" : + ", private"); + if (flags & MS_SLAVE) + audit_log_format(ab, flags & MS_REC ? ", rslave" : + ", slave"); + if (flags & MS_SHARED) + audit_log_format(ab, flags & MS_REC ? ", rshared" : + ", shared"); + if (flags & MS_RELATIME) + audit_log_format(ab, ", relatime"); + if (flags & MS_I_VERSION) + audit_log_format(ab, ", iversion"); + if (flags & MS_STRICTATIME) + audit_log_format(ab, ", strictatime"); + if (flags & MS_NOUSER) + audit_log_format(ab, ", nouser"); +} + +/** + * audit_cb - call back for mount specific audit fields + * @ab: audit_buffer (NOT NULL) + * @va: audit struct to audit values of (NOT NULL) + */ +static void audit_cb(struct audit_buffer *ab, void *va) +{ + struct common_audit_data *sa = va; + + if (sa->aad->mnt.type) { + audit_log_format(ab, " fstype="); + audit_log_untrustedstring(ab, sa->aad->mnt.type); + } + if (sa->aad->mnt.src_name) { + audit_log_format(ab, " srcname="); + audit_log_untrustedstring(ab, sa->aad->mnt.src_name); + } + if (sa->aad->mnt.trans) { + audit_log_format(ab, " trans="); + audit_log_untrustedstring(ab, sa->aad->mnt.trans); + } + if (sa->aad->mnt.flags || sa->aad->op == OP_MOUNT) { + audit_log_format(ab, " flags=\""); + audit_mnt_flags(ab, sa->aad->mnt.flags); + audit_log_format(ab, "\""); + } + if (sa->aad->mnt.data) { + audit_log_format(ab, " options="); + audit_log_untrustedstring(ab, sa->aad->mnt.data); + } +} + +/** + * audit_mount - handle the auditing of mount operations + * @profile: the profile being enforced (NOT NULL) + * @gfp: allocation flags + * @op: operation being mediated (NOT NULL) + * @name: name of object being mediated (MAYBE NULL) + * @src_name: src_name of object being mediated (MAYBE_NULL) + * @type: type of filesystem (MAYBE_NULL) + * @trans: name of trans (MAYBE NULL) + * @flags: filesystem idependent mount flags + * @data: filesystem mount flags + * @request: permissions requested + * @perms: the permissions computed for the request (NOT NULL) + * @info: extra information message (MAYBE NULL) + * @error: 0 if operation allowed else failure error code + * + * Returns: %0 or error on failure + */ +static int audit_mount(struct aa_profile *profile, gfp_t gfp, int op, + const char *name, const char *src_name, + const char *type, const char *trans, + unsigned long flags, const void *data, u32 request, + struct file_perms *perms, const char *info, int error) +{ + int audit_type = AUDIT_APPARMOR_AUTO; + struct common_audit_data sa; + struct apparmor_audit_data aad = { }; + + if (likely(!error)) { + u32 mask = perms->audit; + + if (unlikely(AUDIT_MODE(profile) == AUDIT_ALL)) + mask = 0xffff; + + /* mask off perms that are not being force audited */ + request &= mask; + + if (likely(!request)) + return 0; + audit_type = AUDIT_APPARMOR_AUDIT; + } else { + /* only report permissions that were denied */ + request = request & ~perms->allow; + + if (request & perms->kill) + audit_type = AUDIT_APPARMOR_KILL; + + /* quiet known rejects, assumes quiet and kill do not overlap */ + if ((request & perms->quiet) && + AUDIT_MODE(profile) != AUDIT_NOQUIET && + AUDIT_MODE(profile) != AUDIT_ALL) + request &= ~perms->quiet; + + if (!request) + return COMPLAIN_MODE(profile) ? + complain_error(error) : error; + } + + sa.type = LSM_AUDIT_DATA_NONE; + sa.aad = &aad; + sa.aad->op = op; + sa.aad->name = name; + sa.aad->mnt.src_name = src_name; + sa.aad->mnt.type = type; + sa.aad->mnt.trans = trans; + sa.aad->mnt.flags = flags; + if (data && (perms->audit & AA_AUDIT_DATA)) + sa.aad->mnt.data = data; + sa.aad->info = info; + sa.aad->error = error; + + return aa_audit(audit_type, profile, gfp, &sa, audit_cb); +} + +/** + * match_mnt_flags - Do an ordered match on mount flags + * @dfa: dfa to match against + * @state: state to start in + * @flags: mount flags to match against + * + * Mount flags are encoded as an ordered match. This is done instead of + * checking against a simple bitmask, to allow for logical operations + * on the flags. + * + * Returns: next state after flags match + */ +static unsigned int match_mnt_flags(struct aa_dfa *dfa, unsigned int state, + unsigned long flags) +{ + unsigned int i; + + for (i = 0; i <= 31 ; ++i) { + if ((1 << i) & flags) + state = aa_dfa_next(dfa, state, i + 1); + } + + return state; +} + +/** + * compute_mnt_perms - compute mount permission associated with @state + * @dfa: dfa to match against (NOT NULL) + * @state: state match finished in + * + * Returns: mount permissions + */ +static struct file_perms compute_mnt_perms(struct aa_dfa *dfa, + unsigned int state) +{ + struct file_perms perms; + + perms.kill = 0; + perms.allow = dfa_user_allow(dfa, state); + perms.audit = dfa_user_audit(dfa, state); + perms.quiet = dfa_user_quiet(dfa, state); + perms.xindex = dfa_user_xindex(dfa, state); + + return perms; +} + +static const char const *mnt_info_table[] = { + "match succeeded", + "failed mntpnt match", + "failed srcname match", + "failed type match", + "failed flags match", + "failed data match" +}; + +/* + * Returns 0 on success else element that match failed in, this is the + * index into the mnt_info_table above + */ +static int do_match_mnt(struct aa_dfa *dfa, unsigned int start, + const char *mntpnt, const char *devname, + const char *type, unsigned long flags, + void *data, bool binary, struct file_perms *perms) +{ + unsigned int state; + + state = aa_dfa_match(dfa, start, mntpnt); + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 1; + + if (devname) + state = aa_dfa_match(dfa, state, devname); + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 2; + + if (type) + state = aa_dfa_match(dfa, state, type); + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 3; + + state = match_mnt_flags(dfa, state, flags); + if (!state) + return 4; + *perms = compute_mnt_perms(dfa, state); + if (perms->allow & AA_MAY_MOUNT) + return 0; + + /* only match data if not binary and the DFA flags data is expected */ + if (data && !binary && (perms->allow & AA_CONT_MATCH)) { + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 4; + + state = aa_dfa_match(dfa, state, data); + if (!state) + return 5; + *perms = compute_mnt_perms(dfa, state); + if (perms->allow & AA_MAY_MOUNT) + return 0; + } + + /* failed at end of flags match */ + return 4; +} + +/** + * match_mnt - handle path matching for mount + * @profile: the confining profile + * @mntpnt: string for the mntpnt (NOT NULL) + * @devname: string for the devname/src_name (MAYBE NULL) + * @type: string for the dev type (MAYBE NULL) + * @flags: mount flags to match + * @data: fs mount data (MAYBE NULL) + * @binary: whether @data is binary + * @perms: Returns: permission found by the match + * @info: Returns: infomation string about the match for logging + * + * Returns: 0 on success else error + */ +static int match_mnt(struct aa_profile *profile, const char *mntpnt, + const char *devname, const char *type, + unsigned long flags, void *data, bool binary, + struct file_perms *perms, const char **info) +{ + int pos; + + if (!profile->policy.dfa) + return -EACCES; + + pos = do_match_mnt(profile->policy.dfa, + profile->policy.start[AA_CLASS_MOUNT], + mntpnt, devname, type, flags, data, binary, perms); + if (pos) { + *info = mnt_info_table[pos]; + return -EACCES; + } + + return 0; +} + +static int path_flags(struct aa_profile *profile, struct path *path) +{ + return profile->path_flags | + S_ISDIR(path->dentry->d_inode->i_mode) ? PATH_IS_DIR : 0; +} + +int aa_remount(struct aa_profile *profile, struct path *path, + unsigned long flags, void *data) +{ + struct file_perms perms = { }; + const char *name, *info = NULL; + char *buffer = NULL; + int binary, error; + + binary = path->dentry->d_sb->s_type->fs_flags & FS_BINARY_MOUNTDATA; + + error = aa_path_name(path, path_flags(profile, path), &buffer, &name, + &info); + if (error) + goto audit; + + error = match_mnt(profile, name, NULL, NULL, flags, data, binary, + &perms, &info); + +audit: + error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, NULL, NULL, + NULL, flags, data, AA_MAY_MOUNT, &perms, info, + error); + kfree(buffer); + + return error; +} + +int aa_bind_mount(struct aa_profile *profile, struct path *path, + const char *dev_name, unsigned long flags) +{ + struct file_perms perms = { }; + char *buffer = NULL, *old_buffer = NULL; + const char *name, *old_name = NULL, *info = NULL; + struct path old_path; + int error; + + if (!dev_name || !*dev_name) + return -EINVAL; + + flags &= MS_REC | MS_BIND; + + error = aa_path_name(path, path_flags(profile, path), &buffer, &name, + &info); + if (error) + goto audit; + + error = kern_path(dev_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); + if (error) + goto audit; + + error = aa_path_name(&old_path, path_flags(profile, &old_path), + &old_buffer, &old_name, &info); + path_put(&old_path); + if (error) + goto audit; + + error = match_mnt(profile, name, old_name, NULL, flags, NULL, 0, + &perms, &info); + +audit: + error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, old_name, + NULL, NULL, flags, NULL, AA_MAY_MOUNT, &perms, + info, error); + kfree(buffer); + kfree(old_buffer); + + return error; +} + +int aa_mount_change_type(struct aa_profile *profile, struct path *path, + unsigned long flags) +{ + struct file_perms perms = { }; + char *buffer = NULL; + const char *name, *info = NULL; + int error; + + /* These are the flags allowed by do_change_type() */ + flags &= (MS_REC | MS_SILENT | MS_SHARED | MS_PRIVATE | MS_SLAVE | + MS_UNBINDABLE); + + error = aa_path_name(path, path_flags(profile, path), &buffer, &name, + &info); + if (error) + goto audit; + + error = match_mnt(profile, name, NULL, NULL, flags, NULL, 0, &perms, + &info); + +audit: + error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, NULL, NULL, + NULL, flags, NULL, AA_MAY_MOUNT, &perms, info, + error); + kfree(buffer); + + return error; +} + +int aa_move_mount(struct aa_profile *profile, struct path *path, + const char *orig_name) +{ + struct file_perms perms = { }; + char *buffer = NULL, *old_buffer = NULL; + const char *name, *old_name = NULL, *info = NULL; + struct path old_path; + int error; + + if (!orig_name || !*orig_name) + return -EINVAL; + + error = aa_path_name(path, path_flags(profile, path), &buffer, &name, + &info); + if (error) + goto audit; + + error = kern_path(orig_name, LOOKUP_FOLLOW, &old_path); + if (error) + goto audit; + + error = aa_path_name(&old_path, path_flags(profile, &old_path), + &old_buffer, &old_name, &info); + path_put(&old_path); + if (error) + goto audit; + + error = match_mnt(profile, name, old_name, NULL, MS_MOVE, NULL, 0, + &perms, &info); + +audit: + error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, old_name, + NULL, NULL, MS_MOVE, NULL, AA_MAY_MOUNT, &perms, + info, error); + kfree(buffer); + kfree(old_buffer); + + return error; +} + +int aa_new_mount(struct aa_profile *profile, const char *orig_dev_name, + struct path *path, const char *type, unsigned long flags, + void *data) +{ + struct file_perms perms = { }; + char *buffer = NULL, *dev_buffer = NULL; + const char *name = NULL, *dev_name = NULL, *info = NULL; + int binary = 1; + int error; + + dev_name = orig_dev_name; + if (type) { + int requires_dev; + struct file_system_type *fstype = get_fs_type(type); + if (!fstype) + return -ENODEV; + + binary = fstype->fs_flags & FS_BINARY_MOUNTDATA; + requires_dev = fstype->fs_flags & FS_REQUIRES_DEV; + put_filesystem(fstype); + + if (requires_dev) { + struct path dev_path; + + if (!dev_name || !*dev_name) { + error = -ENOENT; + goto out; + } + + error = kern_path(dev_name, LOOKUP_FOLLOW, &dev_path); + if (error) + goto audit; + + error = aa_path_name(&dev_path, + path_flags(profile, &dev_path), + &dev_buffer, &dev_name, &info); + path_put(&dev_path); + if (error) + goto audit; + } + } + + error = aa_path_name(path, path_flags(profile, path), &buffer, &name, + &info); + if (error) + goto audit; + + error = match_mnt(profile, name, dev_name, type, flags, data, binary, + &perms, &info); + +audit: + error = audit_mount(profile, GFP_KERNEL, OP_MOUNT, name, dev_name, + type, NULL, flags, data, AA_MAY_MOUNT, &perms, info, + error); + kfree(buffer); + kfree(dev_buffer); + +out: + return error; + +} + +int aa_umount(struct aa_profile *profile, struct vfsmount *mnt, int flags) +{ + struct file_perms perms = { }; + char *buffer = NULL; + const char *name, *info = NULL; + int error; + + struct path path = { mnt, mnt->mnt_root }; + error = aa_path_name(&path, path_flags(profile, &path), &buffer, &name, + &info); + if (error) + goto audit; + + if (!error && profile->policy.dfa) { + unsigned int state; + state = aa_dfa_match(profile->policy.dfa, + profile->policy.start[AA_CLASS_MOUNT], + name); + perms = compute_mnt_perms(profile->policy.dfa, state); + } + + if (AA_MAY_UMOUNT & ~perms.allow) + error = -EACCES; + +audit: + error = audit_mount(profile, GFP_KERNEL, OP_UMOUNT, name, NULL, NULL, + NULL, 0, NULL, AA_MAY_UMOUNT, &perms, info, error); + kfree(buffer); + + return error; +} + +int aa_pivotroot(struct aa_profile *profile, struct path *old_path, + struct path *new_path) +{ + struct file_perms perms = { }; + struct aa_profile *target = NULL; + char *old_buffer = NULL, *new_buffer = NULL; + const char *old_name, *new_name = NULL, *info = NULL; + int error; + + error = aa_path_name(old_path, path_flags(profile, old_path), + &old_buffer, &old_name, &info); + if (error) + goto audit; + + error = aa_path_name(new_path, path_flags(profile, new_path), + &new_buffer, &new_name, &info); + if (error) + goto audit; + + if (profile->policy.dfa) { + unsigned int state; + state = aa_dfa_match(profile->policy.dfa, + profile->policy.start[AA_CLASS_MOUNT], + new_name); + state = aa_dfa_null_transition(profile->policy.dfa, state); + state = aa_dfa_match(profile->policy.dfa, state, old_name); + perms = compute_mnt_perms(profile->policy.dfa, state); + } + + if (AA_MAY_PIVOTROOT & perms.allow) { + if ((perms.xindex & AA_X_TYPE_MASK) == AA_X_TABLE) { + target = x_table_lookup(profile, perms.xindex); + if (!target) + error = -ENOENT; + else + error = aa_replace_current_profile(target); + } + } else + error = -EACCES; + +audit: + error = audit_mount(profile, GFP_KERNEL, OP_PIVOTROOT, new_name, + old_name, NULL, target ? target->base.name : NULL, + 0, NULL, AA_MAY_PIVOTROOT, &perms, info, error); + aa_put_profile(target); + kfree(old_buffer); + kfree(new_buffer); + + return error; +} -- cgit v1.2.3 From cfa06991ba7e512484cebf8af971843d8ac44047 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 22 May 2012 08:22:19 -0700 Subject: UBUNTU: SAUCE: AppArmor: basic networking rules Base support for network mediation. Signed-off-by: John Johansen Signed-off-by: Tim Gardner --- security/apparmor/.gitignore | 2 +- security/apparmor/Makefile | 42 +++++++++- security/apparmor/apparmorfs.c | 1 + security/apparmor/include/audit.h | 4 + security/apparmor/include/net.h | 44 ++++++++++ security/apparmor/include/policy.h | 3 + security/apparmor/lsm.c | 112 +++++++++++++++++++++++++ security/apparmor/net.c | 162 +++++++++++++++++++++++++++++++++++++ security/apparmor/policy.c | 1 + security/apparmor/policy_unpack.c | 46 +++++++++++ 10 files changed, 414 insertions(+), 3 deletions(-) create mode 100644 security/apparmor/include/net.h create mode 100644 security/apparmor/net.c diff --git a/security/apparmor/.gitignore b/security/apparmor/.gitignore index 4d995aeaebc0..d5b291e94264 100644 --- a/security/apparmor/.gitignore +++ b/security/apparmor/.gitignore @@ -1,6 +1,6 @@ # # Generated include files # -af_names.h +net_names.h capability_names.h rlim_names.h diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile index f6c51c03df08..c575af92c8ba 100644 --- a/security/apparmor/Makefile +++ b/security/apparmor/Makefile @@ -4,9 +4,9 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ - resource.o sid.o file.o mount.o + resource.o sid.o file.o mount.o net.o -clean-files := capability_names.h rlim_names.h +clean-files := capability_names.h rlim_names.h net_names.h # Build a lower case string table of capability names # Transforms lines from @@ -19,6 +19,38 @@ cmd_make-caps = echo "static const char *const capability_names[] = {" > $@ ;\ -e 's/^\#define[ \t]+CAP_([A-Z0-9_]+)[ \t]+([0-9]+)/[\2] = "\L\1",/p';\ echo "};" >> $@ +# Build a lower case string table of address family names +# Transform lines from +# define AF_LOCAL 1 /* POSIX name for AF_UNIX */ +# #define AF_INET 2 /* Internet IP Protocol */ +# to +# [1] = "local", +# [2] = "inet", +# +# and build the securityfs entries for the mapping. +# Transforms lines from +# #define AF_INET 2 /* Internet IP Protocol */ +# to +# #define AA_FS_AF_MASK "local inet" +quiet_cmd_make-af = GEN $@ +cmd_make-af = echo "static const char *address_family_names[] = {" > $@ ;\ + sed $< >>$@ -r -n -e "/AF_MAX/d" -e "/AF_LOCAL/d" -e \ + 's/^\#define[ \t]+AF_([A-Z0-9_]+)[ \t]+([0-9]+)(.*)/[\2] = "\L\1",/p';\ + echo "};" >> $@ ;\ + echo -n '\#define AA_FS_AF_MASK "' >> $@ ;\ + sed -r -n 's/^\#define[ \t]+AF_([A-Z0-9_]+)[ \t]+([0-9]+)(.*)/\L\1/p'\ + $< | tr '\n' ' ' | sed -e 's/ $$/"\n/' >> $@ + +# Build a lower case string table of sock type names +# Transform lines from +# SOCK_STREAM = 1, +# to +# [1] = "stream", +quiet_cmd_make-sock = GEN $@ +cmd_make-sock = echo "static const char *sock_type_names[] = {" >> $@ ;\ + sed $^ >>$@ -r -n \ + -e 's/^\tSOCK_([A-Z0-9_]+)[\t]+=[ \t]+([0-9]+)(.*)/[\2] = "\L\1",/p';\ + echo "};" >> $@ # Build a lower case string table of rlimit names. # Transforms lines from @@ -55,6 +87,7 @@ cmd_make-rlim = echo "static const char *const rlim_names[RLIM_NLIMITS] = {" \ tr '\n' ' ' | sed -e 's/ $$/"\n/' >> $@ $(obj)/capability.o : $(obj)/capability_names.h +$(obj)/net.o : $(obj)/net_names.h $(obj)/resource.o : $(obj)/rlim_names.h $(obj)/capability_names.h : $(srctree)/include/linux/capability.h \ $(src)/Makefile @@ -62,3 +95,8 @@ $(obj)/capability_names.h : $(srctree)/include/linux/capability.h \ $(obj)/rlim_names.h : $(srctree)/include/asm-generic/resource.h \ $(src)/Makefile $(call cmd,make-rlim) +$(obj)/net_names.h : $(srctree)/include/linux/socket.h \ + $(srctree)/include/linux/net.h \ + $(src)/Makefile + $(call cmd,make-af) + $(call cmd,make-sock) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index fbedacb942e1..01a335d15c27 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -438,6 +438,7 @@ static struct aa_fs_entry aa_fs_entry_namespaces[] = { static struct aa_fs_entry aa_fs_entry_features[] = { AA_FS_DIR("domain", aa_fs_entry_domain), AA_FS_DIR("file", aa_fs_entry_file), + AA_FS_DIR("network", aa_fs_entry_network), AA_FS_DIR("mount", aa_fs_entry_mount), AA_FS_DIR("namespaces", aa_fs_entry_namespaces), AA_FS_FILE_U64("capability", VFS_CAP_FLAGS_MASK), diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index 72c19dbc0598..66a738c5c9c3 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -138,6 +138,10 @@ struct apparmor_audit_data { u32 denied; uid_t ouid; } fs; + struct { + int type, protocol; + struct sock *sk; + } net; }; }; diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h new file mode 100644 index 000000000000..cb8a12109b7a --- /dev/null +++ b/security/apparmor/include/net.h @@ -0,0 +1,44 @@ +/* + * AppArmor security module + * + * This file contains AppArmor network mediation definitions. + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2012 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#ifndef __AA_NET_H +#define __AA_NET_H + +#include + +#include "apparmorfs.h" + +/* struct aa_net - network confinement data + * @allowed: basic network families permissions + * @audit_network: which network permissions to force audit + * @quiet_network: which network permissions to quiet rejects + */ +struct aa_net { + u16 allow[AF_MAX]; + u16 audit[AF_MAX]; + u16 quiet[AF_MAX]; +}; + +extern struct aa_fs_entry aa_fs_entry_network[]; + +extern int aa_net_perm(int op, struct aa_profile *profile, u16 family, + int type, int protocol, struct sock *sk); +extern int aa_revalidate_sk(int op, struct sock *sk); + +static inline void aa_free_net_rules(struct aa_net *new) +{ + /* NOP */ +} + +#endif /* __AA_NET_H */ diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index bda4569fdd83..eb13a73f7383 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -27,6 +27,7 @@ #include "capability.h" #include "domain.h" #include "file.h" +#include "net.h" #include "resource.h" extern const char *const profile_mode_names[]; @@ -157,6 +158,7 @@ struct aa_policydb { * @policy: general match rules governing policy * @file: The set of rules governing basic file access and domain transitions * @caps: capabilities for the profile + * @net: network controls for the profile * @rlimits: rlimits for the profile * * The AppArmor profile contains the basic confinement data. Each profile @@ -194,6 +196,7 @@ struct aa_profile { struct aa_policydb policy; struct aa_file_rules file; struct aa_caps caps; + struct aa_net net; struct aa_rlimit rlimits; }; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 43c818dfe4ff..65ff9e4a841b 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -32,6 +32,7 @@ #include "include/context.h" #include "include/file.h" #include "include/ipc.h" +#include "include/net.h" #include "include/path.h" #include "include/policy.h" #include "include/procattr.h" @@ -669,6 +670,104 @@ static int apparmor_task_setrlimit(struct task_struct *task, return error; } +static int apparmor_socket_create(int family, int type, int protocol, int kern) +{ + struct aa_profile *profile; + int error = 0; + + if (kern) + return 0; + + profile = __aa_current_profile(); + if (!unconfined(profile)) + error = aa_net_perm(OP_CREATE, profile, family, type, protocol, + NULL); + return error; +} + +static int apparmor_socket_bind(struct socket *sock, + struct sockaddr *address, int addrlen) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_BIND, sk); +} + +static int apparmor_socket_connect(struct socket *sock, + struct sockaddr *address, int addrlen) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_CONNECT, sk); +} + +static int apparmor_socket_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_LISTEN, sk); +} + +static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_ACCEPT, sk); +} + +static int apparmor_socket_sendmsg(struct socket *sock, + struct msghdr *msg, int size) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_SENDMSG, sk); +} + +static int apparmor_socket_recvmsg(struct socket *sock, + struct msghdr *msg, int size, int flags) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_RECVMSG, sk); +} + +static int apparmor_socket_getsockname(struct socket *sock) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_GETSOCKNAME, sk); +} + +static int apparmor_socket_getpeername(struct socket *sock) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_GETPEERNAME, sk); +} + +static int apparmor_socket_getsockopt(struct socket *sock, int level, + int optname) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_GETSOCKOPT, sk); +} + +static int apparmor_socket_setsockopt(struct socket *sock, int level, + int optname) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_SETSOCKOPT, sk); +} + +static int apparmor_socket_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + + return aa_revalidate_sk(OP_SOCK_SHUTDOWN, sk); +} + static struct security_operations apparmor_ops = { .name = "apparmor", @@ -705,6 +804,19 @@ static struct security_operations apparmor_ops = { .getprocattr = apparmor_getprocattr, .setprocattr = apparmor_setprocattr, + .socket_create = apparmor_socket_create, + .socket_bind = apparmor_socket_bind, + .socket_connect = apparmor_socket_connect, + .socket_listen = apparmor_socket_listen, + .socket_accept = apparmor_socket_accept, + .socket_sendmsg = apparmor_socket_sendmsg, + .socket_recvmsg = apparmor_socket_recvmsg, + .socket_getsockname = apparmor_socket_getsockname, + .socket_getpeername = apparmor_socket_getpeername, + .socket_getsockopt = apparmor_socket_getsockopt, + .socket_setsockopt = apparmor_socket_setsockopt, + .socket_shutdown = apparmor_socket_shutdown, + .cred_alloc_blank = apparmor_cred_alloc_blank, .cred_free = apparmor_cred_free, .cred_prepare = apparmor_cred_prepare, diff --git a/security/apparmor/net.c b/security/apparmor/net.c new file mode 100644 index 000000000000..003dd18c61a5 --- /dev/null +++ b/security/apparmor/net.c @@ -0,0 +1,162 @@ +/* + * AppArmor security module + * + * This file contains AppArmor network mediation + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2012 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include "include/apparmor.h" +#include "include/audit.h" +#include "include/context.h" +#include "include/net.h" +#include "include/policy.h" + +#include "net_names.h" + +struct aa_fs_entry aa_fs_entry_network[] = { + AA_FS_FILE_STRING("af_mask", AA_FS_AF_MASK), + { } +}; + +/* audit callback for net specific fields */ +static void audit_cb(struct audit_buffer *ab, void *va) +{ + struct common_audit_data *sa = va; + + audit_log_format(ab, " family="); + if (address_family_names[sa->u.net->family]) { + audit_log_string(ab, address_family_names[sa->u.net->family]); + } else { + audit_log_format(ab, "\"unknown(%d)\"", sa->u.net->family); + } + audit_log_format(ab, " sock_type="); + if (sock_type_names[sa->aad->net.type]) { + audit_log_string(ab, sock_type_names[sa->aad->net.type]); + } else { + audit_log_format(ab, "\"unknown(%d)\"", sa->aad->net.type); + } + audit_log_format(ab, " protocol=%d", sa->aad->net.protocol); +} + +/** + * audit_net - audit network access + * @profile: profile being enforced (NOT NULL) + * @op: operation being checked + * @family: network family + * @type: network type + * @protocol: network protocol + * @sk: socket auditing is being applied to + * @error: error code for failure else 0 + * + * Returns: %0 or sa->error else other errorcode on failure + */ +static int audit_net(struct aa_profile *profile, int op, u16 family, int type, + int protocol, struct sock *sk, int error) +{ + int audit_type = AUDIT_APPARMOR_AUTO; + struct common_audit_data sa; + struct apparmor_audit_data aad = { }; + struct lsm_network_audit net = { }; + if (sk) { + sa.type = LSM_AUDIT_DATA_NET; + } else { + sa.type = LSM_AUDIT_DATA_NONE; + } + /* todo fill in socket addr info */ + sa.aad = &aad; + sa.u.net = &net; + sa.aad->op = op, + sa.u.net->family = family; + sa.u.net->sk = sk; + sa.aad->net.type = type; + sa.aad->net.protocol = protocol; + sa.aad->error = error; + + if (likely(!sa.aad->error)) { + u16 audit_mask = profile->net.audit[sa.u.net->family]; + if (likely((AUDIT_MODE(profile) != AUDIT_ALL) && + !(1 << sa.aad->net.type & audit_mask))) + return 0; + audit_type = AUDIT_APPARMOR_AUDIT; + } else { + u16 quiet_mask = profile->net.quiet[sa.u.net->family]; + u16 kill_mask = 0; + u16 denied = (1 << sa.aad->net.type) & ~quiet_mask; + + if (denied & kill_mask) + audit_type = AUDIT_APPARMOR_KILL; + + if ((denied & quiet_mask) && + AUDIT_MODE(profile) != AUDIT_NOQUIET && + AUDIT_MODE(profile) != AUDIT_ALL) + return COMPLAIN_MODE(profile) ? 0 : sa.aad->error; + } + + return aa_audit(audit_type, profile, GFP_KERNEL, &sa, audit_cb); +} + +/** + * aa_net_perm - very course network access check + * @op: operation being checked + * @profile: profile being enforced (NOT NULL) + * @family: network family + * @type: network type + * @protocol: network protocol + * + * Returns: %0 else error if permission denied + */ +int aa_net_perm(int op, struct aa_profile *profile, u16 family, int type, + int protocol, struct sock *sk) +{ + u16 family_mask; + int error; + + if ((family < 0) || (family >= AF_MAX)) + return -EINVAL; + + if ((type < 0) || (type >= SOCK_MAX)) + return -EINVAL; + + /* unix domain and netlink sockets are handled by ipc */ + if (family == AF_UNIX || family == AF_NETLINK) + return 0; + + family_mask = profile->net.allow[family]; + + error = (family_mask & (1 << type)) ? 0 : -EACCES; + + return audit_net(profile, op, family, type, protocol, sk, error); +} + +/** + * aa_revalidate_sk - Revalidate access to a sock + * @op: operation being checked + * @sk: sock being revalidated (NOT NULL) + * + * Returns: %0 else error if permission denied + */ +int aa_revalidate_sk(int op, struct sock *sk) +{ + struct aa_profile *profile; + int error = 0; + + /* aa_revalidate_sk should not be called from interrupt context + * don't mediate these calls as they are not task related + */ + if (in_interrupt()) + return 0; + + profile = __aa_current_profile(); + if (!unconfined(profile)) + error = aa_net_perm(op, profile, sk->sk_family, sk->sk_type, + sk->sk_protocol, sk); + + return error; +} diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index cf5fd220309b..27c8161691ba 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -745,6 +745,7 @@ static void free_profile(struct aa_profile *profile) aa_free_file_rules(&profile->file); aa_free_cap_rules(&profile->caps); + aa_free_net_rules(&profile->net); aa_free_rlimit_rules(&profile->rlimits); aa_free_sid(profile->sid); diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 329b1fd30749..1b90dfaa5517 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -193,6 +193,19 @@ fail: return 0; } +static bool unpack_u16(struct aa_ext *e, u16 *data, const char *name) +{ + if (unpack_nameX(e, AA_U16, name)) { + if (!inbounds(e, sizeof(u16))) + return 0; + if (data) + *data = le16_to_cpu(get_unaligned((u16 *) e->pos)); + e->pos += sizeof(u16); + return 1; + } + return 0; +} + static bool unpack_u32(struct aa_ext *e, u32 *data, const char *name) { if (unpack_nameX(e, AA_U32, name)) { @@ -471,6 +484,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e) { struct aa_profile *profile = NULL; const char *name = NULL; + size_t size = 0; int i, error = -EPROTO; kernel_cap_t tmpcap; u32 tmp; @@ -564,6 +578,38 @@ static struct aa_profile *unpack_profile(struct aa_ext *e) if (!unpack_rlimits(e, profile)) goto fail; + size = unpack_array(e, "net_allowed_af"); + if (size) { + + for (i = 0; i < size; i++) { + /* discard extraneous rules that this kernel will + * never request + */ + if (i >= AF_MAX) { + u16 tmp; + if (!unpack_u16(e, &tmp, NULL) || + !unpack_u16(e, &tmp, NULL) || + !unpack_u16(e, &tmp, NULL)) + goto fail; + continue; + } + if (!unpack_u16(e, &profile->net.allow[i], NULL)) + goto fail; + if (!unpack_u16(e, &profile->net.audit[i], NULL)) + goto fail; + if (!unpack_u16(e, &profile->net.quiet[i], NULL)) + goto fail; + } + if (!unpack_nameX(e, AA_ARRAYEND, NULL)) + goto fail; + } + /* + * allow unix domain and netlink sockets they are handled + * by IPC + */ + profile->net.allow[AF_UNIX] = 0xffff; + profile->net.allow[AF_NETLINK] = 0xffff; + if (unpack_nameX(e, AA_STRUCT, "policydb")) { /* generic policy dfa - optional and may be NULL */ profile->policy.dfa = unpack_dfa(e); -- cgit v1.2.3 From dcb26de4a6f6dd7da03cc2c0afada2c2c9144c12 Mon Sep 17 00:00:00 2001 From: Herton Ronaldo Krzesinski Date: Tue, 22 May 2012 15:23:02 -0300 Subject: UBUNTU: SAUCE: async_populate_rootfs: fix build warnings BugLink: http://bugs.launchpad.net/bugs/1003417 Fix following build warnings: init/initramfs.c: In function 'populate_rootfs_early': init/initramfs.c:629:7: warning: passing argument 1 of 'async_schedule_domain' from incompatible pointer type [enabled by default] include/linux/async.h:20:23: note: expected 'void (*)(void *, async_cookie_t)' but argument is of type 'void (*)(void)' init/initramfs.c:631:1: warning: no return statement in function returning non-void [-Wreturn-type] init/initramfs.c: In function 'populate_rootfs': init/initramfs.c:636:7: warning: passing argument 1 of 'async_schedule_domain' from incompatible pointer type [enabled by default] include/linux/async.h:20:23: note: expected 'void (*)(void *, async_cookie_t)' but argument is of type 'void (*)(void)' init/initramfs.c:637:1: warning: no return statement in function returning non-void [-Wreturn-type] Signed-off-by: Herton Ronaldo Krzesinski Acked-by: Stefan Bader Signed-off-by: Tim Gardner --- init/initramfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index c5a412346663..a9ea4066e2ef 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -589,7 +589,7 @@ void populate_rootfs_wait(void) } EXPORT_SYMBOL(populate_rootfs_wait); -static void __init async_populate_rootfs(void) +static void __init async_populate_rootfs(void *data, async_cookie_t cookie) { char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) @@ -638,12 +638,14 @@ static int __init populate_rootfs_early(void) async_schedule_domain(async_populate_rootfs, NULL, &populate_rootfs_domain); } + return 0; } static int __init populate_rootfs(void) { if (!rootfs_populated) async_schedule_domain(async_populate_rootfs, NULL, &populate_rootfs_domain); + return 0; } earlyrootfs_initcall(populate_rootfs_early); -- cgit v1.2.3 From 372c5d85a607d7a89e1ac20fe384e8491dc24e1b Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Tue, 28 Feb 2012 17:42:20 -0600 Subject: UBUNTU: SAUCE: arm highbank: add support for pl320-ipc driver BugLink: http://bugs.launchpad.net/bugs/1000831 Signed-off-by: Mark Langsdorf Signed-off-by: Ike Panhc Signed-off-by: Tim Gardner --- arch/arm/mach-highbank/Makefile | 2 + arch/arm/mach-highbank/include/mach/pl320-ipc.h | 20 ++ arch/arm/mach-highbank/pl320-ipc.c | 308 ++++++++++++++++++++++++ 3 files changed, 330 insertions(+) create mode 100644 arch/arm/mach-highbank/include/mach/pl320-ipc.h create mode 100644 arch/arm/mach-highbank/pl320-ipc.c diff --git a/arch/arm/mach-highbank/Makefile b/arch/arm/mach-highbank/Makefile index f8437dd238c2..a674ed38722a 100644 --- a/arch/arm/mach-highbank/Makefile +++ b/arch/arm/mach-highbank/Makefile @@ -3,3 +3,5 @@ obj-$(CONFIG_DEBUG_HIGHBANK_UART) += lluart.o obj-$(CONFIG_SMP) += platsmp.o obj-$(CONFIG_HOTPLUG_CPU) += hotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o + +obj-y += pl320-ipc.o diff --git a/arch/arm/mach-highbank/include/mach/pl320-ipc.h b/arch/arm/mach-highbank/include/mach/pl320-ipc.h new file mode 100644 index 000000000000..a0e58eed4b0d --- /dev/null +++ b/arch/arm/mach-highbank/include/mach/pl320-ipc.h @@ -0,0 +1,20 @@ +/* + * Copyright 2010 Calxeda, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ +int ipc_call_fast(u32 *data); +int ipc_call_slow(u32 *data); + +extern int pl320_ipc_register_notifier(struct notifier_block *nb); +extern int pl320_ipc_unregister_notifier(struct notifier_block *nb); diff --git a/arch/arm/mach-highbank/pl320-ipc.c b/arch/arm/mach-highbank/pl320-ipc.c new file mode 100644 index 000000000000..b68d85663954 --- /dev/null +++ b/arch/arm/mach-highbank/pl320-ipc.c @@ -0,0 +1,308 @@ +/* + * Copyright 2010 Calxeda, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define IPCMxSOURCE(m) ((m) * 0x40) +#define IPCMxDSET(m) (((m) * 0x40) + 0x004) +#define IPCMxDCLEAR(m) (((m) * 0x40) + 0x008) +#define IPCMxDSTATUS(m) (((m) * 0x40) + 0x00C) +#define IPCMxMODE(m) (((m) * 0x40) + 0x010) +#define IPCMxMSET(m) (((m) * 0x40) + 0x014) +#define IPCMxMCLEAR(m) (((m) * 0x40) + 0x018) +#define IPCMxMSTATUS(m) (((m) * 0x40) + 0x01C) +#define IPCMxSEND(m) (((m) * 0x40) + 0x020) +#define IPCMxDR(m, dr) (((m) * 0x40) + ((dr) * 4) + 0x024) + +#define IPCMMIS(irq) (((irq) * 8) + 0x800) +#define IPCMRIS(irq) (((irq) * 8) + 0x804) + +#define MBOX_MASK(n) (1 << (n)) +#define IPC_FAST_MBOX 0 +#define IPC_SLOW_MBOX 1 +#define IPC_RX_MBOX 2 + +#define CHAN_MASK(n) (1 << (n)) +#define A9_SOURCE 1 +#define M3_SOURCE 0 + +static void __iomem *ipc_base; +static int ipc_irq; +static DEFINE_SPINLOCK(ipc_m0_lock); +static DEFINE_MUTEX(ipc_m1_lock); +static DECLARE_COMPLETION(ipc_completion); +static ATOMIC_NOTIFIER_HEAD(ipc_notifier); + +static inline void set_destination(int source, int mbox) +{ + __raw_writel(CHAN_MASK(source), ipc_base + IPCMxDSET(mbox)); + __raw_writel(CHAN_MASK(source), ipc_base + IPCMxMSET(mbox)); +} + +static inline void clear_destination(int source, int mbox) +{ + __raw_writel(CHAN_MASK(source), ipc_base + IPCMxDCLEAR(mbox)); + __raw_writel(CHAN_MASK(source), ipc_base + IPCMxMCLEAR(mbox)); +} + +static void __ipc_send(int mbox, u32 *data) +{ + int i; + for (i = 0; i < 7; i++) + __raw_writel(data[i], ipc_base + IPCMxDR(mbox, i)); + __raw_writel(0x1, ipc_base + IPCMxSEND(mbox)); +} + +static u32 __ipc_rcv(int mbox, u32 *data) +{ + int i; + for (i = 0; i < 7; i++) + data[i] = __raw_readl(ipc_base + IPCMxDR(mbox, i)); + return data[1]; +} + +/* non-blocking implementation from the A9 side, interrupt safe in theory */ +int ipc_call_fast(u32 *data) +{ + int timeout, ret; + + spin_lock(&ipc_m0_lock); + + __ipc_send(IPC_FAST_MBOX, data); + + for (timeout = 5000; timeout > 0; timeout--) { + if (__raw_readl(ipc_base + IPCMxSEND(IPC_FAST_MBOX)) == 0x2) + break; + udelay(100); + } + if (timeout == 0) { + ret = -ETIMEDOUT; + goto out; + } + + ret = __ipc_rcv(IPC_FAST_MBOX, data); +out: + __raw_writel(0, ipc_base + IPCMxSEND(IPC_FAST_MBOX)); + spin_unlock(&ipc_m0_lock); + return ret; +} +EXPORT_SYMBOL(ipc_call_fast); + +/* blocking implmentation from the A9 side, not usuable in interrupts! */ +int ipc_call_slow(u32 *data) +{ + int ret; + + mutex_lock(&ipc_m1_lock); + + init_completion(&ipc_completion); + __ipc_send(IPC_SLOW_MBOX, data); + ret = wait_for_completion_timeout(&ipc_completion, + msecs_to_jiffies(1000)); + if (ret == 0) + goto out; + + ret = __ipc_rcv(IPC_SLOW_MBOX, data); +out: + mutex_unlock(&ipc_m1_lock); + return ret; +} +EXPORT_SYMBOL(ipc_call_slow); + +irqreturn_t ipc_handler(int irq, void *dev) +{ + u32 irq_stat; + u32 data[7]; + + irq_stat = __raw_readl(ipc_base + IPCMMIS(1)); + if (irq_stat & MBOX_MASK(IPC_SLOW_MBOX)) { + __raw_writel(0, ipc_base + IPCMxSEND(IPC_SLOW_MBOX)); + complete(&ipc_completion); + } + if (irq_stat & MBOX_MASK(IPC_RX_MBOX)) { + __ipc_rcv(IPC_RX_MBOX, data); + atomic_notifier_call_chain(&ipc_notifier, data[0], data + 1); + __raw_writel(2, ipc_base + IPCMxSEND(IPC_RX_MBOX)); + } + + return IRQ_HANDLED; +} + +int pl320_ipc_register_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&ipc_notifier, nb); +} + +int pl320_ipc_unregister_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&ipc_notifier, nb); +} + +static int __devinit pl320_probe(struct amba_device *adev, const struct amba_id *id) +{ + int ret; + + ipc_base = ioremap(adev->res.start, resource_size(&adev->res)); + if (ipc_base == NULL) + return -ENOMEM; + + __raw_writel(0, ipc_base + IPCMxSEND(IPC_FAST_MBOX)); + __raw_writel(0, ipc_base + IPCMxSEND(IPC_SLOW_MBOX)); + + ipc_irq = adev->irq[0]; + ret = request_irq(ipc_irq, ipc_handler, 0, dev_name(&adev->dev), NULL); + if (ret < 0) + goto err; + + /* Init fast mailbox */ + __raw_writel(CHAN_MASK(A9_SOURCE), ipc_base + IPCMxSOURCE(IPC_FAST_MBOX)); + set_destination(M3_SOURCE, IPC_FAST_MBOX); + + /* Init slow mailbox */ + __raw_writel(CHAN_MASK(A9_SOURCE), ipc_base + IPCMxSOURCE(IPC_SLOW_MBOX)); + __raw_writel(CHAN_MASK(M3_SOURCE), ipc_base + IPCMxDSET(IPC_SLOW_MBOX)); + __raw_writel(CHAN_MASK(M3_SOURCE) | CHAN_MASK(A9_SOURCE), + ipc_base + IPCMxMSET(IPC_SLOW_MBOX)); + + /* Init receive mailbox */ + __raw_writel(CHAN_MASK(M3_SOURCE), ipc_base + IPCMxSOURCE(IPC_RX_MBOX)); + __raw_writel(CHAN_MASK(A9_SOURCE), ipc_base + IPCMxDSET(IPC_RX_MBOX)); + __raw_writel(CHAN_MASK(M3_SOURCE) | CHAN_MASK(A9_SOURCE), + ipc_base + IPCMxMSET(IPC_RX_MBOX)); + + return 0; +err: + iounmap(ipc_base); + return ret; +} + +static struct amba_id pl320_ids[] = { + { + .id = 0x00041320, + .mask = 0x000fffff, + }, + { 0, 0 }, +}; + +static struct amba_driver pl320_driver = { + .drv = { + .name = "pl320", + }, + .id_table = pl320_ids, + .probe = pl320_probe, +}; + +static int __init ipc_init(void) +{ + return amba_driver_register(&pl320_driver); +} +module_init(ipc_init); + +irqreturn_t ipc_test_handler(int irq, void *dev) +{ + u32 irq_stat; + + irq_stat = __raw_readl(ipc_base + IPCMMIS(irq - (ipc_irq - 1))); + if (irq_stat & MBOX_MASK(IPC_FAST_MBOX)) { + if ((__raw_readl(ipc_base + IPCMxDR(IPC_FAST_MBOX, 0)) == 0x900dbeef) && + (__raw_readl(ipc_base + IPCMxDR(IPC_FAST_MBOX, 1)) == 1) && + (__raw_readl(ipc_base + IPCMxDR(IPC_FAST_MBOX, 2)) == 2) && + (__raw_readl(ipc_base + IPCMxDR(IPC_FAST_MBOX, 3)) == 3) && + (__raw_readl(ipc_base + IPCMxDR(IPC_FAST_MBOX, 4)) == 4) && + (__raw_readl(ipc_base + IPCMxDR(IPC_FAST_MBOX, 5)) == 5) && + (__raw_readl(ipc_base + IPCMxDR(IPC_FAST_MBOX, 6)) == 6)) { + printk(KERN_ERR "ipc fast mbox message %X received\n", __raw_readl(ipc_base + IPCMxDR(IPC_FAST_MBOX, 0))); + __raw_writel(0xBADBEEF, ipc_base + IPCMxDR(IPC_FAST_MBOX, 1)); + } + __raw_writel(0x2, ipc_base + IPCMxSEND(IPC_FAST_MBOX)); + } + if (irq_stat & MBOX_MASK(IPC_SLOW_MBOX)) { + if ((__raw_readl(ipc_base + IPCMxDR(IPC_SLOW_MBOX, 0)) == 0x12345678) && + (__raw_readl(ipc_base + IPCMxDR(IPC_SLOW_MBOX, 1)) == 6) && + (__raw_readl(ipc_base + IPCMxDR(IPC_SLOW_MBOX, 2)) == 5) && + (__raw_readl(ipc_base + IPCMxDR(IPC_SLOW_MBOX, 3)) == 4) && + (__raw_readl(ipc_base + IPCMxDR(IPC_SLOW_MBOX, 4)) == 3) && + (__raw_readl(ipc_base + IPCMxDR(IPC_SLOW_MBOX, 5)) == 2) && + (__raw_readl(ipc_base + IPCMxDR(IPC_SLOW_MBOX, 6)) == 1)) { + printk("slow mbox message %X received\n", __raw_readl(ipc_base + IPCMxDR(IPC_SLOW_MBOX, 0))); + __raw_writel(0x87654321, ipc_base + IPCMxDR(IPC_SLOW_MBOX, 1)); + } + __raw_writel(0x2, ipc_base + IPCMxSEND(IPC_SLOW_MBOX)); + } + if (irq_stat & MBOX_MASK(IPC_RX_MBOX)) { + __raw_writel(0, ipc_base + IPCMxSEND(IPC_RX_MBOX)); + // handle events + } + + return IRQ_HANDLED; +} + +static void __init ipc_test(void) +{ + int ret, i; + + printk("ipc test start\n"); + + for (i = 0; i < 8; i++) { + u32 data[7]; + int j; + if (i == 1) continue; + + if (request_irq(ipc_irq - 1 + i, ipc_test_handler, 0, "ipc", NULL) < 0) { + printk("ipc - request_irq failed - FAIL\n"); + return; + } + + set_destination(i, IPC_FAST_MBOX); + set_destination(i, IPC_SLOW_MBOX); + + for (j = 1; j < 7; j++) + data[j] = j; + data[0] = 0x900dbeef; + ret = ipc_call_fast(data); + if (ret == 0xbadbeef) + printk(KERN_ERR "ipc %d fast call - PASS\n", i); + else + printk(KERN_ERR "ipc %d fast call fail %d\n", i, ret); + for (j = 1; j < 7; j++) + data[j] = 7 - j; + data[0] = 0x12345678; + ret = ipc_call_slow(data); + if (ret == 0x87654321) + printk("ipc %d slow call - PASS\n", i); + + clear_destination(i, IPC_FAST_MBOX); + clear_destination(i, IPC_SLOW_MBOX); + + free_irq(ipc_irq - 1 + i, NULL); + } + set_destination(M3_SOURCE, IPC_FAST_MBOX); + set_destination(M3_SOURCE, IPC_SLOW_MBOX); +} +//late_initcall(ipc_test); + -- cgit v1.2.3 From 6ed8a412cd35dc1ffb302c90b004419af7d6730e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 22 Dec 2011 15:45:19 -0600 Subject: UBUNTU: SAUCE: input: add a key driver for highbank BugLink: http://bugs.launchpad.net/bugs/1000831 Add a keyboard driver to handle power and sleep keys from the management controller. These are generated via ipc messages. Signed-off-by: Rob Herring Signed-off-by: Ike Panhc Signed-off-by: Tim Gardner --- arch/arm/boot/dts/highbank.dts | 4 + drivers/input/keyboard/Kconfig | 11 +++ drivers/input/keyboard/Makefile | 1 + drivers/input/keyboard/highbank_keys.c | 141 +++++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 drivers/input/keyboard/highbank_keys.c diff --git a/arch/arm/boot/dts/highbank.dts b/arch/arm/boot/dts/highbank.dts index 83e72294aefb..e735731ae399 100644 --- a/arch/arm/boot/dts/highbank.dts +++ b/arch/arm/boot/dts/highbank.dts @@ -124,6 +124,10 @@ interrupts = <0 7 4>; }; + ipc-keys { + compatible = "calxeda,hb-keys"; + }; + gpioe: gpio@fff30000 { #gpio-cells = <2>; compatible = "arm,pl061", "arm,primecell"; diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index c0e11ecc646f..c99bf9065f75 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -204,6 +204,17 @@ config KEYBOARD_GPIO_POLLED To compile this driver as a module, choose M here: the module will be called gpio_keys_polled. +config KEYBOARD_HIGHBANK + tristate "Calxeda Highbank Virtual Keys" + depends on ARCH_HIGHBANK + default y + help + This driver implements support for virtual power keys on Calxeda + Highbank systems. + + To compile this driver as a module, choose M here: the + module will be called highbank_keys. + config KEYBOARD_TCA6416 tristate "TCA6416/TCA6408A Keypad Support" depends on I2C diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile index b03b02456a82..5dc046906f3f 100644 --- a/drivers/input/keyboard/Makefile +++ b/drivers/input/keyboard/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_KEYBOARD_DAVINCI) += davinci_keyscan.o obj-$(CONFIG_KEYBOARD_EP93XX) += ep93xx_keypad.o obj-$(CONFIG_KEYBOARD_GPIO) += gpio_keys.o obj-$(CONFIG_KEYBOARD_GPIO_POLLED) += gpio_keys_polled.o +obj-$(CONFIG_KEYBOARD_HIGHBANK) += highbank_keys.o obj-$(CONFIG_KEYBOARD_TCA6416) += tca6416-keypad.o obj-$(CONFIG_KEYBOARD_TCA8418) += tca8418_keypad.o obj-$(CONFIG_KEYBOARD_HIL) += hil_kbd.o diff --git a/drivers/input/keyboard/highbank_keys.c b/drivers/input/keyboard/highbank_keys.c new file mode 100644 index 000000000000..a84ecf371755 --- /dev/null +++ b/drivers/input/keyboard/highbank_keys.c @@ -0,0 +1,141 @@ +/* + * Copyright 2011 Calxeda, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct hb_keys_drvdata { + struct input_dev *input; + struct notifier_block nb; +}; + +int hb_keys_notifier(struct notifier_block *nb, unsigned long event, void *data) +{ + struct hb_keys_drvdata *ddata = container_of(nb, struct hb_keys_drvdata, nb); + struct input_dev *input = ddata->input; + u32 *d = data; + u32 key = d[0]; + + if (event != 0x1000 /*HB_IPC_KEY*/) + return 0; + + input_event(input, EV_KEY, key, 1); + input_event(input, EV_KEY, key, 0); + input_sync(input); + return 0; +} + +static int hb_keys_open(struct input_dev *input) +{ + struct hb_keys_drvdata *ddata = input_get_drvdata(input); + return pl320_ipc_register_notifier(&ddata->nb); +} + +static void hb_keys_close(struct input_dev *input) +{ + struct hb_keys_drvdata *ddata = input_get_drvdata(input); + pl320_ipc_unregister_notifier(&ddata->nb); +} + +static int __devinit hb_keys_probe(struct platform_device *pdev) +{ + struct hb_keys_drvdata *ddata; + struct device *dev = &pdev->dev; + struct input_dev *input; + int error; + + ddata = kzalloc(sizeof(*ddata), GFP_KERNEL); + if (!ddata) + return -ENOMEM; + + input = input_allocate_device(); + if (!input) { + dev_err(dev, "failed to allocate state\n"); + error = -ENOMEM; + goto fail1; + } + + platform_set_drvdata(pdev, ddata); + input_set_drvdata(input, ddata); + + ddata->input = input; + ddata->nb.notifier_call = hb_keys_notifier; + + input->name = pdev->name; + input->phys = "highbank/input0"; + input->dev.parent = &pdev->dev; + input->open = hb_keys_open; + input->close = hb_keys_close; + + input->id.bustype = BUS_HOST; + input->id.vendor = 0x0001; + input->id.product = 0x0001; + input->id.version = 0x0100; + + input_set_capability(input, EV_KEY, KEY_POWER); + input_set_capability(input, EV_KEY, KEY_SLEEP); + + error = input_register_device(input); + if (error) { + dev_err(dev, "Unable to register input device, error: %d\n", + error); + goto fail2; + } + + return 0; + + fail2: + input_free_device(input); + fail1: + kfree(ddata); + return error; +} + +static int __devexit hb_keys_remove(struct platform_device *pdev) +{ + struct hb_keys_drvdata *ddata = platform_get_drvdata(pdev); + input_unregister_device(ddata->input); + kfree(ddata); + return 0; +} + +static struct of_device_id hb_keys_of_match[] = { + { .compatible = "calxeda,hb-keys", }, + { }, +}; +MODULE_DEVICE_TABLE(of, hb_keys_of_match); + +static struct platform_driver hb_keys_driver = { + .probe = hb_keys_probe, + .remove = __devexit_p(hb_keys_remove), + .driver = { + .name = "hb-keys", + .of_match_table = hb_keys_of_match, + } +}; + +module_platform_driver(hb_keys_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Calxeda, Inc."); +MODULE_DESCRIPTION("Keys driver for Calxeda Highbank"); -- cgit v1.2.3 From a46494c603f80974c76eca4fa38c4d4fd32827ab Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 26 Apr 2012 11:40:26 -0500 Subject: UBUNTU: SAUCE: ARM: highbank: Add smc calls to enable/disable the L2 BugLink: http://bugs.launchpad.net/bugs/1000831 Linux runs in non-secure mode on highbank, so we need secure monitor calls to enable and disable the PL310. Signed-off-by: Rob Herring Signed-off-by: Ike Panhc Signed-off-by: Tim Gardner --- arch/arm/mach-highbank/Makefile | 6 +++++- arch/arm/mach-highbank/core.h | 1 + arch/arm/mach-highbank/highbank.c | 15 +++++++++++++++ arch/arm/mach-highbank/smc.S | 31 +++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 arch/arm/mach-highbank/smc.S diff --git a/arch/arm/mach-highbank/Makefile b/arch/arm/mach-highbank/Makefile index a674ed38722a..3d4126a9ed82 100644 --- a/arch/arm/mach-highbank/Makefile +++ b/arch/arm/mach-highbank/Makefile @@ -1,4 +1,8 @@ -obj-y := clock.o highbank.o system.o +obj-y := clock.o highbank.o system.o smc.o + +plus_sec := $(call as-instr,.arch_extension sec,+sec) +AFLAGS_smc.o :=-Wa,-march=armv7-a$(plus_sec) + obj-$(CONFIG_DEBUG_HIGHBANK_UART) += lluart.o obj-$(CONFIG_SMP) += platsmp.o obj-$(CONFIG_HOTPLUG_CPU) += hotplug.o diff --git a/arch/arm/mach-highbank/core.h b/arch/arm/mach-highbank/core.h index d8e2d0be64ac..141ed5171826 100644 --- a/arch/arm/mach-highbank/core.h +++ b/arch/arm/mach-highbank/core.h @@ -8,3 +8,4 @@ extern void highbank_lluart_map_io(void); static inline void highbank_lluart_map_io(void) {} #endif +extern void highbank_smc1(int fn, int arg); diff --git a/arch/arm/mach-highbank/highbank.c b/arch/arm/mach-highbank/highbank.c index 410a112bb52e..0ab6b34ba031 100644 --- a/arch/arm/mach-highbank/highbank.c +++ b/arch/arm/mach-highbank/highbank.c @@ -85,10 +85,25 @@ const static struct of_device_id irq_match[] = { {} }; +#ifdef CONFIG_CACHE_L2X0 +static void highbank_l2x0_disable(void) +{ + /* Disable PL310 L2 Cache controller */ + highbank_smc1(0x102, 0x0); +} +#endif + + static void __init highbank_init_irq(void) { of_irq_init(irq_match); + +#ifdef CONFIG_CACHE_L2X0 + /* Enable PL310 L2 Cache controller */ + highbank_smc1(0x102, 0x1); l2x0_of_init(0, ~0UL); + outer_cache.disable = highbank_l2x0_disable; +#endif } static void __init highbank_timer_init(void) diff --git a/arch/arm/mach-highbank/smc.S b/arch/arm/mach-highbank/smc.S new file mode 100644 index 000000000000..8b116325c8cc --- /dev/null +++ b/arch/arm/mach-highbank/smc.S @@ -0,0 +1,31 @@ +/* + * Copied from omap44xx-smc.S + * Copyright (C) 2010 Texas Instruments, Inc. + * Written by Santosh Shilimkar + * + * + * This program is free software,you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +/* + * This is common routine to manage secure monitor API + * used to modify the PL310 secure registers. + * 'r0' contains the value to be modified and 'r12' contains + * the monitor API number. It uses few CPU registers + * internally and hence they need be backed up including + * link register "lr". + * Function signature : void omap_smc1(u32 fn, u32 arg) + */ + +ENTRY(highbank_smc1) + stmfd sp!, {r2-r12, lr} + mov r12, r0 + mov r0, r1 + dsb + smc #0 + ldmfd sp!, {r2-r12, pc} +ENDPROC(highbank_smc1) -- cgit v1.2.3 From 929ae4bda925a5e3a42e3230cc14eea88b7bb9e9 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 30 Apr 2012 14:30:04 -0500 Subject: UBUNTU: SAUCE: force DMA buffers to non-bufferable on highbank BugLink: http://bugs.launchpad.net/bugs/1000831 The xgmac driver has problems with bufferable DMA descriptors. For now, change the memory type to get things working reliably. Signed-off-by: Rob Herring Signed-off-by: Ike Panhc Signed-off-by: Tim Gardner --- arch/arm/mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 28773e6211f6..cf10218b9062 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -843,7 +843,7 @@ config ARM_L1_CACHE_SHIFT config ARM_DMA_MEM_BUFFERABLE bool "Use non-cacheable memory for DMA" if (CPU_V6 || CPU_V6K) && !CPU_V7 depends on !(MACH_REALVIEW_PB1176 || REALVIEW_EB_ARM11MP || \ - MACH_REALVIEW_PB11MP) + MACH_REALVIEW_PB11MP || ARCH_HIGHBANK) default y if CPU_V6 || CPU_V6K || CPU_V7 help Historically, the kernel has used strongly ordered mappings to -- cgit v1.2.3 From efb352977cdcb0df4909f9f2b8b1f9eece413148 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Sat, 28 Apr 2012 17:12:56 -0500 Subject: UBUNTU: SAUCE: net: calxedaxgmac: fix net timeout recovery BugLink: http://bugs.launchpad.net/bugs/1000831 Fix net tx watchdog timeout recovery. The descriptor ring was reset, but the DMA engine was not reset to the beginning of the ring. Also, set the number of AXI outstanding transactions to 8. Signed-off-by: Rob Herring Signed-off-by: Ike Panhc Signed-off-by: Tim Gardner --- drivers/net/ethernet/calxeda/xgmac.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/calxeda/xgmac.c b/drivers/net/ethernet/calxeda/xgmac.c index 11f667f6131a..8fc5fd0fb76c 100644 --- a/drivers/net/ethernet/calxeda/xgmac.c +++ b/drivers/net/ethernet/calxeda/xgmac.c @@ -210,7 +210,7 @@ #define DMA_INTR_ENA_TIE 0x00000001 /* Transmit Interrupt */ #define DMA_INTR_NORMAL (DMA_INTR_ENA_NIE | DMA_INTR_ENA_RIE | \ - DMA_INTR_ENA_TUE) + DMA_INTR_ENA_TUE | DMA_INTR_ENA_TIE) #define DMA_INTR_ABNORMAL (DMA_INTR_ENA_AIE | DMA_INTR_ENA_FBE | \ DMA_INTR_ENA_RWE | DMA_INTR_ENA_RSE | \ @@ -933,6 +933,7 @@ static void xgmac_tx_err(struct xgmac_priv *priv) desc_init_tx_desc(priv->dma_tx, DMA_TX_RING_SZ); priv->tx_tail = 0; priv->tx_head = 0; + writel(priv->dma_tx_phy, priv->base + XGMAC_DMA_TX_BASE_ADDR); writel(reg | DMA_CONTROL_ST, priv->base + XGMAC_DMA_CONTROL); writel(DMA_STATUS_TU | DMA_STATUS_TPS | DMA_STATUS_NIS | DMA_STATUS_AIS, @@ -972,7 +973,7 @@ static int xgmac_hw_init(struct net_device *dev) writel(DMA_INTR_DEFAULT_MASK, ioaddr + XGMAC_DMA_INTR_ENA); /* XGMAC requires AXI bus init. This is a 'magic number' for now */ - writel(0x000100E, ioaddr + XGMAC_DMA_AXI_BUS); + writel(0x0077000E, ioaddr + XGMAC_DMA_AXI_BUS); ctrl |= XGMAC_CONTROL_DDIC | XGMAC_CONTROL_JE | XGMAC_CONTROL_ACS | XGMAC_CONTROL_CAR; -- cgit v1.2.3 From 7121b8c3689185384896d6f4c358aa80a482f134 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 5 Jun 2012 08:17:40 -0700 Subject: UBUNTU: SAUCE: b43: do not call ieee80211_unregister_hw if we are not registred BugLink: http://bugs.launchpad.net/bugs/1008905 This patch fixes kernel Oops on rmmod b43 if firmware was not loaded: BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 IP: [] drain_workqueue+0x25/0x142 PGD 153ac6067 PUD 153b82067 PMD 0 Oops: 0000 [#1] SMP Signed-off-by: Oleksij Rempel Acked-by: Seth Forshee Signed-off-by: Leann Ogasawara --- drivers/net/wireless/b43/b43.h | 4 ++++ drivers/net/wireless/b43/main.c | 19 ++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/b43/b43.h b/drivers/net/wireless/b43/b43.h index 67c13af6f206..c06b6cb5c91e 100644 --- a/drivers/net/wireless/b43/b43.h +++ b/drivers/net/wireless/b43/b43.h @@ -877,6 +877,10 @@ struct b43_wl { * from the mac80211 subsystem. */ u16 mac80211_initially_registered_queues; + /* Set this if we call ieee80211_register_hw() and check if we call + * ieee80211_unregister_hw(). */ + bool hw_registred; + /* We can only have one operating interface (802.11 core) * at a time. General information about this interface follows. */ diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c index 5a39b226b2e3..acd03a4f9730 100644 --- a/drivers/net/wireless/b43/main.c +++ b/drivers/net/wireless/b43/main.c @@ -2437,6 +2437,7 @@ start_ieee80211: err = ieee80211_register_hw(wl->hw); if (err) goto err_one_core_detach; + wl->hw_registred = true; b43_leds_register(wl->current_dev); goto out; @@ -5299,6 +5300,7 @@ static struct b43_wl *b43_wireless_init(struct b43_bus_dev *dev) hw->queues = modparam_qos ? B43_QOS_QUEUE_NUM : 1; wl->mac80211_initially_registered_queues = hw->queues; + wl->hw_registred = false; hw->max_rates = 2; SET_IEEE80211_DEV(hw, dev->dev); if (is_valid_ether_addr(sprom->et1mac)) @@ -5370,12 +5372,15 @@ static void b43_bcma_remove(struct bcma_device *core) * as the ieee80211 unreg will destroy the workqueue. */ cancel_work_sync(&wldev->restart_work); - /* Restore the queues count before unregistering, because firmware detect - * might have modified it. Restoring is important, so the networking - * stack can properly free resources. */ - wl->hw->queues = wl->mac80211_initially_registered_queues; - b43_leds_stop(wldev); - ieee80211_unregister_hw(wl->hw); + B43_WARN_ON(!wl); + if (wl->current_dev == wldev && wl->hw_registred) { + /* Restore the queues count before unregistering, because firmware detect + * might have modified it. Restoring is important, so the networking + * stack can properly free resources. */ + wl->hw->queues = wl->mac80211_initially_registered_queues; + b43_leds_stop(wldev); + ieee80211_unregister_hw(wl->hw); + } b43_one_core_detach(wldev->dev); @@ -5446,7 +5451,7 @@ static void b43_ssb_remove(struct ssb_device *sdev) cancel_work_sync(&wldev->restart_work); B43_WARN_ON(!wl); - if (wl->current_dev == wldev) { + if (wl->current_dev == wldev && wl->hw_registred) { /* Restore the queues count before unregistering, because firmware detect * might have modified it. Restoring is important, so the networking * stack can properly free resources. */ -- cgit v1.2.3 From 311042497928a4e3041d4b719a53c4099eec1af4 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Mon, 4 Jun 2012 17:36:10 +0200 Subject: UBUNTU: SAUCE: Fix compile failures of dm-raid45 Must include module.h explicitely now. Also remove one compile time test which does not work anymore (will get rid of a warning). Signed-off-by: Stefan Bader Signed-off-by: Tim Gardner --- ubuntu/dm-raid4-5/dm-memcache.c | 1 + ubuntu/dm-raid4-5/dm-message.c | 1 + ubuntu/dm-raid4-5/dm-raid4-5.c | 5 +++++ 3 files changed, 7 insertions(+) diff --git a/ubuntu/dm-raid4-5/dm-memcache.c b/ubuntu/dm-raid4-5/dm-memcache.c index 1b15859b2762..346abb431b3f 100644 --- a/ubuntu/dm-raid4-5/dm-memcache.c +++ b/ubuntu/dm-raid4-5/dm-memcache.c @@ -19,6 +19,7 @@ #include "dm-memcache.h" #include #include +#include struct dm_mem_cache_client { spinlock_t lock; diff --git a/ubuntu/dm-raid4-5/dm-message.c b/ubuntu/dm-raid4-5/dm-message.c index a66b0152cb3d..de21e52c76ef 100644 --- a/ubuntu/dm-raid4-5/dm-message.c +++ b/ubuntu/dm-raid4-5/dm-message.c @@ -14,6 +14,7 @@ #include "dm.h" #include "dm-message.h" #include +#include #define DM_MSG_PREFIX "dm_message" diff --git a/ubuntu/dm-raid4-5/dm-raid4-5.c b/ubuntu/dm-raid4-5/dm-raid4-5.c index 88db7958b8d8..e05b0e14e347 100644 --- a/ubuntu/dm-raid4-5/dm-raid4-5.c +++ b/ubuntu/dm-raid4-5/dm-raid4-5.c @@ -54,6 +54,7 @@ static const char *version = "v0.2594b"; #include "dm-region-hash.h" #include +#include /* * Configurable parameters @@ -173,9 +174,13 @@ enum chunk_flags { CHUNK_UPTODATE, /* Chunk pages are uptodate. */ }; +/* + * This does not work anymore with __REQ_* values being enums + * #if READ != 0 || WRITE != 1 #error dm-raid45: READ/WRITE != 0/1 used as index!!! #endif +*/ enum bl_type { WRITE_QUEUED = WRITE + 1, -- cgit v1.2.3 From a8b0bdaa257a2c0ec4748d5ccc4feff2b60aa730 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Mon, 4 Jun 2012 17:36:11 +0200 Subject: UBUNTU: [Config] Enable dm-raid45 https://lists.ubuntu.com/archives/kernel-team/2012-June/020541.html Signed-off-by: Stefan Bader Signed-off-by: Tim Gardner --- ubuntu/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ubuntu/Makefile b/ubuntu/Makefile index 66bbdf69d6f4..997d9b990f1b 100644 --- a/ubuntu/Makefile +++ b/ubuntu/Makefile @@ -12,7 +12,7 @@ ## ## ## -#obj-$(CONFIG_DM_RAID45) += dm-raid4-5/ +obj-$(CONFIG_DM_RAID45) += dm-raid4-5/ ## ## ## -- cgit v1.2.3