From 4da47859956cebdc4c58c38a931e21847458d744 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:13 -0800 Subject: kernel.h: neaten panic prototype Use __printf macro. Convert NORET_AND to ATTRIB_NORET. Use the normal kernel style for pointer arguments. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d0a7a0c7166..60934395e36 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -185,8 +185,9 @@ static inline void might_fault(void) extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); -NORET_TYPE void panic(const char * fmt, ...) - __attribute__ ((NORET_AND format (printf, 1, 2))) __cold; +NORET_TYPE __printf(1, 2) +void panic(const char *fmt, ...) + ATTRIB_NORET __cold; extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); -- cgit v1.2.3 From 80bf007f20b16272f210e0803f739f5606cff59d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:14 -0800 Subject: include/linux/linkage.h: remove unused NORET_AND macro The only use in kernel.h is gone so remove the macro. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/linkage.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 3f46aedea42..c75074cb8ad 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -90,6 +90,5 @@ #define NORET_TYPE /**/ #define ATTRIB_NORET __attribute__((noreturn)) -#define NORET_AND noreturn, #endif -- cgit v1.2.3 From 9402c95f34a66e81eba473a2f7267bbae5a1dee2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:17 -0800 Subject: treewide: remove useless NORET_TYPE macro and uses It's a very old and now unused prototype marking so just delete it. Neaten panic pointer argument style to keep checkpatch quiet. Signed-off-by: Joe Perches Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Acked-by: Geert Uytterhoeven Acked-by: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 6 +++--- include/linux/linkage.h | 1 - include/linux/sched.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 60934395e36..aaf1753dd2b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -185,16 +185,16 @@ static inline void might_fault(void) extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); -NORET_TYPE __printf(1, 2) +__printf(1, 2) void panic(const char *fmt, ...) ATTRIB_NORET __cold; extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); -NORET_TYPE void do_exit(long error_code) +void do_exit(long error_code) ATTRIB_NORET; -NORET_TYPE void complete_and_exit(struct completion *, long) +void complete_and_exit(struct completion *, long) ATTRIB_NORET; /* Internal, do not use. 
*/ diff --git a/include/linux/linkage.h b/include/linux/linkage.h index c75074cb8ad..6a8f252e49e 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -88,7 +88,6 @@ #endif -#define NORET_TYPE /**/ #define ATTRIB_NORET __attribute__((noreturn)) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 21cd0303af5..4032ec1cf83 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2275,7 +2275,7 @@ extern void __cleanup_sighand(struct sighand_struct *); extern void exit_itimers(struct signal_struct *); extern void flush_itimer_signals(void); -extern NORET_TYPE void do_group_exit(int); +extern void do_group_exit(int); extern void daemonize(const char *, ...); extern int allow_signal(int); -- cgit v1.2.3 From ff2d8b19a3a62559afba1c53360c8577a7697714 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:21 -0800 Subject: treewide: convert uses of ATTRIB_NORETURN to __noreturn Use the more commonly used __noreturn instead of ATTRIB_NORETURN. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Joe Perches Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Acked-by: Geert Uytterhoeven Acked-by: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index aaf1753dd2b..e8343422240 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -187,15 +187,15 @@ extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); __printf(1, 2) void panic(const char *fmt, ...) - ATTRIB_NORET __cold; + __noreturn __cold; extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); void do_exit(long error_code) - ATTRIB_NORET; + __noreturn; void complete_and_exit(struct completion *, long) - ATTRIB_NORET; + __noreturn; /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); -- cgit v1.2.3 From 0d259cf8190b9c446eefd5225ffcc3941e76a432 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:25 -0800 Subject: include/linux/linkage.h: remove unused ATTRIB_NORET macro The uses have been renamed so delete the unused macro. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/linkage.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 6a8f252e49e..807f1e53322 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -88,6 +88,4 @@ #endif -#define ATTRIB_NORET __attribute__((noreturn)) - #endif -- cgit v1.2.3 From 43570fd2f47ba518145e9289f54cde3dba4c8b25 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:27 -0800 Subject: mm,slub,x86: decouple size of struct page from CONFIG_CMPXCHG_LOCAL While implementing cmpxchg_double() on s390 I realized that we don't set CONFIG_CMPXCHG_LOCAL despite the fact that we have support for it. However setting that option will increase the size of struct page by eight bytes on 64 bit, which we certainly do not want. Also, it doesn't make sense that a present cpu feature should increase the size of struct page. 
Besides that it looks like the dependency to CMPXCHG_LOCAL is wrong and that it should depend on CMPXCHG_DOUBLE instead. This patch: If an architecture supports CMPXCHG_LOCAL this shouldn't result automatically in larger struct pages if the SLUB allocator is used. Instead introduce a new config option "HAVE_ALIGNED_STRUCT_PAGE" which can be selected if a double word aligned struct page is required. Also update x86 Kconfig so that it should work as before. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b42f1b34eb..3cc3062b376 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -151,12 +151,11 @@ struct page { #endif } /* - * If another subsystem starts using the double word pairing for atomic - * operations on struct page then it must change the #if to ensure - * proper alignment of the page struct. + * The struct page can be forced to be double word aligned so that atomic ops + * on double words work. The SLUB allocator can make use of such a feature. */ -#if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL) - __attribute__((__aligned__(2*sizeof(unsigned long)))) +#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE + __aligned(2 * sizeof(unsigned long)) #endif ; -- cgit v1.2.3 From 28d82dc1c4edbc352129f97f4ca22624d1fe61de Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 12 Jan 2012 17:17:43 -0800 Subject: epoll: limit paths The current epoll code can be tickled to run basically indefinitely in both loop detection path check (on ep_insert()), and in the wakeup paths. The programs that tickle this behavior set up deeply linked networks of epoll file descriptors that cause the epoll algorithms to traverse them indefinitely. A couple of these sample programs have been previously posted in this thread: https://lkml.org/lkml/2011/2/25/297. To fix the loop detection path check algorithms, I simply keep track of the epoll nodes that have been already visited. Thus, the loop detection becomes proportional to the number of epoll file descriptors and links. This dramatically decreases the run-time of the loop check algorithm. In one diabolical case I tried it reduced the run-time from 15 minutes (all in kernel time) to .3 seconds. Fixing the wakeup paths could be done at wakeup time in a similar manner by keeping track of nodes that have already been visited, but the complexity is greater, since there can be multiple wakeups on different cpus...Thus, I've opted to limit the number of possible wakeup paths when the paths are created. This is accomplished by noting that the end file descriptor points that are found during the loop detection pass (from the newly added link) are actually the sources for wakeup events. I keep a list of these file descriptors and limit the number and length of these paths that emanate from these 'source file descriptors'. In the current implementation I allow 1000 paths of length 1, 500 of length 2, 100 of length 3, 50 of length 4 and 10 of length 5. Note that it is sufficient to check the 'source file descriptors' reachable from the newly added link, since no other 'source file descriptors' will have newly added links.
This allows us to check only the wakeup paths that may have gotten too long, and not re-check all possible wakeup paths on the system. In terms of the path limit selection, I think it's first worth noting that the most common case for epoll is probably the model where you have 1 epoll file descriptor that is monitoring n number of 'source file descriptors'. In this case, each 'source file descriptor' has 1 path of length 1. Thus, I believe that the limits I'm proposing are quite reasonable and in fact may be too generous. Thus, I'm hoping that the proposed limits will not cause any workloads that currently work to fail. In terms of locking, I have extended the use of the 'epmutex' to all epoll_ctl add and remove operations. Currently it's only used in a subset of the add paths. I need to hold the epmutex, so that we can correctly traverse a coherent graph, to check the number of paths. I believe that this additional locking is probably ok, since it's in the setup/teardown paths, and doesn't affect the running paths, but it certainly is going to add some extra overhead. Also, worth noting is that the epmutex was recently added to the ep_ctl add operations in the initial path loop detection code using the argument that it was not on a critical path. Another thing to note here is the length of epoll chains that is allowed. Currently, eventpoll.c defines: /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 This basically means that I am limited to a graph depth of 5 (EP_MAX_NESTS + 1). However, this limit is currently only enforced during the loop check detection code, and only when the epoll file descriptors are added in a certain order. Thus, this limit is currently easily bypassed. The newly added check for wakeup paths strictly limits the wakeup paths to a length of 5, regardless of the order in which ep's are linked together. Thus, a side-effect of the new code is a more consistent enforcement of the graph depth. Thus far, I've tested this using the sample programs previously mentioned, which now either return quickly or return -EINVAL. I've also tested using the piptest.c epoll tester, which showed no difference in performance. I've also created a number of different epoll networks and tested that they behave as expected. I believe this solves the original diabolical test cases, while still preserving the sane epoll nesting.
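To make the kind of nesting being limited concrete, here is a minimal userspace sketch (not part of this patch) that chains epoll file descriptors together; with these checks in place, an add that would create an over-long or cyclic wakeup path fails, though the exact point and errno depend on link order:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>

int main(void)
{
	int ep[8];
	struct epoll_event ev = { .events = EPOLLIN };
	int i;

	for (i = 0; i < 8; i++)
		ep[i] = epoll_create1(0);

	/* Chain them: ep[0] watches ep[1], ep[1] watches ep[2], ... */
	for (i = 0; i < 7; i++) {
		ev.data.fd = ep[i + 1];
		if (epoll_ctl(ep[i], EPOLL_CTL_ADD, ep[i + 1], &ev) < 0) {
			printf("add at depth %d failed: %s\n",
			       i + 1, strerror(errno));
			return 0;
		}
	}
	printf("all adds succeeded\n");
	return 0;
}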
Signed-off-by: Jason Baron Cc: Nelson Elhage Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/eventpoll.h | 1 + include/linux/fs.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f362733186a..657ab55beda 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -61,6 +61,7 @@ struct file; static inline void eventpoll_init_file(struct file *file) { INIT_LIST_HEAD(&file->f_ep_links); + INIT_LIST_HEAD(&file->f_tfile_llink); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 7aacf31418f..a7409bc157e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1001,6 +1001,7 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; + struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; #ifdef CONFIG_DEBUG_WRITECOUNT -- cgit v1.2.3 From ab936cbcd02072a34b60d268f94440fd5cf1970b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:17:44 -0800 Subject: memcg: add mem_cgroup_replace_page_cache() to fix LRU issue Commit ef6a3c6311 ("mm: add replace_page_cache_page() function") added a function replace_page_cache_page(). This function replaces a page in the radix-tree with a new page. When doing this, the memory cgroup needs to fix up the accounting information; memcg needs to check the PCG_USED bit etc. In some (many?) cases, 'newpage' is on the LRU before calling replace_page_cache(). So, memcg's LRU accounting information should be fixed, too. This patch adds mem_cgroup_replace_page_cache() and removes the old hooks. In that function, the old page will be unaccounted without touching res_counter and the new page will be accounted to the memcg (of the old page). When overwriting pc->mem_cgroup of newpage, take zone->lru_lock and avoid races with LRU handling. Background: replace_page_cache_page() is called by FUSE code in its splice() handling. Here, 'newpage' is replacing oldpage but this newpage is not a newly allocated page and may be on the LRU. LRU mis-accounting will be critical for the memory cgroup because rmdir() checks that the whole LRU is empty and there is no account leak. If a page is on a different LRU than it should be, rmdir() will fail. This bug was added in March 2011, but there is no bug report yet. I guess there are not many people who use memcg and FUSE at the same time with upstream kernels. The result of this bug is that an admin cannot destroy a memcg because of an account leak. So, no panic, no deadlock. And, even if an active cgroup exists, umount can succeed. So no problem at shutdown.
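As a rough sketch of where the new hook sits (the radix-tree replacement itself is elided and the surrounding code is illustrative, not the actual mm/filemap.c implementation):

int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
	int error;

	/* ... swap 'old' for 'new' in the mapping's radix tree ... */
	error = 0;

	if (!error) {
		/*
		 * Move memcg ownership: uncharge 'old' without touching
		 * res_counter, charge 'new' to old's memcg, and take
		 * zone->lru_lock while rewriting pc->mem_cgroup in case
		 * 'new' is already on an LRU.
		 */
		mem_cgroup_replace_page_cache(old, new);
	}
	return error;
}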
Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Miklos Szeredi Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f944591765e..3558a5e268c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -122,6 +122,8 @@ struct zone_reclaim_stat* mem_cgroup_get_reclaim_stat_from_page(struct page *page); extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); +extern void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; @@ -369,6 +371,10 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { } +static inline void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage) +{ +} #endif /* CONFIG_CGROUP_MEM_CONT */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) -- cgit v1.2.3 From 5660048ccac8735d9bc0a46325a02e6a6518b5b2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:17:59 -0800 Subject: mm: move memcg hierarchy reclaim to generic reclaim code Memory cgroup limit reclaim and traditional global pressure reclaim will soon share the same code to reclaim from a hierarchical tree of memory cgroups. In preparation of this, move the two right next to each other in shrink_zone(). The mem_cgroup_hierarchical_reclaim() polymath is split into a soft limit reclaim function, which still does hierarchy walking on its own, and a limit (shrinking) reclaim function, which relies on generic reclaim code to walk the hierarchy. Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3558a5e268c..3b99dce8529 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -40,6 +40,12 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct mem_cgroup *mem_cont, int active, int file); +struct mem_cgroup_reclaim_cookie { + struct zone *zone; + int priority; + unsigned int generation; +}; + #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* * All "charge" functions with gfp_mask should use GFP_KERNEL or @@ -106,6 +112,11 @@ mem_cgroup_prepare_migration(struct page *page, extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, + struct mem_cgroup *, + struct mem_cgroup_reclaim_cookie *); +void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); + /* * For memory reclaim. 
*/ @@ -281,6 +292,19 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, { } +static inline struct mem_cgroup * +mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) +{ + return NULL; +} + +static inline void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) +{ +} + static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) { return 0; -- cgit v1.2.3 From 6290df545814990ca2663baf6e894669132d5f73 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:10 -0800 Subject: mm: collect LRU list heads into struct lruvec Having a unified structure with an LRU list set for both global zones and per-memcg zones allows the code that deals with LRU lists, and does not care about the container itself, to be kept simple. Once the per-memcg LRU lists directly link struct pages, the isolation function and all other list manipulations are shared between the memcg case and the global LRU case. Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 +- include/linux/mmzone.h | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8f7d24712dc..e6a7ffe16d3 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -33,7 +33,7 @@ __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, static inline void add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) { - __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); + __add_page_to_lru_list(zone, page, l, &zone->lruvec.lists[l]); } static inline void diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ca6ca92418a..42e544cd4c9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -159,6 +159,10 @@ static inline int is_unevictable_lru(enum lru_list l) return (l == LRU_UNEVICTABLE); } +struct lruvec { + struct list_head lists[NR_LRU_LISTS]; +}; + /* Mask used at gathering information at once (see memcontrol.c) */ #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) @@ -364,10 +368,8 @@ struct zone { ZONE_PADDING(_pad1_) /* Fields commonly accessed by the page reclaim scanner */ - spinlock_t lru_lock; - struct zone_lru { - struct list_head list; - } lru[NR_LRU_LISTS]; + spinlock_t lru_lock; + struct lruvec lruvec; struct zone_reclaim_stat reclaim_stat; -- cgit v1.2.3 From 925b7673cce39116ce61e7a06683a4a0dad1e72a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:15 -0800 Subject: mm: make per-memcg LRU lists exclusive Now that all code that operated on global per-zone LRU lists is converted to operate on per-memory cgroup LRU lists instead, there is no reason to keep the double-LRU scheme around any longer. The pc->lru member is removed and page->lru is linked directly to the per-memory cgroup LRU lists, which removes two pointers from a descriptor that exists for every page frame in the system.
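A sketch of what the unification buys (mem_cgroup_zone_lruvec() is taken from the diff below; the scan loop and its missing locking are illustrative only):

static unsigned long scan_inactive_file(struct zone *zone,
					struct mem_cgroup *memcg)
{
	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
	struct page *page;
	unsigned long nr = 0;

	/* One loop now serves both the global and the per-memcg case */
	list_for_each_entry(page, &lruvec->lists[LRU_INACTIVE_FILE], lru)
		nr++;	/* a real scanner would isolate pages here */

	return nr;
}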
Signed-off-by: Johannes Weiner Signed-off-by: Hugh Dickins Signed-off-by: Ying Han Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. Shutemov Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 51 +++++++++++++++++++-------------------------- include/linux/mm_inline.h | 21 +++++++------------ include/linux/page_cgroup.h | 1 - 3 files changed, 30 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3b99dce8529..e2f8e7caf04 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -32,14 +32,6 @@ enum mem_cgroup_page_stat_item { MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ }; -extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - isolate_mode_t mode, - struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file); - struct mem_cgroup_reclaim_cookie { struct zone *zone; int priority; @@ -69,13 +61,14 @@ extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); -extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_rotate_reclaimable_page(struct page *page); -extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru); -extern void mem_cgroup_del_lru(struct page *page); -extern void mem_cgroup_move_lists(struct page *page, - enum lru_list from, enum lru_list to); + +struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); +struct lruvec *mem_cgroup_lru_add_list(struct zone *, struct page *, + enum lru_list); +void mem_cgroup_lru_del_list(struct page *, enum lru_list); +void mem_cgroup_lru_del(struct page *); +struct lruvec *mem_cgroup_lru_move_lists(struct zone *, struct page *, + enum lru_list, enum lru_list); /* For coalescing uncharge for reducing memcg' overhead*/ extern void mem_cgroup_uncharge_start(void); @@ -223,33 +216,33 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page) { } -static inline void mem_cgroup_add_lru_list(struct page *page, int lru) -{ -} - -static inline void mem_cgroup_del_lru_list(struct page *page, int lru) +static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, + struct mem_cgroup *memcg) { - return ; + return &zone->lruvec; } -static inline void mem_cgroup_rotate_reclaimable_page(struct page *page) +static inline struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, + struct page *page, + enum lru_list lru) { - return ; + return &zone->lruvec; } -static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru) +static inline void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) { - return ; } -static inline void mem_cgroup_del_lru(struct page *page) +static inline void mem_cgroup_lru_del(struct page *page) { - return ; } -static inline void -mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) +static inline struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, + struct page *page, + enum lru_list from, + enum lru_list to) { + return &zone->lruvec; } static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct 
page *page) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e6a7ffe16d3..4e3478e7192 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -21,27 +21,22 @@ static inline int page_is_file_cache(struct page *page) return !PageSwapBacked(page); } -static inline void -__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, - struct list_head *head) -{ - list_add(&page->lru, head); - __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); - mem_cgroup_add_lru_list(page, l); -} - static inline void add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) { - __add_page_to_lru_list(zone, page, l, &zone->lruvec.lists[l]); + struct lruvec *lruvec; + + lruvec = mem_cgroup_lru_add_list(zone, page, l); + list_add(&page->lru, &lruvec->lists[l]); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); } static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { + mem_cgroup_lru_del_list(page, l); list_del(&page->lru); __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); - mem_cgroup_del_lru_list(page, l); } /** @@ -64,7 +59,6 @@ del_page_from_lru(struct zone *zone, struct page *page) { enum lru_list l; - list_del(&page->lru); if (PageUnevictable(page)) { __ClearPageUnevictable(page); l = LRU_UNEVICTABLE; @@ -75,8 +69,9 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } + mem_cgroup_lru_del_list(page, l); + list_del(&page->lru); __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); - mem_cgroup_del_lru_list(page, l); } /** diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 961ecc7d30b..5bae7535c20 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -31,7 +31,6 @@ enum { struct page_cgroup { unsigned long flags; struct mem_cgroup *mem_cgroup; - struct list_head lru; /* per cgroup LRU list */ }; void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); -- cgit v1.2.3 From 6b208e3f6e35aa76d254c395bdcd984b17c6b626 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:18 -0800 Subject: mm: memcg: remove unused node/section info from pc->flags To find the page corresponding to a certain page_cgroup, the pc->flags encoded the node or section ID with the base array to compare the pc pointer to. Now that the per-memory cgroup LRU lists link page descriptors directly, there is no longer any code that knows the struct page_cgroup of a PFN but not the struct page. [hughd@google.com: remove unused node/section info from pc->flags fix] Signed-off-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Reviewed-by: Kirill A. Shutemov Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Ying Han Cc: Greg Thelen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 5bae7535c20..aaa60da8783 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -121,39 +121,6 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc, local_irq_restore(*flags); } -#ifdef CONFIG_SPARSEMEM -#define PCG_ARRAYID_WIDTH SECTIONS_SHIFT -#else -#define PCG_ARRAYID_WIDTH NODES_SHIFT -#endif - -#if (PCG_ARRAYID_WIDTH > BITS_PER_LONG - NR_PCG_FLAGS) -#error Not enough space left in pc->flags to store page_cgroup array IDs -#endif - -/* pc->flags: ARRAY-ID | FLAGS */ - -#define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_WIDTH) - 1) - -#define PCG_ARRAYID_OFFSET (BITS_PER_LONG - PCG_ARRAYID_WIDTH) -/* - * Zero the shift count for non-existent fields, to prevent compiler - * warnings and ensure references are optimized away. - */ -#define PCG_ARRAYID_SHIFT (PCG_ARRAYID_OFFSET * (PCG_ARRAYID_WIDTH != 0)) - -static inline void set_page_cgroup_array_id(struct page_cgroup *pc, - unsigned long id) -{ - pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_SHIFT); - pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_SHIFT; -} - -static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc) -{ - return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK; -} - #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; -- cgit v1.2.3 From e94c8a9cbce1aee4af9e1285802785481b7f93c5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:18:20 -0800 Subject: memcg: make mem_cgroup_split_huge_fixup() more efficient In split_huge_page(), mem_cgroup_split_huge_fixup() is called to handle page_cgroup modifications. It takes move_lock_page_cgroup(), modifies page_cgroup and LRU accounting, and is called HPAGE_PMD_SIZE - 1 times. But thinking again, - compound_lock() is held at move_account...then, it's not necessary to take move_lock_page_cgroup(). - LRU is locked and all tail pages will go into the same LRU as the head is now on. - page_cgroup is contiguous in the huge page range. This patch fixes mem_cgroup_split_huge_fixup() so that it is called once per hugepage, reducing the cost of splitting.
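The shape of the change, as a sketch (the real code lives in mm/memcontrol.c; lookup_page_cgroup() is the existing helper, and the per-tail work shown here is illustrative):

void mem_cgroup_split_huge_fixup(struct page *head)
{
	struct page_cgroup *head_pc = lookup_page_cgroup(head);
	struct page_cgroup *pc;
	int i;

	/*
	 * page_cgroup is contiguous over the huge page range, so a
	 * single pass under the already-held locks covers every tail.
	 */
	for (i = 1; i < HPAGE_PMD_NR; i++) {
		pc = head_pc + i;
		pc->mem_cgroup = head_pc->mem_cgroup;
		/* tail pages join the same LRU the head is on */
	}
}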
[akpm@linux-foundation.org: fix typo, per Michal] Signed-off-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Andrea Arcangeli Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e2f8e7caf04..cee3761666f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -163,7 +163,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); +void mem_cgroup_split_huge_fixup(struct page *head); #endif #ifdef CONFIG_DEBUG_VM @@ -379,8 +379,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return 0; } -static inline void mem_cgroup_split_huge_fixup(struct page *head, - struct page *tail) +static inline void mem_cgroup_split_huge_fixup(struct page *head) { } -- cgit v1.2.3 From 72835c86ca15d0126354b73d5f29ce9194931c9b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:32 -0800 Subject: mm: unify remaining mem_cont, mem, etc. variable names to memcg Signed-off-by: Johannes Weiner Acked-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Balbir Singh Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 ++++++++-------- include/linux/oom.h | 2 +- include/linux/rmap.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cee3761666f..b80de520670 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -54,10 +54,10 @@ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t mask, struct mem_cgroup **ptr); + struct page *page, gfp_t mask, struct mem_cgroup **memcgp); extern void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *ptr); -extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); + struct mem_cgroup *memcg); +extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); @@ -101,7 +101,7 @@ extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); extern int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask); + struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask); extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); @@ -186,17 +186,17 @@ static inline int mem_cgroup_cache_charge(struct page *page, } static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr) + struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) { return 0; } static inline void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *ptr) + struct mem_cgroup *memcg) { } -static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr) +static inline void mem_cgroup_cancel_charge_swapin(struct 
mem_cgroup *memcg) { } @@ -275,7 +275,7 @@ static inline struct cgroup_subsys_state static inline int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **ptr, gfp_t gfp_mask) + struct mem_cgroup **memcgp, gfp_t gfp_mask) { return 0; } diff --git a/include/linux/oom.h b/include/linux/oom.h index 6f9d04a8533..552fba9c7d5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -43,7 +43,7 @@ enum oom_constraint { extern void compare_swap_oom_score_adj(int old_val, int new_val); extern int test_set_oom_score_adj(int new_val); -extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, +extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1afb9954bbf..1cdd62a2788 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -158,7 +158,7 @@ static inline void page_dup_rmap(struct page *page) * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, - struct mem_cgroup *cnt, unsigned long *vm_flags); + struct mem_cgroup *memcg, unsigned long *vm_flags); int page_referenced_one(struct page *, struct vm_area_struct *, unsigned long address, unsigned int *mapcount, unsigned long *vm_flags); @@ -236,7 +236,7 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *, #define anon_vma_link(vma) do {} while (0) static inline int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *cnt, + struct mem_cgroup *memcg, unsigned long *vm_flags) { *vm_flags = 0; -- cgit v1.2.3 From 9fb4b7cc0724f178d4b24a2a566ea1e7cb120b82 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 12 Jan 2012 17:18:48 -0800 Subject: page_cgroup: add helper function to get swap_cgroup There are multiple places which need to get the swap_cgroup address, so add a helper function: static struct swap_cgroup *swap_cgroup_getsc(swp_entry_t ent, struct swap_cgroup_ctrl **ctrl); to simplify the code. 
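A sketch of how a caller shrinks once the helper exists (the helper signature is taken from the changelog above; the body of lookup_swap_cgroup_id() shown here is illustrative, not the exact mm/page_cgroup.c code):

unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
	struct swap_cgroup_ctrl *ctrl;
	struct swap_cgroup *sc = swap_cgroup_getsc(ent, &ctrl);

	/* the helper did the type/offset arithmetic; just read the id */
	return sc->id;
}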
Signed-off-by: Bob Liu Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index aaa60da8783..1153095ee45 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -149,7 +149,7 @@ static inline void __init page_cgroup_init_flatmem(void) extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); -extern unsigned short lookup_swap_cgroup(swp_entry_t ent); +extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); #else @@ -161,7 +161,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) } static inline -unsigned short lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { return 0; } -- cgit v1.2.3 From 4e5f01c2b9b94321992acb09c35d34f5ee5bb274 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:18:58 -0800 Subject: memcg: clear pc->mem_cgroup if necessary. This is a preparation for removing the flag PCG_ACCT_LRU in page_cgroup and reducing atomic ops/complexity in memcg LRU handling. In some cases, pages are added to the LRU before being charged to a memcg, and so are not classified to a memory cgroup at LRU addition. Now, the LRU where the page should be added is determined by a bit in page_cgroup->flags and by pc->mem_cgroup. I'd like to remove the flag check. In that case, pc->mem_cgroup may contain stale pointers if pages are added to the LRU before classification; to handle this, the patch resets pc->mem_cgroup to root_mem_cgroup before LRU additions.
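A sketch of the intended ordering at a call site (the surrounding path is illustrative; mem_cgroup_reset_owner() is the new hook from the diff below):

	/* page may reach the LRU before it is charged to a memcg */
	mem_cgroup_reset_owner(page);	/* pc->mem_cgroup = root_mem_cgroup */
	lru_cache_add_anon(page);	/* safe: owner is never stale on LRU */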
[akpm@linux-foundation.org: fix CONFIG_CGROUP_MEM_CONT=n build] [hughd@google.com: fix CONFIG_CGROUP_MEM_RES_CTLR=y CONFIG_CGROUP_MEM_RES_CTLR_SWAP=n build] [akpm@linux-foundation.org: ksm.c needs memcontrol.h, per Michal] [hughd@google.com: stop oops in mem_cgroup_reset_owner()] [hughd@google.com: fix page migration to reset_owner] Signed-off-by: KAMEZAWA Hiroyuki Cc: Miklos Szeredi Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Ying Han Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b80de520670..4d34356fe64 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -129,6 +129,7 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); +extern void mem_cgroup_reset_owner(struct page *page); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif @@ -391,6 +392,10 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { } + +static inline void mem_cgroup_reset_owner(struct page *page) +{ +} #endif /* CONFIG_CGROUP_MEM_CONT */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) -- cgit v1.2.3 From 38c5d72f3ebe5ddd57d2f08dc035070fc6c9a287 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 12 Jan 2012 17:19:01 -0800 Subject: memcg: simplify LRU handling by new rule Now, at LRU handling, the memory cgroup needs to do complicated work to see a valid pc->mem_cgroup, which may be overwritten. This patch is for relaxing the protocol. This patch guarantees that when pc->mem_cgroup is overwritten, the page must not be on an LRU. By this, the LRU routine can trust pc->mem_cgroup and doesn't need to check bits on pc->flags. This new rule may add small overhead to swapin. But in most cases, LRU handling gets faster. After this patch, the PCG_ACCT_LRU bit is obsolete and removed. [akpm@linux-foundation.org: remove unneeded VM_BUG_ON(), restore hannes's christmas tree] [akpm@linux-foundation.org: clean up code comment] [hughd@google.com: fix NULL mem_cgroup_try_charge] Signed-off-by: KAMEZAWA Hiroyuki Cc: Miklos Szeredi Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Ying Han Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 1153095ee45..a2d11771c84 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -10,8 +10,6 @@ enum { /* flags for mem_cgroup and file and I/O status */ PCG_MOVE_LOCK, /* For race between move_account v.s.
following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ - /* No lock in page_cgroup */ - PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ __NR_PCG_FLAGS, }; @@ -75,12 +73,6 @@ TESTPCGFLAG(Used, USED) CLEARPCGFLAG(Used, USED) SETPCGFLAG(Used, USED) -SETPCGFLAG(AcctLRU, ACCT_LRU) -CLEARPCGFLAG(AcctLRU, ACCT_LRU) -TESTPCGFLAG(AcctLRU, ACCT_LRU) -TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) - - SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) -- cgit v1.2.3 From f21760b15dcd091e5afd38d0b97197b45f7ef2ea Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 12 Jan 2012 17:19:16 -0800 Subject: thp: add tlb_remove_pmd_tlb_entry We have tlb_remove_tlb_entry to indicate that a pte tlb entry should be flushed, but no corresponding API for a pmd entry. This isn't a problem so far because THP is only for x86 currently and tlb_flush() under x86 will flush the entire TLB. But this is confusing and could be missed if THP is ported to another arch. Also convert tlb->need_flush = 1 to a VM_BUG_ON(!tlb->need_flush) in __tlb_remove_page() as suggested by Andrea Arcangeli. The __tlb_remove_page() function is supposed to be called after tlb_remove_xxx_tlb_entry(), so we can catch any misuse. Signed-off-by: Shaohua Li Reviewed-by: Andrea Arcangeli Cc: David Rientjes Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a9ace9c3250..1b921299abc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -18,7 +18,7 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, unsigned int flags); extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, - pmd_t *pmd); + pmd_t *pmd, unsigned long addr); extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); -- cgit v1.2.3 From b969c4ab9f182a6e1b2a0848be349f99714947b0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:34 -0800 Subject: mm: compaction: determine if dirty pages can be migrated without blocking within ->migratepage Asynchronous compaction is used when allocating transparent hugepages to avoid blocking for long periods of time. Due to reports of stalling, there was a debate on disabling synchronous compaction but this severely impacted allocation success rates. Part of the reason was that many dirty pages are skipped in asynchronous compaction by the following check; if (PageDirty(page) && !sync && mapping->a_ops->migratepage != migrate_page) rc = -EBUSY; This skips over all mapping aops using buffer_migrate_page() even though it is possible to migrate some of these pages without blocking. This patch updates the ->migratepage callback with a "sync" parameter. It is the responsibility of the callback to fail gracefully if migration would block.
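A sketch of a conforming callback (example_migratepage() is hypothetical; migrate_page() and the bool parameter match the diff below):

static int example_migratepage(struct address_space *mapping,
			       struct page *newpage, struct page *page,
			       bool sync)
{
	/* asynchronous mode must not block: give up instead of waiting */
	if (!sync && PageWriteback(page))
		return -EBUSY;

	return migrate_page(mapping, newpage, page, sync);
}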
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 9 ++++++--- include/linux/migrate.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index a7409bc157e..b92b73d0b2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -609,9 +609,12 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); - /* migrate the contents of a page to the specified target */ + /* + * migrate the contents of a page to the specified target. If sync + * is false, it must not block. + */ int (*migratepage) (struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); @@ -2537,7 +2540,7 @@ extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); #else #define buffer_migrate_page NULL #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index e39aeecfe9a..14e6d2a8847 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -11,7 +11,7 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, bool sync); -- cgit v1.2.3 From c82449352854ff09e43062246af86bdeb628f0c3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:38 -0800 Subject: mm: compaction: make isolate_lru_page() filter-aware again Commit 39deaf85 ("mm: compaction: make isolate_lru_page() filter-aware") noted that compaction does not migrate dirty or writeback pages and that it was meaningless to pick such a page and re-add it to the LRU list. This had to be partially reverted because some dirty pages can be migrated by compaction without blocking. This patch updates "mm: compaction: make isolate_lru_page" by skipping over pages that migration has no possibility of migrating to minimise LRU disruption. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Reviewed-by: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 42e544cd4c9..2038b90ca6e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -177,6 +177,8 @@ struct lruvec { #define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8) +/* Isolate for asynchronous migration */ +#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x10) /* LRU Isolation modes.
*/ typedef unsigned __bitwise__ isolate_mode_t; -- cgit v1.2.3 From a6bc32b899223a877f595ef9ddc1e89ead5072b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:43 -0800 Subject: mm: compaction: introduce sync-light migration for use by compaction This patch adds a lightweight sync migrate operation MIGRATE_SYNC_LIGHT mode that avoids writing back pages to backing storage. Async compaction maps to MIGRATE_ASYNC while sync compaction maps to MIGRATE_SYNC_LIGHT. For other migrate_pages users such as memory hotplug, MIGRATE_SYNC is used. This avoids sync compaction stalling for an excessive length of time, particularly when copying files to a USB stick where there might be a large number of dirty pages backed by a filesystem that does not support ->writepages. [aarcange@redhat.com: This patch is heavily based on Andrea's work] [akpm@linux-foundation.org: fix fs/nfs/write.c build] [akpm@linux-foundation.org: fix fs/btrfs/disk-io.c build] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 ++++-- include/linux/migrate.h | 23 ++++++++++++++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b92b73d0b2b..e694bd4434a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -525,6 +525,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +enum migrate_mode; struct iov_iter { const struct iovec *iov; @@ -614,7 +615,7 @@ struct address_space_operations { * is false, it must not block. */ int (*migratepage) (struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, enum migrate_mode); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); @@ -2540,7 +2541,8 @@ extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, + enum migrate_mode); #else #define buffer_migrate_page NULL #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 14e6d2a8847..eaf867412f7 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -6,18 +6,31 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); +/* + * MIGRATE_ASYNC means never block + * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking + * on most operations but not ->writepage as the potential stall time + * is too significant + * MIGRATE_SYNC will block when migrating pages + */ +enum migrate_mode { + MIGRATE_ASYNC, + MIGRATE_SYNC_LIGHT, + MIGRATE_SYNC, +}; + #ifdef CONFIG_MIGRATION #define PAGE_MIGRATION 1 extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync); + enum migrate_mode mode); extern int migrate_huge_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync); + enum migrate_mode mode); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -36,10 +49,10 @@ extern int 
migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } -- cgit v1.2.3 From 2bcf887963812c075f80a14e1fad8ec7e1c67acf Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:19:56 -0800 Subject: mm: take pagevecs off reclaim stack Replace pagevecs in putback_lru_pages() and move_active_pages_to_lru() by lists of pages_to_free: then apply Konstantin Khlebnikov's free_hot_cold_page_list() to them instead of pagevec_release(). Which simplifies the flow (no need to drop and retake lock whenever pagevec fills up) and reduces stale addresses in stack backtraces (which often showed through the pagevecs); but more importantly, removes another 120 bytes from the deepest stacks in page reclaim. Although I've not recently seen an actual stack overflow here with a vanilla kernel, move_active_pages_to_lru() has often featured in deep backtraces. However, free_hot_cold_page_list() does not handle compound pages (nor need it: a Transparent HugePage would have been split by the time it reaches the call in shrink_page_list()), but it is possible for putback_lru_pages() or move_active_pages_to_lru() to be left holding the last reference on a THP, so must exclude the unlikely compound case before putting on pages_to_free. Remove pagevec_strip(), its work now done in move_active_pages_to_lru(). The pagevec in scan_mapping_unevictable_pages() remains in mm/vmscan.c, but that is never on the reclaim path, and cannot be replaced by a list. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Reviewed-by: Konstantin Khlebnikov Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index ed17024d2eb..9def9121f8a 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -22,7 +22,6 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); -void pagevec_strip(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, @@ -59,7 +58,6 @@ static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page) return pagevec_space(pvec); } - static inline void pagevec_release(struct pagevec *pvec) { if (pagevec_count(pvec)) -- cgit v1.2.3 From 5095ae83759f035c823fb375c6ed2de99c81d5ec Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:19:58 -0800 Subject: mm: fewer underscores in ____pagevec_lru_add What's so special about ____pagevec_lru_add() that it needs four leading underscores? Nothing, it just helped to distinguish from __pagevec_lru_add() in 2.6.28 development. Cut two leading underscores. 
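For reference, a sketch of the usual batching pattern with the renamed function ('pages' is a hypothetical list of pages to drain; locking elided):

	struct pagevec pvec;
	struct page *page, *next;

	pagevec_init(&pvec, 0);
	list_for_each_entry_safe(page, next, &pages, lru) {
		list_del(&page->lru);
		if (!pagevec_add(&pvec, page))	/* 0 means the pagevec filled */
			__pagevec_lru_add(&pvec, LRU_INACTIVE_FILE);
	}
	__pagevec_lru_add(&pvec, LRU_INACTIVE_FILE);	/* flush the remainder */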
Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 9def9121f8a..2aa12b8499c 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -21,7 +21,7 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); +void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, @@ -66,22 +66,22 @@ static inline void pagevec_release(struct pagevec *pvec) static inline void __pagevec_lru_add_anon(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); + __pagevec_lru_add(pvec, LRU_INACTIVE_ANON); } static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_ACTIVE_ANON); + __pagevec_lru_add(pvec, LRU_ACTIVE_ANON); } static inline void __pagevec_lru_add_file(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_INACTIVE_FILE); + __pagevec_lru_add(pvec, LRU_INACTIVE_FILE); } static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_ACTIVE_FILE); + __pagevec_lru_add(pvec, LRU_ACTIVE_FILE); } static inline void pagevec_lru_add_file(struct pagevec *pvec) -- cgit v1.2.3 From 4111304dab198c687bc60f2e235a9f7ee92c47c8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:20:01 -0800 Subject: mm: enum lru_list lru Mostly we use "enum lru_list lru": change those few "l"s to "lru"s. 
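A typical iteration after the rename, as a sketch (zone and nr_file are hypothetical locals):

	enum lru_list lru;
	unsigned long nr_file = 0;

	for_each_lru(lru) {
		if (is_file_lru(lru))
			nr_file += zone_page_state(zone, NR_LRU_BASE + lru);
	}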
Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 26 +++++++++++++------------- include/linux/mmzone.h | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 4e3478e7192..8f84d2e53d0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -22,21 +22,21 @@ static inline int page_is_file_cache(struct page *page) } static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list lru) { struct lruvec *lruvec; - lruvec = mem_cgroup_lru_add_list(zone, page, l); - list_add(&page->lru, &lruvec->lists[l]); - __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); + lruvec = mem_cgroup_lru_add_list(zone, page, lru); + list_add(&page->lru, &lruvec->lists[lru]); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, hpage_nr_pages(page)); } static inline void -del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) +del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list lru) { - mem_cgroup_lru_del_list(page, l); + mem_cgroup_lru_del_list(page, lru); list_del(&page->lru); - __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page)); } /** @@ -57,21 +57,21 @@ static inline enum lru_list page_lru_base_type(struct page *page) static inline void del_page_from_lru(struct zone *zone, struct page *page) { - enum lru_list l; + enum lru_list lru; if (PageUnevictable(page)) { __ClearPageUnevictable(page); - l = LRU_UNEVICTABLE; + lru = LRU_UNEVICTABLE; } else { - l = page_lru_base_type(page); + lru = page_lru_base_type(page); if (PageActive(page)) { __ClearPageActive(page); - l += LRU_ACTIVE; + lru += LRU_ACTIVE; } } - mem_cgroup_lru_del_list(page, l); + mem_cgroup_lru_del_list(page, lru); list_del(&page->lru); - __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page)); } /** diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2038b90ca6e..650ba2fb330 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -140,23 +140,23 @@ enum lru_list { NR_LRU_LISTS }; -#define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++) +#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) -#define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++) +#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) -static inline int is_file_lru(enum lru_list l) +static inline int is_file_lru(enum lru_list lru) { - return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE); + return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } -static inline int is_active_lru(enum lru_list l) +static inline int is_active_lru(enum lru_list lru) { - return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE); + return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } -static inline int is_unevictable_lru(enum lru_list l) +static inline int is_unevictable_lru(enum lru_list lru) { - return (l == LRU_UNEVICTABLE); + return (lru == LRU_UNEVICTABLE); } struct lruvec { -- cgit v1.2.3 From 1c1c53d43b387d02174911ecb42ce846577b0ea0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 12 Jan 2012 17:20:04 -0800 Subject: mm: remove 
del_page_from_lru, add page_off_lru del_page_from_lru() repeats del_page_from_lru_list(), also working out which LRU the page was on, clearing the relevant bits. Decouple those functions: remove del_page_from_lru() and add page_off_lru(). Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8f84d2e53d0..227fd3e9a9c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -54,8 +54,14 @@ static inline enum lru_list page_lru_base_type(struct page *page) return LRU_INACTIVE_ANON; } -static inline void -del_page_from_lru(struct zone *zone, struct page *page) +/** + * page_off_lru - which LRU list was page on? clearing its lru flags. + * @page: the page to test + * + * Returns the LRU list a page was on, as an index into the array of LRU + * lists; and clears its Unevictable or Active flags, ready for freeing. + */ +static inline enum lru_list page_off_lru(struct page *page) { enum lru_list lru; @@ -69,9 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) lru += LRU_ACTIVE; } } - mem_cgroup_lru_del_list(page, lru); - list_del(&page->lru); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -hpage_nr_pages(page)); + return lru; } /** @@ -92,7 +96,6 @@ static inline enum lru_list page_lru(struct page *page) if (PageActive(page)) lru += LRU_ACTIVE; } - return lru; } -- cgit v1.2.3 From a3dd3323058d281abd584b15ad4c5b65064d7a61 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 12 Jan 2012 17:20:11 -0800 Subject: kexec: remove KMSG_DUMP_KEXEC KMSG_DUMP_KEXEC is useless because we already save kernel messages inside /proc/vmcore, and it is unsafe to allow modules to do other stuff in a crash dump scenario. [akpm@linux-foundation.org: fix powerpc build] Signed-off-by: WANG Cong Reported-by: Vivek Goyal Acked-by: Vivek Goyal Acked-by: Jarod Wilson Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmsg_dump.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index ee0c952188d..fee66317e07 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -18,7 +18,6 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, - KMSG_DUMP_KEXEC, KMSG_DUMP_RESTART, KMSG_DUMP_HALT, KMSG_DUMP_POWEROFF, -- cgit v1.2.3 From 1f536b9e9f85456df93614b3c2f6a1a2b7d7cb9b Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 12 Jan 2012 17:20:20 -0800 Subject: include/linux/crash_dump.h needs elf.h Building an ARM target, we get the following warnings: CC arch/arm/kernel/setup.o In file included from arch/arm/kernel/setup.c:39: arch/arm/include/asm/elf.h:102:1: warning: "vmcore_elf64_check_arch" redefined In file included from arch/arm/kernel/setup.c:24: include/linux/crash_dump.h:30:1: warning: this is the location of the previous definition Quoting Russell King: "linux/crash_dump.h makes no attempt to include asm/elf.h, but it depends on stuff in asm/elf.h to determine how stuff inside this file is defined at parse time. So, if asm/elf.h is included after linux/crash_dump.h or not at all, you get a different result from the situation where asm/elf.h is included before." So add elf.h header to crash_dump.h to avoid this problem.
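To make the mechanism concrete: linux/crash_dump.h guards a fallback definition roughly of the following shape (paraphrased from the 3.2-era header; the exact expansion is an assumption here, not text from this patch):

	/* linux/crash_dump.h: fallback, used only when the arch
	 * definition has not been seen yet at this point. */
	#ifndef vmcore_elf64_check_arch
	#define vmcore_elf64_check_arch(x) \
		(elf_check_arch(x) || vmcore_elf_check_arch_cross(x))
	#endif

If asm/elf.h, which carries ARM's real vmcore_elf64_check_arch, is compiled only after this fallback, its #define lands on top of the fallback and gcc reports the redefinition shown above. Because linux/elf.h pulls in asm/elf.h, adding it to crash_dump.h makes the arch definition visible before the #ifndef is evaluated, so the result no longer depends on include order.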
The original discussion about this can be found at: http://www.spinics.net/lists/arm-kernel/msg154113.html Signed-off-by: Fabio Estevam Cc: Russell King Cc: [3.2.1] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_dump.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 5c4abce94ad..b936763f223 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -5,6 +5,7 @@ #include <linux/kexec.h> #include <linux/device.h> #include <linux/proc_fs.h> +#include <linux/elf.h> #define ELFCORE_ADDR_MAX (-1ULL) #define ELFCORE_ADDR_ERR (-2ULL) -- cgit v1.2.3 From 87192a2a49c475cf322cb143e0fa63b0102d8567 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2012 17:20:34 -0800 Subject: vfs: cache request_queue in struct block_device This makes it possible to get from the inode to the request_queue with one less cache miss. It is used in a follow-on optimization. The lifetime of the pointer is the same as the gendisk. This assumes that the queue will always stay the same in the gendisk while it's visible to block_devices. I think that's safe, correct? Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e694bd4434a..4bc8169fb5a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -660,6 +660,7 @@ struct address_space { * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ +struct request_queue; struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ @@ -682,6 +683,7 @@ struct block_device { unsigned bd_part_count; int bd_invalidated; struct gendisk * bd_disk; + struct request_queue * bd_queue; struct list_head bd_list; /* * Private data. You must have bd_claim'ed the block_device -- cgit v1.2.3 From 928da837aca77a9d3cb5076bf07b3224b1ba293b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 12 Jan 2012 17:20:39 -0800 Subject: radix_tree: remove radix_tree_indirect_to_ptr() It is not used anymore, so remove it. Signed-off-by: Xiao Guangrong Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 9d4539c52e5..07e360b1b28 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -49,9 +49,6 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -#define radix_tree_indirect_to_ptr(ptr) \ - radix_tree_indirect_to_ptr((void __force *)(ptr)) - static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); -- cgit v1.2.3 From 028ee4be34a09a6d48bdf30ab991ae933a7bc036 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 12 Jan 2012 17:20:55 -0800 Subject: c/r: prctl: add PR_SET_MM codes to set up mm_struct entries When we restore a task, we need to set up text, data and data heap sizes from userspace to the values a task had at checkpoint time. This patch adds auxiliary prctl codes for that. While most of them have a statistical nature (their values are involved in the calculation of /proc/<pid>/statm output), the start_brk and brk values are used to compute an allowed size of program data segment expansion.
This means that arbitrary changes of these values might be a dangerous operation. So, to restrict access, the following requirements are applied to these prctl calls: - The process has to have CAP_SYS_ADMIN capability granted. - For all opcodes except the start_brk/brk members, an appropriate VMA area must exist and should carry certain VMA flags, such as: - the code segment must be executable but not writable; - the data segment must not be executable. The start_brk/brk values must not intersect with the data segment and must not exceed the RLIMIT_DATA resource limit. Still, the main guard is the CAP_SYS_ADMIN capability check. Note the kernel has to be compiled with CONFIG_CHECKPOINT_RESTORE support; otherwise these prctl calls will return -EINVAL. [akpm@linux-foundation.org: cache current->mm in a local, saving 200 bytes text] Signed-off-by: Cyrill Gorcunov Reviewed-by: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index a3baeb2c216..7ddc7f1b480 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -102,4 +102,16 @@ #define PR_MCE_KILL_GET 34 +/* + * Tune up process memory map specifics. + */ +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 + #endif /* _LINUX_PRCTL_H */ -- cgit v1.2.3
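As a usage illustration (not part of the patch): a restore tool would drive these codes from userspace roughly as below. The saved_* parameters are hypothetical values read from a checkpoint image, and the #define fallbacks merely mirror the constants added above for the sake of pre-3.3 userspace headers.

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_MM
	#define PR_SET_MM		35
	#define PR_SET_MM_START_BRK	6
	#define PR_SET_MM_BRK		7
	#endif

	/* Restore one mm_struct field; needs CAP_SYS_ADMIN and a kernel
	 * built with CONFIG_CHECKPOINT_RESTORE, otherwise prctl() fails. */
	static int restore_mm_field(int opt, unsigned long addr)
	{
		if (prctl(PR_SET_MM, opt, addr, 0, 0) != 0) {
			perror("prctl(PR_SET_MM)");
			return -1;
		}
		return 0;
	}

	/* Example: restore the heap bounds saved at checkpoint time. */
	static int restore_brk(unsigned long saved_start_brk, unsigned long saved_brk)
	{
		if (restore_mm_field(PR_SET_MM_START_BRK, saved_start_brk))
			return -1;
		return restore_mm_field(PR_SET_MM_BRK, saved_brk);
	}

Restoring start_brk before brk keeps the pair consistent with the data-segment and RLIMIT_DATA checks described above at each step.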