From f51bdd2e97098a5cbb3cba7c3a56fa0e9ac3c444 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 25 Aug 2011 15:59:10 -0700 Subject: mm: fix a vmscan warning I get the below warning: BUG: using smp_processor_id() in preemptible [00000000] code: bash/746 caller is native_sched_clock+0x37/0x6e Pid: 746, comm: bash Tainted: G W 3.0.0+ #254 Call Trace: [] debug_smp_processor_id+0xc2/0xdc [] native_sched_clock+0x37/0x6e [] try_to_free_mem_cgroup_pages+0x7d/0x270 [] mem_cgroup_force_empty+0x24b/0x27a [] ? sys_close+0x38/0x138 [] ? sys_close+0x38/0x138 [] mem_cgroup_force_empty_write+0x17/0x19 [] cgroup_file_write+0xa8/0xba [] vfs_write+0xb3/0x138 [] sys_write+0x4a/0x71 [] ? sys_close+0xf0/0x138 [] system_call_fastpath+0x16/0x1b sched_clock() can't be used with preempt enabled. And we don't need fast approach to get clock here, so let's use ktime API. Signed-off-by: Shaohua Li Acked-by: KAMEZAWA Hiroyuki Tested-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ef69124fa3e..22631e0994b3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2283,7 +2283,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .mem_cgroup = mem, .memcg_record = rec, }; - unsigned long start, end; + ktime_t start, end; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2292,7 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, sc.may_writepage, sc.gfp_mask); - start = sched_clock(); + start = ktime_get(); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -2301,10 +2301,10 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. 
*/ shrink_zone(0, zone, &sc); - end = sched_clock(); + end = ktime_get(); if (rec) - rec->elapsed += end - start; + rec->elapsed += ktime_to_ns(ktime_sub(end, start)); *scanned = sc.nr_scanned; trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2319,7 +2319,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, { struct zonelist *zonelist; unsigned long nr_reclaimed; - unsigned long start, end; + ktime_t start, end; int nid; struct scan_control sc = { .may_writepage = !laptop_mode, @@ -2337,7 +2337,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .gfp_mask = sc.gfp_mask, }; - start = sched_clock(); + start = ktime_get(); /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -2352,9 +2352,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); - end = sched_clock(); + end = ktime_get(); if (rec) - rec->elapsed += end - start; + rec->elapsed += ktime_to_ns(ktime_sub(end, start)); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); -- cgit v1.2.3 From 439423f6894aa0dec22187526827456f5004baed Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 25 Aug 2011 15:59:12 -0700 Subject: vmscan: clear ZONE_CONGESTED for zone with good watermark ZONE_CONGESTED is only cleared in kswapd, but pages can be freed in any task. It's possible ZONE_CONGESTED isn't cleared in some cases: 1. the zone is already balanced just entering balance_pgdat() for order-0 because concurrent tasks free memory. In this case, later check will skip the zone as it's balanced so the flag isn't cleared. 2. high order balance fallbacks to order-0. 
quote from Mel: At the end of balance_pgdat(), kswapd uses the following logic; If reclaiming at high order { for each zone { if all_unreclaimable skip if watermark is not met order = 0 loop again /* watermark is met */ clear congested } } i.e. it clears ZONE_CONGESTED if the zone is balanced. If not, it restarts balancing at order-0. However, if the higher zones are balanced for order-0, kswapd will miss clearing ZONE_CONGESTED as that only happens after a zone is shrunk. This can mean that wait_iff_congested() stalls unnecessarily. This patch makes kswapd clear ZONE_CONGESTED during its initial highmem->dma scan for zones that are already balanced. Signed-off-by: Shaohua Li Acked-by: Mel Gorman Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 22631e0994b3..b7719ec10dc5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2529,6 +2529,9 @@ loop_again: high_wmark_pages(zone), 0, 0)) { end_zone = i; break; + } else { + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); } } if (i < 0) -- cgit v1.2.3 From a4d3e9e76337059406fcf3ead288c0df22a790e9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 14 Sep 2011 16:21:52 -0700 Subject: mm: vmscan: fix force-scanning small targets without swap Without swap, anonymous pages are not scanned. As such, they should not count when considering force-scanning a small target if there is no swap. Otherwise, targets are not force-scanned even when their effective scan number is zero and the other conditions--kswapd/memcg--apply. This fixes 246e87a93934 ("memcg: fix get_scan_count() for small targets"). 
[akpm@linux-foundation.org: fix comment] Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Cc: Ying Han Cc: Balbir Singh Cc: KOSAKI Motohiro Cc: Daisuke Nishimura Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index b7719ec10dc5..e49bcb6d4948 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1808,23 +1808,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, u64 fraction[2], denominator; enum lru_list l; int noswap = 0; - int force_scan = 0; + bool force_scan = false; unsigned long nr_force_scan[2]; - - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); - - if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { - /* kswapd does zone balancing and need to scan this zone */ - if (scanning_global_lru(sc) && current_is_kswapd()) - force_scan = 1; - /* memcg may have small limit and need to avoid priority drop */ - if (!scanning_global_lru(sc)) - force_scan = 1; - } + /* kswapd does zone balancing and needs to scan this zone */ + if (scanning_global_lru(sc) && current_is_kswapd()) + force_scan = true; + /* memcg may have small limit and need to avoid priority drop */ + if (!scanning_global_lru(sc)) + force_scan = true; /* If we have no swap space, do not bother scanning anon pages. 
*/ if (!sc->may_swap || (nr_swap_pages <= 0)) { @@ -1837,6 +1829,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, goto out; } + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, -- cgit v1.2.3 From 185efc0f9a1f2d6ad6d4782c5d9e529f3290567f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 14 Sep 2011 16:21:58 -0700 Subject: memcg: Revert "memcg: add memory.vmscan_stat" Revert the post-3.0 commit 82f9d486e59f5 ("memcg: add memory.vmscan_stat"). The implementation of per-memcg reclaim statistics violates how memcg hierarchies usually behave: hierarchically. The reclaim statistics are accounted to child memcgs and the parent hitting the limit, but not to hierarchy levels in between. Usually, hierarchical statistics are perfectly recursive, with each level representing the sum of itself and all its children. Since this exports statistics to userspace, this may lead to confusion and problems with changing things after the release, so revert it now, we can try again later. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Michal Hocko Cc: Ying Han Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 39 +++++---------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index e49bcb6d4948..b55699cd9067 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -105,7 +105,6 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; - struct memcg_scanrecord *memcg_record; /* * Nodemask of nodes allowed by the caller. 
If NULL, all nodes @@ -1349,8 +1348,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1394,10 +1391,6 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; - if (!scanning_global_lru(sc)) { - sc->memcg_record->nr_scanned[0] += *nr_anon; - sc->memcg_record->nr_scanned[1] += *nr_file; - } } /* @@ -1511,9 +1504,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, nr_reclaimed += shrink_page_list(&page_list, zone, sc); } - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_freed[file] += nr_reclaimed; - local_irq_disable(); if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); @@ -1613,8 +1603,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[file] += nr_taken; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) @@ -1666,8 +1654,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * get_scan_ratio. 
*/ reclaim_stat->recent_rotated[file] += nr_rotated; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); @@ -2265,10 +2251,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - struct zone *zone, - struct memcg_scanrecord *rec, - unsigned long *scanned) + gfp_t gfp_mask, bool noswap, + struct zone *zone, + unsigned long *nr_scanned) { struct scan_control sc = { .nr_scanned = 0, @@ -2278,9 +2263,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_swap = !noswap, .order = 0, .mem_cgroup = mem, - .memcg_record = rec, }; - ktime_t start, end; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2289,7 +2272,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, sc.may_writepage, sc.gfp_mask); - start = ktime_get(); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -2298,25 +2280,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. 
*/ shrink_zone(0, zone, &sc); - end = ktime_get(); - - if (rec) - rec->elapsed += ktime_to_ns(ktime_sub(end, start)); - *scanned = sc.nr_scanned; trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, - bool noswap, - struct memcg_scanrecord *rec) + bool noswap) { struct zonelist *zonelist; unsigned long nr_reclaimed; - ktime_t start, end; int nid; struct scan_control sc = { .may_writepage = !laptop_mode, @@ -2325,7 +2301,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, .mem_cgroup = mem_cont, - .memcg_record = rec, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2334,7 +2309,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .gfp_mask = sc.gfp_mask, }; - start = ktime_get(); /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -2349,9 +2323,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); - end = ktime_get(); - if (rec) - rec->elapsed += ktime_to_ns(ktime_sub(end, start)); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); -- cgit v1.2.3