From 1bc36b6426ae49139e9f56491db76b95921454d7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 19 Oct 2011 11:44:41 +0200 Subject: writeback: Include all dirty inodes in background writeback Current livelock avoidance code makes background work to include only inodes that were dirtied before background writeback has started. However background writeback can be running for a long time and thus excluding newly dirtied inodes can eventually exclude significant portion of dirty inodes making background writeback inefficient. Since background writeback avoids livelocking the flusher thread by yielding to any other work, there is no real reason why background work should not include all dirty inodes so change the logic in wb_writeback(). Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 517f211a3bd..92d353e069d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -743,11 +743,17 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !over_bground_thresh(wb->bdi)) break; + /* + * Kupdate and background works are special and we want to + * include all inodes that need writing. Livelock avoidance is + * handled by these works yielding to any other work so we are + * safe. + */ if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - work->older_than_this = &oldest_jif; - } + } else if (work->for_background) + oldest_jif = jiffies; trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) -- cgit v1.2.3 From 54848d73f9f254631303d6eab9b976855988b266 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 5 Apr 2011 13:21:19 -0600 Subject: writeback: charge leaked page dirties to active tasks It's a years long problem that a large number of short-lived dirtiers (eg. gcc instances in a fast kernel build) may starve long-run dirtiers (eg. 
dd) as well as pushing the dirty pages to the global hard limit. The solution is to charge the pages dirtied by the exited gcc to the other random dirtying tasks. It sounds not perfect, however should behave good enough in practice, seeing as that throttled tasks aren't actually running so those that are running are more likely to pick it up and get throttled, therefore promoting an equal spread. Randy: fix compile error: 'dirty_throttle_leaks' undeclared in exit.c Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Randy Dunlap Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 2 ++ kernel/exit.c | 3 +++ mm/page-writeback.c | 27 +++++++++++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a378c295851..05eaf5e3aad 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -7,6 +7,8 @@ #include #include +DECLARE_PER_CPU(int, dirty_throttle_leaks); + /* * The 1/4 region under the global dirty thresh is for smooth dirty throttling: * diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f87..d4aac24cc46 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1037,6 +1038,8 @@ NORET_TYPE void do_exit(long code) validate_creds_for_do_exit(tsk); preempt_disable(); + if (tsk->nr_dirtied) + __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 50f08241f98..619c445fc03 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1214,6 +1214,22 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) static DEFINE_PER_CPU(int, bdp_ratelimits); +/* + * Normal tasks are throttled by + * loop { + * dirty tsk->nr_dirtied_pause pages; + * take a snap in balance_dirty_pages(); + * } + * However there is a worst case. 
If every task exit immediately when dirtied + * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be + * called to throttle the page dirties. The solution is to save the not yet + * throttled page dirties in dirty_throttle_leaks on task exit and charge them + * randomly into the running tasks. This works well for the above worst case, + * as the new task will pick up and accumulate the old task's leaked dirty + * count and eventually get throttled. + */ +DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -1261,6 +1277,17 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, ratelimit = 0; } } + /* + * Pick up the dirtied pages by the exited tasks. This avoids lots of + * short-lived tasks (eg. gcc invocations in a kernel build) escaping + * the dirty throttling and livelock other long-run dirtiers. + */ + p = &__get_cpu_var(dirty_throttle_leaks); + if (*p > 0 && current->nr_dirtied < ratelimit) { + nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); + *p -= nr_pages_dirtied; + current->nr_dirtied += nr_pages_dirtied; + } preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) -- cgit v1.2.3 From d3bc1fef9389e409a772ea174a5e41a6f93d9b7b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 14 Apr 2011 07:52:37 -0600 Subject: writeback: fix dirtied pages accounting on sub-page writes When dd in 512bytes, generic_perform_write() calls balance_dirty_pages_ratelimited() 8 times for the same page, but obviously the page is only dirtied once. Fix it by accounting tsk->nr_dirtied and bdp_ratelimits at page dirty time. 
Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 619c445fc03..5d1ef5d8613 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1258,8 +1258,6 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, if (bdi->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); - current->nr_dirtied += nr_pages_dirtied; - preempt_disable(); /* * This prevents one CPU to accumulate too many dirtied pages without @@ -1270,12 +1268,9 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, p = &__get_cpu_var(bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; - else { - *p += nr_pages_dirtied; - if (unlikely(*p >= ratelimit_pages)) { - *p = 0; - ratelimit = 0; - } + else if (unlikely(*p >= ratelimit_pages)) { + *p = 0; + ratelimit = 0; } /* * Pick up the dirtied pages by the exited tasks. This avoids lots of @@ -1768,6 +1763,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); + current->nr_dirtied++; + this_cpu_inc(bdp_ratelimits); } } EXPORT_SYMBOL(account_page_dirtied); -- cgit v1.2.3 From 2f800fbd777b792de54187088df19a7df0251254 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 8 Aug 2011 15:22:00 -0600 Subject: writeback: fix dirtied pages accounting on redirty De-account the accumulative dirty counters on page redirty. Page redirties (very common in ext4) will introduce mismatch between counters (a) and (b) a) NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied b) NR_WRITTEN, BDI_WRITTEN This will introduce systematic errors in balanced_rate and result in dirty page position errors (ie. 
the dirty pages are no longer balanced around the global/bdi setpoints). Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang --- include/linux/writeback.h | 2 ++ mm/page-writeback.c | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 05eaf5e3aad..b30419cd425 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -197,6 +197,8 @@ void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); +void account_page_redirty(struct page *page); + /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl read-only. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5d1ef5d8613..96b3e7aa705 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1824,6 +1824,24 @@ int __set_page_dirty_nobuffers(struct page *page) } EXPORT_SYMBOL(__set_page_dirty_nobuffers); +/* + * Call this whenever redirtying a page, to de-account the dirty counters + * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written + * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to + * systematic errors in balanced_dirty_ratelimit and the dirty pages position + * control. 
+ */ +void account_page_redirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + if (mapping && mapping_cap_account_dirty(mapping)) { + current->nr_dirtied--; + dec_zone_page_state(page, NR_DIRTIED); + dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + } +} +EXPORT_SYMBOL(account_page_redirty); + /* * When a writepage implementation decides that it doesn't want to write this * page for some reason, it should redirty the locked page via @@ -1832,6 +1850,7 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { wbc->pages_skipped++; + account_page_redirty(page); return __set_page_dirty_nobuffers(page); } EXPORT_SYMBOL(redirty_page_for_writepage); -- cgit v1.2.3 From 32c7f202a4801252a0f3578807b75a961f792870 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 8 Aug 2011 15:19:47 -0600 Subject: btrfs: fix dirtied pages accounting on sub-page writes When doing 1KB sequential writes to the same page, balance_dirty_pages_ratelimited_nr() should be called once instead of 4 times, the latter makes the dirtier tasks be throttled much too heavy. Fix it with proper de-accounting on clear_page_dirty_for_io(). 
CC: Chris Mason Signed-off-by: Wu Fengguang --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97fbe939c05..bfb620ead29 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1136,7 +1136,8 @@ again: GFP_NOFS); } for (i = 0; i < num_pages; i++) { - clear_page_dirty_for_io(pages[i]); + if (clear_page_dirty_for_io(pages[i])) + account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } -- cgit v1.2.3 From 83712358ba0a1497ce59a4f84ce4dd0f803fe6fc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 11 Jun 2011 19:25:42 -0600 Subject: writeback: dirty ratelimit - think time compensation Compensate the task's think time when computing the final pause time, so that ->dirty_ratelimit can be executed accurately. think time := time spent outside of balance_dirty_pages() In the rare case that the task slept longer than the 200ms period time (resulting in a negative pause time), the sleep time will be compensated in the following periods, too, if it's less than 1 second. Accumulated errors are carefully avoided as long as the max pause area is not hit.
Pseudo code: period = pages_dirtied / task_ratelimit; think = jiffies - dirty_paused_when; pause = period - think; 1) normal case: period > think pause = period - think dirty_paused_when = jiffies + pause nr_dirtied = 0 period time |===============================>| think time pause time |===============>|==============>| ------|----------------|---------------|------------------------ dirty_paused_when jiffies 2) no pause case: period <= think don't pause; reduce future pause time by: dirty_paused_when += period nr_dirtied = 0 period time |===============================>| think time |===================================================>| ------|--------------------------------+-------------------|---- dirty_paused_when jiffies Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang --- include/linux/sched.h | 1 + include/trace/events/writeback.h | 14 +++++++++++--- kernel/fork.c | 1 + mm/page-writeback.c | 36 ++++++++++++++++++++++++++++++++---- 4 files changed, 45 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c4f3e9b9bc..984c3b29597 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1527,6 +1527,7 @@ struct task_struct { */ int nr_dirtied; int nr_dirtied_pause; + unsigned long dirty_paused_when; /* start of a write-and-pause period */ #ifdef CONFIG_LATENCYTOP int latency_record_count; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 99d1d0decf8..8588a891802 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, + unsigned long period, long pause, unsigned long start_time), TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, - dirtied, pause, start_time), + dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) @@ 
-320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages, __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) + __field(unsigned long, period) + __field( long, think) ), TP_fast_assign( @@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; + __entry->think = current->dirty_paused_when == 0 ? 0 : + (long)(jiffies - current->dirty_paused_when) * 1000/HZ; + __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; ), @@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld", + "paused=%lu pause=%ld period=%lu think=%ld", __entry->bdi, __entry->limit, __entry->setpoint, @@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ - __entry->pause /* ms */ + __entry->pause, /* ms */ + __entry->period, /* ms */ + __entry->think /* ms */ ) ); diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d08..f8668cf6a32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); + p->dirty_paused_when = 0; /* * Ok, make it visible to the rest of the system. 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96b3e7aa705..49193215582 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + long period; long pause = 0; long uninitialized_var(max_pause); bool dirty_exceeded = false; @@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long start_time = jiffies; for (;;) { + unsigned long now = jiffies; + /* * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been @@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping, */ freerun = dirty_freerun_ceiling(dirty_thresh, background_thresh); - if (nr_dirty <= freerun) + if (nr_dirty <= freerun) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; break; + } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); @@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping, task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT; if (unlikely(task_ratelimit == 0)) { + period = max_pause; pause = max_pause; goto pause; } - pause = HZ * pages_dirtied / task_ratelimit; + period = HZ * pages_dirtied / task_ratelimit; + pause = period; + if (current->dirty_paused_when) + pause -= now - current->dirty_paused_when; + /* + * For less than 1s think time (ext3/4 may block the dirtier + * for up to 800ms from time to time on 1-HDD; so does xfs, + * however at much less frequency), try to compensate it in + * future periods by updating the virtual time; otherwise just + * do a reset, as it may be a light dirtier. 
+ */ if (unlikely(pause <= 0)) { trace_balance_dirty_pages(bdi, dirty_thresh, @@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping, dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); + if (pause < -HZ) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; + } else if (period) { + current->dirty_paused_when += period; + current->nr_dirtied = 0; + } pause = 1; /* avoid resetting nr_dirtied_pause below */ break; } @@ -1135,11 +1160,15 @@ pause: dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); + current->dirty_paused_when = now + pause; + current->nr_dirtied = 0; + /* * This is typically equal to (nr_dirty < dirty_thresh) and can * also keep "1000+ dd on a slow USB stick" under control. @@ -1167,11 +1196,10 @@ pause: if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; - current->nr_dirtied = 0; if (pause == 0) { /* in freerun area */ current->nr_dirtied_pause = dirty_poll_interval(nr_dirty, dirty_thresh); - } else if (pause <= max_pause / 4 && + } else if (period <= max_pause / 4 && pages_dirtied >= current->nr_dirtied_pause) { current->nr_dirtied_pause = clamp_val( dirty_ratelimit * (max_pause / 2) / HZ, -- cgit v1.2.3 From 7ccb9ad5364d6ac0c803096c67e76a7545cf7a77 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 30 Nov 2011 11:08:55 -0600 Subject: writeback: max, min and target dirty pause time Control the pause time and the call intervals to balance_dirty_pages() with three parameters: 1) max_pause, limited by bdi_dirty and MAX_PAUSE 2) the target pause time, grows with the number of dd tasks and is normally limited by max_pause/2 3) the minimal pause, set to half the target pause and is used to skip short sleeps and accumulate them into bigger ones The typical behaviors after patch: - if ever task_ratelimit is far below dirty_ratelimit, the pause time will remain constant at 
max_pause and nr_dirtied_pause will be fluctuating with task_ratelimit - in the normal cases, nr_dirtied_pause will remain stable (keep in the same pace with dirty_ratelimit) and the pause time will be fluctuating with task_ratelimit In summary, someone has to fluctuate with task_ratelimit, because task_ratelimit = nr_dirtied_pause / pause We normally prefer a stable nr_dirtied_pause, until reaching max_pause. The notable behavior changes are: - in stable workloads, there will no longer be sudden big trajectory switching of nr_dirtied_pause as concerned by Peter. It will be as smooth as dirty_ratelimit and changing proportionally with it (as always, assuming bdi bandwidth does not fluctuate across 2^N lines, otherwise nr_dirtied_pause will show up in 2+ parallel trajectories) - in the rare cases when something keeps task_ratelimit far below dirty_ratelimit, the smoothness can no longer be retained and nr_dirtied_pause will be "dancing" with task_ratelimit. This fixes a (not that destructive but still not good) bug that dirty_ratelimit gets brought down undesirably <= balanced_dirty_ratelimit is under estimated <= weakly executed task_ratelimit <= pause goes too large and gets trimmed down to max_pause <= nr_dirtied_pause (based on dirty_ratelimit) is set too large <= dirty_ratelimit being much larger than task_ratelimit - introduce min_pause to avoid small pause sleeps - when pause is trimmed down to max_pause, try to compensate it at the next pause time The "refactor" type of changes are: The max_pause equation is slightly transformed to make it slightly more efficient. We now scale target_pause by (N * 10ms) on 2^N concurrent tasks, which is effectively equal to the original scaling max_pause by (N * 20ms) because the original code does implicit target_pause ~= max_pause / 2. Based on the same implicit ratio, target_pause starts with 10ms on 1 dd. 
CC: Jan Kara CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 125 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 81 insertions(+), 44 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 49193215582..5830991f261 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -962,40 +962,81 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static unsigned long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static long bdi_max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) { - unsigned long bw = bdi->avg_write_bandwidth; - unsigned long hi = ilog2(bw); - unsigned long lo = ilog2(bdi->dirty_ratelimit); - unsigned long t; + long bw = bdi->avg_write_bandwidth; + long t; - /* target for 20ms max pause on 1-dd case */ - t = HZ / 50; + /* + * Limit pause time for small memory systems. If sleeping for too long + * time, a small pool of dirty/writeback pages may go empty and disk go + * idle. + * + * 8 serves as the safety ratio. + */ + t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); + t++; + + return min_t(long, t, MAX_PAUSE); +} + +static long bdi_min_pause(struct backing_dev_info *bdi, + long max_pause, + unsigned long task_ratelimit, + unsigned long dirty_ratelimit, + int *nr_dirtied_pause) +{ + long hi = ilog2(bdi->avg_write_bandwidth); + long lo = ilog2(bdi->dirty_ratelimit); + long t; /* target pause */ + long pause; /* estimated next pause */ + int pages; /* target nr_dirtied_pause */ + + /* target for 10ms pause on 1-dd case */ + t = max(1, HZ / 100); /* * Scale up pause time for concurrent dirtiers in order to reduce CPU * overheads. * - * (N * 20ms) on 2^N concurrent tasks. + * (N * 10ms) on 2^N concurrent tasks. */ if (hi > lo) - t += (hi - lo) * (20 * HZ) / 1024; + t += (hi - lo) * (10 * HZ) / 1024; /* - * Limit pause time for small memory systems. 
If sleeping for too long - * time, a small pool of dirty/writeback pages may go empty and disk go - * idle. + * This is a bit convoluted. We try to base the next nr_dirtied_pause + * on the much more stable dirty_ratelimit. However the next pause time + * will be computed based on task_ratelimit and the two rate limits may + * depart considerably at some time. Especially if task_ratelimit goes + * below dirty_ratelimit/2 and the target pause is max_pause, the next + * pause time will be max_pause*2 _trimmed down_ to max_pause. As a + * result task_ratelimit won't be executed faithfully, which could + * eventually bring down dirty_ratelimit. * - * 8 serves as the safety ratio. + * We apply two rules to fix it up: + * 1) try to estimate the next pause time and if necessary, use a lower + * nr_dirtied_pause so as not to exceed max_pause. When this happens, + * nr_dirtied_pause will be "dancing" with task_ratelimit. + * 2) limit the target pause time to max_pause/2, so that the normal + * small fluctuations of task_ratelimit won't trigger rule (1) and + * nr_dirtied_pause will remain as stable as dirty_ratelimit. */ - t = min(t, bdi_dirty * HZ / (8 * bw + 1)); + t = min(t, 1 + max_pause / 2); + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); + + pause = HZ * pages / (task_ratelimit + 1); + if (pause > max_pause) { + t = max_pause; + pages = task_ratelimit * t / roundup_pow_of_two(HZ); + } + *nr_dirtied_pause = pages; /* - * The pause time will be settled within range (max_pause/4, max_pause). - * Apply a minimal value of 4 to get a non-zero max_pause/4. + * The minimal pause time will normally be half the target pause time. 
*/ - return clamp_val(t, 4, MAX_PAUSE); + return 1 + t / 2; } /* @@ -1017,11 +1058,13 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long dirty_thresh; unsigned long bdi_thresh; long period; - long pause = 0; - long uninitialized_var(max_pause); + long pause; + long max_pause; + long min_pause; + int nr_dirtied_pause; bool dirty_exceeded = false; unsigned long task_ratelimit; - unsigned long uninitialized_var(dirty_ratelimit); + unsigned long dirty_ratelimit; unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long start_time = jiffies; @@ -1051,6 +1094,8 @@ static void balance_dirty_pages(struct address_space *mapping, if (nr_dirty <= freerun) { current->dirty_paused_when = now; current->nr_dirtied = 0; + current->nr_dirtied_pause = + dirty_poll_interval(nr_dirty, dirty_thresh); break; } @@ -1101,14 +1146,17 @@ static void balance_dirty_pages(struct address_space *mapping, nr_dirty, bdi_thresh, bdi_dirty, start_time); - max_pause = bdi_max_pause(bdi, bdi_dirty); - dirty_ratelimit = bdi->dirty_ratelimit; pos_ratio = bdi_position_ratio(bdi, dirty_thresh, background_thresh, nr_dirty, bdi_thresh, bdi_dirty); task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT; + max_pause = bdi_max_pause(bdi, bdi_dirty); + min_pause = bdi_min_pause(bdi, max_pause, + task_ratelimit, dirty_ratelimit, + &nr_dirtied_pause); + if (unlikely(task_ratelimit == 0)) { period = max_pause; pause = max_pause; @@ -1125,7 +1173,7 @@ static void balance_dirty_pages(struct address_space *mapping, * future periods by updating the virtual time; otherwise just * do a reset, as it may be a light dirtier. 
*/ - if (unlikely(pause <= 0)) { + if (pause < min_pause) { trace_balance_dirty_pages(bdi, dirty_thresh, background_thresh, @@ -1136,7 +1184,7 @@ static void balance_dirty_pages(struct address_space *mapping, task_ratelimit, pages_dirtied, period, - pause, + min(pause, 0L), start_time); if (pause < -HZ) { current->dirty_paused_when = now; @@ -1144,11 +1192,15 @@ static void balance_dirty_pages(struct address_space *mapping, } else if (period) { current->dirty_paused_when += period; current->nr_dirtied = 0; - } - pause = 1; /* avoid resetting nr_dirtied_pause below */ + } else if (current->nr_dirtied_pause <= pages_dirtied) + current->nr_dirtied_pause += pages_dirtied; break; } - pause = min(pause, max_pause); + if (unlikely(pause > max_pause)) { + /* for occasional dropped task_ratelimit */ + now += min(pause - max_pause, max_pause); + pause = max_pause; + } pause: trace_balance_dirty_pages(bdi, @@ -1168,6 +1220,7 @@ pause: current->dirty_paused_when = now + pause; current->nr_dirtied = 0; + current->nr_dirtied_pause = nr_dirtied_pause; /* * This is typically equal to (nr_dirty < dirty_thresh) and can @@ -1196,22 +1249,6 @@ pause: if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; - if (pause == 0) { /* in freerun area */ - current->nr_dirtied_pause = - dirty_poll_interval(nr_dirty, dirty_thresh); - } else if (period <= max_pause / 4 && - pages_dirtied >= current->nr_dirtied_pause) { - current->nr_dirtied_pause = clamp_val( - dirty_ratelimit * (max_pause / 2) / HZ, - pages_dirtied + pages_dirtied / 8, - pages_dirtied * 4); - } else if (pause >= max_pause) { - current->nr_dirtied_pause = 1 | clamp_val( - dirty_ratelimit * (max_pause / 2) / HZ, - pages_dirtied / 4, - pages_dirtied - pages_dirtied / 8); - } - if (writeback_in_progress(bdi)) return; -- cgit v1.2.3 From 5b9b357435a51ff14835c06d8b00765a4c68f313 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 6 Dec 2011 13:17:17 -0600 Subject: writeback: avoid tiny dirty poll intervals The LKP 
tests see big 56% regression for the case fio_mmap_randwrite_64k. Shaohua manages to root cause it to be the much smaller dirty pause times and hence much more frequent invocations to the IO-less balance_dirty_pages(). Since fio_mmap_randwrite_64k effectively contains both reads and writes, the more frequent pauses triggered more idling in the cfq IO scheduler. The solution is to increase pause time all the way up to the max 200ms in this case, which is found to restore most performance. This will help reduce CPU overheads in other cases, too. Note that I don't expect many performance critical workloads to run this access pattern: the mmap read-on-write is rather inefficient and could be avoided by doing normal writes syscalls. CC: Jan Kara CC: Peter Zijlstra Reported-by: Li Shaohua Tested-by: Li Shaohua Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5830991f261..422cf4edab4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -41,6 +41,12 @@ */ #define MAX_PAUSE max(HZ/5, 1) +/* + * Try to keep balance_dirty_pages() call intervals higher than this many pages + * by raising pause time to max_pause when falls below it. + */ +#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) + /* * Estimate write bandwidth at 200ms intervals. */ @@ -1026,6 +1032,23 @@ static long bdi_min_pause(struct backing_dev_info *bdi, t = min(t, 1 + max_pause / 2); pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); + /* + * Tiny nr_dirtied_pause is found to hurt I/O performance in the test + * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. + * When the 16 consecutive reads are often interrupted by some dirty + * throttling pause during the async writes, cfq will go into idles + * (deadline is fine). So push nr_dirtied_pause as high as possible + * until reaches DIRTY_POLL_THRESH=32 pages. 
+ */ + if (pages < DIRTY_POLL_THRESH) { + t = max_pause; + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); + if (pages > DIRTY_POLL_THRESH) { + pages = DIRTY_POLL_THRESH; + t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; + } + } + pause = HZ * pages / (task_ratelimit + 1); if (pause > max_pause) { t = max_pause; @@ -1036,7 +1059,7 @@ static long bdi_min_pause(struct backing_dev_info *bdi, /* * The minimal pause time will normally be half the target pause time. */ - return 1 + t / 2; + return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } /* -- cgit v1.2.3 From 82791940545be38810dfd5e03ee701e749f04aab Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 3 Dec 2011 21:26:01 -0600 Subject: writeback: do strict bdi dirty_exceeded This helps to reduce dirty throttling polls and hence CPU overheads. bdi->dirty_exceeded typically only helps when suddenly starting 100+ dd's on a disk, in which case the dd's may need to poll balance_dirty_pages() earlier than tsk->nr_dirtied_pause. CC: Jan Kara CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 422cf4edab4..936dc7b61dc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1160,7 +1160,7 @@ static void balance_dirty_pages(struct address_space *mapping, bdi_stat(bdi, BDI_WRITEBACK); } - dirty_exceeded = (bdi_dirty > bdi_thresh) || + dirty_exceeded = (bdi_dirty > bdi_thresh) && (nr_dirty > dirty_thresh); if (dirty_exceeded && !bdi->dirty_exceeded) bdi->dirty_exceeded = 1; -- cgit v1.2.3 From bdaac4902a8225bf247ecaeac46c4b2980cc70e5 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 3 Aug 2011 14:30:36 -0600 Subject: writeback: balanced_rate cannot exceed write bandwidth Add an upper limit to balanced_rate according to the below inequality. This filters out some rare but huge singular points, which at least enables more readable gnuplot figures. 
When there are N dd dirtiers, balanced_dirty_ratelimit = write_bw / N, so it holds that balanced_dirty_ratelimit <= write_bw. The singular points originate from dirty_rate in the below formula: balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate, where dirty_rate = (number of page dirties in the past 200ms) / 200ms. In the extreme case, if all dd tasks suddenly get blocked on something else and hence no pages are dirtied at all, dirty_rate will be 0 and balanced_dirty_ratelimit will be inf. This could happen in reality. Note that these huge singular points are not a real threat, since they are _guaranteed_ to be filtered out by the min(balanced_dirty_ratelimit, task_ratelimit) line in bdi_update_dirty_ratelimit(). task_ratelimit is based on the number of dirty pages, which will never _suddenly_ fly away like balanced_dirty_ratelimit. So any weirdly large balanced_dirty_ratelimit will be cut down to the level of task_ratelimit. There won't be tiny singular points though, as long as the dirty pages lie inside the dirty throttling region (above the freerun region). Because there the dd tasks will be throttled by balance_dirty_pages() and won't be able to suddenly dirty many more pages than average.
Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 936dc7b61dc..0ae2008eb54 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -822,6 +822,11 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, */ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, dirty_rate | 1); + /* + * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw + */ + if (unlikely(balanced_dirty_ratelimit > write_bw)) + balanced_dirty_ratelimit = write_bw; /* * We could safely do this and return immediately: -- cgit v1.2.3 From bc31b86a5923fad5f3fbb6192f767f410241ba27 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 7 Jan 2012 20:41:55 -0600 Subject: writeback: move MIN_WRITEBACK_PAGES to fs-writeback.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix compile error fs/fs-writeback.c:515:33: error: ‘PAGE_CACHE_SHIFT’ undeclared (first use in this function) Reported-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 6 ++++++ include/linux/writeback.h | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 92d353e069d..22e2d42742a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,11 @@ #include #include "internal.h" +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b30419cd425..4e0a5549302 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -25,11 +25,6 @@ DECLARE_PER_CPU(int, dirty_throttle_leaks); #define DIRTY_SCOPE 8 #define 
DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) -/* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) - struct backing_dev_info; /* -- cgit v1.2.3