aboutsummaryrefslogtreecommitdiff
path: root/mm/page-writeback.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--mm/page-writeback.c281
1 files changed, 157 insertions, 124 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 849d0ccbe91..e3bccac1f02 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
+#include <trace/events/writeback.h>
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -252,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
}
}
-/*
- * Clip the earned share of dirty pages to that which is actually available.
- * This avoids exceeding the total dirty_limit when the floating averages
- * fluctuate too quickly.
- */
-static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
- unsigned long dirty, unsigned long *pbdi_dirty)
-{
- unsigned long avail_dirty;
-
- avail_dirty = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_WRITEBACK) +
- global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK_TEMP);
-
- if (avail_dirty < dirty)
- avail_dirty = dirty - avail_dirty;
- else
- avail_dirty = 0;
-
- avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
- bdi_stat(bdi, BDI_WRITEBACK);
-
- *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
-}
-
static inline void task_dirties_fraction(struct task_struct *tsk,
long *numerator, long *denominator)
{
@@ -286,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
}
/*
- * scale the dirty limit
+ * task_dirty_limit - scale down dirty throttling threshold for one task
*
* task specific dirty limit:
*
* dirty -= (dirty/8) * p_{t}
+ *
+ * To protect light/slow dirtying tasks from heavier/fast ones, we start
+ * throttling individual tasks before reaching the bdi dirty limit.
+ * Relatively low thresholds will be allocated to heavy dirtiers. So when
+ * dirty pages grow large, heavy dirtiers will be throttled first, which will
+ * effectively curb the growth of dirty pages. Light dirtiers with high enough
+ * dirty threshold may never get throttled.
*/
-static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
+static unsigned long task_dirty_limit(struct task_struct *tsk,
+ unsigned long bdi_dirty)
{
long numerator, denominator;
- unsigned long dirty = *pdirty;
+ unsigned long dirty = bdi_dirty;
u64 inv = dirty >> 3;
task_dirties_fraction(tsk, &numerator, &denominator);
@@ -303,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
do_div(inv, denominator);
dirty -= inv;
- if (dirty < *pdirty/2)
- dirty = *pdirty/2;
- *pdirty = dirty;
+ return max(dirty, bdi_dirty/2);
}
/*
@@ -416,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
-void
-get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
- unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
+/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * runtime tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
unsigned long background;
unsigned long dirty;
@@ -450,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
}
*pbackground = background;
*pdirty = dirty;
+}
- if (bdi) {
- u64 bdi_dirty;
- long numerator, denominator;
+/*
+ * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ *
+ * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * - starving fast devices
+ * - piling up dirty pages (that will take long time to sync) on slow devices
+ *
+ * The bdi's share of dirty limit will be adapting to its throughput and
+ * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ */
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+{
+ u64 bdi_dirty;
+ long numerator, denominator;
- /*
- * Calculate this BDI's share of the dirty ratio.
- */
- bdi_writeout_fraction(bdi, &numerator, &denominator);
-
- bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
- bdi_dirty *= numerator;
- do_div(bdi_dirty, denominator);
- bdi_dirty += (dirty * bdi->min_ratio) / 100;
- if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
- bdi_dirty = dirty * bdi->max_ratio / 100;
-
- *pbdi_dirty = bdi_dirty;
- clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
- task_dirty_limit(current, pbdi_dirty);
- }
+ /*
+ * Calculate this BDI's share of the dirty ratio.
+ */
+ bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+ bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+ bdi_dirty *= numerator;
+ do_div(bdi_dirty, denominator);
+
+ bdi_dirty += (dirty * bdi->min_ratio) / 100;
+ if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+ bdi_dirty = dirty * bdi->max_ratio / 100;
+
+ return bdi_dirty;
}
/*
@@ -490,7 +488,7 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long bdi_thresh;
unsigned long pages_written = 0;
unsigned long pause = 1;
-
+ bool dirty_exceeded = false;
struct backing_dev_info *bdi = mapping->backing_dev_info;
for (;;) {
@@ -501,18 +499,11 @@ static void balance_dirty_pages(struct address_space *mapping,
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
-
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
nr_writeback = global_page_state(NR_WRITEBACK);
- bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
-
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
- break;
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Throttle it only when the background writeback cannot
@@ -523,24 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping,
(background_thresh + dirty_thresh) / 2)
break;
- if (!bdi->dirty_exceeded)
- bdi->dirty_exceeded = 1;
-
- /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- * Only move pages to writeback if this bdi is over its
- * threshold otherwise wait until the disk writes catch
- * up.
- */
- if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wb(&bdi->wb, &wbc);
- pages_written += write_chunk - wbc.nr_to_write;
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
- }
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+ bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -555,16 +530,45 @@ static void balance_dirty_pages(struct address_space *mapping,
if (bdi_thresh < 2*bdi_stat_error(bdi)) {
bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
- } else if (bdi_nr_reclaimable) {
+ } else {
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
}
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ /*
+ * The bdi thresh is somehow "soft" limit derived from the
+ * global "hard" limit. The former helps to prevent heavy IO
+ * bdi or process from holding back light ones; The latter is
+ * the last resort safeguard.
+ */
+ dirty_exceeded =
+ (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
+ || (nr_reclaimable + nr_writeback >= dirty_thresh);
+
+ if (!dirty_exceeded)
break;
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
+ if (!bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
+
+ /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+ * Unstable writes are a feature of certain networked
+ * filesystems (i.e. NFS) in which data may have been
+ * written to the server's write cache, but has not yet
+ * been flushed to permanent storage.
+ * Only move pages to writeback if this bdi is over its
+ * threshold otherwise wait until the disk writes catch
+ * up.
+ */
+ trace_wbc_balance_dirty_start(&wbc, bdi);
+ if (bdi_nr_reclaimable > bdi_thresh) {
+ writeback_inodes_wb(&bdi->wb, &wbc);
+ pages_written += write_chunk - wbc.nr_to_write;
+ trace_wbc_balance_dirty_written(&wbc, bdi);
+ if (pages_written >= write_chunk)
+ break; /* We've done our duty */
+ }
+ trace_wbc_balance_dirty_wait(&wbc, bdi);
__set_current_state(TASK_INTERRUPTIBLE);
io_schedule_timeout(pause);
@@ -577,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping,
pause = HZ / 10;
}
- if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
- bdi->dirty_exceeded)
+ if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -593,9 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping,
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS))
- > background_thresh)))
+ (!laptop_mode && (nr_reclaimable > background_thresh)))
bdi_start_background_writeback(bdi);
}
@@ -659,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
unsigned long dirty_thresh;
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
@@ -805,6 +806,42 @@ void __init page_writeback_init(void)
}
/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback. This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+/*
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ */
+void tag_pages_for_writeback(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+#define WRITEBACK_TAG_BATCH 4096
+ unsigned long tagged;
+
+ do {
+ spin_lock_irq(&mapping->tree_lock);
+ tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
+ &start, end, WRITEBACK_TAG_BATCH,
+ PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irq(&mapping->tree_lock);
+ WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
+ cond_resched();
+ /* We check 'start' to handle wrapping when end == ~0UL */
+ } while (tagged >= WRITEBACK_TAG_BATCH && start);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
+/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -818,6 +855,13 @@ void __init page_writeback_init(void)
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
+ *
+ * To avoid livelocks (when other process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +877,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
+ int tag;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
@@ -849,29 +894,19 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
-
- /*
- * If this is a data integrity sync, cap the writeback to the
- * current end of file. Any extension to the file that occurs
- * after this is a new write and we don't need to write those
- * pages out to fulfil our data integrity requirements. If we
- * try to write them out, we can get stuck in this scan until
- * the concurrent writer stops adding dirty pages and extending
- * EOF.
- */
- if (wbc->sync_mode == WB_SYNC_ALL &&
- wbc->range_end == LLONG_MAX) {
- end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
- }
}
-
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
retry:
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
@@ -929,6 +964,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
+ trace_wbc_writepage(wbc, mapping->backing_dev_info);
ret = (*writepage)(page, wbc, data);
if (unlikely(ret)) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -949,22 +985,16 @@ continue_unlock:
}
}
- if (wbc->nr_to_write > 0) {
- if (--wbc->nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
- done = 1;
- break;
- }
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
}
}
pagevec_release(&pvec);
@@ -1328,6 +1358,9 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);