aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/RCU/RTFP.txt2
-rw-r--r--Documentation/RCU/checklist.txt17
-rw-r--r--Documentation/RCU/listRCU.txt2
-rw-r--r--Documentation/RCU/rcuref.txt61
-rw-r--r--Documentation/RCU/trace.txt396
-rw-r--r--Documentation/RCU/whatisRCU.txt17
-rw-r--r--Documentation/memory-barriers.txt9
-rw-r--r--arch/um/drivers/mconsole_kern.c2
-rw-r--r--arch/x86/kernel/ptrace.c7
-rw-r--r--include/linux/rculist.h17
-rw-r--r--include/linux/rcupdate.h27
-rw-r--r--include/linux/sched.h10
-rw-r--r--include/linux/srcu.h34
-rw-r--r--init/Kconfig19
-rw-r--r--kernel/ksysfs.c18
-rw-r--r--kernel/rcu.h2
-rw-r--r--kernel/rcupdate.c3
-rw-r--r--kernel/rcutiny.c2
-rw-r--r--kernel/rcutiny_plugin.h5
-rw-r--r--kernel/rcutorture.c45
-rw-r--r--kernel/rcutree.c124
-rw-r--r--kernel/rcutree_plugin.h15
-rw-r--r--kernel/sched/core.c8
-rw-r--r--kernel/srcu.c16
-rw-r--r--lib/Kconfig.debug2
25 files changed, 557 insertions, 303 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 7c1dfb19fc4..7f40c72a9c5 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -186,7 +186,7 @@ Bibtex Entries
@article{Kung80
,author="H. T. Kung and Q. Lehman"
-,title="Concurrent Maintenance of Binary Search Trees"
+,title="Concurrent Manipulation of Binary Search Trees"
,Year="1980"
,Month="September"
,journal="ACM Transactions on Database Systems"
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index cdb20d41a44..31ef8fe07f8 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -271,15 +271,14 @@ over a rather long period of time, but improvements are always welcome!
The same cautions apply to call_rcu_bh() and call_rcu_sched().
9. All RCU list-traversal primitives, which include
- rcu_dereference(), list_for_each_entry_rcu(),
- list_for_each_continue_rcu(), and list_for_each_safe_rcu(),
- must be either within an RCU read-side critical section or
- must be protected by appropriate update-side locks. RCU
- read-side critical sections are delimited by rcu_read_lock()
- and rcu_read_unlock(), or by similar primitives such as
- rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case
- the matching rcu_dereference() primitive must be used in order
- to keep lockdep happy, in this case, rcu_dereference_bh().
+ rcu_dereference(), list_for_each_entry_rcu(), and
+ list_for_each_safe_rcu(), must be either within an RCU read-side
+ critical section or must be protected by appropriate update-side
+ locks. RCU read-side critical sections are delimited by
+ rcu_read_lock() and rcu_read_unlock(), or by similar primitives
+ such as rcu_read_lock_bh() and rcu_read_unlock_bh(), in which
+ case the matching rcu_dereference() primitive must be used in
+ order to keep lockdep happy, in this case, rcu_dereference_bh().
The reason that it is permissible to use RCU list-traversal
primitives when the update-side lock is held is that doing so
diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt
index 4349c1487e9..adb5a378284 100644
--- a/Documentation/RCU/listRCU.txt
+++ b/Documentation/RCU/listRCU.txt
@@ -205,7 +205,7 @@ RCU ("read-copy update") its name. The RCU code is as follows:
audit_copy_rule(&ne->rule, &e->rule);
ne->rule.action = newaction;
ne->rule.file_count = newfield_count;
- list_replace_rcu(e, ne);
+ list_replace_rcu(&e->list, &ne->list);
call_rcu(&e->rcu, audit_free_rule);
return 0;
}
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index 4202ad09313..141d531aa14 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -20,7 +20,7 @@ release_referenced() delete()
{ {
... write_lock(&list_lock);
atomic_dec(&el->rc, relfunc) ...
- ... delete_element
+ ... remove_element
} write_unlock(&list_lock);
...
if (atomic_dec_and_test(&el->rc))
@@ -52,7 +52,7 @@ release_referenced() delete()
{ {
... spin_lock(&list_lock);
if (atomic_dec_and_test(&el->rc)) ...
- call_rcu(&el->head, el_free); delete_element
+ call_rcu(&el->head, el_free); remove_element
... spin_unlock(&list_lock);
} ...
if (atomic_dec_and_test(&el->rc))
@@ -64,3 +64,60 @@ Sometimes, a reference to the element needs to be obtained in the
update (write) stream. In such cases, atomic_inc_not_zero() might be
overkill, since we hold the update-side spinlock. One might instead
use atomic_inc() in such cases.
+
+It is not always convenient to deal with "FAIL" in the
+search_and_reference() code path. In such cases, the
+atomic_dec_and_test() may be moved from delete() to el_free()
+as follows:
+
+1. 2.
+add() search_and_reference()
+{ {
+ alloc_object rcu_read_lock();
+ ... search_for_element
+ atomic_set(&el->rc, 1); atomic_inc(&el->rc);
+ spin_lock(&list_lock); ...
+
+ add_element rcu_read_unlock();
+ ... }
+ spin_unlock(&list_lock); 4.
+} delete()
+3. {
+release_referenced() spin_lock(&list_lock);
+{ ...
+ ... remove_element
+ if (atomic_dec_and_test(&el->rc)) spin_unlock(&list_lock);
+ kfree(el); ...
+ ... call_rcu(&el->head, el_free);
+} ...
+5. }
+void el_free(struct rcu_head *rhp)
+{
+ release_referenced();
+}
+
+The key point is that the initial reference added by add() is not removed
+until after a grace period has elapsed following removal. This means that
+search_and_reference() cannot find this element, which means that the value
+of el->rc cannot increase. Thus, once it reaches zero, there are no
+readers that can or ever will be able to reference the element. The
+element can therefore safely be freed. This in turn guarantees that if
+any reader finds the element, that reader may safely acquire a reference
+without checking the value of the reference counter.
+
+In cases where delete() can sleep, synchronize_rcu() can be called from
+delete(), so that el_free() can be subsumed into delete as follows:
+
+4.
+delete()
+{
+ spin_lock(&list_lock);
+ ...
+ remove_element
+ spin_unlock(&list_lock);
+ ...
+ synchronize_rcu();
+ if (atomic_dec_and_test(&el->rc))
+ kfree(el);
+ ...
+}
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 672d1908325..c776968f446 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -10,51 +10,63 @@ for rcutree and next for rcutiny.
CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
-These implementations of RCU provides several debugfs files under the
+These implementations of RCU provide several debugfs directories under the
top-level directory "rcu":
-rcu/rcudata:
+rcu/rcu_bh
+rcu/rcu_preempt
+rcu/rcu_sched
+
+Each directory contains files for the corresponding flavor of RCU.
+Note that rcu/rcu_preempt is only present for CONFIG_TREE_PREEMPT_RCU.
+For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor,
+so that activity for both appears in rcu/rcu_sched.
+
+In addition, the following file appears in the top-level directory:
+rcu/rcutorture. This file displays rcutorture test progress. The output
+of "cat rcu/rcutorture" looks as follows:
+
+rcutorture test sequence: 0 (test in progress)
+rcutorture update version number: 615
+
+The first line shows the number of rcutorture tests that have completed
+since boot. If a test is currently running, the "(test in progress)"
+string will appear as shown above. The second line shows the number of
+update cycles that the current test has started, or zero if there is
+no test in progress.
+
+
+Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly
+also rcu/rcu_preempt) the following files will be present:
+
+rcudata:
Displays fields in struct rcu_data.
-rcu/rcudata.csv:
- Comma-separated values spreadsheet version of rcudata.
-rcu/rcugp:
+rcuexp:
+ Displays statistics for expedited grace periods.
+rcugp:
Displays grace-period counters.
-rcu/rcuhier:
+rcuhier:
Displays the struct rcu_node hierarchy.
-rcu/rcu_pending:
+rcu_pending:
Displays counts of the reasons rcu_pending() decided that RCU had
work to do.
-rcu/rcutorture:
- Displays rcutorture test progress.
-rcu/rcuboost:
+rcuboost:
Displays RCU boosting statistics. Only present if
CONFIG_RCU_BOOST=y.
-The output of "cat rcu/rcudata" looks as follows:
-
-rcu_sched:
- 0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0
- 1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0
- 2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0
- 3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0
- 4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0
- 5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0
- 6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0
- 7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0
-rcu_bh:
- 0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0
- 1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0
- 2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0
- 3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0
- 4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0
- 5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0
- 6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0
- 7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0
-
-The first section lists the rcu_data structures for rcu_sched, the second
-for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an
-additional section for rcu_preempt. Each section has one line per CPU,
-or eight for this 8-CPU system. The fields are as follows:
+The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
+
+ 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
+ 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
+ 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
+ 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
+ 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
+ 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
+ 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
+ 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
+
+This file has one line per CPU, or eight for this 8-CPU system.
+The fields are as follows:
o The number at the beginning of each line is the CPU number.
CPUs numbers followed by an exclamation mark are offline,
@@ -64,11 +76,13 @@ o The number at the beginning of each line is the CPU number.
substantially larger than the number of actual CPUs.
o "c" is the count of grace periods that this CPU believes have
- completed. Offlined CPUs and CPUs in dynticks idle mode may
- lag quite a ways behind, for example, CPU 6 under "rcu_sched"
- above, which has been offline through not quite 40,000 RCU grace
- periods. It is not unusual to see CPUs lagging by thousands of
- grace periods.
+ completed. Offlined CPUs and CPUs in dynticks idle mode may lag
+ quite a ways behind, for example, CPU 4 under "rcu_sched" above,
+ which has been offline through 16 RCU grace periods. It is not
+ unusual to see offline CPUs lagging by thousands of grace periods.
+ Note that although the grace-period number is an unsigned long,
+ it is printed out as a signed long to allow more human-friendly
+ representation near boot time.
o "g" is the count of grace periods that this CPU believes have
started. Again, offlined CPUs and CPUs in dynticks idle mode
@@ -84,30 +98,25 @@ o "pq" indicates that this CPU has passed through a quiescent state
CPU has not yet reported that fact, (2) some other CPU has not
yet reported for this grace period, or (3) both.
-o "pgp" indicates which grace period the last-observed quiescent
- state for this CPU corresponds to. This is important for handling
- the race between CPU 0 reporting an extended dynticks-idle
- quiescent state for CPU 1 and CPU 1 suddenly waking up and
- reporting its own quiescent state. If CPU 1 was the last CPU
- for the current grace period, then the CPU that loses this race
- will attempt to incorrectly mark CPU 1 as having checked in for
- the next grace period!
-
o "qp" indicates that RCU still expects a quiescent state from
this CPU. Offlined CPUs and CPUs in dyntick idle mode might
well have qp=1, which is OK: RCU is still ignoring them.
o "dt" is the current value of the dyntick counter that is incremented
- when entering or leaving dynticks idle state, either by the
- scheduler or by irq. This number is even if the CPU is in
- dyntick idle mode and odd otherwise. The number after the first
- "/" is the interrupt nesting depth when in dyntick-idle state,
- or one greater than the interrupt-nesting depth otherwise.
- The number after the second "/" is the NMI nesting depth.
+ when entering or leaving idle, either due to a context switch or
+ due to an interrupt. This number is even if the CPU is in idle
+ from RCU's viewpoint and odd otherwise. The number after the
+ first "/" is the interrupt nesting depth when in idle state,
+ or a large number added to the interrupt-nesting depth when
+ running a non-idle task. Some architectures do not accurately
+ count interrupt nesting when running in non-idle kernel context,
+ which can result in interesting anomalies such as negative
+ interrupt-nesting levels. The number after the second "/"
+ is the NMI nesting depth.
o "df" is the number of times that some other CPU has forced a
quiescent state on behalf of this CPU due to this CPU being in
- dynticks-idle state.
+ idle state.
o "of" is the number of times that some other CPU has forced a
quiescent state on behalf of this CPU due to this CPU being
@@ -120,9 +129,13 @@ o "of" is the number of times that some other CPU has forced a
error, so it makes sense to err conservatively.
o "ql" is the number of RCU callbacks currently residing on
- this CPU. This is the total number of callbacks, regardless
- of what state they are in (new, waiting for grace period to
- start, waiting for grace period to end, ready to invoke).
+ this CPU. The first number is the number of "lazy" callbacks
+ that are known to RCU to only be freeing memory, and the number
+ after the "/" is the total number of callbacks, lazy or not.
+ These counters count callbacks regardless of what phase of
+ grace-period processing that they are in (new, waiting for
+ grace period to start, waiting for grace period to end, ready
+ to invoke).
o "qs" gives an indication of the state of the callback queue
with four characters:
@@ -150,6 +163,43 @@ o "qs" gives an indication of the state of the callback queue
If there are no callbacks in a given one of the above states,
the corresponding character is replaced by ".".
+o "b" is the batch limit for this CPU. If more than this number
+ of RCU callbacks is ready to invoke, then the remainder will
+ be deferred.
+
+o "ci" is the number of RCU callbacks that have been invoked for
+ this CPU. Note that ci+nci+ql is the number of callbacks that have
+ been registered in absence of CPU-hotplug activity.
+
+o "nci" is the number of RCU callbacks that have been offloaded from
+ this CPU. This will always be zero unless the kernel was built
+ with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot
+ parameter was specified.
+
+o "co" is the number of RCU callbacks that have been orphaned due to
+ this CPU going offline. These orphaned callbacks have been moved
+ to an arbitrarily chosen online CPU.
+
+o "ca" is the number of RCU callbacks that have been adopted by this
+ CPU due to other CPUs going offline. Note that ci+co-ca+ql is
+ the number of RCU callbacks registered on this CPU.
+
+
+Kernels compiled with CONFIG_RCU_BOOST=y display the following from
+/debug/rcu/rcu_preempt/rcudata:
+
+ 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
+ 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
+ 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
+ 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
+ 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
+ 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
+ 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
+ 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
+
+This is similar to the output discussed above, but contains the following
+additional fields:
+
o "kt" is the per-CPU kernel-thread state. The digit preceding
the first slash is zero if there is no work pending and 1
otherwise. The character between the first pair of slashes is
@@ -184,35 +234,51 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of
This field is displayed only for CONFIG_RCU_BOOST kernels.
-o "b" is the batch limit for this CPU. If more than this number
- of RCU callbacks is ready to invoke, then the remainder will
- be deferred.
-o "ci" is the number of RCU callbacks that have been invoked for
- this CPU. Note that ci+ql is the number of callbacks that have
- been registered in absence of CPU-hotplug activity.
+The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
-o "co" is the number of RCU callbacks that have been orphaned due to
- this CPU going offline. These orphaned callbacks have been moved
- to an arbitrarily chosen online CPU.
+s=21872 d=21872 w=0 tf=0 wd1=0 wd2=0 n=0 sc=21872 dt=21872 dl=0 dx=21872
+
+These fields are as follows:
+
+o "s" is the starting sequence number.
-o "ca" is the number of RCU callbacks that have been adopted due to
- other CPUs going offline. Note that ci+co-ca+ql is the number of
- RCU callbacks registered on this CPU.
+o "d" is the ending sequence number. When the starting and ending
+ numbers differ, there is an expedited grace period in progress.
-There is also an rcu/rcudata.csv file with the same information in
-comma-separated-variable spreadsheet format.
+o "w" is the number of times that the sequence numbers have been
+ in danger of wrapping.
+o "tf" is the number of times that contention has resulted in a
+ failure to begin an expedited grace period.
-The output of "cat rcu/rcugp" looks as follows:
+o "wd1" and "wd2" are the number of times that an attempt to
+ start an expedited grace period found that someone else had
+ completed an expedited grace period that satisfies the
+ attempted request. "Our work is done."
-rcu_sched: completed=33062 gpnum=33063
-rcu_bh: completed=464 gpnum=464
+o "n" is number of times that contention was so great that
+ the request was demoted from an expedited grace period to
+ a normal grace period.
+
+o "sc" is the number of times that the attempt to start a
+ new expedited grace period succeeded.
+
+o "dt" is the number of times that we attempted to update
+ the "d" counter.
+
+o "dl" is the number of times that we failed to update the "d"
+ counter.
+
+o "dx" is the number of times that we succeeded in updating
+ the "d" counter.
-Again, this output is for both "rcu_sched" and "rcu_bh". Note that
-kernels built with CONFIG_TREE_PREEMPT_RCU will have an additional
-"rcu_preempt" line. The fields are taken from the rcu_state structure,
-and are as follows:
+
+The output of "cat rcu/rcu_preempt/rcugp" looks as follows:
+
+completed=31249 gpnum=31250 age=1 max=18
+
+These fields are taken from the rcu_state structure, and are as follows:
o "completed" is the number of grace periods that have completed.
It is comparable to the "c" field from rcu/rcudata in that a
@@ -220,44 +286,42 @@ o "completed" is the number of grace periods that have completed.
that the corresponding RCU grace period has completed.
o "gpnum" is the number of grace periods that have started. It is
- comparable to the "g" field from rcu/rcudata in that a CPU
- whose "g" field matches the value of "gpnum" is aware that the
- corresponding RCU grace period has started.
+ similarly comparable to the "g" field from rcu/rcudata in that
+ a CPU whose "g" field matches the value of "gpnum" is aware that
+ the corresponding RCU grace period has started.
+
+ If these two fields are equal, then there is no grace period
+ in progress, in other words, RCU is idle. On the other hand,
+ if the two fields differ (as they are above), then an RCU grace
+ period is in progress.
- If these two fields are equal (as they are for "rcu_bh" above),
- then there is no grace period in progress, in other words, RCU
- is idle. On the other hand, if the two fields differ (as they
- do for "rcu_sched" above), then an RCU grace period is in progress.
+o "age" is the number of jiffies that the current grace period
+ has extended for, or zero if there is no grace period currently
+ in effect.
+o "max" is the age in jiffies of the longest-duration grace period
+ thus far.
-The output of "cat rcu/rcuhier" looks as follows, with very long lines:
+The output of "cat rcu/rcu_preempt/rcuhier" looks as follows:
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
-1/1 ..>. 0:127 ^0
-3/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3
-3/3f ..>. 0:5 ^0 2/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3
-rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
-0/1 ..>. 0:127 ^0
-0/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3
-0/3f ..>. 0:5 ^0 0/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3
+c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0
+3/3 ..>. 0:7 ^0
+e/e ..>. 0:3 ^0 d/d ..>. 4:7 ^1
-This is once again split into "rcu_sched" and "rcu_bh" portions,
-and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional
-"rcu_preempt" section. The fields are as follows:
+The fields are as follows:
-o "c" is exactly the same as "completed" under rcu/rcugp.
+o "c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp.
-o "g" is exactly the same as "gpnum" under rcu/rcugp.
+o "g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp.
-o "s" is the "signaled" state that drives force_quiescent_state()'s
+o "s" is the current state of the force_quiescent_state()
state machine.
o "jfq" is the number of jiffies remaining for this grace period
before force_quiescent_state() is invoked to help push things
- along. Note that CPUs in dyntick-idle mode throughout the grace
- period will not report on their own, but rather must be check by
- some other CPU via force_quiescent_state().
+ along. Note that CPUs in idle mode throughout the grace period
+ will not report on their own, but rather must be check by some
+ other CPU via force_quiescent_state().
o "j" is the low-order four hex digits of the jiffies counter.
Yes, Paul did run into a number of problems that turned out to
@@ -268,7 +332,8 @@ o "nfqs" is the number of calls to force_quiescent_state() since
o "nfqsng" is the number of useless calls to force_quiescent_state(),
where there wasn't actually a grace period active. This can
- happen due to races. The number in parentheses is the difference
+ no longer happen due to grace-period processing being pushed
+ into a kthread. The number in parentheses is the difference
between "nfqs" and "nfqsng", or the number of times that
force_quiescent_state() actually did some real work.
@@ -276,28 +341,27 @@ o "fqlh" is the number of calls to force_quiescent_state() that
exited immediately (without even being counted in nfqs above)
due to contention on ->fqslock.
-o Each element of the form "1/1 0:127 ^0" represents one struct
- rcu_node. Each line represents one level of the hierarchy, from
- root to leaves. It is best to think of the rcu_data structures
- as forming yet another level after the leaves. Note that there
- might be either one, two, or three levels of rcu_node structures,
- depending on the relationship between CONFIG_RCU_FANOUT and
- CONFIG_NR_CPUS.
+o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
+ structure. Each line represents one level of the hierarchy,
+ from root to leaves. It is best to think of the rcu_data
+ structures as forming yet another level after the leaves.
+ Note that there might be either one, two, three, or even four
+ levels of rcu_node structures, depending on the relationship
+ between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly
+ adjusted using the rcu_fanout_leaf kernel boot parameter), and
+ CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of
+ possible CPUs for the booting hardware).
o The numbers separated by the "/" are the qsmask followed
by the qsmaskinit. The qsmask will have one bit
- set for each entity in the next lower level that
- has not yet checked in for the current grace period.
+ set for each entity in the next lower level that has
+ not yet checked in for the current grace period ("e"
+ indicating CPUs 5, 6, and 7 in the example above).
The qsmaskinit will have one bit for each entity that is
currently expected to check in during each grace period.
The value of qsmaskinit is assigned to that of qsmask
at the beginning of each grace period.
- For example, for "rcu_sched", the qsmask of the first
- entry of the lowest level is 0x14, meaning that we
- are still waiting for CPUs 2 and 4 to check in for the
- current grace period.
-
o The characters separated by the ">" indicate the state
of the blocked-tasks lists. A "G" preceding the ">"
indicates that at least one task blocked in an RCU
@@ -312,48 +376,39 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
A "." character appears if the corresponding condition
does not hold, so that "..>." indicates that no tasks
are blocked. In contrast, "GE>T" indicates maximal
- inconvenience from blocked tasks.
+ inconvenience from blocked tasks. CONFIG_TREE_RCU
+ builds of the kernel will always show "..>.".
o The numbers separated by the ":" are the range of CPUs
served by this struct rcu_node. This can be helpful
in working out how the hierarchy is wired together.
- For example, the first entry at the lowest level shows
- "0:5", indicating that it covers CPUs 0 through 5.
+ For example, the example rcu_node structure shown above
+ has "0:7", indicating that it covers CPUs 0 through 7.
o The number after the "^" indicates the bit in the
- next higher level rcu_node structure that this
- rcu_node structure corresponds to.
-
- For example, the first entry at the lowest level shows
- "^0", indicating that it corresponds to bit zero in
- the first entry at the middle level.
-
-
-The output of "cat rcu/rcu_pending" looks as follows:
-
-rcu_sched:
- 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nn=146741
- 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nn=155792
- 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nn=136629
- 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nn=137723
- 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nn=123110
- 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nn=137456
- 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nn=120834
- 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nn=144888
-rcu_bh:
- 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nn=145314
- 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nn=143180
- 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nn=117936
- 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nn=134863
- 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nn=110671
- 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nn=133235
- 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nn=110921
- 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nn=118542
-
-As always, this is once again split into "rcu_sched" and "rcu_bh"
-portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
-"rcu_preempt" section. The fields are as follows:
+ next higher level rcu_node structure that this rcu_node
+ structure corresponds to. For example, the "d/d ..>. 4:7
+ ^1" has a "1" in this position, indicating that it
+ corresponds to the "1" bit in the "3" shown in the
+ "3/3 ..>. 0:7 ^0" entry on the next level up.
+
+
+The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
+
+ 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903
+ 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113
+ 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889
+ 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469
+ 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042
+ 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422
+ 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699
+ 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147
+
+The fields are as follows:
+
+o The leading number is the CPU number, with "!" indicating
+ an offline CPU.
o "np" is the number of times that __rcu_pending() has been invoked
for the corresponding flavor of RCU.
@@ -377,38 +432,23 @@ o "gpc" is the number of times that an old grace period had
o "gps" is the number of times that a new grace period had started,
but this CPU was not yet aware of it.
-o "nn" is the number of times that this CPU needed nothing. Alert
- readers will note that the rcu "nn" number for a given CPU very
- closely matches the rcu_bh "np" number for that same CPU. This
- is due to short-circuit evaluation in rcu_pending().
-
-
-The output of "cat rcu/rcutorture" looks as follows:
-
-rcutorture test sequence: 0 (test in progress)
-rcutorture update version number: 615
-
-The first line shows the number of rcutorture tests that have completed
-since boot. If a test is currently running, the "(test in progress)"
-string will appear as shown above. The second line shows the number of
-update cycles that the current test has started, or zero if there is
-no test in progress.
+o "nn" is the number of times that this CPU needed nothing.
The output of "cat rcu/rcuboost" looks as follows:
-0:5 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f
- balk: nt=0 egt=989 bt=0 nb=0 ny=0 nos=16
-6:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f
- balk: nt=0 egt=225 bt=0 nb=0 ny=0 nos=6
+0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
+ balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0
+4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
+ balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0
This information is output only for rcu_preempt. Each two-line entry
corresponds to a leaf rcu_node strcuture. The fields are as follows:
o "n:m" is the CPU-number range for the corresponding two-line
entry. In the sample output above, the first entry covers
- CPUs zero through five and the second entry covers CPUs 6
- and 7.
+ CPUs zero through three and the second entry covers CPUs four
+ through seven.
o "tasks=TNEB" gives the state of the various segments of the
rnp->blocked_tasks list:
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index bf0f6de2aa0..0cc7820967f 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -499,6 +499,8 @@ The foo_reclaim() function might appear as follows:
{
struct foo *fp = container_of(rp, struct foo, rcu);
+ foo_cleanup(fp->a);
+
kfree(fp);
}
@@ -521,6 +523,12 @@ o Use call_rcu() -after- removing a data element from an
read-side critical sections that might be referencing that
data item.
+If the callback for call_rcu() is not doing anything more than calling
+kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
+to avoid having to write your own callback:
+
+ kfree_rcu(old_fp, rcu);
+
Again, see checklist.txt for additional rules governing the use of RCU.
@@ -773,8 +781,8 @@ a single atomic update, converting to RCU will require special care.
Also, the presence of synchronize_rcu() means that the RCU version of
delete() can now block. If this is a problem, there is a callback-based
-mechanism that never blocks, namely call_rcu(), that can be used in
-place of synchronize_rcu().
+mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
+be used in place of synchronize_rcu().
7. FULL LIST OF RCU APIs
@@ -789,9 +797,7 @@ RCU list traversal:
list_for_each_entry_rcu
hlist_for_each_entry_rcu
hlist_nulls_for_each_entry_rcu
-
- list_for_each_continue_rcu (to be deprecated in favor of new
- list_for_each_entry_continue_rcu)
+ list_for_each_entry_continue_rcu
RCU pointer/list update:
@@ -813,6 +819,7 @@ RCU: Critical sections Grace period Barrier
rcu_read_unlock synchronize_rcu
rcu_dereference synchronize_rcu_expedited
call_rcu
+ kfree_rcu
bh: Critical sections Grace period Barrier
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 2759f7c188f..3c4e1b3b80a 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -251,12 +251,13 @@ And there are a number of things that _must_ or _must_not_ be assumed:
And for:
- *A = X; Y = *A;
+ *A = X; *(A + 4) = Y;
- we may get either of:
+ we may get any of:
- STORE *A = X; Y = LOAD *A;
- STORE *A = Y = X;
+ STORE *A = X; STORE *(A + 4) = Y;
+ STORE *(A + 4) = Y; STORE *A = X;
+ STORE {*A, *(A + 4) } = {X, Y};
=========================
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 79ccfe6c707..49e3b49e552 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -648,7 +648,7 @@ static void stack_proc(void *arg)
struct task_struct *from = current, *to = arg;
to->thread.saved_task = from;
- rcu_switch(from, to);
+ rcu_user_hooks_switch(from, to);
switch_to(from, to, from);
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index b00b33a1839..eff5b8c6865 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1511,6 +1511,13 @@ void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
+ /*
+ * We may come here right after calling schedule_user()
+ * or do_notify_resume(), in which case we can be in RCU
+ * user mode.
+ */
+ rcu_user_exit();
+
audit_syscall_exit(regs);
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e0f0fab2041..c92dd28eaa6 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -286,23 +286,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
&pos->member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
-
-/**
- * list_for_each_continue_rcu
- * @pos: the &struct list_head to use as a loop cursor.
- * @head: the head for your list.
- *
- * Iterate over an rcu-protected list, continuing after current point.
- *
- * This list-traversal primitive may safely run concurrently with
- * the _rcu list-mutation primitives such as list_add_rcu()
- * as long as the traversal is guarded by rcu_read_lock().
- */
-#define list_for_each_continue_rcu(pos, head) \
- for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \
- (pos) != (head); \
- (pos) = rcu_dereference_raw(list_next_rcu(pos)))
-
/**
* list_for_each_entry_continue_rcu - continue iteration over list of given type
* @pos: the type * to use as a loop cursor.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 7c968e4f929..8fe7c1840d3 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -90,6 +90,25 @@ extern void do_trace_rcu_torture_read(char *rcutorturename,
* that started after call_rcu() was invoked. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section. On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu(). It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section. Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
*/
extern void call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *head));
@@ -118,6 +137,9 @@ extern void call_rcu(struct rcu_head *head,
* OR
* - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
* These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
extern void call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *head));
@@ -137,6 +159,9 @@ extern void call_rcu_bh(struct rcu_head *head,
* OR
* anything that disables preemption.
* These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
extern void call_rcu_sched(struct rcu_head *head,
void (*func)(struct rcu_head *rcu));
@@ -204,6 +229,8 @@ static inline void rcu_user_enter(void) { }
static inline void rcu_user_exit(void) { }
static inline void rcu_user_enter_after_irq(void) { }
static inline void rcu_user_exit_after_irq(void) { }
+static inline void rcu_user_hooks_switch(struct task_struct *prev,
+ struct task_struct *next) { }
#endif /* CONFIG_RCU_USER_QS */
extern void exit_rcu(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2..530c52ef873 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -109,6 +109,8 @@ extern void update_cpu_load_nohz(void);
extern unsigned long get_parent_ip(unsigned long addr);
+extern void dump_cpu_task(int cpu);
+
struct seq_file;
struct cfs_rq;
struct task_group;
@@ -1844,14 +1846,6 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif
-static inline void rcu_switch(struct task_struct *prev,
- struct task_struct *next)
-{
-#ifdef CONFIG_RCU_USER_QS
- rcu_user_hooks_switch(prev, next);
-#endif
-}
-
static inline void tsk_restore_flags(struct task_struct *task,
unsigned long orig_flags, unsigned long flags)
{
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 55a5c52cbb2..6eb691b0835 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -16,8 +16,10 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
*
* Author: Paul McKenney <paulmck@us.ibm.com>
+ * Lai Jiangshan <laijs@cn.fujitsu.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU/ *.txt
@@ -40,6 +42,8 @@ struct rcu_batch {
struct rcu_head *head, **tail;
};
+#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
+
struct srcu_struct {
unsigned completed;
struct srcu_struct_array __percpu *per_cpu_ref;
@@ -70,12 +74,42 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
__init_srcu_struct((sp), #sp, &__srcu_key); \
})
+#define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name },
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
int init_srcu_struct(struct srcu_struct *sp);
+#define __SRCU_DEP_MAP_INIT(srcu_name)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+void process_srcu(struct work_struct *work);
+
+#define __SRCU_STRUCT_INIT(name) \
+ { \
+ .completed = -300, \
+ .per_cpu_ref = &name##_srcu_array, \
+ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
+ .running = false, \
+ .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
+ .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
+ .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
+ .batch_done = RCU_BATCH_INIT(name.batch_done), \
+ .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
+ __SRCU_DEP_MAP_INIT(name) \
+ }
+
+/*
+ * define and init a srcu struct at build time.
+ * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it.
+ */
+#define DEFINE_SRCU(name) \
+ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
+ struct srcu_struct name = __SRCU_STRUCT_INIT(name);
+
+#define DEFINE_STATIC_SRCU(name) \
+ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
+ static struct srcu_struct name = __SRCU_STRUCT_INIT(name);
+
/**
* call_srcu() - Queue a callback for invocation after an SRCU grace period
* @sp: srcu_struct in queue the callback
diff --git a/init/Kconfig b/init/Kconfig
index 6fdd6e33932..ec62139207d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -494,11 +494,11 @@ config RCU_USER_QS
puts RCU in extended quiescent state when the CPU runs in
userspace. It means that when a CPU runs in userspace, it is
excluded from the global RCU state machine and thus doesn't
- to keep the timer tick on for RCU.
+ try to keep the timer tick on for RCU.
Unless you want to hack and help the development of the full
- tickless feature, you shouldn't enable this option. It adds
- unnecessary overhead.
+ tickless feature, you shouldn't enable this option. It also
+ adds unnecessary overhead.
If unsure say N
@@ -582,14 +582,13 @@ config RCU_FAST_NO_HZ
depends on NO_HZ && SMP
default n
help
- This option causes RCU to attempt to accelerate grace periods
- in order to allow CPUs to enter dynticks-idle state more
- quickly. On the other hand, this option increases the overhead
- of the dynticks-idle checking, particularly on systems with
- large numbers of CPUs.
+ This option causes RCU to attempt to accelerate grace periods in
+ order to allow CPUs to enter dynticks-idle state more quickly.
+ On the other hand, this option increases the overhead of the
+ dynticks-idle checking, thus degrading scheduling latency.
- Say Y if energy efficiency is critically important, particularly
- if you have relatively few CPUs.
+ Say Y if energy efficiency is critically important, and you don't
+ care about real-time response.
Say N if you are unsure.
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 4e316e1acf5..8715a798aa7 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -141,6 +141,23 @@ static ssize_t fscaps_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(fscaps);
+int rcu_expedited;
+static ssize_t rcu_expedited_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", rcu_expedited);
+}
+static ssize_t rcu_expedited_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (kstrtoint(buf, 0, &rcu_expedited))
+ return -EINVAL;
+
+ return count;
+}
+KERNEL_ATTR_RW(rcu_expedited);
+
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
@@ -182,6 +199,7 @@ static struct attribute * kernel_attrs[] = {
&kexec_crash_size_attr.attr,
&vmcoreinfo_attr.attr,
#endif
+ &rcu_expedited_attr.attr,
NULL
};
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 8ba99cdc651..20dfba576c2 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
}
}
+extern int rcu_expedited;
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 29ca1c6da59..a2cf76177b4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,12 +46,15 @@
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
+#include <linux/module.h>
#define CREATE_TRACE_POINTS
#include <trace/events/rcu.h>
#include "rcu.h"
+module_param(rcu_expedited, int, 0);
+
#ifdef CONFIG_PREEMPT_RCU
/*
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e4c6a598d6f..e7dce58f9c2 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -195,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
*/
int rcu_is_cpu_rrupt_from_idle(void)
{
- return rcu_dynticks_nesting <= 0;
+ return rcu_dynticks_nesting <= 1;
}
/*
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3d019028220..f85016a2309 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -706,7 +706,10 @@ void synchronize_rcu(void)
return;
/* Once we get past the fastpath checks, same code as rcu_barrier(). */
- rcu_barrier();
+ if (rcu_expedited)
+ synchronize_rcu_expedited();
+ else
+ rcu_barrier();
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9900f560f1b..31dea01c85f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -339,7 +339,6 @@ rcu_stutter_wait(char *title)
struct rcu_torture_ops {
void (*init)(void);
- void (*cleanup)(void);
int (*readlock)(void);
void (*read_delay)(struct rcu_random_state *rrsp);
void (*readunlock)(int idx);
@@ -431,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
static struct rcu_torture_ops rcu_ops = {
.init = NULL,
- .cleanup = NULL,
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
@@ -475,7 +473,6 @@ static void rcu_sync_torture_init(void)
static struct rcu_torture_ops rcu_sync_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
@@ -493,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = {
static struct rcu_torture_ops rcu_expedited_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
@@ -536,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
static struct rcu_torture_ops rcu_bh_ops = {
.init = NULL,
- .cleanup = NULL,
.readlock = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
@@ -553,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
static struct rcu_torture_ops rcu_bh_sync_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
@@ -570,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
static struct rcu_torture_ops rcu_bh_expedited_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
@@ -589,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
* Definitions for srcu torture testing.
*/
-static struct srcu_struct srcu_ctl;
-
-static void srcu_torture_init(void)
-{
- init_srcu_struct(&srcu_ctl);
- rcu_sync_torture_init();
-}
-
-static void srcu_torture_cleanup(void)
-{
- synchronize_srcu(&srcu_ctl);
- cleanup_srcu_struct(&srcu_ctl);
-}
+DEFINE_STATIC_SRCU(srcu_ctl);
static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
{
@@ -672,8 +653,7 @@ static int srcu_torture_stats(char *page)
}
static struct rcu_torture_ops srcu_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
+ .init = rcu_sync_torture_init,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
@@ -687,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = {
};
static struct rcu_torture_ops srcu_sync_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
+ .init = rcu_sync_torture_init,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
@@ -712,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
}
static struct rcu_torture_ops srcu_raw_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
+ .init = rcu_sync_torture_init,
.readlock = srcu_torture_read_lock_raw,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock_raw,
@@ -727,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = {
};
static struct rcu_torture_ops srcu_raw_sync_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
+ .init = rcu_sync_torture_init,
.readlock = srcu_torture_read_lock_raw,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock_raw,
@@ -747,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void)
}
static struct rcu_torture_ops srcu_expedited_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
+ .init = rcu_sync_torture_init,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
@@ -783,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
static struct rcu_torture_ops sched_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = sched_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = sched_torture_read_unlock,
@@ -799,7 +774,6 @@ static struct rcu_torture_ops sched_ops = {
static struct rcu_torture_ops sched_sync_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = sched_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = sched_torture_read_unlock,
@@ -814,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = {
static struct rcu_torture_ops sched_expedited_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = sched_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = sched_torture_read_unlock,
@@ -1396,12 +1369,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
"test_boost_duration=%d shutdown_secs=%d "
+ "stall_cpu=%d stall_cpu_holdoff=%d "
+ "n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
test_boost_interval, test_boost_duration, shutdown_secs,
+ stall_cpu, stall_cpu_holdoff,
+ n_barrier_cbs,
onoff_interval, onoff_holdoff);
}
@@ -1943,8 +1920,6 @@ rcu_torture_cleanup(void)
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
- if (cur_ops->cleanup)
- cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
else if (n_online_successes != n_online_attempts ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 8ed9c481db0..5ffadcc3bb2 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -212,13 +212,13 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
#endif
};
-static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
-static int qhimark = 10000; /* If this many pending, ignore blimit. */
-static int qlowmark = 100; /* Once only this many pending, use blimit. */
+static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
+static long qhimark = 10000; /* If this many pending, ignore blimit. */
+static long qlowmark = 100; /* Once only this many pending, use blimit. */
-module_param(blimit, int, 0444);
-module_param(qhimark, int, 0444);
-module_param(qlowmark, int, 0444);
+module_param(blimit, long, 0444);
+module_param(qhimark, long, 0444);
+module_param(qlowmark, long, 0444);
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
@@ -313,7 +313,7 @@ static int
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
return *rdp->nxttail[RCU_DONE_TAIL +
- ACCESS_ONCE(rsp->completed) != rdp->completed] &&
+ (ACCESS_ONCE(rsp->completed) != rdp->completed)] &&
!rcu_gp_in_progress(rsp);
}
@@ -873,6 +873,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
}
+/*
+ * Dump stacks of all tasks running on stalled CPUs. This is a fallback
+ * for architectures that do not implement trigger_all_cpu_backtrace().
+ * The NMI-triggered stack traces are more accurate because they are
+ * printed by the target CPU.
+ */
+static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (rnp->qsmask != 0) {
+ for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
+ if (rnp->qsmask & (1UL << cpu))
+ dump_cpu_task(rnp->grplo + cpu);
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+}
+
static void print_other_cpu_stall(struct rcu_state *rsp)
{
int cpu;
@@ -880,6 +903,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
unsigned long flags;
int ndetected = 0;
struct rcu_node *rnp = rcu_get_root(rsp);
+ long totqlen = 0;
/* Only let one CPU complain about others per time interval. */
@@ -924,12 +948,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
print_cpu_stall_info_end();
- printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n",
- smp_processor_id(), (long)(jiffies - rsp->gp_start));
+ for_each_possible_cpu(cpu)
+ totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
+ smp_processor_id(), (long)(jiffies - rsp->gp_start),
+ rsp->gpnum, rsp->completed, totqlen);
if (ndetected == 0)
printk(KERN_ERR "INFO: Stall ended before state dump start\n");
else if (!trigger_all_cpu_backtrace())
- dump_stack();
+ rcu_dump_cpu_stacks(rsp);
/* Complain about tasks blocking the grace period. */
@@ -940,8 +967,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
static void print_cpu_stall(struct rcu_state *rsp)
{
+ int cpu;
unsigned long flags;
struct rcu_node *rnp = rcu_get_root(rsp);
+ long totqlen = 0;
/*
* OK, time to rat on ourselves...
@@ -952,7 +981,10 @@ static void print_cpu_stall(struct rcu_state *rsp)
print_cpu_stall_info_begin();
print_cpu_stall_info(rsp, smp_processor_id());
print_cpu_stall_info_end();
- printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start);
+ for_each_possible_cpu(cpu)
+ totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
+ jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
if (!trigger_all_cpu_backtrace())
dump_stack();
@@ -1404,15 +1436,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
!cpu_needs_another_gp(rsp, rdp)) {
/*
* Either we have not yet spawned the grace-period
- * task or this CPU does not need another grace period.
+ * task, this CPU does not need another grace period,
+ * or a grace period is already in progress.
* Either way, don't start a new grace period.
*/
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
+ /*
+ * Because there is no grace period in progress right now,
+ * any callbacks we have up to this point will be satisfied
+ * by the next grace period. So promote all callbacks to be
+ * handled after the end of the next grace period. If the
+ * CPU is not yet aware of the end of the previous grace period,
+ * we need to allow for the callback advancement that will
+ * occur when it does become aware. Deadlock prevents us from
+ * making it aware at this point: We cannot acquire a leaf
+ * rcu_node ->lock while holding the root rcu_node ->lock.
+ */
+ rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ if (rdp->completed == rsp->completed)
+ rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
rsp->gp_flags = RCU_GP_FLAG_INIT;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+
+ /* Ensure that CPU is aware of completion of last grace period. */
+ rcu_process_gp_end(rsp, rdp);
+ local_irq_restore(flags);
+
+ /* Wake up rcu_gp_kthread() to start the grace period. */
wake_up(&rsp->gp_wq);
}
@@ -1769,7 +1823,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
struct rcu_head *next, *list, **tail;
- int bl, count, count_lazy, i;
+ long bl, count, count_lazy;
+ int i;
/* If no callbacks are ready, just return.*/
if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -2205,10 +2260,28 @@ static inline int rcu_blocking_is_gp(void)
* rcu_read_lock_sched().
*
* This means that all preempt_disable code sequences, including NMI and
- * hardware-interrupt handlers, in progress on entry will have completed
- * before this primitive returns. However, this does not guarantee that
- * softirq handlers will have completed, since in some kernels, these
- * handlers can run in process context, and can block.
+ * non-threaded hardware-interrupt handlers, in progress on entry will
+ * have completed before this primitive returns. However, this does not
+ * guarantee that softirq handlers will have completed, since in some
+ * kernels, these handlers can run in process context, and can block.
+ *
+ * Note that this guarantee implies further memory-ordering guarantees.
+ * On systems with more than one CPU, when synchronize_sched() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since the
+ * end of its last RCU-sched read-side critical section whose beginning
+ * preceded the call to synchronize_sched(). In addition, each CPU having
+ * an RCU read-side critical section that extends beyond the return from
+ * synchronize_sched() is guaranteed to have executed a full memory barrier
+ * after the beginning of synchronize_sched() and before the beginning of
+ * that RCU read-side critical section. Note that these guarantees include
+ * CPUs that are offline, idle, or executing in user mode, as well as CPUs
+ * that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_sched(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
*
* This primitive provides the guarantees made by the (now removed)
* synchronize_kernel() API. In contrast, synchronize_rcu() only
@@ -2224,7 +2297,10 @@ void synchronize_sched(void)
"Illegal synchronize_sched() in RCU-sched read-side critical section");
if (rcu_blocking_is_gp())
return;
- wait_rcu_gp(call_rcu_sched);
+ if (rcu_expedited)
+ synchronize_sched_expedited();
+ else
+ wait_rcu_gp(call_rcu_sched);
}
EXPORT_SYMBOL_GPL(synchronize_sched);
@@ -2236,6 +2312,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
* and may be nested.
+ *
+ * See the description of synchronize_sched() for more detailed information
+ * on memory ordering guarantees.
*/
void synchronize_rcu_bh(void)
{
@@ -2245,7 +2324,10 @@ void synchronize_rcu_bh(void)
"Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
if (rcu_blocking_is_gp())
return;
- wait_rcu_gp(call_rcu_bh);
+ if (rcu_expedited)
+ synchronize_rcu_bh_expedited();
+ else
+ wait_rcu_gp(call_rcu_bh);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
@@ -2357,7 +2439,7 @@ void synchronize_sched_expedited(void)
if (trycount++ < 10) {
udelay(trycount * num_online_cpus());
} else {
- synchronize_sched();
+ wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
return;
}
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2b281cf0b6f..5ce3352505e 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -670,6 +670,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
* concurrently with new RCU read-side critical sections that began while
* synchronize_rcu() was waiting. RCU read-side critical sections are
* delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
+ *
+ * See the description of synchronize_sched() for more detailed information
+ * on memory ordering guarantees.
*/
void synchronize_rcu(void)
{
@@ -679,7 +682,10 @@ void synchronize_rcu(void)
"Illegal synchronize_rcu() in RCU read-side critical section");
if (!rcu_scheduler_active)
return;
- wait_rcu_gp(call_rcu);
+ if (rcu_expedited)
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
@@ -832,7 +838,7 @@ void synchronize_rcu_expedited(void)
udelay(trycount * num_online_cpus());
} else {
put_online_cpus();
- synchronize_rcu();
+ wait_rcu_gp(call_rcu);
return;
}
}
@@ -876,6 +882,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
/**
* rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ *
+ * Note that this primitive does not necessarily wait for an RCU grace period
+ * to complete. For example, if there are no RCU callbacks queued anywhere
+ * in the system, then rcu_barrier() is within its rights to return
+ * immediately, without waiting for anything, much less an RCU grace period.
*/
void rcu_barrier(void)
{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda71..6d4569e0924 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1887,7 +1887,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
#endif
/* Here we just switch the register state and the stack. */
- rcu_switch(prev, next);
+ rcu_user_hooks_switch(prev, next);
switch_to(prev, next, prev);
barrier();
@@ -8076,3 +8076,9 @@ struct cgroup_subsys cpuacct_subsys = {
.base_cftypes = files,
};
#endif /* CONFIG_CGROUP_CPUACCT */
+
+void dump_cpu_task(int cpu)
+{
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
+}
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 97c465ebd84..2b859828cdc 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,8 +16,10 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
*
* Author: Paul McKenney <paulmck@us.ibm.com>
+ * Lai Jiangshan <laijs@cn.fujitsu.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU/ *.txt
@@ -34,6 +36,10 @@
#include <linux/delay.h>
#include <linux/srcu.h>
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
+
/*
* Initialize an rcu_batch structure to empty.
*/
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
}
}
-/* single-thread state-machine */
-static void process_srcu(struct work_struct *work);
-
static int init_srcu_struct_fields(struct srcu_struct *sp)
{
sp->completed = 0;
@@ -464,7 +467,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
+ __synchronize_srcu(sp, rcu_expedited
+ ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
+ : SYNCHRONIZE_SRCU_TRYCOUNT);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
@@ -637,7 +642,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
/*
* This is the work-queue function that handles SRCU grace periods.
*/
-static void process_srcu(struct work_struct *work)
+void process_srcu(struct work_struct *work)
{
struct srcu_struct *sp;
@@ -648,3 +653,4 @@ static void process_srcu(struct work_struct *work)
srcu_invoke_callbacks(sp);
srcu_reschedule(sp);
}
+EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 28e9d6c9894..41faf0b8df1 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -972,7 +972,7 @@ config RCU_CPU_STALL_TIMEOUT
int "RCU CPU stall timeout in seconds"
depends on TREE_RCU || TREE_PREEMPT_RCU
range 3 300
- default 60
+ default 21
help
If a given RCU grace period extends more than the specified
number of seconds, a CPU stall warning is printed. If the