From 77535a0a168b2bf7c4aa35a20792a6e895844424 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Wed, 11 Dec 2013 18:50:25 -0800
Subject: timekeeping: Fix lost updates to tai adjustment

commit f55c07607a38f84b5c7e6066ee1cfe433fa5643c upstream.

Since 48cdc135d4840 (Implement a shadow timekeeper), we have to call
timekeeping_update() after any adjustment to the timekeeping structure
in order to make sure that any adjustments to the structure persist.

Unfortunately, the updates to the tai offset via adjtimex do not
trigger this update, causing adjustments to the tai offset to be made
and then over-written by the previous value at the next
update_wall_time() call.

This patch resolves the issue by calling timekeeping_update() right
after setting the tai offset.

Cc: Sasha Levin
Cc: Thomas Gleixner
Cc: Prarit Bhargava
Cc: Richard Cochran
Cc: Ingo Molnar
Signed-off-by: John Stultz
Signed-off-by: Greg Kroah-Hartman
---
 kernel/time/timekeeping.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/time/timekeeping.c')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1c5b0fcd83b..88c2b8b4686 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -605,6 +605,7 @@ void timekeeping_set_tai_offset(s32 tai_offset)
 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
 	write_seqcount_begin(&timekeeper_seq);
 	__timekeeping_set_tai_offset(tk, tai_offset);
+	timekeeping_update(tk, false, true);
 	write_seqcount_end(&timekeeper_seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 	clock_was_set();
@@ -1677,6 +1678,7 @@ int do_adjtimex(struct timex *txc)
 
 	if (tai != orig_tai) {
 		__timekeeping_set_tai_offset(tk, tai);
+		timekeeping_update(tk, false, true);
 		clock_was_set_delayed();
 	}
 	write_seqcount_end(&timekeeper_seq);
--
cgit v1.2.3
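As a minimal userspace sketch of the shadow-timekeeper behaviour the patch
above depends on (this is not kernel code: the struct fields and helpers
below are illustrative stand-ins for struct timekeeper, update_wall_time()
and the mirroring done by timekeeping_update()), the following shows how a
change applied only to the live structure is silently overwritten by the
next shadow-based update unless it is mirrored first:

/*
 * Simplified model of the bug: the periodic update works on a shadow
 * copy and then publishes that copy over the live structure, so a
 * change made only to the live structure is lost unless it is also
 * copied into the shadow, which is what the added timekeeping_update()
 * call achieves for the TAI offset.
 */
#include <stdio.h>
#include <string.h>

struct tk {                         /* stand-in for struct timekeeper */
	long xtime_sec;
	int  tai_offset;
};

static struct tk timekeeper;        /* live copy that readers see */
static struct tk shadow_timekeeper; /* scratch copy used by the tick */

static void mirror_to_shadow(void)  /* stand-in for timekeeping_update() */
{
	memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
}

static void update_wall_time(void)  /* accumulate in shadow, then publish */
{
	shadow_timekeeper.xtime_sec++;
	memcpy(&timekeeper, &shadow_timekeeper, sizeof(timekeeper));
}

static void set_tai_buggy(int tai)  /* unpatched path: live copy only */
{
	timekeeper.tai_offset = tai;
}

static void set_tai_fixed(int tai)  /* patched path: mirror afterwards */
{
	timekeeper.tai_offset = tai;
	mirror_to_shadow();
}

int main(void)
{
	mirror_to_shadow();

	set_tai_buggy(35);
	update_wall_time();
	printf("buggy: tai_offset=%d (lost at the next update)\n",
	       timekeeper.tai_offset);

	set_tai_fixed(35);
	update_wall_time();
	printf("fixed: tai_offset=%d (survives the update)\n",
	       timekeeper.tai_offset);
	return 0;
}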
From a8ad6b67721e81c2e181ae3e0f3aea79da779cd7 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Tue, 10 Dec 2013 17:13:35 -0800
Subject: timekeeping: Fix CLOCK_TAI timer/nanosleep delays

commit 04005f6011e3b504cd4d791d9769f7cb9a3b2eae upstream.

A think-o in the calculation of the monotonic -> tai time offset
causes CLOCK_TAI timers and nanosleeps to expire late (the latency is
~2x the tai offset).

Fix this by adding the tai offset to the realtime offset instead of
subtracting it.

Cc: Sasha Levin
Cc: Thomas Gleixner
Cc: Prarit Bhargava
Cc: Richard Cochran
Cc: Ingo Molnar
Signed-off-by: John Stultz
Signed-off-by: Greg Kroah-Hartman
---
 kernel/time/timekeeping.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/time/timekeeping.c')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 88c2b8b4686..40f75a038d1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -72,7 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
 	tk->wall_to_monotonic = wtm;
 	set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
 	tk->offs_real = timespec_to_ktime(tmp);
-	tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0));
+	tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
 }
 
 static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
@@ -590,7 +590,7 @@ s32 timekeeping_get_tai_offset(void)
 static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
 {
 	tk->tai_offset = tai_offset;
-	tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0));
+	tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
 }
 
 /**
--
cgit v1.2.3

From 226e0f713f585c549f4200bb8a69b6753dff28d0 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Wed, 11 Dec 2013 19:10:36 -0800
Subject: timekeeping: Fix missing timekeeping_update in suspend path

commit 330a1617b0a6268d427aa5922c94d082b1d3e96d upstream.

Since 48cdc135d4840 (Implement a shadow timekeeper), we have to call
timekeeping_update() after any adjustment to the timekeeping structure
in order to make sure that any adjustments to the structure persist.

In the timekeeping suspend path, we update the timekeeper structure,
so we should be sure to update the shadow-timekeeper before releasing
the timekeeping locks. Currently this isn't done.

In most cases, the next time-related code to run would be
timekeeping_resume, which does update the shadow-timekeeper, but in an
abundance of caution, this patch adds the call to timekeeping_update()
in the suspend path.

Cc: Sasha Levin
Cc: Thomas Gleixner
Cc: Prarit Bhargava
Cc: Richard Cochran
Cc: Ingo Molnar
Signed-off-by: John Stultz
Signed-off-by: Greg Kroah-Hartman
---
 kernel/time/timekeeping.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/time/timekeeping.c')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 40f75a038d1..d81b1117561 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1008,6 +1008,8 @@ static int timekeeping_suspend(void)
 		timekeeping_suspend_time =
 			timespec_add(timekeeping_suspend_time, delta_delta);
 	}
+
+	timekeeping_update(tk, false, true);
 	write_seqcount_end(&timekeeper_seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
--
cgit v1.2.3
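The CLOCK_TAI timer/nanosleep fix above can be sanity-checked with plain
integer arithmetic. The standalone sketch below uses ad-hoc variable names
rather than the kernel's ktime helpers; it shows why the monotonic-to-TAI
offset has to be offs_real plus tai_offset, and reproduces the roughly
2x tai_offset latency quoted in that commit message:

/*
 * TAI runs ahead of realtime (UTC) by tai_offset seconds, and offs_real
 * converts monotonic time to realtime, so the correct monotonic->TAI
 * offset is offs_real + tai_offset. Subtracting instead leaves CLOCK_TAI
 * short by 2 * tai_offset, so its timers expire that much too late.
 */
#include <stdio.h>

int main(void)
{
	long mono      = 1000;    /* arbitrary CLOCK_MONOTONIC sample, in seconds */
	long offs_real = 500000;  /* realtime - monotonic */
	long tai_off   = 35;      /* TAI - UTC */

	long tai_expected = mono + offs_real + tai_off;
	long offs_tai_sub = offs_real - tai_off;  /* old ktime_sub() variant */
	long offs_tai_add = offs_real + tai_off;  /* fixed ktime_add() variant */

	printf("expected TAI       : %ld\n", tai_expected);
	printf("mono + sub variant : %ld (short by %ld -> timers expire late)\n",
	       mono + offs_tai_sub, tai_expected - (mono + offs_tai_sub));
	printf("mono + add variant : %ld\n", mono + offs_tai_add);
	return 0;
}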
From d9e8fada0c0161f6fe2499a1b7dc9ce18e20fec2 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Tue, 10 Dec 2013 17:18:18 -0800
Subject: timekeeping: Avoid possible deadlock from clock_was_set_delayed

commit 6fdda9a9c5db367130cf32df5d6618d08b89f46a upstream.

As part of normal operations, the hrtimer subsystem frequently calls
into the timekeeping code, creating a locking order of
  hrtimer locks -> timekeeping locks

clock_was_set_delayed() was supposed to allow us to avoid deadlocks
between the timekeeping and hrtimer subsystems, so that we could
notify the hrtimer subsystem that the time had changed while holding
the timekeeping locks. This was done by scheduling delayed work that
would run later, once we were out of the timekeeping code.

But unfortunately the lock chains are complex enough that in
scheduling delayed work, we end up eventually trying to grab an
hrtimer lock.

Sasha Levin noticed this in testing when the new seqlock lockdep
enablement triggered the following (somewhat abbreviated) message:

[ 251.100221] ======================================================
[ 251.100221] [ INFO: possible circular locking dependency detected ]
[ 251.100221] 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053 Not tainted
[ 251.101967] -------------------------------------------------------
[ 251.101967] kworker/10:1/4506 is trying to acquire lock:
[ 251.101967] (timekeeper_seq){----..}, at: [] retrigger_next_event+0x56/0x70
[ 251.101967]
[ 251.101967] but task is already holding lock:
[ 251.101967] (hrtimer_bases.lock#11){-.-...}, at: [] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] which lock already depends on the new lock.
[ 251.101967]
[ 251.101967]
[ 251.101967] the existing dependency chain (in reverse order) is:
[ 251.101967] -> #5 (hrtimer_bases.lock#11){-.-...}:
[snipped]
-> #4 (&rt_b->rt_runtime_lock){-.-...}:
[snipped]
-> #3 (&rq->lock){-.-.-.}:
[snipped]
-> #2 (&p->pi_lock){-.-.-.}:
[snipped]
-> #1 (&(&pool->lock)->rlock){-.-...}:
[ 251.101967] [] validate_chain+0x6c3/0x7b0
[ 251.101967] [] __lock_acquire+0x4ad/0x580
[ 251.101967] [] lock_acquire+0x182/0x1d0
[ 251.101967] [] _raw_spin_lock+0x40/0x80
[ 251.101967] [] __queue_work+0x1a9/0x3f0
[ 251.101967] [] queue_work_on+0x98/0x120
[ 251.101967] [] clock_was_set_delayed+0x21/0x30
[ 251.101967] [] do_adjtimex+0x111/0x160
[ 251.101967] [] compat_sys_adjtimex+0x41/0x70
[ 251.101967] [] ia32_sysret+0x0/0x5
[ 251.101967] -> #0 (timekeeper_seq){----..}:
[snipped]
[ 251.101967] other info that might help us debug this:
[ 251.101967]
[ 251.101967] Chain exists of: timekeeper_seq --> &rt_b->rt_runtime_lock --> hrtimer_bases.lock#11
[ 251.101967] Possible unsafe locking scenario:
[ 251.101967]
[ 251.101967]        CPU0                    CPU1
[ 251.101967]        ----                    ----
[ 251.101967]   lock(hrtimer_bases.lock#11);
[ 251.101967]                                lock(&rt_b->rt_runtime_lock);
[ 251.101967]                                lock(hrtimer_bases.lock#11);
[ 251.101967]   lock(timekeeper_seq);
[ 251.101967]
[ 251.101967] *** DEADLOCK ***
[ 251.101967]
[ 251.101967] 3 locks held by kworker/10:1/4506:
[ 251.101967] #0: (events){.+.+.+}, at: [] process_one_work+0x200/0x530
[ 251.101967] #1: (hrtimer_work){+.+...}, at: [] process_one_work+0x200/0x530
[ 251.101967] #2: (hrtimer_bases.lock#11){-.-...}, at: [] retrigger_next_event+0x3c/0x70
[ 251.101967]
[ 251.101967] stack backtrace:
[ 251.101967] CPU: 10 PID: 4506 Comm: kworker/10:1 Not tainted 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053
[ 251.101967] Workqueue: events clock_was_set_work

So the best solution is to avoid calling clock_was_set_delayed() while
holding the timekeeping lock, and to instead use a flag variable to
decide if we should call clock_was_set() once we've released the locks.

This works for the case here, where do_adjtimex() was the deadlock
trigger point. Unfortunately, in update_wall_time() we still hold the
jiffies lock, which would deadlock with the IPI triggered by
clock_was_set(), preventing us from calling it even after we drop the
timekeeping lock. So instead call clock_was_set_delayed() at that point.

Cc: Thomas Gleixner
Cc: Prarit Bhargava
Cc: Richard Cochran
Cc: Ingo Molnar
Cc: Sasha Levin
Reported-by: Sasha Levin
Tested-by: Sasha Levin
Signed-off-by: John Stultz
Signed-off-by: Greg Kroah-Hartman
---
 kernel/time/timekeeping.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'kernel/time/timekeeping.c')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d81b1117561..76fefb1613b 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1239,9 +1239,10 @@ out_adjust:
  * It also calls into the NTP code to handle leapsecond processing.
  *
  */
-static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
+static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
 {
 	u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
+	unsigned int clock_set = 0;
 
 	while (tk->xtime_nsec >= nsecps) {
 		int leap;
@@ -1263,9 +1264,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
 
 			__timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
 
-			clock_was_set_delayed();
+			clock_set = 1;
 		}
 	}
+	return clock_set;
 }
 
 /**
@@ -1278,7 +1280,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
  * Returns the unconsumed cycles.
  */
 static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
-						u32 shift)
+						u32 shift,
+						unsigned int *clock_set)
 {
 	cycle_t interval = tk->cycle_interval << shift;
 	u64 raw_nsecs;
@@ -1292,7 +1295,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
 	tk->cycle_last += interval;
 
 	tk->xtime_nsec += tk->xtime_interval << shift;
-	accumulate_nsecs_to_secs(tk);
+	*clock_set |= accumulate_nsecs_to_secs(tk);
 
 	/* Accumulate raw time */
 	raw_nsecs = (u64)tk->raw_interval << shift;
@@ -1350,6 +1353,7 @@ static void update_wall_time(void)
 	struct timekeeper *tk = &shadow_timekeeper;
 	cycle_t offset;
 	int shift = 0, maxshift;
+	unsigned int clock_set = 0;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1384,7 +1388,8 @@ static void update_wall_time(void)
 	maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
 	shift = min(shift, maxshift);
 	while (offset >= tk->cycle_interval) {
-		offset = logarithmic_accumulation(tk, offset, shift);
+		offset = logarithmic_accumulation(tk, offset, shift,
+							&clock_set);
 		if (offset < tk->cycle_interval<<shift)
 			shift--;
 	}
@@ -1422,6 +1427,10 @@ static void update_wall_time(void)
 	write_seqcount_end(&timekeeper_seq);
 out:
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+	if (clock_set)
+		/* have to call outside the timekeeper_seq */
+		clock_was_set_delayed();
+
 }
 
 /**
@@ -1681,11 +1690,13 @@ int do_adjtimex(struct timex *txc)
 	if (tai != orig_tai) {
 		__timekeeping_set_tai_offset(tk, tai);
 		timekeeping_update(tk, false, true);
-		clock_was_set_delayed();
 	}
 	write_seqcount_end(&timekeeper_seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
+	if (tai != orig_tai)
+		clock_was_set();
+
 	ntp_notify_cmos_timer();
 
 	return ret;
--
cgit v1.2.3
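The deadlock fix above boils down to a common locking pattern: record,
while the lock is held, that a notification is needed, and only issue the
notification after every lock has been dropped, so the notification path
is free to take its own locks. The pthreads sketch below is a userspace
illustration of that pattern under stand-in names (clock_was_set() here is
a local stub, not the kernel function), and it models the do_adjtimex()
change rather than the jiffies-lock-constrained update_wall_time() case:

/*
 * Pattern: decide under the lock, notify after unlocking, so the
 * notification callback can acquire other locks without creating a
 * circular lock dependency.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static int tai_offset;

/* Stub for clock_was_set(): imagine it takes hrtimer-style locks. */
static void clock_was_set(void)
{
	printf("clock_was_set: notifying timers, tai_offset=%d\n", tai_offset);
}

static void set_tai_offset(int new_tai)
{
	int clock_set = 0;              /* decided under the lock ... */

	pthread_mutex_lock(&state_lock);
	if (new_tai != tai_offset) {
		tai_offset = new_tai;
		clock_set = 1;
	}
	pthread_mutex_unlock(&state_lock);

	if (clock_set)                  /* ... acted on with no locks held */
		clock_was_set();
}

int main(void)
{
	set_tai_offset(35);   /* offset changes -> notification fires */
	set_tai_offset(35);   /* no change      -> no notification    */
	return 0;
}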