aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2017-05-30 12:45:57 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2017-05-30 12:45:57 +1000
commit186ac60535bb981da129890d3bf1ed99d88da8c9 (patch)
treebea294b3409d6d241d17037faa9be406ba01ee7d
parentfdabcf0c35152c82c06b09831b9b0812b7315146 (diff)
parent8e3c0499d0c92e96cb9295d45f1ac3784ecad8ec (diff)
Merge remote-tracking branch 'tip/auto-latest'
-rw-r--r--Documentation/IRQ-domain.txt41
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt9
-rw-r--r--arch/arm/kernel/hw_breakpoint.c11
-rw-r--r--arch/arm/kernel/patch.c2
-rw-r--r--arch/arm/kernel/smp.c3
-rw-r--r--arch/arm/probes/kprobes/core.c3
-rw-r--r--arch/arm64/include/asm/insn.h1
-rw-r--r--arch/arm64/kernel/insn.c5
-rw-r--r--arch/arm64/kernel/smp.c3
-rw-r--r--arch/metag/kernel/smp.c3
-rw-r--r--arch/mips/kernel/jump_label.c2
-rw-r--r--arch/powerpc/kernel/smp.c2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c14
-rw-r--r--arch/powerpc/platforms/powernv/subcore.c7
-rw-r--r--arch/s390/kernel/jump_label.c2
-rw-r--r--arch/s390/kernel/kprobes.c4
-rw-r--r--arch/s390/kernel/time.c6
-rw-r--r--arch/sparc/kernel/jump_label.c2
-rw-r--r--arch/tile/kernel/jump_label.c2
-rw-r--r--arch/x86/boot/compressed/cmdline.c2
-rw-r--r--arch/x86/boot/compressed/kaslr.c190
-rw-r--r--arch/x86/boot/string.c8
-rw-r--r--arch/x86/events/core.c23
-rw-r--r--arch/x86/events/intel/core.c74
-rw-r--r--arch/x86/events/intel/cqm.c16
-rw-r--r--arch/x86/events/perf_event.h3
-rw-r--r--arch/x86/include/asm/amd_nb.h3
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/timer.h8
-rw-r--r--arch/x86/include/asm/tlbbatch.h16
-rw-r--r--arch/x86/include/asm/tlbflush.h14
-rw-r--r--arch/x86/kernel/apic/io_apic.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c229
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c16
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/jump_label.c2
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kernel/tsc.c206
-rw-r--r--arch/x86/mm/tlb.c78
-rw-r--r--arch/x86/platform/efi/efi.c6
-rw-r--r--arch/x86/platform/efi/efi_64.c79
-rw-r--r--arch/x86/platform/efi/quirks.c3
-rw-r--r--arch/x86/platform/uv/tlb_uv.c14
-rw-r--r--drivers/acpi/apei/ghes.c39
-rw-r--r--drivers/acpi/pci_root.c2
-rw-r--r--drivers/acpi/processor_driver.c4
-rw-r--r--drivers/acpi/processor_throttling.c16
-rw-r--r--drivers/base/node.c2
-rw-r--r--drivers/char/Kconfig9
-rw-r--r--drivers/char/Makefile1
-rw-r--r--drivers/char/mmtimer.c858
-rw-r--r--drivers/cpufreq/cpufreq.c21
-rw-r--r--drivers/cpufreq/pasemi-cpufreq.c2
-rw-r--r--drivers/cpuidle/cpuidle.c1
-rw-r--r--drivers/firmware/efi/efi-bgrt.c3
-rw-r--r--drivers/firmware/efi/libstub/secureboot.c4
-rw-r--r--drivers/hwtracing/coresight/coresight-etm3x.c20
-rw-r--r--drivers/hwtracing/coresight/coresight-etm4x.c20
-rw-r--r--drivers/iommu/intel-iommu.c4
-rw-r--r--drivers/iommu/of_iommu.c2
-rw-r--r--drivers/pci/pci-driver.c47
-rw-r--r--drivers/ras/ras.c2
-rw-r--r--drivers/xen/manage.c1
-rw-r--r--include/linux/clocksource.h1
-rw-r--r--include/linux/cpu.h36
-rw-r--r--include/linux/cpuhotplug.h38
-rw-r--r--include/linux/cpumask.h28
-rw-r--r--include/linux/kernel.h6
-rw-r--r--include/linux/llist.h19
-rw-r--r--include/linux/mm_types_task.h15
-rw-r--r--include/linux/padata.h3
-rw-r--r--include/linux/pci.h1
-rw-r--r--include/linux/perf_event.h2
-rw-r--r--include/linux/posix-timers.h15
-rw-r--r--include/linux/sched.h10
-rw-r--r--include/linux/sched/clock.h11
-rw-r--r--include/linux/stop_machine.h26
-rw-r--r--include/uapi/linux/time.h6
-rw-r--r--init/main.c27
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/cpu.c213
-rw-r--r--kernel/events/core.c116
-rw-r--r--kernel/extable.c2
-rw-r--r--kernel/irq/irqdesc.c5
-rw-r--r--kernel/irq/irqdomain.c81
-rw-r--r--kernel/irq/msi.c10
-rw-r--r--kernel/jump_label.c20
-rw-r--r--kernel/kprobes.c59
-rw-r--r--kernel/padata.c43
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/sched/clock.c128
-rw-r--r--kernel/sched/core.c29
-rw-r--r--kernel/sched/deadline.c2
-rw-r--r--kernel/sched/fair.c153
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/rt.c13
-rw-r--r--kernel/sched/sched.h19
-rw-r--r--kernel/sched/topology.c430
-rw-r--r--kernel/smp.c16
-rw-r--r--kernel/stop_machine.c11
-rw-r--r--kernel/time/alarmtimer.c91
-rw-r--r--kernel/time/clocksource.c3
-rw-r--r--kernel/time/posix-clock.c2
-rw-r--r--kernel/time/posix-cpu-timers.c34
-rw-r--r--kernel/time/posix-timers.c191
-rw-r--r--kernel/time/tick-sched.c40
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--lib/cpumask.c32
-rw-r--r--lib/smp_processor_id.c2
-rw-r--r--mm/rmap.c16
-rw-r--r--mm/vmscan.c2
112 files changed, 2053 insertions, 2167 deletions
diff --git a/Documentation/IRQ-domain.txt b/Documentation/IRQ-domain.txt
index 82001a25a14b..1f246eb25ca5 100644
--- a/Documentation/IRQ-domain.txt
+++ b/Documentation/IRQ-domain.txt
@@ -231,5 +231,42 @@ needs to:
4) No need to implement irq_domain_ops.map and irq_domain_ops.unmap,
they are unused with hierarchy irq_domain.
-Hierarchy irq_domain may also be used to support other architectures,
-such as ARM, ARM64 etc.
+Hierarchy irq_domain is in no way x86 specific, and is heavily used to
+support other architectures, such as ARM, ARM64 etc.
+
+=== Debugging ===
+
+If you switch on CONFIG_IRQ_DOMAIN_DEBUG (which depends on
+CONFIG_IRQ_DOMAIN and CONFIG_DEBUG_FS), you will find a new file in
+your debugfs mount point, called irq_domain_mapping. This file
+contains a live snapshot of all the IRQ domains in the system:
+
+ name mapped linear-max direct-max devtree-node
+ pl061 8 8 0 /smb/gpio@e0080000
+ pl061 8 8 0 /smb/gpio@e1050000
+ pMSI 0 0 0 /interrupt-controller@e1101000/v2m@e0080000
+ MSI 37 0 0 /interrupt-controller@e1101000/v2m@e0080000
+ GICv2m 37 0 0 /interrupt-controller@e1101000/v2m@e0080000
+ GICv2 448 448 0 /interrupt-controller@e1101000
+
+it also iterates over the interrupts to display their mapping in the
+domains, and makes the domain stacking visible:
+
+
+irq hwirq chip name chip data active type domain
+ 1 0x00019 GICv2 0xffff00000916bfd8 * LINEAR GICv2
+ 2 0x0001d GICv2 0xffff00000916bfd8 LINEAR GICv2
+ 3 0x0001e GICv2 0xffff00000916bfd8 * LINEAR GICv2
+ 4 0x0001b GICv2 0xffff00000916bfd8 * LINEAR GICv2
+ 5 0x0001a GICv2 0xffff00000916bfd8 LINEAR GICv2
+[...]
+ 96 0x81808 MSI 0x (null) RADIX MSI
+ 96+ 0x00063 GICv2m 0xffff8003ee116980 RADIX GICv2m
+ 96+ 0x00063 GICv2 0xffff00000916bfd8 LINEAR GICv2
+ 97 0x08800 MSI 0x (null) * RADIX MSI
+ 97+ 0x00064 GICv2m 0xffff8003ee116980 * RADIX GICv2m
+ 97+ 0x00064 GICv2 0xffff00000916bfd8 * LINEAR GICv2
+
+Here, interrupts 1-5 are only using a single domain, while 96 and 97
+are build out of a stack of three domain, each level performing a
+particular function.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 15f79c27748d..4e4c3402412e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2127,6 +2127,12 @@
memmap=nn[KMG]@ss[KMG]
[KNL] Force usage of a specific region of memory.
Region of memory to be used is from ss to ss+nn.
+ If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG],
+ which limits max address to nn[KMG].
+ Multiple different regions can be specified,
+ comma delimited.
+ Example:
+ memmap=100M@2G,100M#3G,1G!1024G
memmap=nn[KMG]#ss[KMG]
[KNL,ACPI] Mark specific memory as ACPI data.
@@ -2139,6 +2145,9 @@
memmap=64K$0x18690000
or
memmap=0x10000$0x18690000
+ Some bootloaders may need an escape character before '$',
+ like Grub2, otherwise '$' and the following number
+ will be eaten.
memmap=nn[KMG]!ss[KMG]
[KNL,X86] Mark specific memory as protected.
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index be3b3fbd382f..63cb4c7c6593 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -1090,7 +1090,7 @@ static int __init arch_hw_breakpoint_init(void)
* driven low on this core and there isn't an architected way to
* determine that.
*/
- get_online_cpus();
+ cpus_read_lock();
register_undef_hook(&debug_reg_hook);
/*
@@ -1098,15 +1098,16 @@ static int __init arch_hw_breakpoint_init(void)
* assume that a halting debugger will leave the world in a nice state
* for us.
*/
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm/hw_breakpoint:online",
- dbg_reset_online, NULL);
+ ret = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
+ "arm/hw_breakpoint:online",
+ dbg_reset_online, NULL);
unregister_undef_hook(&debug_reg_hook);
if (WARN_ON(ret < 0) || !cpumask_empty(&debug_err_mask)) {
core_num_brps = 0;
core_num_wrps = 0;
if (ret > 0)
cpuhp_remove_state_nocalls(ret);
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
@@ -1124,7 +1125,7 @@ static int __init arch_hw_breakpoint_init(void)
TRAP_HWBKPT, "watchpoint debug exception");
hook_ifault_code(FAULT_CODE_DEBUG, hw_breakpoint_pending, SIGTRAP,
TRAP_HWBKPT, "breakpoint debug exception");
- put_online_cpus();
+ cpus_read_unlock();
/* Register PM notifiers. */
pm_init();
diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
index 020560b2dcb7..a1a34722c655 100644
--- a/arch/arm/kernel/patch.c
+++ b/arch/arm/kernel/patch.c
@@ -124,5 +124,5 @@ void __kprobes patch_text(void *addr, unsigned int insn)
.insn = insn,
};
- stop_machine(patch_text_stop_machine, &patch, NULL);
+ stop_machine_cpuslocked(patch_text_stop_machine, &patch, NULL);
}
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 572a8df1b766..c9a0a5299827 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -555,8 +555,7 @@ static DEFINE_RAW_SPINLOCK(stop_lock);
*/
static void ipi_cpu_stop(unsigned int cpu)
{
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
raw_spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();
diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c
index ad1f4e6a9e33..52d1cd14fda4 100644
--- a/arch/arm/probes/kprobes/core.c
+++ b/arch/arm/probes/kprobes/core.c
@@ -182,7 +182,8 @@ void __kprobes kprobes_remove_breakpoint(void *addr, unsigned int insn)
.addr = addr,
.insn = insn,
};
- stop_machine(__kprobes_remove_breakpoint, &p, cpu_online_mask);
+ stop_machine_cpuslocked(__kprobes_remove_breakpoint, &p,
+ cpu_online_mask);
}
void __kprobes arch_disarm_kprobe(struct kprobe *p)
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 29cb2ca756f6..4214c38d016b 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -433,7 +433,6 @@ u32 aarch64_set_branch_offset(u32 insn, s32 offset);
bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn);
int aarch64_insn_patch_text_nosync(void *addr, u32 insn);
-int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt);
int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt);
s32 aarch64_insn_adrp_get_offset(u32 insn);
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index b884a926a632..cd872133e88e 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -255,6 +255,7 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg)
return ret;
}
+static
int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt)
{
struct aarch64_insn_patch patch = {
@@ -267,8 +268,8 @@ int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt)
if (cnt <= 0)
return -EINVAL;
- return stop_machine(aarch64_insn_patch_text_cb, &patch,
- cpu_online_mask);
+ return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch,
+ cpu_online_mask);
}
int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 6e0e16a3a7d4..321119881abf 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -961,8 +961,7 @@ void smp_send_stop(void)
cpumask_copy(&mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &mask);
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING)
+ if (system_state <= SYSTEM_RUNNING)
pr_crit("SMP: stopping secondary CPUs\n");
smp_cross_call(&mask, IPI_CPU_STOP);
}
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index 232a12bf3f99..2dbbb7c66043 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -567,8 +567,7 @@ static void stop_this_cpu(void *data)
{
unsigned int cpu = smp_processor_id();
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();
diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c
index 3e586daa3a32..32e3168316cd 100644
--- a/arch/mips/kernel/jump_label.c
+++ b/arch/mips/kernel/jump_label.c
@@ -58,7 +58,6 @@ void arch_jump_label_transform(struct jump_entry *e,
insn.word = 0; /* nop */
}
- get_online_cpus();
mutex_lock(&text_mutex);
if (IS_ENABLED(CONFIG_CPU_MICROMIPS)) {
insn_p->halfword[0] = insn.word >> 16;
@@ -70,7 +69,6 @@ void arch_jump_label_transform(struct jump_entry *e,
(unsigned long)insn_p + sizeof(*insn_p));
mutex_unlock(&text_mutex);
- put_online_cpus();
}
#endif /* HAVE_JUMP_LABEL */
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index df2a41647d8e..1069f74fca47 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -97,7 +97,7 @@ int smp_generic_cpu_bootable(unsigned int nr)
/* Special case - we inhibit secondary thread startup
* during boot if the user requests it.
*/
- if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
+ if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
return 0;
if (smt_enabled_at_boot
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 42b7a4fd57d9..48a6bd160011 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3317,7 +3317,7 @@ void kvmppc_alloc_host_rm_ops(void)
return;
}
- get_online_cpus();
+ cpus_read_lock();
for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
if (!cpu_online(cpu))
@@ -3339,17 +3339,17 @@ void kvmppc_alloc_host_rm_ops(void)
l_ops = (unsigned long) ops;
if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
- put_online_cpus();
+ cpus_read_unlock();
kfree(ops->rm_core);
kfree(ops);
return;
}
- cpuhp_setup_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE,
- "ppc/kvm_book3s:prepare",
- kvmppc_set_host_core,
- kvmppc_clear_host_core);
- put_online_cpus();
+ cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE,
+ "ppc/kvm_book3s:prepare",
+ kvmppc_set_host_core,
+ kvmppc_clear_host_core);
+ cpus_read_unlock();
}
void kvmppc_free_host_rm_ops(void)
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
index 0babef11136f..e6230f104dd9 100644
--- a/arch/powerpc/platforms/powernv/subcore.c
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -348,7 +348,7 @@ static int set_subcores_per_core(int new_mode)
state->master = 0;
}
- get_online_cpus();
+ cpus_read_lock();
/* This cpu will update the globals before exiting stop machine */
this_cpu_ptr(&split_state)->master = 1;
@@ -356,9 +356,10 @@ static int set_subcores_per_core(int new_mode)
/* Ensure state is consistent before we call the other cpus */
mb();
- stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask);
+ stop_machine_cpuslocked(cpu_update_split_mode, &new_mode,
+ cpu_online_mask);
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index 6aa630a8d24f..262506cee4c3 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -93,7 +93,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
args.entry = entry;
args.type = type;
- stop_machine(__sm_arch_jump_label_transform, &args, NULL);
+ stop_machine_cpuslocked(__sm_arch_jump_label_transform, &args, NULL);
}
void arch_jump_label_transform_static(struct jump_entry *entry,
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 3d6a99746454..6842e4501e2e 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -196,7 +196,7 @@ void arch_arm_kprobe(struct kprobe *p)
{
struct swap_insn_args args = {.p = p, .arm_kprobe = 1};
- stop_machine(swap_instruction, &args, NULL);
+ stop_machine_cpuslocked(swap_instruction, &args, NULL);
}
NOKPROBE_SYMBOL(arch_arm_kprobe);
@@ -204,7 +204,7 @@ void arch_disarm_kprobe(struct kprobe *p)
{
struct swap_insn_args args = {.p = p, .arm_kprobe = 0};
- stop_machine(swap_instruction, &args, NULL);
+ stop_machine_cpuslocked(swap_instruction, &args, NULL);
}
NOKPROBE_SYMBOL(arch_disarm_kprobe);
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index c3a52f9a69a0..192efdfac918 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -636,10 +636,10 @@ static void stp_work_fn(struct work_struct *work)
goto out_unlock;
memset(&stp_sync, 0, sizeof(stp_sync));
- get_online_cpus();
+ cpus_read_lock();
atomic_set(&stp_sync.cpus, num_online_cpus() - 1);
- stop_machine(stp_sync_clock, &stp_sync, cpu_online_mask);
- put_online_cpus();
+ stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask);
+ cpus_read_unlock();
if (!check_sync_clock())
/*
diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c
index 07933b9e9ce0..93adde1ac166 100644
--- a/arch/sparc/kernel/jump_label.c
+++ b/arch/sparc/kernel/jump_label.c
@@ -41,12 +41,10 @@ void arch_jump_label_transform(struct jump_entry *entry,
val = 0x01000000;
}
- get_online_cpus();
mutex_lock(&text_mutex);
*insn = val;
flushi(insn);
mutex_unlock(&text_mutex);
- put_online_cpus();
}
#endif
diff --git a/arch/tile/kernel/jump_label.c b/arch/tile/kernel/jump_label.c
index 07802d586988..93931a46625b 100644
--- a/arch/tile/kernel/jump_label.c
+++ b/arch/tile/kernel/jump_label.c
@@ -45,14 +45,12 @@ static void __jump_label_transform(struct jump_entry *e,
void arch_jump_label_transform(struct jump_entry *e,
enum jump_label_type type)
{
- get_online_cpus();
mutex_lock(&text_mutex);
__jump_label_transform(e, type);
flush_icache_range(e->code, e->code + sizeof(tilegx_bundle_bits));
mutex_unlock(&text_mutex);
- put_online_cpus();
}
__init_or_module void arch_jump_label_transform_static(struct jump_entry *e,
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 73ccf63b0f48..9dc1ce6ba3c0 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,7 +13,7 @@ static inline char rdfs8(addr_t addr)
return *((char *)(fs + addr));
}
#include "../cmdline.c"
-static unsigned long get_cmd_line_ptr(void)
+unsigned long get_cmd_line_ptr(void)
{
unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 54c24f0a43d3..e0eba12bffe7 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -9,16 +9,41 @@
* contain the entire properly aligned running kernel image.
*
*/
+
+/*
+ * isspace() in linux/ctype.h is expected by next_args() to filter
+ * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h,
+ * since isdigit() is implemented in both of them. Hence disable it
+ * here.
+ */
+#define BOOT_CTYPE_H
+
+/*
+ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h.
+ * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL
+ * which is meaningless and will cause compiling error in some cases.
+ * So do not include linux/export.h and define EXPORT_SYMBOL(sym)
+ * as empty.
+ */
+#define _LINUX_EXPORT_H
+#define EXPORT_SYMBOL(sym)
+
#include "misc.h"
#include "error.h"
-#include "../boot.h"
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <generated/utsrelease.h>
+/* Macros used by the included decompressor code below. */
+#define STATIC
+#include <linux/decompress/mm.h>
+
+extern unsigned long get_cmd_line_ptr(void);
+
/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
@@ -62,6 +87,11 @@ struct mem_vector {
static bool memmap_too_large;
+
+/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
+unsigned long long mem_limit = ULLONG_MAX;
+
+
enum mem_avoid_index {
MEM_AVOID_ZO_RANGE = 0,
MEM_AVOID_INITRD,
@@ -85,49 +115,14 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
return true;
}
-/**
- * _memparse - Parse a string with mem suffixes into a number
- * @ptr: Where parse begins
- * @retptr: (output) Optional pointer to next char after parse completes
- *
- * Parses a string into a number. The number stored at @ptr is
- * potentially suffixed with K, M, G, T, P, E.
- */
-static unsigned long long _memparse(const char *ptr, char **retptr)
+char *skip_spaces(const char *str)
{
- char *endptr; /* Local pointer to end of parsed string */
-
- unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
-
- switch (*endptr) {
- case 'E':
- case 'e':
- ret <<= 10;
- case 'P':
- case 'p':
- ret <<= 10;
- case 'T':
- case 't':
- ret <<= 10;
- case 'G':
- case 'g':
- ret <<= 10;
- case 'M':
- case 'm':
- ret <<= 10;
- case 'K':
- case 'k':
- ret <<= 10;
- endptr++;
- default:
- break;
- }
-
- if (retptr)
- *retptr = endptr;
-
- return ret;
+ while (isspace(*str))
+ ++str;
+ return (char *)str;
}
+#include "../../../../lib/ctype.c"
+#include "../../../../lib/cmdline.c"
static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
@@ -142,40 +137,41 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
return -EINVAL;
oldp = p;
- *size = _memparse(p, &p);
+ *size = memparse(p, &p);
if (p == oldp)
return -EINVAL;
switch (*p) {
- case '@':
- /* Skip this region, usable */
- *start = 0;
- *size = 0;
- return 0;
case '#':
case '$':
case '!':
- *start = _memparse(p + 1, &p);
+ *start = memparse(p + 1, &p);
+ return 0;
+ case '@':
+ /* memmap=nn@ss specifies usable region, should be skipped */
+ *size = 0;
+ /* Fall through */
+ default:
+ /*
+ * If w/o offset, only size specified, memmap=nn[KMG] has the
+ * same behaviour as mem=nn[KMG]. It limits the max address
+ * system can use. Region above the limit should be avoided.
+ */
+ *start = 0;
return 0;
}
return -EINVAL;
}
-static void mem_avoid_memmap(void)
+static void mem_avoid_memmap(char *str)
{
- char arg[128];
+ static int i;
int rc;
- int i;
- char *str;
- /* See if we have any memmap areas */
- rc = cmdline_find_option("memmap", arg, sizeof(arg));
- if (rc <= 0)
+ if (i >= MAX_MEMMAP_REGIONS)
return;
- i = 0;
- str = arg;
while (str && (i < MAX_MEMMAP_REGIONS)) {
int rc;
unsigned long long start, size;
@@ -188,9 +184,14 @@ static void mem_avoid_memmap(void)
if (rc < 0)
break;
str = k;
- /* A usable region that should not be skipped */
- if (size == 0)
+
+ if (start == 0) {
+ /* Store the specified memory limit if size > 0 */
+ if (size > 0)
+ mem_limit = size;
+
continue;
+ }
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
@@ -202,6 +203,57 @@ static void mem_avoid_memmap(void)
memmap_too_large = true;
}
+static int handle_mem_memmap(void)
+{
+ char *args = (char *)get_cmd_line_ptr();
+ size_t len = strlen((char *)args);
+ char *tmp_cmdline;
+ char *param, *val;
+ u64 mem_size;
+
+ if (!strstr(args, "memmap=") && !strstr(args, "mem="))
+ return 0;
+
+ tmp_cmdline = malloc(len + 1);
+ if (!tmp_cmdline )
+ error("Failed to allocate space for tmp_cmdline");
+
+ memcpy(tmp_cmdline, args, len);
+ tmp_cmdline[len] = 0;
+ args = tmp_cmdline;
+
+ /* Chew leading spaces */
+ args = skip_spaces(args);
+
+ while (*args) {
+ args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0) {
+ warn("Only '--' specified in cmdline");
+ free(tmp_cmdline);
+ return -1;
+ }
+
+ if (!strcmp(param, "memmap")) {
+ mem_avoid_memmap(val);
+ } else if (!strcmp(param, "mem")) {
+ char *p = val;
+
+ if (!strcmp(p, "nopentium"))
+ continue;
+ mem_size = memparse(p, &p);
+ if (mem_size == 0) {
+ free(tmp_cmdline);
+ return -EINVAL;
+ }
+ mem_limit = mem_size;
+ }
+ }
+
+ free(tmp_cmdline);
+ return 0;
+}
+
/*
* In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
* The mem_avoid array is used to store the ranges that need to be avoided
@@ -323,7 +375,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* We don't need to set a mapping for setup_data. */
/* Mark the memmap regions we need to avoid */
- mem_avoid_memmap();
+ handle_mem_memmap();
#ifdef CONFIG_X86_VERBOSE_BOOTUP
/* Make sure video RAM can be used. */
@@ -432,7 +484,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
{
struct mem_vector region, overlap;
struct slot_area slot_area;
- unsigned long start_orig;
+ unsigned long start_orig, end;
+ struct boot_e820_entry cur_entry;
/* Skip non-RAM entries. */
if (entry->type != E820_TYPE_RAM)
@@ -446,8 +499,15 @@ static void process_e820_entry(struct boot_e820_entry *entry,
if (entry->addr + entry->size < minimum)
return;
- region.start = entry->addr;
- region.size = entry->size;
+ /* Ignore entries above memory limit */
+ end = min(entry->size + entry->addr, mem_limit);
+ if (entry->addr >= end)
+ return;
+ cur_entry.addr = entry->addr;
+ cur_entry.size = end - entry->addr;
+
+ region.start = cur_entry.addr;
+ region.size = cur_entry.size;
/* Give up if slot area array is full. */
while (slot_area_index < MAX_SLOT_AREA) {
@@ -461,7 +521,7 @@ static void process_e820_entry(struct boot_e820_entry *entry,
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above this e820 region? */
- if (region.start > entry->addr + entry->size)
+ if (region.start > cur_entry.addr + cur_entry.size)
return;
/* Reduce size by any delta from the original address. */
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 5457b02fc050..630e3664906b 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -122,6 +122,14 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
return result;
}
+long simple_strtol(const char *cp, char **endp, unsigned int base)
+{
+ if (*cp == '-')
+ return -simple_strtoull(cp + 1, endp, base);
+
+ return simple_strtoull(cp, endp, base);
+}
+
/**
* strlen - Find the length of a string
* @s: The string to be sized
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 580b60f5ac83..637feed9a594 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1750,6 +1750,8 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
return ret;
}
+static struct attribute_group x86_pmu_attr_group;
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1813,6 +1815,14 @@ static int __init init_hw_perf_events(void)
x86_pmu_events_group.attrs = tmp;
}
+ if (x86_pmu.attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_attr_group.attrs = tmp;
+ }
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
@@ -2224,7 +2234,6 @@ void perf_check_microcode(void)
if (x86_pmu.check_microcode)
x86_pmu.check_microcode();
}
-EXPORT_SYMBOL_GPL(perf_check_microcode);
static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
@@ -2255,7 +2264,7 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event *event,
struct perf_event_mmap_page *userpg, u64 now)
{
- struct cyc2ns_data *data;
+ struct cyc2ns_data data;
u64 offset;
userpg->cap_user_time = 0;
@@ -2267,17 +2276,17 @@ void arch_perf_update_userpage(struct perf_event *event,
if (!using_native_sched_clock() || !sched_clock_stable())
return;
- data = cyc2ns_read_begin();
+ cyc2ns_read_begin(&data);
- offset = data->cyc2ns_offset + __sched_clock_offset;
+ offset = data.cyc2ns_offset + __sched_clock_offset;
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
- userpg->time_mult = data->cyc2ns_mul;
- userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_mult = data.cyc2ns_mul;
+ userpg->time_shift = data.cyc2ns_shift;
userpg->time_offset = offset - now;
/*
@@ -2289,7 +2298,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->time_zero = offset;
}
- cyc2ns_read_end(data);
+ cyc2ns_read_end();
}
void
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a6d91d4e37a1..de26a370176f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3160,6 +3160,19 @@ err:
return -ENOMEM;
}
+static void flip_smm_bit(void *data)
+{
+ unsigned long set = *(unsigned long *)data;
+
+ if (set > 0) {
+ msr_set_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ } else {
+ msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ }
+}
+
static void intel_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -3174,6 +3187,8 @@ static void intel_pmu_cpu_starting(int cpu)
cpuc->lbr_sel = NULL;
+ flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+
if (!cpuc->shared_regs)
return;
@@ -3410,12 +3425,10 @@ static void intel_snb_check_microcode(void)
int pebs_broken = 0;
int cpu;
- get_online_cpus();
for_each_online_cpu(cpu) {
if ((pebs_broken = intel_snb_pebs_broken(cpu)))
break;
}
- put_online_cpus();
if (pebs_broken == x86_pmu.pebs_broken)
return;
@@ -3488,7 +3501,9 @@ static bool check_msr(unsigned long msr, u64 mask)
static __init void intel_sandybridge_quirk(void)
{
x86_pmu.check_microcode = intel_snb_check_microcode;
+ cpus_read_lock();
intel_snb_check_microcode();
+ cpus_read_unlock();
}
static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
@@ -3595,6 +3610,52 @@ static struct attribute *hsw_events_attrs[] = {
NULL
};
+static ssize_t freeze_on_smi_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
+}
+
+static DEFINE_MUTEX(freeze_on_smi_mutex);
+
+static ssize_t freeze_on_smi_store(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 1)
+ return -EINVAL;
+
+ mutex_lock(&freeze_on_smi_mutex);
+
+ if (x86_pmu.attr_freeze_on_smi == val)
+ goto done;
+
+ x86_pmu.attr_freeze_on_smi = val;
+
+ get_online_cpus();
+ on_each_cpu(flip_smm_bit, &val, 1);
+ put_online_cpus();
+done:
+ mutex_unlock(&freeze_on_smi_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(freeze_on_smi);
+
+static struct attribute *intel_pmu_attrs[] = {
+ &dev_attr_freeze_on_smi.attr,
+ NULL,
+};
+
__init int intel_pmu_init(void)
{
union cpuid10_edx edx;
@@ -3641,6 +3702,8 @@ __init int intel_pmu_init(void)
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
+ x86_pmu.attrs = intel_pmu_attrs;
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
@@ -4112,13 +4175,12 @@ static __init int fixup_ht_bug(void)
lockup_detector_resume();
- get_online_cpus();
+ cpus_read_lock();
- for_each_online_cpu(c) {
+ for_each_online_cpu(c)
free_excl_cntrs(c);
- }
- put_online_cpus();
+ cpus_read_unlock();
pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
return 0;
}
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 8c00dc09a5d2..2521f771f2f5 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -1682,7 +1682,7 @@ static int __init intel_cqm_init(void)
*
* Also, check that the scales match on all cpus.
*/
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -1746,14 +1746,14 @@ static int __init intel_cqm_init(void)
* Setup the hot cpu notifier once we are sure cqm
* is enabled to avoid notifier leak.
*/
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING,
- "perf/x86/cqm:starting",
- intel_cqm_cpu_starting, NULL);
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online",
- NULL, intel_cqm_cpu_exit);
-
+ cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING,
+ "perf/x86/cqm:starting",
+ intel_cqm_cpu_starting, NULL);
+ cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE,
+ "perf/x86/cqm:online",
+ NULL, intel_cqm_cpu_exit);
out:
- put_online_cpus();
+ cpus_read_unlock();
if (ret) {
kfree(str);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index be3d36254040..53728eea1bed 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -562,6 +562,9 @@ struct x86_pmu {
ssize_t (*events_sysfs_show)(char *page, u64 config);
struct attribute **cpu_events;
+ unsigned long attr_freeze_on_smi;
+ struct attribute **attrs;
+
/*
* CPU Hotplug hooks
*/
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 00c88a01301d..da181ad1d5f8 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,6 +3,7 @@
#include <linux/ioport.h>
#include <linux/pci.h>
+#include <linux/refcount.h>
struct amd_nb_bus_dev_range {
u8 bus;
@@ -55,7 +56,7 @@ struct threshold_bank {
struct threshold_block *blocks;
/* initialized to the number of CPUs on the node sharing this bank */
- atomic_t cpus;
+ refcount_t cpus;
};
struct amd_northbridge {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d8638e2419cc..d406894cd9a2 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -137,6 +137,8 @@
#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
#define MSR_PEBS_FRONTEND 0x000003f7
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 27e9f9d769b8..2016962103df 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -29,11 +29,9 @@ struct cyc2ns_data {
u32 cyc2ns_mul;
u32 cyc2ns_shift;
u64 cyc2ns_offset;
- u32 __count;
- /* u32 hole */
-}; /* 24 bytes -- do not grow */
+}; /* 16 bytes */
-extern struct cyc2ns_data *cyc2ns_read_begin(void);
-extern void cyc2ns_read_end(struct cyc2ns_data *);
+extern void cyc2ns_read_begin(struct cyc2ns_data *);
+extern void cyc2ns_read_end(void);
#endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..01a6de16fb96
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,16 @@
+#ifndef _ARCH_X86_TLBBATCH_H
+#define _ARCH_X86_TLBBATCH_H
+
+#include <linux/cpumask.h>
+
+#ifdef CONFIG_SMP
+struct arch_tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed..
+ */
+ struct cpumask cpumask;
+};
+#endif
+
+#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6ed9ea469b48..8f6e2f87511b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -307,11 +307,15 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
+{
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+}
+
void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start, unsigned long end);
@@ -325,6 +329,14 @@ static inline void reset_lazy_tlbstate(void)
this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
}
+static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm)
+{
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+}
+
+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+
#endif /* SMP */
#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 347bb9f65737..247880fc29f9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
static struct irq_chip ioapic_chip, ioapic_ir_chip;
-#ifdef CONFIG_X86_32
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for_each_ioapic_pin(apic, pin) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
- return irq_trigger(idx);
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-#else
-static inline int IO_APIC_irq_trigger(int irq)
-{
- return 1;
-}
-#endif
-
static void __init setup_IO_APIC_irqs(void)
{
unsigned int ioapic, pin;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 6e4a047e4b68..d00f299f2ada 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
-static void get_smca_bank_info(unsigned int bank)
+static void smca_configure(unsigned int bank, unsigned int cpu)
{
- unsigned int i, hwid_mcatype, cpu = smp_processor_id();
+ unsigned int i, hwid_mcatype;
struct smca_hwid *s_hwid;
- u32 high, instance_id;
+ u32 high, low;
+ u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+ /* Set appropriate bits in MCA_CONFIG */
+ if (!rdmsr_safe(smca_config, &low, &high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+ * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+ */
+ high |= BIT(0);
+
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((low & BIT(5)) && !((high >> 5) & 0x3))
+ high |= BIT(5);
+
+ wrmsr(smca_config, low, high);
+ }
/* Collect bank_info using CPU 0 for now. */
if (cpu)
return;
- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
@@ -191,7 +222,7 @@ static void get_smca_bank_info(unsigned int bank)
smca_get_name(s_hwid->bank_type));
smca_banks[bank].hwid = s_hwid;
- smca_banks[bank].id = instance_id;
+ smca_banks[bank].id = low;
smca_banks[bank].sysfs_id = s_hwid->count++;
break;
}
@@ -433,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
{
unsigned int cpu = smp_processor_id();
- u32 smca_low, smca_high, smca_addr;
+ u32 smca_low, smca_high;
struct threshold_block b;
int new;
@@ -457,51 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
goto set_offset;
}
- smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
-
- if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
- /*
- * OS is required to set the MCAX bit to acknowledge that it is
- * now using the new MSR ranges and new registers under each
- * bank. It also means that the OS will configure deferred
- * errors in the new MCx_CONFIG register. If the bit is not set,
- * uncorrectable errors will cause a system panic.
- *
- * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
- */
- smca_high |= BIT(0);
-
- /*
- * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR}
- * registers with the option of additionally logging to
- * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set.
- *
- * This bit is usually set by BIOS to retain the old behavior
- * for OSes that don't use the new registers. Linux supports the
- * new registers so let's disable that additional logging here.
- *
- * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high
- * portion of the MSR).
- */
- smca_high &= ~BIT(2);
-
- /*
- * SMCA sets the Deferred Error Interrupt type per bank.
- *
- * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
- * if the DeferredIntType bit field is available.
- *
- * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
- * high portion of the MSR). OS should set this to 0x1 to enable
- * APIC based interrupt. First, check that no interrupt has been
- * set.
- */
- if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
- smca_high |= BIT(5);
-
- wrmsr(smca_addr, smca_low, smca_high);
- }
-
/* Gather LVT offset for thresholding: */
if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
goto out;
@@ -530,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
- get_smca_bank_info(bank);
+ smca_configure(bank, cpu);
for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(cpu, address, low, high, bank, block);
@@ -755,37 +741,19 @@ out_err:
}
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
-static void
-__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
+static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
- u32 msr_status = msr_ops.status(bank);
- u32 msr_addr = msr_ops.addr(bank);
struct mce m;
- u64 status;
-
- WARN_ON_ONCE(deferred_err && threshold_err);
-
- if (deferred_err && mce_flags.smca) {
- msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank);
- msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);
- }
-
- rdmsrl(msr_status, status);
-
- if (!(status & MCI_STATUS_VAL))
- return;
mce_setup(&m);
m.status = status;
+ m.misc = misc;
m.bank = bank;
m.tsc = rdtsc();
- if (threshold_err)
- m.misc = misc;
-
if (m.status & MCI_STATUS_ADDRV) {
- rdmsrl(msr_addr, m.addr);
+ m.addr = addr;
/*
* Extract [55:<lsb>] where lsb is the least significant
@@ -806,8 +774,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
}
mce_log(&m);
-
- wrmsrl(msr_status, 0);
}
static inline void __smp_deferred_error_interrupt(void)
@@ -832,45 +798,85 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
exiting_ack_irq();
}
-/* APIC interrupt handler for deferred errors */
-static void amd_deferred_error_interrupt(void)
+/*
+ * Returns true if the logged error is deferred. False, otherwise.
+ */
+static inline bool
+_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
{
- unsigned int bank;
- u32 msr_status;
- u64 status;
+ u64 status, addr = 0;
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank)
- : msr_ops.status(bank);
+ rdmsrl(msr_stat, status);
+ if (!(status & MCI_STATUS_VAL))
+ return false;
- rdmsrl(msr_status, status);
+ if (status & MCI_STATUS_ADDRV)
+ rdmsrl(msr_addr, addr);
- if (!(status & MCI_STATUS_VAL) ||
- !(status & MCI_STATUS_DEFERRED))
- continue;
+ __log_error(bank, status, addr, misc);
- __log_error(bank, true, false, 0);
- break;
- }
+ wrmsrl(status, 0);
+
+ return status & MCI_STATUS_DEFERRED;
}
/*
- * APIC Interrupt Handler
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ * log it.
*/
+static void log_error_deferred(unsigned int bank)
+{
+ bool defrd;
+
+ defrd = _log_error_bank(bank, msr_ops.status(bank),
+ msr_ops.addr(bank), 0);
+
+ if (!mce_flags.smca)
+ return;
+
+ /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
+ if (defrd) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ return;
+ }
+
+ /*
+ * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
+ * for a valid error.
+ */
+ _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
+ MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+ unsigned int bank;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank)
+ log_error_deferred(bank);
+}
+
+static void log_error_thresholding(unsigned int bank, u64 misc)
+{
+ _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+}
/*
- * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
- * the interrupt goes off when error_count reaches threshold_limit.
- * the handler will simply log mcelog w/ software defined bank number.
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
*/
-
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block, cpu = smp_processor_id();
struct thresh_restart tr;
- /* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
@@ -893,23 +899,18 @@ static void amd_threshold_interrupt(void)
(high & MASK_LOCKED_HI))
continue;
- /*
- * Log the machine check that caused the threshold
- * event.
- */
- if (high & MASK_OVERFLOW_HI)
- goto log;
- }
- }
- return;
+ if (!(high & MASK_OVERFLOW_HI))
+ continue;
-log:
- __log_error(bank, false, true, ((u64)high << 32) | low);
+ /* Log the MCE which caused the threshold event. */
+ log_error_thresholding(bank, ((u64)high << 32) | low);
- /* Reset threshold block after logging error. */
- memset(&tr, 0, sizeof(tr));
- tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
- threshold_restart_bank(&tr);
+ /* Reset threshold block after logging error. */
+ memset(&tr, 0, sizeof(tr));
+ tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+ threshold_restart_bank(&tr);
+ }
+ }
}
/*
@@ -1202,7 +1203,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
per_cpu(threshold_banks, cpu)[bank] = b;
- atomic_inc(&b->cpus);
+ refcount_inc(&b->cpus);
err = __threshold_add_blocks(b);
@@ -1225,7 +1226,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
per_cpu(threshold_banks, cpu)[bank] = b;
if (is_shared_bank(bank)) {
- atomic_set(&b->cpus, 1);
+ refcount_set(&b->cpus, 1);
/* nb is already initialized, see above */
if (nb) {
@@ -1289,7 +1290,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
goto free_out;
if (is_shared_bank(bank)) {
- if (!atomic_dec_and_test(&b->cpus)) {
+ if (!refcount_dec_and_test(&b->cpus)) {
__threshold_remove_blocks(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 45db4d2ebd01..e9f4d762aa5b 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -320,7 +320,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
{
@@ -338,8 +338,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
if (!desc.mc)
return -EINVAL;
- ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax),
- desc.data, desc.size);
+ ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
if (ret != UCODE_OK)
return -EINVAL;
@@ -675,7 +674,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
@@ -689,8 +688,8 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
#ifdef CONFIG_X86_32
/* save BSP's matching patch for early load */
- if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
- struct ucode_patch *p = find_patch(cpu);
+ if (save) {
+ struct ucode_patch *p = find_patch(0);
if (p) {
memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
@@ -722,11 +721,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
/* reload ucode container only on the boot cpu */
- if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+ if (!refresh_fw || !bsp)
return UCODE_OK;
if (c->x86 >= 0x15)
@@ -743,7 +743,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
goto fw_release;
}
- ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 2bce84d91c2b..c5bb63be4ba1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -807,10 +807,8 @@ void mtrr_save_state(void)
if (!mtrr_enabled())
return;
- get_online_cpus();
first_cpu = cpumask_first(cpu_online_mask);
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
- put_online_cpus();
}
void set_mtrr_aps_delayed_init(void)
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index c37bd0f39c70..ab4f491da2a9 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -105,11 +105,9 @@ static void __jump_label_transform(struct jump_entry *entry,
void arch_jump_label_transform(struct jump_entry *entry,
enum jump_label_type type)
{
- get_online_cpus();
mutex_lock(&text_mutex);
__jump_label_transform(entry, type, NULL, 0);
mutex_unlock(&text_mutex);
- put_online_cpus();
}
static enum {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ff40e74c9181..ffeae818aa7a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -78,7 +78,7 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
- smp_processor_id());
+ raw_smp_processor_id());
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f04479a8f74f..045e4f993bd2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 714dfba6a1e7..5270fc0c2df6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
struct cyc2ns {
- struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
- struct cyc2ns_data *head; /* 48 + 8 = 56 */
- struct cyc2ns_data *tail; /* 56 + 8 = 64 */
-}; /* exactly fits one cacheline */
-
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
- struct cyc2ns_data *head;
-
- preempt_disable();
-
- head = this_cpu_read(cyc2ns.head);
- /*
- * Ensure we observe the entry when we observe the pointer to it.
- * matches the wmb from cyc2ns_write_end().
- */
- smp_read_barrier_depends();
- head->__count++;
- barrier();
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
- return head;
-}
+}; /* fits one cacheline */
-void cyc2ns_read_end(struct cyc2ns_data *head)
-{
- barrier();
- /*
- * If we're the outer most nested read; update the tail pointer
- * when we're done. This notifies possible pending writers
- * that we've observed the head pointer and that the other
- * entry is now free.
- */
- if (!--head->__count) {
- /*
- * x86-TSO does not reorder writes with older reads;
- * therefore once this write becomes visible to another
- * cpu, we must be finished reading the cyc2ns_data.
- *
- * matches with cyc2ns_write_begin().
- */
- this_cpu_write(cyc2ns.tail, head);
- }
- preempt_enable();
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
- struct cyc2ns_data *data = c2n->data;
+ int seq, idx;
- if (data == c2n->head)
- data++;
+ preempt_disable_notrace();
- /* XXX send an IPI to @cpu in order to guarantee a read? */
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
- /*
- * When we observe the tail write from cyc2ns_read_end(),
- * the cpu must be done with that entry and its safe
- * to start writing to it.
- */
- while (c2n->tail == data)
- cpu_relax();
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
- return data;
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- /*
- * Ensure the @data writes are visible before we publish the
- * entry. Matches the data-depencency in cyc2ns_read_begin().
- */
- smp_wmb();
-
- ACCESS_ONCE(c2n->head) = data;
+ preempt_enable_notrace();
}
/*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
- data->__count = 0;
}
static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
- c2n->head = c2n->data;
- c2n->tail = c2n->data;
+ seqcount_init(&c2n->seq);
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data, *tail;
+ struct cyc2ns_data data;
unsigned long long ns;
- /*
- * See cyc2ns_read_*() for details; replicated in order to avoid
- * an extra few instructions that came with the abstraction.
- * Notable, it allows us to only do the __count and tail update
- * dance when its actually needed.
- */
-
- preempt_disable_notrace();
- data = this_cpu_read(cyc2ns.head);
- tail = this_cpu_read(cyc2ns.tail);
-
- if (likely(data == tail)) {
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
- } else {
- data->__count++;
-
- barrier();
-
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
- barrier();
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- if (!--data->__count)
- this_cpu_write(cyc2ns.tail, data);
- }
- preempt_enable_notrace();
+ cyc2ns_read_end();
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
- unsigned long long tsc_now, ns_now;
- struct cyc2ns_data *data;
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
unsigned long flags;
local_irq_save(flags);
@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
- data = cyc2ns_write_begin(cpu);
-
- tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
*/
- if (data->cyc2ns_shift == 32) {
- data->cyc2ns_shift = 31;
- data->cyc2ns_mul >>= 1;
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
}
- data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
- cyc2ns_write_end(cpu, data);
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
done:
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
return 1;
}
@@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
-
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
@@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
}
return 0;
@@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = {
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
};
void mark_tsc_unstable(char *reason)
@@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
+ int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
@@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Inform the TSC deadline clockevent devices about the recalibration */
lapic_update_tsc_freq();
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
- u64 lpj;
+ u64 lpj, cyc;
int cpu;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
@@ -1390,9 +1305,10 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
+ cyc = rdtsc();
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu, cyc);
}
if (tsc_disabled > 0)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6e7bedf69af7..743e4c6b4529 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -237,24 +237,26 @@ static void flush_tlb_func(void *info)
return;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_end == TLB_FLUSH_ALL) {
- local_flush_tlb();
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
- } else {
- unsigned long addr;
- unsigned long nr_pages =
- (f->flush_end - f->flush_start) / PAGE_SIZE;
- addr = f->flush_start;
- while (addr < f->flush_end) {
- __flush_tlb_single(addr);
- addr += PAGE_SIZE;
- }
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
- }
- } else
+
+ if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
leave_mm(smp_processor_id());
+ return;
+ }
+ if (f->flush_end == TLB_FLUSH_ALL) {
+ local_flush_tlb();
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
+ } else {
+ unsigned long addr;
+ unsigned long nr_pages =
+ (f->flush_end - f->flush_start) / PAGE_SIZE;
+ addr = f->flush_start;
+ while (addr < f->flush_end) {
+ __flush_tlb_single(addr);
+ addr += PAGE_SIZE;
+ }
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
+ }
}
void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -354,33 +356,6 @@ out:
preempt_enable();
}
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- preempt_disable();
-
- if (current->active_mm == mm) {
- if (current->mm) {
- /*
- * Implicit full barrier (INVLPG) that synchronizes
- * with switch_mm.
- */
- __flush_tlb_one(start);
- } else {
- leave_mm(smp_processor_id());
-
- /* Synchronize with switch_mm. */
- smp_mb();
- }
- }
-
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
-
- preempt_enable();
-}
-
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -420,6 +395,23 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
}
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
+ }
+
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
+ flush_tlb_others(&batch->cpumask, NULL, 0, TLB_FLUSH_ALL);
+ cpumask_clear(&batch->cpumask);
+
+ put_cpu();
+}
+
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 7e76a4d8304b..43b96f5f78ba 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -828,9 +828,11 @@ static void __init kexec_enter_virtual_mode(void)
/*
* We don't do virtual mode, since we don't do runtime services, on
- * non-native EFI
+ * non-native EFI. With efi=old_map, we don't do runtime services in
+ * kexec kernel because in the initial boot something else might
+ * have been mapped at these virtual addresses.
*/
- if (!efi_is_native()) {
+ if (!efi_is_native() || efi_enabled(EFI_OLD_MEMMAP)) {
efi_memmap_unmap();
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index c488625c9712..eb8dff15a7f6 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -71,11 +71,13 @@ static void __init early_code_mapping_set_exec(int executable)
pgd_t * __init efi_call_phys_prolog(void)
{
- unsigned long vaddress;
- pgd_t *save_pgd;
+ unsigned long vaddr, addr_pgd, addr_p4d, addr_pud;
+ pgd_t *save_pgd, *pgd_k, *pgd_efi;
+ p4d_t *p4d, *p4d_k, *p4d_efi;
+ pud_t *pud;
int pgd;
- int n_pgds;
+ int n_pgds, i, j;
if (!efi_enabled(EFI_OLD_MEMMAP)) {
save_pgd = (pgd_t *)read_cr3();
@@ -88,10 +90,49 @@ pgd_t * __init efi_call_phys_prolog(void)
n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
+ /*
+ * Build 1:1 identity mapping for efi=old_map usage. Note that
+ * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while
+ * it is PUD_SIZE ALIGNED with KASLR enabled. So for a given physical
+ * address X, the pud_index(X) != pud_index(__va(X)), we can only copy
+ * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping.
+ * This means here we can only reuse the PMD tables of the direct mapping.
+ */
for (pgd = 0; pgd < n_pgds; pgd++) {
- save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
- vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
- set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
+ addr_pgd = (unsigned long)(pgd * PGDIR_SIZE);
+ vaddr = (unsigned long)__va(pgd * PGDIR_SIZE);
+ pgd_efi = pgd_offset_k(addr_pgd);
+ save_pgd[pgd] = *pgd_efi;
+
+ p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd);
+ if (!p4d) {
+ pr_err("Failed to allocate p4d table!\n");
+ goto out;
+ }
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ addr_p4d = addr_pgd + i * P4D_SIZE;
+ p4d_efi = p4d + p4d_index(addr_p4d);
+
+ pud = pud_alloc(&init_mm, p4d_efi, addr_p4d);
+ if (!pud) {
+ pr_err("Failed to allocate pud table!\n");
+ goto out;
+ }
+
+ for (j = 0; j < PTRS_PER_PUD; j++) {
+ addr_pud = addr_p4d + j * PUD_SIZE;
+
+ if (addr_pud > (max_pfn << PAGE_SHIFT))
+ break;
+
+ vaddr = (unsigned long)__va(addr_pud);
+
+ pgd_k = pgd_offset_k(vaddr);
+ p4d_k = p4d_offset(pgd_k, vaddr);
+ pud[j] = *pud_offset(p4d_k, vaddr);
+ }
+ }
}
out:
__flush_tlb_all();
@@ -104,8 +145,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
/*
* After the lock is released, the original page table is restored.
*/
- int pgd_idx;
+ int pgd_idx, i;
int nr_pgds;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
if (!efi_enabled(EFI_OLD_MEMMAP)) {
write_cr3((unsigned long)save_pgd);
@@ -115,9 +159,28 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
- for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++)
+ for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) {
+ pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE);
set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+ continue;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ p4d = p4d_offset(pgd,
+ pgd_idx * PGDIR_SIZE + i * P4D_SIZE);
+
+ if (!(p4d_val(*p4d) & _PAGE_PRESENT))
+ continue;
+
+ pud = (pud_t *)p4d_page_vaddr(*p4d);
+ pud_free(&init_mm, pud);
+ }
+
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ p4d_free(&init_mm, p4d);
+ }
+
kfree(save_pgd);
__flush_tlb_all();
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 26615991d69c..e0cf95a83f3f 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -360,6 +360,9 @@ void __init efi_free_boot_services(void)
free_bootmem_late(start, size);
}
+ if (!num_entries)
+ return;
+
new_size = efi.memmap.desc_size * num_entries;
new_phys = efi_memmap_alloc(num_entries);
if (!new_phys) {
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 42e65fee5673..795671593528 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
*/
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long ns;
- ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
+ ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return ns;
}
@@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
*/
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
- struct cyc2ns_data *data = cyc2ns_read_begin();
+ struct cyc2ns_data data;
unsigned long long cyc;
- cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+ cyc2ns_read_begin(&data);
+ cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul;
+ cyc2ns_read_end();
- cyc2ns_read_end(data);
return cyc;
}
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d0855c09f32f..d2c8a9286fa8 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -89,14 +89,14 @@ bool ghes_disable;
module_param_named(disable, ghes_disable, bool, 0);
/*
- * All error sources notified with SCI shares one notifier function,
- * so they need to be linked and checked one by one. This is applied
- * to NMI too.
+ * All error sources notified with HED (Hardware Error Device) share a
+ * single notifier callback, so they need to be linked and checked one
+ * by one. This holds true for NMI too.
*
* RCU is used for these lists, so ghes_list_mutex is only used for
* list changing, not for traversing.
*/
-static LIST_HEAD(ghes_sci);
+static LIST_HEAD(ghes_hed);
static DEFINE_MUTEX(ghes_list_mutex);
/*
@@ -702,14 +702,14 @@ static irqreturn_t ghes_irq_func(int irq, void *data)
return IRQ_HANDLED;
}
-static int ghes_notify_sci(struct notifier_block *this,
- unsigned long event, void *data)
+static int ghes_notify_hed(struct notifier_block *this, unsigned long event,
+ void *data)
{
struct ghes *ghes;
int ret = NOTIFY_DONE;
rcu_read_lock();
- list_for_each_entry_rcu(ghes, &ghes_sci, list) {
+ list_for_each_entry_rcu(ghes, &ghes_hed, list) {
if (!ghes_proc(ghes))
ret = NOTIFY_OK;
}
@@ -718,8 +718,8 @@ static int ghes_notify_sci(struct notifier_block *this,
return ret;
}
-static struct notifier_block ghes_notifier_sci = {
- .notifier_call = ghes_notify_sci,
+static struct notifier_block ghes_notifier_hed = {
+ .notifier_call = ghes_notify_hed,
};
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
@@ -966,7 +966,10 @@ static int ghes_probe(struct platform_device *ghes_dev)
case ACPI_HEST_NOTIFY_POLLED:
case ACPI_HEST_NOTIFY_EXTERNAL:
case ACPI_HEST_NOTIFY_SCI:
+ case ACPI_HEST_NOTIFY_GSIV:
+ case ACPI_HEST_NOTIFY_GPIO:
break;
+
case ACPI_HEST_NOTIFY_NMI:
if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) {
pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n",
@@ -1024,13 +1027,17 @@ static int ghes_probe(struct platform_device *ghes_dev)
goto err_edac_unreg;
}
break;
+
case ACPI_HEST_NOTIFY_SCI:
+ case ACPI_HEST_NOTIFY_GSIV:
+ case ACPI_HEST_NOTIFY_GPIO:
mutex_lock(&ghes_list_mutex);
- if (list_empty(&ghes_sci))
- register_acpi_hed_notifier(&ghes_notifier_sci);
- list_add_rcu(&ghes->list, &ghes_sci);
+ if (list_empty(&ghes_hed))
+ register_acpi_hed_notifier(&ghes_notifier_hed);
+ list_add_rcu(&ghes->list, &ghes_hed);
mutex_unlock(&ghes_list_mutex);
break;
+
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_add(ghes);
break;
@@ -1066,14 +1073,18 @@ static int ghes_remove(struct platform_device *ghes_dev)
case ACPI_HEST_NOTIFY_EXTERNAL:
free_irq(ghes->irq, ghes);
break;
+
case ACPI_HEST_NOTIFY_SCI:
+ case ACPI_HEST_NOTIFY_GSIV:
+ case ACPI_HEST_NOTIFY_GPIO:
mutex_lock(&ghes_list_mutex);
list_del_rcu(&ghes->list);
- if (list_empty(&ghes_sci))
- unregister_acpi_hed_notifier(&ghes_notifier_sci);
+ if (list_empty(&ghes_hed))
+ unregister_acpi_hed_notifier(&ghes_notifier_hed);
mutex_unlock(&ghes_list_mutex);
synchronize_rcu();
break;
+
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_remove(ghes);
break;
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 919be0aa2578..240544253ccd 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -523,7 +523,7 @@ static int acpi_pci_root_add(struct acpi_device *device,
struct acpi_pci_root *root;
acpi_handle handle = device->handle;
int no_aspm = 0;
- bool hotadd = system_state != SYSTEM_BOOTING;
+ bool hotadd = system_state == SYSTEM_RUNNING;
root = kzalloc(sizeof(struct acpi_pci_root), GFP_KERNEL);
if (!root)
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 8697a82bd465..591d1dd3f04e 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -268,9 +268,9 @@ static int acpi_processor_start(struct device *dev)
return -ENODEV;
/* Protect against concurrent CPU hotplug operations */
- get_online_cpus();
+ cpu_hotplug_disable();
ret = __acpi_processor_start(device);
- put_online_cpus();
+ cpu_hotplug_enable();
return ret;
}
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index 3de34633f7f9..7f9aff4b8d62 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -909,6 +909,13 @@ static long __acpi_processor_get_throttling(void *data)
return pr->throttling.acpi_processor_get_throttling(pr);
}
+static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct)
+{
+ if (direct || (is_percpu_thread() && cpu == smp_processor_id()))
+ return fn(arg);
+ return work_on_cpu(cpu, fn, arg);
+}
+
static int acpi_processor_get_throttling(struct acpi_processor *pr)
{
if (!pr)
@@ -926,7 +933,7 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr)
if (!cpu_online(pr->id))
return -ENODEV;
- return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr);
+ return call_on_cpu(pr->id, __acpi_processor_get_throttling, pr, false);
}
static int acpi_processor_get_fadt_info(struct acpi_processor *pr)
@@ -1076,13 +1083,6 @@ static long acpi_processor_throttling_fn(void *data)
arg->target_state, arg->force);
}
-static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct)
-{
- if (direct)
- return fn(arg);
- return work_on_cpu(cpu, fn, arg);
-}
-
static int __acpi_processor_set_throttling(struct acpi_processor *pr,
int state, bool force, bool direct)
{
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5548f9686016..0440d95c9b5b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -377,7 +377,7 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
if (!pfn_valid_within(pfn))
return -1;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- if (system_state == SYSTEM_BOOTING)
+ if (system_state < SYSTEM_RUNNING)
return early_pfn_to_nid(pfn);
#endif
page = pfn_to_page(pfn);
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 31adbebf812e..2af70014ee5a 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -539,15 +539,6 @@ config HANGCHECK_TIMER
out to lunch past a certain margin. It can reboot the system
or merely print a warning.
-config MMTIMER
- tristate "MMTIMER Memory mapped RTC for SGI Altix"
- depends on IA64_GENERIC || IA64_SGI_SN2
- depends on POSIX_TIMERS
- default y
- help
- The mmtimer device allows direct userspace access to the
- Altix system timer.
-
config UV_MMTIMER
tristate "UV_MMTIMER Memory mapped RTC for SGI UV"
depends on X86_UV
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 6e6c244a66a0..53e33720818c 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -10,7 +10,6 @@ obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o
obj-$(CONFIG_RAW_DRIVER) += raw.o
obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
obj-$(CONFIG_MSPEC) += mspec.o
-obj-$(CONFIG_MMTIMER) += mmtimer.o
obj-$(CONFIG_UV_MMTIMER) += uv_mmtimer.o
obj-$(CONFIG_IBM_BSR) += bsr.o
obj-$(CONFIG_SGI_MBCS) += mbcs.o
diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
deleted file mode 100644
index 0e7fcb04f01e..000000000000
--- a/drivers/char/mmtimer.c
+++ /dev/null
@@ -1,858 +0,0 @@
-/*
- * Timer device implementation for SGI SN platforms.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (c) 2001-2006 Silicon Graphics, Inc. All rights reserved.
- *
- * This driver exports an API that should be supportable by any HPET or IA-PC
- * multimedia timer. The code below is currently specific to the SGI Altix
- * SHub RTC, however.
- *
- * 11/01/01 - jbarnes - initial revision
- * 9/10/04 - Christoph Lameter - remove interrupt support for kernel inclusion
- * 10/1/04 - Christoph Lameter - provide posix clock CLOCK_SGI_CYCLE
- * 10/13/04 - Christoph Lameter, Dimitri Sivanich - provide timer interrupt
- * support via the posix timer interface
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/ioctl.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/mmtimer.h>
-#include <linux/miscdevice.h>
-#include <linux/posix-timers.h>
-#include <linux/interrupt.h>
-#include <linux/time.h>
-#include <linux/math64.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-
-#include <linux/uaccess.h>
-#include <asm/sn/addrs.h>
-#include <asm/sn/intr.h>
-#include <asm/sn/shub_mmr.h>
-#include <asm/sn/nodepda.h>
-#include <asm/sn/shubio.h>
-
-MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
-MODULE_DESCRIPTION("SGI Altix RTC Timer");
-MODULE_LICENSE("GPL");
-
-/* name of the device, usually in /dev */
-#define MMTIMER_NAME "mmtimer"
-#define MMTIMER_DESC "SGI Altix RTC Timer"
-#define MMTIMER_VERSION "2.1"
-
-#define RTC_BITS 55 /* 55 bits for this implementation */
-
-static struct k_clock sgi_clock;
-
-extern unsigned long sn_rtc_cycles_per_second;
-
-#define RTC_COUNTER_ADDR ((long *)LOCAL_MMR_ADDR(SH_RTC))
-
-#define rtc_time() (*RTC_COUNTER_ADDR)
-
-static DEFINE_MUTEX(mmtimer_mutex);
-static long mmtimer_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg);
-static int mmtimer_mmap(struct file *file, struct vm_area_struct *vma);
-
-/*
- * Period in femtoseconds (10^-15 s)
- */
-static unsigned long mmtimer_femtoperiod = 0;
-
-static const struct file_operations mmtimer_fops = {
- .owner = THIS_MODULE,
- .mmap = mmtimer_mmap,
- .unlocked_ioctl = mmtimer_ioctl,
- .llseek = noop_llseek,
-};
-
-/*
- * We only have comparison registers RTC1-4 currently available per
- * node. RTC0 is used by SAL.
- */
-/* Check for an RTC interrupt pending */
-static int mmtimer_int_pending(int comparator)
-{
- if (HUB_L((unsigned long *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED)) &
- SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator)
- return 1;
- else
- return 0;
-}
-
-/* Clear the RTC interrupt pending bit */
-static void mmtimer_clr_int_pending(int comparator)
-{
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED_ALIAS),
- SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator);
-}
-
-/* Setup timer on comparator RTC1 */
-static void mmtimer_setup_int_0(int cpu, u64 expires)
-{
- u64 val;
-
- /* Disable interrupt */
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), 0UL);
-
- /* Initialize comparator value */
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPB), -1L);
-
- /* Clear pending bit */
- mmtimer_clr_int_pending(0);
-
- val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC1_INT_CONFIG_IDX_SHFT) |
- ((u64)cpu_physical_id(cpu) <<
- SH_RTC1_INT_CONFIG_PID_SHFT);
-
- /* Set configuration */
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_CONFIG), val);
-
- /* Enable RTC interrupts */
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), 1UL);
-
- /* Initialize comparator value */
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPB), expires);
-
-
-}
-
-/* Setup timer on comparator RTC2 */
-static void mmtimer_setup_int_1(int cpu, u64 expires)
-{
- u64 val;
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), 0UL);
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPC), -1L);
-
- mmtimer_clr_int_pending(1);
-
- val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC2_INT_CONFIG_IDX_SHFT) |
- ((u64)cpu_physical_id(cpu) <<
- SH_RTC2_INT_CONFIG_PID_SHFT);
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_CONFIG), val);
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), 1UL);
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPC), expires);
-}
-
-/* Setup timer on comparator RTC3 */
-static void mmtimer_setup_int_2(int cpu, u64 expires)
-{
- u64 val;
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), 0UL);
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPD), -1L);
-
- mmtimer_clr_int_pending(2);
-
- val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC3_INT_CONFIG_IDX_SHFT) |
- ((u64)cpu_physical_id(cpu) <<
- SH_RTC3_INT_CONFIG_PID_SHFT);
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_CONFIG), val);
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), 1UL);
-
- HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPD), expires);
-}
-
-/*
- * This function must be called with interrupts disabled and preemption off
- * in order to insure that the setup succeeds in a deterministic time frame.
- * It will check if the interrupt setup succeeded.
- */
-static int mmtimer_setup(int cpu, int comparator, unsigned long expires,
- u64 *set_completion_time)
-{
- switch (comparator) {
- case 0:
- mmtimer_setup_int_0(cpu, expires);
- break;
- case 1:
- mmtimer_setup_int_1(cpu, expires);
- break;
- case 2:
- mmtimer_setup_int_2(cpu, expires);
- break;
- }
- /* We might've missed our expiration time */
- *set_completion_time = rtc_time();
- if (*set_completion_time <= expires)
- return 1;
-
- /*
- * If an interrupt is already pending then its okay
- * if not then we failed
- */
- return mmtimer_int_pending(comparator);
-}
-
-static int mmtimer_disable_int(long nasid, int comparator)
-{
- switch (comparator) {
- case 0:
- nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE),
- 0UL) : REMOTE_HUB_S(nasid, SH_RTC1_INT_ENABLE, 0UL);
- break;
- case 1:
- nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE),
- 0UL) : REMOTE_HUB_S(nasid, SH_RTC2_INT_ENABLE, 0UL);
- break;
- case 2:
- nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE),
- 0UL) : REMOTE_HUB_S(nasid, SH_RTC3_INT_ENABLE, 0UL);
- break;
- default:
- return -EFAULT;
- }
- return 0;
-}
-
-#define COMPARATOR 1 /* The comparator to use */
-
-#define TIMER_OFF 0xbadcabLL /* Timer is not setup */
-#define TIMER_SET 0 /* Comparator is set for this timer */
-
-#define MMTIMER_INTERVAL_RETRY_INCREMENT_DEFAULT 40
-
-/* There is one of these for each timer */
-struct mmtimer {
- struct rb_node list;
- struct k_itimer *timer;
- int cpu;
-};
-
-struct mmtimer_node {
- spinlock_t lock ____cacheline_aligned;
- struct rb_root timer_head;
- struct rb_node *next;
- struct tasklet_struct tasklet;
-};
-static struct mmtimer_node *timers;
-
-static unsigned mmtimer_interval_retry_increment =
- MMTIMER_INTERVAL_RETRY_INCREMENT_DEFAULT;
-module_param(mmtimer_interval_retry_increment, uint, 0644);
-MODULE_PARM_DESC(mmtimer_interval_retry_increment,
- "RTC ticks to add to expiration on interval retry (default 40)");
-
-/*
- * Add a new mmtimer struct to the node's mmtimer list.
- * This function assumes the struct mmtimer_node is locked.
- */
-static void mmtimer_add_list(struct mmtimer *n)
-{
- int nodeid = n->timer->it.mmtimer.node;
- unsigned long expires = n->timer->it.mmtimer.expires;
- struct rb_node **link = &timers[nodeid].timer_head.rb_node;
- struct rb_node *parent = NULL;
- struct mmtimer *x;
-
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- x = rb_entry(parent, struct mmtimer, list);
-
- if (expires < x->timer->it.mmtimer.expires)
- link = &(*link)->rb_left;
- else
- link = &(*link)->rb_right;
- }
-
- /*
- * Insert the timer to the rbtree and check whether it
- * replaces the first pending timer
- */
- rb_link_node(&n->list, parent, link);
- rb_insert_color(&n->list, &timers[nodeid].timer_head);
-
- if (!timers[nodeid].next || expires < rb_entry(timers[nodeid].next,
- struct mmtimer, list)->timer->it.mmtimer.expires)
- timers[nodeid].next = &n->list;
-}
-
-/*
- * Set the comparator for the next timer.
- * This function assumes the struct mmtimer_node is locked.
- */
-static void mmtimer_set_next_timer(int nodeid)
-{
- struct mmtimer_node *n = &timers[nodeid];
- struct mmtimer *x;
- struct k_itimer *t;
- u64 expires, exp, set_completion_time;
- int i;
-
-restart:
- if (n->next == NULL)
- return;
-
- x = rb_entry(n->next, struct mmtimer, list);
- t = x->timer;
- if (!t->it.mmtimer.incr) {
- /* Not an interval timer */
- if (!mmtimer_setup(x->cpu, COMPARATOR,
- t->it.mmtimer.expires,
- &set_completion_time)) {
- /* Late setup, fire now */
- tasklet_schedule(&n->tasklet);
- }
- return;
- }
-
- /* Interval timer */
- i = 0;
- expires = exp = t->it.mmtimer.expires;
- while (!mmtimer_setup(x->cpu, COMPARATOR, expires,
- &set_completion_time)) {
- int to;
-
- i++;
- expires = set_completion_time +
- mmtimer_interval_retry_increment + (1 << i);
- /* Calculate overruns as we go. */
- to = ((u64)(expires - exp) / t->it.mmtimer.incr);
- if (to) {
- t->it_overrun += to;
- t->it.mmtimer.expires += t->it.mmtimer.incr * to;
- exp = t->it.mmtimer.expires;
- }
- if (i > 20) {
- printk(KERN_ALERT "mmtimer: cannot reschedule timer\n");
- t->it.mmtimer.clock = TIMER_OFF;
- n->next = rb_next(&x->list);
- rb_erase(&x->list, &n->timer_head);
- kfree(x);
- goto restart;
- }
- }
-}
-
-/**
- * mmtimer_ioctl - ioctl interface for /dev/mmtimer
- * @file: file structure for the device
- * @cmd: command to execute
- * @arg: optional argument to command
- *
- * Executes the command specified by @cmd. Returns 0 for success, < 0 for
- * failure.
- *
- * Valid commands:
- *
- * %MMTIMER_GETOFFSET - Should return the offset (relative to the start
- * of the page where the registers are mapped) for the counter in question.
- *
- * %MMTIMER_GETRES - Returns the resolution of the clock in femto (10^-15)
- * seconds
- *
- * %MMTIMER_GETFREQ - Copies the frequency of the clock in Hz to the address
- * specified by @arg
- *
- * %MMTIMER_GETBITS - Returns the number of bits in the clock's counter
- *
- * %MMTIMER_MMAPAVAIL - Returns 1 if the registers can be mmap'd into userspace
- *
- * %MMTIMER_GETCOUNTER - Gets the current value in the counter and places it
- * in the address specified by @arg.
- */
-static long mmtimer_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- int ret = 0;
-
- mutex_lock(&mmtimer_mutex);
-
- switch (cmd) {
- case MMTIMER_GETOFFSET: /* offset of the counter */
- /*
- * SN RTC registers are on their own 64k page
- */
- if(PAGE_SIZE <= (1 << 16))
- ret = (((long)RTC_COUNTER_ADDR) & (PAGE_SIZE-1)) / 8;
- else
- ret = -ENOSYS;
- break;
-
- case MMTIMER_GETRES: /* resolution of the clock in 10^-15 s */
- if(copy_to_user((unsigned long __user *)arg,
- &mmtimer_femtoperiod, sizeof(unsigned long)))
- ret = -EFAULT;
- break;
-
- case MMTIMER_GETFREQ: /* frequency in Hz */
- if(copy_to_user((unsigned long __user *)arg,
- &sn_rtc_cycles_per_second,
- sizeof(unsigned long)))
- ret = -EFAULT;
- break;
-
- case MMTIMER_GETBITS: /* number of bits in the clock */
- ret = RTC_BITS;
- break;
-
- case MMTIMER_MMAPAVAIL: /* can we mmap the clock into userspace? */
- ret = (PAGE_SIZE <= (1 << 16)) ? 1 : 0;
- break;
-
- case MMTIMER_GETCOUNTER:
- if(copy_to_user((unsigned long __user *)arg,
- RTC_COUNTER_ADDR, sizeof(unsigned long)))
- ret = -EFAULT;
- break;
- default:
- ret = -ENOTTY;
- break;
- }
- mutex_unlock(&mmtimer_mutex);
- return ret;
-}
-
-/**
- * mmtimer_mmap - maps the clock's registers into userspace
- * @file: file structure for the device
- * @vma: VMA to map the registers into
- *
- * Calls remap_pfn_range() to map the clock's registers into
- * the calling process' address space.
- */
-static int mmtimer_mmap(struct file *file, struct vm_area_struct *vma)
-{
- unsigned long mmtimer_addr;
-
- if (vma->vm_end - vma->vm_start != PAGE_SIZE)
- return -EINVAL;
-
- if (vma->vm_flags & VM_WRITE)
- return -EPERM;
-
- if (PAGE_SIZE > (1 << 16))
- return -ENOSYS;
-
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
- mmtimer_addr = __pa(RTC_COUNTER_ADDR);
- mmtimer_addr &= ~(PAGE_SIZE - 1);
- mmtimer_addr &= 0xfffffffffffffffUL;
-
- if (remap_pfn_range(vma, vma->vm_start, mmtimer_addr >> PAGE_SHIFT,
- PAGE_SIZE, vma->vm_page_prot)) {
- printk(KERN_ERR "remap_pfn_range failed in mmtimer.c\n");
- return -EAGAIN;
- }
-
- return 0;
-}
-
-static struct miscdevice mmtimer_miscdev = {
- .minor = SGI_MMTIMER,
- .name = MMTIMER_NAME,
- .fops = &mmtimer_fops
-};
-
-static struct timespec sgi_clock_offset;
-static int sgi_clock_period;
-
-/*
- * Posix Timer Interface
- */
-
-static struct timespec sgi_clock_offset;
-static int sgi_clock_period;
-
-static int sgi_clock_get(clockid_t clockid, struct timespec64 *tp)
-{
- u64 nsec;
-
- nsec = rtc_time() * sgi_clock_period
- + sgi_clock_offset.tv_nsec;
- *tp = ns_to_timespec64(nsec);
- tp->tv_sec += sgi_clock_offset.tv_sec;
- return 0;
-};
-
-static int sgi_clock_set(const clockid_t clockid, const struct timespec64 *tp)
-{
-
- u64 nsec;
- u32 rem;
-
- nsec = rtc_time() * sgi_clock_period;
-
- sgi_clock_offset.tv_sec = tp->tv_sec - div_u64_rem(nsec, NSEC_PER_SEC, &rem);
-
- if (rem <= tp->tv_nsec)
- sgi_clock_offset.tv_nsec = tp->tv_sec - rem;
- else {
- sgi_clock_offset.tv_nsec = tp->tv_sec + NSEC_PER_SEC - rem;
- sgi_clock_offset.tv_sec--;
- }
- return 0;
-}
-
-/**
- * mmtimer_interrupt - timer interrupt handler
- * @irq: irq received
- * @dev_id: device the irq came from
- *
- * Called when one of the comarators matches the counter, This
- * routine will send signals to processes that have requested
- * them.
- *
- * This interrupt is run in an interrupt context
- * by the SHUB. It is therefore safe to locally access SHub
- * registers.
- */
-static irqreturn_t
-mmtimer_interrupt(int irq, void *dev_id)
-{
- unsigned long expires = 0;
- int result = IRQ_NONE;
- unsigned indx = cpu_to_node(smp_processor_id());
- struct mmtimer *base;
-
- spin_lock(&timers[indx].lock);
- base = rb_entry(timers[indx].next, struct mmtimer, list);
- if (base == NULL) {
- spin_unlock(&timers[indx].lock);
- return result;
- }
-
- if (base->cpu == smp_processor_id()) {
- if (base->timer)
- expires = base->timer->it.mmtimer.expires;
- /* expires test won't work with shared irqs */
- if ((mmtimer_int_pending(COMPARATOR) > 0) ||
- (expires && (expires <= rtc_time()))) {
- mmtimer_clr_int_pending(COMPARATOR);
- tasklet_schedule(&timers[indx].tasklet);
- result = IRQ_HANDLED;
- }
- }
- spin_unlock(&timers[indx].lock);
- return result;
-}
-
-static void mmtimer_tasklet(unsigned long data)
-{
- int nodeid = data;
- struct mmtimer_node *mn = &timers[nodeid];
- struct mmtimer *x;
- struct k_itimer *t;
- unsigned long flags;
-
- /* Send signal and deal with periodic signals */
- spin_lock_irqsave(&mn->lock, flags);
- if (!mn->next)
- goto out;
-
- x = rb_entry(mn->next, struct mmtimer, list);
- t = x->timer;
-
- if (t->it.mmtimer.clock == TIMER_OFF)
- goto out;
-
- t->it_overrun = 0;
-
- mn->next = rb_next(&x->list);
- rb_erase(&x->list, &mn->timer_head);
-
- if (posix_timer_event(t, 0) != 0)
- t->it_overrun++;
-
- if(t->it.mmtimer.incr) {
- t->it.mmtimer.expires += t->it.mmtimer.incr;
- mmtimer_add_list(x);
- } else {
- /* Ensure we don't false trigger in mmtimer_interrupt */
- t->it.mmtimer.clock = TIMER_OFF;
- t->it.mmtimer.expires = 0;
- kfree(x);
- }
- /* Set comparator for next timer, if there is one */
- mmtimer_set_next_timer(nodeid);
-
- t->it_overrun_last = t->it_overrun;
-out:
- spin_unlock_irqrestore(&mn->lock, flags);
-}
-
-static int sgi_timer_create(struct k_itimer *timer)
-{
- /* Insure that a newly created timer is off */
- timer->it.mmtimer.clock = TIMER_OFF;
- return 0;
-}
-
-/* This does not really delete a timer. It just insures
- * that the timer is not active
- *
- * Assumption: it_lock is already held with irq's disabled
- */
-static int sgi_timer_del(struct k_itimer *timr)
-{
- cnodeid_t nodeid = timr->it.mmtimer.node;
- unsigned long irqflags;
-
- spin_lock_irqsave(&timers[nodeid].lock, irqflags);
- if (timr->it.mmtimer.clock != TIMER_OFF) {
- unsigned long expires = timr->it.mmtimer.expires;
- struct rb_node *n = timers[nodeid].timer_head.rb_node;
- struct mmtimer *uninitialized_var(t);
- int r = 0;
-
- timr->it.mmtimer.clock = TIMER_OFF;
- timr->it.mmtimer.expires = 0;
-
- while (n) {
- t = rb_entry(n, struct mmtimer, list);
- if (t->timer == timr)
- break;
-
- if (expires < t->timer->it.mmtimer.expires)
- n = n->rb_left;
- else
- n = n->rb_right;
- }
-
- if (!n) {
- spin_unlock_irqrestore(&timers[nodeid].lock, irqflags);
- return 0;
- }
-
- if (timers[nodeid].next == n) {
- timers[nodeid].next = rb_next(n);
- r = 1;
- }
-
- rb_erase(n, &timers[nodeid].timer_head);
- kfree(t);
-
- if (r) {
- mmtimer_disable_int(cnodeid_to_nasid(nodeid),
- COMPARATOR);
- mmtimer_set_next_timer(nodeid);
- }
- }
- spin_unlock_irqrestore(&timers[nodeid].lock, irqflags);
- return 0;
-}
-
-/* Assumption: it_lock is already held with irq's disabled */
-static void sgi_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
-{
-
- if (timr->it.mmtimer.clock == TIMER_OFF) {
- cur_setting->it_interval.tv_nsec = 0;
- cur_setting->it_interval.tv_sec = 0;
- cur_setting->it_value.tv_nsec = 0;
- cur_setting->it_value.tv_sec =0;
- return;
- }
-
- cur_setting->it_interval = ns_to_timespec64(timr->it.mmtimer.incr * sgi_clock_period);
- cur_setting->it_value = ns_to_timespec64((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period);
-}
-
-
-static int sgi_timer_set(struct k_itimer *timr, int flags,
- struct itimerspec64 *new_setting,
- struct itimerspec64 *old_setting)
-{
- unsigned long when, period, irqflags;
- int err = 0;
- cnodeid_t nodeid;
- struct mmtimer *base;
- struct rb_node *n;
-
- if (old_setting)
- sgi_timer_get(timr, old_setting);
-
- sgi_timer_del(timr);
- when = timespec64_to_ns(&new_setting->it_value);
- period = timespec64_to_ns(&new_setting->it_interval);
-
- if (when == 0)
- /* Clear timer */
- return 0;
-
- base = kmalloc(sizeof(struct mmtimer), GFP_KERNEL);
- if (base == NULL)
- return -ENOMEM;
-
- if (flags & TIMER_ABSTIME) {
- struct timespec64 n;
- unsigned long now;
-
- getnstimeofday64(&n);
- now = timespec64_to_ns(&n);
- if (when > now)
- when -= now;
- else
- /* Fire the timer immediately */
- when = 0;
- }
-
- /*
- * Convert to sgi clock period. Need to keep rtc_time() as near as possible
- * to getnstimeofday() in order to be as faithful as possible to the time
- * specified.
- */
- when = (when + sgi_clock_period - 1) / sgi_clock_period + rtc_time();
- period = (period + sgi_clock_period - 1) / sgi_clock_period;
-
- /*
- * We are allocating a local SHub comparator. If we would be moved to another
- * cpu then another SHub may be local to us. Prohibit that by switching off
- * preemption.
- */
- preempt_disable();
-
- nodeid = cpu_to_node(smp_processor_id());
-
- /* Lock the node timer structure */
- spin_lock_irqsave(&timers[nodeid].lock, irqflags);
-
- base->timer = timr;
- base->cpu = smp_processor_id();
-
- timr->it.mmtimer.clock = TIMER_SET;
- timr->it.mmtimer.node = nodeid;
- timr->it.mmtimer.incr = period;
- timr->it.mmtimer.expires = when;
-
- n = timers[nodeid].next;
-
- /* Add the new struct mmtimer to node's timer list */
- mmtimer_add_list(base);
-
- if (timers[nodeid].next == n) {
- /* No need to reprogram comparator for now */
- spin_unlock_irqrestore(&timers[nodeid].lock, irqflags);
- preempt_enable();
- return err;
- }
-
- /* We need to reprogram the comparator */
- if (n)
- mmtimer_disable_int(cnodeid_to_nasid(nodeid), COMPARATOR);
-
- mmtimer_set_next_timer(nodeid);
-
- /* Unlock the node timer structure */
- spin_unlock_irqrestore(&timers[nodeid].lock, irqflags);
-
- preempt_enable();
-
- return err;
-}
-
-static int sgi_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
-{
- tp->tv_sec = 0;
- tp->tv_nsec = sgi_clock_period;
- return 0;
-}
-
-static struct k_clock sgi_clock = {
- .clock_set = sgi_clock_set,
- .clock_get = sgi_clock_get,
- .clock_getres = sgi_clock_getres,
- .timer_create = sgi_timer_create,
- .timer_set = sgi_timer_set,
- .timer_del = sgi_timer_del,
- .timer_get = sgi_timer_get
-};
-
-/**
- * mmtimer_init - device initialization routine
- *
- * Does initial setup for the mmtimer device.
- */
-static int __init mmtimer_init(void)
-{
- cnodeid_t node, maxn = -1;
-
- if (!ia64_platform_is("sn2"))
- return 0;
-
- /*
- * Sanity check the cycles/sec variable
- */
- if (sn_rtc_cycles_per_second < 100000) {
- printk(KERN_ERR "%s: unable to determine clock frequency\n",
- MMTIMER_NAME);
- goto out1;
- }
-
- mmtimer_femtoperiod = ((unsigned long)1E15 + sn_rtc_cycles_per_second /
- 2) / sn_rtc_cycles_per_second;
-
- if (request_irq(SGI_MMTIMER_VECTOR, mmtimer_interrupt, IRQF_PERCPU, MMTIMER_NAME, NULL)) {
- printk(KERN_WARNING "%s: unable to allocate interrupt.",
- MMTIMER_NAME);
- goto out1;
- }
-
- if (misc_register(&mmtimer_miscdev)) {
- printk(KERN_ERR "%s: failed to register device\n",
- MMTIMER_NAME);
- goto out2;
- }
-
- /* Get max numbered node, calculate slots needed */
- for_each_online_node(node) {
- maxn = node;
- }
- maxn++;
-
- /* Allocate list of node ptrs to mmtimer_t's */
- timers = kzalloc(sizeof(struct mmtimer_node)*maxn, GFP_KERNEL);
- if (!timers) {
- printk(KERN_ERR "%s: failed to allocate memory for device\n",
- MMTIMER_NAME);
- goto out3;
- }
-
- /* Initialize struct mmtimer's for each online node */
- for_each_online_node(node) {
- spin_lock_init(&timers[node].lock);
- tasklet_init(&timers[node].tasklet, mmtimer_tasklet,
- (unsigned long) node);
- }
-
- sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second;
- posix_timers_register_clock(CLOCK_SGI_CYCLE, &sgi_clock);
-
- printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION,
- sn_rtc_cycles_per_second/(unsigned long)1E6);
-
- return 0;
-
-out3:
- misc_deregister(&mmtimer_miscdev);
-out2:
- free_irq(SGI_MMTIMER_VECTOR, NULL);
-out1:
- return -1;
-}
-
-module_init(mmtimer_init);
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 0e3f6496524d..6001369f9aeb 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -887,7 +887,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
struct freq_attr *fattr = to_attr(attr);
ssize_t ret = -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
if (cpu_online(policy->cpu)) {
down_write(&policy->rwsem);
@@ -895,7 +895,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
up_write(&policy->rwsem);
}
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
@@ -2441,7 +2441,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
pr_debug("trying to register driver %s\n", driver_data->name);
/* Protect against concurrent CPU online/offline. */
- get_online_cpus();
+ cpus_read_lock();
write_lock_irqsave(&cpufreq_driver_lock, flags);
if (cpufreq_driver) {
@@ -2473,9 +2473,10 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
goto err_if_unreg;
}
- ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "cpufreq:online",
- cpuhp_cpufreq_online,
- cpuhp_cpufreq_offline);
+ ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
+ "cpufreq:online",
+ cpuhp_cpufreq_online,
+ cpuhp_cpufreq_offline);
if (ret < 0)
goto err_if_unreg;
hp_online = ret;
@@ -2493,7 +2494,7 @@ err_null_driver:
cpufreq_driver = NULL;
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
out:
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(cpufreq_register_driver);
@@ -2516,17 +2517,17 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
pr_debug("unregistering driver %s\n", driver->name);
/* Protect against concurrent cpu hotplug */
- get_online_cpus();
+ cpus_read_lock();
subsys_interface_unregister(&cpufreq_interface);
remove_boost_sysfs_file();
- cpuhp_remove_state_nocalls(hp_online);
+ cpuhp_remove_state_nocalls_cpuslocked(hp_online);
write_lock_irqsave(&cpufreq_driver_lock, flags);
cpufreq_driver = NULL;
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c
index 35dd4d7ffee0..b257fc7d5204 100644
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -226,7 +226,7 @@ static int pas_cpufreq_cpu_exit(struct cpufreq_policy *policy)
* We don't support CPU hotplug. Don't unmap after the system
* has already made it to a running state.
*/
- if (system_state != SYSTEM_BOOTING)
+ if (system_state >= SYSTEM_RUNNING)
return 0;
if (sdcasr_mapbase)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 2706be7ed334..60bb64f4329d 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -220,6 +220,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
entered_state = target_state->enter(dev, drv, index);
start_critical_timings();
+ sched_clock_idle_wakeup_event();
time_end = ns_to_ktime(local_clock());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
diff --git a/drivers/firmware/efi/efi-bgrt.c b/drivers/firmware/efi/efi-bgrt.c
index 04ca8764f0c0..8bf27323f7a3 100644
--- a/drivers/firmware/efi/efi-bgrt.c
+++ b/drivers/firmware/efi/efi-bgrt.c
@@ -36,6 +36,9 @@ void __init efi_bgrt_init(struct acpi_table_header *table)
if (acpi_disabled)
return;
+ if (!efi_enabled(EFI_BOOT))
+ return;
+
if (table->length < sizeof(bgrt_tab)) {
pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n",
table->length, sizeof(bgrt_tab));
diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c
index 8c34d50a4d80..959777ec8a77 100644
--- a/drivers/firmware/efi/libstub/secureboot.c
+++ b/drivers/firmware/efi/libstub/secureboot.c
@@ -16,10 +16,10 @@
/* BIOS variables */
static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
-static const efi_char16_t const efi_SecureBoot_name[] = {
+static const efi_char16_t efi_SecureBoot_name[] = {
'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0
};
-static const efi_char16_t const efi_SetupMode_name[] = {
+static const efi_char16_t efi_SetupMode_name[] = {
'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0
};
diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c
index a51b6b64ecdf..93ee8fc539be 100644
--- a/drivers/hwtracing/coresight/coresight-etm3x.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x.c
@@ -587,7 +587,7 @@ static void etm_disable_sysfs(struct coresight_device *csdev)
* after cpu online mask indicates the cpu is offline but before the
* DYING hotplug callback is serviced by the ETM driver.
*/
- get_online_cpus();
+ cpus_read_lock();
spin_lock(&drvdata->spinlock);
/*
@@ -597,7 +597,7 @@ static void etm_disable_sysfs(struct coresight_device *csdev)
smp_call_function_single(drvdata->cpu, etm_disable_hw, drvdata, 1);
spin_unlock(&drvdata->spinlock);
- put_online_cpus();
+ cpus_read_unlock();
dev_info(drvdata->dev, "ETM tracing disabled\n");
}
@@ -795,7 +795,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
drvdata->cpu = pdata ? pdata->cpu : 0;
- get_online_cpus();
+ cpus_read_lock();
etmdrvdata[drvdata->cpu] = drvdata;
if (smp_call_function_single(drvdata->cpu,
@@ -803,17 +803,17 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
dev_err(dev, "ETM arch init failed\n");
if (!etm_count++) {
- cpuhp_setup_state_nocalls(CPUHP_AP_ARM_CORESIGHT_STARTING,
- "arm/coresight:starting",
- etm_starting_cpu, etm_dying_cpu);
- ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
- "arm/coresight:online",
- etm_online_cpu, NULL);
+ cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ARM_CORESIGHT_STARTING,
+ "arm/coresight:starting",
+ etm_starting_cpu, etm_dying_cpu);
+ ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
+ "arm/coresight:online",
+ etm_online_cpu, NULL);
if (ret < 0)
goto err_arch_supported;
hp_online = ret;
}
- put_online_cpus();
+ cpus_read_unlock();
if (etm_arch_supported(drvdata->arch) == false) {
ret = -EINVAL;
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c
index d1340fb4e457..532adc9dd32a 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -371,7 +371,7 @@ static void etm4_disable_sysfs(struct coresight_device *csdev)
* after cpu online mask indicates the cpu is offline but before the
* DYING hotplug callback is serviced by the ETM driver.
*/
- get_online_cpus();
+ cpus_read_lock();
spin_lock(&drvdata->spinlock);
/*
@@ -381,7 +381,7 @@ static void etm4_disable_sysfs(struct coresight_device *csdev)
smp_call_function_single(drvdata->cpu, etm4_disable_hw, drvdata, 1);
spin_unlock(&drvdata->spinlock);
- put_online_cpus();
+ cpus_read_unlock();
dev_info(drvdata->dev, "ETM tracing disabled\n");
}
@@ -982,7 +982,7 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
drvdata->cpu = pdata ? pdata->cpu : 0;
- get_online_cpus();
+ cpus_read_lock();
etmdrvdata[drvdata->cpu] = drvdata;
if (smp_call_function_single(drvdata->cpu,
@@ -990,18 +990,18 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
dev_err(dev, "ETM arch init failed\n");
if (!etm4_count++) {
- cpuhp_setup_state_nocalls(CPUHP_AP_ARM_CORESIGHT_STARTING,
- "arm/coresight4:starting",
- etm4_starting_cpu, etm4_dying_cpu);
- ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
- "arm/coresight4:online",
- etm4_online_cpu, NULL);
+ cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ARM_CORESIGHT_STARTING,
+ "arm/coresight4:starting",
+ etm4_starting_cpu, etm4_dying_cpu);
+ ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
+ "arm/coresight4:online",
+ etm4_online_cpu, NULL);
if (ret < 0)
goto err_arch_supported;
hp_online = ret;
}
- put_online_cpus();
+ cpus_read_unlock();
if (etm4_arch_supported(drvdata->arch) == false) {
ret = -EINVAL;
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index fc2765ccdb57..8500deda9175 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4315,7 +4315,7 @@ int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
struct acpi_dmar_atsr *atsr;
struct dmar_atsr_unit *atsru;
- if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
+ if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
return 0;
atsr = container_of(hdr, struct acpi_dmar_atsr, header);
@@ -4565,7 +4565,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
struct acpi_dmar_atsr *atsr;
struct acpi_dmar_reserved_memory *rmrr;
- if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
+ if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
return 0;
list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 9f44ee8ea1bc..b8dcf44df441 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -103,7 +103,7 @@ static bool of_iommu_driver_present(struct device_node *np)
* it never will be. We don't want to defer indefinitely, nor attempt
* to dereference __iommu_of_table after it's been freed.
*/
- if (system_state > SYSTEM_BOOTING)
+ if (system_state >= SYSTEM_RUNNING)
return false;
return of_match_node(&__iommu_of_table, np);
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 192e7b681b96..fe6be6382505 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -320,10 +320,19 @@ static long local_pci_probe(void *_ddi)
return 0;
}
+static bool pci_physfn_is_probed(struct pci_dev *dev)
+{
+#ifdef CONFIG_PCI_IOV
+ return dev->is_virtfn && dev->physfn->is_probed;
+#else
+ return false;
+#endif
+}
+
static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
const struct pci_device_id *id)
{
- int error, node;
+ int error, node, cpu;
struct drv_dev_and_id ddi = { drv, dev, id };
/*
@@ -332,33 +341,27 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
* on the right node.
*/
node = dev_to_node(&dev->dev);
+ dev->is_probed = 1;
+
+ cpu_hotplug_disable();
/*
- * On NUMA systems, we are likely to call a PF probe function using
- * work_on_cpu(). If that probe calls pci_enable_sriov() (which
- * adds the VF devices via pci_bus_add_device()), we may re-enter
- * this function to call the VF probe function. Calling
- * work_on_cpu() again will cause a lockdep warning. Since VFs are
- * always on the same node as the PF, we can work around this by
- * avoiding work_on_cpu() when we're already on the correct node.
- *
- * Preemption is enabled, so it's theoretically unsafe to use
- * numa_node_id(), but even if we run the probe function on the
- * wrong node, it should be functionally correct.
+ * Prevent nesting work_on_cpu() for the case where a Virtual Function
+ * device is probed from work_on_cpu() of the Physical device.
*/
- if (node >= 0 && node != numa_node_id()) {
- int cpu;
-
- get_online_cpus();
+ if (node < 0 || node >= MAX_NUMNODES || !node_online(node) ||
+ pci_physfn_is_probed(dev))
+ cpu = nr_cpu_ids;
+ else
cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
- if (cpu < nr_cpu_ids)
- error = work_on_cpu(cpu, local_pci_probe, &ddi);
- else
- error = local_pci_probe(&ddi);
- put_online_cpus();
- } else
+
+ if (cpu < nr_cpu_ids)
+ error = work_on_cpu(cpu, local_pci_probe, &ddi);
+ else
error = local_pci_probe(&ddi);
+ dev->is_probed = 0;
+ cpu_hotplug_enable();
return error;
}
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 94f8038864b4..ed4c343d08c4 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -29,7 +29,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
-int __init parse_ras_param(char *str)
+static int __init parse_ras_param(char *str)
{
#ifdef CONFIG_RAS_CEC
parse_cec_param(str);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index c1ec8ee80924..9e35032351a0 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -190,6 +190,7 @@ static void do_poweroff(void)
{
switch (system_state) {
case SYSTEM_BOOTING:
+ case SYSTEM_SCHEDULING:
orderly_poweroff(true);
break;
case SYSTEM_RUNNING:
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f2b10d9ebd04..81490456c242 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -96,6 +96,7 @@ struct clocksource {
void (*suspend)(struct clocksource *cs);
void (*resume)(struct clocksource *cs);
void (*mark_unstable)(struct clocksource *cs);
+ void (*tick_stable)(struct clocksource *cs);
/* private: */
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f92081234afd..ca73bc1563f4 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -99,26 +99,32 @@ static inline void cpu_maps_update_done(void)
extern struct bus_type cpu_subsys;
#ifdef CONFIG_HOTPLUG_CPU
-/* Stop CPUs going up and down. */
-
-extern void cpu_hotplug_begin(void);
-extern void cpu_hotplug_done(void);
-extern void get_online_cpus(void);
-extern void put_online_cpus(void);
+extern void cpus_write_lock(void);
+extern void cpus_write_unlock(void);
+extern void cpus_read_lock(void);
+extern void cpus_read_unlock(void);
+extern void lockdep_assert_cpus_held(void);
extern void cpu_hotplug_disable(void);
extern void cpu_hotplug_enable(void);
void clear_tasks_mm_cpumask(int cpu);
int cpu_down(unsigned int cpu);
-#else /* CONFIG_HOTPLUG_CPU */
-
-static inline void cpu_hotplug_begin(void) {}
-static inline void cpu_hotplug_done(void) {}
-#define get_online_cpus() do { } while (0)
-#define put_online_cpus() do { } while (0)
-#define cpu_hotplug_disable() do { } while (0)
-#define cpu_hotplug_enable() do { } while (0)
-#endif /* CONFIG_HOTPLUG_CPU */
+#else /* CONFIG_HOTPLUG_CPU */
+
+static inline void cpus_write_lock(void) { }
+static inline void cpus_write_unlock(void) { }
+static inline void cpus_read_lock(void) { }
+static inline void cpus_read_unlock(void) { }
+static inline void lockdep_assert_cpus_held(void) { }
+static inline void cpu_hotplug_disable(void) { }
+static inline void cpu_hotplug_enable(void) { }
+#endif /* !CONFIG_HOTPLUG_CPU */
+
+/* Wrappers which go away once all code is converted */
+static inline void cpu_hotplug_begin(void) { cpus_write_lock(); }
+static inline void cpu_hotplug_done(void) { cpus_write_unlock(); }
+static inline void get_online_cpus(void) { cpus_read_lock(); }
+static inline void put_online_cpus(void) { cpus_read_unlock(); }
#ifdef CONFIG_PM_SLEEP_SMP
extern int freeze_secondary_cpus(int primary);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0f2a80377520..df3d2719a796 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -153,6 +153,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke,
int (*startup)(unsigned int cpu),
int (*teardown)(unsigned int cpu), bool multi_instance);
+int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name,
+ bool invoke,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance);
/**
* cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks
* @state: The state for which the calls are installed
@@ -171,6 +176,15 @@ static inline int cpuhp_setup_state(enum cpuhp_state state,
return __cpuhp_setup_state(state, name, true, startup, teardown, false);
}
+static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
+ const char *name,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu))
+{
+ return __cpuhp_setup_state_cpuslocked(state, name, true, startup,
+ teardown, false);
+}
+
/**
* cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the
* callbacks
@@ -191,6 +205,15 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
false);
}
+static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state,
+ const char *name,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu))
+{
+ return __cpuhp_setup_state_cpuslocked(state, name, false, startup,
+ teardown, false);
+}
+
/**
* cpuhp_setup_state_multi - Add callbacks for multi state
* @state: The state for which the calls are installed
@@ -217,6 +240,8 @@ static inline int cpuhp_setup_state_multi(enum cpuhp_state state,
int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
bool invoke);
+int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
+ struct hlist_node *node, bool invoke);
/**
* cpuhp_state_add_instance - Add an instance for a state and invoke startup
@@ -249,7 +274,15 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state,
return __cpuhp_state_add_instance(state, node, false);
}
+static inline int
+cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state,
+ struct hlist_node *node)
+{
+ return __cpuhp_state_add_instance_cpuslocked(state, node, false);
+}
+
void __cpuhp_remove_state(enum cpuhp_state state, bool invoke);
+void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke);
/**
* cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown
@@ -273,6 +306,11 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
__cpuhp_remove_state(state, false);
}
+static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state)
+{
+ __cpuhp_remove_state_cpuslocked(state, false);
+}
+
/**
* cpuhp_remove_multi_state - Remove hotplug multi state callback
* @state: The state for which the calls are removed
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 2404ad238c0b..4bf4479a3a80 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsigned int i, int node);
(cpu) = cpumask_next_zero((cpu), (mask)), \
(cpu) < nr_cpu_ids;)
+extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+
+/**
+ * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the cpumask poiter
+ * @start: the start location
+ *
+ * The implementation does not assume any bit in @mask is set (including @start).
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_wrap(cpu, mask, start) \
+ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \
+ (cpu) < nr_cpumask_bits; \
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
+
/**
* for_each_cpu_and - iterate over every cpu in both masks
* @cpu: the (optionally unsigned) integer iterator
@@ -276,6 +293,12 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
set_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
+static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
+{
+ __set_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
+
/**
* cpumask_clear_cpu - clear a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
@@ -286,6 +309,11 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
+static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+ __clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
/**
* cpumask_test_cpu - test for a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 13bc08aba704..1c91f26e2996 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -490,9 +490,13 @@ extern int root_mountflags;
extern bool early_boot_irqs_disabled;
-/* Values used for system_state */
+/*
+ * Values used for system_state. Ordering of the states must not be changed
+ * as code checks for <, <=, >, >= STATE.
+ */
extern enum system_states {
SYSTEM_BOOTING,
+ SYSTEM_SCHEDULING,
SYSTEM_RUNNING,
SYSTEM_HALT,
SYSTEM_POWER_OFF,
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 171baa90f6f6..d11738110a7a 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -110,6 +110,25 @@ static inline void init_llist_head(struct llist_head *list)
for ((pos) = (node); pos; (pos) = (pos)->next)
/**
+ * llist_for_each_safe - iterate over some deleted entries of a lock-less list
+ * safe against removal of list entry
+ * @pos: the &struct llist_node to use as a loop cursor
+ * @n: another &struct llist_node to use as temporary storage
+ * @node: the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each_safe(pos, n, node) \
+ for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))
+
+/**
* llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
* @pos: the type * to use as a loop cursor.
* @node: the fist entry of deleted list entries.
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 136dfdf63ba1..fc412fbd80bd 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -14,6 +14,10 @@
#include <asm/page.h>
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+#include <asm/tlbbatch.h>
+#endif
+
#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
@@ -67,12 +71,15 @@ struct page_frag {
struct tlbflush_unmap_batch {
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
- * Each bit set is a CPU that potentially has a TLB entry for one of
- * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+ * The arch code makes the following promise: generic code can modify a
+ * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
+ * needed barriers), then call arch_tlbbatch_flush(), and the entries
+ * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
+ * returns.
*/
- struct cpumask cpumask;
+ struct arch_tlbflush_unmap_batch arch;
- /* True if any bit in cpumask is set */
+ /* True if a flush is needed. */
bool flush_required;
/*
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 0f9e567d5e15..2f9c1f93b1ce 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -166,9 +166,6 @@ struct padata_instance {
extern struct padata_instance *padata_alloc_possible(
struct workqueue_struct *wq);
-extern struct padata_instance *padata_alloc(struct workqueue_struct *wq,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask);
extern void padata_free(struct padata_instance *pinst);
extern int padata_do_parallel(struct padata_instance *pinst,
struct padata_priv *padata, int cb_cpu);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8039f9f0ca05..58f1ab06c4e8 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -376,6 +376,7 @@ struct pci_dev {
unsigned int irq_managed:1;
unsigned int has_secondary_link:1;
unsigned int non_compliant_bars:1; /* broken BARs; ignore them */
+ unsigned int is_probed:1; /* device probing in progress */
pci_dev_flags_t dev_flags;
atomic_t enable_cnt; /* pci_enable_device has been called */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a635887f28..7d6aa29094b2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -801,6 +801,8 @@ struct perf_cpu_context {
struct list_head sched_cb_entry;
int sched_cb_usage;
+
+ int online;
};
struct perf_output_handle {
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 8c1e43ab14a9..34e893a75771 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -73,12 +73,6 @@ struct k_itimer {
} real;
struct cpu_timer_list cpu;
struct {
- unsigned int clock;
- unsigned int node;
- unsigned long incr;
- unsigned long expires;
- } mmtimer;
- struct {
struct alarm alarmtimer;
ktime_t interval;
} alarm;
@@ -105,10 +99,11 @@ struct k_clock {
struct itimerspec64 *cur_setting);
};
-extern struct k_clock clock_posix_cpu;
-extern struct k_clock clock_posix_dynamic;
-
-void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock);
+extern const struct k_clock clock_posix_cpu;
+extern const struct k_clock clock_posix_dynamic;
+extern const struct k_clock clock_process;
+extern const struct k_clock clock_thread;
+extern const struct k_clock alarm_clock;
/* function to call to trigger timer event */
int posix_timer_event(struct k_itimer *timr, int si_private);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b69fc650201..3dfa5f99d6ee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1265,6 +1265,16 @@ extern struct pid *cad_pid;
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
#define used_math() tsk_used_math(current)
+static inline bool is_percpu_thread(void)
+{
+#ifdef CONFIG_SMP
+ return (current->flags & PF_NO_SETAFFINITY) &&
+ (current->nr_cpus_allowed == 1);
+#else
+ return true;
+#endif
+}
+
/* Per-process atomic flags. */
#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */
#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 34fe92ce1ebd..a55600ffdf4b 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -23,10 +23,6 @@ extern u64 sched_clock_cpu(int cpu);
extern void sched_clock_init(void);
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline void sched_clock_init_late(void)
-{
-}
-
static inline void sched_clock_tick(void)
{
}
@@ -39,7 +35,7 @@ static inline void sched_clock_idle_sleep_event(void)
{
}
-static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
+static inline void sched_clock_idle_wakeup_event(void)
{
}
@@ -53,7 +49,6 @@ static inline u64 local_clock(void)
return sched_clock();
}
#else
-extern void sched_clock_init_late(void);
extern int sched_clock_stable(void);
extern void clear_sched_clock_stable(void);
@@ -63,10 +58,10 @@ extern void clear_sched_clock_stable(void);
*/
extern u64 __sched_clock_offset;
-
extern void sched_clock_tick(void);
+extern void sched_clock_tick_stable(void);
extern void sched_clock_idle_sleep_event(void);
-extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+extern void sched_clock_idle_wakeup_event(void);
/*
* As outlined in clock.c, provides a fast, high resolution, nanosecond
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 3cc9632dcc2a..3d60275e3ba9 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -116,15 +116,29 @@ static inline int try_stop_cpus(const struct cpumask *cpumask,
* @fn() runs.
*
* This can be thought of as a very heavy write lock, equivalent to
- * grabbing every spinlock in the kernel. */
+ * grabbing every spinlock in the kernel.
+ *
+ * Protects against CPU hotplug.
+ */
int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
+/**
+ * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function
+ * @fn: the function to run
+ * @data: the data ptr for the @fn()
+ * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
+ *
+ * Same as above. Must be called from with in a cpus_read_lock() protected
+ * region. Avoids nested calls to cpus_read_lock().
+ */
+int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
+
int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus);
#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
-static inline int stop_machine(cpu_stop_fn_t fn, void *data,
- const struct cpumask *cpus)
+static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
+ const struct cpumask *cpus)
{
unsigned long flags;
int ret;
@@ -134,6 +148,12 @@ static inline int stop_machine(cpu_stop_fn_t fn, void *data,
return ret;
}
+static inline int stop_machine(cpu_stop_fn_t fn, void *data,
+ const struct cpumask *cpus)
+{
+ return stop_machine_cpuslocked(fn, data, cpus);
+}
+
static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index e75e1b6ff27f..09299fcb842a 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -54,7 +54,11 @@ struct itimerval {
#define CLOCK_BOOTTIME 7
#define CLOCK_REALTIME_ALARM 8
#define CLOCK_BOOTTIME_ALARM 9
-#define CLOCK_SGI_CYCLE 10 /* Hardware specific */
+/*
+ * The driver implementing this got removed. The clock ID is kept as a
+ * place holder. Do not reuse!
+ */
+#define CLOCK_SGI_CYCLE 10
#define CLOCK_TAI 11
#define MAX_CLOCKS 16
diff --git a/init/main.c b/init/main.c
index f866510472d7..df58a416dd1d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -389,6 +389,7 @@ static __initdata DECLARE_COMPLETION(kthreadd_done);
static noinline void __ref rest_init(void)
{
+ struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
@@ -397,12 +398,32 @@ static noinline void __ref rest_init(void)
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
- kernel_thread(kernel_init, NULL, CLONE_FS);
+ pid = kernel_thread(kernel_init, NULL, CLONE_FS);
+ /*
+ * Pin init on the boot CPU. Task migration is not properly working
+ * until sched_init_smp() has been run. It will set the allowed
+ * CPUs for init to the non isolated CPUs.
+ */
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
+ rcu_read_unlock();
+
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
+
+ /*
+ * Enable might_sleep() and smp_processor_id() checks.
+ * They cannot be enabled earlier because with CONFIG_PRREMPT=y
+ * kernel_thread() would trigger might_sleep() splats. With
+ * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
+ * already, but it's stuck on the kthreadd_done completion.
+ */
+ system_state = SYSTEM_SCHEDULING;
+
complete(&kthreadd_done);
/*
@@ -1015,10 +1036,6 @@ static noinline void __init kernel_init_freeable(void)
* init can allocate pages on any node
*/
set_mems_allowed(node_states[N_MEMORY]);
- /*
- * init can run on any cpu.
- */
- set_cpus_allowed_ptr(current, cpu_all_mask);
cad_pid = task_pid(current);
diff --git a/kernel/async.c b/kernel/async.c
index d2edd6efec56..2cbd3dd5940d 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work)
ktime_t uninitialized_var(calltime), delta, rettime;
/* 1) run (and print duration) */
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
@@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
{
ktime_t uninitialized_var(starttime), delta, endtime;
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
wait_event(async_done, lowest_in_progress(domain) >= cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9ae6fbe5b5cf..7435ffc6163b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -27,6 +27,7 @@
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
+#include <linux/percpu-rwsem.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -65,6 +66,12 @@ struct cpuhp_cpu_state {
static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
+static struct lock_class_key cpuhp_state_key;
+static struct lockdep_map cpuhp_state_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+#endif
+
/**
* cpuhp_step - Hotplug state machine step
* @name: Name of the step
@@ -196,121 +203,41 @@ void cpu_maps_update_done(void)
mutex_unlock(&cpu_add_remove_lock);
}
-/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+/*
+ * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU
-static struct {
- struct task_struct *active_writer;
- /* wait queue to wake up the active_writer */
- wait_queue_head_t wq;
- /* verifies that no writer will get active while readers are active */
- struct mutex lock;
- /*
- * Also blocks the new readers during
- * an ongoing cpu hotplug operation.
- */
- atomic_t refcount;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} cpu_hotplug = {
- .active_writer = NULL,
- .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
- .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
-#endif
-};
-
-/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
-#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire_tryread() \
- lock_map_acquire_tryread(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
-#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
-
-void get_online_cpus(void)
+void cpus_read_lock(void)
{
- might_sleep();
- if (cpu_hotplug.active_writer == current)
- return;
- cpuhp_lock_acquire_read();
- mutex_lock(&cpu_hotplug.lock);
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
+ percpu_down_read(&cpu_hotplug_lock);
}
-EXPORT_SYMBOL_GPL(get_online_cpus);
+EXPORT_SYMBOL_GPL(cpus_read_lock);
-void put_online_cpus(void)
+void cpus_read_unlock(void)
{
- int refcount;
-
- if (cpu_hotplug.active_writer == current)
- return;
-
- refcount = atomic_dec_return(&cpu_hotplug.refcount);
- if (WARN_ON(refcount < 0)) /* try to fix things up */
- atomic_inc(&cpu_hotplug.refcount);
-
- if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
- wake_up(&cpu_hotplug.wq);
-
- cpuhp_lock_release();
-
+ percpu_up_read(&cpu_hotplug_lock);
}
-EXPORT_SYMBOL_GPL(put_online_cpus);
+EXPORT_SYMBOL_GPL(cpus_read_unlock);
-/*
- * This ensures that the hotplug operation can begin only when the
- * refcount goes to zero.
- *
- * Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- * writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- * non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
- */
-void cpu_hotplug_begin(void)
+void cpus_write_lock(void)
{
- DEFINE_WAIT(wait);
-
- cpu_hotplug.active_writer = current;
- cpuhp_lock_acquire();
+ percpu_down_write(&cpu_hotplug_lock);
+}
- for (;;) {
- mutex_lock(&cpu_hotplug.lock);
- prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
- if (likely(!atomic_read(&cpu_hotplug.refcount)))
- break;
- mutex_unlock(&cpu_hotplug.lock);
- schedule();
- }
- finish_wait(&cpu_hotplug.wq, &wait);
+void cpus_write_unlock(void)
+{
+ percpu_up_write(&cpu_hotplug_lock);
}
-void cpu_hotplug_done(void)
+void lockdep_assert_cpus_held(void)
{
- cpu_hotplug.active_writer = NULL;
- mutex_unlock(&cpu_hotplug.lock);
- cpuhp_lock_release();
+ percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
/*
@@ -344,8 +271,6 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
-/* Notifier wrappers for transitioning to state machine */
-
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
@@ -484,6 +409,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
st->should_run = false;
+ lock_map_acquire(&cpuhp_state_lock_map);
/* Single callback invocation for [un]install ? */
if (st->single) {
if (st->cb_state < CPUHP_AP_ONLINE) {
@@ -510,6 +436,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
else if (st->state > st->target)
ret = cpuhp_ap_offline(cpu, st);
}
+ lock_map_release(&cpuhp_state_lock_map);
st->result = ret;
complete(&st->done);
}
@@ -524,6 +451,9 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
if (!cpu_online(cpu))
return 0;
+ lock_map_acquire(&cpuhp_state_lock_map);
+ lock_map_release(&cpuhp_state_lock_map);
+
/*
* If we are up and running, use the hotplug thread. For early calls
* we invoke the thread function directly.
@@ -567,6 +497,8 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
enum cpuhp_state state = st->state;
trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
+ lock_map_acquire(&cpuhp_state_lock_map);
+ lock_map_release(&cpuhp_state_lock_map);
__cpuhp_kick_ap_work(st);
wait_for_completion(&st->done);
trace_cpuhp_exit(cpu, st->state, state, st->result);
@@ -701,7 +633,7 @@ static int takedown_cpu(unsigned int cpu)
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
- err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
+ err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
if (err) {
/* CPU refused to die */
irq_unlock_sparse();
@@ -773,7 +705,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
if (!cpu_present(cpu))
return -EINVAL;
- cpu_hotplug_begin();
+ cpus_write_lock();
cpuhp_tasks_frozen = tasks_frozen;
@@ -811,7 +743,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
}
out:
- cpu_hotplug_done();
+ cpus_write_unlock();
return ret;
}
@@ -893,7 +825,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
struct task_struct *idle;
int ret = 0;
- cpu_hotplug_begin();
+ cpus_write_lock();
if (!cpu_present(cpu)) {
ret = -EINVAL;
@@ -941,7 +873,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
target = min((int)target, CPUHP_BRINGUP_CPU);
ret = cpuhp_up_callbacks(cpu, st, target);
out:
- cpu_hotplug_done();
+ cpus_write_unlock();
return ret;
}
@@ -1413,18 +1345,20 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
}
}
-int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
- bool invoke)
+int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
+ struct hlist_node *node,
+ bool invoke)
{
struct cpuhp_step *sp;
int cpu;
int ret;
+ lockdep_assert_cpus_held();
+
sp = cpuhp_get_step(state);
if (sp->multi_instance == false)
return -EINVAL;
- get_online_cpus();
mutex_lock(&cpuhp_state_mutex);
if (!invoke || !sp->startup.multi)
@@ -1453,13 +1387,23 @@ add_node:
hlist_add_head(node, &sp->list);
unlock:
mutex_unlock(&cpuhp_state_mutex);
- put_online_cpus();
+ return ret;
+}
+
+int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
+ bool invoke)
+{
+ int ret;
+
+ cpus_read_lock();
+ ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
+ cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
/**
- * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state
+ * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
* @state: The state to setup
* @invoke: If true, the startup function is invoked for cpus where
* cpu state >= @state
@@ -1468,25 +1412,27 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
* @multi_instance: State is set up for multiple instances which get
* added afterwards.
*
+ * The caller needs to hold cpus read locked while calling this function.
* Returns:
* On success:
* Positive state number if @state is CPUHP_AP_ONLINE_DYN
* 0 for all other states
* On failure: proper (negative) error code
*/
-int __cpuhp_setup_state(enum cpuhp_state state,
- const char *name, bool invoke,
- int (*startup)(unsigned int cpu),
- int (*teardown)(unsigned int cpu),
- bool multi_instance)
+int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
+ const char *name, bool invoke,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
{
int cpu, ret = 0;
bool dynstate;
+ lockdep_assert_cpus_held();
+
if (cpuhp_cb_check(state) || !name)
return -EINVAL;
- get_online_cpus();
mutex_lock(&cpuhp_state_mutex);
ret = cpuhp_store_callbacks(state, name, startup, teardown,
@@ -1522,7 +1468,6 @@ int __cpuhp_setup_state(enum cpuhp_state state,
}
out:
mutex_unlock(&cpuhp_state_mutex);
- put_online_cpus();
/*
* If the requested state is CPUHP_AP_ONLINE_DYN, return the
* dynamically allocated state in case of success.
@@ -1531,6 +1476,22 @@ out:
return state;
return ret;
}
+EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);
+
+int __cpuhp_setup_state(enum cpuhp_state state,
+ const char *name, bool invoke,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
+{
+ int ret;
+
+ cpus_read_lock();
+ ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
+ teardown, multi_instance);
+ cpus_read_unlock();
+ return ret;
+}
EXPORT_SYMBOL(__cpuhp_setup_state);
int __cpuhp_state_remove_instance(enum cpuhp_state state,
@@ -1544,7 +1505,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
if (!sp->multi_instance)
return -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&cpuhp_state_mutex);
if (!invoke || !cpuhp_get_teardown_cb(state))
@@ -1565,29 +1526,30 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
remove:
hlist_del(node);
mutex_unlock(&cpuhp_state_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
/**
- * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state
+ * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
* @state: The state to remove
* @invoke: If true, the teardown function is invoked for cpus where
* cpu state >= @state
*
+ * The caller needs to hold cpus read locked while calling this function.
* The teardown callback is currently not allowed to fail. Think
* about module removal!
*/
-void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
{
struct cpuhp_step *sp = cpuhp_get_step(state);
int cpu;
BUG_ON(cpuhp_cb_check(state));
- get_online_cpus();
+ lockdep_assert_cpus_held();
mutex_lock(&cpuhp_state_mutex);
if (sp->multi_instance) {
@@ -1615,7 +1577,14 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
remove:
cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
mutex_unlock(&cpuhp_state_mutex);
- put_online_cpus();
+}
+EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);
+
+void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+{
+ cpus_read_lock();
+ __cpuhp_remove_state_cpuslocked(state, invoke);
+ cpus_read_unlock();
}
EXPORT_SYMBOL(__cpuhp_remove_state);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6e75a5c9412d..8d6acaeeea17 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
+static cpumask_var_t perf_online_mask;
/*
* perf event paranoia level:
@@ -3812,14 +3813,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
- /*
- * We could be clever and allow to attach a event to an
- * offline CPU and activate it when the CPU comes up, but
- * that's for later.
- */
- if (!cpu_online(cpu))
- return ERR_PTR(-ENODEV);
-
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
@@ -7703,7 +7696,8 @@ static int swevent_hlist_get_cpu(int cpu)
int err = 0;
mutex_lock(&swhash->hlist_mutex);
- if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+ if (!swevent_hlist_deref(swhash) &&
+ cpumask_test_cpu(cpu, perf_online_mask)) {
struct swevent_hlist *hlist;
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -7724,7 +7718,7 @@ static int swevent_hlist_get(void)
{
int err, cpu, failed_cpu;
- get_online_cpus();
+ mutex_lock(&pmus_lock);
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(cpu);
if (err) {
@@ -7732,8 +7726,7 @@ static int swevent_hlist_get(void)
goto fail;
}
}
- put_online_cpus();
-
+ mutex_unlock(&pmus_lock);
return 0;
fail:
for_each_possible_cpu(cpu) {
@@ -7741,8 +7734,7 @@ fail:
break;
swevent_hlist_put_cpu(cpu);
}
-
- put_online_cpus();
+ mutex_unlock(&pmus_lock);
return err;
}
@@ -8920,7 +8912,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
pmu->hrtimer_interval_ms = timer;
/* update all cpuctx for this PMU */
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
@@ -8929,7 +8921,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
cpu_function_call(cpu,
(remote_function_f)perf_mux_hrtimer_restart, cpuctx);
}
- put_online_cpus();
+ cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
return count;
@@ -9059,6 +9051,7 @@ skip_type:
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
__perf_mux_hrtimer_init(cpuctx, cpu);
}
@@ -9172,7 +9165,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
- struct pmu *pmu = NULL;
+ struct pmu *pmu;
int idx;
int ret;
@@ -9456,9 +9449,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
pmu = perf_init_event(event);
- if (!pmu)
- goto err_ns;
- else if (IS_ERR(pmu)) {
+ if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
goto err_ns;
}
@@ -9471,8 +9462,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
sizeof(unsigned long),
GFP_KERNEL);
- if (!event->addr_filters_offs)
+ if (!event->addr_filters_offs) {
+ err = -ENOMEM;
goto err_per_task;
+ }
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
@@ -9882,12 +9875,10 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
- get_online_cpus();
-
if (task) {
err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
if (err)
- goto err_cpus;
+ goto err_cred;
/*
* Reuse ptrace permission checks for now.
@@ -10073,6 +10064,23 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_locked;
}
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx =
+ container_of(ctx, struct perf_cpu_context, ctx);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
+ }
+
+
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
* because we need to serialize with concurrent event creation.
@@ -10162,8 +10170,6 @@ SYSCALL_DEFINE5(perf_event_open,
put_task_struct(task);
}
- put_online_cpus();
-
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
@@ -10197,8 +10203,6 @@ err_alloc:
err_cred:
if (task)
mutex_unlock(&task->signal->cred_guard_mutex);
-err_cpus:
- put_online_cpus();
err_task:
if (task)
put_task_struct(task);
@@ -10253,6 +10257,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx =
+ container_of(ctx, struct perf_cpu_context, ctx);
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_unlock;
+ }
+ }
+
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
goto err_unlock;
@@ -10920,6 +10939,8 @@ static void __init perf_event_init_all_cpus(void)
struct swevent_htable *swhash;
int cpu;
+ zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
@@ -10935,7 +10956,7 @@ static void __init perf_event_init_all_cpus(void)
}
}
-int perf_event_init_cpu(unsigned int cpu)
+void perf_swevent_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -10948,7 +10969,6 @@ int perf_event_init_cpu(unsigned int cpu)
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
mutex_unlock(&swhash->hlist_mutex);
- return 0;
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10966,19 +10986,22 @@ static void __perf_event_exit_context(void *__info)
static void perf_event_exit_cpu_context(int cpu)
{
+ struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
struct pmu *pmu;
- int idx;
- idx = srcu_read_lock(&pmus_srcu);
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+ mutex_lock(&pmus_lock);
+ list_for_each_entry(pmu, &pmus, entry) {
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
mutex_lock(&ctx->mutex);
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
}
- srcu_read_unlock(&pmus_srcu, idx);
+ cpumask_clear_cpu(cpu, perf_online_mask);
+ mutex_unlock(&pmus_lock);
}
#else
@@ -10986,6 +11009,29 @@ static void perf_event_exit_cpu_context(int cpu) { }
#endif
+int perf_event_init_cpu(unsigned int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+
+ perf_swevent_init_cpu(cpu);
+
+ mutex_lock(&pmus_lock);
+ cpumask_set_cpu(cpu, perf_online_mask);
+ list_for_each_entry(pmu, &pmus, entry) {
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
+ }
+ mutex_unlock(&pmus_lock);
+
+ return 0;
+}
+
int perf_event_exit_cpu(unsigned int cpu)
{
perf_event_exit_cpu_context(cpu);
diff --git a/kernel/extable.c b/kernel/extable.c
index 2676d7f8baf6..0fbdd8582f08 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr)
addr < (unsigned long)_etext)
return 1;
- if (system_state == SYSTEM_BOOTING &&
+ if (system_state < SYSTEM_RUNNING &&
init_kernel_text(addr))
return 1;
return 0;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 22e443133987..c5bbaed5a5e6 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -480,7 +480,8 @@ int __init early_irq_init(void)
/* Let arch update nr_irqs and return the nr of preallocated irqs */
initcnt = arch_probe_nr_irqs();
- printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
+ printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
+ NR_IRQS, nr_irqs, initcnt);
if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
nr_irqs = IRQ_BITMAP_BITS;
@@ -516,7 +517,7 @@ int __init early_irq_init(void)
init_irq_default_affinity();
- printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+ printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS);
desc = irq_desc;
count = ARRAY_SIZE(irq_desc);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 31805f237396..70b9da72018b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -746,13 +746,54 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
EXPORT_SYMBOL_GPL(irq_find_mapping);
#ifdef CONFIG_IRQ_DOMAIN_DEBUG
+static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
+{
+ struct irq_domain *domain;
+ struct irq_data *data;
+
+ domain = desc->irq_data.domain;
+ data = &desc->irq_data;
+
+ while (domain) {
+ unsigned int irq = data->irq;
+ unsigned long hwirq = data->hwirq;
+ struct irq_chip *chip;
+ bool direct;
+
+ if (data == &desc->irq_data)
+ seq_printf(m, "%5d ", irq);
+ else
+ seq_printf(m, "%5d+ ", irq);
+ seq_printf(m, "0x%05lx ", hwirq);
+
+ chip = irq_data_get_irq_chip(data);
+ seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
+
+ seq_printf(m, data ? "0x%p " : " %p ",
+ irq_data_get_irq_chip_data(data));
+
+ seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
+ direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
+ seq_printf(m, "%6s%-8s ",
+ (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
+ direct ? "(DIRECT)" : "");
+ seq_printf(m, "%s\n", domain->name);
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ domain = domain->parent;
+ data = data->parent_data;
+#else
+ domain = NULL;
+#endif
+ }
+}
+
static int virq_debug_show(struct seq_file *m, void *private)
{
unsigned long flags;
struct irq_desc *desc;
struct irq_domain *domain;
struct radix_tree_iter iter;
- void *data, **slot;
+ void **slot;
int i;
seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -760,15 +801,26 @@ static int virq_debug_show(struct seq_file *m, void *private)
mutex_lock(&irq_domain_mutex);
list_for_each_entry(domain, &irq_domain_list, link) {
struct device_node *of_node;
+ const char *name;
+
int count = 0;
+
of_node = irq_domain_get_of_node(domain);
+ if (of_node)
+ name = of_node_full_name(of_node);
+ else if (is_fwnode_irqchip(domain->fwnode))
+ name = container_of(domain->fwnode, struct irqchip_fwid,
+ fwnode)->name;
+ else
+ name = "";
+
radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
count++;
seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
domain == irq_default_domain ? '*' : ' ', domain->name,
domain->revmap_size + count, domain->revmap_size,
domain->revmap_direct_max_irq,
- of_node ? of_node_full_name(of_node) : "");
+ name);
}
mutex_unlock(&irq_domain_mutex);
@@ -782,30 +834,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- domain = desc->irq_data.domain;
-
- if (domain) {
- struct irq_chip *chip;
- int hwirq = desc->irq_data.hwirq;
- bool direct;
-
- seq_printf(m, "%5d ", i);
- seq_printf(m, "0x%05x ", hwirq);
-
- chip = irq_desc_get_chip(desc);
- seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
-
- data = irq_desc_get_chip_data(desc);
- seq_printf(m, data ? "0x%p " : " %p ", data);
-
- seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
- direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
- seq_printf(m, "%6s%-8s ",
- (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
- direct ? "(DIRECT)" : "");
- seq_printf(m, "%s\n", desc->irq_data.domain->name);
- }
-
+ virq_debug_show_one(m, desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index ddc2f5427f75..fe4d48ec5bc4 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -265,13 +265,19 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
+ struct irq_domain *domain;
+
if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
msi_domain_update_dom_ops(info);
if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
msi_domain_update_chip_ops(info);
- return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
- fwnode, &msi_domain_ops, info);
+ domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
+ if (domain && info->chip && info->chip->name)
+ domain->name = info->chip->name;
+
+ return domain;
}
int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 6c9cb208ac48..d11c506a6ac3 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -15,6 +15,7 @@
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/bug.h>
+#include <linux/cpu.h>
#ifdef HAVE_JUMP_LABEL
@@ -124,6 +125,7 @@ void static_key_slow_inc(struct static_key *key)
return;
}
+ cpus_read_lock();
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
atomic_set(&key->enabled, -1);
@@ -133,12 +135,14 @@ void static_key_slow_inc(struct static_key *key)
atomic_inc(&key->enabled);
}
jump_label_unlock();
+ cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
static void __static_key_slow_dec(struct static_key *key,
unsigned long rate_limit, struct delayed_work *work)
{
+ cpus_read_lock();
/*
* The negative count check is valid even when a negative
* key->enabled is in use by static_key_slow_inc(); a
@@ -149,6 +153,7 @@ static void __static_key_slow_dec(struct static_key *key,
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
WARN(atomic_read(&key->enabled) < 0,
"jump label: negative count!\n");
+ cpus_read_unlock();
return;
}
@@ -159,6 +164,7 @@ static void __static_key_slow_dec(struct static_key *key,
jump_label_update(key);
}
jump_label_unlock();
+ cpus_read_unlock();
}
static void jump_label_update_timeout(struct work_struct *work)
@@ -334,6 +340,7 @@ void __init jump_label_init(void)
if (static_key_initialized)
return;
+ cpus_read_lock();
jump_label_lock();
jump_label_sort_entries(iter_start, iter_stop);
@@ -353,6 +360,7 @@ void __init jump_label_init(void)
}
static_key_initialized = true;
jump_label_unlock();
+ cpus_read_unlock();
}
#ifdef CONFIG_MODULES
@@ -590,28 +598,28 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
struct module *mod = data;
int ret = 0;
+ cpus_read_lock();
+ jump_label_lock();
+
switch (val) {
case MODULE_STATE_COMING:
- jump_label_lock();
ret = jump_label_add_module(mod);
if (ret) {
WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n");
jump_label_del_module(mod);
}
- jump_label_unlock();
break;
case MODULE_STATE_GOING:
- jump_label_lock();
jump_label_del_module(mod);
- jump_label_unlock();
break;
case MODULE_STATE_LIVE:
- jump_label_lock();
jump_label_invalidate_module_init(mod);
- jump_label_unlock();
break;
}
+ jump_label_unlock();
+ cpus_read_unlock();
+
return notifier_from_errno(ret);
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index adfe3b4cfe05..6756d750b31b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -483,11 +483,6 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
*/
static void do_optimize_kprobes(void)
{
- /* Optimization never be done when disarmed */
- if (kprobes_all_disarmed || !kprobes_allow_optimization ||
- list_empty(&optimizing_list))
- return;
-
/*
* The optimization/unoptimization refers online_cpus via
* stop_machine() and cpu-hotplug modifies online_cpus.
@@ -495,14 +490,19 @@ static void do_optimize_kprobes(void)
* This combination can cause a deadlock (cpu-hotplug try to lock
* text_mutex but stop_machine can not be done because online_cpus
* has been changed)
- * To avoid this deadlock, we need to call get_online_cpus()
+ * To avoid this deadlock, caller must have locked cpu hotplug
* for preventing cpu-hotplug outside of text_mutex locking.
*/
- get_online_cpus();
+ lockdep_assert_cpus_held();
+
+ /* Optimization never be done when disarmed */
+ if (kprobes_all_disarmed || !kprobes_allow_optimization ||
+ list_empty(&optimizing_list))
+ return;
+
mutex_lock(&text_mutex);
arch_optimize_kprobes(&optimizing_list);
mutex_unlock(&text_mutex);
- put_online_cpus();
}
/*
@@ -513,12 +513,13 @@ static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
+ /* See comment in do_optimize_kprobes() */
+ lockdep_assert_cpus_held();
+
/* Unoptimization must be done anytime */
if (list_empty(&unoptimizing_list))
return;
- /* Ditto to do_optimize_kprobes */
- get_online_cpus();
mutex_lock(&text_mutex);
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
@@ -537,7 +538,6 @@ static void do_unoptimize_kprobes(void)
list_del_init(&op->list);
}
mutex_unlock(&text_mutex);
- put_online_cpus();
}
/* Reclaim all kprobes on the free_list */
@@ -562,6 +562,7 @@ static void kick_kprobe_optimizer(void)
static void kprobe_optimizer(struct work_struct *work)
{
mutex_lock(&kprobe_mutex);
+ cpus_read_lock();
/* Lock modules while optimizing kprobes */
mutex_lock(&module_mutex);
@@ -587,6 +588,7 @@ static void kprobe_optimizer(struct work_struct *work)
do_free_cleaned_kprobes();
mutex_unlock(&module_mutex);
+ cpus_read_unlock();
mutex_unlock(&kprobe_mutex);
/* Step 5: Kick optimizer again if needed */
@@ -650,9 +652,8 @@ static void optimize_kprobe(struct kprobe *p)
/* Short cut to direct unoptimizing */
static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
- get_online_cpus();
+ lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
- put_online_cpus();
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
}
@@ -791,6 +792,7 @@ static void try_to_optimize_kprobe(struct kprobe *p)
return;
/* For preparing optimization, jump_label_text_reserved() is called */
+ cpus_read_lock();
jump_label_lock();
mutex_lock(&text_mutex);
@@ -812,6 +814,7 @@ static void try_to_optimize_kprobe(struct kprobe *p)
out:
mutex_unlock(&text_mutex);
jump_label_unlock();
+ cpus_read_unlock();
}
#ifdef CONFIG_SYSCTL
@@ -826,6 +829,7 @@ static void optimize_all_kprobes(void)
if (kprobes_allow_optimization)
goto out;
+ cpus_read_lock();
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
@@ -833,6 +837,7 @@ static void optimize_all_kprobes(void)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
+ cpus_read_unlock();
printk(KERN_INFO "Kprobes globally optimized\n");
out:
mutex_unlock(&kprobe_mutex);
@@ -851,6 +856,7 @@ static void unoptimize_all_kprobes(void)
return;
}
+ cpus_read_lock();
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
@@ -859,6 +865,7 @@ static void unoptimize_all_kprobes(void)
unoptimize_kprobe(p, false);
}
}
+ cpus_read_unlock();
mutex_unlock(&kprobe_mutex);
/* Wait for unoptimizing completion */
@@ -1010,14 +1017,11 @@ static void arm_kprobe(struct kprobe *kp)
arm_kprobe_ftrace(kp);
return;
}
- /*
- * Here, since __arm_kprobe() doesn't use stop_machine(),
- * this doesn't cause deadlock on text_mutex. So, we don't
- * need get_online_cpus().
- */
+ cpus_read_lock();
mutex_lock(&text_mutex);
__arm_kprobe(kp);
mutex_unlock(&text_mutex);
+ cpus_read_unlock();
}
/* Disarm a kprobe with text_mutex */
@@ -1027,10 +1031,12 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt)
disarm_kprobe_ftrace(kp);
return;
}
- /* Ditto */
+
+ cpus_read_lock();
mutex_lock(&text_mutex);
__disarm_kprobe(kp, reopt);
mutex_unlock(&text_mutex);
+ cpus_read_unlock();
}
/*
@@ -1298,13 +1304,10 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
int ret = 0;
struct kprobe *ap = orig_p;
+ cpus_read_lock();
+
/* For preparing optimization, jump_label_text_reserved() is called */
jump_label_lock();
- /*
- * Get online CPUs to avoid text_mutex deadlock.with stop machine,
- * which is invoked by unoptimize_kprobe() in add_new_kprobe()
- */
- get_online_cpus();
mutex_lock(&text_mutex);
if (!kprobe_aggrprobe(orig_p)) {
@@ -1352,8 +1355,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
out:
mutex_unlock(&text_mutex);
- put_online_cpus();
jump_label_unlock();
+ cpus_read_unlock();
if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
ap->flags &= ~KPROBE_FLAG_DISABLED;
@@ -1555,9 +1558,12 @@ int register_kprobe(struct kprobe *p)
goto out;
}
- mutex_lock(&text_mutex); /* Avoiding text modification */
+ cpus_read_lock();
+ /* Prevent text modification */
+ mutex_lock(&text_mutex);
ret = prepare_kprobe(p);
mutex_unlock(&text_mutex);
+ cpus_read_unlock();
if (ret)
goto out;
@@ -1570,7 +1576,6 @@ int register_kprobe(struct kprobe *p)
/* Try to optimize kprobe */
try_to_optimize_kprobe(p);
-
out:
mutex_unlock(&kprobe_mutex);
diff --git a/kernel/padata.c b/kernel/padata.c
index ac8f1e524836..868f947166d7 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -934,29 +934,18 @@ static struct kobj_type padata_attr_type = {
};
/**
- * padata_alloc_possible - Allocate and initialize padata instance.
- * Use the cpu_possible_mask for serial and
- * parallel workers.
- *
- * @wq: workqueue to use for the allocated padata instance
- */
-struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
-{
- return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
-}
-EXPORT_SYMBOL(padata_alloc_possible);
-
-/**
* padata_alloc - allocate and initialize a padata instance and specify
* cpumasks for serial and parallel workers.
*
* @wq: workqueue to use for the allocated padata instance
* @pcpumask: cpumask that will be used for padata parallelization
* @cbcpumask: cpumask that will be used for padata serialization
+ *
+ * Must be called from a cpus_read_lock() protected region
*/
-struct padata_instance *padata_alloc(struct workqueue_struct *wq,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
{
struct padata_instance *pinst;
struct parallel_data *pd = NULL;
@@ -965,7 +954,6 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
if (!pinst)
goto err;
- get_online_cpus();
if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
goto err_free_inst;
if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
@@ -989,14 +977,12 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
pinst->flags = 0;
- put_online_cpus();
-
BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
#ifdef CONFIG_HOTPLUG_CPU
- cpuhp_state_add_instance_nocalls(hp_online, &pinst->node);
+ cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);
#endif
return pinst;
@@ -1005,12 +991,27 @@ err_free_masks:
free_cpumask_var(pinst->cpumask.cbcpu);
err_free_inst:
kfree(pinst);
- put_online_cpus();
err:
return NULL;
}
/**
+ * padata_alloc_possible - Allocate and initialize padata instance.
+ * Use the cpu_possible_mask for serial and
+ * parallel workers.
+ *
+ * @wq: workqueue to use for the allocated padata instance
+ *
+ * Must be called from a cpus_read_lock() protected region
+ */
+struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
+{
+ lockdep_assert_cpus_held();
+ return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
+}
+EXPORT_SYMBOL(padata_alloc_possible);
+
+/**
* padata_free - free a padata instance
*
* @padata_inst: padata instance to free
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9995b2dfaaff..e782e9049e73 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1176,7 +1176,7 @@ static void boot_delay_msec(int level)
unsigned long long k;
unsigned long timeout;
- if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
|| suppress_message_printing(level)) {
return;
}
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 00a45c45beca..ca0f8fc945c6 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -64,6 +64,7 @@
#include <linux/workqueue.h>
#include <linux/compiler.h>
#include <linux/tick.h>
+#include <linux/init.h>
/*
* Scheduler clock - returns current time in nanosec units.
@@ -124,14 +125,27 @@ int sched_clock_stable(void)
return static_branch_likely(&__sched_clock_stable);
}
+static void __scd_stamp(struct sched_clock_data *scd)
+{
+ scd->tick_gtod = ktime_get_ns();
+ scd->tick_raw = sched_clock();
+}
+
static void __set_sched_clock_stable(void)
{
- struct sched_clock_data *scd = this_scd();
+ struct sched_clock_data *scd;
/*
+ * Since we're still unstable and the tick is already running, we have
+ * to disable IRQs in order to get a consistent scd->tick* reading.
+ */
+ local_irq_disable();
+ scd = this_scd();
+ /*
* Attempt to make the (initial) unstable->stable transition continuous.
*/
__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
+ local_irq_enable();
printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
scd->tick_gtod, __gtod_offset,
@@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void)
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
+/*
+ * If we ever get here, we're screwed, because we found out -- typically after
+ * the fact -- that TSC wasn't good. This means all our clocksources (including
+ * ktime) could have reported wrong values.
+ *
+ * What we do here is an attempt to fix up and continue sort of where we left
+ * off in a coherent manner.
+ *
+ * The only way to fully avoid random clock jumps is to boot with:
+ * "tsc=unstable".
+ */
static void __sched_clock_work(struct work_struct *work)
{
+ struct sched_clock_data *scd;
+ int cpu;
+
+ /* take a current timestamp and set 'now' */
+ preempt_disable();
+ scd = this_scd();
+ __scd_stamp(scd);
+ scd->clock = scd->tick_gtod + __gtod_offset;
+ preempt_enable();
+
+ /* clone to all CPUs */
+ for_each_possible_cpu(cpu)
+ per_cpu(sched_clock_data, cpu) = *scd;
+
+ printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
+ printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
+
static_branch_disable(&__sched_clock_stable);
}
@@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work);
static void __clear_sched_clock_stable(void)
{
- struct sched_clock_data *scd = this_scd();
-
- /*
- * Attempt to make the stable->unstable transition continuous.
- *
- * Trouble is, this is typically called from the TSC watchdog
- * timer, which is late per definition. This means the tick
- * values can already be screwy.
- *
- * Still do what we can.
- */
- __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
-
- printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
- scd->tick_gtod, __gtod_offset,
- scd->tick_raw, __sched_clock_offset);
+ if (!sched_clock_stable())
+ return;
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
-
- if (sched_clock_stable())
- schedule_work(&sched_clock_work);
+ schedule_work(&sched_clock_work);
}
void clear_sched_clock_stable(void)
@@ -183,7 +211,11 @@ void clear_sched_clock_stable(void)
__clear_sched_clock_stable();
}
-void sched_clock_init_late(void)
+/*
+ * We run this as late_initcall() such that it runs after all built-in drivers,
+ * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
+ */
+static int __init sched_clock_init_late(void)
{
sched_clock_running = 2;
/*
@@ -197,7 +229,10 @@ void sched_clock_init_late(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
+
+ return 0;
}
+late_initcall(sched_clock_init_late);
/*
* min, max except they take wrapping into account
@@ -347,21 +382,38 @@ void sched_clock_tick(void)
{
struct sched_clock_data *scd;
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(!sched_clock_running))
+ return;
+
WARN_ON_ONCE(!irqs_disabled());
+ scd = this_scd();
+ __scd_stamp(scd);
+ sched_clock_local(scd);
+}
+
+void sched_clock_tick_stable(void)
+{
+ u64 gtod, clock;
+
+ if (!sched_clock_stable())
+ return;
+
/*
- * Update these values even if sched_clock_stable(), because it can
- * become unstable at any point in time at which point we need some
- * values to fall back on.
+ * Called under watchdog_lock.
*
- * XXX arguably we can skip this if we expose tsc_clocksource_reliable
+ * The watchdog just found this TSC to (still) be stable, so now is a
+ * good moment to update our __gtod_offset. Because once we find the
+ * TSC to be unstable, any computation will be computing crap.
*/
- scd = this_scd();
- scd->tick_raw = sched_clock();
- scd->tick_gtod = ktime_get_ns();
-
- if (!sched_clock_stable() && likely(sched_clock_running))
- sched_clock_local(scd);
+ local_irq_disable();
+ gtod = ktime_get_ns();
+ clock = sched_clock();
+ __gtod_offset = (clock + __sched_clock_offset) - gtod;
+ local_irq_enable();
}
/*
@@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void)
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
- * We just idled delta nanoseconds (called with irqs disabled):
+ * We just idled; resync with ktime.
*/
-void sched_clock_idle_wakeup_event(u64 delta_ns)
+void sched_clock_idle_wakeup_event(void)
{
- if (timekeeping_suspended)
+ unsigned long flags;
+
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(timekeeping_suspended))
return;
+ local_irq_save(flags);
sched_clock_tick();
- touch_softlockup_watchdog_sched();
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 803c3bc274c4..c3e50cada84d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1731,7 +1731,7 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct task_struct *p;
+ struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
@@ -1740,17 +1740,8 @@ void sched_ttwu_pending(void)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- while (llist) {
- int wake_flags = 0;
-
- p = llist_entry(llist, struct task_struct, wake_entry);
- llist = llist_next(llist);
-
- if (p->sched_remote_wakeup)
- wake_flags = WF_MIGRATED;
-
- ttwu_do_activate(rq, p, wake_flags, &rf);
- }
+ llist_for_each_entry_safe(p, t, llist, wake_entry)
+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
rq_unlock_irqrestore(rq, &rf);
}
@@ -4197,8 +4188,8 @@ static int __sched_setscheduler(struct task_struct *p,
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
- /* May grab non-irq protected spin_locks: */
- BUG_ON(in_interrupt());
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
@@ -5958,7 +5949,6 @@ void __init sched_init_smp(void)
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
sched_init_numa();
@@ -5968,7 +5958,7 @@ void __init sched_init_smp(void)
* happen.
*/
mutex_lock(&sched_domains_mutex);
- init_sched_domains(cpu_active_mask);
+ sched_init_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -5984,7 +5974,6 @@ void __init sched_init_smp(void)
init_sched_dl_class();
sched_init_smt();
- sched_clock_init_late();
sched_smp_initialized = true;
}
@@ -6000,7 +5989,6 @@ early_initcall(migration_init);
void __init sched_init_smp(void)
{
sched_init_granularity();
- sched_clock_init_late();
}
#endif /* CONFIG_SMP */
@@ -6199,7 +6187,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
@@ -6251,8 +6238,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
!is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
return;
+
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2ce59015642..df6c2912bd60 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1533,7 +1533,7 @@ retry:
* then possible that next_task has migrated.
*/
task = pick_next_pushable_dl_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d71109321841..47a0c552c77b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+ leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
@@ -2469,7 +2470,8 @@ void task_numa_work(struct callback_head *work)
return;
- down_read(&mm->mmap_sem);
+ if (!down_read_trylock(&mm->mmap_sem))
+ return;
vma = find_vma(mm, start);
if (!vma) {
reset_ptenuma_scan(p);
@@ -2916,12 +2918,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
if (cfs_rq) {
cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
}
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
return 1;
}
@@ -4642,24 +4644,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
+/*
+ * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online calback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
raw_spin_lock(&cfs_b->lock);
cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
raw_spin_unlock(&cfs_b->lock);
}
+ rcu_read_unlock();
}
+/* cpu offline callback */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
+
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- for_each_leaf_cfs_rq(rq, cfs_rq) {
if (!cfs_rq->runtime_enabled)
continue;
@@ -4677,6 +4698,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
+ rcu_read_unlock();
}
#else /* CONFIG_CFS_BANDWIDTH */
@@ -5484,12 +5506,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int i;
/* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_cpus(group),
+ if (!cpumask_intersects(sched_group_span(group),
&p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ sched_group_span(group));
/*
* Tally up the load of all CPUs in the group and find
@@ -5499,7 +5521,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
runnable_load = 0;
max_spare_cap = 0;
- for_each_cpu(i, sched_group_cpus(group)) {
+ for_each_cpu(i, sched_group_span(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
@@ -5602,10 +5624,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/* Check if we have any choice: */
if (group->group_weight == 1)
- return cpumask_first(sched_group_cpus(group));
+ return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+ for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5640,43 +5662,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-/*
- * Implement a for_each_cpu() variant that starts the scan at a given cpu
- * (@start), and wraps around.
- *
- * This is used to scan for idle CPUs; such that not all CPUs looking for an
- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
- * through the LLC domain.
- *
- * Especially tbench is found sensitive to this.
- */
-
-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
-{
- int next;
-
-again:
- next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
-
- if (*wrapped) {
- if (next >= start)
- return nr_cpumask_bits;
- } else {
- if (next >= nr_cpumask_bits) {
- *wrapped = 1;
- n = -1;
- goto again;
- }
- }
-
- return next;
-}
-
-#define for_each_cpu_wrap(cpu, mask, start, wrap) \
- for ((wrap) = 0, (cpu) = (start)-1; \
- (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
- (cpu) < nr_cpumask_bits; )
-
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5736,7 +5721,7 @@ unlock:
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu, wrap;
+ int core, cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5746,7 +5731,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
- for_each_cpu_wrap(core, cpus, target, wrap) {
+ for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5812,7 +5797,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 avg_cost, avg_idle = this_rq()->avg_idle;
u64 time, cost;
s64 delta;
- int cpu, wrap;
+ int cpu;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5829,7 +5814,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
@@ -6970,10 +6955,28 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (cfs_rq->avg.load_sum)
+ return false;
+
+ if (cfs_rq->avg.util_sum)
+ return false;
+
+ if (cfs_rq->runnable_load_sum)
+ return false;
+
+ return true;
+}
+
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
@@ -6983,7 +6986,7 @@ static void update_blocked_averages(int cpu)
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq(rq, cfs_rq) {
+ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
/* throttled entities do not contribute to load */
@@ -6997,6 +7000,13 @@ static void update_blocked_averages(int cpu)
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
update_load_avg(se, 0);
+
+ /*
+ * There can be a lot of idle CPU cgroups. Don't let fully
+ * decayed cfs_rqs linger on the list.
+ */
+ if (cfs_rq_is_decayed(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
}
rq_unlock_irqrestore(rq, &rf);
}
@@ -7229,7 +7239,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* span the current group.
*/
- for_each_cpu(cpu, sched_group_cpus(sdg)) {
+ for_each_cpu(cpu, sched_group_span(sdg)) {
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
@@ -7408,7 +7418,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
memset(sgs, 0, sizeof(*sgs));
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
/* Bias balancing toward cpus of our domain */
@@ -7572,7 +7582,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
- local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
if (local_group) {
sds->local = sg;
sgs = local;
@@ -7927,7 +7937,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
@@ -8033,7 +8043,6 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- struct cpumask *sg_cpus, *sg_mask;
int cpu, balance_cpu = -1;
/*
@@ -8043,11 +8052,9 @@ static int should_we_balance(struct lb_env *env)
if (env->idle == CPU_NEWLY_IDLE)
return 1;
- sg_cpus = sched_group_cpus(sg);
- sg_mask = sched_group_mask(sg);
/* Try to find first idle cpu */
- for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ if (!idle_cpu(cpu))
continue;
balance_cpu = cpu;
@@ -8083,7 +8090,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_cpus(sd->groups),
+ .dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
@@ -9523,10 +9530,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
rcu_read_lock();
- for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 11192e0cb122..dc4d1483b038 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -76,7 +76,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 979b7341008a..581d5c7a5264 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ int skip;
+
+ /*
+ * When span == cpu_online_mask, taking each rq->lock
+ * can be time-consuming. Try to avoid it when possible.
+ */
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (skip)
+ continue;
raw_spin_lock(&rq->lock);
if (rt_rq->rt_time) {
@@ -1819,7 +1830,7 @@ retry:
* pushing.
*/
task = pick_next_pushable_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task hasn't migrated, and is still the next
* eligible task, but we failed to find a run-queue
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6dda2aab731e..f8cf1d87f065 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -606,11 +606,9 @@ struct root_domain {
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
-extern cpumask_var_t fallback_doms;
-extern cpumask_var_t sched_domains_tmpmask;
extern void init_defrootdomain(void);
-extern int init_sched_domains(const struct cpumask *cpu_map);
+extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
#endif /* CONFIG_SMP */
@@ -1025,7 +1023,11 @@ struct sched_group_capacity {
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
- unsigned long cpumask[0]; /* iteration mask */
+#ifdef CONFIG_SCHED_DEBUG
+ int id;
+#endif
+
+ unsigned long cpumask[0]; /* balance mask */
};
struct sched_group {
@@ -1046,16 +1048,15 @@ struct sched_group {
unsigned long cpumask[0];
};
-static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+static inline struct cpumask *sched_group_span(struct sched_group *sg)
{
return to_cpumask(sg->cpumask);
}
/*
- * cpumask masking which cpus in the group are allowed to iterate up the domain
- * tree.
+ * See build_balance_mask().
*/
-static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+static inline struct cpumask *group_balance_mask(struct sched_group *sg)
{
return to_cpumask(sg->sgc->cpumask);
}
@@ -1066,7 +1067,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg)
*/
static inline unsigned int group_first_cpu(struct sched_group *group)
{
- return cpumask_first(sched_group_cpus(group));
+ return cpumask_first(sched_group_span(group));
}
extern int group_balance_cpu(struct sched_group *sg);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1b0b4fb12837..79895aec281e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
cpumask_var_t sched_domains_tmpmask;
+cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
@@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_clear(groupmask);
- printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+ printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
@@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %*pbl level %s\n",
+ printk(KERN_CONT "span=%*pbl level=%s\n",
cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
"CPU%d\n", cpu);
}
- if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+ if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain"
" CPU%d\n", cpu);
}
@@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!cpumask_weight(sched_group_cpus(group))) {
+ if (!cpumask_weight(sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
}
if (!(sd->flags & SD_OVERLAP) &&
- cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
}
- cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+ cpumask_or(groupmask, groupmask, sched_group_span(group));
- printk(KERN_CONT " %*pbl",
- cpumask_pr_args(sched_group_cpus(group)));
- if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %lu)",
- group->sgc->capacity);
+ printk(KERN_CONT " %d:{ span=%*pbl",
+ group->sgc->id,
+ cpumask_pr_args(sched_group_span(group)));
+
+ if ((sd->flags & SD_OVERLAP) &&
+ !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
+ printk(KERN_CONT " mask=%*pbl",
+ cpumask_pr_args(group_balance_mask(group)));
+ }
+
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
+ printk(KERN_CONT " cap=%lu", group->sgc->capacity);
+
+ if (group == sd->groups && sd->child &&
+ !cpumask_equal(sched_domain_span(sd->child),
+ sched_group_span(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
}
+ printk(KERN_CONT " }");
+
group = group->next;
+
+ if (group != sd->groups)
+ printk(KERN_CONT ",");
+
} while (group != sd->groups);
printk(KERN_CONT "\n");
@@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
return;
}
- printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+ printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
for (;;) {
if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
@@ -477,46 +496,214 @@ enum s_alloc {
};
/*
- * Build an iteration mask that can exclude certain CPUs from the upwards
- * domain traversal.
+ * Return the canonical balance CPU for this group, this is the first CPU
+ * of this group that's also in the balance mask.
*
- * Asymmetric node setups can result in situations where the domain tree is of
- * unequal depth, make sure to skip domains that already cover the entire
- * range.
+ * The balance mask are all those CPUs that could actually end up at this
+ * group. See build_balance_mask().
*
- * In that case build_sched_domains() will have terminated the iteration early
- * and our sibling sd spans will be empty. Domains should always include the
- * CPU they're built on, so check that.
+ * Also see should_we_balance().
*/
-static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+int group_balance_cpu(struct sched_group *sg)
{
- const struct cpumask *span = sched_domain_span(sd);
+ return cpumask_first(group_balance_mask(sg));
+}
+
+
+/*
+ * NUMA topology (first read the regular topology blurb below)
+ *
+ * Given a node-distance table, for example:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 20
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 20 30 20 10
+ *
+ * which represents a 4 node ring topology like:
+ *
+ * 0 ----- 1
+ * | |
+ * | |
+ * | |
+ * 3 ----- 2
+ *
+ * We want to construct domains and groups to represent this. The way we go
+ * about doing this is to build the domains on 'hops'. For each NUMA level we
+ * construct the mask of all nodes reachable in @level hops.
+ *
+ * For the above NUMA topology that gives 3 levels:
+ *
+ * NUMA-2 0-3 0-3 0-3 0-3
+ * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
+ *
+ * NUMA-1 0-1,3 0-2 1-3 0,2-3
+ * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ *
+ * As can be seen; things don't nicely line up as with the regular topology.
+ * When we iterate a domain in child domain chunks some nodes can be
+ * represented multiple times -- hence the "overlap" naming for this part of
+ * the topology.
+ *
+ * In order to minimize this overlap, we only build enough groups to cover the
+ * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
+ *
+ * Because:
+ *
+ * - the first group of each domain is its child domain; this
+ * gets us the first 0-1,3
+ * - the only uncovered node is 2, who's child domain is 1-3.
+ *
+ * However, because of the overlap, computing a unique CPU for each group is
+ * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
+ * groups include the CPUs of Node-0, while those CPUs would not in fact ever
+ * end up at those groups (they would end up in group: 0-1,3).
+ *
+ * To correct this we have to introduce the group balance mask. This mask
+ * will contain those CPUs in the group that can reach this group given the
+ * (child) domain tree.
+ *
+ * With this we can once again compute balance_cpu and sched_group_capacity
+ * relations.
+ *
+ * XXX include words on how balance_cpu is unique and therefore can be
+ * used for sched_group_capacity links.
+ *
+ *
+ * Another 'interesting' topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 20 30
+ * 1: 20 10 20 20
+ * 2: 20 20 10 20
+ * 3: 30 20 20 10
+ *
+ * Which looks a little like:
+ *
+ * 0 ----- 1
+ * | / |
+ * | / |
+ * | / |
+ * 2 ----- 3
+ *
+ * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
+ * are not.
+ *
+ * This leads to a few particularly weird cases where the sched_domain's are
+ * not of the same number for each cpu. Consider:
+ *
+ * NUMA-2 0-3 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-1 0-2 0-3 0-3 1-3
+ *
+ * NUMA-0 0 1 2 3
+ *
+ */
+
+
+/*
+ * Build the balance mask; it contains only those CPUs that can arrive at this
+ * group and should be considered to continue balancing.
+ *
+ * We do this during the group creation pass, therefore the group information
+ * isn't complete yet, however since each group represents a (child) domain we
+ * can fully construct this using the sched_domain bits (which are already
+ * complete).
+ */
+static void
+build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
+{
+ const struct cpumask *sg_span = sched_group_span(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, span) {
+ cpumask_clear(mask);
+
+ for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
continue;
- cpumask_set_cpu(i, sched_group_mask(sg));
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
+ continue;
+
+ cpumask_set_cpu(i, mask);
}
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(mask));
}
/*
- * Return the canonical balance CPU for this group, this is the first CPU
- * of this group that's also in the iteration mask.
+ * XXX: This creates per-node group entries; since the load-balancer will
+ * immediately access remote memory to construct this group's load-balance
+ * statistics having the groups node local is of dubious benefit.
*/
-int group_balance_cpu(struct sched_group *sg)
+static struct sched_group *
+build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
- return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+ struct sched_group *sg;
+ struct cpumask *sg_span;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(cpu));
+
+ if (!sg)
+ return NULL;
+
+ sg_span = sched_group_span(sg);
+ if (sd->child)
+ cpumask_copy(sg_span, sched_domain_span(sd->child));
+ else
+ cpumask_copy(sg_span, sched_domain_span(sd));
+
+ return sg;
+}
+
+static void init_overlap_sched_group(struct sched_domain *sd,
+ struct sched_group *sg)
+{
+ struct cpumask *mask = sched_domains_tmpmask2;
+ struct sd_data *sdd = sd->private;
+ struct cpumask *sg_span;
+ int cpu;
+
+ build_balance_mask(sd, sg, mask);
+ cpu = cpumask_first_and(sched_group_span(sg), mask);
+
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ cpumask_copy(group_balance_mask(sg), mask);
+ else
+ WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
+
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg_span = sched_group_span(sg);
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
}
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
- struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ struct sched_group *first = NULL, *last = NULL, *sg;
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered = sched_domains_tmpmask;
struct sd_data *sdd = sd->private;
@@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct cpumask *sg_span;
if (cpumask_test_cpu(i, covered))
@@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
sibling = *per_cpu_ptr(sdd->sd, i);
- /* See the comment near build_group_mask(). */
+ /*
+ * Asymmetric node setups can result in situations where the
+ * domain tree is of unequal depth, make sure to skip domains
+ * that already cover the entire range.
+ *
+ * In that case build_sched_domains() will have terminated the
+ * iteration early and our sibling sd spans will be empty.
+ * Domains should always include the CPU they're built on, so
+ * check that.
+ */
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(cpu));
-
+ sg = build_group_from_child_sched_domain(sibling, cpu);
if (!sg)
goto fail;
- sg_span = sched_group_cpus(sg);
- if (sibling->child)
- cpumask_copy(sg_span, sched_domain_span(sibling->child));
- else
- cpumask_set_cpu(i, sg_span);
-
+ sg_span = sched_group_span(sg);
cpumask_or(covered, covered, sg_span);
- sg->sgc = *per_cpu_ptr(sdd->sgc, i);
- if (atomic_inc_return(&sg->sgc->ref) == 1)
- build_group_mask(sd, sg);
-
- /*
- * Initialize sgc->capacity such that even if we mess up the
- * domains and no possible iteration will get us here, we won't
- * die on a /0 trap.
- */
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
-
- /*
- * Make sure the first group of this domain contains the
- * canonical balance CPU. Otherwise the sched_domain iteration
- * breaks. See update_sg_lb_stats().
- */
- if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- group_balance_cpu(sg) == cpu)
- groups = sg;
+ init_overlap_sched_group(sd, sg);
if (!first)
first = sg;
@@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
last = sg;
last->next = first;
}
- sd->groups = groups;
+ sd->groups = first;
return 0;
@@ -589,23 +759,106 @@ fail:
return -ENOMEM;
}
-static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+
+/*
+ * Package topology (also see the load-balance blurb in fair.c)
+ *
+ * The scheduler builds a tree structure to represent a number of important
+ * topology features. By default (default_topology[]) these include:
+ *
+ * - Simultaneous multithreading (SMT)
+ * - Multi-Core Cache (MC)
+ * - Package (DIE)
+ *
+ * Where the last one more or less denotes everything up to a NUMA node.
+ *
+ * The tree consists of 3 primary data structures:
+ *
+ * sched_domain -> sched_group -> sched_group_capacity
+ * ^ ^ ^ ^
+ * `-' `-'
+ *
+ * The sched_domains are per-cpu and have a two way link (parent & child) and
+ * denote the ever growing mask of CPUs belonging to that level of topology.
+ *
+ * Each sched_domain has a circular (double) linked list of sched_group's, each
+ * denoting the domains of the level below (or individual CPUs in case of the
+ * first domain level). The sched_group linked by a sched_domain includes the
+ * CPU of that sched_domain [*].
+ *
+ * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * DIE [ ]
+ * MC [ ] [ ]
+ * SMT [ ] [ ] [ ] [ ]
+ *
+ * - or -
+ *
+ * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
+ * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * One way to think about it is: sched_domain moves you up and down among these
+ * topology levels, while sched_group moves you sideways through it, at child
+ * domain granularity.
+ *
+ * sched_group_capacity ensures each unique sched_group has shared storage.
+ *
+ * There are two related construction problems, both require a CPU that
+ * uniquely identify each group (for a given domain):
+ *
+ * - The first is the balance_cpu (see should_we_balance() and the
+ * load-balance blub in fair.c); for each group we only want 1 CPU to
+ * continue balancing at a higher domain.
+ *
+ * - The second is the sched_group_capacity; we want all identical groups
+ * to share a single sched_group_capacity.
+ *
+ * Since these topologies are exclusive by construction. That is, its
+ * impossible for an SMT thread to belong to multiple cores, and cores to
+ * be part of multiple caches. There is a very clear and unique location
+ * for each CPU in the hierarchy.
+ *
+ * Therefore computing a unique CPU for each group is trivial (the iteration
+ * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
+ * group), we can simply pick the first CPU in each group.
+ *
+ *
+ * [*] in other words, the first group of each domain is its child domain.
+ */
+
+static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
struct sched_domain *child = sd->child;
+ struct sched_group *sg;
if (child)
cpu = cpumask_first(sched_domain_span(child));
- if (sg) {
- *sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ sg = *per_cpu_ptr(sdd->sg, cpu);
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+
+ /* For claim_allocations: */
+ atomic_inc(&sg->ref);
+ atomic_inc(&sg->sgc->ref);
- /* For claim_allocations: */
- atomic_set(&(*sg)->sgc->ref, 1);
+ if (child) {
+ cpumask_copy(sched_group_span(sg), sched_domain_span(child));
+ cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ } else {
+ cpumask_set_cpu(cpu, sched_group_span(sg));
+ cpumask_set_cpu(cpu, group_balance_mask(sg));
}
- return cpu;
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+
+ return sg;
}
/*
@@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu)
struct cpumask *covered;
int i;
- get_group(cpu, sdd, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (cpu != cpumask_first(span))
- return 0;
-
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct sched_group *sg;
- int group, j;
if (cpumask_test_cpu(i, covered))
continue;
- group = get_group(i, sdd, &sg);
- cpumask_setall(sched_group_mask(sg));
+ sg = get_group(i, sdd);
- for_each_cpu(j, span) {
- if (get_group(j, sdd, NULL) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
+ cpumask_or(covered, covered, sched_group_span(sg));
if (!first)
first = sg;
@@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
last = sg;
}
last->next = first;
+ sd->groups = first;
return 0;
}
@@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
do {
int cpu, max_cpu = -1;
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg->group_weight = cpumask_weight(sched_group_span(sg));
if (!(sd->flags & SD_ASYM_PACKING))
goto next;
- for_each_cpu(cpu, sched_group_cpus(sg)) {
+ for_each_cpu(cpu, sched_group_span(sg)) {
if (max_cpu < 0)
max_cpu = cpu;
else if (sched_asym_prefer(cpu, max_cpu))
@@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sgc)
return -ENOMEM;
+#ifdef CONFIG_SCHED_DEBUG
+ sgc->id = j;
+#endif
+
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ if (tl->flags & SDTL_OVERLAP)
sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
@@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur;
* cpumask) fails, then fallback to a single sched domain,
* as determined by the single cpumask fallback_doms.
*/
-cpumask_var_t fallback_doms;
+static cpumask_var_t fallback_doms;
/*
* arch_update_cpu_topology lets virtualized architectures update the
@@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* For now this just excludes isolated CPUs, but could be used to
* exclude other special cases in the future.
*/
-int init_sched_domains(const struct cpumask *cpu_map)
+int sched_init_domains(const struct cpumask *cpu_map)
{
int err;
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
+ zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+ zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
arch_update_cpu_topology();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..3061483cb3ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
struct call_function_data {
struct call_single_data __percpu *csd;
cpumask_var_t cpumask;
+ cpumask_var_t cpumask_ipi;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return -ENOMEM;
+ if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+ cpu_to_node(cpu))) {
+ free_cpumask_var(cfd->cpumask);
+ return -ENOMEM;
+ }
cfd->csd = alloc_percpu(struct call_single_data);
if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
return -ENOMEM;
}
@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
free_percpu(cfd->csd);
return 0;
}
@@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask,
cfd = this_cpu_ptr(&cfd_data);
cpumask_and(cfd->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, cfd->cpumask);
+ __cpumask_clear_cpu(this_cpu, cfd->cpumask);
/* Some callers race with other cpus changing the passed mask */
if (unlikely(!cpumask_weight(cfd->cpumask)))
return;
+ cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
csd->flags |= CSD_FLAG_SYNCHRONOUS;
csd->func = func;
csd->info = info;
- llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+ if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+ __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
}
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(cfd->cpumask);
+ arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
if (wait) {
for_each_cpu(cpu, cfd->cpumask) {
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 1eb82661ecdb..b7591261652d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -552,7 +552,8 @@ static int __init cpu_stop_init(void)
}
early_initcall(cpu_stop_init);
-static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
+int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
+ const struct cpumask *cpus)
{
struct multi_stop_data msdata = {
.fn = fn,
@@ -561,6 +562,8 @@ static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cp
.active_cpus = cpus,
};
+ lockdep_assert_cpus_held();
+
if (!stop_machine_initialized) {
/*
* Handle the case where stop_machine() is called
@@ -590,9 +593,9 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
int ret;
/* No CPUs can come up or down during this. */
- get_online_cpus();
- ret = __stop_machine(fn, data, cpus);
- put_online_cpus();
+ cpus_read_lock();
+ ret = stop_machine_cpuslocked(fn, data, cpus);
+ cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(stop_machine);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5cb5b0008d97..4cfebfff848d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -45,11 +45,13 @@ static struct alarm_base {
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
+#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS)
/* freezer information to handle clock_nanosleep triggered wakeups */
static enum alarmtimer_type freezer_alarmtype;
static ktime_t freezer_expires;
static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
+#endif
static struct wakeup_source *ws;
@@ -307,38 +309,6 @@ static int alarmtimer_resume(struct device *dev)
}
#endif
-static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
-{
- struct alarm_base *base;
- unsigned long flags;
- ktime_t delta;
-
- switch(type) {
- case ALARM_REALTIME:
- base = &alarm_bases[ALARM_REALTIME];
- type = ALARM_REALTIME_FREEZER;
- break;
- case ALARM_BOOTTIME:
- base = &alarm_bases[ALARM_BOOTTIME];
- type = ALARM_BOOTTIME_FREEZER;
- break;
- default:
- WARN_ONCE(1, "Invalid alarm type: %d\n", type);
- return;
- }
-
- delta = ktime_sub(absexp, base->gettime());
-
- spin_lock_irqsave(&freezer_delta_lock, flags);
- if (!freezer_delta || (delta < freezer_delta)) {
- freezer_delta = delta;
- freezer_expires = absexp;
- freezer_alarmtype = type;
- }
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
-}
-
-
/**
* alarm_init - Initialize an alarm structure
* @alarm: ptr to alarm to be initialized
@@ -488,6 +458,38 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
+#ifdef CONFIG_POSIX_TIMERS
+
+static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
+{
+ struct alarm_base *base;
+ unsigned long flags;
+ ktime_t delta;
+
+ switch(type) {
+ case ALARM_REALTIME:
+ base = &alarm_bases[ALARM_REALTIME];
+ type = ALARM_REALTIME_FREEZER;
+ break;
+ case ALARM_BOOTTIME:
+ base = &alarm_bases[ALARM_BOOTTIME];
+ type = ALARM_BOOTTIME_FREEZER;
+ break;
+ default:
+ WARN_ONCE(1, "Invalid alarm type: %d\n", type);
+ return;
+ }
+
+ delta = ktime_sub(absexp, base->gettime());
+
+ spin_lock_irqsave(&freezer_delta_lock, flags);
+ if (!freezer_delta || (delta < freezer_delta)) {
+ freezer_delta = delta;
+ freezer_expires = absexp;
+ freezer_alarmtype = type;
+ }
+ spin_unlock_irqrestore(&freezer_delta_lock, flags);
+}
/**
* clock2alarm - helper that converts from clockid to alarmtypes
@@ -846,6 +848,17 @@ out:
return ret;
}
+const struct k_clock alarm_clock = {
+ .clock_getres = alarm_clock_getres,
+ .clock_get = alarm_clock_get,
+ .timer_create = alarm_timer_create,
+ .timer_set = alarm_timer_set,
+ .timer_del = alarm_timer_del,
+ .timer_get = alarm_timer_get,
+ .nsleep = alarm_timer_nsleep,
+};
+#endif /* CONFIG_POSIX_TIMERS */
+
/* Suspend hook structures */
static const struct dev_pm_ops alarmtimer_pm_ops = {
@@ -871,23 +884,9 @@ static int __init alarmtimer_init(void)
struct platform_device *pdev;
int error = 0;
int i;
- struct k_clock alarm_clock = {
- .clock_getres = alarm_clock_getres,
- .clock_get = alarm_clock_get,
- .timer_create = alarm_timer_create,
- .timer_set = alarm_timer_set,
- .timer_del = alarm_timer_del,
- .timer_get = alarm_timer_get,
- .nsleep = alarm_timer_nsleep,
- };
alarmtimer_rtc_timer_init();
- if (IS_ENABLED(CONFIG_POSIX_TIMERS)) {
- posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
- posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
- }
-
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 93621ae718d3..03918a19cf2d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
+ if (cs == curr_clocksource && cs->tick_stable)
+ cs->tick_stable(cs);
+
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 31d588d37a17..7e453005e078 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -434,7 +434,7 @@ static int pc_timer_settime(struct k_itimer *kit, int flags,
return err;
}
-struct k_clock clock_posix_dynamic = {
+const struct k_clock clock_posix_dynamic = {
.clock_getres = pc_clock_getres,
.clock_set = pc_clock_settime,
.clock_get = pc_clock_gettime,
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index d2a1e6dd0291..c99434739fd5 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1421,7 +1421,7 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
return posix_cpu_timer_create(timer);
}
-struct k_clock clock_posix_cpu = {
+const struct k_clock clock_posix_cpu = {
.clock_getres = posix_cpu_clock_getres,
.clock_set = posix_cpu_clock_set,
.clock_get = posix_cpu_clock_get,
@@ -1433,24 +1433,16 @@ struct k_clock clock_posix_cpu = {
.timer_get = posix_cpu_timer_get,
};
-static __init int init_posix_cpu_timers(void)
-{
- struct k_clock process = {
- .clock_getres = process_cpu_clock_getres,
- .clock_get = process_cpu_clock_get,
- .timer_create = process_cpu_timer_create,
- .nsleep = process_cpu_nsleep,
- .nsleep_restart = process_cpu_nsleep_restart,
- };
- struct k_clock thread = {
- .clock_getres = thread_cpu_clock_getres,
- .clock_get = thread_cpu_clock_get,
- .timer_create = thread_cpu_timer_create,
- };
-
- posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
- posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+const struct k_clock clock_process = {
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get = process_cpu_clock_get,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
+ .nsleep_restart = process_cpu_nsleep_restart,
+};
- return 0;
-}
-__initcall(init_posix_cpu_timers);
+const struct k_clock clock_thread = {
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get = thread_cpu_clock_get,
+ .timer_create = thread_cpu_timer_create,
+};
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 4d7b2ce09c27..0c0cccfa3586 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -125,8 +125,6 @@ static DEFINE_SPINLOCK(hash_lock);
* which we beg off on and pass to do_sys_settimeofday().
*/
-static struct k_clock posix_clocks[MAX_CLOCKS];
-
/*
* These ones are defined below.
*/
@@ -280,74 +278,87 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
+
+static const struct k_clock clock_realtime = {
+ .clock_getres = posix_get_hrtimer_res,
+ .clock_get = posix_clock_realtime_get,
+ .clock_set = posix_clock_realtime_set,
+ .clock_adj = posix_clock_realtime_adj,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+};
+
+static const struct k_clock clock_monotonic = {
+ .clock_getres = posix_get_hrtimer_res,
+ .clock_get = posix_ktime_get_ts,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+};
+
+static const struct k_clock clock_monotonic_raw = {
+ .clock_getres = posix_get_hrtimer_res,
+ .clock_get = posix_get_monotonic_raw,
+};
+
+static const struct k_clock clock_realtime_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_realtime_coarse,
+};
+
+static const struct k_clock clock_monotonic_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_monotonic_coarse,
+};
+
+static const struct k_clock clock_tai = {
+ .clock_getres = posix_get_hrtimer_res,
+ .clock_get = posix_get_tai,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+};
+
+static const struct k_clock clock_boottime = {
+ .clock_getres = posix_get_hrtimer_res,
+ .clock_get = posix_get_boottime,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+};
+
+static const struct k_clock * const posix_clocks[] = {
+ [CLOCK_REALTIME] = &clock_realtime,
+ [CLOCK_MONOTONIC] = &clock_monotonic,
+ [CLOCK_PROCESS_CPUTIME_ID] = &clock_process,
+ [CLOCK_THREAD_CPUTIME_ID] = &clock_thread,
+ [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw,
+ [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse,
+ [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse,
+ [CLOCK_BOOTTIME] = &clock_boottime,
+ [CLOCK_REALTIME_ALARM] = &alarm_clock,
+ [CLOCK_BOOTTIME_ALARM] = &alarm_clock,
+ [CLOCK_TAI] = &clock_tai,
+};
+
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
static __init int init_posix_timers(void)
{
- struct k_clock clock_realtime = {
- .clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_clock_realtime_get,
- .clock_set = posix_clock_realtime_set,
- .clock_adj = posix_clock_realtime_adj,
- .nsleep = common_nsleep,
- .nsleep_restart = hrtimer_nanosleep_restart,
- .timer_create = common_timer_create,
- .timer_set = common_timer_set,
- .timer_get = common_timer_get,
- .timer_del = common_timer_del,
- };
- struct k_clock clock_monotonic = {
- .clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_ktime_get_ts,
- .nsleep = common_nsleep,
- .nsleep_restart = hrtimer_nanosleep_restart,
- .timer_create = common_timer_create,
- .timer_set = common_timer_set,
- .timer_get = common_timer_get,
- .timer_del = common_timer_del,
- };
- struct k_clock clock_monotonic_raw = {
- .clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_monotonic_raw,
- };
- struct k_clock clock_realtime_coarse = {
- .clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_realtime_coarse,
- };
- struct k_clock clock_monotonic_coarse = {
- .clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_monotonic_coarse,
- };
- struct k_clock clock_tai = {
- .clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_tai,
- .nsleep = common_nsleep,
- .nsleep_restart = hrtimer_nanosleep_restart,
- .timer_create = common_timer_create,
- .timer_set = common_timer_set,
- .timer_get = common_timer_get,
- .timer_del = common_timer_del,
- };
- struct k_clock clock_boottime = {
- .clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_boottime,
- .nsleep = common_nsleep,
- .nsleep_restart = hrtimer_nanosleep_restart,
- .timer_create = common_timer_create,
- .timer_set = common_timer_set,
- .timer_get = common_timer_get,
- .timer_del = common_timer_del,
- };
-
- posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
- posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
- posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
- posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
- posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
- posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
- posix_timers_register_clock(CLOCK_TAI, &clock_tai);
-
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
NULL);
@@ -521,30 +532,6 @@ static struct pid *good_sigevent(sigevent_t * event)
return task_pid(rtn);
}
-void posix_timers_register_clock(const clockid_t clock_id,
- struct k_clock *new_clock)
-{
- if ((unsigned) clock_id >= MAX_CLOCKS) {
- printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
- clock_id);
- return;
- }
-
- if (!new_clock->clock_get) {
- printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
- clock_id);
- return;
- }
- if (!new_clock->clock_getres) {
- printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
- clock_id);
- return;
- }
-
- posix_clocks[clock_id] = *new_clock;
-}
-EXPORT_SYMBOL_GPL(posix_timers_register_clock);
-
static struct k_itimer * alloc_posix_timer(void)
{
struct k_itimer *tmr;
@@ -581,15 +568,15 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
}
-static struct k_clock *clockid_to_kclock(const clockid_t id)
+static const struct k_clock *clockid_to_kclock(const clockid_t id)
{
if (id < 0)
return (id & CLOCKFD_MASK) == CLOCKFD ?
&clock_posix_dynamic : &clock_posix_cpu;
- if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
+ if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id])
return NULL;
- return &posix_clocks[id];
+ return posix_clocks[id];
}
static int common_timer_create(struct k_itimer *new_timer)
@@ -604,7 +591,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
struct sigevent __user *, timer_event_spec,
timer_t __user *, created_timer_id)
{
- struct k_clock *kc = clockid_to_kclock(which_clock);
+ const struct k_clock *kc = clockid_to_kclock(which_clock);
struct k_itimer *new_timer;
int error, new_timer_id;
sigevent_t event;
@@ -781,7 +768,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct itimerspec64 cur_setting64;
struct itimerspec cur_setting;
struct k_itimer *timr;
- struct k_clock *kc;
+ const struct k_clock *kc;
unsigned long flags;
int ret = 0;
@@ -890,7 +877,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
struct itimerspec new_spec, old_spec;
struct k_itimer *timr;
unsigned long flag;
- struct k_clock *kc;
+ const struct k_clock *kc;
int error = 0;
if (!new_setting)
@@ -939,7 +926,7 @@ static int common_timer_del(struct k_itimer *timer)
static inline int timer_delete_hook(struct k_itimer *timer)
{
- struct k_clock *kc = clockid_to_kclock(timer->it_clock);
+ const struct k_clock *kc = clockid_to_kclock(timer->it_clock);
if (WARN_ON_ONCE(!kc || !kc->timer_del))
return -EINVAL;
@@ -1018,7 +1005,7 @@ void exit_itimers(struct signal_struct *sig)
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
- struct k_clock *kc = clockid_to_kclock(which_clock);
+ const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 new_tp64;
struct timespec new_tp;
@@ -1035,7 +1022,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
- struct k_clock *kc = clockid_to_kclock(which_clock);
+ const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 kernel_tp64;
struct timespec kernel_tp;
int error;
@@ -1055,7 +1042,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
struct timex __user *, utx)
{
- struct k_clock *kc = clockid_to_kclock(which_clock);
+ const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timex ktx;
int err;
@@ -1078,7 +1065,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct timespec __user *, tp)
{
- struct k_clock *kc = clockid_to_kclock(which_clock);
+ const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 rtn_tp64;
struct timespec rtn_tp;
int error;
@@ -1110,7 +1097,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
const struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
- struct k_clock *kc = clockid_to_kclock(which_clock);
+ const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 t64;
struct timespec t;
@@ -1136,7 +1123,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
long clock_nanosleep_restart(struct restart_block *restart_block)
{
clockid_t which_clock = restart_block->nanosleep.clockid;
- struct k_clock *kc = clockid_to_kclock(which_clock);
+ const struct k_clock *kc = clockid_to_kclock(which_clock);
if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
return -EINVAL;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 64c97fc130c4..2de9c553682f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
+ /*
+ * In case the current tick fired too early past its expected
+ * expiration, make sure we don't bypass the next clock reprogramming
+ * to the same deadline.
+ */
+ ts->next_tick = 0;
}
#endif
update_process_times(user_mode(regs));
@@ -554,7 +560,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
update_ts_time_stats(smp_processor_id(), ts, now, NULL);
ts->idle_active = 0;
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
}
static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
else
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+
+ /*
+ * Reset to make sure next tick stop doesn't get fooled by past
+ * cached clock deadline.
+ */
+ ts->next_tick = 0;
}
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
@@ -771,8 +783,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
tick = expires;
/* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && (expires == dev->next_event))
- goto out;
+ if (ts->tick_stopped && (expires == ts->next_tick)) {
+ /* Sanity check: make sure clockevent is actually programmed */
+ if (likely(dev->next_event <= ts->next_tick))
+ goto out;
+
+ WARN_ON_ONCE(1);
+ printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
+ basemono, ts->next_tick, dev->next_event,
+ hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
+ }
/*
* nohz_stop_sched_tick can be called several times before
@@ -791,6 +811,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
+ ts->next_tick = tick;
+
/*
* If the expiration time == KTIME_MAX, then we simply stop
* the tick timer.
@@ -806,7 +828,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
else
tick_program_event(tick, 1);
out:
- /* Update the estimated sleep length */
+ /*
+ * Update the estimated sleep length until the next timer
+ * (not only the tick).
+ */
ts->sleep_length = ktime_sub(dev->next_event, now);
return tick;
}
@@ -864,6 +889,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ /*
+ * Make sure the CPU doesn't get fooled by obsolete tick
+ * deadline if it comes back online later.
+ */
+ ts->next_tick = 0;
return false;
}
@@ -1172,6 +1202,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
*/
if (regs)
tick_sched_handle(ts, regs);
+ else
+ ts->next_tick = 0;
/* No need to reprogram if we are in idle or full dynticks mode */
if (unlikely(ts->tick_stopped))
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index bf38226e5c17..075444e3d48e 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -27,6 +27,7 @@ enum tick_nohz_mode {
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
+ * @next_tick: Next tick to be fired when in dynticks mode.
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_calls: Total number of idle calls
@@ -44,6 +45,7 @@ struct tick_sched {
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode;
ktime_t last_tick;
+ ktime_t next_tick;
int inidle;
int tick_stopped;
unsigned long idle_jiffies;
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 81dedaab36cc..4731a0895760 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
}
EXPORT_SYMBOL(cpumask_any_but);
+/**
+ * cpumask_next_wrap - helper to implement for_each_cpu_wrap
+ * @n: the cpu prior to the place to search
+ * @mask: the cpumask pointer
+ * @start: the start point of the iteration
+ * @wrap: assume @n crossing @start terminates the iteration
+ *
+ * Returns >= nr_cpu_ids on completion
+ *
+ * Note: the @wrap argument is required for the start condition when
+ * we cannot assume @start is set in @mask.
+ */
+int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+{
+ int next;
+
+again:
+ next = cpumask_next(n, mask);
+
+ if (wrap && n < start && next >= start) {
+ return nr_cpumask_bits;
+
+ } else if (next >= nr_cpumask_bits) {
+ wrap = true;
+ n = -1;
+ goto again;
+ }
+
+ return next;
+}
+EXPORT_SYMBOL(cpumask_next_wrap);
+
/* These are not inline because of header tangles. */
#ifdef CONFIG_CPUMASK_OFFSTACK
/**
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 690d75b132fa..2fb007be0212 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -28,7 +28,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
/*
* It is valid to assume CPU-locality during early bootup:
*/
- if (system_state != SYSTEM_RUNNING)
+ if (system_state < SYSTEM_SCHEDULING)
goto out;
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index d405f0e0ee96..130c238fe384 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
void try_to_unmap_flush(void)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- int cpu;
if (!tlb_ubc->flush_required)
return;
- cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- local_flush_tlb();
- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
- }
-
- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
- cpumask_clear(&tlb_ubc->cpumask);
+ arch_tlbbatch_flush(&tlb_ubc->arch);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
- put_cpu();
}
/* Flush iff there are potentially writable TLB entries that can race with IO */
@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+ arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ad39bbc79e6..c3c1c6ac62da 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3652,7 +3652,7 @@ int kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
- BUG_ON(system_state == SYSTEM_BOOTING);
+ BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;