diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-05-30 12:45:57 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-05-30 12:45:57 +1000 |
commit | 186ac60535bb981da129890d3bf1ed99d88da8c9 (patch) | |
tree | bea294b3409d6d241d17037faa9be406ba01ee7d | |
parent | fdabcf0c35152c82c06b09831b9b0812b7315146 (diff) | |
parent | 8e3c0499d0c92e96cb9295d45f1ac3784ecad8ec (diff) |
Merge remote-tracking branch 'tip/auto-latest'
112 files changed, 2053 insertions, 2167 deletions
diff --git a/Documentation/IRQ-domain.txt b/Documentation/IRQ-domain.txt index 82001a25a14b..1f246eb25ca5 100644 --- a/Documentation/IRQ-domain.txt +++ b/Documentation/IRQ-domain.txt @@ -231,5 +231,42 @@ needs to: 4) No need to implement irq_domain_ops.map and irq_domain_ops.unmap, they are unused with hierarchy irq_domain. -Hierarchy irq_domain may also be used to support other architectures, -such as ARM, ARM64 etc. +Hierarchy irq_domain is in no way x86 specific, and is heavily used to +support other architectures, such as ARM, ARM64 etc. + +=== Debugging === + +If you switch on CONFIG_IRQ_DOMAIN_DEBUG (which depends on +CONFIG_IRQ_DOMAIN and CONFIG_DEBUG_FS), you will find a new file in +your debugfs mount point, called irq_domain_mapping. This file +contains a live snapshot of all the IRQ domains in the system: + + name mapped linear-max direct-max devtree-node + pl061 8 8 0 /smb/gpio@e0080000 + pl061 8 8 0 /smb/gpio@e1050000 + pMSI 0 0 0 /interrupt-controller@e1101000/v2m@e0080000 + MSI 37 0 0 /interrupt-controller@e1101000/v2m@e0080000 + GICv2m 37 0 0 /interrupt-controller@e1101000/v2m@e0080000 + GICv2 448 448 0 /interrupt-controller@e1101000 + +it also iterates over the interrupts to display their mapping in the +domains, and makes the domain stacking visible: + + +irq hwirq chip name chip data active type domain + 1 0x00019 GICv2 0xffff00000916bfd8 * LINEAR GICv2 + 2 0x0001d GICv2 0xffff00000916bfd8 LINEAR GICv2 + 3 0x0001e GICv2 0xffff00000916bfd8 * LINEAR GICv2 + 4 0x0001b GICv2 0xffff00000916bfd8 * LINEAR GICv2 + 5 0x0001a GICv2 0xffff00000916bfd8 LINEAR GICv2 +[...] + 96 0x81808 MSI 0x (null) RADIX MSI + 96+ 0x00063 GICv2m 0xffff8003ee116980 RADIX GICv2m + 96+ 0x00063 GICv2 0xffff00000916bfd8 LINEAR GICv2 + 97 0x08800 MSI 0x (null) * RADIX MSI + 97+ 0x00064 GICv2m 0xffff8003ee116980 * RADIX GICv2m + 97+ 0x00064 GICv2 0xffff00000916bfd8 * LINEAR GICv2 + +Here, interrupts 1-5 are only using a single domain, while 96 and 97 +are build out of a stack of three domain, each level performing a +particular function. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 15f79c27748d..4e4c3402412e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2127,6 +2127,12 @@ memmap=nn[KMG]@ss[KMG] [KNL] Force usage of a specific region of memory. Region of memory to be used is from ss to ss+nn. + If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG], + which limits max address to nn[KMG]. + Multiple different regions can be specified, + comma delimited. + Example: + memmap=100M@2G,100M#3G,1G!1024G memmap=nn[KMG]#ss[KMG] [KNL,ACPI] Mark specific memory as ACPI data. @@ -2139,6 +2145,9 @@ memmap=64K$0x18690000 or memmap=0x10000$0x18690000 + Some bootloaders may need an escape character before '$', + like Grub2, otherwise '$' and the following number + will be eaten. memmap=nn[KMG]!ss[KMG] [KNL,X86] Mark specific memory as protected. diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index be3b3fbd382f..63cb4c7c6593 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -1090,7 +1090,7 @@ static int __init arch_hw_breakpoint_init(void) * driven low on this core and there isn't an architected way to * determine that. */ - get_online_cpus(); + cpus_read_lock(); register_undef_hook(&debug_reg_hook); /* @@ -1098,15 +1098,16 @@ static int __init arch_hw_breakpoint_init(void) * assume that a halting debugger will leave the world in a nice state * for us. */ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm/hw_breakpoint:online", - dbg_reset_online, NULL); + ret = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "arm/hw_breakpoint:online", + dbg_reset_online, NULL); unregister_undef_hook(&debug_reg_hook); if (WARN_ON(ret < 0) || !cpumask_empty(&debug_err_mask)) { core_num_brps = 0; core_num_wrps = 0; if (ret > 0) cpuhp_remove_state_nocalls(ret); - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -1124,7 +1125,7 @@ static int __init arch_hw_breakpoint_init(void) TRAP_HWBKPT, "watchpoint debug exception"); hook_ifault_code(FAULT_CODE_DEBUG, hw_breakpoint_pending, SIGTRAP, TRAP_HWBKPT, "breakpoint debug exception"); - put_online_cpus(); + cpus_read_unlock(); /* Register PM notifiers. */ pm_init(); diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c index 020560b2dcb7..a1a34722c655 100644 --- a/arch/arm/kernel/patch.c +++ b/arch/arm/kernel/patch.c @@ -124,5 +124,5 @@ void __kprobes patch_text(void *addr, unsigned int insn) .insn = insn, }; - stop_machine(patch_text_stop_machine, &patch, NULL); + stop_machine_cpuslocked(patch_text_stop_machine, &patch, NULL); } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 572a8df1b766..c9a0a5299827 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -555,8 +555,7 @@ static DEFINE_RAW_SPINLOCK(stop_lock); */ static void ipi_cpu_stop(unsigned int cpu) { - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) { + if (system_state <= SYSTEM_RUNNING) { raw_spin_lock(&stop_lock); pr_crit("CPU%u: stopping\n", cpu); dump_stack(); diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index ad1f4e6a9e33..52d1cd14fda4 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -182,7 +182,8 @@ void __kprobes kprobes_remove_breakpoint(void *addr, unsigned int insn) .addr = addr, .insn = insn, }; - stop_machine(__kprobes_remove_breakpoint, &p, cpu_online_mask); + stop_machine_cpuslocked(__kprobes_remove_breakpoint, &p, + cpu_online_mask); } void __kprobes arch_disarm_kprobe(struct kprobe *p) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 29cb2ca756f6..4214c38d016b 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -433,7 +433,6 @@ u32 aarch64_set_branch_offset(u32 insn, s32 offset); bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn); int aarch64_insn_patch_text_nosync(void *addr, u32 insn); -int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); s32 aarch64_insn_adrp_get_offset(u32 insn); diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index b884a926a632..cd872133e88e 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -255,6 +255,7 @@ static int __kprobes aarch64_insn_patch_text_cb(void *arg) return ret; } +static int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt) { struct aarch64_insn_patch patch = { @@ -267,8 +268,8 @@ int __kprobes aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt) if (cnt <= 0) return -EINVAL; - return stop_machine(aarch64_insn_patch_text_cb, &patch, - cpu_online_mask); + return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &patch, + cpu_online_mask); } int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 6e0e16a3a7d4..321119881abf 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -961,8 +961,7 @@ void smp_send_stop(void) cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) + if (system_state <= SYSTEM_RUNNING) pr_crit("SMP: stopping secondary CPUs\n"); smp_cross_call(&mask, IPI_CPU_STOP); } diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index 232a12bf3f99..2dbbb7c66043 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -567,8 +567,7 @@ static void stop_this_cpu(void *data) { unsigned int cpu = smp_processor_id(); - if (system_state == SYSTEM_BOOTING || - system_state == SYSTEM_RUNNING) { + if (system_state <= SYSTEM_RUNNING) { spin_lock(&stop_lock); pr_crit("CPU%u: stopping\n", cpu); dump_stack(); diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c index 3e586daa3a32..32e3168316cd 100644 --- a/arch/mips/kernel/jump_label.c +++ b/arch/mips/kernel/jump_label.c @@ -58,7 +58,6 @@ void arch_jump_label_transform(struct jump_entry *e, insn.word = 0; /* nop */ } - get_online_cpus(); mutex_lock(&text_mutex); if (IS_ENABLED(CONFIG_CPU_MICROMIPS)) { insn_p->halfword[0] = insn.word >> 16; @@ -70,7 +69,6 @@ void arch_jump_label_transform(struct jump_entry *e, (unsigned long)insn_p + sizeof(*insn_p)); mutex_unlock(&text_mutex); - put_online_cpus(); } #endif /* HAVE_JUMP_LABEL */ diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index df2a41647d8e..1069f74fca47 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -97,7 +97,7 @@ int smp_generic_cpu_bootable(unsigned int nr) /* Special case - we inhibit secondary thread startup * during boot if the user requests it. */ - if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) { + if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) return 0; if (smt_enabled_at_boot diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 42b7a4fd57d9..48a6bd160011 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3317,7 +3317,7 @@ void kvmppc_alloc_host_rm_ops(void) return; } - get_online_cpus(); + cpus_read_lock(); for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { if (!cpu_online(cpu)) @@ -3339,17 +3339,17 @@ void kvmppc_alloc_host_rm_ops(void) l_ops = (unsigned long) ops; if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { - put_online_cpus(); + cpus_read_unlock(); kfree(ops->rm_core); kfree(ops); return; } - cpuhp_setup_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE, - "ppc/kvm_book3s:prepare", - kvmppc_set_host_core, - kvmppc_clear_host_core); - put_online_cpus(); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE, + "ppc/kvm_book3s:prepare", + kvmppc_set_host_core, + kvmppc_clear_host_core); + cpus_read_unlock(); } void kvmppc_free_host_rm_ops(void) diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 0babef11136f..e6230f104dd9 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -348,7 +348,7 @@ static int set_subcores_per_core(int new_mode) state->master = 0; } - get_online_cpus(); + cpus_read_lock(); /* This cpu will update the globals before exiting stop machine */ this_cpu_ptr(&split_state)->master = 1; @@ -356,9 +356,10 @@ static int set_subcores_per_core(int new_mode) /* Ensure state is consistent before we call the other cpus */ mb(); - stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask); + stop_machine_cpuslocked(cpu_update_split_mode, &new_mode, + cpu_online_mask); - put_online_cpus(); + cpus_read_unlock(); return 0; } diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index 6aa630a8d24f..262506cee4c3 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -93,7 +93,7 @@ void arch_jump_label_transform(struct jump_entry *entry, args.entry = entry; args.type = type; - stop_machine(__sm_arch_jump_label_transform, &args, NULL); + stop_machine_cpuslocked(__sm_arch_jump_label_transform, &args, NULL); } void arch_jump_label_transform_static(struct jump_entry *entry, diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 3d6a99746454..6842e4501e2e 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -196,7 +196,7 @@ void arch_arm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; - stop_machine(swap_instruction, &args, NULL); + stop_machine_cpuslocked(swap_instruction, &args, NULL); } NOKPROBE_SYMBOL(arch_arm_kprobe); @@ -204,7 +204,7 @@ void arch_disarm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; - stop_machine(swap_instruction, &args, NULL); + stop_machine_cpuslocked(swap_instruction, &args, NULL); } NOKPROBE_SYMBOL(arch_disarm_kprobe); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index c3a52f9a69a0..192efdfac918 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -636,10 +636,10 @@ static void stp_work_fn(struct work_struct *work) goto out_unlock; memset(&stp_sync, 0, sizeof(stp_sync)); - get_online_cpus(); + cpus_read_lock(); atomic_set(&stp_sync.cpus, num_online_cpus() - 1); - stop_machine(stp_sync_clock, &stp_sync, cpu_online_mask); - put_online_cpus(); + stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask); + cpus_read_unlock(); if (!check_sync_clock()) /* diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c index 07933b9e9ce0..93adde1ac166 100644 --- a/arch/sparc/kernel/jump_label.c +++ b/arch/sparc/kernel/jump_label.c @@ -41,12 +41,10 @@ void arch_jump_label_transform(struct jump_entry *entry, val = 0x01000000; } - get_online_cpus(); mutex_lock(&text_mutex); *insn = val; flushi(insn); mutex_unlock(&text_mutex); - put_online_cpus(); } #endif diff --git a/arch/tile/kernel/jump_label.c b/arch/tile/kernel/jump_label.c index 07802d586988..93931a46625b 100644 --- a/arch/tile/kernel/jump_label.c +++ b/arch/tile/kernel/jump_label.c @@ -45,14 +45,12 @@ static void __jump_label_transform(struct jump_entry *e, void arch_jump_label_transform(struct jump_entry *e, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(e, type); flush_icache_range(e->code, e->code + sizeof(tilegx_bundle_bits)); mutex_unlock(&text_mutex); - put_online_cpus(); } __init_or_module void arch_jump_label_transform_static(struct jump_entry *e, diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index 73ccf63b0f48..9dc1ce6ba3c0 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -13,7 +13,7 @@ static inline char rdfs8(addr_t addr) return *((char *)(fs + addr)); } #include "../cmdline.c" -static unsigned long get_cmd_line_ptr(void) +unsigned long get_cmd_line_ptr(void) { unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 54c24f0a43d3..e0eba12bffe7 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -9,16 +9,41 @@ * contain the entire properly aligned running kernel image. * */ + +/* + * isspace() in linux/ctype.h is expected by next_args() to filter + * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h, + * since isdigit() is implemented in both of them. Hence disable it + * here. + */ +#define BOOT_CTYPE_H + +/* + * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. + * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL + * which is meaningless and will cause compiling error in some cases. + * So do not include linux/export.h and define EXPORT_SYMBOL(sym) + * as empty. + */ +#define _LINUX_EXPORT_H +#define EXPORT_SYMBOL(sym) + #include "misc.h" #include "error.h" -#include "../boot.h" #include <generated/compile.h> #include <linux/module.h> #include <linux/uts.h> #include <linux/utsname.h> +#include <linux/ctype.h> #include <generated/utsrelease.h> +/* Macros used by the included decompressor code below. */ +#define STATIC +#include <linux/decompress/mm.h> + +extern unsigned long get_cmd_line_ptr(void); + /* Simplified build-specific string for starting entropy. */ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; @@ -62,6 +87,11 @@ struct mem_vector { static bool memmap_too_large; + +/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ +unsigned long long mem_limit = ULLONG_MAX; + + enum mem_avoid_index { MEM_AVOID_ZO_RANGE = 0, MEM_AVOID_INITRD, @@ -85,49 +115,14 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) return true; } -/** - * _memparse - Parse a string with mem suffixes into a number - * @ptr: Where parse begins - * @retptr: (output) Optional pointer to next char after parse completes - * - * Parses a string into a number. The number stored at @ptr is - * potentially suffixed with K, M, G, T, P, E. - */ -static unsigned long long _memparse(const char *ptr, char **retptr) +char *skip_spaces(const char *str) { - char *endptr; /* Local pointer to end of parsed string */ - - unsigned long long ret = simple_strtoull(ptr, &endptr, 0); - - switch (*endptr) { - case 'E': - case 'e': - ret <<= 10; - case 'P': - case 'p': - ret <<= 10; - case 'T': - case 't': - ret <<= 10; - case 'G': - case 'g': - ret <<= 10; - case 'M': - case 'm': - ret <<= 10; - case 'K': - case 'k': - ret <<= 10; - endptr++; - default: - break; - } - - if (retptr) - *retptr = endptr; - - return ret; + while (isspace(*str)) + ++str; + return (char *)str; } +#include "../../../../lib/ctype.c" +#include "../../../../lib/cmdline.c" static int parse_memmap(char *p, unsigned long long *start, unsigned long long *size) @@ -142,40 +137,41 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) return -EINVAL; oldp = p; - *size = _memparse(p, &p); + *size = memparse(p, &p); if (p == oldp) return -EINVAL; switch (*p) { - case '@': - /* Skip this region, usable */ - *start = 0; - *size = 0; - return 0; case '#': case '$': case '!': - *start = _memparse(p + 1, &p); + *start = memparse(p + 1, &p); + return 0; + case '@': + /* memmap=nn@ss specifies usable region, should be skipped */ + *size = 0; + /* Fall through */ + default: + /* + * If w/o offset, only size specified, memmap=nn[KMG] has the + * same behaviour as mem=nn[KMG]. It limits the max address + * system can use. Region above the limit should be avoided. + */ + *start = 0; return 0; } return -EINVAL; } -static void mem_avoid_memmap(void) +static void mem_avoid_memmap(char *str) { - char arg[128]; + static int i; int rc; - int i; - char *str; - /* See if we have any memmap areas */ - rc = cmdline_find_option("memmap", arg, sizeof(arg)); - if (rc <= 0) + if (i >= MAX_MEMMAP_REGIONS) return; - i = 0; - str = arg; while (str && (i < MAX_MEMMAP_REGIONS)) { int rc; unsigned long long start, size; @@ -188,9 +184,14 @@ static void mem_avoid_memmap(void) if (rc < 0) break; str = k; - /* A usable region that should not be skipped */ - if (size == 0) + + if (start == 0) { + /* Store the specified memory limit if size > 0 */ + if (size > 0) + mem_limit = size; + continue; + } mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start; mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size; @@ -202,6 +203,57 @@ static void mem_avoid_memmap(void) memmap_too_large = true; } +static int handle_mem_memmap(void) +{ + char *args = (char *)get_cmd_line_ptr(); + size_t len = strlen((char *)args); + char *tmp_cmdline; + char *param, *val; + u64 mem_size; + + if (!strstr(args, "memmap=") && !strstr(args, "mem=")) + return 0; + + tmp_cmdline = malloc(len + 1); + if (!tmp_cmdline ) + error("Failed to allocate space for tmp_cmdline"); + + memcpy(tmp_cmdline, args, len); + tmp_cmdline[len] = 0; + args = tmp_cmdline; + + /* Chew leading spaces */ + args = skip_spaces(args); + + while (*args) { + args = next_arg(args, ¶m, &val); + /* Stop at -- */ + if (!val && strcmp(param, "--") == 0) { + warn("Only '--' specified in cmdline"); + free(tmp_cmdline); + return -1; + } + + if (!strcmp(param, "memmap")) { + mem_avoid_memmap(val); + } else if (!strcmp(param, "mem")) { + char *p = val; + + if (!strcmp(p, "nopentium")) + continue; + mem_size = memparse(p, &p); + if (mem_size == 0) { + free(tmp_cmdline); + return -EINVAL; + } + mem_limit = mem_size; + } + } + + free(tmp_cmdline); + return 0; +} + /* * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). * The mem_avoid array is used to store the ranges that need to be avoided @@ -323,7 +375,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* We don't need to set a mapping for setup_data. */ /* Mark the memmap regions we need to avoid */ - mem_avoid_memmap(); + handle_mem_memmap(); #ifdef CONFIG_X86_VERBOSE_BOOTUP /* Make sure video RAM can be used. */ @@ -432,7 +484,8 @@ static void process_e820_entry(struct boot_e820_entry *entry, { struct mem_vector region, overlap; struct slot_area slot_area; - unsigned long start_orig; + unsigned long start_orig, end; + struct boot_e820_entry cur_entry; /* Skip non-RAM entries. */ if (entry->type != E820_TYPE_RAM) @@ -446,8 +499,15 @@ static void process_e820_entry(struct boot_e820_entry *entry, if (entry->addr + entry->size < minimum) return; - region.start = entry->addr; - region.size = entry->size; + /* Ignore entries above memory limit */ + end = min(entry->size + entry->addr, mem_limit); + if (entry->addr >= end) + return; + cur_entry.addr = entry->addr; + cur_entry.size = end - entry->addr; + + region.start = cur_entry.addr; + region.size = cur_entry.size; /* Give up if slot area array is full. */ while (slot_area_index < MAX_SLOT_AREA) { @@ -461,7 +521,7 @@ static void process_e820_entry(struct boot_e820_entry *entry, region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); /* Did we raise the address above this e820 region? */ - if (region.start > entry->addr + entry->size) + if (region.start > cur_entry.addr + cur_entry.size) return; /* Reduce size by any delta from the original address. */ diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 5457b02fc050..630e3664906b 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -122,6 +122,14 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas return result; } +long simple_strtol(const char *cp, char **endp, unsigned int base) +{ + if (*cp == '-') + return -simple_strtoull(cp + 1, endp, base); + + return simple_strtoull(cp, endp, base); +} + /** * strlen - Find the length of a string * @s: The string to be sized diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 580b60f5ac83..637feed9a594 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1750,6 +1750,8 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) return ret; } +static struct attribute_group x86_pmu_attr_group; + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1813,6 +1815,14 @@ static int __init init_hw_perf_events(void) x86_pmu_events_group.attrs = tmp; } + if (x86_pmu.attrs) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs); + if (!WARN_ON(!tmp)) + x86_pmu_attr_group.attrs = tmp; + } + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu.num_counters); @@ -2224,7 +2234,6 @@ void perf_check_microcode(void) if (x86_pmu.check_microcode) x86_pmu.check_microcode(); } -EXPORT_SYMBOL_GPL(perf_check_microcode); static struct pmu pmu = { .pmu_enable = x86_pmu_enable, @@ -2255,7 +2264,7 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { - struct cyc2ns_data *data; + struct cyc2ns_data data; u64 offset; userpg->cap_user_time = 0; @@ -2267,17 +2276,17 @@ void arch_perf_update_userpage(struct perf_event *event, if (!using_native_sched_clock() || !sched_clock_stable()) return; - data = cyc2ns_read_begin(); + cyc2ns_read_begin(&data); - offset = data->cyc2ns_offset + __sched_clock_offset; + offset = data.cyc2ns_offset + __sched_clock_offset; /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. */ userpg->cap_user_time = 1; - userpg->time_mult = data->cyc2ns_mul; - userpg->time_shift = data->cyc2ns_shift; + userpg->time_mult = data.cyc2ns_mul; + userpg->time_shift = data.cyc2ns_shift; userpg->time_offset = offset - now; /* @@ -2289,7 +2298,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->time_zero = offset; } - cyc2ns_read_end(data); + cyc2ns_read_end(); } void diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a6d91d4e37a1..de26a370176f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3160,6 +3160,19 @@ err: return -ENOMEM; } +static void flip_smm_bit(void *data) +{ + unsigned long set = *(unsigned long *)data; + + if (set > 0) { + msr_set_bit(MSR_IA32_DEBUGCTLMSR, + DEBUGCTLMSR_FREEZE_IN_SMM_BIT); + } else { + msr_clear_bit(MSR_IA32_DEBUGCTLMSR, + DEBUGCTLMSR_FREEZE_IN_SMM_BIT); + } +} + static void intel_pmu_cpu_starting(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -3174,6 +3187,8 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; + flip_smm_bit(&x86_pmu.attr_freeze_on_smi); + if (!cpuc->shared_regs) return; @@ -3410,12 +3425,10 @@ static void intel_snb_check_microcode(void) int pebs_broken = 0; int cpu; - get_online_cpus(); for_each_online_cpu(cpu) { if ((pebs_broken = intel_snb_pebs_broken(cpu))) break; } - put_online_cpus(); if (pebs_broken == x86_pmu.pebs_broken) return; @@ -3488,7 +3501,9 @@ static bool check_msr(unsigned long msr, u64 mask) static __init void intel_sandybridge_quirk(void) { x86_pmu.check_microcode = intel_snb_check_microcode; + cpus_read_lock(); intel_snb_check_microcode(); + cpus_read_unlock(); } static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { @@ -3595,6 +3610,52 @@ static struct attribute *hsw_events_attrs[] = { NULL }; +static ssize_t freeze_on_smi_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi); +} + +static DEFINE_MUTEX(freeze_on_smi_mutex); + +static ssize_t freeze_on_smi_store(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val > 1) + return -EINVAL; + + mutex_lock(&freeze_on_smi_mutex); + + if (x86_pmu.attr_freeze_on_smi == val) + goto done; + + x86_pmu.attr_freeze_on_smi = val; + + get_online_cpus(); + on_each_cpu(flip_smm_bit, &val, 1); + put_online_cpus(); +done: + mutex_unlock(&freeze_on_smi_mutex); + + return count; +} + +static DEVICE_ATTR_RW(freeze_on_smi); + +static struct attribute *intel_pmu_attrs[] = { + &dev_attr_freeze_on_smi.attr, + NULL, +}; + __init int intel_pmu_init(void) { union cpuid10_edx edx; @@ -3641,6 +3702,8 @@ __init int intel_pmu_init(void) x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + + x86_pmu.attrs = intel_pmu_attrs; /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: @@ -4112,13 +4175,12 @@ static __init int fixup_ht_bug(void) lockup_detector_resume(); - get_online_cpus(); + cpus_read_lock(); - for_each_online_cpu(c) { + for_each_online_cpu(c) free_excl_cntrs(c); - } - put_online_cpus(); + cpus_read_unlock(); pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); return 0; } diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 8c00dc09a5d2..2521f771f2f5 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -1682,7 +1682,7 @@ static int __init intel_cqm_init(void) * * Also, check that the scales match on all cpus. */ - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -1746,14 +1746,14 @@ static int __init intel_cqm_init(void) * Setup the hot cpu notifier once we are sure cqm * is enabled to avoid notifier leak. */ - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); - + cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, + "perf/x86/cqm:starting", + intel_cqm_cpu_starting, NULL); + cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, + "perf/x86/cqm:online", + NULL, intel_cqm_cpu_exit); out: - put_online_cpus(); + cpus_read_unlock(); if (ret) { kfree(str); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index be3d36254040..53728eea1bed 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -562,6 +562,9 @@ struct x86_pmu { ssize_t (*events_sysfs_show)(char *page, u64 config); struct attribute **cpu_events; + unsigned long attr_freeze_on_smi; + struct attribute **attrs; + /* * CPU Hotplug hooks */ diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 00c88a01301d..da181ad1d5f8 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -3,6 +3,7 @@ #include <linux/ioport.h> #include <linux/pci.h> +#include <linux/refcount.h> struct amd_nb_bus_dev_range { u8 bus; @@ -55,7 +56,7 @@ struct threshold_bank { struct threshold_block *blocks; /* initialized to the number of CPUs on the node sharing this bank */ - atomic_t cpus; + refcount_t cpus; }; struct amd_northbridge { diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d8638e2419cc..d406894cd9a2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -137,6 +137,8 @@ #define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 +#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) #define MSR_PEBS_FRONTEND 0x000003f7 diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 27e9f9d769b8..2016962103df 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -29,11 +29,9 @@ struct cyc2ns_data { u32 cyc2ns_mul; u32 cyc2ns_shift; u64 cyc2ns_offset; - u32 __count; - /* u32 hole */ -}; /* 24 bytes -- do not grow */ +}; /* 16 bytes */ -extern struct cyc2ns_data *cyc2ns_read_begin(void); -extern void cyc2ns_read_end(struct cyc2ns_data *); +extern void cyc2ns_read_begin(struct cyc2ns_data *); +extern void cyc2ns_read_end(void); #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h new file mode 100644 index 000000000000..01a6de16fb96 --- /dev/null +++ b/arch/x86/include/asm/tlbbatch.h @@ -0,0 +1,16 @@ +#ifndef _ARCH_X86_TLBBATCH_H +#define _ARCH_X86_TLBBATCH_H + +#include <linux/cpumask.h> + +#ifdef CONFIG_SMP +struct arch_tlbflush_unmap_batch { + /* + * Each bit set is a CPU that potentially has a TLB entry for one of + * the PFNs being flushed.. + */ + struct cpumask cpumask; +}; +#endif + +#endif /* _ARCH_X86_TLBBATCH_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6ed9ea469b48..8f6e2f87511b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -307,11 +307,15 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) extern void flush_tlb_all(void); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); +static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) +{ + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); +} + void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -325,6 +329,14 @@ static inline void reset_lazy_tlbstate(void) this_cpu_write(cpu_tlbstate.active_mm, &init_mm); } +static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm) +{ + cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); +} + +extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); + #endif /* SMP */ #ifndef CONFIG_PARAVIRT diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 347bb9f65737..247880fc29f9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); static struct irq_chip ioapic_chip, ioapic_ir_chip; -#ifdef CONFIG_X86_32 -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for_each_ioapic_pin(apic, pin) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0))) - return irq_trigger(idx); - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} -#else -static inline int IO_APIC_irq_trigger(int irq) -{ - return 1; -} -#endif - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 6e4a047e4b68..d00f299f2ada 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void get_smca_bank_info(unsigned int bank) +static void smca_configure(unsigned int bank, unsigned int cpu) { - unsigned int i, hwid_mcatype, cpu = smp_processor_id(); + unsigned int i, hwid_mcatype; struct smca_hwid *s_hwid; - u32 high, instance_id; + u32 high, low; + u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + /* Set appropriate bits in MCA_CONFIG */ + if (!rdmsr_safe(smca_config, &low, &high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + high |= BIT(0); + + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((low & BIT(5)) && !((high >> 5) & 0x3)) + high |= BIT(5); + + wrmsr(smca_config, low, high); + } /* Collect bank_info using CPU 0 for now. */ if (cpu) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) { + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } @@ -191,7 +222,7 @@ static void get_smca_bank_info(unsigned int bank) smca_get_name(s_hwid->bank_type)); smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = instance_id; + smca_banks[bank].id = low; smca_banks[bank].sysfs_id = s_hwid->count++; break; } @@ -433,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); - u32 smca_low, smca_high, smca_addr; + u32 smca_low, smca_high; struct threshold_block b; int new; @@ -457,51 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, goto set_offset; } - smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); - - if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { - /* - * OS is required to set the MCAX bit to acknowledge that it is - * now using the new MSR ranges and new registers under each - * bank. It also means that the OS will configure deferred - * errors in the new MCx_CONFIG register. If the bit is not set, - * uncorrectable errors will cause a system panic. - * - * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) - */ - smca_high |= BIT(0); - - /* - * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR} - * registers with the option of additionally logging to - * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set. - * - * This bit is usually set by BIOS to retain the old behavior - * for OSes that don't use the new registers. Linux supports the - * new registers so let's disable that additional logging here. - * - * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high - * portion of the MSR). - */ - smca_high &= ~BIT(2); - - /* - * SMCA sets the Deferred Error Interrupt type per bank. - * - * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us - * if the DeferredIntType bit field is available. - * - * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the - * high portion of the MSR). OS should set this to 0x1 to enable - * APIC based interrupt. First, check that no interrupt has been - * set. - */ - if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) - smca_high |= BIT(5); - - wrmsr(smca_addr, smca_low, smca_high); - } - /* Gather LVT offset for thresholding: */ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) goto out; @@ -530,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) - get_smca_bank_info(bank); + smca_configure(bank, cpu); for (block = 0; block < NR_BLOCKS; ++block) { address = get_block_address(cpu, address, low, high, bank, block); @@ -755,37 +741,19 @@ out_err: } EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); -static void -__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) +static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - u32 msr_status = msr_ops.status(bank); - u32 msr_addr = msr_ops.addr(bank); struct mce m; - u64 status; - - WARN_ON_ONCE(deferred_err && threshold_err); - - if (deferred_err && mce_flags.smca) { - msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank); - msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank); - } - - rdmsrl(msr_status, status); - - if (!(status & MCI_STATUS_VAL)) - return; mce_setup(&m); m.status = status; + m.misc = misc; m.bank = bank; m.tsc = rdtsc(); - if (threshold_err) - m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) { - rdmsrl(msr_addr, m.addr); + m.addr = addr; /* * Extract [55:<lsb>] where lsb is the least significant @@ -806,8 +774,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) } mce_log(&m); - - wrmsrl(msr_status, 0); } static inline void __smp_deferred_error_interrupt(void) @@ -832,45 +798,85 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) exiting_ack_irq(); } -/* APIC interrupt handler for deferred errors */ -static void amd_deferred_error_interrupt(void) +/* + * Returns true if the logged error is deferred. False, otherwise. + */ +static inline bool +_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { - unsigned int bank; - u32 msr_status; - u64 status; + u64 status, addr = 0; - for (bank = 0; bank < mca_cfg.banks; ++bank) { - msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank) - : msr_ops.status(bank); + rdmsrl(msr_stat, status); + if (!(status & MCI_STATUS_VAL)) + return false; - rdmsrl(msr_status, status); + if (status & MCI_STATUS_ADDRV) + rdmsrl(msr_addr, addr); - if (!(status & MCI_STATUS_VAL) || - !(status & MCI_STATUS_DEFERRED)) - continue; + __log_error(bank, status, addr, misc); - __log_error(bank, true, false, 0); - break; - } + wrmsrl(status, 0); + + return status & MCI_STATUS_DEFERRED; } /* - * APIC Interrupt Handler + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. */ +static void log_error_deferred(unsigned int bank) +{ + bool defrd; + + defrd = _log_error_bank(bank, msr_ops.status(bank), + msr_ops.addr(bank), 0); + + if (!mce_flags.smca) + return; + + /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ + if (defrd) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return; + } + + /* + * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check + * for a valid error. + */ + _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), + MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) + log_error_deferred(bank); +} + +static void log_error_thresholding(unsigned int bank, u64 misc) +{ + _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); +} /* - * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. - * the interrupt goes off when error_count reaches threshold_limit. - * the handler will simply log mcelog w/ software defined bank number. + * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt + * goes off when error_count reaches threshold_limit. */ - static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; unsigned int bank, block, cpu = smp_processor_id(); struct thresh_restart tr; - /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; @@ -893,23 +899,18 @@ static void amd_threshold_interrupt(void) (high & MASK_LOCKED_HI)) continue; - /* - * Log the machine check that caused the threshold - * event. - */ - if (high & MASK_OVERFLOW_HI) - goto log; - } - } - return; + if (!(high & MASK_OVERFLOW_HI)) + continue; -log: - __log_error(bank, false, true, ((u64)high << 32) | low); + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(bank, ((u64)high << 32) | low); - /* Reset threshold block after logging error. */ - memset(&tr, 0, sizeof(tr)); - tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; - threshold_restart_bank(&tr); + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; + threshold_restart_bank(&tr); + } + } } /* @@ -1202,7 +1203,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; per_cpu(threshold_banks, cpu)[bank] = b; - atomic_inc(&b->cpus); + refcount_inc(&b->cpus); err = __threshold_add_blocks(b); @@ -1225,7 +1226,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; if (is_shared_bank(bank)) { - atomic_set(&b->cpus, 1); + refcount_set(&b->cpus, 1); /* nb is already initialized, see above */ if (nb) { @@ -1289,7 +1290,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) goto free_out; if (is_shared_bank(bank)) { - if (!atomic_dec_and_test(&b->cpus)) { + if (!refcount_dec_and_test(&b->cpus)) { __threshold_remove_blocks(b); per_cpu(threshold_banks, cpu)[bank] = NULL; return; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 45db4d2ebd01..e9f4d762aa5b 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -320,7 +320,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) } static enum ucode_state -load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); +load_microcode_amd(bool save, u8 family, const u8 *data, size_t size); int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { @@ -338,8 +338,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) if (!desc.mc) return -EINVAL; - ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax), - desc.data, desc.size); + ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size); if (ret != UCODE_OK) return -EINVAL; @@ -675,7 +674,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, } static enum ucode_state -load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) +load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) { enum ucode_state ret; @@ -689,8 +688,8 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) #ifdef CONFIG_X86_32 /* save BSP's matching patch for early load */ - if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(cpu); + if (save) { + struct ucode_patch *p = find_patch(0); if (p) { memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), @@ -722,11 +721,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; enum ucode_state ret = UCODE_NFOUND; const struct firmware *fw; /* reload ucode container only on the boot cpu */ - if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) + if (!refresh_fw || !bsp) return UCODE_OK; if (c->x86 >= 0x15) @@ -743,7 +743,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size); + ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 2bce84d91c2b..c5bb63be4ba1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -807,10 +807,8 @@ void mtrr_save_state(void) if (!mtrr_enabled()) return; - get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); - put_online_cpus(); } void set_mtrr_aps_delayed_init(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index c37bd0f39c70..ab4f491da2a9 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -105,11 +105,9 @@ static void __jump_label_transform(struct jump_entry *entry, void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(entry, type, NULL, 0); mutex_unlock(&text_mutex); - put_online_cpus(); } static enum { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ff40e74c9181..ffeae818aa7a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -78,7 +78,7 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, - smp_processor_id()); + raw_smp_processor_id()); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f04479a8f74f..045e4f993bd2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid) if (cpu == 1) printk(KERN_INFO "x86: Booting SMP configuration:\n"); - if (system_state == SYSTEM_BOOTING) { + if (system_state < SYSTEM_RUNNING) { if (node != current_node) { if (current_node > (-1)) pr_cont("\n"); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 714dfba6a1e7..5270fc0c2df6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; struct clocksource *art_related_clocksource; -/* - * Use a ring-buffer like data structure, where a writer advances the head by - * writing a new data entry and a reader advances the tail when it observes a - * new entry. - * - * Writers are made to wait on readers until there's space to write a new - * entry. - * - * This means that we can always use an {offset, mul} pair to compute a ns - * value that is 'roughly' in the right direction, even if we're writing a new - * {offset, mul} pair during the clock read. - * - * The down-side is that we can no longer guarantee strict monotonicity anymore - * (assuming the TSC was that to begin with), because while we compute the - * intersection point of the two clock slopes and make sure the time is - * continuous at the point of switching; we can no longer guarantee a reader is - * strictly before or after the switch point. - * - * It does mean a reader no longer needs to disable IRQs in order to avoid - * CPU-Freq updates messing with his times, and similarly an NMI reader will - * no longer run the risk of hitting half-written state. - */ - struct cyc2ns { - struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ - struct cyc2ns_data *head; /* 48 + 8 = 56 */ - struct cyc2ns_data *tail; /* 56 + 8 = 64 */ -}; /* exactly fits one cacheline */ - -static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); - -struct cyc2ns_data *cyc2ns_read_begin(void) -{ - struct cyc2ns_data *head; - - preempt_disable(); - - head = this_cpu_read(cyc2ns.head); - /* - * Ensure we observe the entry when we observe the pointer to it. - * matches the wmb from cyc2ns_write_end(). - */ - smp_read_barrier_depends(); - head->__count++; - barrier(); + struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ + seqcount_t seq; /* 32 + 4 = 36 */ - return head; -} +}; /* fits one cacheline */ -void cyc2ns_read_end(struct cyc2ns_data *head) -{ - barrier(); - /* - * If we're the outer most nested read; update the tail pointer - * when we're done. This notifies possible pending writers - * that we've observed the head pointer and that the other - * entry is now free. - */ - if (!--head->__count) { - /* - * x86-TSO does not reorder writes with older reads; - * therefore once this write becomes visible to another - * cpu, we must be finished reading the cyc2ns_data. - * - * matches with cyc2ns_write_begin(). - */ - this_cpu_write(cyc2ns.tail, head); - } - preempt_enable(); -} +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -/* - * Begin writing a new @data entry for @cpu. - * - * Assumes some sort of write side lock; currently 'provided' by the assumption - * that cpufreq will call its notifiers sequentially. - */ -static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +void cyc2ns_read_begin(struct cyc2ns_data *data) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - struct cyc2ns_data *data = c2n->data; + int seq, idx; - if (data == c2n->head) - data++; + preempt_disable_notrace(); - /* XXX send an IPI to @cpu in order to guarantee a read? */ + do { + seq = this_cpu_read(cyc2ns.seq.sequence); + idx = seq & 1; - /* - * When we observe the tail write from cyc2ns_read_end(), - * the cpu must be done with that entry and its safe - * to start writing to it. - */ - while (c2n->tail == data) - cpu_relax(); + data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); + data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); + data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - return data; + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +void cyc2ns_read_end(void) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - /* - * Ensure the @data writes are visible before we publish the - * entry. Matches the data-depencency in cyc2ns_read_begin(). - */ - smp_wmb(); - - ACCESS_ONCE(c2n->head) = data; + preempt_enable_notrace(); } /* @@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data) data->cyc2ns_mul = 0; data->cyc2ns_shift = 0; data->cyc2ns_offset = 0; - data->__count = 0; } static void cyc2ns_init(int cpu) @@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu) cyc2ns_data_init(&c2n->data[0]); cyc2ns_data_init(&c2n->data[1]); - c2n->head = c2n->data; - c2n->tail = c2n->data; + seqcount_init(&c2n->seq); } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - struct cyc2ns_data *data, *tail; + struct cyc2ns_data data; unsigned long long ns; - /* - * See cyc2ns_read_*() for details; replicated in order to avoid - * an extra few instructions that came with the abstraction. - * Notable, it allows us to only do the __count and tail update - * dance when its actually needed. - */ - - preempt_disable_notrace(); - data = this_cpu_read(cyc2ns.head); - tail = this_cpu_read(cyc2ns.tail); - - if (likely(data == tail)) { - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); - } else { - data->__count++; - - barrier(); - - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + cyc2ns_read_begin(&data); - barrier(); + ns = data.cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - if (!--data->__count) - this_cpu_write(cyc2ns.tail, data); - } - preempt_enable_notrace(); + cyc2ns_read_end(); return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu) +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { - unsigned long long tsc_now, ns_now; - struct cyc2ns_data *data; + unsigned long long ns_now; + struct cyc2ns_data data; + struct cyc2ns *c2n; unsigned long flags; local_irq_save(flags); @@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) if (!khz) goto done; - data = cyc2ns_write_begin(cpu); - - tsc_now = rdtsc(); ns_now = cycles_2_ns(tsc_now); /* @@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz, + clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* @@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ - if (data->cyc2ns_shift == 32) { - data->cyc2ns_shift = 31; - data->cyc2ns_mul >>= 1; + if (data.cyc2ns_shift == 32) { + data.cyc2ns_shift = 31; + data.cyc2ns_mul >>= 1; } - data->cyc2ns_offset = ns_now - - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); + data.cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); + + c2n = per_cpu_ptr(&cyc2ns, cpu); - cyc2ns_write_end(cpu, data); + raw_write_seqcount_latch(&c2n->seq); + c2n->data[0] = data; + raw_write_seqcount_latch(&c2n->seq); + c2n->data[1] = data; done: - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); local_irq_restore(flags); } + /* * Scheduler clock - returns current time in nanosec units. */ @@ -374,6 +273,8 @@ static int __init tsc_setup(char *str) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; + if (!strcmp(str, "unstable")) + mark_tsc_unstable("boot parameter"); return 1; } @@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void) } #ifdef CONFIG_CPU_FREQ - /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency * changes. * @@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); - set_cyc2ns_scale(tsc_khz, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc()); } return 0; @@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs) pr_info("Marking TSC unstable due to clocksource watchdog\n"); } +static void tsc_cs_tick_stable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + + if (using_native_sched_clock()) + sched_clock_tick_stable(); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ @@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = { .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, }; void mark_tsc_unstable(char *reason) @@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; + int cpu; /* Don't bother refining TSC on unstable systems */ if (check_tsc_unstable()) @@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work) /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); + /* Update the sched_clock() rate to match the clocksource one */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; @@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource); void __init tsc_init(void) { - u64 lpj; + u64 lpj, cyc; int cpu; if (!boot_cpu_has(X86_FEATURE_TSC)) { @@ -1390,9 +1305,10 @@ void __init tsc_init(void) * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ + cyc = rdtsc(); for_each_possible_cpu(cpu) { cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu); + set_cyc2ns_scale(tsc_khz, cpu, cyc); } if (tsc_disabled > 0) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6e7bedf69af7..743e4c6b4529 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -237,24 +237,26 @@ static void flush_tlb_func(void *info) return; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL) { - local_flush_tlb(); - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); - } else { - unsigned long addr; - unsigned long nr_pages = - (f->flush_end - f->flush_start) / PAGE_SIZE; - addr = f->flush_start; - while (addr < f->flush_end) { - __flush_tlb_single(addr); - addr += PAGE_SIZE; - } - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); - } - } else + + if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { leave_mm(smp_processor_id()); + return; + } + if (f->flush_end == TLB_FLUSH_ALL) { + local_flush_tlb(); + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); + } else { + unsigned long addr; + unsigned long nr_pages = + (f->flush_end - f->flush_start) / PAGE_SIZE; + addr = f->flush_start; + while (addr < f->flush_end) { + __flush_tlb_single(addr); + addr += PAGE_SIZE; + } + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); + } } void native_flush_tlb_others(const struct cpumask *cpumask, @@ -354,33 +356,6 @@ out: preempt_enable(); } -void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) -{ - struct mm_struct *mm = vma->vm_mm; - - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) { - /* - * Implicit full barrier (INVLPG) that synchronizes - * with switch_mm. - */ - __flush_tlb_one(start); - } else { - leave_mm(smp_processor_id()); - - /* Synchronize with switch_mm. */ - smp_mb(); - } - } - - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); - - preempt_enable(); -} - static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); @@ -420,6 +395,23 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) } } +void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, &batch->cpumask)) { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); + } + + if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) + flush_tlb_others(&batch->cpumask, NULL, 0, TLB_FLUSH_ALL); + cpumask_clear(&batch->cpumask); + + put_cpu(); +} + static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 7e76a4d8304b..43b96f5f78ba 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -828,9 +828,11 @@ static void __init kexec_enter_virtual_mode(void) /* * We don't do virtual mode, since we don't do runtime services, on - * non-native EFI + * non-native EFI. With efi=old_map, we don't do runtime services in + * kexec kernel because in the initial boot something else might + * have been mapped at these virtual addresses. */ - if (!efi_is_native()) { + if (!efi_is_native() || efi_enabled(EFI_OLD_MEMMAP)) { efi_memmap_unmap(); clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index c488625c9712..eb8dff15a7f6 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -71,11 +71,13 @@ static void __init early_code_mapping_set_exec(int executable) pgd_t * __init efi_call_phys_prolog(void) { - unsigned long vaddress; - pgd_t *save_pgd; + unsigned long vaddr, addr_pgd, addr_p4d, addr_pud; + pgd_t *save_pgd, *pgd_k, *pgd_efi; + p4d_t *p4d, *p4d_k, *p4d_efi; + pud_t *pud; int pgd; - int n_pgds; + int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { save_pgd = (pgd_t *)read_cr3(); @@ -88,10 +90,49 @@ pgd_t * __init efi_call_phys_prolog(void) n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL); + /* + * Build 1:1 identity mapping for efi=old_map usage. Note that + * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while + * it is PUD_SIZE ALIGNED with KASLR enabled. So for a given physical + * address X, the pud_index(X) != pud_index(__va(X)), we can only copy + * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping. + * This means here we can only reuse the PMD tables of the direct mapping. + */ for (pgd = 0; pgd < n_pgds; pgd++) { - save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE); - vaddress = (unsigned long)__va(pgd * PGDIR_SIZE); - set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); + addr_pgd = (unsigned long)(pgd * PGDIR_SIZE); + vaddr = (unsigned long)__va(pgd * PGDIR_SIZE); + pgd_efi = pgd_offset_k(addr_pgd); + save_pgd[pgd] = *pgd_efi; + + p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd); + if (!p4d) { + pr_err("Failed to allocate p4d table!\n"); + goto out; + } + + for (i = 0; i < PTRS_PER_P4D; i++) { + addr_p4d = addr_pgd + i * P4D_SIZE; + p4d_efi = p4d + p4d_index(addr_p4d); + + pud = pud_alloc(&init_mm, p4d_efi, addr_p4d); + if (!pud) { + pr_err("Failed to allocate pud table!\n"); + goto out; + } + + for (j = 0; j < PTRS_PER_PUD; j++) { + addr_pud = addr_p4d + j * PUD_SIZE; + + if (addr_pud > (max_pfn << PAGE_SHIFT)) + break; + + vaddr = (unsigned long)__va(addr_pud); + + pgd_k = pgd_offset_k(vaddr); + p4d_k = p4d_offset(pgd_k, vaddr); + pud[j] = *pud_offset(p4d_k, vaddr); + } + } } out: __flush_tlb_all(); @@ -104,8 +145,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) /* * After the lock is released, the original page table is restored. */ - int pgd_idx; + int pgd_idx, i; int nr_pgds; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; if (!efi_enabled(EFI_OLD_MEMMAP)) { write_cr3((unsigned long)save_pgd); @@ -115,9 +159,28 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); - for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) + for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) { + pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE); set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]); + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + continue; + + for (i = 0; i < PTRS_PER_P4D; i++) { + p4d = p4d_offset(pgd, + pgd_idx * PGDIR_SIZE + i * P4D_SIZE); + + if (!(p4d_val(*p4d) & _PAGE_PRESENT)) + continue; + + pud = (pud_t *)p4d_page_vaddr(*p4d); + pud_free(&init_mm, pud); + } + + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + p4d_free(&init_mm, p4d); + } + kfree(save_pgd); __flush_tlb_all(); diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 26615991d69c..e0cf95a83f3f 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -360,6 +360,9 @@ void __init efi_free_boot_services(void) free_bootmem_late(start, size); } + if (!num_entries) + return; + new_size = efi.memmap.desc_size * num_entries; new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 42e65fee5673..795671593528 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -456,12 +456,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) */ static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - struct cyc2ns_data *data = cyc2ns_read_begin(); + struct cyc2ns_data data; unsigned long long ns; - ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + cyc2ns_read_begin(&data); + ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); + cyc2ns_read_end(); - cyc2ns_read_end(data); return ns; } @@ -470,12 +471,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) */ static inline unsigned long long ns_2_cycles(unsigned long long ns) { - struct cyc2ns_data *data = cyc2ns_read_begin(); + struct cyc2ns_data data; unsigned long long cyc; - cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + cyc2ns_read_begin(&data); + cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul; + cyc2ns_read_end(); - cyc2ns_read_end(data); return cyc; } diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d0855c09f32f..d2c8a9286fa8 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -89,14 +89,14 @@ bool ghes_disable; module_param_named(disable, ghes_disable, bool, 0); /* - * All error sources notified with SCI shares one notifier function, - * so they need to be linked and checked one by one. This is applied - * to NMI too. + * All error sources notified with HED (Hardware Error Device) share a + * single notifier callback, so they need to be linked and checked one + * by one. This holds true for NMI too. * * RCU is used for these lists, so ghes_list_mutex is only used for * list changing, not for traversing. */ -static LIST_HEAD(ghes_sci); +static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); /* @@ -702,14 +702,14 @@ static irqreturn_t ghes_irq_func(int irq, void *data) return IRQ_HANDLED; } -static int ghes_notify_sci(struct notifier_block *this, - unsigned long event, void *data) +static int ghes_notify_hed(struct notifier_block *this, unsigned long event, + void *data) { struct ghes *ghes; int ret = NOTIFY_DONE; rcu_read_lock(); - list_for_each_entry_rcu(ghes, &ghes_sci, list) { + list_for_each_entry_rcu(ghes, &ghes_hed, list) { if (!ghes_proc(ghes)) ret = NOTIFY_OK; } @@ -718,8 +718,8 @@ static int ghes_notify_sci(struct notifier_block *this, return ret; } -static struct notifier_block ghes_notifier_sci = { - .notifier_call = ghes_notify_sci, +static struct notifier_block ghes_notifier_hed = { + .notifier_call = ghes_notify_hed, }; #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -966,7 +966,10 @@ static int ghes_probe(struct platform_device *ghes_dev) case ACPI_HEST_NOTIFY_POLLED: case ACPI_HEST_NOTIFY_EXTERNAL: case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_GSIV: + case ACPI_HEST_NOTIFY_GPIO: break; + case ACPI_HEST_NOTIFY_NMI: if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) { pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n", @@ -1024,13 +1027,17 @@ static int ghes_probe(struct platform_device *ghes_dev) goto err_edac_unreg; } break; + case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_GSIV: + case ACPI_HEST_NOTIFY_GPIO: mutex_lock(&ghes_list_mutex); - if (list_empty(&ghes_sci)) - register_acpi_hed_notifier(&ghes_notifier_sci); - list_add_rcu(&ghes->list, &ghes_sci); + if (list_empty(&ghes_hed)) + register_acpi_hed_notifier(&ghes_notifier_hed); + list_add_rcu(&ghes->list, &ghes_hed); mutex_unlock(&ghes_list_mutex); break; + case ACPI_HEST_NOTIFY_NMI: ghes_nmi_add(ghes); break; @@ -1066,14 +1073,18 @@ static int ghes_remove(struct platform_device *ghes_dev) case ACPI_HEST_NOTIFY_EXTERNAL: free_irq(ghes->irq, ghes); break; + case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_GSIV: + case ACPI_HEST_NOTIFY_GPIO: mutex_lock(&ghes_list_mutex); list_del_rcu(&ghes->list); - if (list_empty(&ghes_sci)) - unregister_acpi_hed_notifier(&ghes_notifier_sci); + if (list_empty(&ghes_hed)) + unregister_acpi_hed_notifier(&ghes_notifier_hed); mutex_unlock(&ghes_list_mutex); synchronize_rcu(); break; + case ACPI_HEST_NOTIFY_NMI: ghes_nmi_remove(ghes); break; diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 919be0aa2578..240544253ccd 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -523,7 +523,7 @@ static int acpi_pci_root_add(struct acpi_device *device, struct acpi_pci_root *root; acpi_handle handle = device->handle; int no_aspm = 0; - bool hotadd = system_state != SYSTEM_BOOTING; + bool hotadd = system_state == SYSTEM_RUNNING; root = kzalloc(sizeof(struct acpi_pci_root), GFP_KERNEL); if (!root) diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 8697a82bd465..591d1dd3f04e 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -268,9 +268,9 @@ static int acpi_processor_start(struct device *dev) return -ENODEV; /* Protect against concurrent CPU hotplug operations */ - get_online_cpus(); + cpu_hotplug_disable(); ret = __acpi_processor_start(device); - put_online_cpus(); + cpu_hotplug_enable(); return ret; } diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index 3de34633f7f9..7f9aff4b8d62 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -909,6 +909,13 @@ static long __acpi_processor_get_throttling(void *data) return pr->throttling.acpi_processor_get_throttling(pr); } +static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct) +{ + if (direct || (is_percpu_thread() && cpu == smp_processor_id())) + return fn(arg); + return work_on_cpu(cpu, fn, arg); +} + static int acpi_processor_get_throttling(struct acpi_processor *pr) { if (!pr) @@ -926,7 +933,7 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr) if (!cpu_online(pr->id)) return -ENODEV; - return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr); + return call_on_cpu(pr->id, __acpi_processor_get_throttling, pr, false); } static int acpi_processor_get_fadt_info(struct acpi_processor *pr) @@ -1076,13 +1083,6 @@ static long acpi_processor_throttling_fn(void *data) arg->target_state, arg->force); } -static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct) -{ - if (direct) - return fn(arg); - return work_on_cpu(cpu, fn, arg); -} - static int __acpi_processor_set_throttling(struct acpi_processor *pr, int state, bool force, bool direct) { diff --git a/drivers/base/node.c b/drivers/base/node.c index 5548f9686016..0440d95c9b5b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -377,7 +377,7 @@ static int __ref get_nid_for_pfn(unsigned long pfn) if (!pfn_valid_within(pfn)) return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - if (system_state == SYSTEM_BOOTING) + if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); #endif page = pfn_to_page(pfn); diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 31adbebf812e..2af70014ee5a 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -539,15 +539,6 @@ config HANGCHECK_TIMER out to lunch past a certain margin. It can reboot the system or merely print a warning. -config MMTIMER - tristate "MMTIMER Memory mapped RTC for SGI Altix" - depends on IA64_GENERIC || IA64_SGI_SN2 - depends on POSIX_TIMERS - default y - help - The mmtimer device allows direct userspace access to the - Altix system timer. - config UV_MMTIMER tristate "UV_MMTIMER Memory mapped RTC for SGI UV" depends on X86_UV diff --git a/drivers/char/Makefile b/drivers/char/Makefile index 6e6c244a66a0..53e33720818c 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o obj-$(CONFIG_RAW_DRIVER) += raw.o obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o obj-$(CONFIG_MSPEC) += mspec.o -obj-$(CONFIG_MMTIMER) += mmtimer.o obj-$(CONFIG_UV_MMTIMER) += uv_mmtimer.o obj-$(CONFIG_IBM_BSR) += bsr.o obj-$(CONFIG_SGI_MBCS) += mbcs.o diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c deleted file mode 100644 index 0e7fcb04f01e..000000000000 --- a/drivers/char/mmtimer.c +++ /dev/null @@ -1,858 +0,0 @@ -/* - * Timer device implementation for SGI SN platforms. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (c) 2001-2006 Silicon Graphics, Inc. All rights reserved. - * - * This driver exports an API that should be supportable by any HPET or IA-PC - * multimedia timer. The code below is currently specific to the SGI Altix - * SHub RTC, however. - * - * 11/01/01 - jbarnes - initial revision - * 9/10/04 - Christoph Lameter - remove interrupt support for kernel inclusion - * 10/1/04 - Christoph Lameter - provide posix clock CLOCK_SGI_CYCLE - * 10/13/04 - Christoph Lameter, Dimitri Sivanich - provide timer interrupt - * support via the posix timer interface - */ - -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/ioctl.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/mmtimer.h> -#include <linux/miscdevice.h> -#include <linux/posix-timers.h> -#include <linux/interrupt.h> -#include <linux/time.h> -#include <linux/math64.h> -#include <linux/mutex.h> -#include <linux/slab.h> - -#include <linux/uaccess.h> -#include <asm/sn/addrs.h> -#include <asm/sn/intr.h> -#include <asm/sn/shub_mmr.h> -#include <asm/sn/nodepda.h> -#include <asm/sn/shubio.h> - -MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>"); -MODULE_DESCRIPTION("SGI Altix RTC Timer"); -MODULE_LICENSE("GPL"); - -/* name of the device, usually in /dev */ -#define MMTIMER_NAME "mmtimer" -#define MMTIMER_DESC "SGI Altix RTC Timer" -#define MMTIMER_VERSION "2.1" - -#define RTC_BITS 55 /* 55 bits for this implementation */ - -static struct k_clock sgi_clock; - -extern unsigned long sn_rtc_cycles_per_second; - -#define RTC_COUNTER_ADDR ((long *)LOCAL_MMR_ADDR(SH_RTC)) - -#define rtc_time() (*RTC_COUNTER_ADDR) - -static DEFINE_MUTEX(mmtimer_mutex); -static long mmtimer_ioctl(struct file *file, unsigned int cmd, - unsigned long arg); -static int mmtimer_mmap(struct file *file, struct vm_area_struct *vma); - -/* - * Period in femtoseconds (10^-15 s) - */ -static unsigned long mmtimer_femtoperiod = 0; - -static const struct file_operations mmtimer_fops = { - .owner = THIS_MODULE, - .mmap = mmtimer_mmap, - .unlocked_ioctl = mmtimer_ioctl, - .llseek = noop_llseek, -}; - -/* - * We only have comparison registers RTC1-4 currently available per - * node. RTC0 is used by SAL. - */ -/* Check for an RTC interrupt pending */ -static int mmtimer_int_pending(int comparator) -{ - if (HUB_L((unsigned long *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED)) & - SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator) - return 1; - else - return 0; -} - -/* Clear the RTC interrupt pending bit */ -static void mmtimer_clr_int_pending(int comparator) -{ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED_ALIAS), - SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator); -} - -/* Setup timer on comparator RTC1 */ -static void mmtimer_setup_int_0(int cpu, u64 expires) -{ - u64 val; - - /* Disable interrupt */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), 0UL); - - /* Initialize comparator value */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPB), -1L); - - /* Clear pending bit */ - mmtimer_clr_int_pending(0); - - val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC1_INT_CONFIG_IDX_SHFT) | - ((u64)cpu_physical_id(cpu) << - SH_RTC1_INT_CONFIG_PID_SHFT); - - /* Set configuration */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_CONFIG), val); - - /* Enable RTC interrupts */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), 1UL); - - /* Initialize comparator value */ - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPB), expires); - - -} - -/* Setup timer on comparator RTC2 */ -static void mmtimer_setup_int_1(int cpu, u64 expires) -{ - u64 val; - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), 0UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPC), -1L); - - mmtimer_clr_int_pending(1); - - val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC2_INT_CONFIG_IDX_SHFT) | - ((u64)cpu_physical_id(cpu) << - SH_RTC2_INT_CONFIG_PID_SHFT); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_CONFIG), val); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), 1UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPC), expires); -} - -/* Setup timer on comparator RTC3 */ -static void mmtimer_setup_int_2(int cpu, u64 expires) -{ - u64 val; - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), 0UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPD), -1L); - - mmtimer_clr_int_pending(2); - - val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC3_INT_CONFIG_IDX_SHFT) | - ((u64)cpu_physical_id(cpu) << - SH_RTC3_INT_CONFIG_PID_SHFT); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_CONFIG), val); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), 1UL); - - HUB_S((u64 *)LOCAL_MMR_ADDR(SH_INT_CMPD), expires); -} - -/* - * This function must be called with interrupts disabled and preemption off - * in order to insure that the setup succeeds in a deterministic time frame. - * It will check if the interrupt setup succeeded. - */ -static int mmtimer_setup(int cpu, int comparator, unsigned long expires, - u64 *set_completion_time) -{ - switch (comparator) { - case 0: - mmtimer_setup_int_0(cpu, expires); - break; - case 1: - mmtimer_setup_int_1(cpu, expires); - break; - case 2: - mmtimer_setup_int_2(cpu, expires); - break; - } - /* We might've missed our expiration time */ - *set_completion_time = rtc_time(); - if (*set_completion_time <= expires) - return 1; - - /* - * If an interrupt is already pending then its okay - * if not then we failed - */ - return mmtimer_int_pending(comparator); -} - -static int mmtimer_disable_int(long nasid, int comparator) -{ - switch (comparator) { - case 0: - nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC1_INT_ENABLE), - 0UL) : REMOTE_HUB_S(nasid, SH_RTC1_INT_ENABLE, 0UL); - break; - case 1: - nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_ENABLE), - 0UL) : REMOTE_HUB_S(nasid, SH_RTC2_INT_ENABLE, 0UL); - break; - case 2: - nasid == -1 ? HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_ENABLE), - 0UL) : REMOTE_HUB_S(nasid, SH_RTC3_INT_ENABLE, 0UL); - break; - default: - return -EFAULT; - } - return 0; -} - -#define COMPARATOR 1 /* The comparator to use */ - -#define TIMER_OFF 0xbadcabLL /* Timer is not setup */ -#define TIMER_SET 0 /* Comparator is set for this timer */ - -#define MMTIMER_INTERVAL_RETRY_INCREMENT_DEFAULT 40 - -/* There is one of these for each timer */ -struct mmtimer { - struct rb_node list; - struct k_itimer *timer; - int cpu; -}; - -struct mmtimer_node { - spinlock_t lock ____cacheline_aligned; - struct rb_root timer_head; - struct rb_node *next; - struct tasklet_struct tasklet; -}; -static struct mmtimer_node *timers; - -static unsigned mmtimer_interval_retry_increment = - MMTIMER_INTERVAL_RETRY_INCREMENT_DEFAULT; -module_param(mmtimer_interval_retry_increment, uint, 0644); -MODULE_PARM_DESC(mmtimer_interval_retry_increment, - "RTC ticks to add to expiration on interval retry (default 40)"); - -/* - * Add a new mmtimer struct to the node's mmtimer list. - * This function assumes the struct mmtimer_node is locked. - */ -static void mmtimer_add_list(struct mmtimer *n) -{ - int nodeid = n->timer->it.mmtimer.node; - unsigned long expires = n->timer->it.mmtimer.expires; - struct rb_node **link = &timers[nodeid].timer_head.rb_node; - struct rb_node *parent = NULL; - struct mmtimer *x; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - x = rb_entry(parent, struct mmtimer, list); - - if (expires < x->timer->it.mmtimer.expires) - link = &(*link)->rb_left; - else - link = &(*link)->rb_right; - } - - /* - * Insert the timer to the rbtree and check whether it - * replaces the first pending timer - */ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &timers[nodeid].timer_head); - - if (!timers[nodeid].next || expires < rb_entry(timers[nodeid].next, - struct mmtimer, list)->timer->it.mmtimer.expires) - timers[nodeid].next = &n->list; -} - -/* - * Set the comparator for the next timer. - * This function assumes the struct mmtimer_node is locked. - */ -static void mmtimer_set_next_timer(int nodeid) -{ - struct mmtimer_node *n = &timers[nodeid]; - struct mmtimer *x; - struct k_itimer *t; - u64 expires, exp, set_completion_time; - int i; - -restart: - if (n->next == NULL) - return; - - x = rb_entry(n->next, struct mmtimer, list); - t = x->timer; - if (!t->it.mmtimer.incr) { - /* Not an interval timer */ - if (!mmtimer_setup(x->cpu, COMPARATOR, - t->it.mmtimer.expires, - &set_completion_time)) { - /* Late setup, fire now */ - tasklet_schedule(&n->tasklet); - } - return; - } - - /* Interval timer */ - i = 0; - expires = exp = t->it.mmtimer.expires; - while (!mmtimer_setup(x->cpu, COMPARATOR, expires, - &set_completion_time)) { - int to; - - i++; - expires = set_completion_time + - mmtimer_interval_retry_increment + (1 << i); - /* Calculate overruns as we go. */ - to = ((u64)(expires - exp) / t->it.mmtimer.incr); - if (to) { - t->it_overrun += to; - t->it.mmtimer.expires += t->it.mmtimer.incr * to; - exp = t->it.mmtimer.expires; - } - if (i > 20) { - printk(KERN_ALERT "mmtimer: cannot reschedule timer\n"); - t->it.mmtimer.clock = TIMER_OFF; - n->next = rb_next(&x->list); - rb_erase(&x->list, &n->timer_head); - kfree(x); - goto restart; - } - } -} - -/** - * mmtimer_ioctl - ioctl interface for /dev/mmtimer - * @file: file structure for the device - * @cmd: command to execute - * @arg: optional argument to command - * - * Executes the command specified by @cmd. Returns 0 for success, < 0 for - * failure. - * - * Valid commands: - * - * %MMTIMER_GETOFFSET - Should return the offset (relative to the start - * of the page where the registers are mapped) for the counter in question. - * - * %MMTIMER_GETRES - Returns the resolution of the clock in femto (10^-15) - * seconds - * - * %MMTIMER_GETFREQ - Copies the frequency of the clock in Hz to the address - * specified by @arg - * - * %MMTIMER_GETBITS - Returns the number of bits in the clock's counter - * - * %MMTIMER_MMAPAVAIL - Returns 1 if the registers can be mmap'd into userspace - * - * %MMTIMER_GETCOUNTER - Gets the current value in the counter and places it - * in the address specified by @arg. - */ -static long mmtimer_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret = 0; - - mutex_lock(&mmtimer_mutex); - - switch (cmd) { - case MMTIMER_GETOFFSET: /* offset of the counter */ - /* - * SN RTC registers are on their own 64k page - */ - if(PAGE_SIZE <= (1 << 16)) - ret = (((long)RTC_COUNTER_ADDR) & (PAGE_SIZE-1)) / 8; - else - ret = -ENOSYS; - break; - - case MMTIMER_GETRES: /* resolution of the clock in 10^-15 s */ - if(copy_to_user((unsigned long __user *)arg, - &mmtimer_femtoperiod, sizeof(unsigned long))) - ret = -EFAULT; - break; - - case MMTIMER_GETFREQ: /* frequency in Hz */ - if(copy_to_user((unsigned long __user *)arg, - &sn_rtc_cycles_per_second, - sizeof(unsigned long))) - ret = -EFAULT; - break; - - case MMTIMER_GETBITS: /* number of bits in the clock */ - ret = RTC_BITS; - break; - - case MMTIMER_MMAPAVAIL: /* can we mmap the clock into userspace? */ - ret = (PAGE_SIZE <= (1 << 16)) ? 1 : 0; - break; - - case MMTIMER_GETCOUNTER: - if(copy_to_user((unsigned long __user *)arg, - RTC_COUNTER_ADDR, sizeof(unsigned long))) - ret = -EFAULT; - break; - default: - ret = -ENOTTY; - break; - } - mutex_unlock(&mmtimer_mutex); - return ret; -} - -/** - * mmtimer_mmap - maps the clock's registers into userspace - * @file: file structure for the device - * @vma: VMA to map the registers into - * - * Calls remap_pfn_range() to map the clock's registers into - * the calling process' address space. - */ -static int mmtimer_mmap(struct file *file, struct vm_area_struct *vma) -{ - unsigned long mmtimer_addr; - - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; - - if (vma->vm_flags & VM_WRITE) - return -EPERM; - - if (PAGE_SIZE > (1 << 16)) - return -ENOSYS; - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - mmtimer_addr = __pa(RTC_COUNTER_ADDR); - mmtimer_addr &= ~(PAGE_SIZE - 1); - mmtimer_addr &= 0xfffffffffffffffUL; - - if (remap_pfn_range(vma, vma->vm_start, mmtimer_addr >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - printk(KERN_ERR "remap_pfn_range failed in mmtimer.c\n"); - return -EAGAIN; - } - - return 0; -} - -static struct miscdevice mmtimer_miscdev = { - .minor = SGI_MMTIMER, - .name = MMTIMER_NAME, - .fops = &mmtimer_fops -}; - -static struct timespec sgi_clock_offset; -static int sgi_clock_period; - -/* - * Posix Timer Interface - */ - -static struct timespec sgi_clock_offset; -static int sgi_clock_period; - -static int sgi_clock_get(clockid_t clockid, struct timespec64 *tp) -{ - u64 nsec; - - nsec = rtc_time() * sgi_clock_period - + sgi_clock_offset.tv_nsec; - *tp = ns_to_timespec64(nsec); - tp->tv_sec += sgi_clock_offset.tv_sec; - return 0; -}; - -static int sgi_clock_set(const clockid_t clockid, const struct timespec64 *tp) -{ - - u64 nsec; - u32 rem; - - nsec = rtc_time() * sgi_clock_period; - - sgi_clock_offset.tv_sec = tp->tv_sec - div_u64_rem(nsec, NSEC_PER_SEC, &rem); - - if (rem <= tp->tv_nsec) - sgi_clock_offset.tv_nsec = tp->tv_sec - rem; - else { - sgi_clock_offset.tv_nsec = tp->tv_sec + NSEC_PER_SEC - rem; - sgi_clock_offset.tv_sec--; - } - return 0; -} - -/** - * mmtimer_interrupt - timer interrupt handler - * @irq: irq received - * @dev_id: device the irq came from - * - * Called when one of the comarators matches the counter, This - * routine will send signals to processes that have requested - * them. - * - * This interrupt is run in an interrupt context - * by the SHUB. It is therefore safe to locally access SHub - * registers. - */ -static irqreturn_t -mmtimer_interrupt(int irq, void *dev_id) -{ - unsigned long expires = 0; - int result = IRQ_NONE; - unsigned indx = cpu_to_node(smp_processor_id()); - struct mmtimer *base; - - spin_lock(&timers[indx].lock); - base = rb_entry(timers[indx].next, struct mmtimer, list); - if (base == NULL) { - spin_unlock(&timers[indx].lock); - return result; - } - - if (base->cpu == smp_processor_id()) { - if (base->timer) - expires = base->timer->it.mmtimer.expires; - /* expires test won't work with shared irqs */ - if ((mmtimer_int_pending(COMPARATOR) > 0) || - (expires && (expires <= rtc_time()))) { - mmtimer_clr_int_pending(COMPARATOR); - tasklet_schedule(&timers[indx].tasklet); - result = IRQ_HANDLED; - } - } - spin_unlock(&timers[indx].lock); - return result; -} - -static void mmtimer_tasklet(unsigned long data) -{ - int nodeid = data; - struct mmtimer_node *mn = &timers[nodeid]; - struct mmtimer *x; - struct k_itimer *t; - unsigned long flags; - - /* Send signal and deal with periodic signals */ - spin_lock_irqsave(&mn->lock, flags); - if (!mn->next) - goto out; - - x = rb_entry(mn->next, struct mmtimer, list); - t = x->timer; - - if (t->it.mmtimer.clock == TIMER_OFF) - goto out; - - t->it_overrun = 0; - - mn->next = rb_next(&x->list); - rb_erase(&x->list, &mn->timer_head); - - if (posix_timer_event(t, 0) != 0) - t->it_overrun++; - - if(t->it.mmtimer.incr) { - t->it.mmtimer.expires += t->it.mmtimer.incr; - mmtimer_add_list(x); - } else { - /* Ensure we don't false trigger in mmtimer_interrupt */ - t->it.mmtimer.clock = TIMER_OFF; - t->it.mmtimer.expires = 0; - kfree(x); - } - /* Set comparator for next timer, if there is one */ - mmtimer_set_next_timer(nodeid); - - t->it_overrun_last = t->it_overrun; -out: - spin_unlock_irqrestore(&mn->lock, flags); -} - -static int sgi_timer_create(struct k_itimer *timer) -{ - /* Insure that a newly created timer is off */ - timer->it.mmtimer.clock = TIMER_OFF; - return 0; -} - -/* This does not really delete a timer. It just insures - * that the timer is not active - * - * Assumption: it_lock is already held with irq's disabled - */ -static int sgi_timer_del(struct k_itimer *timr) -{ - cnodeid_t nodeid = timr->it.mmtimer.node; - unsigned long irqflags; - - spin_lock_irqsave(&timers[nodeid].lock, irqflags); - if (timr->it.mmtimer.clock != TIMER_OFF) { - unsigned long expires = timr->it.mmtimer.expires; - struct rb_node *n = timers[nodeid].timer_head.rb_node; - struct mmtimer *uninitialized_var(t); - int r = 0; - - timr->it.mmtimer.clock = TIMER_OFF; - timr->it.mmtimer.expires = 0; - - while (n) { - t = rb_entry(n, struct mmtimer, list); - if (t->timer == timr) - break; - - if (expires < t->timer->it.mmtimer.expires) - n = n->rb_left; - else - n = n->rb_right; - } - - if (!n) { - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - return 0; - } - - if (timers[nodeid].next == n) { - timers[nodeid].next = rb_next(n); - r = 1; - } - - rb_erase(n, &timers[nodeid].timer_head); - kfree(t); - - if (r) { - mmtimer_disable_int(cnodeid_to_nasid(nodeid), - COMPARATOR); - mmtimer_set_next_timer(nodeid); - } - } - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - return 0; -} - -/* Assumption: it_lock is already held with irq's disabled */ -static void sgi_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) -{ - - if (timr->it.mmtimer.clock == TIMER_OFF) { - cur_setting->it_interval.tv_nsec = 0; - cur_setting->it_interval.tv_sec = 0; - cur_setting->it_value.tv_nsec = 0; - cur_setting->it_value.tv_sec =0; - return; - } - - cur_setting->it_interval = ns_to_timespec64(timr->it.mmtimer.incr * sgi_clock_period); - cur_setting->it_value = ns_to_timespec64((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period); -} - - -static int sgi_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting) -{ - unsigned long when, period, irqflags; - int err = 0; - cnodeid_t nodeid; - struct mmtimer *base; - struct rb_node *n; - - if (old_setting) - sgi_timer_get(timr, old_setting); - - sgi_timer_del(timr); - when = timespec64_to_ns(&new_setting->it_value); - period = timespec64_to_ns(&new_setting->it_interval); - - if (when == 0) - /* Clear timer */ - return 0; - - base = kmalloc(sizeof(struct mmtimer), GFP_KERNEL); - if (base == NULL) - return -ENOMEM; - - if (flags & TIMER_ABSTIME) { - struct timespec64 n; - unsigned long now; - - getnstimeofday64(&n); - now = timespec64_to_ns(&n); - if (when > now) - when -= now; - else - /* Fire the timer immediately */ - when = 0; - } - - /* - * Convert to sgi clock period. Need to keep rtc_time() as near as possible - * to getnstimeofday() in order to be as faithful as possible to the time - * specified. - */ - when = (when + sgi_clock_period - 1) / sgi_clock_period + rtc_time(); - period = (period + sgi_clock_period - 1) / sgi_clock_period; - - /* - * We are allocating a local SHub comparator. If we would be moved to another - * cpu then another SHub may be local to us. Prohibit that by switching off - * preemption. - */ - preempt_disable(); - - nodeid = cpu_to_node(smp_processor_id()); - - /* Lock the node timer structure */ - spin_lock_irqsave(&timers[nodeid].lock, irqflags); - - base->timer = timr; - base->cpu = smp_processor_id(); - - timr->it.mmtimer.clock = TIMER_SET; - timr->it.mmtimer.node = nodeid; - timr->it.mmtimer.incr = period; - timr->it.mmtimer.expires = when; - - n = timers[nodeid].next; - - /* Add the new struct mmtimer to node's timer list */ - mmtimer_add_list(base); - - if (timers[nodeid].next == n) { - /* No need to reprogram comparator for now */ - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - preempt_enable(); - return err; - } - - /* We need to reprogram the comparator */ - if (n) - mmtimer_disable_int(cnodeid_to_nasid(nodeid), COMPARATOR); - - mmtimer_set_next_timer(nodeid); - - /* Unlock the node timer structure */ - spin_unlock_irqrestore(&timers[nodeid].lock, irqflags); - - preempt_enable(); - - return err; -} - -static int sgi_clock_getres(const clockid_t which_clock, struct timespec64 *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = sgi_clock_period; - return 0; -} - -static struct k_clock sgi_clock = { - .clock_set = sgi_clock_set, - .clock_get = sgi_clock_get, - .clock_getres = sgi_clock_getres, - .timer_create = sgi_timer_create, - .timer_set = sgi_timer_set, - .timer_del = sgi_timer_del, - .timer_get = sgi_timer_get -}; - -/** - * mmtimer_init - device initialization routine - * - * Does initial setup for the mmtimer device. - */ -static int __init mmtimer_init(void) -{ - cnodeid_t node, maxn = -1; - - if (!ia64_platform_is("sn2")) - return 0; - - /* - * Sanity check the cycles/sec variable - */ - if (sn_rtc_cycles_per_second < 100000) { - printk(KERN_ERR "%s: unable to determine clock frequency\n", - MMTIMER_NAME); - goto out1; - } - - mmtimer_femtoperiod = ((unsigned long)1E15 + sn_rtc_cycles_per_second / - 2) / sn_rtc_cycles_per_second; - - if (request_irq(SGI_MMTIMER_VECTOR, mmtimer_interrupt, IRQF_PERCPU, MMTIMER_NAME, NULL)) { - printk(KERN_WARNING "%s: unable to allocate interrupt.", - MMTIMER_NAME); - goto out1; - } - - if (misc_register(&mmtimer_miscdev)) { - printk(KERN_ERR "%s: failed to register device\n", - MMTIMER_NAME); - goto out2; - } - - /* Get max numbered node, calculate slots needed */ - for_each_online_node(node) { - maxn = node; - } - maxn++; - - /* Allocate list of node ptrs to mmtimer_t's */ - timers = kzalloc(sizeof(struct mmtimer_node)*maxn, GFP_KERNEL); - if (!timers) { - printk(KERN_ERR "%s: failed to allocate memory for device\n", - MMTIMER_NAME); - goto out3; - } - - /* Initialize struct mmtimer's for each online node */ - for_each_online_node(node) { - spin_lock_init(&timers[node].lock); - tasklet_init(&timers[node].tasklet, mmtimer_tasklet, - (unsigned long) node); - } - - sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second; - posix_timers_register_clock(CLOCK_SGI_CYCLE, &sgi_clock); - - printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION, - sn_rtc_cycles_per_second/(unsigned long)1E6); - - return 0; - -out3: - misc_deregister(&mmtimer_miscdev); -out2: - free_irq(SGI_MMTIMER_VECTOR, NULL); -out1: - return -1; -} - -module_init(mmtimer_init); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0e3f6496524d..6001369f9aeb 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -887,7 +887,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, struct freq_attr *fattr = to_attr(attr); ssize_t ret = -EINVAL; - get_online_cpus(); + cpus_read_lock(); if (cpu_online(policy->cpu)) { down_write(&policy->rwsem); @@ -895,7 +895,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, up_write(&policy->rwsem); } - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -2441,7 +2441,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) pr_debug("trying to register driver %s\n", driver_data->name); /* Protect against concurrent CPU online/offline. */ - get_online_cpus(); + cpus_read_lock(); write_lock_irqsave(&cpufreq_driver_lock, flags); if (cpufreq_driver) { @@ -2473,9 +2473,10 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) goto err_if_unreg; } - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "cpufreq:online", - cpuhp_cpufreq_online, - cpuhp_cpufreq_offline); + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "cpufreq:online", + cpuhp_cpufreq_online, + cpuhp_cpufreq_offline); if (ret < 0) goto err_if_unreg; hp_online = ret; @@ -2493,7 +2494,7 @@ err_null_driver: cpufreq_driver = NULL; write_unlock_irqrestore(&cpufreq_driver_lock, flags); out: - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(cpufreq_register_driver); @@ -2516,17 +2517,17 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) pr_debug("unregistering driver %s\n", driver->name); /* Protect against concurrent cpu hotplug */ - get_online_cpus(); + cpus_read_lock(); subsys_interface_unregister(&cpufreq_interface); remove_boost_sysfs_file(); - cpuhp_remove_state_nocalls(hp_online); + cpuhp_remove_state_nocalls_cpuslocked(hp_online); write_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver = NULL; write_unlock_irqrestore(&cpufreq_driver_lock, flags); - put_online_cpus(); + cpus_read_unlock(); return 0; } diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c index 35dd4d7ffee0..b257fc7d5204 100644 --- a/drivers/cpufreq/pasemi-cpufreq.c +++ b/drivers/cpufreq/pasemi-cpufreq.c @@ -226,7 +226,7 @@ static int pas_cpufreq_cpu_exit(struct cpufreq_policy *policy) * We don't support CPU hotplug. Don't unmap after the system * has already made it to a running state. */ - if (system_state != SYSTEM_BOOTING) + if (system_state >= SYSTEM_RUNNING) return 0; if (sdcasr_mapbase) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 2706be7ed334..60bb64f4329d 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -220,6 +220,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, entered_state = target_state->enter(dev, drv, index); start_critical_timings(); + sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); diff --git a/drivers/firmware/efi/efi-bgrt.c b/drivers/firmware/efi/efi-bgrt.c index 04ca8764f0c0..8bf27323f7a3 100644 --- a/drivers/firmware/efi/efi-bgrt.c +++ b/drivers/firmware/efi/efi-bgrt.c @@ -36,6 +36,9 @@ void __init efi_bgrt_init(struct acpi_table_header *table) if (acpi_disabled) return; + if (!efi_enabled(EFI_BOOT)) + return; + if (table->length < sizeof(bgrt_tab)) { pr_notice("Ignoring BGRT: invalid length %u (expected %zu)\n", table->length, sizeof(bgrt_tab)); diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c index 8c34d50a4d80..959777ec8a77 100644 --- a/drivers/firmware/efi/libstub/secureboot.c +++ b/drivers/firmware/efi/libstub/secureboot.c @@ -16,10 +16,10 @@ /* BIOS variables */ static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; -static const efi_char16_t const efi_SecureBoot_name[] = { +static const efi_char16_t efi_SecureBoot_name[] = { 'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0 }; -static const efi_char16_t const efi_SetupMode_name[] = { +static const efi_char16_t efi_SetupMode_name[] = { 'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0 }; diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c index a51b6b64ecdf..93ee8fc539be 100644 --- a/drivers/hwtracing/coresight/coresight-etm3x.c +++ b/drivers/hwtracing/coresight/coresight-etm3x.c @@ -587,7 +587,7 @@ static void etm_disable_sysfs(struct coresight_device *csdev) * after cpu online mask indicates the cpu is offline but before the * DYING hotplug callback is serviced by the ETM driver. */ - get_online_cpus(); + cpus_read_lock(); spin_lock(&drvdata->spinlock); /* @@ -597,7 +597,7 @@ static void etm_disable_sysfs(struct coresight_device *csdev) smp_call_function_single(drvdata->cpu, etm_disable_hw, drvdata, 1); spin_unlock(&drvdata->spinlock); - put_online_cpus(); + cpus_read_unlock(); dev_info(drvdata->dev, "ETM tracing disabled\n"); } @@ -795,7 +795,7 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id) drvdata->cpu = pdata ? pdata->cpu : 0; - get_online_cpus(); + cpus_read_lock(); etmdrvdata[drvdata->cpu] = drvdata; if (smp_call_function_single(drvdata->cpu, @@ -803,17 +803,17 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id) dev_err(dev, "ETM arch init failed\n"); if (!etm_count++) { - cpuhp_setup_state_nocalls(CPUHP_AP_ARM_CORESIGHT_STARTING, - "arm/coresight:starting", - etm_starting_cpu, etm_dying_cpu); - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "arm/coresight:online", - etm_online_cpu, NULL); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ARM_CORESIGHT_STARTING, + "arm/coresight:starting", + etm_starting_cpu, etm_dying_cpu); + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "arm/coresight:online", + etm_online_cpu, NULL); if (ret < 0) goto err_arch_supported; hp_online = ret; } - put_online_cpus(); + cpus_read_unlock(); if (etm_arch_supported(drvdata->arch) == false) { ret = -EINVAL; diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c index d1340fb4e457..532adc9dd32a 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x.c +++ b/drivers/hwtracing/coresight/coresight-etm4x.c @@ -371,7 +371,7 @@ static void etm4_disable_sysfs(struct coresight_device *csdev) * after cpu online mask indicates the cpu is offline but before the * DYING hotplug callback is serviced by the ETM driver. */ - get_online_cpus(); + cpus_read_lock(); spin_lock(&drvdata->spinlock); /* @@ -381,7 +381,7 @@ static void etm4_disable_sysfs(struct coresight_device *csdev) smp_call_function_single(drvdata->cpu, etm4_disable_hw, drvdata, 1); spin_unlock(&drvdata->spinlock); - put_online_cpus(); + cpus_read_unlock(); dev_info(drvdata->dev, "ETM tracing disabled\n"); } @@ -982,7 +982,7 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id) drvdata->cpu = pdata ? pdata->cpu : 0; - get_online_cpus(); + cpus_read_lock(); etmdrvdata[drvdata->cpu] = drvdata; if (smp_call_function_single(drvdata->cpu, @@ -990,18 +990,18 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id) dev_err(dev, "ETM arch init failed\n"); if (!etm4_count++) { - cpuhp_setup_state_nocalls(CPUHP_AP_ARM_CORESIGHT_STARTING, - "arm/coresight4:starting", - etm4_starting_cpu, etm4_dying_cpu); - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "arm/coresight4:online", - etm4_online_cpu, NULL); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ARM_CORESIGHT_STARTING, + "arm/coresight4:starting", + etm4_starting_cpu, etm4_dying_cpu); + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "arm/coresight4:online", + etm4_online_cpu, NULL); if (ret < 0) goto err_arch_supported; hp_online = ret; } - put_online_cpus(); + cpus_read_unlock(); if (etm4_arch_supported(drvdata->arch) == false) { ret = -EINVAL; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index fc2765ccdb57..8500deda9175 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4315,7 +4315,7 @@ int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg) struct acpi_dmar_atsr *atsr; struct dmar_atsr_unit *atsru; - if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled) + if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled) return 0; atsr = container_of(hdr, struct acpi_dmar_atsr, header); @@ -4565,7 +4565,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) struct acpi_dmar_atsr *atsr; struct acpi_dmar_reserved_memory *rmrr; - if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING) + if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING) return 0; list_for_each_entry(rmrru, &dmar_rmrr_units, list) { diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 9f44ee8ea1bc..b8dcf44df441 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -103,7 +103,7 @@ static bool of_iommu_driver_present(struct device_node *np) * it never will be. We don't want to defer indefinitely, nor attempt * to dereference __iommu_of_table after it's been freed. */ - if (system_state > SYSTEM_BOOTING) + if (system_state >= SYSTEM_RUNNING) return false; return of_match_node(&__iommu_of_table, np); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 192e7b681b96..fe6be6382505 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -320,10 +320,19 @@ static long local_pci_probe(void *_ddi) return 0; } +static bool pci_physfn_is_probed(struct pci_dev *dev) +{ +#ifdef CONFIG_PCI_IOV + return dev->is_virtfn && dev->physfn->is_probed; +#else + return false; +#endif +} + static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, const struct pci_device_id *id) { - int error, node; + int error, node, cpu; struct drv_dev_and_id ddi = { drv, dev, id }; /* @@ -332,33 +341,27 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, * on the right node. */ node = dev_to_node(&dev->dev); + dev->is_probed = 1; + + cpu_hotplug_disable(); /* - * On NUMA systems, we are likely to call a PF probe function using - * work_on_cpu(). If that probe calls pci_enable_sriov() (which - * adds the VF devices via pci_bus_add_device()), we may re-enter - * this function to call the VF probe function. Calling - * work_on_cpu() again will cause a lockdep warning. Since VFs are - * always on the same node as the PF, we can work around this by - * avoiding work_on_cpu() when we're already on the correct node. - * - * Preemption is enabled, so it's theoretically unsafe to use - * numa_node_id(), but even if we run the probe function on the - * wrong node, it should be functionally correct. + * Prevent nesting work_on_cpu() for the case where a Virtual Function + * device is probed from work_on_cpu() of the Physical device. */ - if (node >= 0 && node != numa_node_id()) { - int cpu; - - get_online_cpus(); + if (node < 0 || node >= MAX_NUMNODES || !node_online(node) || + pci_physfn_is_probed(dev)) + cpu = nr_cpu_ids; + else cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); - if (cpu < nr_cpu_ids) - error = work_on_cpu(cpu, local_pci_probe, &ddi); - else - error = local_pci_probe(&ddi); - put_online_cpus(); - } else + + if (cpu < nr_cpu_ids) + error = work_on_cpu(cpu, local_pci_probe, &ddi); + else error = local_pci_probe(&ddi); + dev->is_probed = 0; + cpu_hotplug_enable(); return error; } diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 94f8038864b4..ed4c343d08c4 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -29,7 +29,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); -int __init parse_ras_param(char *str) +static int __init parse_ras_param(char *str) { #ifdef CONFIG_RAS_CEC parse_cec_param(str); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index c1ec8ee80924..9e35032351a0 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -190,6 +190,7 @@ static void do_poweroff(void) { switch (system_state) { case SYSTEM_BOOTING: + case SYSTEM_SCHEDULING: orderly_poweroff(true); break; case SYSTEM_RUNNING: diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f2b10d9ebd04..81490456c242 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -96,6 +96,7 @@ struct clocksource { void (*suspend)(struct clocksource *cs); void (*resume)(struct clocksource *cs); void (*mark_unstable)(struct clocksource *cs); + void (*tick_stable)(struct clocksource *cs); /* private: */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f92081234afd..ca73bc1563f4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -99,26 +99,32 @@ static inline void cpu_maps_update_done(void) extern struct bus_type cpu_subsys; #ifdef CONFIG_HOTPLUG_CPU -/* Stop CPUs going up and down. */ - -extern void cpu_hotplug_begin(void); -extern void cpu_hotplug_done(void); -extern void get_online_cpus(void); -extern void put_online_cpus(void); +extern void cpus_write_lock(void); +extern void cpus_write_unlock(void); +extern void cpus_read_lock(void); +extern void cpus_read_unlock(void); +extern void lockdep_assert_cpus_held(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); -#else /* CONFIG_HOTPLUG_CPU */ - -static inline void cpu_hotplug_begin(void) {} -static inline void cpu_hotplug_done(void) {} -#define get_online_cpus() do { } while (0) -#define put_online_cpus() do { } while (0) -#define cpu_hotplug_disable() do { } while (0) -#define cpu_hotplug_enable() do { } while (0) -#endif /* CONFIG_HOTPLUG_CPU */ +#else /* CONFIG_HOTPLUG_CPU */ + +static inline void cpus_write_lock(void) { } +static inline void cpus_write_unlock(void) { } +static inline void cpus_read_lock(void) { } +static inline void cpus_read_unlock(void) { } +static inline void lockdep_assert_cpus_held(void) { } +static inline void cpu_hotplug_disable(void) { } +static inline void cpu_hotplug_enable(void) { } +#endif /* !CONFIG_HOTPLUG_CPU */ + +/* Wrappers which go away once all code is converted */ +static inline void cpu_hotplug_begin(void) { cpus_write_lock(); } +static inline void cpu_hotplug_done(void) { cpus_write_unlock(); } +static inline void get_online_cpus(void) { cpus_read_lock(); } +static inline void put_online_cpus(void) { cpus_read_unlock(); } #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0f2a80377520..df3d2719a796 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -153,6 +153,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance); +int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, + bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance); /** * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks * @state: The state for which the calls are installed @@ -171,6 +176,15 @@ static inline int cpuhp_setup_state(enum cpuhp_state state, return __cpuhp_setup_state(state, name, true, startup, teardown, false); } +static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state_cpuslocked(state, name, true, startup, + teardown, false); +} + /** * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the * callbacks @@ -191,6 +205,15 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, false); } +static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state_cpuslocked(state, name, false, startup, + teardown, false); +} + /** * cpuhp_setup_state_multi - Add callbacks for multi state * @state: The state for which the calls are installed @@ -217,6 +240,8 @@ static inline int cpuhp_setup_state_multi(enum cpuhp_state state, int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke); +int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, + struct hlist_node *node, bool invoke); /** * cpuhp_state_add_instance - Add an instance for a state and invoke startup @@ -249,7 +274,15 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, return __cpuhp_state_add_instance(state, node, false); } +static inline int +cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state, + struct hlist_node *node) +{ + return __cpuhp_state_add_instance_cpuslocked(state, node, false); +} + void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); +void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke); /** * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown @@ -273,6 +306,11 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state) +{ + __cpuhp_remove_state_cpuslocked(state, false); +} + /** * cpuhp_remove_multi_state - Remove hotplug multi state callback * @state: The state for which the calls are removed diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 2404ad238c0b..4bf4479a3a80 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsigned int i, int node); (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) +extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); + +/** + * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location + * @cpu: the (optionally unsigned) integer iterator + * @mask: the cpumask poiter + * @start: the start location + * + * The implementation does not assume any bit in @mask is set (including @start). + * + * After the loop, cpu is >= nr_cpu_ids. + */ +#define for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ + (cpu) < nr_cpumask_bits; \ + (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) + /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator @@ -276,6 +293,12 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +{ + __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + + /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) @@ -286,6 +309,11 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) +{ + __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 13bc08aba704..1c91f26e2996 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -490,9 +490,13 @@ extern int root_mountflags; extern bool early_boot_irqs_disabled; -/* Values used for system_state */ +/* + * Values used for system_state. Ordering of the states must not be changed + * as code checks for <, <=, >, >= STATE. + */ extern enum system_states { SYSTEM_BOOTING, + SYSTEM_SCHEDULING, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, diff --git a/include/linux/llist.h b/include/linux/llist.h index 171baa90f6f6..d11738110a7a 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -110,6 +110,25 @@ static inline void init_llist_head(struct llist_head *list) for ((pos) = (node); pos; (pos) = (pos)->next) /** + * llist_for_each_safe - iterate over some deleted entries of a lock-less list + * safe against removal of list entry + * @pos: the &struct llist_node to use as a loop cursor + * @n: another &struct llist_node to use as temporary storage + * @node: the first entry of deleted list entries + * + * In general, some entries of the lock-less list can be traversed + * safely only after being deleted from list, so start with an entry + * instead of list head. + * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry. If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each_safe(pos, n, node) \ + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) + +/** * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type * @pos: the type * to use as a loop cursor. * @node: the fist entry of deleted list entries. diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 136dfdf63ba1..fc412fbd80bd 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -14,6 +14,10 @@ #include <asm/page.h> +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +#include <asm/tlbbatch.h> +#endif + #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) @@ -67,12 +71,15 @@ struct page_frag { struct tlbflush_unmap_batch { #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* - * Each bit set is a CPU that potentially has a TLB entry for one of - * the PFNs being flushed. See set_tlb_ubc_flush_pending(). + * The arch code makes the following promise: generic code can modify a + * PTE, then call arch_tlbbatch_add_mm() (which internally provides all + * needed barriers), then call arch_tlbbatch_flush(), and the entries + * will be flushed on all CPUs by the time that arch_tlbbatch_flush() + * returns. */ - struct cpumask cpumask; + struct arch_tlbflush_unmap_batch arch; - /* True if any bit in cpumask is set */ + /* True if a flush is needed. */ bool flush_required; /* diff --git a/include/linux/padata.h b/include/linux/padata.h index 0f9e567d5e15..2f9c1f93b1ce 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -166,9 +166,6 @@ struct padata_instance { extern struct padata_instance *padata_alloc_possible( struct workqueue_struct *wq); -extern struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask); extern void padata_free(struct padata_instance *pinst); extern int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu); diff --git a/include/linux/pci.h b/include/linux/pci.h index 8039f9f0ca05..58f1ab06c4e8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -376,6 +376,7 @@ struct pci_dev { unsigned int irq_managed:1; unsigned int has_secondary_link:1; unsigned int non_compliant_bars:1; /* broken BARs; ignore them */ + unsigned int is_probed:1; /* device probing in progress */ pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 24a635887f28..7d6aa29094b2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -801,6 +801,8 @@ struct perf_cpu_context { struct list_head sched_cb_entry; int sched_cb_usage; + + int online; }; struct perf_output_handle { diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 8c1e43ab14a9..34e893a75771 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -73,12 +73,6 @@ struct k_itimer { } real; struct cpu_timer_list cpu; struct { - unsigned int clock; - unsigned int node; - unsigned long incr; - unsigned long expires; - } mmtimer; - struct { struct alarm alarmtimer; ktime_t interval; } alarm; @@ -105,10 +99,11 @@ struct k_clock { struct itimerspec64 *cur_setting); }; -extern struct k_clock clock_posix_cpu; -extern struct k_clock clock_posix_dynamic; - -void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock); +extern const struct k_clock clock_posix_cpu; +extern const struct k_clock clock_posix_dynamic; +extern const struct k_clock clock_process; +extern const struct k_clock clock_thread; +extern const struct k_clock alarm_clock; /* function to call to trigger timer event */ int posix_timer_event(struct k_itimer *timr, int si_private); diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b69fc650201..3dfa5f99d6ee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1265,6 +1265,16 @@ extern struct pid *cad_pid; #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) +static inline bool is_percpu_thread(void) +{ +#ifdef CONFIG_SMP + return (current->flags & PF_NO_SETAFFINITY) && + (current->nr_cpus_allowed == 1); +#else + return true; +#endif +} + /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 34fe92ce1ebd..a55600ffdf4b 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -23,10 +23,6 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline void sched_clock_init_late(void) -{ -} - static inline void sched_clock_tick(void) { } @@ -39,7 +35,7 @@ static inline void sched_clock_idle_sleep_event(void) { } -static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +static inline void sched_clock_idle_wakeup_event(void) { } @@ -53,7 +49,6 @@ static inline u64 local_clock(void) return sched_clock(); } #else -extern void sched_clock_init_late(void); extern int sched_clock_stable(void); extern void clear_sched_clock_stable(void); @@ -63,10 +58,10 @@ extern void clear_sched_clock_stable(void); */ extern u64 __sched_clock_offset; - extern void sched_clock_tick(void); +extern void sched_clock_tick_stable(void); extern void sched_clock_idle_sleep_event(void); -extern void sched_clock_idle_wakeup_event(u64 delta_ns); +extern void sched_clock_idle_wakeup_event(void); /* * As outlined in clock.c, provides a fast, high resolution, nanosecond diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 3cc9632dcc2a..3d60275e3ba9 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -116,15 +116,29 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * @fn() runs. * * This can be thought of as a very heavy write lock, equivalent to - * grabbing every spinlock in the kernel. */ + * grabbing every spinlock in the kernel. + * + * Protects against CPU hotplug. + */ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); +/** + * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function + * @fn: the function to run + * @data: the data ptr for the @fn() + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) + * + * Same as above. Must be called from with in a cpus_read_lock() protected + * region. Avoids nested calls to cpus_read_lock(). + */ +int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); + int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); #else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -static inline int stop_machine(cpu_stop_fn_t fn, void *data, - const struct cpumask *cpus) +static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { unsigned long flags; int ret; @@ -134,6 +148,12 @@ static inline int stop_machine(cpu_stop_fn_t fn, void *data, return ret; } +static inline int stop_machine(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) +{ + return stop_machine_cpuslocked(fn, data, cpus); +} + static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index e75e1b6ff27f..09299fcb842a 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -54,7 +54,11 @@ struct itimerval { #define CLOCK_BOOTTIME 7 #define CLOCK_REALTIME_ALARM 8 #define CLOCK_BOOTTIME_ALARM 9 -#define CLOCK_SGI_CYCLE 10 /* Hardware specific */ +/* + * The driver implementing this got removed. The clock ID is kept as a + * place holder. Do not reuse! + */ +#define CLOCK_SGI_CYCLE 10 #define CLOCK_TAI 11 #define MAX_CLOCKS 16 diff --git a/init/main.c b/init/main.c index f866510472d7..df58a416dd1d 100644 --- a/init/main.c +++ b/init/main.c @@ -389,6 +389,7 @@ static __initdata DECLARE_COMPLETION(kthreadd_done); static noinline void __ref rest_init(void) { + struct task_struct *tsk; int pid; rcu_scheduler_starting(); @@ -397,12 +398,32 @@ static noinline void __ref rest_init(void) * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - kernel_thread(kernel_init, NULL, CLONE_FS); + pid = kernel_thread(kernel_init, NULL, CLONE_FS); + /* + * Pin init on the boot CPU. Task migration is not properly working + * until sched_init_smp() has been run. It will set the allowed + * CPUs for init to the non isolated CPUs. + */ + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id())); + rcu_read_unlock(); + numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); rcu_read_lock(); kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); rcu_read_unlock(); + + /* + * Enable might_sleep() and smp_processor_id() checks. + * They cannot be enabled earlier because with CONFIG_PRREMPT=y + * kernel_thread() would trigger might_sleep() splats. With + * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled + * already, but it's stuck on the kthreadd_done completion. + */ + system_state = SYSTEM_SCHEDULING; + complete(&kthreadd_done); /* @@ -1015,10 +1036,6 @@ static noinline void __init kernel_init_freeable(void) * init can allocate pages on any node */ set_mems_allowed(node_states[N_MEMORY]); - /* - * init can run on any cpu. - */ - set_cpus_allowed_ptr(current, cpu_all_mask); cad_pid = task_pid(current); diff --git a/kernel/async.c b/kernel/async.c index d2edd6efec56..2cbd3dd5940d 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work) ktime_t uninitialized_var(calltime), delta, rettime; /* 1) run (and print duration) */ - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", @@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain { ktime_t uninitialized_var(starttime), delta, endtime; - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } wait_event(async_done, lowest_in_progress(domain) >= cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { endtime = ktime_get(); delta = ktime_sub(endtime, starttime); diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ae6fbe5b5cf..7435ffc6163b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -27,6 +27,7 @@ #include <linux/smpboot.h> #include <linux/relay.h> #include <linux/slab.h> +#include <linux/percpu-rwsem.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -65,6 +66,12 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) +static struct lock_class_key cpuhp_state_key; +static struct lockdep_map cpuhp_state_lock_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +#endif + /** * cpuhp_step - Hotplug state machine step * @name: Name of the step @@ -196,121 +203,41 @@ void cpu_maps_update_done(void) mutex_unlock(&cpu_add_remove_lock); } -/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. +/* + * If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; #ifdef CONFIG_HOTPLUG_CPU -static struct { - struct task_struct *active_writer; - /* wait queue to wake up the active_writer */ - wait_queue_head_t wq; - /* verifies that no writer will get active while readers are active */ - struct mutex lock; - /* - * Also blocks the new readers during - * an ongoing cpu hotplug operation. - */ - atomic_t refcount; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} cpu_hotplug = { - .active_writer = NULL, - .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), - .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), -#ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), -#endif -}; - -/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ -#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire_tryread() \ - lock_map_acquire_tryread(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) -#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) +DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); - -void get_online_cpus(void) +void cpus_read_lock(void) { - might_sleep(); - if (cpu_hotplug.active_writer == current) - return; - cpuhp_lock_acquire_read(); - mutex_lock(&cpu_hotplug.lock); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); + percpu_down_read(&cpu_hotplug_lock); } -EXPORT_SYMBOL_GPL(get_online_cpus); +EXPORT_SYMBOL_GPL(cpus_read_lock); -void put_online_cpus(void) +void cpus_read_unlock(void) { - int refcount; - - if (cpu_hotplug.active_writer == current) - return; - - refcount = atomic_dec_return(&cpu_hotplug.refcount); - if (WARN_ON(refcount < 0)) /* try to fix things up */ - atomic_inc(&cpu_hotplug.refcount); - - if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) - wake_up(&cpu_hotplug.wq); - - cpuhp_lock_release(); - + percpu_up_read(&cpu_hotplug_lock); } -EXPORT_SYMBOL_GPL(put_online_cpus); +EXPORT_SYMBOL_GPL(cpus_read_unlock); -/* - * This ensures that the hotplug operation can begin only when the - * refcount goes to zero. - * - * Note that during a cpu-hotplug operation, the new readers, if any, - * will be blocked by the cpu_hotplug.lock - * - * Since cpu_hotplug_begin() is always called after invoking - * cpu_maps_update_begin(), we can be sure that only one writer is active. - * - * Note that theoretically, there is a possibility of a livelock: - * - Refcount goes to zero, last reader wakes up the sleeping - * writer. - * - Last reader unlocks the cpu_hotplug.lock. - * - A new reader arrives at this moment, bumps up the refcount. - * - The writer acquires the cpu_hotplug.lock finds the refcount - * non zero and goes to sleep again. - * - * However, this is very difficult to achieve in practice since - * get_online_cpus() not an api which is called all that often. - * - */ -void cpu_hotplug_begin(void) +void cpus_write_lock(void) { - DEFINE_WAIT(wait); - - cpu_hotplug.active_writer = current; - cpuhp_lock_acquire(); + percpu_down_write(&cpu_hotplug_lock); +} - for (;;) { - mutex_lock(&cpu_hotplug.lock); - prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); - if (likely(!atomic_read(&cpu_hotplug.refcount))) - break; - mutex_unlock(&cpu_hotplug.lock); - schedule(); - } - finish_wait(&cpu_hotplug.wq, &wait); +void cpus_write_unlock(void) +{ + percpu_up_write(&cpu_hotplug_lock); } -void cpu_hotplug_done(void) +void lockdep_assert_cpus_held(void) { - cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); - cpuhp_lock_release(); + percpu_rwsem_assert_held(&cpu_hotplug_lock); } /* @@ -344,8 +271,6 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -/* Notifier wrappers for transitioning to state machine */ - static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -484,6 +409,7 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; + lock_map_acquire(&cpuhp_state_lock_map); /* Single callback invocation for [un]install ? */ if (st->single) { if (st->cb_state < CPUHP_AP_ONLINE) { @@ -510,6 +436,7 @@ static void cpuhp_thread_fun(unsigned int cpu) else if (st->state > st->target) ret = cpuhp_ap_offline(cpu, st); } + lock_map_release(&cpuhp_state_lock_map); st->result = ret; complete(&st->done); } @@ -524,6 +451,9 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); + /* * If we are up and running, use the hotplug thread. For early calls * we invoke the thread function directly. @@ -567,6 +497,8 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state state = st->state; trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); __cpuhp_kick_ap_work(st); wait_for_completion(&st->done); trace_cpuhp_exit(cpu, st->state, state, st->result); @@ -701,7 +633,7 @@ static int takedown_cpu(unsigned int cpu) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); + err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ irq_unlock_sparse(); @@ -773,7 +705,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (!cpu_present(cpu)) return -EINVAL; - cpu_hotplug_begin(); + cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; @@ -811,7 +743,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, } out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -893,7 +825,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) struct task_struct *idle; int ret = 0; - cpu_hotplug_begin(); + cpus_write_lock(); if (!cpu_present(cpu)) { ret = -EINVAL; @@ -941,7 +873,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, target); out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -1413,18 +1345,20 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, } } -int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, - bool invoke) +int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, + struct hlist_node *node, + bool invoke) { struct cpuhp_step *sp; int cpu; int ret; + lockdep_assert_cpus_held(); + sp = cpuhp_get_step(state); if (sp->multi_instance == false) return -EINVAL; - get_online_cpus(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) @@ -1453,13 +1387,23 @@ add_node: hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + return ret; +} + +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** - * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state @@ -1468,25 +1412,27 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * @multi_instance: State is set up for multiple instances which get * added afterwards. * + * The caller needs to hold cpus read locked while calling this function. * Returns: * On success: * Positive state number if @state is CPUHP_AP_ONLINE_DYN * 0 for all other states * On failure: proper (negative) error code */ -int __cpuhp_setup_state(enum cpuhp_state state, - const char *name, bool invoke, - int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu), - bool multi_instance) +int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) { int cpu, ret = 0; bool dynstate; + lockdep_assert_cpus_held(); + if (cpuhp_cb_check(state) || !name) return -EINVAL; - get_online_cpus(); mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, @@ -1522,7 +1468,6 @@ int __cpuhp_setup_state(enum cpuhp_state state, } out: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the * dynamically allocated state in case of success. @@ -1531,6 +1476,22 @@ out: return state; return ret; } +EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked); + +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup, + teardown, multi_instance); + cpus_read_unlock(); + return ret; +} EXPORT_SYMBOL(__cpuhp_setup_state); int __cpuhp_state_remove_instance(enum cpuhp_state state, @@ -1544,7 +1505,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, if (!sp->multi_instance) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !cpuhp_get_teardown_cb(state)) @@ -1565,29 +1526,30 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, remove: hlist_del(node); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return 0; } EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** - * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state * @state: The state to remove * @invoke: If true, the teardown function is invoked for cpus where * cpu state >= @state * + * The caller needs to hold cpus read locked while calling this function. * The teardown callback is currently not allowed to fail. Think * about module removal! */ -void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); - get_online_cpus(); + lockdep_assert_cpus_held(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { @@ -1615,7 +1577,14 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); +} +EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked); + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + cpus_read_lock(); + __cpuhp_remove_state_cpuslocked(state, invoke); + cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..8d6acaeeea17 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; +static cpumask_var_t perf_online_mask; /* * perf event paranoia level: @@ -3812,14 +3813,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); @@ -7703,7 +7696,8 @@ static int swevent_hlist_get_cpu(int cpu) int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { + if (!swevent_hlist_deref(swhash) && + cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -7724,7 +7718,7 @@ static int swevent_hlist_get(void) { int err, cpu, failed_cpu; - get_online_cpus(); + mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { @@ -7732,8 +7726,7 @@ static int swevent_hlist_get(void) goto fail; } } - put_online_cpus(); - + mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { @@ -7741,8 +7734,7 @@ fail: break; swevent_hlist_put_cpu(cpu); } - - put_online_cpus(); + mutex_unlock(&pmus_lock); return err; } @@ -8920,7 +8912,7 @@ perf_event_mux_interval_ms_store(struct device *dev, pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); @@ -8929,7 +8921,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpu_function_call(cpu, (remote_function_f)perf_mux_hrtimer_restart, cpuctx); } - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; @@ -9059,6 +9051,7 @@ skip_type: lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); __perf_mux_hrtimer_init(cpuctx, cpu); } @@ -9172,7 +9165,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { - struct pmu *pmu = NULL; + struct pmu *pmu; int idx; int ret; @@ -9456,9 +9449,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } pmu = perf_init_event(event); - if (!pmu) - goto err_ns; - else if (IS_ERR(pmu)) { + if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } @@ -9471,8 +9462,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, sizeof(unsigned long), GFP_KERNEL); - if (!event->addr_filters_offs) + if (!event->addr_filters_offs) { + err = -ENOMEM; goto err_per_task; + } /* force hw sync on the address filters */ event->addr_filters_gen = 1; @@ -9882,12 +9875,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } - get_online_cpus(); - if (task) { err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (err) - goto err_cpus; + goto err_cred; /* * Reuse ptrace permission checks for now. @@ -10073,6 +10064,23 @@ SYSCALL_DEFINE5(perf_event_open, goto err_locked; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). + */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } + } + + /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. @@ -10162,8 +10170,6 @@ SYSCALL_DEFINE5(perf_event_open, put_task_struct(task); } - put_online_cpus(); - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -10197,8 +10203,6 @@ err_alloc: err_cred: if (task) mutex_unlock(&task->signal->cred_guard_mutex); -err_cpus: - put_online_cpus(); err_task: if (task) put_task_struct(task); @@ -10253,6 +10257,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). + */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + if (!cpuctx->online) { + err = -ENODEV; + goto err_unlock; + } + } + if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_unlock; @@ -10920,6 +10939,8 @@ static void __init perf_event_init_all_cpus(void) struct swevent_htable *swhash; int cpu; + zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); @@ -10935,7 +10956,7 @@ static void __init perf_event_init_all_cpus(void) } } -int perf_event_init_cpu(unsigned int cpu) +void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10948,7 +10969,6 @@ int perf_event_init_cpu(unsigned int cpu) rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); - return 0; } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -10966,19 +10986,22 @@ static void __perf_event_exit_context(void *__info) static void perf_event_exit_cpu_context(int cpu) { + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; struct pmu *pmu; - int idx; - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + mutex_lock(&pmus_lock); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; mutex_unlock(&ctx->mutex); } - srcu_read_unlock(&pmus_srcu, idx); + cpumask_clear_cpu(cpu, perf_online_mask); + mutex_unlock(&pmus_lock); } #else @@ -10986,6 +11009,29 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +int perf_event_init_cpu(unsigned int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + + perf_swevent_init_cpu(cpu); + + mutex_lock(&pmus_lock); + cpumask_set_cpu(cpu, perf_online_mask); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); + } + mutex_unlock(&pmus_lock); + + return 0; +} + int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); diff --git a/kernel/extable.c b/kernel/extable.c index 2676d7f8baf6..0fbdd8582f08 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr) addr < (unsigned long)_etext) return 1; - if (system_state == SYSTEM_BOOTING && + if (system_state < SYSTEM_RUNNING && init_kernel_text(addr)) return 1; return 0; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 22e443133987..c5bbaed5a5e6 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -480,7 +480,8 @@ int __init early_irq_init(void) /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", + NR_IRQS, nr_irqs, initcnt); if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) nr_irqs = IRQ_BITMAP_BITS; @@ -516,7 +517,7 @@ int __init early_irq_init(void) init_irq_default_affinity(); - printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); desc = irq_desc; count = ARRAY_SIZE(irq_desc); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 31805f237396..70b9da72018b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -746,13 +746,54 @@ unsigned int irq_find_mapping(struct irq_domain *domain, EXPORT_SYMBOL_GPL(irq_find_mapping); #ifdef CONFIG_IRQ_DOMAIN_DEBUG +static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) +{ + struct irq_domain *domain; + struct irq_data *data; + + domain = desc->irq_data.domain; + data = &desc->irq_data; + + while (domain) { + unsigned int irq = data->irq; + unsigned long hwirq = data->hwirq; + struct irq_chip *chip; + bool direct; + + if (data == &desc->irq_data) + seq_printf(m, "%5d ", irq); + else + seq_printf(m, "%5d+ ", irq); + seq_printf(m, "0x%05lx ", hwirq); + + chip = irq_data_get_irq_chip(data); + seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); + + seq_printf(m, data ? "0x%p " : " %p ", + irq_data_get_irq_chip_data(data)); + + seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); + direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); + seq_printf(m, "%6s%-8s ", + (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", + direct ? "(DIRECT)" : ""); + seq_printf(m, "%s\n", domain->name); +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + domain = domain->parent; + data = data->parent_data; +#else + domain = NULL; +#endif + } +} + static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void *data, **slot; + void **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -760,15 +801,26 @@ static int virq_debug_show(struct seq_file *m, void *private) mutex_lock(&irq_domain_mutex); list_for_each_entry(domain, &irq_domain_list, link) { struct device_node *of_node; + const char *name; + int count = 0; + of_node = irq_domain_get_of_node(domain); + if (of_node) + name = of_node_full_name(of_node); + else if (is_fwnode_irqchip(domain->fwnode)) + name = container_of(domain->fwnode, struct irqchip_fwid, + fwnode)->name; + else + name = ""; + radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) count++; seq_printf(m, "%c%-16s %6u %10u %10u %s\n", domain == irq_default_domain ? '*' : ' ', domain->name, domain->revmap_size + count, domain->revmap_size, domain->revmap_direct_max_irq, - of_node ? of_node_full_name(of_node) : ""); + name); } mutex_unlock(&irq_domain_mutex); @@ -782,30 +834,7 @@ static int virq_debug_show(struct seq_file *m, void *private) continue; raw_spin_lock_irqsave(&desc->lock, flags); - domain = desc->irq_data.domain; - - if (domain) { - struct irq_chip *chip; - int hwirq = desc->irq_data.hwirq; - bool direct; - - seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05x ", hwirq); - - chip = irq_desc_get_chip(desc); - seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - - data = irq_desc_get_chip_data(desc); - seq_printf(m, data ? "0x%p " : " %p ", data); - - seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); - direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); - seq_printf(m, "%6s%-8s ", - (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", - direct ? "(DIRECT)" : ""); - seq_printf(m, "%s\n", desc->irq_data.domain->name); - } - + virq_debug_show_one(m, desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ddc2f5427f75..fe4d48ec5bc4 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -265,13 +265,19 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent) { + struct irq_domain *domain; + if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) msi_domain_update_dom_ops(info); if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); - return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, - fwnode, &msi_domain_ops, info); + domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, + fwnode, &msi_domain_ops, info); + if (domain && info->chip && info->chip->name) + domain->name = info->chip->name; + + return domain; } int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 6c9cb208ac48..d11c506a6ac3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -15,6 +15,7 @@ #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> +#include <linux/cpu.h> #ifdef HAVE_JUMP_LABEL @@ -124,6 +125,7 @@ void static_key_slow_inc(struct static_key *key) return; } + cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); @@ -133,12 +135,14 @@ void static_key_slow_inc(struct static_key *key) atomic_inc(&key->enabled); } jump_label_unlock(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + cpus_read_lock(); /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -149,6 +153,7 @@ static void __static_key_slow_dec(struct static_key *key, if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); + cpus_read_unlock(); return; } @@ -159,6 +164,7 @@ static void __static_key_slow_dec(struct static_key *key, jump_label_update(key); } jump_label_unlock(); + cpus_read_unlock(); } static void jump_label_update_timeout(struct work_struct *work) @@ -334,6 +340,7 @@ void __init jump_label_init(void) if (static_key_initialized) return; + cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); @@ -353,6 +360,7 @@ void __init jump_label_init(void) } static_key_initialized = true; jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_MODULES @@ -590,28 +598,28 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, struct module *mod = data; int ret = 0; + cpus_read_lock(); + jump_label_lock(); + switch (val) { case MODULE_STATE_COMING: - jump_label_lock(); ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } - jump_label_unlock(); break; case MODULE_STATE_GOING: - jump_label_lock(); jump_label_del_module(mod); - jump_label_unlock(); break; case MODULE_STATE_LIVE: - jump_label_lock(); jump_label_invalidate_module_init(mod); - jump_label_unlock(); break; } + jump_label_unlock(); + cpus_read_unlock(); + return notifier_from_errno(ret); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index adfe3b4cfe05..6756d750b31b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -483,11 +483,6 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); */ static void do_optimize_kprobes(void) { - /* Optimization never be done when disarmed */ - if (kprobes_all_disarmed || !kprobes_allow_optimization || - list_empty(&optimizing_list)) - return; - /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. @@ -495,14 +490,19 @@ static void do_optimize_kprobes(void) * This combination can cause a deadlock (cpu-hotplug try to lock * text_mutex but stop_machine can not be done because online_cpus * has been changed) - * To avoid this deadlock, we need to call get_online_cpus() + * To avoid this deadlock, caller must have locked cpu hotplug * for preventing cpu-hotplug outside of text_mutex locking. */ - get_online_cpus(); + lockdep_assert_cpus_held(); + + /* Optimization never be done when disarmed */ + if (kprobes_all_disarmed || !kprobes_allow_optimization || + list_empty(&optimizing_list)) + return; + mutex_lock(&text_mutex); arch_optimize_kprobes(&optimizing_list); mutex_unlock(&text_mutex); - put_online_cpus(); } /* @@ -513,12 +513,13 @@ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; + /* See comment in do_optimize_kprobes() */ + lockdep_assert_cpus_held(); + /* Unoptimization must be done anytime */ if (list_empty(&unoptimizing_list)) return; - /* Ditto to do_optimize_kprobes */ - get_online_cpus(); mutex_lock(&text_mutex); arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ @@ -537,7 +538,6 @@ static void do_unoptimize_kprobes(void) list_del_init(&op->list); } mutex_unlock(&text_mutex); - put_online_cpus(); } /* Reclaim all kprobes on the free_list */ @@ -562,6 +562,7 @@ static void kick_kprobe_optimizer(void) static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); + cpus_read_lock(); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -587,6 +588,7 @@ static void kprobe_optimizer(struct work_struct *work) do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ @@ -650,9 +652,8 @@ static void optimize_kprobe(struct kprobe *p) /* Short cut to direct unoptimizing */ static void force_unoptimize_kprobe(struct optimized_kprobe *op) { - get_online_cpus(); + lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); - put_online_cpus(); if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); } @@ -791,6 +792,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) return; /* For preparing optimization, jump_label_text_reserved() is called */ + cpus_read_lock(); jump_label_lock(); mutex_lock(&text_mutex); @@ -812,6 +814,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) out: mutex_unlock(&text_mutex); jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_SYSCTL @@ -826,6 +829,7 @@ static void optimize_all_kprobes(void) if (kprobes_allow_optimization) goto out; + cpus_read_lock(); kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -833,6 +837,7 @@ static void optimize_all_kprobes(void) if (!kprobe_disabled(p)) optimize_kprobe(p); } + cpus_read_unlock(); printk(KERN_INFO "Kprobes globally optimized\n"); out: mutex_unlock(&kprobe_mutex); @@ -851,6 +856,7 @@ static void unoptimize_all_kprobes(void) return; } + cpus_read_lock(); kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -859,6 +865,7 @@ static void unoptimize_all_kprobes(void) unoptimize_kprobe(p, false); } } + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Wait for unoptimizing completion */ @@ -1010,14 +1017,11 @@ static void arm_kprobe(struct kprobe *kp) arm_kprobe_ftrace(kp); return; } - /* - * Here, since __arm_kprobe() doesn't use stop_machine(), - * this doesn't cause deadlock on text_mutex. So, we don't - * need get_online_cpus(). - */ + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* Disarm a kprobe with text_mutex */ @@ -1027,10 +1031,12 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt) disarm_kprobe_ftrace(kp); return; } - /* Ditto */ + + cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* @@ -1298,13 +1304,10 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) int ret = 0; struct kprobe *ap = orig_p; + cpus_read_lock(); + /* For preparing optimization, jump_label_text_reserved() is called */ jump_label_lock(); - /* - * Get online CPUs to avoid text_mutex deadlock.with stop machine, - * which is invoked by unoptimize_kprobe() in add_new_kprobe() - */ - get_online_cpus(); mutex_lock(&text_mutex); if (!kprobe_aggrprobe(orig_p)) { @@ -1352,8 +1355,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) out: mutex_unlock(&text_mutex); - put_online_cpus(); jump_label_unlock(); + cpus_read_unlock(); if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; @@ -1555,9 +1558,12 @@ int register_kprobe(struct kprobe *p) goto out; } - mutex_lock(&text_mutex); /* Avoiding text modification */ + cpus_read_lock(); + /* Prevent text modification */ + mutex_lock(&text_mutex); ret = prepare_kprobe(p); mutex_unlock(&text_mutex); + cpus_read_unlock(); if (ret) goto out; @@ -1570,7 +1576,6 @@ int register_kprobe(struct kprobe *p) /* Try to optimize kprobe */ try_to_optimize_kprobe(p); - out: mutex_unlock(&kprobe_mutex); diff --git a/kernel/padata.c b/kernel/padata.c index ac8f1e524836..868f947166d7 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -934,29 +934,18 @@ static struct kobj_type padata_attr_type = { }; /** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @wq: workqueue to use for the allocated padata instance - */ -struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) -{ - return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); - -/** * padata_alloc - allocate and initialize a padata instance and specify * cpumasks for serial and parallel workers. * * @wq: workqueue to use for the allocated padata instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization + * + * Must be called from a cpus_read_lock() protected region */ -struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { struct padata_instance *pinst; struct parallel_data *pd = NULL; @@ -965,7 +954,6 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, if (!pinst) goto err; - get_online_cpus(); if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_inst; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { @@ -989,14 +977,12 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, pinst->flags = 0; - put_online_cpus(); - BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); #endif return pinst; @@ -1005,12 +991,27 @@ err_free_masks: free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); - put_online_cpus(); err: return NULL; } /** + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + * + * Must be called from a cpus_read_lock() protected region + */ +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + lockdep_assert_cpus_held(); + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + +/** * padata_free - free a padata instance * * @padata_inst: padata instance to free diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9995b2dfaaff..e782e9049e73 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1176,7 +1176,7 @@ static void boot_delay_msec(int level) unsigned long long k; unsigned long timeout; - if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress_message_printing(level)) { return; } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 00a45c45beca..ca0f8fc945c6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -64,6 +64,7 @@ #include <linux/workqueue.h> #include <linux/compiler.h> #include <linux/tick.h> +#include <linux/init.h> /* * Scheduler clock - returns current time in nanosec units. @@ -124,14 +125,27 @@ int sched_clock_stable(void) return static_branch_likely(&__sched_clock_stable); } +static void __scd_stamp(struct sched_clock_data *scd) +{ + scd->tick_gtod = ktime_get_ns(); + scd->tick_raw = sched_clock(); +} + static void __set_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); + struct sched_clock_data *scd; /* + * Since we're still unstable and the tick is already running, we have + * to disable IRQs in order to get a consistent scd->tick* reading. + */ + local_irq_disable(); + scd = this_scd(); + /* * Attempt to make the (initial) unstable->stable transition continuous. */ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); + local_irq_enable(); printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", scd->tick_gtod, __gtod_offset, @@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void) tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } +/* + * If we ever get here, we're screwed, because we found out -- typically after + * the fact -- that TSC wasn't good. This means all our clocksources (including + * ktime) could have reported wrong values. + * + * What we do here is an attempt to fix up and continue sort of where we left + * off in a coherent manner. + * + * The only way to fully avoid random clock jumps is to boot with: + * "tsc=unstable". + */ static void __sched_clock_work(struct work_struct *work) { + struct sched_clock_data *scd; + int cpu; + + /* take a current timestamp and set 'now' */ + preempt_disable(); + scd = this_scd(); + __scd_stamp(scd); + scd->clock = scd->tick_gtod + __gtod_offset; + preempt_enable(); + + /* clone to all CPUs */ + for_each_possible_cpu(cpu) + per_cpu(sched_clock_data, cpu) = *scd; + + printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n"); + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); + static_branch_disable(&__sched_clock_stable); } @@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work); static void __clear_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); - - /* - * Attempt to make the stable->unstable transition continuous. - * - * Trouble is, this is typically called from the TSC watchdog - * timer, which is late per definition. This means the tick - * values can already be screwy. - * - * Still do what we can. - */ - __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); - - printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", - scd->tick_gtod, __gtod_offset, - scd->tick_raw, __sched_clock_offset); + if (!sched_clock_stable()) + return; tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); - - if (sched_clock_stable()) - schedule_work(&sched_clock_work); + schedule_work(&sched_clock_work); } void clear_sched_clock_stable(void) @@ -183,7 +211,11 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -void sched_clock_init_late(void) +/* + * We run this as late_initcall() such that it runs after all built-in drivers, + * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. + */ +static int __init sched_clock_init_late(void) { sched_clock_running = 2; /* @@ -197,7 +229,10 @@ void sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); + + return 0; } +late_initcall(sched_clock_init_late); /* * min, max except they take wrapping into account @@ -347,21 +382,38 @@ void sched_clock_tick(void) { struct sched_clock_data *scd; + if (sched_clock_stable()) + return; + + if (unlikely(!sched_clock_running)) + return; + WARN_ON_ONCE(!irqs_disabled()); + scd = this_scd(); + __scd_stamp(scd); + sched_clock_local(scd); +} + +void sched_clock_tick_stable(void) +{ + u64 gtod, clock; + + if (!sched_clock_stable()) + return; + /* - * Update these values even if sched_clock_stable(), because it can - * become unstable at any point in time at which point we need some - * values to fall back on. + * Called under watchdog_lock. * - * XXX arguably we can skip this if we expose tsc_clocksource_reliable + * The watchdog just found this TSC to (still) be stable, so now is a + * good moment to update our __gtod_offset. Because once we find the + * TSC to be unstable, any computation will be computing crap. */ - scd = this_scd(); - scd->tick_raw = sched_clock(); - scd->tick_gtod = ktime_get_ns(); - - if (!sched_clock_stable() && likely(sched_clock_running)) - sched_clock_local(scd); + local_irq_disable(); + gtod = ktime_get_ns(); + clock = sched_clock(); + __gtod_offset = (clock + __sched_clock_offset) - gtod; + local_irq_enable(); } /* @@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void) EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * We just idled delta nanoseconds (called with irqs disabled): + * We just idled; resync with ktime. */ -void sched_clock_idle_wakeup_event(u64 delta_ns) +void sched_clock_idle_wakeup_event(void) { - if (timekeeping_suspended) + unsigned long flags; + + if (sched_clock_stable()) + return; + + if (unlikely(timekeeping_suspended)) return; + local_irq_save(flags); sched_clock_tick(); - touch_softlockup_watchdog_sched(); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 803c3bc274c4..c3e50cada84d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1731,7 +1731,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct task_struct *p; + struct task_struct *p, *t; struct rq_flags rf; if (!llist) @@ -1740,17 +1740,8 @@ void sched_ttwu_pending(void) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - while (llist) { - int wake_flags = 0; - - p = llist_entry(llist, struct task_struct, wake_entry); - llist = llist_next(llist); - - if (p->sched_remote_wakeup) - wake_flags = WF_MIGRATED; - - ttwu_do_activate(rq, p, wake_flags, &rf); - } + llist_for_each_entry_safe(p, t, llist, wake_entry) + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); rq_unlock_irqrestore(rq, &rf); } @@ -4197,8 +4188,8 @@ static int __sched_setscheduler(struct task_struct *p, int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; - /* May grab non-irq protected spin_locks: */ - BUG_ON(in_interrupt()); + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); recheck: /* Double check policy once rq lock held: */ if (policy < 0) { @@ -5958,7 +5949,6 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); sched_init_numa(); @@ -5968,7 +5958,7 @@ void __init sched_init_smp(void) * happen. */ mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); + sched_init_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -5984,7 +5974,6 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); - sched_clock_init_late(); sched_smp_initialized = true; } @@ -6000,7 +5989,6 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); - sched_clock_init_late(); } #endif /* CONFIG_SMP */ @@ -6199,7 +6187,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); @@ -6251,8 +6238,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) + system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || + oops_in_progress) return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a2ce59015642..df6c2912bd60 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1533,7 +1533,7 @@ retry: * then possible that next_task has migrated. */ task = pick_next_pushable_dl_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task is still there. We don't try * again, some other cpu will pull it when ready. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d71109321841..47a0c552c77b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) } /* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -2469,7 +2470,8 @@ void task_numa_work(struct callback_head *work) return; - down_read(&mm->mmap_sem); + if (!down_read_trylock(&mm->mmap_sem)) + return; vma = find_vma(mm, start); if (!vma) { reset_ptenuma_scan(p); @@ -2916,12 +2918,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); if (cfs_rq) { cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); return 1; } @@ -4642,24 +4644,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +/* + * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * + * The race is harmless, since modifying bandwidth settings of unhooked group + * bits doesn't do much. + */ + +/* cpu online calback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; raw_spin_unlock(&cfs_b->lock); } + rcu_read_unlock(); } +/* cpu offline callback */ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; + + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - for_each_leaf_cfs_rq(rq, cfs_rq) { if (!cfs_rq->runtime_enabled) continue; @@ -4677,6 +4698,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } + rcu_read_unlock(); } #else /* CONFIG_CFS_BANDWIDTH */ @@ -5484,12 +5506,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), + if (!cpumask_intersects(sched_group_span(group), &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + sched_group_span(group)); /* * Tally up the load of all CPUs in the group and find @@ -5499,7 +5521,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, runnable_load = 0; max_spare_cap = 0; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu(i, sched_group_span(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -5602,10 +5624,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Check if we have any choice: */ if (group->group_weight == 1) - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5640,43 +5662,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -/* - * Implement a for_each_cpu() variant that starts the scan at a given cpu - * (@start), and wraps around. - * - * This is used to scan for idle CPUs; such that not all CPUs looking for an - * idle CPU find the same CPU. The down-side is that tasks tend to cycle - * through the LLC domain. - * - * Especially tbench is found sensitive to this. - */ - -static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) -{ - int next; - -again: - next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); - - if (*wrapped) { - if (next >= start) - return nr_cpumask_bits; - } else { - if (next >= nr_cpumask_bits) { - *wrapped = 1; - n = -1; - goto again; - } - } - - return next; -} - -#define for_each_cpu_wrap(cpu, mask, start, wrap) \ - for ((wrap) = 0, (cpu) = (start)-1; \ - (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ - (cpu) < nr_cpumask_bits; ) - #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -5736,7 +5721,7 @@ unlock: static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu, wrap; + int core, cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5746,7 +5731,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); - for_each_cpu_wrap(core, cpus, target, wrap) { + for_each_cpu_wrap(core, cpus, target) { bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -5812,7 +5797,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 avg_cost, avg_idle = this_rq()->avg_idle; u64 time, cost; s64 delta; - int cpu, wrap; + int cpu; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5829,7 +5814,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = local_clock(); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) @@ -6970,10 +6955,28 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; struct rq_flags rf; rq_lock_irqsave(rq, &rf); @@ -6983,7 +6986,7 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; /* throttled entities do not contribute to load */ @@ -6997,6 +7000,13 @@ static void update_blocked_averages(int cpu) se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) update_load_avg(se, 0); + + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. + */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); } rq_unlock_irqrestore(rq, &rf); } @@ -7229,7 +7239,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) { + for_each_cpu(cpu, sched_group_span(sdg)) { struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); @@ -7408,7 +7418,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); /* Bias balancing toward cpus of our domain */ @@ -7572,7 +7582,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *sgs = &tmp_sgs; int local_group; - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); if (local_group) { sds->local = sg; sgs = local; @@ -7927,7 +7937,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -8033,7 +8043,6 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - struct cpumask *sg_cpus, *sg_mask; int cpu, balance_cpu = -1; /* @@ -8043,11 +8052,9 @@ static int should_we_balance(struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE) return 1; - sg_cpus = sched_group_cpus(sg); - sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ - for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { + if (!idle_cpu(cpu)) continue; balance_cpu = cpu; @@ -8083,7 +8090,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), + .dst_grpmask = sched_group_span(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, @@ -9523,10 +9530,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 11192e0cb122..dc4d1483b038 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -76,7 +76,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 979b7341008a..581d5c7a5264 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + int skip; + + /* + * When span == cpu_online_mask, taking each rq->lock + * can be time-consuming. Try to avoid it when possible. + */ + raw_spin_lock(&rt_rq->rt_runtime_lock); + skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (skip) + continue; raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { @@ -1819,7 +1830,7 @@ retry: * pushing. */ task = pick_next_pushable_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6dda2aab731e..f8cf1d87f065 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -606,11 +606,9 @@ struct root_domain { extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; -extern cpumask_var_t fallback_doms; -extern cpumask_var_t sched_domains_tmpmask; extern void init_defrootdomain(void); -extern int init_sched_domains(const struct cpumask *cpu_map); +extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); #endif /* CONFIG_SMP */ @@ -1025,7 +1023,11 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - unsigned long cpumask[0]; /* iteration mask */ +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + + unsigned long cpumask[0]; /* balance mask */ }; struct sched_group { @@ -1046,16 +1048,15 @@ struct sched_group { unsigned long cpumask[0]; }; -static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +static inline struct cpumask *sched_group_span(struct sched_group *sg) { return to_cpumask(sg->cpumask); } /* - * cpumask masking which cpus in the group are allowed to iterate up the domain - * tree. + * See build_balance_mask(). */ -static inline struct cpumask *sched_group_mask(struct sched_group *sg) +static inline struct cpumask *group_balance_mask(struct sched_group *sg) { return to_cpumask(sg->sgc->cpumask); } @@ -1066,7 +1067,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg) */ static inline unsigned int group_first_cpu(struct sched_group *group) { - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); } extern int group_balance_cpu(struct sched_group *sg); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1b0b4fb12837..79895aec281e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ cpumask_var_t sched_domains_tmpmask; +cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG @@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); @@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %*pbl level %s\n", + printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + if (!cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_cpus(group))) { + if (!cpumask_weight(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } if (!(sd->flags & SD_OVERLAP) && - cpumask_intersects(groupmask, sched_group_cpus(group))) { + cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + cpumask_or(groupmask, groupmask, sched_group_span(group)); - printk(KERN_CONT " %*pbl", - cpumask_pr_args(sched_group_cpus(group))); - if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %lu)", - group->sgc->capacity); + printk(KERN_CONT " %d:{ span=%*pbl", + group->sgc->id, + cpumask_pr_args(sched_group_span(group))); + + if ((sd->flags & SD_OVERLAP) && + !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { + printk(KERN_CONT " mask=%*pbl", + cpumask_pr_args(group_balance_mask(group))); + } + + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) + printk(KERN_CONT " cap=%lu", group->sgc->capacity); + + if (group == sd->groups && sd->child && + !cpumask_equal(sched_domain_span(sd->child), + sched_group_span(group))) { + printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); } + printk(KERN_CONT " }"); + group = group->next; + + if (group != sd->groups) + printk(KERN_CONT ","); + } while (group != sd->groups); printk(KERN_CONT "\n"); @@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) return; } - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu); for (;;) { if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) @@ -477,46 +496,214 @@ enum s_alloc { }; /* - * Build an iteration mask that can exclude certain CPUs from the upwards - * domain traversal. + * Return the canonical balance CPU for this group, this is the first CPU + * of this group that's also in the balance mask. * - * Asymmetric node setups can result in situations where the domain tree is of - * unequal depth, make sure to skip domains that already cover the entire - * range. + * The balance mask are all those CPUs that could actually end up at this + * group. See build_balance_mask(). * - * In that case build_sched_domains() will have terminated the iteration early - * and our sibling sd spans will be empty. Domains should always include the - * CPU they're built on, so check that. + * Also see should_we_balance(). */ -static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +int group_balance_cpu(struct sched_group *sg) { - const struct cpumask *span = sched_domain_span(sd); + return cpumask_first(group_balance_mask(sg)); +} + + +/* + * NUMA topology (first read the regular topology blurb below) + * + * Given a node-distance table, for example: + * + * node 0 1 2 3 + * 0: 10 20 30 20 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 20 30 20 10 + * + * which represents a 4 node ring topology like: + * + * 0 ----- 1 + * | | + * | | + * | | + * 3 ----- 2 + * + * We want to construct domains and groups to represent this. The way we go + * about doing this is to build the domains on 'hops'. For each NUMA level we + * construct the mask of all nodes reachable in @level hops. + * + * For the above NUMA topology that gives 3 levels: + * + * NUMA-2 0-3 0-3 0-3 0-3 + * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} + * + * NUMA-1 0-1,3 0-2 1-3 0,2-3 + * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} + * + * NUMA-0 0 1 2 3 + * + * + * As can be seen; things don't nicely line up as with the regular topology. + * When we iterate a domain in child domain chunks some nodes can be + * represented multiple times -- hence the "overlap" naming for this part of + * the topology. + * + * In order to minimize this overlap, we only build enough groups to cover the + * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. + * + * Because: + * + * - the first group of each domain is its child domain; this + * gets us the first 0-1,3 + * - the only uncovered node is 2, who's child domain is 1-3. + * + * However, because of the overlap, computing a unique CPU for each group is + * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both + * groups include the CPUs of Node-0, while those CPUs would not in fact ever + * end up at those groups (they would end up in group: 0-1,3). + * + * To correct this we have to introduce the group balance mask. This mask + * will contain those CPUs in the group that can reach this group given the + * (child) domain tree. + * + * With this we can once again compute balance_cpu and sched_group_capacity + * relations. + * + * XXX include words on how balance_cpu is unique and therefore can be + * used for sched_group_capacity links. + * + * + * Another 'interesting' topology is: + * + * node 0 1 2 3 + * 0: 10 20 20 30 + * 1: 20 10 20 20 + * 2: 20 20 10 20 + * 3: 30 20 20 10 + * + * Which looks a little like: + * + * 0 ----- 1 + * | / | + * | / | + * | / | + * 2 ----- 3 + * + * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3 + * are not. + * + * This leads to a few particularly weird cases where the sched_domain's are + * not of the same number for each cpu. Consider: + * + * NUMA-2 0-3 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-1 0-2 0-3 0-3 1-3 + * + * NUMA-0 0 1 2 3 + * + */ + + +/* + * Build the balance mask; it contains only those CPUs that can arrive at this + * group and should be considered to continue balancing. + * + * We do this during the group creation pass, therefore the group information + * isn't complete yet, however since each group represents a (child) domain we + * can fully construct this using the sched_domain bits (which are already + * complete). + */ +static void +build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) +{ + const struct cpumask *sg_span = sched_group_span(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; - for_each_cpu(i, span) { + cpumask_clear(mask); + + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + + /* + * Can happen in the asymmetric case, where these siblings are + * unused. The mask will not be empty because those CPUs that + * do have the top domain _should_ span the domain. + */ + if (!sibling->child) continue; - cpumask_set_cpu(i, sched_group_mask(sg)); + /* If we would not end up here, we can't continue from here */ + if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) + continue; + + cpumask_set_cpu(i, mask); } + + /* We must not have empty masks here */ + WARN_ON_ONCE(cpumask_empty(mask)); } /* - * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. + * XXX: This creates per-node group entries; since the load-balancer will + * immediately access remote memory to construct this group's load-balance + * statistics having the groups node local is of dubious benefit. */ -int group_balance_cpu(struct sched_group *sg) +static struct sched_group * +build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) { - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); + struct sched_group *sg; + struct cpumask *sg_span; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(cpu)); + + if (!sg) + return NULL; + + sg_span = sched_group_span(sg); + if (sd->child) + cpumask_copy(sg_span, sched_domain_span(sd->child)); + else + cpumask_copy(sg_span, sched_domain_span(sd)); + + return sg; +} + +static void init_overlap_sched_group(struct sched_domain *sd, + struct sched_group *sg) +{ + struct cpumask *mask = sched_domains_tmpmask2; + struct sd_data *sdd = sd->private; + struct cpumask *sg_span; + int cpu; + + build_balance_mask(sd, sg, mask); + cpu = cpumask_first_and(sched_group_span(sg), mask); + + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + if (atomic_inc_return(&sg->sgc->ref) == 1) + cpumask_copy(group_balance_mask(sg), mask); + else + WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); + + /* + * Initialize sgc->capacity such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg_span = sched_group_span(sg); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; } static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + struct sched_group *first = NULL, *last = NULL, *sg; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; @@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct cpumask *sg_span; if (cpumask_test_cpu(i, covered)) @@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sibling = *per_cpu_ptr(sdd->sd, i); - /* See the comment near build_group_mask(). */ + /* + * Asymmetric node setups can result in situations where the + * domain tree is of unequal depth, make sure to skip domains + * that already cover the entire range. + * + * In that case build_sched_domains() will have terminated the + * iteration early and our sibling sd spans will be empty. + * Domains should always include the CPU they're built on, so + * check that. + */ if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; - sg_span = sched_group_cpus(sg); - if (sibling->child) - cpumask_copy(sg_span, sched_domain_span(sibling->child)); - else - cpumask_set_cpu(i, sg_span); - + sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); - sg->sgc = *per_cpu_ptr(sdd->sgc, i); - if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); - - /* - * Initialize sgc->capacity such that even if we mess up the - * domains and no possible iteration will get us here, we won't - * die on a /0 trap. - */ - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; - - /* - * Make sure the first group of this domain contains the - * canonical balance CPU. Otherwise the sched_domain iteration - * breaks. See update_sg_lb_stats(). - */ - if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - group_balance_cpu(sg) == cpu) - groups = sg; + init_overlap_sched_group(sd, sg); if (!first) first = sg; @@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) last = sg; last->next = first; } - sd->groups = groups; + sd->groups = first; return 0; @@ -589,23 +759,106 @@ fail: return -ENOMEM; } -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) + +/* + * Package topology (also see the load-balance blurb in fair.c) + * + * The scheduler builds a tree structure to represent a number of important + * topology features. By default (default_topology[]) these include: + * + * - Simultaneous multithreading (SMT) + * - Multi-Core Cache (MC) + * - Package (DIE) + * + * Where the last one more or less denotes everything up to a NUMA node. + * + * The tree consists of 3 primary data structures: + * + * sched_domain -> sched_group -> sched_group_capacity + * ^ ^ ^ ^ + * `-' `-' + * + * The sched_domains are per-cpu and have a two way link (parent & child) and + * denote the ever growing mask of CPUs belonging to that level of topology. + * + * Each sched_domain has a circular (double) linked list of sched_group's, each + * denoting the domains of the level below (or individual CPUs in case of the + * first domain level). The sched_group linked by a sched_domain includes the + * CPU of that sched_domain [*]. + * + * Take for instance a 2 threaded, 2 core, 2 cache cluster part: + * + * CPU 0 1 2 3 4 5 6 7 + * + * DIE [ ] + * MC [ ] [ ] + * SMT [ ] [ ] [ ] [ ] + * + * - or - + * + * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 + * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 + * + * CPU 0 1 2 3 4 5 6 7 + * + * One way to think about it is: sched_domain moves you up and down among these + * topology levels, while sched_group moves you sideways through it, at child + * domain granularity. + * + * sched_group_capacity ensures each unique sched_group has shared storage. + * + * There are two related construction problems, both require a CPU that + * uniquely identify each group (for a given domain): + * + * - The first is the balance_cpu (see should_we_balance() and the + * load-balance blub in fair.c); for each group we only want 1 CPU to + * continue balancing at a higher domain. + * + * - The second is the sched_group_capacity; we want all identical groups + * to share a single sched_group_capacity. + * + * Since these topologies are exclusive by construction. That is, its + * impossible for an SMT thread to belong to multiple cores, and cores to + * be part of multiple caches. There is a very clear and unique location + * for each CPU in the hierarchy. + * + * Therefore computing a unique CPU for each group is trivial (the iteration + * mask is redundant and set all 1s; all CPUs in a group will end up at _that_ + * group), we can simply pick the first CPU in each group. + * + * + * [*] in other words, the first group of each domain is its child domain. + */ + +static struct sched_group *get_group(int cpu, struct sd_data *sdd) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; + struct sched_group *sg; if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + sg = *per_cpu_ptr(sdd->sg, cpu); + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + + /* For claim_allocations: */ + atomic_inc(&sg->ref); + atomic_inc(&sg->sgc->ref); - /* For claim_allocations: */ - atomic_set(&(*sg)->sgc->ref, 1); + if (child) { + cpumask_copy(sched_group_span(sg), sched_domain_span(child)); + cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + } else { + cpumask_set_cpu(cpu, sched_group_span(sg)); + cpumask_set_cpu(cpu, group_balance_mask(sg)); } - return cpu; + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + + return sg; } /* @@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu) struct cpumask *covered; int i; - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(span)) - return 0; - lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct sched_group *sg; - int group, j; if (cpumask_test_cpu(i, covered)) continue; - group = get_group(i, sdd, &sg); - cpumask_setall(sched_group_mask(sg)); + sg = get_group(i, sdd); - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } + cpumask_or(covered, covered, sched_group_span(sg)); if (!first) first = sg; @@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) last = sg; } last->next = first; + sd->groups = first; return 0; } @@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) do { int cpu, max_cpu = -1; - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg->group_weight = cpumask_weight(sched_group_span(sg)); if (!(sd->flags & SD_ASYM_PACKING)) goto next; - for_each_cpu(cpu, sched_group_cpus(sg)) { + for_each_cpu(cpu, sched_group_span(sg)) { if (max_cpu < 0) max_cpu = cpu; else if (sched_asym_prefer(cpu, max_cpu)) @@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; +#ifdef CONFIG_SCHED_DEBUG + sgc->id = j; +#endif + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + if (tl->flags & SDTL_OVERLAP) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; @@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur; * cpumask) fails, then fallback to a single sched domain, * as determined by the single cpumask fallback_doms. */ -cpumask_var_t fallback_doms; +static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated CPUs, but could be used to * exclude other special cases in the future. */ -int init_sched_domains(const struct cpumask *cpu_map) +int sched_init_domains(const struct cpumask *cpu_map) { int err; + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); + zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); + zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); diff --git a/kernel/smp.c b/kernel/smp.c index a817769b53c0..3061483cb3ad 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -30,6 +30,7 @@ enum { struct call_function_data { struct call_single_data __percpu *csd; cpumask_var_t cpumask; + cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); @@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu) if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; + if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); + return -ENOMEM; + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } @@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu) struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } @@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask, cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, cfd->cpumask); + __cpumask_clear_cpu(this_cpu, cfd->cpumask); /* Some callers race with other cpus changing the passed mask */ if (unlikely(!cpumask_weight(cfd->cpumask))) return; + cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); @@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask, csd->flags |= CSD_FLAG_SYNCHRONOUS; csd->func = func; csd->info = info; - llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); + if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) + __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(cfd->cpumask); + arch_send_call_function_ipi_mask(cfd->cpumask_ipi); if (wait) { for_each_cpu(cpu, cfd->cpumask) { diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1eb82661ecdb..b7591261652d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -552,7 +552,8 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) +int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, @@ -561,6 +562,8 @@ static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cp .active_cpus = cpus, }; + lockdep_assert_cpus_held(); + if (!stop_machine_initialized) { /* * Handle the case where stop_machine() is called @@ -590,9 +593,9 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) int ret; /* No CPUs can come up or down during this. */ - get_online_cpus(); - ret = __stop_machine(fn, data, cpus); - put_online_cpus(); + cpus_read_lock(); + ret = stop_machine_cpuslocked(fn, data, cpus); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5cb5b0008d97..4cfebfff848d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -45,11 +45,13 @@ static struct alarm_base { clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; +#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS) /* freezer information to handle clock_nanosleep triggered wakeups */ static enum alarmtimer_type freezer_alarmtype; static ktime_t freezer_expires; static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +#endif static struct wakeup_source *ws; @@ -307,38 +309,6 @@ static int alarmtimer_resume(struct device *dev) } #endif -static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) -{ - struct alarm_base *base; - unsigned long flags; - ktime_t delta; - - switch(type) { - case ALARM_REALTIME: - base = &alarm_bases[ALARM_REALTIME]; - type = ALARM_REALTIME_FREEZER; - break; - case ALARM_BOOTTIME: - base = &alarm_bases[ALARM_BOOTTIME]; - type = ALARM_BOOTTIME_FREEZER; - break; - default: - WARN_ONCE(1, "Invalid alarm type: %d\n", type); - return; - } - - delta = ktime_sub(absexp, base->gettime()); - - spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta || (delta < freezer_delta)) { - freezer_delta = delta; - freezer_expires = absexp; - freezer_alarmtype = type; - } - spin_unlock_irqrestore(&freezer_delta_lock, flags); -} - - /** * alarm_init - Initialize an alarm structure * @alarm: ptr to alarm to be initialized @@ -488,6 +458,38 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward_now); +#ifdef CONFIG_POSIX_TIMERS + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ + struct alarm_base *base; + unsigned long flags; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } + + delta = ktime_sub(absexp, base->gettime()); + + spin_lock_irqsave(&freezer_delta_lock, flags); + if (!freezer_delta || (delta < freezer_delta)) { + freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } + spin_unlock_irqrestore(&freezer_delta_lock, flags); +} /** * clock2alarm - helper that converts from clockid to alarmtypes @@ -846,6 +848,17 @@ out: return ret; } +const struct k_clock alarm_clock = { + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = alarm_timer_set, + .timer_del = alarm_timer_del, + .timer_get = alarm_timer_get, + .nsleep = alarm_timer_nsleep, +}; +#endif /* CONFIG_POSIX_TIMERS */ + /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { @@ -871,23 +884,9 @@ static int __init alarmtimer_init(void) struct platform_device *pdev; int error = 0; int i; - struct k_clock alarm_clock = { - .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, - .timer_create = alarm_timer_create, - .timer_set = alarm_timer_set, - .timer_del = alarm_timer_del, - .timer_get = alarm_timer_get, - .nsleep = alarm_timer_nsleep, - }; alarmtimer_rtc_timer_init(); - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { - posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); - posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); - } - /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 93621ae718d3..03918a19cf2d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data) continue; } + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 31d588d37a17..7e453005e078 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -434,7 +434,7 @@ static int pc_timer_settime(struct k_itimer *kit, int flags, return err; } -struct k_clock clock_posix_dynamic = { +const struct k_clock clock_posix_dynamic = { .clock_getres = pc_clock_getres, .clock_set = pc_clock_settime, .clock_get = pc_clock_gettime, diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index d2a1e6dd0291..c99434739fd5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1421,7 +1421,7 @@ static int thread_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } -struct k_clock clock_posix_cpu = { +const struct k_clock clock_posix_cpu = { .clock_getres = posix_cpu_clock_getres, .clock_set = posix_cpu_clock_set, .clock_get = posix_cpu_clock_get, @@ -1433,24 +1433,16 @@ struct k_clock clock_posix_cpu = { .timer_get = posix_cpu_timer_get, }; -static __init int init_posix_cpu_timers(void) -{ - struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, - }; - struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, - }; - - posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); +const struct k_clock clock_process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, +}; - return 0; -} -__initcall(init_posix_cpu_timers); +const struct k_clock clock_thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, +}; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4d7b2ce09c27..0c0cccfa3586 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -125,8 +125,6 @@ static DEFINE_SPINLOCK(hash_lock); * which we beg off on and pass to do_sys_settimeofday(). */ -static struct k_clock posix_clocks[MAX_CLOCKS]; - /* * These ones are defined below. */ @@ -280,74 +278,87 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } + +static const struct k_clock clock_realtime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic_raw = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_monotonic_raw, +}; + +static const struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, +}; + +static const struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, +}; + +static const struct k_clock clock_tai = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_boottime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock * const posix_clocks[] = { + [CLOCK_REALTIME] = &clock_realtime, + [CLOCK_MONOTONIC] = &clock_monotonic, + [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, + [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, + [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, + [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, + [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, + [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_REALTIME_ALARM] = &alarm_clock, + [CLOCK_BOOTTIME_ALARM] = &alarm_clock, + [CLOCK_TAI] = &clock_tai, +}; + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { - struct k_clock clock_realtime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic_raw = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, - }; - struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - }; - struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - }; - struct k_clock clock_tai = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_boottime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - - posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); - posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); - posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); - posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); - posix_timers_register_clock(CLOCK_TAI, &clock_tai); - posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); @@ -521,30 +532,6 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void posix_timers_register_clock(const clockid_t clock_id, - struct k_clock *new_clock) -{ - if ((unsigned) clock_id >= MAX_CLOCKS) { - printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", - clock_id); - return; - } - - if (!new_clock->clock_get) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", - clock_id); - return; - } - if (!new_clock->clock_getres) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", - clock_id); - return; - } - - posix_clocks[clock_id] = *new_clock; -} -EXPORT_SYMBOL_GPL(posix_timers_register_clock); - static struct k_itimer * alloc_posix_timer(void) { struct k_itimer *tmr; @@ -581,15 +568,15 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) call_rcu(&tmr->it.rcu, k_itimer_rcu_free); } -static struct k_clock *clockid_to_kclock(const clockid_t id) +static const struct k_clock *clockid_to_kclock(const clockid_t id) { if (id < 0) return (id & CLOCKFD_MASK) == CLOCKFD ? &clock_posix_dynamic : &clock_posix_cpu; - if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) return NULL; - return &posix_clocks[id]; + return posix_clocks[id]; } static int common_timer_create(struct k_itimer *new_timer) @@ -604,7 +591,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; sigevent_t event; @@ -781,7 +768,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec64 cur_setting64; struct itimerspec cur_setting; struct k_itimer *timr; - struct k_clock *kc; + const struct k_clock *kc; unsigned long flags; int ret = 0; @@ -890,7 +877,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, struct itimerspec new_spec, old_spec; struct k_itimer *timr; unsigned long flag; - struct k_clock *kc; + const struct k_clock *kc; int error = 0; if (!new_setting) @@ -939,7 +926,7 @@ static int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - struct k_clock *kc = clockid_to_kclock(timer->it_clock); + const struct k_clock *kc = clockid_to_kclock(timer->it_clock); if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; @@ -1018,7 +1005,7 @@ void exit_itimers(struct signal_struct *sig) SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp64; struct timespec new_tp; @@ -1035,7 +1022,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp64; struct timespec kernel_tp; int error; @@ -1055,7 +1042,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, struct timex __user *, utx) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timex ktx; int err; @@ -1078,7 +1065,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp64; struct timespec rtn_tp; int error; @@ -1110,7 +1097,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t64; struct timespec t; @@ -1136,7 +1123,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, long clock_nanosleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) return -EINVAL; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 64c97fc130c4..2de9c553682f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; + /* + * In case the current tick fired too early past its expected + * expiration, make sure we don't bypass the next clock reprogramming + * to the same deadline. + */ + ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); @@ -554,7 +560,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); } static ktime_t tick_nohz_start_idle(struct tick_sched *ts) @@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + + /* + * Reset to make sure next tick stop doesn't get fooled by past + * cached clock deadline. + */ + ts->next_tick = 0; } static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, @@ -771,8 +783,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && (expires == dev->next_event)) - goto out; + if (ts->tick_stopped && (expires == ts->next_tick)) { + /* Sanity check: make sure clockevent is actually programmed */ + if (likely(dev->next_event <= ts->next_tick)) + goto out; + + WARN_ON_ONCE(1); + printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", + basemono, ts->next_tick, dev->next_event, + hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + } /* * nohz_stop_sched_tick can be called several times before @@ -791,6 +811,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, trace_tick_stop(1, TICK_DEP_MASK_NONE); } + ts->next_tick = tick; + /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. @@ -806,7 +828,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, else tick_program_event(tick, 1); out: - /* Update the estimated sleep length */ + /* + * Update the estimated sleep length until the next timer + * (not only the tick). + */ ts->sleep_length = ktime_sub(dev->next_event, now); return tick; } @@ -864,6 +889,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; + /* + * Make sure the CPU doesn't get fooled by obsolete tick + * deadline if it comes back online later. + */ + ts->next_tick = 0; return false; } @@ -1172,6 +1202,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (regs) tick_sched_handle(ts, regs); + else + ts->next_tick = 0; /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index bf38226e5c17..075444e3d48e 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -27,6 +27,7 @@ enum tick_nohz_mode { * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. + * @next_tick: Next tick to be fired when in dynticks mode. * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -44,6 +45,7 @@ struct tick_sched { unsigned long check_clocks; enum tick_nohz_mode nohz_mode; ktime_t last_tick; + ktime_t next_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; diff --git a/lib/cpumask.c b/lib/cpumask.c index 81dedaab36cc..4731a0895760 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) } EXPORT_SYMBOL(cpumask_any_but); +/** + * cpumask_next_wrap - helper to implement for_each_cpu_wrap + * @n: the cpu prior to the place to search + * @mask: the cpumask pointer + * @start: the start point of the iteration + * @wrap: assume @n crossing @start terminates the iteration + * + * Returns >= nr_cpu_ids on completion + * + * Note: the @wrap argument is required for the start condition when + * we cannot assume @start is set in @mask. + */ +int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) +{ + int next; + +again: + next = cpumask_next(n, mask); + + if (wrap && n < start && next >= start) { + return nr_cpumask_bits; + + } else if (next >= nr_cpumask_bits) { + wrap = true; + n = -1; + goto again; + } + + return next; +} +EXPORT_SYMBOL(cpumask_next_wrap); + /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK /** diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 690d75b132fa..2fb007be0212 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -28,7 +28,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1, /* * It is valid to assume CPU-locality during early bootup: */ - if (system_state != SYSTEM_RUNNING) + if (system_state < SYSTEM_SCHEDULING) goto out; /* diff --git a/mm/rmap.c b/mm/rmap.c index d405f0e0ee96..130c238fe384 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; - int cpu; if (!tlb_ubc->flush_required) return; - cpu = get_cpu(); - - if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - local_flush_tlb(); - trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); - } - - if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL); - cpumask_clear(&tlb_ubc->cpumask); + arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; - put_cpu(); } /* Flush iff there are potentially writable TLB entries that can race with IO */ @@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; - cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ad39bbc79e6..c3c1c6ac62da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3652,7 +3652,7 @@ int kswapd_run(int nid) pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ - BUG_ON(system_state == SYSTEM_BOOTING); + BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; |