From c88d5910890ad35af283344417891344604f0438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:50:02 +0200 Subject: sched: Merge select_task_rq_fair() and sched_balance_self() The problem with wake_idle() is that is doesn't respect things like cpu_power, which means it doesn't deal well with SMT nor the recent RT interaction. To cure this, it needs to do what sched_balance_self() does, which leads to the possibility of merging select_task_rq_fair() and sched_balance_self(). Modify sched_balance_self() to: - update_shares() when walking up the domain tree, (it only called it for the top domain, but it should have done this anyway), which allows us to remove this ugly bit from try_to_wake_up(). - do wake_affine() on the smallest domain that contains both this (the waking) and the prev (the wakee) cpu for WAKE invocations. Then use the top-down balance steps it had to replace wake_idle(). This leads to the dissapearance of SD_WAKE_BALANCE and SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE. SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective. Touch all topology bits to replace the old with new SD flags -- platforms might need re-tuning, enabling SD_BALANCE_WAKE conditionally on a NUMA distance seems like a good additional feature, magny-core and small nehalem systems would want this enabled, systems with slow interconnects would not. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 5 +++-- arch/mips/include/asm/mach-ip27/topology.h | 2 +- arch/powerpc/include/asm/topology.h | 5 ++--- arch/sh/include/asm/topology.h | 4 ++-- arch/sparc/include/asm/topology_64.h | 4 ++-- arch/x86/include/asm/topology.h | 4 +--- 6 files changed, 11 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 7b4c8c70b2d..cf6053b226c 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -67,6 +67,7 @@ void build_cpu_to_node_map(void); .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ + | SD_BALANCE_WAKE \ | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ .balance_interval = 1, \ @@ -91,8 +92,8 @@ void build_cpu_to_node_map(void); .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 64, \ .nr_balance_failed = 0, \ diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index 07547231e07..d8332398f5b 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -48,7 +48,7 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; .cache_nice_tries = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 054a16d6808..c6343313ff5 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -62,9 +62,8 @@ static inline int pcibus_to_node(struct pci_bus *bus) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_BALANCE_NEWIDLE \ - | SD_WAKE_IDLE \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index b69ee850906..dc1531e2f25 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -21,8 +21,8 @@ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index e5ea8d33242..1d091abd2d1 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -57,8 +57,8 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ } diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 26d06e052a1..966d58dc627 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -145,14 +145,12 @@ extern unsigned long node_remap_size[]; | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 1*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 1*SD_SERIALIZE \ - | 1*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ -- cgit v1.2.3 From 78e7ed53c9f42f04f9401ada6f7047db60781676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Sep 2009 13:16:51 +0200 Subject: sched: Tweak wake_idx When merging select_task_rq_fair() and sched_balance_self() we lost the use of wake_idx, restore that and set them to 0 to make wake balancing more aggressive. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 5 +++-- arch/powerpc/include/asm/topology.h | 3 ++- arch/sh/include/asm/topology.h | 2 +- arch/sparc/include/asm/topology_64.h | 2 +- arch/x86/include/asm/topology.h | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index cf6053b226c..47f3c51d5e2 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -62,11 +62,12 @@ void build_cpu_to_node_map(void); .busy_idx = 2, \ .idle_idx = 1, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ + | SD_BALANCE_FORK \ | SD_BALANCE_WAKE \ | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ @@ -87,7 +88,7 @@ void build_cpu_to_node_map(void); .busy_idx = 3, \ .idle_idx = 2, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index c6343313ff5..a6b220ab56d 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -58,9 +58,10 @@ static inline int pcibus_to_node(struct pci_bus *bus) .busy_idx = 3, \ .idle_idx = 1, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ + | SD_BALANCE_FORK \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index dc1531e2f25..9054e5c0ad5 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -16,7 +16,7 @@ .busy_idx = 3, \ .idle_idx = 2, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 1d091abd2d1..bc3a0930ed6 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -52,7 +52,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .busy_idx = 3, \ .idle_idx = 2, \ .newidle_idx = 0, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 966d58dc627..4b1b335097b 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -138,7 +138,7 @@ extern unsigned long node_remap_size[]; .busy_idx = 3, \ .idle_idx = SD_IDLE_IDX, \ .newidle_idx = SD_NEWIDLE_IDX, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = SD_FORKEXEC_IDX, \ \ .flags = 1*SD_LOAD_BALANCE \ -- cgit v1.2.3 From 0ec9fab3d186d9cbb00c0f694d4a260d07c198d9 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 15 Sep 2009 15:07:03 +0200 Subject: sched: Improve latencies and throughput Make the idle balancer more agressive, to improve a x264 encoding workload provided by Jason Garrett-Glaser: NEXT_BUDDY NO_LB_BIAS encoded 600 frames, 252.82 fps, 22096.60 kb/s encoded 600 frames, 250.69 fps, 22096.60 kb/s encoded 600 frames, 245.76 fps, 22096.60 kb/s NO_NEXT_BUDDY LB_BIAS encoded 600 frames, 344.44 fps, 22096.60 kb/s encoded 600 frames, 346.66 fps, 22096.60 kb/s encoded 600 frames, 352.59 fps, 22096.60 kb/s NO_NEXT_BUDDY NO_LB_BIAS encoded 600 frames, 425.75 fps, 22096.60 kb/s encoded 600 frames, 425.45 fps, 22096.60 kb/s encoded 600 frames, 422.49 fps, 22096.60 kb/s Peter pointed out that this is better done via newidle_idx, not via LB_BIAS, newidle balancing should look for where there is load _now_, not where there was load 2 ticks ago. Worst-case latencies are improved as well as no buddies means less vruntime spread. (as per prior lkml discussions) This change improves kbuild-peak parallelism as well. Reported-by: Jason Garrett-Glaser Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1253011667.9128.16.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 5 +++-- arch/powerpc/include/asm/topology.h | 2 +- arch/sh/include/asm/topology.h | 3 ++- arch/x86/include/asm/topology.h | 4 +--- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 47f3c51d5e2..42f1673ec83 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -61,7 +61,7 @@ void build_cpu_to_node_map(void); .cache_nice_tries = 2, \ .busy_idx = 2, \ .idle_idx = 1, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ @@ -87,10 +87,11 @@ void build_cpu_to_node_map(void); .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ | SD_BALANCE_WAKE \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a6b220ab56d..1a2c9eb42a0 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -57,7 +57,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) .cache_nice_tries = 1, \ .busy_idx = 3, \ .idle_idx = 1, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index 9054e5c0ad5..c8436771e31 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -15,13 +15,14 @@ .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ | SD_BALANCE_WAKE \ + | SD_BALANCE_NEWIDLE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 4b1b335097b..7fafd1bc414 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -116,14 +116,12 @@ extern unsigned long node_remap_size[]; # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 -# define SD_NEWIDLE_IDX 2 # define SD_FORKEXEC_IDX 0 #else # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 -# define SD_NEWIDLE_IDX 2 # define SD_FORKEXEC_IDX 1 #endif @@ -137,7 +135,7 @@ extern unsigned long node_remap_size[]; .cache_nice_tries = SD_CACHE_NICE_TRIES, \ .busy_idx = 3, \ .idle_idx = SD_IDLE_IDX, \ - .newidle_idx = SD_NEWIDLE_IDX, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = SD_FORKEXEC_IDX, \ \ -- cgit v1.2.3 From b8a543ea5a5896830a9969bacfd047f9d15940b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 15:22:03 +0200 Subject: sched: Reduce forkexec_idx If we're looking to place a new task, we might as well find the idlest position _now_, not 1 tick ago. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 4 ++-- arch/sh/include/asm/topology.h | 2 +- arch/sparc/include/asm/topology_64.h | 2 +- arch/x86/include/asm/topology.h | 4 +--- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 42f1673ec83..569b9dafc78 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -63,7 +63,7 @@ void build_cpu_to_node_map(void); .idle_idx = 1, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ @@ -89,7 +89,7 @@ void build_cpu_to_node_map(void); .idle_idx = 2, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index c8436771e31..a8cc564b703 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -17,7 +17,7 @@ .idle_idx = 2, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index bc3a0930ed6..10b979d1de2 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -53,7 +53,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .idle_idx = 2, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 7fafd1bc414..589f12383d7 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -116,13 +116,11 @@ extern unsigned long node_remap_size[]; # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 -# define SD_FORKEXEC_IDX 0 #else # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 -# define SD_FORKEXEC_IDX 1 #endif @@ -137,7 +135,7 @@ extern unsigned long node_remap_size[]; .idle_idx = SD_IDLE_IDX, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = SD_FORKEXEC_IDX, \ + .forkexec_idx = 0, \ \ .flags = 1*SD_LOAD_BALANCE \ | 1*SD_BALANCE_NEWIDLE \ -- cgit v1.2.3 From a8303aaf2b2f74714db6d204ab4fcb810942664e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Sep 2009 10:56:56 +0200 Subject: x86: Move APERF/MPERF into a X86_FEATURE Move the APERFMPERF capacility into a X86_FEATURE flag so that it can be used outside of the acpi cpufreq driver. Cc: H. Peter Anvin Cc: Venkatesh Pallipadi Cc: Yanmin Cc: Dave Jones Cc: Len Brown Cc: Yinghai Lu Cc: cpufreq@vger.kernel.org Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 9 ++------- arch/x86/kernel/cpu/intel.c | 6 ++++++ 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 847fee6493a..9cfc88b9774 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -96,6 +96,7 @@ #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ #define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b503220c..509e6a7db71 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -60,7 +60,6 @@ enum { }; #define INTEL_MSR_RANGE (0xffff) -#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1) struct acpi_cpufreq_data { struct acpi_processor_performance *acpi_data; @@ -731,12 +730,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) acpi_processor_notify_smm(THIS_MODULE); /* Check for APERF/MPERF support in hardware */ - if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { - unsigned int ecx; - ecx = cpuid_ecx(6); - if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) - acpi_cpufreq_driver.getavg = get_measured_perf; - } + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + acpi_cpufreq_driver.getavg = get_measured_perf; dprintk("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 80a722a071b..40e1835b35e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } + if (c->cpuid_level > 6) { + unsigned ecx = cpuid_ecx(6); + if (ecx & 0x01) + set_cpu_cap(c, X86_FEATURE_APERFMPERF); + } + if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (cpu_has_ds) { -- cgit v1.2.3 From 5cbc19a983141729d716be17197028434127b376 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Sep 2009 11:49:52 +0200 Subject: x86: Add generic aperf/mperf code Move some of the aperf/mperf code out from the cpufreq driver thingy so that other people can enjoy it too. Cc: H. Peter Anvin Cc: Venkatesh Pallipadi Cc: Yanmin Cc: Dave Jones Cc: Len Brown Cc: Yinghai Lu Cc: cpufreq@vger.kernel.org Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 30 ++++++++++++ arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 79 ++++-------------------------- 2 files changed, 39 insertions(+), 70 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e08ea043e08..4ae2ccfed63 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -27,6 +27,7 @@ struct mm_struct; #include #include #include +#include #include /* @@ -1020,4 +1021,33 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +struct aperfmperf { + u64 aperf, mperf; +}; + +static inline void get_aperfmperf(struct aperfmperf *am) +{ + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); + + rdmsrl(MSR_IA32_APERF, am->aperf); + rdmsrl(MSR_IA32_MPERF, am->mperf); +} + +#define APERFMPERF_SHIFT 10 + +static inline +unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, + struct aperfmperf *new) +{ + u64 aperf = new->aperf - old->aperf; + u64 mperf = new->mperf - old->mperf; + unsigned long ratio = aperf; + + mperf >>= APERFMPERF_SHIFT; + if (mperf) + ratio = div64_u64(aperf, mperf); + + return ratio; +} + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 509e6a7db71..4109679863c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -70,11 +70,7 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); -struct acpi_msr_data { - u64 saved_aperf, saved_mperf; -}; - -static DEFINE_PER_CPU(struct acpi_msr_data, msr_data); +static DEFINE_PER_CPU(struct aperfmperf, old_perf); DEFINE_TRACE(power_mark); @@ -243,23 +239,12 @@ static u32 get_cur_val(const struct cpumask *mask) return cmd.val; } -struct perf_pair { - union { - struct { - u32 lo; - u32 hi; - } split; - u64 whole; - } aperf, mperf; -}; - /* Called via smp_call_function_single(), on the target CPU */ static void read_measured_perf_ctrs(void *_cur) { - struct perf_pair *cur = _cur; + struct aperfmperf *am = _cur; - rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); - rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi); + get_aperfmperf(am); } /* @@ -278,63 +263,17 @@ static void read_measured_perf_ctrs(void *_cur) static unsigned int get_measured_perf(struct cpufreq_policy *policy, unsigned int cpu) { - struct perf_pair readin, cur; - unsigned int perf_percent; + struct aperfmperf perf; + unsigned long ratio; unsigned int retval; - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) + if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) return 0; - cur.aperf.whole = readin.aperf.whole - - per_cpu(msr_data, cpu).saved_aperf; - cur.mperf.whole = readin.mperf.whole - - per_cpu(msr_data, cpu).saved_mperf; - per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole; - per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole; - -#ifdef __i386__ - /* - * We dont want to do 64 bit divide with 32 bit kernel - * Get an approximate value. Return failure in case we cannot get - * an approximate value. - */ - if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) { - int shift_count; - u32 h; - - h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi); - shift_count = fls(h); - - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) { - int shift_count = 7; - cur.aperf.split.lo >>= shift_count; - cur.mperf.split.lo >>= shift_count; - } - - if (cur.aperf.split.lo && cur.mperf.split.lo) - perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo; - else - perf_percent = 0; - -#else - if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) { - int shift_count = 7; - cur.aperf.whole >>= shift_count; - cur.mperf.whole >>= shift_count; - } - - if (cur.aperf.whole && cur.mperf.whole) - perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole; - else - perf_percent = 0; - -#endif + ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); + per_cpu(old_perf, cpu) = perf; - retval = (policy->cpuinfo.max_freq * perf_percent) / 100; + retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; return retval; } -- cgit v1.2.3 From 47fe38fcff0517e67d395c039d2e26d2de688a60 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Sep 2009 13:49:18 +0200 Subject: x86: sched: Provide arch implementations using aperf/mperf APERF/MPERF support for cpu_power. APERF/MPERF is arch defined to be a relative scale of work capacity per logical cpu, this is assumed to include SMT and Turbo mode. APERF/MPERF are specified to both reset to 0 when either counter wraps, which is highly inconvenient, since that'll give a blimp when that happens. The manual specifies writing 0 to the counters after each read, but that's 1) too expensive, and 2) destroys the possibility of sharing these counters with other users, so we live with the blimp - the other existing user does too. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/sched.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/sched.c (limited to 'arch') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c1f253dac15..8dd30638fe4 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o +obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c new file mode 100644 index 00000000000..6c00a8f3cce --- /dev/null +++ b/arch/x86/kernel/cpu/sched.c @@ -0,0 +1,55 @@ +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct aperfmperf, old_perf); + +static unsigned long scale_aperfmperf(void) +{ + struct aperfmperf val, *old = &__get_cpu_var(old_perf); + unsigned long ratio, flags; + + local_irq_save(flags); + get_aperfmperf(&val); + local_irq_restore(flags); + + ratio = calc_aperfmperf_ratio(old, &val); + *old = val; + + return ratio; +} + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + /* + * do aperf/mperf on the cpu level because it includes things + * like turbo mode, which are relevant to full cores. + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return scale_aperfmperf(); + + /* + * maybe have something cpufreq here + */ + + return default_scale_freq_power(sd, cpu); +} + +unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + /* + * aperf/mperf already includes the smt gain + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return SCHED_LOAD_SCALE; + + return default_scale_smt_power(sd, cpu); +} + +#endif -- cgit v1.2.3 From 7c423e98856df9b941223a7e7845b2502ad84b00 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 09:38:24 +0200 Subject: sched: x86: Name old_perf in a unique way Silly percpu bits don't respect static.. Signed-off-by: Peter Zijlstra LKML-Reference: --- arch/x86/kernel/cpu/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c index 6c00a8f3cce..a640ae5ad20 100644 --- a/arch/x86/kernel/cpu/sched.c +++ b/arch/x86/kernel/cpu/sched.c @@ -8,11 +8,11 @@ #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct aperfmperf, old_perf); +static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); static unsigned long scale_aperfmperf(void) { - struct aperfmperf val, *old = &__get_cpu_var(old_perf); + struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); unsigned long ratio, flags; local_irq_save(flags); -- cgit v1.2.3 From 182a85f8a119c789610a9d464f4129ded9f3c107 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 13:24:49 +0200 Subject: sched: Disable wakeup balancing Sysbench thinks SD_BALANCE_WAKE is too agressive and kbuild doesn't really mind too much, SD_BALANCE_NEWIDLE picks up most of the slack. On a dual socket, quad core, dual thread nehalem system: sysbench (--num_threads=16): SD_BALANCE_WAKE-: 13982 tx/s SD_BALANCE_WAKE+: 15688 tx/s kbuild (-j16): SD_BALANCE_WAKE-: 47.648295846 seconds time elapsed ( +- 0.312% ) SD_BALANCE_WAKE+: 47.608607360 seconds time elapsed ( +- 0.026% ) (same within noise) Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 2 -- arch/mips/include/asm/mach-ip27/topology.h | 1 - arch/powerpc/include/asm/topology.h | 1 - arch/sh/include/asm/topology.h | 1 - arch/sparc/include/asm/topology_64.h | 1 - arch/x86/include/asm/topology.h | 2 +- 6 files changed, 1 insertion(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 569b9dafc78..d0141fbf51d 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -68,7 +68,6 @@ void build_cpu_to_node_map(void); | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ - | SD_BALANCE_WAKE \ | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ .balance_interval = 1, \ @@ -94,7 +93,6 @@ void build_cpu_to_node_map(void); | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ - | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 64, \ diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index d8332398f5b..23059170700 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -48,7 +48,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; .cache_nice_tries = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_BALANCE_WAKE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 1a2c9eb42a0..394edcbcce7 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -63,7 +63,6 @@ static inline int pcibus_to_node(struct pci_bus *bus) | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index a8cc564b703..f8c40cc6505 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -21,7 +21,6 @@ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_BALANCE_WAKE \ | SD_BALANCE_NEWIDLE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 10b979d1de2..26cd25c0839 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -57,7 +57,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 589f12383d7..6f0695d744b 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -141,7 +141,7 @@ extern unsigned long node_remap_size[]; | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_BALANCE_WAKE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ -- cgit v1.2.3