diff options
-rw-r--r-- | Documentation/devicetree/bindings/arm/pmu.txt | 3 | ||||
-rw-r--r-- | arch/arm/include/asm/pmu.h | 12 | ||||
-rw-r--r-- | arch/arm/include/asm/topology.h | 3 | ||||
-rw-r--r-- | arch/arm/kernel/hw_breakpoint.c | 3 | ||||
-rw-r--r-- | arch/arm/kernel/perf_event.c | 19 | ||||
-rw-r--r-- | arch/arm/kernel/perf_event_cpu.c | 117 | ||||
-rw-r--r-- | arch/arm/kernel/perf_event_v7.c | 57 | ||||
-rw-r--r-- | arch/arm/kernel/topology.c | 27 | ||||
-rw-r--r-- | include/linux/vmstat.h | 2 | ||||
-rw-r--r-- | mm/vmstat.c | 95 |
10 files changed, 289 insertions, 49 deletions
diff --git a/Documentation/devicetree/bindings/arm/pmu.txt b/Documentation/devicetree/bindings/arm/pmu.txt index 343781b9f24..4ce82d045a6 100644 --- a/Documentation/devicetree/bindings/arm/pmu.txt +++ b/Documentation/devicetree/bindings/arm/pmu.txt @@ -16,6 +16,9 @@ Required properties: "arm,arm1176-pmu" "arm,arm1136-pmu" - interrupts : 1 combined interrupt or 1 per core. +- cluster : a phandle to the cluster to which it belongs + If there are more than one cluster with same CPU type + then there should be separate PMU nodes per cluster. Example: diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h index f24edad26c7..0cd7824ca76 100644 --- a/arch/arm/include/asm/pmu.h +++ b/arch/arm/include/asm/pmu.h @@ -62,9 +62,19 @@ struct pmu_hw_events { raw_spinlock_t pmu_lock; }; +struct cpupmu_regs { + u32 pmc; + u32 pmcntenset; + u32 pmuseren; + u32 pmintenset; + u32 pmxevttype[8]; + u32 pmxevtcnt[8]; +}; + struct arm_pmu { struct pmu pmu; cpumask_t active_irqs; + cpumask_t valid_cpus; char *name; irqreturn_t (*handle_irq)(int irq_num, void *dev); void (*enable)(struct perf_event *event); @@ -81,6 +91,8 @@ struct arm_pmu { int (*request_irq)(struct arm_pmu *, irq_handler_t handler); void (*free_irq)(struct arm_pmu *); int (*map_event)(struct perf_event *event); + void (*save_regs)(struct arm_pmu *, struct cpupmu_regs *); + void (*restore_regs)(struct arm_pmu *, struct cpupmu_regs *); int num_events; atomic_t active_events; struct mutex reserve_mutex; diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 5692ba11322..983fa7c153a 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -26,6 +26,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS]; void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +int cluster_to_logical_mask(unsigned int socket_id, cpumask_t *cluster_mask); #ifdef CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE /* Common values for CPUs */ @@ -62,6 +63,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu); static inline void init_cpu_topology(void) { } static inline void store_cpu_topology(unsigned int cpuid) { } +static inline int cluster_to_logical_mask(unsigned int socket_id, + cpumask_t *cluster_mask) { return -EINVAL; } #endif diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 1fd749ee4a1..1b803117ed9 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -1049,7 +1049,8 @@ static struct notifier_block dbg_cpu_pm_nb = { static void __init pm_init(void) { - cpu_pm_register_notifier(&dbg_cpu_pm_nb); + if (has_ossr) + cpu_pm_register_notifier(&dbg_cpu_pm_nb); } #else static inline void pm_init(void) diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 8c3094d0f7b..d847c622a7b 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -12,6 +12,7 @@ */ #define pr_fmt(fmt) "hw perfevents: " fmt +#include <linux/cpumask.h> #include <linux/kernel.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> @@ -81,6 +82,9 @@ armpmu_map_event(struct perf_event *event, return armpmu_map_cache_event(cache_map, config); case PERF_TYPE_RAW: return armpmu_map_raw_event(raw_event_mask, config); + default: + if (event->attr.type >= PERF_TYPE_MAX) + return armpmu_map_raw_event(raw_event_mask, config); } return -ENOENT; @@ -158,6 +162,8 @@ armpmu_stop(struct perf_event *event, int flags) struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus)) + return; /* * ARM pmu always has to update the counter, so ignore * PERF_EF_UPDATE, see comments in armpmu_start(). @@ -174,6 +180,8 @@ static void armpmu_start(struct perf_event *event, int flags) struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus)) + return; /* * ARM pmu always has to reprogram the period, so ignore * PERF_EF_RELOAD, see the comment below. @@ -201,6 +209,9 @@ armpmu_del(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus)) + return; + armpmu_stop(event, PERF_EF_UPDATE); hw_events->events[idx] = NULL; clear_bit(idx, hw_events->used_mask); @@ -217,6 +228,10 @@ armpmu_add(struct perf_event *event, int flags) int idx; int err = 0; + /* An event following a process won't be stopped earlier */ + if (!cpumask_test_cpu(smp_processor_id(), &armpmu->valid_cpus)) + return 0; + perf_pmu_disable(event->pmu); /* If we don't have a space for the counter then finish early. */ @@ -416,6 +431,10 @@ static int armpmu_event_init(struct perf_event *event) int err = 0; atomic_t *active_events = &armpmu->active_events; + if (event->cpu != -1 && + !cpumask_test_cpu(event->cpu, &armpmu->valid_cpus)) + return -ENOENT; + /* does not support taken branch sampling */ if (has_branch_stack(event)) return -EOPNOTSUPP; diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c index 1f2740e3dbc..0b48a38e3cf 100644 --- a/arch/arm/kernel/perf_event_cpu.c +++ b/arch/arm/kernel/perf_event_cpu.c @@ -19,6 +19,7 @@ #define pr_fmt(fmt) "CPU PMU: " fmt #include <linux/bitmap.h> +#include <linux/cpu_pm.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/of.h> @@ -31,33 +32,36 @@ #include <asm/pmu.h> /* Set at runtime when we know what CPU type we are. */ -static struct arm_pmu *cpu_pmu; +static DEFINE_PER_CPU(struct arm_pmu *, cpu_pmu); static DEFINE_PER_CPU(struct perf_event * [ARMPMU_MAX_HWEVENTS], hw_events); static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(ARMPMU_MAX_HWEVENTS)], used_mask); static DEFINE_PER_CPU(struct pmu_hw_events, cpu_hw_events); +static DEFINE_PER_CPU(struct cpupmu_regs, cpu_pmu_regs); + /* * Despite the names, these two functions are CPU-specific and are used * by the OProfile/perf code. */ const char *perf_pmu_name(void) { - if (!cpu_pmu) + struct arm_pmu *pmu = per_cpu(cpu_pmu, 0); + if (!pmu) return NULL; - return cpu_pmu->name; + return pmu->name; } EXPORT_SYMBOL_GPL(perf_pmu_name); int perf_num_counters(void) { - int max_events = 0; + struct arm_pmu *pmu = per_cpu(cpu_pmu, 0); - if (cpu_pmu != NULL) - max_events = cpu_pmu->num_events; + if (!pmu) + return 0; - return max_events; + return pmu->num_events; } EXPORT_SYMBOL_GPL(perf_num_counters); @@ -75,11 +79,13 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu) { int i, irq, irqs; struct platform_device *pmu_device = cpu_pmu->plat_device; + int cpu = -1; irqs = min(pmu_device->num_resources, num_possible_cpus()); for (i = 0; i < irqs; ++i) { - if (!cpumask_test_and_clear_cpu(i, &cpu_pmu->active_irqs)) + cpu = cpumask_next(cpu, &cpu_pmu->valid_cpus); + if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs)) continue; irq = platform_get_irq(pmu_device, i); if (irq >= 0) @@ -91,6 +97,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) { int i, err, irq, irqs; struct platform_device *pmu_device = cpu_pmu->plat_device; + int cpu = -1; if (!pmu_device) return -ENODEV; @@ -103,6 +110,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) for (i = 0; i < irqs; ++i) { err = 0; + cpu = cpumask_next(cpu, &cpu_pmu->valid_cpus); irq = platform_get_irq(pmu_device, i); if (irq < 0) continue; @@ -112,7 +120,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) * assume that we're running on a uniprocessor machine and * continue. Otherwise, continue without this interrupt. */ - if (irq_set_affinity(irq, cpumask_of(i)) && irqs > 1) { + if (irq_set_affinity(irq, cpumask_of(cpu)) && irqs > 1) { pr_warning("unable to set irq affinity (irq=%d, cpu=%u)\n", irq, i); continue; @@ -126,7 +134,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) return err; } - cpumask_set_cpu(i, &cpu_pmu->active_irqs); + cpumask_set_cpu(cpu, &cpu_pmu->active_irqs); } return 0; @@ -135,7 +143,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) static void cpu_pmu_init(struct arm_pmu *cpu_pmu) { int cpu; - for_each_possible_cpu(cpu) { + for_each_cpu_mask(cpu, cpu_pmu->valid_cpus) { struct pmu_hw_events *events = &per_cpu(cpu_hw_events, cpu); events->events = per_cpu(hw_events, cpu); events->used_mask = per_cpu(used_mask, cpu); @@ -148,7 +156,7 @@ static void cpu_pmu_init(struct arm_pmu *cpu_pmu) /* Ensure the PMU has sane values out of reset. */ if (cpu_pmu->reset) - on_each_cpu(cpu_pmu->reset, cpu_pmu, 1); + on_each_cpu_mask(&cpu_pmu->valid_cpus, cpu_pmu->reset, cpu_pmu, 1); } /* @@ -160,21 +168,46 @@ static void cpu_pmu_init(struct arm_pmu *cpu_pmu) static int __cpuinit cpu_pmu_notify(struct notifier_block *b, unsigned long action, void *hcpu) { + struct arm_pmu *pmu = per_cpu(cpu_pmu, (long)hcpu); + if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING) return NOTIFY_DONE; - if (cpu_pmu && cpu_pmu->reset) - cpu_pmu->reset(cpu_pmu); + if (pmu && pmu->reset) + pmu->reset(pmu); else return NOTIFY_DONE; return NOTIFY_OK; } +static int cpu_pmu_pm_notify(struct notifier_block *b, + unsigned long action, void *hcpu) +{ + int cpu = smp_processor_id(); + struct arm_pmu *pmu = per_cpu(cpu_pmu, cpu); + struct cpupmu_regs *pmuregs = &per_cpu(cpu_pmu_regs, cpu); + + if (!pmu) + return NOTIFY_DONE; + + if (action == CPU_PM_ENTER && pmu->save_regs) { + pmu->save_regs(pmu, pmuregs); + } else if (action == CPU_PM_EXIT && pmu->restore_regs) { + pmu->restore_regs(pmu, pmuregs); + } + + return NOTIFY_OK; +} + static struct notifier_block __cpuinitdata cpu_pmu_hotplug_notifier = { .notifier_call = cpu_pmu_notify, }; +static struct notifier_block __cpuinitdata cpu_pmu_pm_notifier = { + .notifier_call = cpu_pmu_pm_notify, +}; + /* * PMU platform driver and devicetree bindings. */ @@ -246,6 +279,9 @@ static int probe_current_pmu(struct arm_pmu *pmu) } } + /* assume PMU support all the CPUs in this case */ + cpumask_setall(&pmu->valid_cpus); + put_cpu(); return ret; } @@ -253,15 +289,10 @@ static int probe_current_pmu(struct arm_pmu *pmu) static int cpu_pmu_device_probe(struct platform_device *pdev) { const struct of_device_id *of_id; - int (*init_fn)(struct arm_pmu *); struct device_node *node = pdev->dev.of_node; struct arm_pmu *pmu; - int ret = -ENODEV; - - if (cpu_pmu) { - pr_info("attempt to register multiple PMU devices!"); - return -ENOSPC; - } + int ret = 0; + int cpu; pmu = kzalloc(sizeof(struct arm_pmu), GFP_KERNEL); if (!pmu) { @@ -270,8 +301,28 @@ static int cpu_pmu_device_probe(struct platform_device *pdev) } if (node && (of_id = of_match_node(cpu_pmu_of_device_ids, pdev->dev.of_node))) { - init_fn = of_id->data; - ret = init_fn(pmu); + smp_call_func_t init_fn = (smp_call_func_t)of_id->data; + struct device_node *ncluster; + int cluster = -1; + cpumask_t sibling_mask; + + ncluster = of_parse_phandle(node, "cluster", 0); + if (ncluster) { + int len; + const u32 *hwid; + hwid = of_get_property(ncluster, "reg", &len); + if (hwid && len == 4) + cluster = be32_to_cpup(hwid); + } + /* set sibling mask to all cpu mask if socket is not specified */ + if (cluster == -1 || + cluster_to_logical_mask(cluster, &sibling_mask)) + cpumask_setall(&sibling_mask); + + smp_call_function_any(&sibling_mask, init_fn, pmu, 1); + + /* now set the valid_cpus after init */ + cpumask_copy(&pmu->valid_cpus, &sibling_mask); } else { ret = probe_current_pmu(pmu); } @@ -281,10 +332,12 @@ static int cpu_pmu_device_probe(struct platform_device *pdev) goto out_free; } - cpu_pmu = pmu; - cpu_pmu->plat_device = pdev; - cpu_pmu_init(cpu_pmu); - ret = armpmu_register(cpu_pmu, PERF_TYPE_RAW); + for_each_cpu_mask(cpu, pmu->valid_cpus) + per_cpu(cpu_pmu, cpu) = pmu; + + pmu->plat_device = pdev; + cpu_pmu_init(pmu); + ret = armpmu_register(pmu, -1); if (!ret) return 0; @@ -313,9 +366,17 @@ static int __init register_pmu_driver(void) if (err) return err; + err = cpu_pm_register_notifier(&cpu_pmu_pm_notifier); + if (err) { + unregister_cpu_notifier(&cpu_pmu_hotplug_notifier); + return err; + } + err = platform_driver_register(&cpu_pmu_driver); - if (err) + if (err) { + cpu_pm_unregister_notifier(&cpu_pmu_pm_notifier); unregister_cpu_notifier(&cpu_pmu_hotplug_notifier); + } return err; } diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 039cffb053a..654db5030c3 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -950,6 +950,51 @@ static void armv7_pmnc_dump_regs(struct arm_pmu *cpu_pmu) } #endif +static void armv7pmu_save_regs(struct arm_pmu *cpu_pmu, + struct cpupmu_regs *regs) +{ + unsigned int cnt; + asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r" (regs->pmc)); + if (!(regs->pmc & ARMV7_PMNC_E)) + return; + + asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r" (regs->pmcntenset)); + asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r" (regs->pmuseren)); + asm volatile("mrc p15, 0, %0, c9, c14, 1" : "=r" (regs->pmintenset)); + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (regs->pmxevtcnt[0])); + for (cnt = ARMV7_IDX_COUNTER0; + cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) { + armv7_pmnc_select_counter(cnt); + asm volatile("mrc p15, 0, %0, c9, c13, 1" + : "=r"(regs->pmxevttype[cnt])); + asm volatile("mrc p15, 0, %0, c9, c13, 2" + : "=r"(regs->pmxevtcnt[cnt])); + } + return; +} + +static void armv7pmu_restore_regs(struct arm_pmu *cpu_pmu, + struct cpupmu_regs *regs) +{ + unsigned int cnt; + if (!(regs->pmc & ARMV7_PMNC_E)) + return; + + asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (regs->pmcntenset)); + asm volatile("mcr p15, 0, %0, c9, c14, 0" : : "r" (regs->pmuseren)); + asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (regs->pmintenset)); + asm volatile("mcr p15, 0, %0, c9, c13, 0" : : "r" (regs->pmxevtcnt[0])); + for (cnt = ARMV7_IDX_COUNTER0; + cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) { + armv7_pmnc_select_counter(cnt); + asm volatile("mcr p15, 0, %0, c9, c13, 1" + : : "r"(regs->pmxevttype[cnt])); + asm volatile("mcr p15, 0, %0, c9, c13, 2" + : : "r"(regs->pmxevtcnt[cnt])); + } + asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (regs->pmc)); +} + static void armv7pmu_enable_event(struct perf_event *event) { unsigned long flags; @@ -1223,6 +1268,8 @@ static void armv7pmu_init(struct arm_pmu *cpu_pmu) cpu_pmu->start = armv7pmu_start; cpu_pmu->stop = armv7pmu_stop; cpu_pmu->reset = armv7pmu_reset; + cpu_pmu->save_regs = armv7pmu_save_regs; + cpu_pmu->restore_regs = armv7pmu_restore_regs; cpu_pmu->max_period = (1LLU << 32) - 1; }; @@ -1240,7 +1287,7 @@ static u32 armv7_read_num_pmnc_events(void) static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A8"; + cpu_pmu->name = "ARMv7_Cortex_A8"; cpu_pmu->map_event = armv7_a8_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); return 0; @@ -1249,7 +1296,7 @@ static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu) static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A9"; + cpu_pmu->name = "ARMv7_Cortex_A9"; cpu_pmu->map_event = armv7_a9_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); return 0; @@ -1258,7 +1305,7 @@ static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu) static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A5"; + cpu_pmu->name = "ARMv7_Cortex_A5"; cpu_pmu->map_event = armv7_a5_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); return 0; @@ -1267,7 +1314,7 @@ static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu) static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A15"; + cpu_pmu->name = "ARMv7_Cortex_A15"; cpu_pmu->map_event = armv7_a15_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); cpu_pmu->set_event_filter = armv7pmu_set_event_filter; @@ -1277,7 +1324,7 @@ static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu) static int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu) { armv7pmu_init(cpu_pmu); - cpu_pmu->name = "ARMv7 Cortex-A7"; + cpu_pmu->name = "ARMv7_Cortex_A7"; cpu_pmu->map_event = armv7_a7_map_event; cpu_pmu->num_events = armv7_read_num_pmnc_events(); cpu_pmu->set_event_filter = armv7pmu_set_event_filter; diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 4459c0b4e91..677da58d9e8 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -398,6 +398,33 @@ void __init arch_get_hmp_domains(struct list_head *hmp_domains_list) /* + * cluster_to_logical_mask - return cpu logical mask of CPUs in a cluster + * @socket_id: cluster HW identifier + * @cluster_mask: the cpumask location to be initialized, modified by the + * function only if return value == 0 + * + * Return: + * + * 0 on success + * -EINVAL if cluster_mask is NULL or there is no record matching socket_id + */ +int cluster_to_logical_mask(unsigned int socket_id, cpumask_t *cluster_mask) +{ + int cpu; + + if (!cluster_mask) + return -EINVAL; + + for_each_online_cpu(cpu) + if (socket_id == topology_physical_package_id(cpu)) { + cpumask_copy(cluster_mask, topology_core_cpumask(cpu)); + return 0; + } + + return -EINVAL; +} + +/* * init_cpu_topology is called at boot when only one cpu is running * which prevent simultaneous write access to cpu_topology array */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c586679b6fe..a30ab7910ff 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -198,7 +198,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); -void refresh_cpu_vm_stats(int); +bool refresh_cpu_vm_stats(int); void refresh_zone_stat_thresholds(void); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); diff --git a/mm/vmstat.c b/mm/vmstat.c index f42745e6578..b916a43a6b3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/cpu.h> +#include <linux/cpumask.h> #include <linux/vmstat.h> #include <linux/sched.h> #include <linux/math64.h> @@ -432,11 +433,12 @@ EXPORT_SYMBOL(dec_zone_page_state); * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. */ -void refresh_cpu_vm_stats(int cpu) +bool refresh_cpu_vm_stats(int cpu) { struct zone *zone; int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + bool vm_activity = false; for_each_populated_zone(zone) { struct per_cpu_pageset *p; @@ -483,14 +485,21 @@ void refresh_cpu_vm_stats(int cpu) if (p->expire) continue; - if (p->pcp.count) + if (p->pcp.count) { + vm_activity = true; drain_zone_pages(zone, &p->pcp); + } #endif } for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (global_diff[i]) + if (global_diff[i]) { atomic_long_add(global_diff[i], &vm_stat[i]); + vm_activity = true; + } + + return vm_activity; + } /* @@ -1174,22 +1183,72 @@ static const struct file_operations proc_vmstat_file_operations = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; +static struct cpumask vmstat_off_cpus; +struct delayed_work vmstat_monitor_work; -static void vmstat_update(struct work_struct *w) +static inline bool need_vmstat(int cpu) { - refresh_cpu_vm_stats(smp_processor_id()); - schedule_delayed_work(&__get_cpu_var(vmstat_work), - round_jiffies_relative(sysctl_stat_interval)); + struct zone *zone; + int i; + + for_each_populated_zone(zone) { + struct per_cpu_pageset *p; + + p = per_cpu_ptr(zone->pageset, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (p->vm_stat_diff[i]) + return true; + + if (zone_to_nid(zone) != numa_node_id() && p->pcp.count) + return true; + } + + return false; } -static void __cpuinit start_cpu_timer(int cpu) +static void vmstat_update(struct work_struct *w); + +static void start_cpu_timer(int cpu) { struct delayed_work *work = &per_cpu(vmstat_work, cpu); - INIT_DEFERRABLE_WORK(work, vmstat_update); + cpumask_clear_cpu(cpu, &vmstat_off_cpus); schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } +static void __cpuinit setup_cpu_timer(int cpu) +{ + struct delayed_work *work = &per_cpu(vmstat_work, cpu); + + INIT_DEFERRABLE_WORK(work, vmstat_update); + start_cpu_timer(cpu); +} + +static void vmstat_update_monitor(struct work_struct *w) +{ + int cpu; + + for_each_cpu_and(cpu, &vmstat_off_cpus, cpu_online_mask) + if (need_vmstat(cpu)) + start_cpu_timer(cpu); + + queue_delayed_work(system_unbound_wq, &vmstat_monitor_work, + round_jiffies_relative(sysctl_stat_interval)); +} + + +static void vmstat_update(struct work_struct *w) +{ + int cpu = smp_processor_id(); + + if (likely(refresh_cpu_vm_stats(cpu))) + schedule_delayed_work(&__get_cpu_var(vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + else + cpumask_set_cpu(cpu, &vmstat_off_cpus); +} + /* * Use the cpu notifier to insure that the thresholds are recalculated * when necessary. @@ -1204,17 +1263,19 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: refresh_zone_stat_thresholds(); - start_cpu_timer(cpu); + setup_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - per_cpu(vmstat_work, cpu).work.func = NULL; + if (!cpumask_test_cpu(cpu, &vmstat_off_cpus)) { + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + per_cpu(vmstat_work, cpu).work.func = NULL; + } break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - start_cpu_timer(cpu); + setup_cpu_timer(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1237,8 +1298,14 @@ static int __init setup_vmstat(void) register_cpu_notifier(&vmstat_notifier); + INIT_DEFERRABLE_WORK(&vmstat_monitor_work, + vmstat_update_monitor); + queue_delayed_work(system_unbound_wq, + &vmstat_monitor_work, + round_jiffies_relative(HZ)); + for_each_online_cpu(cpu) - start_cpu_timer(cpu); + setup_cpu_timer(cpu); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); |