diff options
author | Jon Medhurst <tixy@linaro.org> | 2013-05-09 13:58:50 +0100 |
---|---|---|
committer | Jon Medhurst <tixy@linaro.org> | 2013-05-09 13:58:50 +0100 |
commit | 3f18385eb14528024a3c635a28a4acc4837b90e6 (patch) | |
tree | 2eef1f12e9997566f7776ee23a28dcc5645cadb9 | |
parent | c1be5a5b1b355d40e6cf79cc979eb66dafa24ad1 (diff) | |
parent | a7eb7c6f9a657a01a8359edae31bbeacd18b072c (diff) |
Merge branch 'mcpm' of git://git.linaro.org/people/nico/linux into mcpm-merge-nico
-rw-r--r-- | Documentation/arm/cluster-pm-race-avoidance.txt | 498 | ||||
-rw-r--r-- | Documentation/arm/vlocks.txt | 211 | ||||
-rw-r--r-- | arch/arm/Kconfig | 8 | ||||
-rw-r--r-- | arch/arm/common/Makefile | 1 | ||||
-rw-r--r-- | arch/arm/common/mcpm_entry.c | 263 | ||||
-rw-r--r-- | arch/arm/common/mcpm_head.S | 219 | ||||
-rw-r--r-- | arch/arm/common/mcpm_platsmp.c | 92 | ||||
-rw-r--r-- | arch/arm/common/vlock.S | 108 | ||||
-rw-r--r-- | arch/arm/common/vlock.h | 29 | ||||
-rw-r--r-- | arch/arm/include/asm/cacheflush.h | 75 | ||||
-rw-r--r-- | arch/arm/include/asm/mcpm.h | 209 | ||||
-rw-r--r-- | arch/arm/kernel/asm-offsets.c | 4 |
12 files changed, 1717 insertions, 0 deletions
diff --git a/Documentation/arm/cluster-pm-race-avoidance.txt b/Documentation/arm/cluster-pm-race-avoidance.txt new file mode 100644 index 00000000000..750b6fc24af --- /dev/null +++ b/Documentation/arm/cluster-pm-race-avoidance.txt @@ -0,0 +1,498 @@ +Cluster-wide Power-up/power-down race avoidance algorithm +========================================================= + +This file documents the algorithm which is used to coordinate CPU and +cluster setup and teardown operations and to manage hardware coherency +controls safely. + +The section "Rationale" explains what the algorithm is for and why it is +needed. "Basic model" explains general concepts using a simplified view +of the system. The other sections explain the actual details of the +algorithm in use. + + +Rationale +--------- + +In a system containing multiple CPUs, it is desirable to have the +ability to turn off individual CPUs when the system is idle, reducing +power consumption and thermal dissipation. + +In a system containing multiple clusters of CPUs, it is also desirable +to have the ability to turn off entire clusters. + +Turning entire clusters off and on is a risky business, because it +involves performing potentially destructive operations affecting a group +of independently running CPUs, while the OS continues to run. This +means that we need some coordination in order to ensure that critical +cluster-level operations are only performed when it is truly safe to do +so. + +Simple locking may not be sufficient to solve this problem, because +mechanisms like Linux spinlocks may rely on coherency mechanisms which +are not immediately enabled when a cluster powers up. Since enabling or +disabling those mechanisms may itself be a non-atomic operation (such as +writing some hardware registers and invalidating large caches), other +methods of coordination are required in order to guarantee safe +power-down and power-up at the cluster level. + +The mechanism presented in this document describes a coherent memory +based protocol for performing the needed coordination. It aims to be as +lightweight as possible, while providing the required safety properties. + + +Basic model +----------- + +Each cluster and CPU is assigned a state, as follows: + + DOWN + COMING_UP + UP + GOING_DOWN + + +---------> UP ----------+ + | v + + COMING_UP GOING_DOWN + + ^ | + +--------- DOWN <--------+ + + +DOWN: The CPU or cluster is not coherent, and is either powered off or + suspended, or is ready to be powered off or suspended. + +COMING_UP: The CPU or cluster has committed to moving to the UP state. + It may be part way through the process of initialisation and + enabling coherency. + +UP: The CPU or cluster is active and coherent at the hardware + level. A CPU in this state is not necessarily being used + actively by the kernel. + +GOING_DOWN: The CPU or cluster has committed to moving to the DOWN + state. It may be part way through the process of teardown and + coherency exit. + + +Each CPU has one of these states assigned to it at any point in time. +The CPU states are described in the "CPU state" section, below. + +Each cluster is also assigned a state, but it is necessary to split the +state value into two parts (the "cluster" state and "inbound" state) and +to introduce additional states in order to avoid races between different +CPUs in the cluster simultaneously modifying the state. The cluster- +level states are described in the "Cluster state" section. + +To help distinguish the CPU states from cluster states in this +discussion, the state names are given a CPU_ prefix for the CPU states, +and a CLUSTER_ or INBOUND_ prefix for the cluster states. + + +CPU state +--------- + +In this algorithm, each individual core in a multi-core processor is +referred to as a "CPU". CPUs are assumed to be single-threaded: +therefore, a CPU can only be doing one thing at a single point in time. + +This means that CPUs fit the basic model closely. + +The algorithm defines the following states for each CPU in the system: + + CPU_DOWN + CPU_COMING_UP + CPU_UP + CPU_GOING_DOWN + + cluster setup and + CPU setup complete policy decision + +-----------> CPU_UP ------------+ + | v + + CPU_COMING_UP CPU_GOING_DOWN + + ^ | + +----------- CPU_DOWN <----------+ + policy decision CPU teardown complete + or hardware event + + +The definitions of the four states correspond closely to the states of +the basic model. + +Transitions between states occur as follows. + +A trigger event (spontaneous) means that the CPU can transition to the +next state as a result of making local progress only, with no +requirement for any external event to happen. + + +CPU_DOWN: + + A CPU reaches the CPU_DOWN state when it is ready for + power-down. On reaching this state, the CPU will typically + power itself down or suspend itself, via a WFI instruction or a + firmware call. + + Next state: CPU_COMING_UP + Conditions: none + + Trigger events: + + a) an explicit hardware power-up operation, resulting + from a policy decision on another CPU; + + b) a hardware event, such as an interrupt. + + +CPU_COMING_UP: + + A CPU cannot start participating in hardware coherency until the + cluster is set up and coherent. If the cluster is not ready, + then the CPU will wait in the CPU_COMING_UP state until the + cluster has been set up. + + Next state: CPU_UP + Conditions: The CPU's parent cluster must be in CLUSTER_UP. + Trigger events: Transition of the parent cluster to CLUSTER_UP. + + Refer to the "Cluster state" section for a description of the + CLUSTER_UP state. + + +CPU_UP: + When a CPU reaches the CPU_UP state, it is safe for the CPU to + start participating in local coherency. + + This is done by jumping to the kernel's CPU resume code. + + Note that the definition of this state is slightly different + from the basic model definition: CPU_UP does not mean that the + CPU is coherent yet, but it does mean that it is safe to resume + the kernel. The kernel handles the rest of the resume + procedure, so the remaining steps are not visible as part of the + race avoidance algorithm. + + The CPU remains in this state until an explicit policy decision + is made to shut down or suspend the CPU. + + Next state: CPU_GOING_DOWN + Conditions: none + Trigger events: explicit policy decision + + +CPU_GOING_DOWN: + + While in this state, the CPU exits coherency, including any + operations required to achieve this (such as cleaning data + caches). + + Next state: CPU_DOWN + Conditions: local CPU teardown complete + Trigger events: (spontaneous) + + +Cluster state +------------- + +A cluster is a group of connected CPUs with some common resources. +Because a cluster contains multiple CPUs, it can be doing multiple +things at the same time. This has some implications. In particular, a +CPU can start up while another CPU is tearing the cluster down. + +In this discussion, the "outbound side" is the view of the cluster state +as seen by a CPU tearing the cluster down. The "inbound side" is the +view of the cluster state as seen by a CPU setting the CPU up. + +In order to enable safe coordination in such situations, it is important +that a CPU which is setting up the cluster can advertise its state +independently of the CPU which is tearing down the cluster. For this +reason, the cluster state is split into two parts: + + "cluster" state: The global state of the cluster; or the state + on the outbound side: + + CLUSTER_DOWN + CLUSTER_UP + CLUSTER_GOING_DOWN + + "inbound" state: The state of the cluster on the inbound side. + + INBOUND_NOT_COMING_UP + INBOUND_COMING_UP + + + The different pairings of these states results in six possible + states for the cluster as a whole: + + CLUSTER_UP + +==========> INBOUND_NOT_COMING_UP -------------+ + # | + | + CLUSTER_UP <----+ | + INBOUND_COMING_UP | v + + ^ CLUSTER_GOING_DOWN CLUSTER_GOING_DOWN + # INBOUND_COMING_UP <=== INBOUND_NOT_COMING_UP + + CLUSTER_DOWN | | + INBOUND_COMING_UP <----+ | + | + ^ | + +=========== CLUSTER_DOWN <------------+ + INBOUND_NOT_COMING_UP + + Transitions -----> can only be made by the outbound CPU, and + only involve changes to the "cluster" state. + + Transitions ===##> can only be made by the inbound CPU, and only + involve changes to the "inbound" state, except where there is no + further transition possible on the outbound side (i.e., the + outbound CPU has put the cluster into the CLUSTER_DOWN state). + + The race avoidance algorithm does not provide a way to determine + which exact CPUs within the cluster play these roles. This must + be decided in advance by some other means. Refer to the section + "Last man and first man selection" for more explanation. + + + CLUSTER_DOWN/INBOUND_NOT_COMING_UP is the only state where the + cluster can actually be powered down. + + The parallelism of the inbound and outbound CPUs is observed by + the existence of two different paths from CLUSTER_GOING_DOWN/ + INBOUND_NOT_COMING_UP (corresponding to GOING_DOWN in the basic + model) to CLUSTER_DOWN/INBOUND_COMING_UP (corresponding to + COMING_UP in the basic model). The second path avoids cluster + teardown completely. + + CLUSTER_UP/INBOUND_COMING_UP is equivalent to UP in the basic + model. The final transition to CLUSTER_UP/INBOUND_NOT_COMING_UP + is trivial and merely resets the state machine ready for the + next cycle. + + Details of the allowable transitions follow. + + The next state in each case is notated + + <cluster state>/<inbound state> (<transitioner>) + + where the <transitioner> is the side on which the transition + can occur; either the inbound or the outbound side. + + +CLUSTER_DOWN/INBOUND_NOT_COMING_UP: + + Next state: CLUSTER_DOWN/INBOUND_COMING_UP (inbound) + Conditions: none + Trigger events: + + a) an explicit hardware power-up operation, resulting + from a policy decision on another CPU; + + b) a hardware event, such as an interrupt. + + +CLUSTER_DOWN/INBOUND_COMING_UP: + + In this state, an inbound CPU sets up the cluster, including + enabling of hardware coherency at the cluster level and any + other operations (such as cache invalidation) which are required + in order to achieve this. + + The purpose of this state is to do sufficient cluster-level + setup to enable other CPUs in the cluster to enter coherency + safely. + + Next state: CLUSTER_UP/INBOUND_COMING_UP (inbound) + Conditions: cluster-level setup and hardware coherency complete + Trigger events: (spontaneous) + + +CLUSTER_UP/INBOUND_COMING_UP: + + Cluster-level setup is complete and hardware coherency is + enabled for the cluster. Other CPUs in the cluster can safely + enter coherency. + + This is a transient state, leading immediately to + CLUSTER_UP/INBOUND_NOT_COMING_UP. All other CPUs on the cluster + should consider treat these two states as equivalent. + + Next state: CLUSTER_UP/INBOUND_NOT_COMING_UP (inbound) + Conditions: none + Trigger events: (spontaneous) + + +CLUSTER_UP/INBOUND_NOT_COMING_UP: + + Cluster-level setup is complete and hardware coherency is + enabled for the cluster. Other CPUs in the cluster can safely + enter coherency. + + The cluster will remain in this state until a policy decision is + made to power the cluster down. + + Next state: CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP (outbound) + Conditions: none + Trigger events: policy decision to power down the cluster + + +CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP: + + An outbound CPU is tearing the cluster down. The selected CPU + must wait in this state until all CPUs in the cluster are in the + CPU_DOWN state. + + When all CPUs are in the CPU_DOWN state, the cluster can be torn + down, for example by cleaning data caches and exiting + cluster-level coherency. + + To avoid wasteful unnecessary teardown operations, the outbound + should check the inbound cluster state for asynchronous + transitions to INBOUND_COMING_UP. Alternatively, individual + CPUs can be checked for entry into CPU_COMING_UP or CPU_UP. + + + Next states: + + CLUSTER_DOWN/INBOUND_NOT_COMING_UP (outbound) + Conditions: cluster torn down and ready to power off + Trigger events: (spontaneous) + + CLUSTER_GOING_DOWN/INBOUND_COMING_UP (inbound) + Conditions: none + Trigger events: + + a) an explicit hardware power-up operation, + resulting from a policy decision on another + CPU; + + b) a hardware event, such as an interrupt. + + +CLUSTER_GOING_DOWN/INBOUND_COMING_UP: + + The cluster is (or was) being torn down, but another CPU has + come online in the meantime and is trying to set up the cluster + again. + + If the outbound CPU observes this state, it has two choices: + + a) back out of teardown, restoring the cluster to the + CLUSTER_UP state; + + b) finish tearing the cluster down and put the cluster + in the CLUSTER_DOWN state; the inbound CPU will + set up the cluster again from there. + + Choice (a) permits the removal of some latency by avoiding + unnecessary teardown and setup operations in situations where + the cluster is not really going to be powered down. + + + Next states: + + CLUSTER_UP/INBOUND_COMING_UP (outbound) + Conditions: cluster-level setup and hardware + coherency complete + Trigger events: (spontaneous) + + CLUSTER_DOWN/INBOUND_COMING_UP (outbound) + Conditions: cluster torn down and ready to power off + Trigger events: (spontaneous) + + +Last man and First man selection +-------------------------------- + +The CPU which performs cluster tear-down operations on the outbound side +is commonly referred to as the "last man". + +The CPU which performs cluster setup on the inbound side is commonly +referred to as the "first man". + +The race avoidance algorithm documented above does not provide a +mechanism to choose which CPUs should play these roles. + + +Last man: + +When shutting down the cluster, all the CPUs involved are initially +executing Linux and hence coherent. Therefore, ordinary spinlocks can +be used to select a last man safely, before the CPUs become +non-coherent. + + +First man: + +Because CPUs may power up asynchronously in response to external wake-up +events, a dynamic mechanism is needed to make sure that only one CPU +attempts to play the first man role and do the cluster-level +initialisation: any other CPUs must wait for this to complete before +proceeding. + +Cluster-level initialisation may involve actions such as configuring +coherency controls in the bus fabric. + +The current implementation in mcpm_head.S uses a separate mutual exclusion +mechanism to do this arbitration. This mechanism is documented in +detail in vlocks.txt. + + +Features and Limitations +------------------------ + +Implementation: + + The current ARM-based implementation is split between + arch/arm/common/mcpm_head.S (low-level inbound CPU operations) and + arch/arm/common/mcpm_entry.c (everything else): + + __mcpm_cpu_going_down() signals the transition of a CPU to the + CPU_GOING_DOWN state. + + __mcpm_cpu_down() signals the transition of a CPU to the CPU_DOWN + state. + + A CPU transitions to CPU_COMING_UP and then to CPU_UP via the + low-level power-up code in mcpm_head.S. This could + involve CPU-specific setup code, but in the current + implementation it does not. + + __mcpm_outbound_enter_critical() and __mcpm_outbound_leave_critical() + handle transitions from CLUSTER_UP to CLUSTER_GOING_DOWN + and from there to CLUSTER_DOWN or back to CLUSTER_UP (in + the case of an aborted cluster power-down). + + These functions are more complex than the __mcpm_cpu_*() + functions due to the extra inter-CPU coordination which + is needed for safe transitions at the cluster level. + + A cluster transitions from CLUSTER_DOWN back to CLUSTER_UP via + the low-level power-up code in mcpm_head.S. This + typically involves platform-specific setup code, + provided by the platform-specific power_up_setup + function registered via mcpm_sync_init. + +Deep topologies: + + As currently described and implemented, the algorithm does not + support CPU topologies involving more than two levels (i.e., + clusters of clusters are not supported). The algorithm could be + extended by replicating the cluster-level states for the + additional topological levels, and modifying the transition + rules for the intermediate (non-outermost) cluster levels. + + +Colophon +-------- + +Originally created and documented by Dave Martin for Linaro Limited, in +collaboration with Nicolas Pitre and Achin Gupta. + +Copyright (C) 2012-2013 Linaro Limited +Distributed under the terms of Version 2 of the GNU General Public +License, as defined in linux/COPYING. diff --git a/Documentation/arm/vlocks.txt b/Documentation/arm/vlocks.txt new file mode 100644 index 00000000000..415960a9bab --- /dev/null +++ b/Documentation/arm/vlocks.txt @@ -0,0 +1,211 @@ +vlocks for Bare-Metal Mutual Exclusion +====================================== + +Voting Locks, or "vlocks" provide a simple low-level mutual exclusion +mechanism, with reasonable but minimal requirements on the memory +system. + +These are intended to be used to coordinate critical activity among CPUs +which are otherwise non-coherent, in situations where the hardware +provides no other mechanism to support this and ordinary spinlocks +cannot be used. + + +vlocks make use of the atomicity provided by the memory system for +writes to a single memory location. To arbitrate, every CPU "votes for +itself", by storing a unique number to a common memory location. The +final value seen in that memory location when all the votes have been +cast identifies the winner. + +In order to make sure that the election produces an unambiguous result +in finite time, a CPU will only enter the election in the first place if +no winner has been chosen and the election does not appear to have +started yet. + + +Algorithm +--------- + +The easiest way to explain the vlocks algorithm is with some pseudo-code: + + + int currently_voting[NR_CPUS] = { 0, }; + int last_vote = -1; /* no votes yet */ + + bool vlock_trylock(int this_cpu) + { + /* signal our desire to vote */ + currently_voting[this_cpu] = 1; + if (last_vote != -1) { + /* someone already volunteered himself */ + currently_voting[this_cpu] = 0; + return false; /* not ourself */ + } + + /* let's suggest ourself */ + last_vote = this_cpu; + currently_voting[this_cpu] = 0; + + /* then wait until everyone else is done voting */ + for_each_cpu(i) { + while (currently_voting[i] != 0) + /* wait */; + } + + /* result */ + if (last_vote == this_cpu) + return true; /* we won */ + return false; + } + + bool vlock_unlock(void) + { + last_vote = -1; + } + + +The currently_voting[] array provides a way for the CPUs to determine +whether an election is in progress, and plays a role analogous to the +"entering" array in Lamport's bakery algorithm [1]. + +However, once the election has started, the underlying memory system +atomicity is used to pick the winner. This avoids the need for a static +priority rule to act as a tie-breaker, or any counters which could +overflow. + +As long as the last_vote variable is globally visible to all CPUs, it +will contain only one value that won't change once every CPU has cleared +its currently_voting flag. + + +Features and limitations +------------------------ + + * vlocks are not intended to be fair. In the contended case, it is the + _last_ CPU which attempts to get the lock which will be most likely + to win. + + vlocks are therefore best suited to situations where it is necessary + to pick a unique winner, but it does not matter which CPU actually + wins. + + * Like other similar mechanisms, vlocks will not scale well to a large + number of CPUs. + + vlocks can be cascaded in a voting hierarchy to permit better scaling + if necessary, as in the following hypothetical example for 4096 CPUs: + + /* first level: local election */ + my_town = towns[(this_cpu >> 4) & 0xf]; + I_won = vlock_trylock(my_town, this_cpu & 0xf); + if (I_won) { + /* we won the town election, let's go for the state */ + my_state = states[(this_cpu >> 8) & 0xf]; + I_won = vlock_lock(my_state, this_cpu & 0xf)); + if (I_won) { + /* and so on */ + I_won = vlock_lock(the_whole_country, this_cpu & 0xf]; + if (I_won) { + /* ... */ + } + vlock_unlock(the_whole_country); + } + vlock_unlock(my_state); + } + vlock_unlock(my_town); + + +ARM implementation +------------------ + +The current ARM implementation [2] contains some optimisations beyond +the basic algorithm: + + * By packing the members of the currently_voting array close together, + we can read the whole array in one transaction (providing the number + of CPUs potentially contending the lock is small enough). This + reduces the number of round-trips required to external memory. + + In the ARM implementation, this means that we can use a single load + and comparison: + + LDR Rt, [Rn] + CMP Rt, #0 + + ...in place of code equivalent to: + + LDRB Rt, [Rn] + CMP Rt, #0 + LDRBEQ Rt, [Rn, #1] + CMPEQ Rt, #0 + LDRBEQ Rt, [Rn, #2] + CMPEQ Rt, #0 + LDRBEQ Rt, [Rn, #3] + CMPEQ Rt, #0 + + This cuts down on the fast-path latency, as well as potentially + reducing bus contention in contended cases. + + The optimisation relies on the fact that the ARM memory system + guarantees coherency between overlapping memory accesses of + different sizes, similarly to many other architectures. Note that + we do not care which element of currently_voting appears in which + bits of Rt, so there is no need to worry about endianness in this + optimisation. + + If there are too many CPUs to read the currently_voting array in + one transaction then multiple transations are still required. The + implementation uses a simple loop of word-sized loads for this + case. The number of transactions is still fewer than would be + required if bytes were loaded individually. + + + In principle, we could aggregate further by using LDRD or LDM, but + to keep the code simple this was not attempted in the initial + implementation. + + + * vlocks are currently only used to coordinate between CPUs which are + unable to enable their caches yet. This means that the + implementation removes many of the barriers which would be required + when executing the algorithm in cached memory. + + packing of the currently_voting array does not work with cached + memory unless all CPUs contending the lock are cache-coherent, due + to cache writebacks from one CPU clobbering values written by other + CPUs. (Though if all the CPUs are cache-coherent, you should be + probably be using proper spinlocks instead anyway). + + + * The "no votes yet" value used for the last_vote variable is 0 (not + -1 as in the pseudocode). This allows statically-allocated vlocks + to be implicitly initialised to an unlocked state simply by putting + them in .bss. + + An offset is added to each CPU's ID for the purpose of setting this + variable, so that no CPU uses the value 0 for its ID. + + +Colophon +-------- + +Originally created and documented by Dave Martin for Linaro Limited, for +use in ARM-based big.LITTLE platforms, with review and input gratefully +received from Nicolas Pitre and Achin Gupta. Thanks to Nicolas for +grabbing most of this text out of the relevant mail thread and writing +up the pseudocode. + +Copyright (C) 2012-2013 Linaro Limited +Distributed under the terms of Version 2 of the GNU General Public +License, as defined in linux/COPYING. + + +References +---------- + +[1] Lamport, L. "A New Solution of Dijkstra's Concurrent Programming + Problem", Communications of the ACM 17, 8 (August 1974), 453-455. + + http://en.wikipedia.org/wiki/Lamport%27s_bakery_algorithm + +[2] linux/arch/arm/common/vlock.S, www.kernel.org. diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 1cacda426a0..e5260d4a81f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1606,6 +1606,14 @@ config HAVE_ARM_TWD help This options enables support for the ARM timer and watchdog unit +config MCPM + bool "Multi-Cluster Power Management" + depends on CPU_V7 && SMP + help + This option provides the common power management infrastructure + for (multi-)cluster based systems, such as big.LITTLE based + systems. + choice prompt "Memory split" default VMSPLIT_3G diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile index dc8dd0de5c0..48c0ee57817 100644 --- a/arch/arm/common/Makefile +++ b/arch/arm/common/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_SHARP_PARAM) += sharpsl_param.o obj-$(CONFIG_SHARP_SCOOP) += scoop.o obj-$(CONFIG_PCI_HOST_ITE8152) += it8152.o obj-$(CONFIG_ARM_TIMER_SP804) += timer-sp.o +obj-$(CONFIG_MCPM) += mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c new file mode 100644 index 00000000000..370236dd1a0 --- /dev/null +++ b/arch/arm/common/mcpm_entry.c @@ -0,0 +1,263 @@ +/* + * arch/arm/common/mcpm_entry.c -- entry point for multi-cluster PM + * + * Created by: Nicolas Pitre, March 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/irqflags.h> + +#include <asm/mcpm.h> +#include <asm/cacheflush.h> +#include <asm/idmap.h> +#include <asm/cputype.h> + +extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER]; + +void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr) +{ + unsigned long val = ptr ? virt_to_phys(ptr) : 0; + mcpm_entry_vectors[cluster][cpu] = val; + sync_cache_w(&mcpm_entry_vectors[cluster][cpu]); +} + +static const struct mcpm_platform_ops *platform_ops; + +int __init mcpm_platform_register(const struct mcpm_platform_ops *ops) +{ + if (platform_ops) + return -EBUSY; + platform_ops = ops; + return 0; +} + +int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster) +{ + if (!platform_ops) + return -EUNATCH; /* try not to shadow power_up errors */ + might_sleep(); + return platform_ops->power_up(cpu, cluster); +} + +typedef void (*phys_reset_t)(unsigned long); + +void mcpm_cpu_power_down(void) +{ + phys_reset_t phys_reset; + + BUG_ON(!platform_ops); + BUG_ON(!irqs_disabled()); + + /* + * Do this before calling into the power_down method, + * as it might not always be safe to do afterwards. + */ + setup_mm_for_reboot(); + + platform_ops->power_down(); + + /* + * It is possible for a power_up request to happen concurrently + * with a power_down request for the same CPU. In this case the + * power_down method might not be able to actually enter a + * powered down state with the WFI instruction if the power_up + * method has removed the required reset condition. The + * power_down method is then allowed to return. We must perform + * a re-entry in the kernel as if the power_up method just had + * deasserted reset on the CPU. + * + * To simplify race issues, the platform specific implementation + * must accommodate for the possibility of unordered calls to + * power_down and power_up with a usage count. Therefore, if a + * call to power_up is issued for a CPU that is not down, then + * the next call to power_down must not attempt a full shutdown + * but only do the minimum (normally disabling L1 cache and CPU + * coherency) and return just as if a concurrent power_up request + * had happened as described above. + */ + + phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset); + phys_reset(virt_to_phys(mcpm_entry_point)); + + /* should never get here */ + BUG(); +} + +void mcpm_cpu_suspend(u64 expected_residency) +{ + phys_reset_t phys_reset; + + BUG_ON(!platform_ops); + BUG_ON(!irqs_disabled()); + + /* Very similar to mcpm_cpu_power_down() */ + setup_mm_for_reboot(); + platform_ops->suspend(expected_residency); + phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset); + phys_reset(virt_to_phys(mcpm_entry_point)); + BUG(); +} + +int mcpm_cpu_powered_up(void) +{ + if (!platform_ops) + return -EUNATCH; + if (platform_ops->powered_up) + platform_ops->powered_up(); + return 0; +} + +struct sync_struct mcpm_sync; + +/* + * __mcpm_cpu_going_down: Indicates that the cpu is being torn down. + * This must be called at the point of committing to teardown of a CPU. + * The CPU cache (SCTRL.C bit) is expected to still be active. + */ +void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster) +{ + mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_GOING_DOWN; + sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu); +} + +/* + * __mcpm_cpu_down: Indicates that cpu teardown is complete and that the + * cluster can be torn down without disrupting this CPU. + * To avoid deadlocks, this must be called before a CPU is powered down. + * The CPU cache (SCTRL.C bit) is expected to be off. + * However L2 cache might or might not be active. + */ +void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster) +{ + dmb(); + mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_DOWN; + sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu); + dsb_sev(); +} + +/* + * __mcpm_outbound_leave_critical: Leave the cluster teardown critical section. + * @state: the final state of the cluster: + * CLUSTER_UP: no destructive teardown was done and the cluster has been + * restored to the previous state (CPU cache still active); or + * CLUSTER_DOWN: the cluster has been torn-down, ready for power-off + * (CPU cache disabled, L2 cache either enabled or disabled). + */ +void __mcpm_outbound_leave_critical(unsigned int cluster, int state) +{ + dmb(); + mcpm_sync.clusters[cluster].cluster = state; + sync_cache_w(&mcpm_sync.clusters[cluster].cluster); + dsb_sev(); +} + +/* + * __mcpm_outbound_enter_critical: Enter the cluster teardown critical section. + * This function should be called by the last man, after local CPU teardown + * is complete. CPU cache expected to be active. + * + * Returns: + * false: the critical section was not entered because an inbound CPU was + * observed, or the cluster is already being set up; + * true: the critical section was entered: it is now safe to tear down the + * cluster. + */ +bool __mcpm_outbound_enter_critical(unsigned int cpu, unsigned int cluster) +{ + unsigned int i; + struct mcpm_sync_struct *c = &mcpm_sync.clusters[cluster]; + + /* Warn inbound CPUs that the cluster is being torn down: */ + c->cluster = CLUSTER_GOING_DOWN; + sync_cache_w(&c->cluster); + + /* Back out if the inbound cluster is already in the critical region: */ + sync_cache_r(&c->inbound); + if (c->inbound == INBOUND_COMING_UP) + goto abort; + + /* + * Wait for all CPUs to get out of the GOING_DOWN state, so that local + * teardown is complete on each CPU before tearing down the cluster. + * + * If any CPU has been woken up again from the DOWN state, then we + * shouldn't be taking the cluster down at all: abort in that case. + */ + sync_cache_r(&c->cpus); + for (i = 0; i < MAX_CPUS_PER_CLUSTER; i++) { + int cpustate; + + if (i == cpu) + continue; + + while (1) { + cpustate = c->cpus[i].cpu; + if (cpustate != CPU_GOING_DOWN) + break; + + wfe(); + sync_cache_r(&c->cpus[i].cpu); + } + + switch (cpustate) { + case CPU_DOWN: + continue; + + default: + goto abort; + } + } + + return true; + +abort: + __mcpm_outbound_leave_critical(cluster, CLUSTER_UP); + return false; +} + +int __mcpm_cluster_state(unsigned int cluster) +{ + sync_cache_r(&mcpm_sync.clusters[cluster].cluster); + return mcpm_sync.clusters[cluster].cluster; +} + +extern unsigned long mcpm_power_up_setup_phys; + +int __init mcpm_sync_init( + void (*power_up_setup)(unsigned int affinity_level)) +{ + unsigned int i, j, mpidr, this_cluster; + + BUILD_BUG_ON(MCPM_SYNC_CLUSTER_SIZE * MAX_NR_CLUSTERS != sizeof mcpm_sync); + BUG_ON((unsigned long)&mcpm_sync & (__CACHE_WRITEBACK_GRANULE - 1)); + + /* + * Set initial CPU and cluster states. + * Only one cluster is assumed to be active at this point. + */ + for (i = 0; i < MAX_NR_CLUSTERS; i++) { + mcpm_sync.clusters[i].cluster = CLUSTER_DOWN; + mcpm_sync.clusters[i].inbound = INBOUND_NOT_COMING_UP; + for (j = 0; j < MAX_CPUS_PER_CLUSTER; j++) + mcpm_sync.clusters[i].cpus[j].cpu = CPU_DOWN; + } + mpidr = read_cpuid_mpidr(); + this_cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); + for_each_online_cpu(i) + mcpm_sync.clusters[this_cluster].cpus[i].cpu = CPU_UP; + mcpm_sync.clusters[this_cluster].cluster = CLUSTER_UP; + sync_cache_w(&mcpm_sync); + + if (power_up_setup) { + mcpm_power_up_setup_phys = virt_to_phys(power_up_setup); + sync_cache_w(&mcpm_power_up_setup_phys); + } + + return 0; +} diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S new file mode 100644 index 00000000000..8178705c4b2 --- /dev/null +++ b/arch/arm/common/mcpm_head.S @@ -0,0 +1,219 @@ +/* + * arch/arm/common/mcpm_head.S -- kernel entry point for multi-cluster PM + * + * Created by: Nicolas Pitre, March 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * + * Refer to Documentation/arm/cluster-pm-race-avoidance.txt + * for details of the synchronisation algorithms used here. + */ + +#include <linux/linkage.h> +#include <asm/mcpm.h> + +#include "vlock.h" + +.if MCPM_SYNC_CLUSTER_CPUS +.error "cpus must be the first member of struct mcpm_sync_struct" +.endif + + .macro pr_dbg string +#if defined(CONFIG_DEBUG_LL) && defined(DEBUG) + b 1901f +1902: .asciz "CPU" +1903: .asciz " cluster" +1904: .asciz ": \string" + .align +1901: adr r0, 1902b + bl printascii + mov r0, r9 + bl printhex8 + adr r0, 1903b + bl printascii + mov r0, r10 + bl printhex8 + adr r0, 1904b + bl printascii +#endif + .endm + + .arm + .align + +ENTRY(mcpm_entry_point) + + THUMB( adr r12, BSYM(1f) ) + THUMB( bx r12 ) + THUMB( .thumb ) +1: + mrc p15, 0, r0, c0, c0, 5 @ MPIDR + ubfx r9, r0, #0, #8 @ r9 = cpu + ubfx r10, r0, #8, #8 @ r10 = cluster + mov r3, #MAX_CPUS_PER_CLUSTER + mla r4, r3, r10, r9 @ r4 = canonical CPU index + cmp r4, #(MAX_CPUS_PER_CLUSTER * MAX_NR_CLUSTERS) + blo 2f + + /* We didn't expect this CPU. Try to cheaply make it quiet. */ +1: wfi + wfe + b 1b + +2: pr_dbg "kernel mcpm_entry_point\n" + + /* + * MMU is off so we need to get to various variables in a + * position independent way. + */ + adr r5, 3f + ldmia r5, {r6, r7, r8, r11} + add r6, r5, r6 @ r6 = mcpm_entry_vectors + ldr r7, [r5, r7] @ r7 = mcpm_power_up_setup_phys + add r8, r5, r8 @ r8 = mcpm_sync + add r11, r5, r11 @ r11 = first_man_locks + + mov r0, #MCPM_SYNC_CLUSTER_SIZE + mla r8, r0, r10, r8 @ r8 = sync cluster base + + @ Signal that this CPU is coming UP: + mov r0, #CPU_COMING_UP + mov r5, #MCPM_SYNC_CPU_SIZE + mla r5, r9, r5, r8 @ r5 = sync cpu address + strb r0, [r5] + + @ At this point, the cluster cannot unexpectedly enter the GOING_DOWN + @ state, because there is at least one active CPU (this CPU). + + mov r0, #VLOCK_SIZE + mla r11, r0, r10, r11 @ r11 = cluster first man lock + mov r0, r11 + mov r1, r9 @ cpu + bl vlock_trylock @ implies DMB + + cmp r0, #0 @ failed to get the lock? + bne mcpm_setup_wait @ wait for cluster setup if so + + ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER] + cmp r0, #CLUSTER_UP @ cluster already up? + bne mcpm_setup @ if not, set up the cluster + + @ Otherwise, release the first man lock and skip setup: + mov r0, r11 + bl vlock_unlock + b mcpm_setup_complete + +mcpm_setup: + @ Control dependency implies strb not observable before previous ldrb. + + @ Signal that the cluster is being brought up: + mov r0, #INBOUND_COMING_UP + strb r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND] + dmb + + @ Any CPU trying to take the cluster into CLUSTER_GOING_DOWN from this + @ point onwards will observe INBOUND_COMING_UP and abort. + + @ Wait for any previously-pending cluster teardown operations to abort + @ or complete: +mcpm_teardown_wait: + ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER] + cmp r0, #CLUSTER_GOING_DOWN + bne first_man_setup + wfe + b mcpm_teardown_wait + +first_man_setup: + dmb + + @ If the outbound gave up before teardown started, skip cluster setup: + + cmp r0, #CLUSTER_UP + beq mcpm_setup_leave + + @ power_up_setup is now responsible for setting up the cluster: + + cmp r7, #0 + mov r0, #1 @ second (cluster) affinity level + blxne r7 @ Call power_up_setup if defined + dmb + + mov r0, #CLUSTER_UP + strb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER] + dmb + +mcpm_setup_leave: + @ Leave the cluster setup critical section: + + mov r0, #INBOUND_NOT_COMING_UP + strb r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND] + dsb + sev + + mov r0, r11 + bl vlock_unlock @ implies DMB + b mcpm_setup_complete + + @ In the contended case, non-first men wait here for cluster setup + @ to complete: +mcpm_setup_wait: + ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER] + cmp r0, #CLUSTER_UP + wfene + bne mcpm_setup_wait + dmb + +mcpm_setup_complete: + @ If a platform-specific CPU setup hook is needed, it is + @ called from here. + + cmp r7, #0 + mov r0, #0 @ first (CPU) affinity level + blxne r7 @ Call power_up_setup if defined + dmb + + @ Mark the CPU as up: + + mov r0, #CPU_UP + strb r0, [r5] + + @ Observability order of CPU_UP and opening of the gate does not matter. + +mcpm_entry_gated: + ldr r5, [r6, r4, lsl #2] @ r5 = CPU entry vector + cmp r5, #0 + wfeeq + beq mcpm_entry_gated + dmb + + pr_dbg "released\n" + bx r5 + + .align 2 + +3: .word mcpm_entry_vectors - . + .word mcpm_power_up_setup_phys - 3b + .word mcpm_sync - 3b + .word first_man_locks - 3b + +ENDPROC(mcpm_entry_point) + + .bss + + .align CACHE_WRITEBACK_ORDER + .type first_man_locks, #object +first_man_locks: + .space VLOCK_SIZE * MAX_NR_CLUSTERS + .align CACHE_WRITEBACK_ORDER + + .type mcpm_entry_vectors, #object +ENTRY(mcpm_entry_vectors) + .space 4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER + + .type mcpm_power_up_setup_phys, #object +ENTRY(mcpm_power_up_setup_phys) + .space 4 @ set by mcpm_sync_init() diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c new file mode 100644 index 00000000000..52b88d81b7b --- /dev/null +++ b/arch/arm/common/mcpm_platsmp.c @@ -0,0 +1,92 @@ +/* + * linux/arch/arm/mach-vexpress/mcpm_platsmp.c + * + * Created by: Nicolas Pitre, November 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Code to handle secondary CPU bringup and hotplug for the cluster power API. + */ + +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/spinlock.h> + +#include <linux/irqchip/arm-gic.h> + +#include <asm/mcpm.h> +#include <asm/smp.h> +#include <asm/smp_plat.h> + +static void __init simple_smp_init_cpus(void) +{ +} + +static int __cpuinit mcpm_boot_secondary(unsigned int cpu, struct task_struct *idle) +{ + unsigned int mpidr, pcpu, pcluster, ret; + extern void secondary_startup(void); + + mpidr = cpu_logical_map(cpu); + pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0); + pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); + pr_debug("%s: logical CPU %d is physical CPU %d cluster %d\n", + __func__, cpu, pcpu, pcluster); + + mcpm_set_entry_vector(pcpu, pcluster, NULL); + ret = mcpm_cpu_power_up(pcpu, pcluster); + if (ret) + return ret; + mcpm_set_entry_vector(pcpu, pcluster, secondary_startup); + arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + dsb_sev(); + return 0; +} + +static void __cpuinit mcpm_secondary_init(unsigned int cpu) +{ + mcpm_cpu_powered_up(); + gic_secondary_init(0); +} + +#ifdef CONFIG_HOTPLUG_CPU + +static int mcpm_cpu_disable(unsigned int cpu) +{ + /* + * We assume all CPUs may be shut down. + * This would be the hook to use for eventual Secure + * OS migration requests as described in the PSCI spec. + */ + return 0; +} + +static void mcpm_cpu_die(unsigned int cpu) +{ + unsigned int mpidr, pcpu, pcluster; + mpidr = read_cpuid_mpidr(); + pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0); + pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); + mcpm_set_entry_vector(pcpu, pcluster, NULL); + mcpm_cpu_power_down(); +} + +#endif + +static struct smp_operations __initdata mcpm_smp_ops = { + .smp_init_cpus = simple_smp_init_cpus, + .smp_boot_secondary = mcpm_boot_secondary, + .smp_secondary_init = mcpm_secondary_init, +#ifdef CONFIG_HOTPLUG_CPU + .cpu_disable = mcpm_cpu_disable, + .cpu_die = mcpm_cpu_die, +#endif +}; + +void __init mcpm_smp_set_ops(void) +{ + smp_set_ops(&mcpm_smp_ops); +} diff --git a/arch/arm/common/vlock.S b/arch/arm/common/vlock.S new file mode 100644 index 00000000000..ff198583f68 --- /dev/null +++ b/arch/arm/common/vlock.S @@ -0,0 +1,108 @@ +/* + * vlock.S - simple voting lock implementation for ARM + * + * Created by: Dave Martin, 2012-08-16 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * This algorithm is described in more detail in + * Documentation/arm/vlocks.txt. + */ + +#include <linux/linkage.h> +#include "vlock.h" + +/* Select different code if voting flags can fit in a single word. */ +#if VLOCK_VOTING_SIZE > 4 +#define FEW(x...) +#define MANY(x...) x +#else +#define FEW(x...) x +#define MANY(x...) +#endif + +@ voting lock for first-man coordination + +.macro voting_begin rbase:req, rcpu:req, rscratch:req + mov \rscratch, #1 + strb \rscratch, [\rbase, \rcpu] + dmb +.endm + +.macro voting_end rbase:req, rcpu:req, rscratch:req + dmb + mov \rscratch, #0 + strb \rscratch, [\rbase, \rcpu] + dsb + sev +.endm + +/* + * The vlock structure must reside in Strongly-Ordered or Device memory. + * This implementation deliberately eliminates most of the barriers which + * would be required for other memory types, and assumes that independent + * writes to neighbouring locations within a cacheline do not interfere + * with one another. + */ + +@ r0: lock structure base +@ r1: CPU ID (0-based index within cluster) +ENTRY(vlock_trylock) + add r1, r1, #VLOCK_VOTING_OFFSET + + voting_begin r0, r1, r2 + + ldrb r2, [r0, #VLOCK_OWNER_OFFSET] @ check whether lock is held + cmp r2, #VLOCK_OWNER_NONE + bne trylock_fail @ fail if so + + @ Control dependency implies strb not observable before previous ldrb. + + strb r1, [r0, #VLOCK_OWNER_OFFSET] @ submit my vote + + voting_end r0, r1, r2 @ implies DMB + + @ Wait for the current round of voting to finish: + + MANY( mov r3, #VLOCK_VOTING_OFFSET ) +0: + MANY( ldr r2, [r0, r3] ) + FEW( ldr r2, [r0, #VLOCK_VOTING_OFFSET] ) + cmp r2, #0 + wfene + bne 0b + MANY( add r3, r3, #4 ) + MANY( cmp r3, #VLOCK_VOTING_OFFSET + VLOCK_VOTING_SIZE ) + MANY( bne 0b ) + + @ Check who won: + + dmb + ldrb r2, [r0, #VLOCK_OWNER_OFFSET] + eor r0, r1, r2 @ zero if I won, else nonzero + bx lr + +trylock_fail: + voting_end r0, r1, r2 + mov r0, #1 @ nonzero indicates that I lost + bx lr +ENDPROC(vlock_trylock) + +@ r0: lock structure base +ENTRY(vlock_unlock) + dmb + mov r1, #VLOCK_OWNER_NONE + strb r1, [r0, #VLOCK_OWNER_OFFSET] + dsb + sev + bx lr +ENDPROC(vlock_unlock) diff --git a/arch/arm/common/vlock.h b/arch/arm/common/vlock.h new file mode 100644 index 00000000000..3b441475a59 --- /dev/null +++ b/arch/arm/common/vlock.h @@ -0,0 +1,29 @@ +/* + * vlock.h - simple voting lock implementation + * + * Created by: Dave Martin, 2012-08-16 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __VLOCK_H +#define __VLOCK_H + +#include <asm/mcpm.h> + +/* Offsets and sizes are rounded to a word (4 bytes) */ +#define VLOCK_OWNER_OFFSET 0 +#define VLOCK_VOTING_OFFSET 4 +#define VLOCK_VOTING_SIZE ((MAX_CPUS_PER_CLUSTER + 3) / 4 * 4) +#define VLOCK_SIZE (VLOCK_VOTING_OFFSET + VLOCK_VOTING_SIZE) +#define VLOCK_OWNER_NONE 0 + +#endif /* ! __VLOCK_H */ diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index e1489c54cd1..bff71388e72 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -363,4 +363,79 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) flush_cache_all(); } +/* + * Memory synchronization helpers for mixed cached vs non cached accesses. + * + * Some synchronization algorithms have to set states in memory with the + * cache enabled or disabled depending on the code path. It is crucial + * to always ensure proper cache maintenance to update main memory right + * away in that case. + * + * Any cached write must be followed by a cache clean operation. + * Any cached read must be preceded by a cache invalidate operation. + * Yet, in the read case, a cache flush i.e. atomic clean+invalidate + * operation is needed to avoid discarding possible concurrent writes to the + * accessed memory. + * + * Also, in order to prevent a cached writer from interfering with an + * adjacent non-cached writer, each state variable must be located to + * a separate cache line. + */ + +/* + * This needs to be >= the max cache writeback size of all + * supported platforms included in the current kernel configuration. + * This is used to align state variables to their own cache lines. + */ +#define __CACHE_WRITEBACK_ORDER 6 /* guessed from existing platforms */ +#define __CACHE_WRITEBACK_GRANULE (1 << __CACHE_WRITEBACK_ORDER) + +/* + * There is no __cpuc_clean_dcache_area but we use it anyway for + * code intent clarity, and alias it to __cpuc_flush_dcache_area. + */ +#define __cpuc_clean_dcache_area __cpuc_flush_dcache_area + +/* + * Ensure preceding writes to *p by this CPU are visible to + * subsequent reads by other CPUs: + */ +static inline void __sync_cache_range_w(volatile void *p, size_t size) +{ + char *_p = (char *)p; + + __cpuc_clean_dcache_area(_p, size); + outer_clean_range(__pa(_p), __pa(_p + size)); +} + +/* + * Ensure preceding writes to *p by other CPUs are visible to + * subsequent reads by this CPU. We must be careful not to + * discard data simultaneously written by another CPU, hence the + * usage of flush rather than invalidate operations. + */ +static inline void __sync_cache_range_r(volatile void *p, size_t size) +{ + char *_p = (char *)p; + +#ifdef CONFIG_OUTER_CACHE + if (outer_cache.flush_range) { + /* + * Ensure dirty data migrated from other CPUs into our cache + * are cleaned out safely before the outer cache is cleaned: + */ + __cpuc_clean_dcache_area(_p, size); + + /* Clean and invalidate stale data for *p from outer ... */ + outer_flush_range(__pa(_p), __pa(_p + size)); + } +#endif + + /* ... and inner cache: */ + __cpuc_flush_dcache_area(_p, size); +} + +#define sync_cache_w(ptr) __sync_cache_range_w(ptr, sizeof *(ptr)) +#define sync_cache_r(ptr) __sync_cache_range_r(ptr, sizeof *(ptr)) + #endif diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h new file mode 100644 index 00000000000..0f7b7620e9a --- /dev/null +++ b/arch/arm/include/asm/mcpm.h @@ -0,0 +1,209 @@ +/* + * arch/arm/include/asm/mcpm.h + * + * Created by: Nicolas Pitre, April 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef MCPM_H +#define MCPM_H + +/* + * Maximum number of possible clusters / CPUs per cluster. + * + * This should be sufficient for quite a while, while keeping the + * (assembly) code simpler. When this starts to grow then we'll have + * to consider dynamic allocation. + */ +#define MAX_CPUS_PER_CLUSTER 4 +#define MAX_NR_CLUSTERS 2 + +#ifndef __ASSEMBLY__ + +#include <linux/types.h> +#include <asm/cacheflush.h> + +/* + * Platform specific code should use this symbol to set up secondary + * entry location for processors to use when released from reset. + */ +extern void mcpm_entry_point(void); + +/* + * This is used to indicate where the given CPU from given cluster should + * branch once it is ready to re-enter the kernel using ptr, or NULL if it + * should be gated. A gated CPU is held in a WFE loop until its vector + * becomes non NULL. + */ +void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr); + +/* + * CPU/cluster power operations API for higher subsystems to use. + */ + +/** + * mcpm_cpu_power_up - make given CPU in given cluster runable + * + * @cpu: CPU number within given cluster + * @cluster: cluster number for the CPU + * + * The identified CPU is brought out of reset. If the cluster was powered + * down then it is brought up as well, taking care not to let the other CPUs + * in the cluster run, and ensuring appropriate cluster setup. + * + * Caller must ensure the appropriate entry vector is initialized with + * mcpm_set_entry_vector() prior to calling this. + * + * This must be called in a sleepable context. However, the implementation + * is strongly encouraged to return early and let the operation happen + * asynchronously, especially when significant delays are expected. + * + * If the operation cannot be performed then an error code is returned. + */ +int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster); + +/** + * mcpm_cpu_power_down - power the calling CPU down + * + * The calling CPU is powered down. + * + * If this CPU is found to be the "last man standing" in the cluster + * then the cluster is prepared for power-down too. + * + * This must be called with interrupts disabled. + * + * This does not return. Re-entry in the kernel is expected via + * mcpm_entry_point. + */ +void mcpm_cpu_power_down(void); + +/** + * mcpm_cpu_suspend - bring the calling CPU in a suspended state + * + * @expected_residency: duration in microseconds the CPU is expected + * to remain suspended, or 0 if unknown/infinity. + * + * The calling CPU is suspended. The expected residency argument is used + * as a hint by the platform specific backend to implement the appropriate + * sleep state level according to the knowledge it has on wake-up latency + * for the given hardware. + * + * If this CPU is found to be the "last man standing" in the cluster + * then the cluster may be prepared for power-down too, if the expected + * residency makes it worthwhile. + * + * This must be called with interrupts disabled. + * + * This does not return. Re-entry in the kernel is expected via + * mcpm_entry_point. + */ +void mcpm_cpu_suspend(u64 expected_residency); + +/** + * mcpm_cpu_powered_up - housekeeping workafter a CPU has been powered up + * + * This lets the platform specific backend code perform needed housekeeping + * work. This must be called by the newly activated CPU as soon as it is + * fully operational in kernel space, before it enables interrupts. + * + * If the operation cannot be performed then an error code is returned. + */ +int mcpm_cpu_powered_up(void); + +/* + * Platform specific methods used in the implementation of the above API. + */ +struct mcpm_platform_ops { + int (*power_up)(unsigned int cpu, unsigned int cluster); + void (*power_down)(void); + void (*suspend)(u64); + void (*powered_up)(void); +}; + +/** + * mcpm_platform_register - register platform specific power methods + * + * @ops: mcpm_platform_ops structure to register + * + * An error is returned if the registration has been done previously. + */ +int __init mcpm_platform_register(const struct mcpm_platform_ops *ops); + +/* Synchronisation structures for coordinating safe cluster setup/teardown: */ + +/* + * When modifying this structure, make sure you update the MCPM_SYNC_ defines + * to match. + */ +struct mcpm_sync_struct { + /* individual CPU states */ + struct { + s8 cpu __aligned(__CACHE_WRITEBACK_GRANULE); + } cpus[MAX_CPUS_PER_CLUSTER]; + + /* cluster state */ + s8 cluster __aligned(__CACHE_WRITEBACK_GRANULE); + + /* inbound-side state */ + s8 inbound __aligned(__CACHE_WRITEBACK_GRANULE); +}; + +struct sync_struct { + struct mcpm_sync_struct clusters[MAX_NR_CLUSTERS]; +}; + +extern unsigned long sync_phys; /* physical address of *mcpm_sync */ + +void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster); +void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster); +void __mcpm_outbound_leave_critical(unsigned int cluster, int state); +bool __mcpm_outbound_enter_critical(unsigned int this_cpu, unsigned int cluster); +int __mcpm_cluster_state(unsigned int cluster); + +int __init mcpm_sync_init( + void (*power_up_setup)(unsigned int affinity_level)); + +void __init mcpm_smp_set_ops(void); + +#else + +/* + * asm-offsets.h causes trouble when included in .c files, and cacheflush.h + * cannot be included in asm files. Let's work around the conflict like this. + */ +#include <asm/asm-offsets.h> +#define __CACHE_WRITEBACK_GRANULE CACHE_WRITEBACK_GRANULE + +#endif /* ! __ASSEMBLY__ */ + +/* Definitions for mcpm_sync_struct */ +#define CPU_DOWN 0x11 +#define CPU_COMING_UP 0x12 +#define CPU_UP 0x13 +#define CPU_GOING_DOWN 0x14 + +#define CLUSTER_DOWN 0x21 +#define CLUSTER_UP 0x22 +#define CLUSTER_GOING_DOWN 0x23 + +#define INBOUND_NOT_COMING_UP 0x31 +#define INBOUND_COMING_UP 0x32 + +/* + * Offsets for the mcpm_sync_struct members, for use in asm. + * We don't want to make them global to the kernel via asm-offsets.c. + */ +#define MCPM_SYNC_CLUSTER_CPUS 0 +#define MCPM_SYNC_CPU_SIZE __CACHE_WRITEBACK_GRANULE +#define MCPM_SYNC_CLUSTER_CLUSTER \ + (MCPM_SYNC_CLUSTER_CPUS + MCPM_SYNC_CPU_SIZE * MAX_CPUS_PER_CLUSTER) +#define MCPM_SYNC_CLUSTER_INBOUND \ + (MCPM_SYNC_CLUSTER_CLUSTER + __CACHE_WRITEBACK_GRANULE) +#define MCPM_SYNC_CLUSTER_SIZE \ + (MCPM_SYNC_CLUSTER_INBOUND + __CACHE_WRITEBACK_GRANULE) + +#endif diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 923eec7105c..3f088225e71 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -149,6 +149,10 @@ int main(void) DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL); DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); + BLANK(); + DEFINE(CACHE_WRITEBACK_ORDER, __CACHE_WRITEBACK_ORDER); + DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE); + BLANK(); #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.midr)); |