From c5019f7e551c99e562e6b39ca97f8a8189c89cca Mon Sep 17 00:00:00 2001
From: Jon Medhurst <tixy@linaro.org>
Date: Fri, 8 Jun 2012 16:40:05 +0100
Subject: configs: Initial core configs

Copied from branch linaro-configs-3.4 of
git://git.linaro.org/people/jstultz/android.git

Signed-off-by: Jon Medhurst <tixy@linaro.org>
---
 linaro/configs/android.conf     | 32 ++++++++++++++++
 linaro/configs/linaro-base.conf | 82 +++++++++++++++++++++++++++++++++++++++++
 linaro/configs/ubuntu.conf      | 25 +++++++++++++
 3 files changed, 139 insertions(+)
 create mode 100644 linaro/configs/android.conf
 create mode 100644 linaro/configs/linaro-base.conf
 create mode 100644 linaro/configs/ubuntu.conf

diff --git a/linaro/configs/android.conf b/linaro/configs/android.conf
new file mode 100644
index 000000000000..12daf6192544
--- /dev/null
+++ b/linaro/configs/android.conf
@@ -0,0 +1,32 @@
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_THUMB2_KERNEL is not set
+CONFIG_IPV6=y
+# CONFIG_IPV6_SIT is not set
+CONFIG_PANIC_TIMEOUT=0
+CONFIG_HAS_WAKELOCK=y
+CONFIG_WAKELOCK=y
+CONFIG_USER_WAKELOCK=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_DM_CRYPT=y
+CONFIG_AEABI=y
+CONFIG_POWER_SUPPLY=y
+CONFIG_ANDROID_PARANOID_NETWORK=y
+CONFIG_NET_ACTIVITY_STATS=y
+CONFIG_INPUT_GPIO=y
+CONFIG_USB_G_ANDROID=y
+CONFIG_SWITCH=y
+CONFIG_STAGING=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOGGER=y
+CONFIG_ANDROID_RAM_CONSOLE=y
+CONFIG_ANDROID_TIMED_OUTPUT=y
+CONFIG_ANDROID_TIMED_GPIO=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ANDROID_INTF_ALARM_DEV=y
+CONFIG_CRYPTO_TWOFISH=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=16384
diff --git a/linaro/configs/linaro-base.conf b/linaro/configs/linaro-base.conf
new file mode 100644
index 000000000000..80bdf69fc5ff
--- /dev/null
+++ b/linaro/configs/linaro-base.conf
@@ -0,0 +1,82 @@
+CONFIG_EXPERIMENTAL=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=16
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_EMBEDDED=y
+CONFIG_PERF_COUNTERS=y
+CONFIG_SLAB=y
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_SMP=y
+CONFIG_SCHED_MC=y
+CONFIG_SCHED_SMT=y
+CONFIG_THUMB2_KERNEL=y
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_IDLE=y
+CONFIG_BINFMT_MISC=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_NET_KEY_MIGRATE=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+# CONFIG_INET_LRO is not set
+CONFIG_NETFILTER=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_CONNECTOR=y
+CONFIG_MTD=y
+CONFIG_MTD_CMDLINE_PARTS=y
+CONFIG_MTD_CHAR=y
+CONFIG_MTD_BLOCK=y
+CONFIG_MTD_OOPS=y
+CONFIG_MTD_CFI=y
+CONFIG_MTD_CFI_INTELEXT=y
+CONFIG_MTD_NAND=y
+CONFIG_NETDEVICES=y
+CONFIG_EXT2_FS=y
+CONFIG_EXT3_FS=y
+CONFIG_EXT4_FS=y
+CONFIG_BTRFS_FS=y
+CONFIG_QUOTA=y
+CONFIG_QFMT_V2=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_ECRYPT_FS=y
+CONFIG_JFFS2_FS=y
+CONFIG_JFFS2_SUMMARY=y
+CONFIG_JFFS2_FS_XATTR=y
+CONFIG_JFFS2_COMPRESSION_OPTIONS=y
+CONFIG_JFFS2_LZO=y
+CONFIG_JFFS2_RUBIN=y
+CONFIG_CRAMFS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_PRINTK_TIME=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_KEYS=y
+CONFIG_CRYPTO_MICHAEL_MIC=y
+CONFIG_CRC_CCITT=y
+CONFIG_CRC_T10DIF=y
+CONFIG_CRC_ITU_T=y
+CONFIG_CRC7=y
+CONFIG_HW_PERF_EVENTS=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
diff --git a/linaro/configs/ubuntu.conf b/linaro/configs/ubuntu.conf
new file mode 100644
index 000000000000..b8c74b8c750e
--- /dev/null
+++ b/linaro/configs/ubuntu.conf
@@ -0,0 +1,25 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+# CONFIG_COMPAT_BRK is not set
+CONFIG_MODULES=y
+CONFIG_DEFAULT_MMAP_MIN_ADDR=32768
+CONFIG_SECCOMP=y
+CONFIG_CC_STACKPROTECTOR=y
+CONFIG_SYN_COOKIES=y
+CONFIG_IPV6=y
+CONFIG_NETLABEL=y
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=65536
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_UINPUT=y
+# CONFIG_DEVKMEM is not set
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_STRICT_DEVMEM=y
+CONFIG_SECURITY=y
+CONFIG_LSM_MMAP_MIN_ADDR=0
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SMACK=y
+CONFIG_SECURITY_APPARMOR=y
+CONFIG_DEFAULT_SECURITY_APPARMOR=y
-- 
cgit v1.2.3


From d0d5f2c88d639a35856f88250459770a8993c402 Mon Sep 17 00:00:00 2001
From: Jon Medhurst <tixy@linaro.org>
Date: Wed, 6 Jun 2012 14:24:53 +0100
Subject: configs: Make CONFIG_MODULES part of linaro-base

CONFIG_MODULES was in both the Ubuntu and Android configs, so it is
better placed in the base config instead. Also, having
CONFIG_MODULE_UNLOAD enabled allows the Gator module to be upgraded to a
later version via DKMS without rebooting, and allows for easier testing.

Signed-off-by: Jon Medhurst <tixy@linaro.org>
---
 linaro/configs/android.conf     | 2 --
 linaro/configs/linaro-base.conf | 2 ++
 linaro/configs/ubuntu.conf      | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/linaro/configs/android.conf b/linaro/configs/android.conf
index 12daf6192544..ed2a8d86ee56 100644
--- a/linaro/configs/android.conf
+++ b/linaro/configs/android.conf
@@ -1,5 +1,3 @@
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
 # CONFIG_THUMB2_KERNEL is not set
 CONFIG_IPV6=y
 # CONFIG_IPV6_SIT is not set
diff --git a/linaro/configs/linaro-base.conf b/linaro/configs/linaro-base.conf
index 80bdf69fc5ff..c2f23b19f3c5 100644
--- a/linaro/configs/linaro-base.conf
+++ b/linaro/configs/linaro-base.conf
@@ -11,6 +11,8 @@ CONFIG_PERF_COUNTERS=y
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_SMP=y
diff --git a/linaro/configs/ubuntu.conf b/linaro/configs/ubuntu.conf
index b8c74b8c750e..2c6a13eb46c9 100644
--- a/linaro/configs/ubuntu.conf
+++ b/linaro/configs/ubuntu.conf
@@ -1,6 +1,5 @@
 # CONFIG_LOCALVERSION_AUTO is not set
 # CONFIG_COMPAT_BRK is not set
-CONFIG_MODULES=y
 CONFIG_DEFAULT_MMAP_MIN_ADDR=32768
 CONFIG_SECCOMP=y
 CONFIG_CC_STACKPROTECTOR=y
-- 
cgit v1.2.3


From 743c9750b82c06f81df671849fbae6596881bc07 Mon Sep 17 00:00:00 2001
From: Jon Medhurst <tixy@linaro.org>
Date: Wed, 6 Jun 2012 14:29:52 +0100
Subject: configs: Replace CONFIG_PERF_COUNTERS with CONFIG_PERF_EVENTS

CONFIG_PERF_COUNTERS was removed in commit 392d65a9 (perf: Remove
PERF_COUNTERS config option).

Signed-off-by: Jon Medhurst <tixy@linaro.org>
---
 linaro/configs/linaro-base.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linaro/configs/linaro-base.conf b/linaro/configs/linaro-base.conf
index c2f23b19f3c5..a0fa9d59c20b 100644
--- a/linaro/configs/linaro-base.conf
+++ b/linaro/configs/linaro-base.conf
@@ -7,7 +7,7 @@ CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=16
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_EMBEDDED=y
-CONFIG_PERF_COUNTERS=y
+CONFIG_PERF_EVENTS=y
 CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
-- 
cgit v1.2.3


From 240c61c5dc7f54fd1f76389c6e7a1928fa66f7d0 Mon Sep 17 00:00:00 2001
From: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
Date: Mon, 25 Jun 2012 14:11:03 -0300
Subject: configs: moving previous ubuntu.conf as ubuntu-minimal and creating a
 full ubuntu one

ubuntu.conf is now based on the official Ubuntu packages, enabling a
large set of configs and drivers.

Signed-off-by: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
---
 linaro/configs/ubuntu-minimal.conf |   24 +
 linaro/configs/ubuntu.conf         | 2138 +++++++++++++++++++++++++++++++++++-
 2 files changed, 2159 insertions(+), 3 deletions(-)
 create mode 100644 linaro/configs/ubuntu-minimal.conf

diff --git a/linaro/configs/ubuntu-minimal.conf b/linaro/configs/ubuntu-minimal.conf
new file mode 100644
index 000000000000..2c6a13eb46c9
--- /dev/null
+++ b/linaro/configs/ubuntu-minimal.conf
@@ -0,0 +1,24 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+# CONFIG_COMPAT_BRK is not set
+CONFIG_DEFAULT_MMAP_MIN_ADDR=32768
+CONFIG_SECCOMP=y
+CONFIG_CC_STACKPROTECTOR=y
+CONFIG_SYN_COOKIES=y
+CONFIG_IPV6=y
+CONFIG_NETLABEL=y
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=65536
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_UINPUT=y
+# CONFIG_DEVKMEM is not set
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_STRICT_DEVMEM=y
+CONFIG_SECURITY=y
+CONFIG_LSM_MMAP_MIN_ADDR=0
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SMACK=y
+CONFIG_SECURITY_APPARMOR=y
+CONFIG_DEFAULT_SECURITY_APPARMOR=y
diff --git a/linaro/configs/ubuntu.conf b/linaro/configs/ubuntu.conf
index 2c6a13eb46c9..03a01419e10e 100644
--- a/linaro/configs/ubuntu.conf
+++ b/linaro/configs/ubuntu.conf
@@ -1,24 +1,2156 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-# CONFIG_COMPAT_BRK is not set
+CONFIG_INIT_ENV_ARG_LIMIT=32
+CONFIG_KERNEL_GZIP=y
+CONFIG_SWAP=y
+CONFIG_SYSVIPC_SYSCTL=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
+CONFIG_BSD_PROCESS_ACCT_V3=y
+CONFIG_FHANDLE=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_AUDIT=y
+CONFIG_AUDITSYSCALL=y
+CONFIG_AUDIT_WATCH=y
+CONFIG_AUDIT_TREE=y
+CONFIG_AUDIT_LOGINUID_IMMUTABLE=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_IRQ_CHIP=y
+CONFIG_IRQ_DOMAIN=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CPUSETS=y
+CONFIG_PROC_PID_CPUSET=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_RESOURCE_COUNTERS=y
+CONFIG_CGROUP_MEM_RES_CTLR=y
+CONFIG_CGROUP_PERF=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_CFS_BANDWIDTH=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_CGROUP=y
+CONFIG_CHECKPOINT_RESTORE=y
+CONFIG_NAMESPACES=y
+CONFIG_UTS_NS=y
+CONFIG_IPC_NS=y
+CONFIG_USER_NS=y
+CONFIG_PID_NS=y
+CONFIG_NET_NS=y
+CONFIG_SCHED_AUTOGROUP=y
+CONFIG_MM_OWNER=y
+CONFIG_RELAY=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
+CONFIG_RD_XZ=y
+CONFIG_RD_LZO=y
+CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
+CONFIG_EXPERT=y
+CONFIG_UID16=y
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_TIMERFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_AIO=y
+CONFIG_PERF_EVENTS=y
+CONFIG_VM_EVENT_COUNTERS=y
+CONFIG_TRACEPOINTS=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_KRETPROBES=y
+CONFIG_SLABINFO=y
+CONFIG_RT_MUTEXES=y
+CONFIG_BASE_SMALL=0
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+CONFIG_MODULE_SRCVERSION_ALL=y
+CONFIG_BLOCK=y
+CONFIG_LBDAF=y
+CONFIG_BLK_DEV_BSG=y
+CONFIG_BLK_DEV_BSGLIB=y
+CONFIG_BLK_DEV_INTEGRITY=y
+CONFIG_BLK_DEV_THROTTLING=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_LDM_PARTITION=y
+CONFIG_EFI_PARTITION=y
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+CONFIG_CFQ_GROUP_IOSCHED=y
+CONFIG_DEFAULT_CFQ=y
+CONFIG_DEFAULT_IOSCHED="cfq"
+CONFIG_FREEZER=y
+CONFIG_TICK_ONESHOT=y
+CONFIG_VMSPLIT_3G=y
+CONFIG_PAGE_OFFSET=0xC0000000
+CONFIG_PREEMPT_VOLUNTARY=y
+CONFIG_HZ=128
+CONFIG_AEABI=y
+CONFIG_SELECT_MEMORY_MODEL=y
+CONFIG_FLATMEM_MANUAL=y
+CONFIG_FLATMEM=y
+CONFIG_FLAT_NODE_MEM_MAP=y
+CONFIG_PAGEFLAGS_EXTENDED=y
+CONFIG_ZONE_DMA_FLAG=0
+CONFIG_VIRT_TO_BUS=y
 CONFIG_DEFAULT_MMAP_MIN_ADDR=32768
+CONFIG_CLEANCACHE=y
+CONFIG_FORCE_MAX_ZONEORDER=11
+CONFIG_LEDS=y
+CONFIG_ALIGNMENT_TRAP=y
 CONFIG_SECCOMP=y
 CONFIG_CC_STACKPROTECTOR=y
+CONFIG_USE_OF=y
+CONFIG_ZBOOT_ROM_TEXT=0x0
+CONFIG_ZBOOT_ROM_BSS=0x0
+CONFIG_KEXEC=y
+CONFIG_ATAGS_PROC=y
+CONFIG_CRASH_DUMP=y
+CONFIG_CPU_FREQ_TABLE=y
+CONFIG_CPU_FREQ_STAT=y
+CONFIG_CPU_FREQ_STAT_DETAILS=y
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
+CONFIG_CPU_IDLE_GOV_LADDER=y
+CONFIG_CPU_IDLE_GOV_MENU=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_AOUT=m
+CONFIG_SUSPEND=y
+CONFIG_SUSPEND_FREEZER=y
+CONFIG_PM_SLEEP=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM=y
+CONFIG_PM_OPP=y
+CONFIG_PM_CLK=y
+CONFIG_UNIX_DIAG=m
+CONFIG_XFRM=y
+CONFIG_XFRM_IPCOMP=m
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_FIB_TRIE_STATS=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_ROUTE_CLASSID=y
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE_DEMUX=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
 CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+CONFIG_INET_XFRM_TUNNEL=m
+CONFIG_INET_TUNNEL=m
+CONFIG_INET_XFRM_MODE_TRANSPORT=m
+CONFIG_INET_XFRM_MODE_TUNNEL=m
+CONFIG_INET_XFRM_MODE_BEET=m
+CONFIG_INET_DIAG=m
+CONFIG_INET_TCP_DIAG=m
+CONFIG_INET_UDP_DIAG=m
+CONFIG_TCP_CONG_ADVANCED=y
+CONFIG_TCP_CONG_BIC=m
+CONFIG_TCP_CONG_CUBIC=y
+CONFIG_TCP_CONG_WESTWOOD=m
+CONFIG_TCP_CONG_HTCP=m
+CONFIG_TCP_CONG_HSTCP=m
+CONFIG_TCP_CONG_HYBLA=m
+CONFIG_TCP_CONG_VEGAS=m
+CONFIG_TCP_CONG_SCALABLE=m
+CONFIG_TCP_CONG_LP=m
+CONFIG_TCP_CONG_VENO=m
+CONFIG_TCP_CONG_YEAH=m
+CONFIG_TCP_CONG_ILLINOIS=m
+CONFIG_DEFAULT_CUBIC=y
+CONFIG_DEFAULT_TCP_CONG="cubic"
+CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+CONFIG_IPV6_MIP6=m
+CONFIG_INET6_XFRM_TUNNEL=m
+CONFIG_INET6_TUNNEL=m
+CONFIG_INET6_XFRM_MODE_TRANSPORT=m
+CONFIG_INET6_XFRM_MODE_TUNNEL=m
+CONFIG_INET6_XFRM_MODE_BEET=m
+CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
+CONFIG_IPV6_SIT=m
+CONFIG_IPV6_SIT_6RD=y
+CONFIG_IPV6_NDISC_NODETYPE=y
+CONFIG_IPV6_TUNNEL=m
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_SUBTREES=y
+CONFIG_IPV6_MROUTE=y
+CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y
+CONFIG_IPV6_PIMSM_V2=y
 CONFIG_NETLABEL=y
+CONFIG_NETWORK_SECMARK=y
+CONFIG_NETFILTER_ADVANCED=y
+CONFIG_BRIDGE_NETFILTER=y
+CONFIG_NETFILTER_NETLINK=m
+CONFIG_NETFILTER_NETLINK_ACCT=m
+CONFIG_NETFILTER_NETLINK_QUEUE=m
+CONFIG_NETFILTER_NETLINK_LOG=m
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_CONNTRACK_MARK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_ZONES=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_TIMEOUT=y
+CONFIG_NF_CONNTRACK_TIMESTAMP=y
+CONFIG_NF_CT_PROTO_DCCP=m
+CONFIG_NF_CT_PROTO_GRE=m
+CONFIG_NF_CT_PROTO_SCTP=m
+CONFIG_NF_CT_PROTO_UDPLITE=m
+CONFIG_NF_CONNTRACK_AMANDA=m
+CONFIG_NF_CONNTRACK_FTP=m
+CONFIG_NF_CONNTRACK_H323=m
+CONFIG_NF_CONNTRACK_IRC=m
+CONFIG_NF_CONNTRACK_BROADCAST=m
+CONFIG_NF_CONNTRACK_NETBIOS_NS=m
+CONFIG_NF_CONNTRACK_SNMP=m
+CONFIG_NF_CONNTRACK_PPTP=m
+CONFIG_NF_CONNTRACK_SANE=m
+CONFIG_NF_CONNTRACK_SIP=m
+CONFIG_NF_CONNTRACK_TFTP=m
+CONFIG_NF_CT_NETLINK=m
+CONFIG_NF_CT_NETLINK_TIMEOUT=m
+CONFIG_NETFILTER_TPROXY=m
+CONFIG_NETFILTER_XTABLES=m
+CONFIG_NETFILTER_XT_MARK=m
+CONFIG_NETFILTER_XT_CONNMARK=m
+CONFIG_NETFILTER_XT_SET=m
+CONFIG_NETFILTER_XT_TARGET_AUDIT=m
+CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
+CONFIG_NETFILTER_XT_TARGET_CT=m
+CONFIG_NETFILTER_XT_TARGET_DSCP=m
+CONFIG_NETFILTER_XT_TARGET_HL=m
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
+CONFIG_NETFILTER_XT_TARGET_LED=m
+CONFIG_NETFILTER_XT_TARGET_LOG=m
+CONFIG_NETFILTER_XT_TARGET_MARK=m
+CONFIG_NETFILTER_XT_TARGET_NFLOG=m
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
+CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
+CONFIG_NETFILTER_XT_TARGET_RATEEST=m
+CONFIG_NETFILTER_XT_TARGET_TEE=m
+CONFIG_NETFILTER_XT_TARGET_TPROXY=m
+CONFIG_NETFILTER_XT_TARGET_TRACE=m
+CONFIG_NETFILTER_XT_TARGET_SECMARK=m
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
+CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
+CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m
+CONFIG_NETFILTER_XT_MATCH_CLUSTER=m
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m
+CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_CPU=m
+CONFIG_NETFILTER_XT_MATCH_DCCP=m
+CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
+CONFIG_NETFILTER_XT_MATCH_DSCP=m
+CONFIG_NETFILTER_XT_MATCH_ECN=m
+CONFIG_NETFILTER_XT_MATCH_ESP=m
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
+CONFIG_NETFILTER_XT_MATCH_HELPER=m
+CONFIG_NETFILTER_XT_MATCH_HL=m
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
+CONFIG_NETFILTER_XT_MATCH_IPVS=m
+CONFIG_NETFILTER_XT_MATCH_LENGTH=m
+CONFIG_NETFILTER_XT_MATCH_LIMIT=m
+CONFIG_NETFILTER_XT_MATCH_MAC=m
+CONFIG_NETFILTER_XT_MATCH_MARK=m
+CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
+CONFIG_NETFILTER_XT_MATCH_NFACCT=m
+CONFIG_NETFILTER_XT_MATCH_OSF=m
+CONFIG_NETFILTER_XT_MATCH_OWNER=m
+CONFIG_NETFILTER_XT_MATCH_POLICY=m
+CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
+CONFIG_NETFILTER_XT_MATCH_QUOTA=m
+CONFIG_NETFILTER_XT_MATCH_RATEEST=m
+CONFIG_NETFILTER_XT_MATCH_REALM=m
+CONFIG_NETFILTER_XT_MATCH_RECENT=m
+CONFIG_NETFILTER_XT_MATCH_SCTP=m
+CONFIG_NETFILTER_XT_MATCH_SOCKET=m
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
+CONFIG_NETFILTER_XT_MATCH_STRING=m
+CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
+CONFIG_NETFILTER_XT_MATCH_TIME=m
+CONFIG_NETFILTER_XT_MATCH_U32=m
+CONFIG_IP_SET=m
+CONFIG_IP_SET_MAX=256
+CONFIG_IP_SET_BITMAP_IP=m
+CONFIG_IP_SET_BITMAP_IPMAC=m
+CONFIG_IP_SET_BITMAP_PORT=m
+CONFIG_IP_SET_HASH_IP=m
+CONFIG_IP_SET_HASH_IPPORT=m
+CONFIG_IP_SET_HASH_IPPORTIP=m
+CONFIG_IP_SET_HASH_IPPORTNET=m
+CONFIG_IP_SET_HASH_NET=m
+CONFIG_IP_SET_HASH_NETPORT=m
+CONFIG_IP_SET_HASH_NETIFACE=m
+CONFIG_IP_SET_LIST_SET=m
+CONFIG_IP_VS=m
+CONFIG_IP_VS_IPV6=y
+CONFIG_IP_VS_TAB_BITS=12
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_PROTO_UDP=y
+CONFIG_IP_VS_PROTO_AH_ESP=y
+CONFIG_IP_VS_PROTO_ESP=y
+CONFIG_IP_VS_PROTO_AH=y
+CONFIG_IP_VS_PROTO_SCTP=y
+CONFIG_IP_VS_RR=m
+CONFIG_IP_VS_WRR=m
+CONFIG_IP_VS_LC=m
+CONFIG_IP_VS_WLC=m
+CONFIG_IP_VS_LBLC=m
+CONFIG_IP_VS_LBLCR=m
+CONFIG_IP_VS_DH=m
+CONFIG_IP_VS_SH=m
+CONFIG_IP_VS_SED=m
+CONFIG_IP_VS_NQ=m
+CONFIG_IP_VS_SH_TAB_BITS=8
+CONFIG_IP_VS_FTP=m
+CONFIG_IP_VS_NFCT=y
+CONFIG_IP_VS_PE_SIP=m
+CONFIG_NF_DEFRAG_IPV4=m
+CONFIG_NF_CONNTRACK_IPV4=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_AH=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_RPFILTER=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_NF_NAT=m
+CONFIG_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_NF_NAT_SNMP_BASIC=m
+CONFIG_NF_NAT_PROTO_DCCP=m
+CONFIG_NF_NAT_PROTO_GRE=m
+CONFIG_NF_NAT_PROTO_UDPLITE=m
+CONFIG_NF_NAT_PROTO_SCTP=m
+CONFIG_NF_NAT_FTP=m
+CONFIG_NF_NAT_IRC=m
+CONFIG_NF_NAT_TFTP=m
+CONFIG_NF_NAT_AMANDA=m
+CONFIG_NF_NAT_PPTP=m
+CONFIG_NF_NAT_H323=m
+CONFIG_NF_NAT_SIP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_TTL=m
+CONFIG_IP_NF_RAW=m
+CONFIG_IP_NF_SECURITY=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+CONFIG_NF_DEFRAG_IPV6=m
+CONFIG_NF_CONNTRACK_IPV6=m
+CONFIG_IP6_NF_QUEUE=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_AH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_MH=m
+CONFIG_IP6_NF_MATCH_RPFILTER=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_TARGET_HL=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_RAW=m
+CONFIG_IP6_NF_SECURITY=m
+CONFIG_DECNET_NF_GRABULATOR=m
+CONFIG_BRIDGE_NF_EBTABLES=m
+CONFIG_BRIDGE_EBT_BROUTE=m
+CONFIG_BRIDGE_EBT_T_FILTER=m
+CONFIG_BRIDGE_EBT_T_NAT=m
+CONFIG_BRIDGE_EBT_802_3=m
+CONFIG_BRIDGE_EBT_AMONG=m
+CONFIG_BRIDGE_EBT_ARP=m
+CONFIG_BRIDGE_EBT_IP=m
+CONFIG_BRIDGE_EBT_IP6=m
+CONFIG_BRIDGE_EBT_LIMIT=m
+CONFIG_BRIDGE_EBT_MARK=m
+CONFIG_BRIDGE_EBT_PKTTYPE=m
+CONFIG_BRIDGE_EBT_STP=m
+CONFIG_BRIDGE_EBT_VLAN=m
+CONFIG_BRIDGE_EBT_ARPREPLY=m
+CONFIG_BRIDGE_EBT_DNAT=m
+CONFIG_BRIDGE_EBT_MARK_T=m
+CONFIG_BRIDGE_EBT_REDIRECT=m
+CONFIG_BRIDGE_EBT_SNAT=m
+CONFIG_BRIDGE_EBT_LOG=m
+CONFIG_BRIDGE_EBT_ULOG=m
+CONFIG_BRIDGE_EBT_NFLOG=m
+CONFIG_IP_DCCP=m
+CONFIG_INET_DCCP_DIAG=m
+CONFIG_IP_DCCP_CCID3=y
+CONFIG_IP_DCCP_TFRC_LIB=y
+CONFIG_NET_DCCPPROBE=m
+CONFIG_IP_SCTP=m
+CONFIG_NET_SCTPPROBE=m
+CONFIG_SCTP_HMAC_MD5=y
+CONFIG_RDS=m
+CONFIG_RDS_TCP=m
+CONFIG_TIPC=m
+CONFIG_ATM=m
+CONFIG_ATM_CLIP=m
+CONFIG_ATM_LANE=m
+CONFIG_ATM_MPOA=m
+CONFIG_ATM_BR2684=m
+CONFIG_ATM_BR2684_IPFILTER=y
+CONFIG_L2TP=m
+CONFIG_L2TP_DEBUGFS=m
+CONFIG_STP=m
+CONFIG_GARP=m
+CONFIG_BRIDGE=m
+CONFIG_BRIDGE_IGMP_SNOOPING=y
+CONFIG_NET_DSA=y
+CONFIG_NET_DSA_TAG_DSA=y
+CONFIG_NET_DSA_TAG_EDSA=y
+CONFIG_NET_DSA_TAG_TRAILER=y
+CONFIG_VLAN_8021Q=m
+CONFIG_VLAN_8021Q_GVRP=y
+CONFIG_DECNET=m
+CONFIG_LLC=m
+CONFIG_LLC2=m
+CONFIG_IPX=m
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=m
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+CONFIG_X25=m
+CONFIG_LAPB=m
+CONFIG_ECONET=m
+CONFIG_ECONET_AUNUDP=y
+CONFIG_ECONET_NATIVE=y
+CONFIG_WAN_ROUTER=m
+CONFIG_PHONET=m
+CONFIG_IEEE802154=m
+CONFIG_IEEE802154_6LOWPAN=m
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_ATM=m
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_MULTIQ=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFB=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_NETEM=m
+CONFIG_NET_SCH_DRR=m
+CONFIG_NET_SCH_MQPRIO=m
+CONFIG_NET_SCH_CHOKE=m
+CONFIG_NET_SCH_QFQ=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_SCH_PLUG=m
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_BASIC=m
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_CLS_U32_MARK=y
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_FLOW=m
+CONFIG_NET_CLS_CGROUP=m
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_STACK=32
+CONFIG_NET_EMATCH_CMP=m
+CONFIG_NET_EMATCH_NBYTE=m
+CONFIG_NET_EMATCH_U32=m
+CONFIG_NET_EMATCH_META=m
+CONFIG_NET_EMATCH_TEXT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_POLICE=m
+CONFIG_NET_ACT_GACT=m
+CONFIG_GACT_PROB=y
+CONFIG_NET_ACT_MIRRED=m
+CONFIG_NET_ACT_IPT=m
+CONFIG_NET_ACT_NAT=m
+CONFIG_NET_ACT_PEDIT=m
+CONFIG_NET_ACT_SIMP=m
+CONFIG_NET_ACT_SKBEDIT=m
+CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_SCH_FIFO=y
+CONFIG_DCB=y
+CONFIG_DNS_RESOLVER=y
+CONFIG_BATMAN_ADV=m
+CONFIG_OPENVSWITCH=m
+CONFIG_NETPRIO_CGROUP=m
+CONFIG_BQL=y
+CONFIG_BPF_JIT=y
+CONFIG_NET_PKTGEN=m
+CONFIG_NET_TCPPROBE=m
+CONFIG_HAMRADIO=y
+CONFIG_AX25=m
+CONFIG_AX25_DAMA_SLAVE=y
+CONFIG_NETROM=m
+CONFIG_ROSE=m
+CONFIG_MKISS=m
+CONFIG_6PACK=m
+CONFIG_BPQETHER=m
+CONFIG_BAYCOM_SER_FDX=m
+CONFIG_BAYCOM_SER_HDX=m
+CONFIG_BAYCOM_PAR=m
+CONFIG_BAYCOM_EPP=m
+CONFIG_YAM=m
+CONFIG_CAN=m
+CONFIG_CAN_RAW=m
+CONFIG_CAN_BCM=m
+CONFIG_CAN_GW=m
+CONFIG_CAN_VCAN=m
+CONFIG_CAN_SLCAN=m
+CONFIG_CAN_DEV=m
+CONFIG_CAN_CALC_BITTIMING=y
+CONFIG_CAN_MCP251X=m
+CONFIG_CAN_SJA1000=m
+CONFIG_CAN_SJA1000_ISA=m
+CONFIG_CAN_SJA1000_PLATFORM=m
+CONFIG_CAN_C_CAN=m
+CONFIG_CAN_C_CAN_PLATFORM=m
+CONFIG_CAN_CC770=m
+CONFIG_CAN_CC770_ISA=m
+CONFIG_CAN_CC770_PLATFORM=m
+CONFIG_CAN_EMS_USB=m
+CONFIG_CAN_ESD_USB2=m
+CONFIG_CAN_PEAK_USB=m
+CONFIG_CAN_SOFTING=m
+CONFIG_IRDA=m
+CONFIG_IRLAN=m
+CONFIG_IRNET=m
+CONFIG_IRCOMM=m
+CONFIG_IRDA_ULTRA=y
+CONFIG_IRDA_CACHE_LAST_LSAP=y
+CONFIG_IRDA_FAST_RR=y
+CONFIG_IRDA_DEBUG=y
+CONFIG_IRTTY_SIR=m
+CONFIG_DONGLE=y
+CONFIG_ESI_DONGLE=m
+CONFIG_ACTISYS_DONGLE=m
+CONFIG_TEKRAM_DONGLE=m
+CONFIG_TOIM3232_DONGLE=m
+CONFIG_LITELINK_DONGLE=m
+CONFIG_MA600_DONGLE=m
+CONFIG_GIRBIL_DONGLE=m
+CONFIG_MCP2120_DONGLE=m
+CONFIG_OLD_BELKIN_DONGLE=m
+CONFIG_ACT200L_DONGLE=m
+CONFIG_KINGSUN_DONGLE=m
+CONFIG_KSDAZZLE_DONGLE=m
+CONFIG_KS959_DONGLE=m
+CONFIG_USB_IRDA=m
+CONFIG_SIGMATEL_FIR=m
+CONFIG_MCS_FIR=m
+CONFIG_BT=m
+CONFIG_BT_RFCOMM=m
+CONFIG_BT_RFCOMM_TTY=y
+CONFIG_BT_BNEP=m
+CONFIG_BT_BNEP_MC_FILTER=y
+CONFIG_BT_BNEP_PROTO_FILTER=y
+CONFIG_BT_CMTP=m
+CONFIG_BT_HIDP=m
+CONFIG_BT_HCIBTUSB=m
+CONFIG_BT_HCIBTSDIO=m
+CONFIG_BT_HCIUART=m
+CONFIG_BT_HCIUART_H4=y
+CONFIG_BT_HCIUART_BCSP=y
+CONFIG_BT_HCIUART_ATH3K=y
+CONFIG_BT_HCIUART_LL=y
+CONFIG_BT_HCIBCM203X=m
+CONFIG_BT_HCIBPA10X=m
+CONFIG_BT_HCIBFUSB=m
+CONFIG_BT_HCIVHCI=m
+CONFIG_BT_MRVL=m
+CONFIG_BT_MRVL_SDIO=m
+CONFIG_BT_ATH3K=m
+CONFIG_BT_WILINK=m
+CONFIG_AF_RXRPC=m
+CONFIG_RXKAD=m
+CONFIG_FIB_RULES=y
+CONFIG_WIRELESS=y
+CONFIG_WIRELESS_EXT=y
+CONFIG_WEXT_CORE=y
+CONFIG_WEXT_PROC=y
+CONFIG_WEXT_SPY=y
+CONFIG_WEXT_PRIV=y
+CONFIG_CFG80211_REG_DEBUG=y
+CONFIG_CFG80211_DEFAULT_PS=y
+CONFIG_CFG80211_DEBUGFS=y
+CONFIG_CFG80211_WEXT=y
+CONFIG_WIRELESS_EXT_SYSFS=y
+CONFIG_LIB80211_CRYPT_WEP=m
+CONFIG_LIB80211_CRYPT_CCMP=m
+CONFIG_LIB80211_CRYPT_TKIP=m
+CONFIG_MAC80211_MESH=y
+CONFIG_MAC80211_LEDS=y
+CONFIG_MAC80211_DEBUGFS=y
+CONFIG_MAC80211_DEBUG_MENU=y
+CONFIG_WIMAX=m
+CONFIG_WIMAX_DEBUG_LEVEL=8
+CONFIG_RFKILL=y
+CONFIG_RFKILL_LEDS=y
+CONFIG_RFKILL_INPUT=y
+CONFIG_RFKILL_REGULATOR=m
+CONFIG_RFKILL_GPIO=m
+CONFIG_NET_9P=m
+CONFIG_CAIF=m
+CONFIG_CAIF_NETDEV=m
+CONFIG_CAIF_USB=m
+CONFIG_CEPH_LIB=m
+CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y
+CONFIG_NFC=m
+CONFIG_NFC_NCI=m
+CONFIG_PN544_NFC=m
+CONFIG_NFC_PN533=m
+CONFIG_NFC_WILINK=m
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+CONFIG_FW_LOADER=y
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE=""
+CONFIG_REGMAP=y
+CONFIG_REGMAP_I2C=y
+CONFIG_REGMAP_SPI=y
+CONFIG_SPI=y
+CONFIG_DMA_SHARED_BUFFER=y
+CONFIG_PROC_EVENTS=y
+CONFIG_MTD_REDBOOT_PARTS=m
+CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1
+CONFIG_MTD_AFS_PARTS=m
+CONFIG_MTD_OF_PARTS=y
+CONFIG_MTD_AR7_PARTS=m
+CONFIG_HAVE_MTD_OTP=y
+CONFIG_MTD_BLKDEVS=y
+CONFIG_FTL=m
+CONFIG_NFTL=m
+CONFIG_NFTL_RW=y
+CONFIG_INFTL=m
+CONFIG_RFD_FTL=m
+CONFIG_SSFDC=m
+CONFIG_SM_FTL=m
+CONFIG_MTD_SWAP=m
+CONFIG_MTD_JEDECPROBE=m
+CONFIG_MTD_MAP_BANK_WIDTH_1=y
+CONFIG_MTD_MAP_BANK_WIDTH_2=y
+CONFIG_MTD_MAP_BANK_WIDTH_4=y
+CONFIG_MTD_CFI_I1=y
+CONFIG_MTD_CFI_I2=y
+CONFIG_MTD_CFI_AMDSTD=m
+CONFIG_MTD_CFI_STAA=m
+CONFIG_MTD_RAM=m
+CONFIG_MTD_ROM=m
+CONFIG_MTD_ABSENT=m
+CONFIG_MTD_COMPLEX_MAPPINGS=y
+CONFIG_MTD_PHYSMAP=m
+CONFIG_MTD_PHYSMAP_OF=m
+CONFIG_MTD_IMPA7=m
+CONFIG_MTD_GPIO_ADDR=m
+CONFIG_MTD_PLATRAM=m
+CONFIG_MTD_LATCH_ADDR=m
+CONFIG_MTD_DATAFLASH=m
+CONFIG_MTD_DATAFLASH_OTP=y
+CONFIG_MTD_M25P80=m
+CONFIG_M25PXX_USE_FAST_READ=y
+CONFIG_MTD_SST25L=m
+CONFIG_MTD_SLRAM=m
+CONFIG_MTD_PHRAM=m
+CONFIG_MTD_MTDRAM=m
+CONFIG_MTDRAM_TOTAL_SIZE=4096
+CONFIG_MTDRAM_ERASE_SIZE=128
+CONFIG_MTD_BLOCK2MTD=m
+CONFIG_MTD_DOC2000=m
+CONFIG_MTD_DOC2001=m
+CONFIG_MTD_DOC2001PLUS=m
+CONFIG_MTD_DOCG3=m
+CONFIG_BCH_CONST_M=14
+CONFIG_BCH_CONST_T=4
+CONFIG_MTD_ONENAND=m
+CONFIG_MTD_DOCPROBE=m
+CONFIG_MTD_DOCECC=m
+CONFIG_MTD_DOCPROBE_ADDRESS=0x0
+CONFIG_MTD_NAND_ECC=y
+CONFIG_MTD_NAND_BCH=y
+CONFIG_MTD_NAND_ECC_BCH=y
+CONFIG_MTD_NAND_GPIO=m
+CONFIG_MTD_NAND_IDS=y
+CONFIG_MTD_NAND_DISKONCHIP=m
+CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0
+CONFIG_MTD_NAND_DOCG4=m
+CONFIG_MTD_NAND_NANDSIM=m
+CONFIG_MTD_NAND_PLATFORM=m
+CONFIG_MTD_ALAUDA=m
+CONFIG_MTD_ONENAND_GENERIC=m
+CONFIG_MTD_ONENAND_2X_PROGRAM=y
+CONFIG_MTD_ONENAND_SIM=m
+CONFIG_MTD_LPDDR=m
+CONFIG_MTD_QINFO_PROBE=m
+CONFIG_DTC=y
+CONFIG_OF=y
+CONFIG_PROC_DEVICETREE=y
+CONFIG_OF_FLATTREE=y
+CONFIG_OF_EARLY_FLATTREE=y
+CONFIG_OF_ADDRESS=y
+CONFIG_OF_IRQ=y
+CONFIG_OF_DEVICE=y
+CONFIG_OF_GPIO=y
+CONFIG_OF_I2C=y
+CONFIG_OF_NET=y
+CONFIG_OF_SPI=y
+CONFIG_OF_MDIO=y
+CONFIG_OF_MTD=y
+CONFIG_PARPORT=m
+CONFIG_PARPORT_AX88796=m
+CONFIG_PARPORT_1284=y
+CONFIG_PARPORT_NOT_PC=y
+CONFIG_BLK_DEV=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_LOOP_MIN_COUNT=8
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_DRBD=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_UB=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
 CONFIG_BLK_DEV_RAM_SIZE=65536
+CONFIG_CDROM_PKTCDVD=m
+CONFIG_CDROM_PKTCDVD_BUFFERS=8
+CONFIG_ATA_OVER_ETH=m
+CONFIG_MG_DISK=m
+CONFIG_MG_DISK_RES=0
+CONFIG_BLK_DEV_RBD=m
+CONFIG_SENSORS_LIS3LV02D=m
+CONFIG_AD525X_DPOT=m
+CONFIG_AD525X_DPOT_I2C=m
+CONFIG_AD525X_DPOT_SPI=m
+CONFIG_ICS932S401=m
+CONFIG_ENCLOSURE_SERVICES=m
+CONFIG_APDS9802ALS=m
+CONFIG_ISL29003=m
+CONFIG_ISL29020=m
+CONFIG_SENSORS_TSL2550=m
+CONFIG_SENSORS_BH1780=m
+CONFIG_SENSORS_BH1770=m
+CONFIG_SENSORS_APDS990X=m
+CONFIG_HMC6352=m
+CONFIG_DS1682=m
+CONFIG_BMP085=m
+CONFIG_USB_SWITCH_FSA9480=m
+CONFIG_C2PORT=m
+CONFIG_EEPROM_AT24=m
+CONFIG_EEPROM_AT25=m
+CONFIG_EEPROM_LEGACY=m
+CONFIG_EEPROM_MAX6875=m
+CONFIG_EEPROM_93XX46=m
+CONFIG_IWMC3200TOP=m
+CONFIG_SENSORS_LIS3_SPI=m
+CONFIG_SENSORS_LIS3_I2C=m
+CONFIG_SCSI_MOD=y
+CONFIG_RAID_ATTRS=m
+CONFIG_SCSI_DMA=y
+CONFIG_SCSI_TGT=m
+CONFIG_SCSI_NETLINK=y
+CONFIG_SCSI_PROC_FS=y
+CONFIG_CHR_DEV_ST=m
+CONFIG_CHR_DEV_OSST=m
+CONFIG_BLK_DEV_SR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_CHR_DEV_SCH=m
+CONFIG_SCSI_ENCLOSURE=m
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+CONFIG_SCSI_WAIT_SCAN=m
+CONFIG_SCSI_FC_ATTRS=m
+CONFIG_SCSI_FC_TGT_ATTRS=y
+CONFIG_SCSI_ISCSI_ATTRS=m
+CONFIG_SCSI_SAS_ATTRS=m
+CONFIG_SCSI_SAS_LIBSAS=m
+CONFIG_SCSI_SAS_ATA=y
+CONFIG_SCSI_SAS_HOST_SMP=y
+CONFIG_SCSI_SRP_ATTRS=m
+CONFIG_SCSI_SRP_TGT_ATTRS=y
+CONFIG_SCSI_LOWLEVEL=y
+CONFIG_ISCSI_TCP=m
+CONFIG_ISCSI_BOOT_SYSFS=m
+CONFIG_LIBFC=m
+CONFIG_LIBFCOE=m
+CONFIG_SCSI_DEBUG=m
+CONFIG_SCSI_DH=y
+CONFIG_SCSI_DH_RDAC=m
+CONFIG_SCSI_DH_HP_SW=m
+CONFIG_SCSI_DH_EMC=m
+CONFIG_SCSI_DH_ALUA=m
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_AUTODETECT=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID10=m
+CONFIG_MD_RAID456=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_MD_FAULTY=m
+CONFIG_DM_BUFIO=m
+CONFIG_DM_PERSISTENT_DATA=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_THIN_PROVISIONING=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_RAID=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_QL=m
+CONFIG_DM_MULTIPATH_ST=m
+CONFIG_DM_UEVENT=y
+CONFIG_TARGET_CORE=m
+CONFIG_TCM_IBLOCK=m
+CONFIG_TCM_FILEIO=m
+CONFIG_TCM_PSCSI=m
+CONFIG_LOOPBACK_TARGET=m
+CONFIG_TCM_FC=m
+CONFIG_ISCSI_TARGET=m
+CONFIG_NET_CORE=y
+CONFIG_BONDING=m
+CONFIG_DUMMY=m
+CONFIG_EQUALIZER=m
+CONFIG_MII=y
+CONFIG_IEEE802154_DRIVERS=m
+CONFIG_IFB=m
+CONFIG_MACVLAN=m
+CONFIG_MACVTAP=m
+CONFIG_NETCONSOLE=m
+CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_NETPOLL=y
+CONFIG_NET_POLL_CONTROLLER=y
+CONFIG_TUN=y
+CONFIG_VETH=m
+CONFIG_ATM_DRIVERS=y
+CONFIG_ATM_DUMMY=m
+CONFIG_ATM_TCP=m
+CONFIG_CAIF_TTY=m
+CONFIG_CAIF_SPI_SLAVE=m
+CONFIG_CAIF_HSI=m
+CONFIG_ETHERNET=y
+CONFIG_B44=m
+CONFIG_CS89x0=m
+CONFIG_CS89x0_PLATFORM=y
+CONFIG_DM9000=m
+CONFIG_DNET=m
+CONFIG_MDIO_BITBANG=m
+CONFIG_MDIO_GPIO=m
+CONFIG_PLIP=m
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_MPPE=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPPOATM=m
+CONFIG_PPPOE=m
+CONFIG_PPTP=m
+CONFIG_PPPOL2TP=m
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_SLIP=m
+CONFIG_SLHC=y
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+CONFIG_SLIP_MODE_SLIP6=y
+CONFIG_USB_CATC=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_NET_AX8817X=m
+CONFIG_USB_NET_CDCETHER=m
+CONFIG_USB_NET_CDC_EEM=m
+CONFIG_USB_NET_CDC_NCM=m
+CONFIG_USB_NET_DM9601=m
+CONFIG_USB_NET_GL620A=m
+CONFIG_USB_NET_NET1080=m
+CONFIG_USB_NET_PLUSB=m
+CONFIG_USB_NET_MCS7830=m
+CONFIG_USB_NET_RNDIS_HOST=m
+CONFIG_USB_NET_CDC_SUBSET=m
+CONFIG_USB_BELKIN=y
+CONFIG_USB_ARMLINUX=y
+CONFIG_USB_NET_ZAURUS=m
+CONFIG_USB_NET_CX82310_ETH=m
+CONFIG_USB_NET_KALMIA=m
+CONFIG_USB_NET_QMI_WWAN=m
+CONFIG_USB_HSO=m
+CONFIG_USB_NET_INT51X1=m
+CONFIG_USB_CDC_PHONET=m
+CONFIG_USB_IPHETH=m
+CONFIG_USB_SIERRA_NET=m
+CONFIG_USB_VL600=m
+CONFIG_WLAN=y
+CONFIG_LIBERTAS=m
+CONFIG_LIBERTAS_THINFIRM=m
+CONFIG_LIBERTAS_THINFIRM_USB=m
+CONFIG_AT76C50X_USB=m
+CONFIG_USB_ZD1201=m
+CONFIG_USB_NET_RNDIS_WLAN=m
+CONFIG_RTL8187=m
+CONFIG_RTL8187_LEDS=y
+CONFIG_MAC80211_HWSIM=m
+CONFIG_ATH_COMMON=m
+CONFIG_ATH9K_HW=m
+CONFIG_ATH9K_COMMON=m
+CONFIG_ATH9K_BTCOEX_SUPPORT=y
+CONFIG_ATH9K=m
+CONFIG_ATH9K_AHB=y
+CONFIG_ATH9K_DEBUGFS=y
+CONFIG_ATH9K_RATE_CONTROL=y
+CONFIG_ATH9K_HTC=m
+CONFIG_ATH9K_HTC_DEBUGFS=y
+CONFIG_CARL9170=m
+CONFIG_CARL9170_LEDS=y
+CONFIG_CARL9170_WPC=y
+CONFIG_CARL9170_HWRNG=y
+CONFIG_ATH6KL=m
+CONFIG_ATH6KL_SDIO=m
+CONFIG_ATH6KL_USB=m
+CONFIG_B43=m
+CONFIG_B43_BCMA=y
+CONFIG_B43_BCMA_EXTRA=y
+CONFIG_B43_SSB=y
+CONFIG_B43_BCMA_PIO=y
+CONFIG_B43_PIO=y
+CONFIG_B43_PHY_N=y
+CONFIG_B43_PHY_LP=y
+CONFIG_B43_PHY_HT=y
+CONFIG_B43_LEDS=y
+CONFIG_B43_HWRNG=y
+CONFIG_B43LEGACY=m
+CONFIG_B43LEGACY_LEDS=y
+CONFIG_B43LEGACY_HWRNG=y
+CONFIG_B43LEGACY_DEBUG=y
+CONFIG_B43LEGACY_DMA=y
+CONFIG_B43LEGACY_PIO=y
+CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y
+CONFIG_BRCMUTIL=m
+CONFIG_BRCMSMAC=m
+CONFIG_BRCMFMAC=m
+CONFIG_BRCMFMAC_SDIO=y
+CONFIG_BRCMFMAC_USB=y
+CONFIG_HOSTAP=m
+CONFIG_HOSTAP_FIRMWARE=y
+CONFIG_HOSTAP_FIRMWARE_NVRAM=y
+CONFIG_IWM=m
+CONFIG_IWM_TRACING=y
+CONFIG_LIBERTAS_SPI=m
+CONFIG_P54_COMMON=m
+CONFIG_P54_USB=m
+CONFIG_P54_SPI=m
+CONFIG_P54_LEDS=y
+CONFIG_RT2X00=m
+CONFIG_RT2500USB=m
+CONFIG_RT73USB=m
+CONFIG_RT2800USB=m
+CONFIG_RT2800USB_RT33XX=y
+CONFIG_RT2800_LIB=m
+CONFIG_RT2X00_LIB_USB=m
+CONFIG_RT2X00_LIB=m
+CONFIG_RT2X00_LIB_FIRMWARE=y
+CONFIG_RT2X00_LIB_CRYPTO=y
+CONFIG_RT2X00_LIB_LEDS=y
+CONFIG_RT2X00_LIB_DEBUGFS=y
+CONFIG_RTL8192CU=m
+CONFIG_RTLWIFI=m
+CONFIG_RTL8192C_COMMON=m
+CONFIG_WL1251=m
+CONFIG_WL1251_SPI=m
+CONFIG_WL1251_SDIO=m
+CONFIG_WL12XX_MENU=m
+CONFIG_WL12XX=m
+CONFIG_WL12XX_SPI=m
+CONFIG_WL12XX_SDIO=m
+CONFIG_WL12XX_PLATFORM_DATA=y
+CONFIG_ZD1211RW=m
+CONFIG_MWIFIEX=m
+CONFIG_MWIFIEX_SDIO=m
+CONFIG_WAN=y
+CONFIG_HDLC=m
+CONFIG_HDLC_RAW=m
+CONFIG_HDLC_RAW_ETH=m
+CONFIG_HDLC_CISCO=m
+CONFIG_HDLC_FR=m
+CONFIG_HDLC_PPP=m
+CONFIG_HDLC_X25=m
+CONFIG_DLCI=m
+CONFIG_DLCI_MAX=8
+CONFIG_WAN_ROUTER_DRIVERS=m
+CONFIG_LAPBETHER=m
+CONFIG_ISDN=y
+CONFIG_ISDN_I4L=m
+CONFIG_ISDN_PPP=y
+CONFIG_ISDN_PPP_VJ=y
+CONFIG_ISDN_MPP=y
+CONFIG_IPPP_FILTER=y
+CONFIG_ISDN_PPP_BSDCOMP=m
+CONFIG_ISDN_AUDIO=y
+CONFIG_ISDN_TTY_FAX=y
+CONFIG_ISDN_X25=y
+CONFIG_ISDN_DIVERSION=m
+CONFIG_ISDN_DRV_HISAX=m
+CONFIG_ISDN_CAPI=m
+CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y
+CONFIG_CAPI_TRACE=y
+CONFIG_ISDN_CAPI_MIDDLEWARE=y
+CONFIG_ISDN_CAPI_CAPI20=m
+CONFIG_ISDN_CAPI_CAPIDRV=m
+CONFIG_CAPI_AVM=y
+CONFIG_CAPI_EICON=y
+CONFIG_ISDN_DRV_GIGASET=m
+CONFIG_GIGASET_I4L=y
+CONFIG_GIGASET_BASE=m
+CONFIG_GIGASET_M105=m
+CONFIG_GIGASET_M101=m
+CONFIG_MISDN=m
+CONFIG_MISDN_DSP=m
+CONFIG_MISDN_L1OIP=m
+CONFIG_MISDN_HFCUSB=m
+CONFIG_INPUT=y
+CONFIG_INPUT_FF_MEMLESS=m
+CONFIG_INPUT_POLLDEV=m
+CONFIG_INPUT_SPARSEKMAP=m
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_INPUT_MOUSEDEV_PSAUX=y
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_EVBUG=m
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ADP5588=m
+CONFIG_KEYBOARD_ADP5589=m
+CONFIG_KEYBOARD_ATKBD=y
+CONFIG_KEYBOARD_QT1070=m
+CONFIG_KEYBOARD_LKKBD=m
+CONFIG_KEYBOARD_TCA6416=m
+CONFIG_KEYBOARD_TCA8418=m
+CONFIG_KEYBOARD_MATRIX=m
+CONFIG_KEYBOARD_LM8323=m
+CONFIG_KEYBOARD_MAX7359=m
+CONFIG_KEYBOARD_MCS=m
+CONFIG_KEYBOARD_MPR121=m
+CONFIG_KEYBOARD_NEWTON=m
+CONFIG_KEYBOARD_OPENCORES=m
+CONFIG_KEYBOARD_SAMSUNG=m
+CONFIG_KEYBOARD_STOWAWAY=m
+CONFIG_KEYBOARD_SUNKBD=m
+CONFIG_KEYBOARD_STMPE=m
+CONFIG_KEYBOARD_XTKBD=m
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=m
+CONFIG_MOUSE_PS2_ALPS=y
+CONFIG_MOUSE_PS2_LOGIPS2PP=y
+CONFIG_MOUSE_PS2_SYNAPTICS=y
+CONFIG_MOUSE_PS2_TRACKPOINT=y
+CONFIG_MOUSE_PS2_ELANTECH=y
+CONFIG_MOUSE_PS2_SENTELIC=y
+CONFIG_MOUSE_SERIAL=m
+CONFIG_MOUSE_APPLETOUCH=m
+CONFIG_MOUSE_BCM5974=m
+CONFIG_MOUSE_VSXXXAA=m
+CONFIG_MOUSE_GPIO=m
+CONFIG_MOUSE_SYNAPTICS_I2C=m
+CONFIG_MOUSE_SYNAPTICS_USB=m
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_ANALOG=m
+CONFIG_JOYSTICK_INTERACT=m
+CONFIG_JOYSTICK_SIDEWINDER=m
+CONFIG_JOYSTICK_WARRIOR=m
+CONFIG_JOYSTICK_MAGELLAN=m
+CONFIG_JOYSTICK_GAMECON=m
+CONFIG_JOYSTICK_TURBOGRAFX=m
+CONFIG_JOYSTICK_JOYDUMP=m
+CONFIG_JOYSTICK_XPAD=m
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_JOYSTICK_WALKERA0701=m
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=m
+CONFIG_TABLET_USB_AIPTEK=m
+CONFIG_TABLET_USB_GTCO=m
+CONFIG_TABLET_USB_HANWANG=m
+CONFIG_TABLET_USB_KBTAB=m
+CONFIG_TABLET_USB_WACOM=m
 CONFIG_INPUT_MISC=y
+CONFIG_INPUT_88PM860X_ONKEY=m
+CONFIG_INPUT_AD714X=m
+CONFIG_INPUT_AD714X_I2C=m
+CONFIG_INPUT_AD714X_SPI=m
+CONFIG_INPUT_BMA150=m
+CONFIG_INPUT_MMA8450=m
+CONFIG_INPUT_MPU3050=m
+CONFIG_INPUT_GP2A=m
+CONFIG_INPUT_GPIO_TILT_POLLED=m
+CONFIG_INPUT_ATI_REMOTE2=m
+CONFIG_INPUT_KEYSPAN_REMOTE=m
+CONFIG_INPUT_KXTJ9=m
+CONFIG_INPUT_POWERMATE=m
+CONFIG_INPUT_YEALINK=m
+CONFIG_INPUT_CM109=m
+CONFIG_INPUT_TWL4030_VIBRA=m
+CONFIG_INPUT_TWL6040_VIBRA=m
 CONFIG_INPUT_UINPUT=y
-# CONFIG_DEVKMEM is not set
+CONFIG_INPUT_PCF8574=m
+CONFIG_INPUT_GPIO_ROTARY_ENCODER=m
+CONFIG_INPUT_ADXL34X=m
+CONFIG_INPUT_ADXL34X_I2C=m
+CONFIG_INPUT_ADXL34X_SPI=m
+CONFIG_INPUT_CMA3000=m
+CONFIG_INPUT_CMA3000_I2C=m
+CONFIG_SERIO=y
+CONFIG_SERIO_SERPORT=m
+CONFIG_SERIO_PARKBD=m
+CONFIG_SERIO_LIBPS2=y
+CONFIG_SERIO_RAW=m
+CONFIG_SERIO_ALTERA_PS2=m
+CONFIG_SERIO_PS2MULT=m
+CONFIG_GAMEPORT=m
+CONFIG_GAMEPORT_NS558=m
+CONFIG_GAMEPORT_L4=m
+CONFIG_VT=y
+CONFIG_CONSOLE_TRANSLATIONS=y
+CONFIG_VT_CONSOLE=y
+CONFIG_VT_CONSOLE_SLEEP=y
+CONFIG_HW_CONSOLE=y
+CONFIG_UNIX98_PTYS=y
+CONFIG_DEVPTS_MULTIPLE_INSTANCES=y
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_N_HDLC=m
+CONFIG_TRACE_ROUTER=m
+CONFIG_TRACE_SINK=m
+CONFIG_STALDRV=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_RUNTIME_UARTS=32
+CONFIG_SERIAL_8250_DW=m
+CONFIG_SERIAL_MAX3100=m
+CONFIG_SERIAL_MAX3107=m
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_CONSOLE_POLL=y
+CONFIG_SERIAL_OF_PLATFORM=m
+CONFIG_SERIAL_TIMBERDALE=m
+CONFIG_SERIAL_ALTERA_JTAGUART=m
+CONFIG_SERIAL_ALTERA_UART=m
+CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4
+CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200
+CONFIG_SERIAL_XILINX_PS_UART=m
+CONFIG_TTY_PRINTK=y
+CONFIG_PRINTER=m
+CONFIG_PPDEV=m
+CONFIG_HVC_DRIVER=y
+CONFIG_HVC_DCC=y
+CONFIG_IPMI_HANDLER=m
+CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_SI=m
+CONFIG_IPMI_WATCHDOG=m
+CONFIG_IPMI_POWEROFF=m
+CONFIG_HW_RANDOM_TIMERIOMEM=m
+CONFIG_NVRAM=m
+CONFIG_RAW_DRIVER=m
+CONFIG_MAX_RAW_DEVS=256
+CONFIG_RAMOOPS=m
+CONFIG_I2C=y
+CONFIG_I2C_BOARDINFO=y
+CONFIG_I2C_COMPAT=y
+CONFIG_I2C_MUX=m
+CONFIG_I2C_MUX_GPIO=m
+CONFIG_I2C_MUX_PCA9541=m
+CONFIG_I2C_MUX_PCA954x=m
+CONFIG_I2C_SMBUS=m
+CONFIG_I2C_ALGOBIT=m
+CONFIG_I2C_ALGOPCF=m
+CONFIG_I2C_ALGOPCA=m
+CONFIG_I2C_DESIGNWARE_PLATFORM=m
+CONFIG_I2C_GPIO=m
+CONFIG_I2C_OCORES=m
+CONFIG_I2C_PCA_PLATFORM=m
+CONFIG_I2C_SIMTEC=m
+CONFIG_I2C_XILINX=m
+CONFIG_I2C_DIOLAN_U2C=m
+CONFIG_I2C_PARPORT=m
+CONFIG_I2C_PARPORT_LIGHT=m
+CONFIG_I2C_TAOS_EVM=m
+CONFIG_I2C_TINY_USB=m
+CONFIG_I2C_STUB=m
+CONFIG_SPI_MASTER=y
+CONFIG_SPI_BITBANG=m
+CONFIG_SPI_BUTTERFLY=m
+CONFIG_SPI_GPIO=m
+CONFIG_SPI_LM70_LLP=m
+CONFIG_SPI_OC_TINY=m
+CONFIG_SPI_DESIGNWARE=m
+CONFIG_SPI_TLE62X0=m
+CONFIG_HSI=m
+CONFIG_HSI_BOARDINFO=y
+CONFIG_HSI_CHAR=m
+CONFIG_PPS=m
+CONFIG_PPS_CLIENT_PARPORT=m
+CONFIG_PPS_CLIENT_GPIO=m
+CONFIG_GPIOLIB=y
+CONFIG_GPIO_GENERIC=m
+CONFIG_GPIO_GENERIC_PLATFORM=m
+CONFIG_POWER_SUPPLY=y
+CONFIG_TEST_POWER=m
+CONFIG_HWMON=y
+CONFIG_HWMON_VID=m
+CONFIG_THERMAL=y
+CONFIG_THERMAL_HWMON=y
+CONFIG_WATCHDOG_CORE=y
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_SSB_POSSIBLE=y
+CONFIG_SSB=m
+CONFIG_SSB_BLOCKIO=y
+CONFIG_SSB_SDIOHOST_POSSIBLE=y
+CONFIG_SSB_SDIOHOST=y
+CONFIG_BCMA_POSSIBLE=y
+CONFIG_BCMA=m
+CONFIG_BCMA_BLOCKIO=y
+CONFIG_MFD_CORE=y
+CONFIG_MFD_88PM860X=y
+CONFIG_MFD_SM501=m
+CONFIG_HTC_EGPIO=y
+CONFIG_HTC_PASIC3=m
+CONFIG_HTC_I2CPLD=y
+CONFIG_MFD_STMPE=y
+CONFIG_STMPE_I2C=y
+CONFIG_STMPE_SPI=y
+CONFIG_MFD_WL1273_CORE=m
+CONFIG_REGULATOR=y
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
+CONFIG_REGULATOR_VIRTUAL_CONSUMER=m
+CONFIG_REGULATOR_USERSPACE_CONSUMER=m
+CONFIG_REGULATOR_GPIO=m
+CONFIG_DVB_CORE=m
+CONFIG_DVB_NET=y
+CONFIG_VIDEO_MEDIA=m
+CONFIG_MEDIA_SUPPORT=m
+CONFIG_VIDEO_DEV=m
+CONFIG_RC_CORE=m
+CONFIG_LIRC=m
+CONFIG_LIRC_SERIAL_TRANSMITTER=y
+CONFIG_RC_MAP=m
+CONFIG_IR_NEC_DECODER=m
+CONFIG_IR_JVC_DECODER=m
+CONFIG_IR_SONY_DECODER=m
+CONFIG_IR_SANYO_DECODER=m
+CONFIG_IR_LIRC_CODEC=m
+CONFIG_RC_ATI_REMOTE=m
+CONFIG_IR_IMON=m
+CONFIG_IR_MCEUSB=m
+CONFIG_IR_REDRAT3=m
+CONFIG_IR_STREAMZAP=m
+CONFIG_RC_LOOPBACK=m
+CONFIG_IR_GPIO_CIR=m
+CONFIG_MEDIA_ATTACH=y
+CONFIG_MEDIA_TUNER=m
+CONFIG_MEDIA_TUNER_SIMPLE=m
+CONFIG_VIDEOBUF_GEN=m
+CONFIG_VIDEOBUF_VMALLOC=m
+CONFIG_VIDEOBUF_DMA_CONTIG=m
+CONFIG_VIDEOBUF_DVB=m
+CONFIG_VIDEO_TVEEPROM=m
+CONFIG_VIDEO_TUNER=m
+CONFIG_V4L2_MEM2MEM_DEV=m
+CONFIG_VIDEOBUF2_DMA_CONTIG=m
+CONFIG_VIDEO_CAPTURE_DRIVERS=y
+CONFIG_VIDEO_IR_I2C=m
+CONFIG_VIDEO_TVAUDIO=m
+CONFIG_V4L_USB_DRIVERS=y
+CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y
+CONFIG_USB_GSPCA=m
+CONFIG_USB_M5602=m
+CONFIG_USB_STV06XX=m
+CONFIG_USB_GL860=m
+CONFIG_USB_GSPCA_BENQ=m
+CONFIG_USB_GSPCA_CONEX=m
+CONFIG_USB_GSPCA_CPIA1=m
+CONFIG_USB_GSPCA_ETOMS=m
+CONFIG_USB_GSPCA_FINEPIX=m
+CONFIG_USB_GSPCA_JEILINJ=m
+CONFIG_USB_GSPCA_JL2005BCD=m
+CONFIG_USB_GSPCA_KINECT=m
+CONFIG_USB_GSPCA_KONICA=m
+CONFIG_USB_GSPCA_MARS=m
+CONFIG_USB_GSPCA_MR97310A=m
+CONFIG_USB_GSPCA_NW80X=m
+CONFIG_USB_GSPCA_OV519=m
+CONFIG_USB_GSPCA_OV534=m
+CONFIG_USB_GSPCA_OV534_9=m
+CONFIG_USB_GSPCA_PAC207=m
+CONFIG_USB_GSPCA_PAC7302=m
+CONFIG_USB_GSPCA_PAC7311=m
+CONFIG_USB_GSPCA_SE401=m
+CONFIG_USB_GSPCA_SN9C2028=m
+CONFIG_USB_GSPCA_SN9C20X=m
+CONFIG_USB_GSPCA_SONIXB=m
+CONFIG_USB_GSPCA_SONIXJ=m
+CONFIG_USB_GSPCA_SPCA500=m
+CONFIG_USB_GSPCA_SPCA501=m
+CONFIG_USB_GSPCA_SPCA505=m
+CONFIG_USB_GSPCA_SPCA506=m
+CONFIG_USB_GSPCA_SPCA508=m
+CONFIG_USB_GSPCA_SPCA561=m
+CONFIG_USB_GSPCA_SPCA1528=m
+CONFIG_USB_GSPCA_SQ905=m
+CONFIG_USB_GSPCA_SQ905C=m
+CONFIG_USB_GSPCA_SQ930X=m
+CONFIG_USB_GSPCA_STK014=m
+CONFIG_USB_GSPCA_STV0680=m
+CONFIG_USB_GSPCA_SUNPLUS=m
+CONFIG_USB_GSPCA_T613=m
+CONFIG_USB_GSPCA_TOPRO=m
+CONFIG_USB_GSPCA_TV8532=m
+CONFIG_USB_GSPCA_VC032X=m
+CONFIG_USB_GSPCA_VICAM=m
+CONFIG_USB_GSPCA_XIRLINK_CIT=m
+CONFIG_USB_GSPCA_ZC3XX=m
+CONFIG_VIDEO_PVRUSB2=m
+CONFIG_VIDEO_PVRUSB2_SYSFS=y
+CONFIG_VIDEO_PVRUSB2_DVB=y
+CONFIG_VIDEO_HDPVR=m
+CONFIG_VIDEO_EM28XX=m
+CONFIG_VIDEO_EM28XX_ALSA=m
+CONFIG_VIDEO_EM28XX_DVB=m
+CONFIG_VIDEO_EM28XX_RC=y
+CONFIG_VIDEO_TLG2300=m
+CONFIG_VIDEO_CX231XX=m
+CONFIG_VIDEO_CX231XX_RC=y
+CONFIG_VIDEO_CX231XX_ALSA=m
+CONFIG_VIDEO_CX231XX_DVB=m
+CONFIG_VIDEO_TM6000=m
+CONFIG_VIDEO_TM6000_ALSA=m
+CONFIG_VIDEO_TM6000_DVB=m
+CONFIG_VIDEO_USBVISION=m
+CONFIG_USB_PWC=m
+CONFIG_USB_PWC_INPUT_EVDEV=y
+CONFIG_VIDEO_CPIA2=m
+CONFIG_USB_ZR364XX=m
+CONFIG_USB_STKWEBCAM=m
+CONFIG_USB_S2255=m
+CONFIG_V4L_ISA_PARPORT_DRIVERS=y
+CONFIG_VIDEO_BWQCAM=m
+CONFIG_VIDEO_CQCAM=m
+CONFIG_VIDEO_W9966=m
+CONFIG_V4L_PLATFORM_DRIVERS=y
+CONFIG_VIDEO_TIMBERDALE=m
+CONFIG_SOC_CAMERA=m
+CONFIG_SOC_CAMERA_PLATFORM=m
+CONFIG_VIDEO_SH_MOBILE_CSI2=m
+CONFIG_VIDEO_SH_MOBILE_CEU=m
+CONFIG_V4L_MEM2MEM_DRIVERS=y
+CONFIG_VIDEO_MEM2MEM_TESTDEV=m
+CONFIG_RADIO_ADAPTERS=y
+CONFIG_RADIO_SI470X=y
+CONFIG_USB_SI470X=m
+CONFIG_I2C_SI470X=m
+CONFIG_USB_MR800=m
+CONFIG_USB_DSBR=m
+CONFIG_I2C_SI4713=m
+CONFIG_RADIO_SI4713=m
+CONFIG_USB_KEENE=m
+CONFIG_RADIO_WL1273=m
+CONFIG_RADIO_WL128X=m
+CONFIG_DVB_MAX_ADAPTERS=8
+CONFIG_DVB_DYNAMIC_MINORS=y
+CONFIG_DVB_CAPTURE_DRIVERS=y
+CONFIG_TTPCI_EEPROM=m
+CONFIG_DVB_USB=m
+CONFIG_SMS_SIANO_MDTV=m
+CONFIG_SMS_USB_DRV=m
+CONFIG_SMS_SDIO_DRV=m
+CONFIG_DVB_B2C2_FLEXCOP=m
+CONFIG_DVB_B2C2_FLEXCOP_USB=m
+CONFIG_DVB_FE_CUSTOMISE=y
+CONFIG_DVB_PLL=m
+CONFIG_DRM=m
+CONFIG_DRM_USB=m
+CONFIG_DRM_KMS_HELPER=m
+CONFIG_DRM_LOAD_EDID_FIRMWARE=y
+CONFIG_DRM_I2C_CH7006=m
+CONFIG_DRM_I2C_SIL164=m
+CONFIG_DRM_UDL=m
+CONFIG_VIDEO_OUTPUT_CONTROL=m
+CONFIG_FB_SYS_FILLRECT=m
+CONFIG_FB_SYS_COPYAREA=m
+CONFIG_FB_SYS_IMAGEBLIT=m
+CONFIG_FB_SYS_FOPS=m
+CONFIG_FB_DEFERRED_IO=y
+CONFIG_FB_UVESA=m
+CONFIG_FB_S1D13XXX=m
+CONFIG_FB_TMIO=m
+CONFIG_FB_TMIO_ACCELL=y
+CONFIG_FB_SM501=m
+CONFIG_FB_SMSCUFX=m
+CONFIG_FB_UDL=m
+CONFIG_FB_METRONOME=m
+CONFIG_FB_BROADSHEET=m
+CONFIG_PANEL_LGPHILIPS_LB035Q02=m
+CONFIG_PANEL_SHARP_LS037V7DW01=y
+CONFIG_PANEL_NEC_NL8048HL11_01B=m
+CONFIG_PANEL_PICODLP=m
+CONFIG_PANEL_TPO_TD043MTEA1=y
+CONFIG_LCD_L4F00242T03=m
+CONFIG_LCD_LMS283GF05=m
+CONFIG_LCD_LTV350QV=m
+CONFIG_LCD_ILI9320=m
+CONFIG_LCD_TDO24M=m
+CONFIG_LCD_VGG2432A4=m
+CONFIG_LCD_S6E63M0=m
+CONFIG_LCD_LD9040=m
+CONFIG_LCD_AMS369FG06=m
+CONFIG_BACKLIGHT_CLASS_DEVICE=y
+CONFIG_BACKLIGHT_ATMEL_PWM=m
+CONFIG_BACKLIGHT_GENERIC=m
+CONFIG_BACKLIGHT_PWM=m
+CONFIG_DUMMY_CONSOLE=y
 CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
+CONFIG_FONTS=y
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+CONFIG_FONT_ACORN_8x8=y
+CONFIG_SOUND_OSS_CORE=y
+CONFIG_SOUND_OSS_CORE_PRECLAIM=y
+CONFIG_SND_JACK=y
+CONFIG_SND_SEQUENCER=m
+CONFIG_SND_SEQ_DUMMY=m
+CONFIG_SND_OSSEMUL=y
+CONFIG_SND_PCM_OSS_PLUGINS=y
+CONFIG_SND_HRTIMER=m
+CONFIG_SND_SEQ_HRTIMER_DEFAULT=y
+CONFIG_SND_DYNAMIC_MINORS=y
+CONFIG_SND_SUPPORT_OLD_API=y
+CONFIG_SND_VERBOSE_PROCFS=y
+CONFIG_SND_VMASTER=y
+CONFIG_SND_RAWMIDI_SEQ=m
+CONFIG_SND_MPU401_UART=m
+CONFIG_SND_DRIVERS=y
+CONFIG_SND_DUMMY=m
+CONFIG_SND_ALOOP=m
+CONFIG_SND_VIRMIDI=m
+CONFIG_SND_MTPAV=m
+CONFIG_SND_MTS64=m
+CONFIG_SND_SERIAL_U16550=m
+CONFIG_SND_MPU401=m
+CONFIG_SND_PORTMAN2X4=m
+CONFIG_SND_SPI=y
+CONFIG_SND_USB=y
+CONFIG_SND_USB_UA101=m
+CONFIG_SND_USB_CAIAQ=m
+CONFIG_SND_USB_CAIAQ_INPUT=y
+CONFIG_SND_USB_6FIRE=m
+CONFIG_HID_SUPPORT=y
+CONFIG_HID=m
+CONFIG_HIDRAW=y
+CONFIG_USB_HID=m
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_KBD=m
+CONFIG_USB_MOUSE=m
+CONFIG_HID_A4TECH=m
+CONFIG_HID_ACRUX=m
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=m
+CONFIG_HID_BELKIN=m
+CONFIG_HID_CHERRY=m
+CONFIG_HID_CHICONY=m
+CONFIG_HID_PRODIKEYS=m
+CONFIG_HID_CYPRESS=m
+CONFIG_HID_DRAGONRISE=m
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=m
+CONFIG_HID_ELECOM=m
+CONFIG_HID_EZKEY=m
+CONFIG_HID_HOLTEK=m
+CONFIG_HOLTEK_FF=y
+CONFIG_HID_KEYTOUCH=m
+CONFIG_HID_KYE=m
+CONFIG_HID_UCLOGIC=m
+CONFIG_HID_WALTOP=m
+CONFIG_HID_GYRATION=m
+CONFIG_HID_TWINHAN=m
+CONFIG_HID_KENSINGTON=m
+CONFIG_HID_LCPOWER=m
+CONFIG_HID_LOGITECH=m
+CONFIG_HID_LOGITECH_DJ=m
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIWHEELS_FF=y
+CONFIG_HID_MAGICMOUSE=m
+CONFIG_HID_MICROSOFT=m
+CONFIG_HID_MONTEREY=m
+CONFIG_HID_MULTITOUCH=m
+CONFIG_HID_NTRIG=m
+CONFIG_HID_ORTEK=m
+CONFIG_HID_PANTHERLORD=m
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=m
+CONFIG_HID_PICOLCD=m
+CONFIG_HID_PICOLCD_FB=y
+CONFIG_HID_PICOLCD_BACKLIGHT=y
+CONFIG_HID_PICOLCD_LCD=y
+CONFIG_HID_PICOLCD_LEDS=y
+CONFIG_HID_PRIMAX=m
+CONFIG_HID_ROCCAT=m
+CONFIG_HID_SAITEK=m
+CONFIG_HID_SAMSUNG=m
+CONFIG_HID_SONY=m
+CONFIG_HID_SPEEDLINK=m
+CONFIG_HID_SUNPLUS=m
+CONFIG_HID_GREENASIA=m
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=m
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=m
+CONFIG_HID_TOPSEED=m
+CONFIG_HID_THRUSTMASTER=m
+CONFIG_THRUSTMASTER_FF=y
+CONFIG_HID_WACOM=m
+CONFIG_HID_WACOM_POWER_SUPPLY=y
+CONFIG_HID_WIIMOTE=m
+CONFIG_HID_WIIMOTE_EXT=y
+CONFIG_HID_ZEROPLUS=m
+CONFIG_ZEROPLUS_FF=y
+CONFIG_HID_ZYDACRON=m
+CONFIG_USB_SUPPORT=y
+CONFIG_USB_SUSPEND=y
+CONFIG_USB_COMMON=y
+CONFIG_USB_OTG=y
+CONFIG_USB_WUSB_CBAF=m
+CONFIG_USB_C67X00_HCD=m
+CONFIG_USB_OXU210HP_HCD=m
+CONFIG_USB_ISP116X_HCD=m
+CONFIG_USB_ISP1760_HCD=m
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_OHCI_HCD_PLATFORM=y
+CONFIG_USB_OHCI_LITTLE_ENDIAN=y
+CONFIG_USB_U132_HCD=m
+CONFIG_USB_SL811_HCD=m
+CONFIG_USB_SL811_HCD_ISO=y
+CONFIG_USB_R8A66597_HCD=m
+CONFIG_USB_RENESAS_USBHS_HCD=m
+CONFIG_USB_RENESAS_USBHS=m
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_TMC=m
+CONFIG_USB_STORAGE_REALTEK=m
+CONFIG_REALTEK_AUTOPM=y
+CONFIG_USB_STORAGE_DATAFAB=m
+CONFIG_USB_STORAGE_FREECOM=m
+CONFIG_USB_STORAGE_ISD200=m
+CONFIG_USB_STORAGE_USBAT=m
+CONFIG_USB_STORAGE_SDDR09=m
+CONFIG_USB_STORAGE_SDDR55=m
+CONFIG_USB_STORAGE_JUMPSHOT=m
+CONFIG_USB_STORAGE_ALAUDA=m
+CONFIG_USB_STORAGE_ONETOUCH=m
+CONFIG_USB_STORAGE_KARMA=m
+CONFIG_USB_STORAGE_CYPRESS_ATACB=m
+CONFIG_USB_STORAGE_ENE_UB6250=m
+CONFIG_USB_UAS=m
+CONFIG_USB_MDC800=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_USS720=m
+CONFIG_USB_SERIAL=m
+CONFIG_USB_EZUSB=y
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_CP210X=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_QUALCOMM=m
+CONFIG_USB_SERIAL_SPCP8X5=m
+CONFIG_USB_SERIAL_HP4X=m
+CONFIG_USB_SERIAL_TI=m
+CONFIG_USB_SERIAL_DEBUG=m
+CONFIG_USB_EMI62=m
+CONFIG_USB_EMI26=m
+CONFIG_USB_ADUTUX=m
+CONFIG_USB_SEVSEG=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_LEGOTOWER=m
+CONFIG_USB_LCD=m
+CONFIG_USB_LED=m
+CONFIG_USB_CYPRESS_CY7C63=m
+CONFIG_USB_CYTHERM=m
+CONFIG_USB_IDMOUSE=m
+CONFIG_USB_FTDI_ELAN=m
+CONFIG_USB_APPLEDISPLAY=m
+CONFIG_USB_SISUSBVGA=m
+CONFIG_USB_LD=m
+CONFIG_USB_TRANCEVIBRATOR=m
+CONFIG_USB_IOWARRIOR=m
+CONFIG_USB_ISIGHTFW=m
+CONFIG_USB_YUREX=m
+CONFIG_USB_ATM=m
+CONFIG_USB_SPEEDTOUCH=m
+CONFIG_USB_CXACRU=m
+CONFIG_USB_UEAGLEATM=m
+CONFIG_USB_XUSBATM=m
+CONFIG_USB_GADGET_VBUS_DRAW=2
+CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2
+CONFIG_USB_ZERO=m
+CONFIG_USB_AUDIO=m
+CONFIG_GADGET_UAC1=y
+CONFIG_USB_ETH=m
+CONFIG_USB_ETH_RNDIS=y
+CONFIG_USB_G_NCM=m
+CONFIG_USB_GADGETFS=m
+CONFIG_USB_FUNCTIONFS=m
+CONFIG_USB_FUNCTIONFS_ETH=y
+CONFIG_USB_FUNCTIONFS_RNDIS=y
+CONFIG_USB_FUNCTIONFS_GENERIC=y
+CONFIG_USB_MASS_STORAGE=m
+CONFIG_USB_G_SERIAL=m
+CONFIG_USB_MIDI_GADGET=m
+CONFIG_USB_G_PRINTER=m
+CONFIG_USB_CDC_COMPOSITE=m
+CONFIG_USB_G_NOKIA=m
+CONFIG_USB_G_ACM_MS=m
+CONFIG_USB_G_MULTI=m
+CONFIG_USB_G_MULTI_RNDIS=y
+CONFIG_USB_G_MULTI_CDC=y
+CONFIG_USB_G_HID=m
+CONFIG_USB_G_DBGP=m
+CONFIG_USB_G_DBGP_SERIAL=y
+CONFIG_USB_G_WEBCAM=m
+CONFIG_USB_OTG_UTILS=y
+CONFIG_USB_GPIO_VBUS=y
+CONFIG_USB_ULPI=y
+CONFIG_NOP_USB_XCEIV=y
+CONFIG_MMC_BLOCK=y
+CONFIG_MMC_BLOCK_MINORS=8
+CONFIG_MMC_BLOCK_BOUNCE=y
+CONFIG_MMC_SDHCI=y
+CONFIG_MMC_SDHCI_PLTFM=m
+CONFIG_MMC_SDHCI_PXAV3=m
+CONFIG_MMC_SDHCI_PXAV2=m
+CONFIG_MMC_SPI=m
+CONFIG_MMC_TMIO_CORE=m
+CONFIG_MMC_TMIO=m
+CONFIG_MMC_DW=m
+CONFIG_MMC_DW_PLTFM=m
+CONFIG_MMC_VUB300=m
+CONFIG_MMC_USHC=m
+CONFIG_MEMSTICK=m
+CONFIG_MSPRO_BLOCK=m
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=y
+CONFIG_LEDS_PCA9532=m
+CONFIG_LEDS_PCA9532_GPIO=y
+CONFIG_LEDS_GPIO=m
+CONFIG_LEDS_LP3944=m
+CONFIG_LEDS_LP5521=m
+CONFIG_LEDS_LP5523=m
+CONFIG_LEDS_PWM=m
+CONFIG_LEDS_REGULATOR=m
+CONFIG_LEDS_TRIGGERS=y
+CONFIG_LEDS_TRIGGER_TIMER=m
+CONFIG_LEDS_TRIGGER_HEARTBEAT=m
+CONFIG_LEDS_TRIGGER_BACKLIGHT=m
+CONFIG_LEDS_TRIGGER_GPIO=m
+CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
+CONFIG_RTC_DRV_CMOS=y
+CONFIG_RTC_HCTOSYS=y
+CONFIG_RTC_HCTOSYS_DEVICE="rtc0"
+CONFIG_RTC_INTF_SYSFS=y
+CONFIG_RTC_INTF_PROC=y
+CONFIG_RTC_INTF_DEV=y
+CONFIG_DMADEVICES=y
+CONFIG_DW_DMAC=m
+CONFIG_TIMB_DMA=m
+CONFIG_DMA_ENGINE=y
+CONFIG_NET_DMA=y
+CONFIG_ASYNC_TX_DMA=y
+CONFIG_AUXDISPLAY=y
+CONFIG_UIO=m
+CONFIG_UIO_PDRV=m
+CONFIG_UIO_PDRV_GENIRQ=m
+CONFIG_STAGING=y
+CONFIG_USBIP_CORE=m
+CONFIG_USBIP_VHCI_HCD=m
+CONFIG_USBIP_HOST=m
+CONFIG_W35UND=m
+CONFIG_PRISM2_USB=m
+CONFIG_ECHO=m
+CONFIG_ASUS_OLED=m
+CONFIG_PANEL=m
+CONFIG_PANEL_PARPORT=0
+CONFIG_PANEL_PROFILE=5
+CONFIG_RTLLIB=m
+CONFIG_RTLLIB_CRYPTO_CCMP=m
+CONFIG_RTLLIB_CRYPTO_TKIP=m
+CONFIG_RTLLIB_CRYPTO_WEP=m
+CONFIG_R8712U=m
+CONFIG_RTS5139=m
+CONFIG_TRANZPORT=m
+CONFIG_LINE6_USB=m
+CONFIG_USB_SERIAL_QUATECH2=m
+CONFIG_USB_SERIAL_QUATECH_USB2=m
+CONFIG_IIO=m
+CONFIG_IIO_ST_HWMON=m
+CONFIG_IIO_BUFFER=y
+CONFIG_IIO_SW_RING=m
+CONFIG_IIO_KFIFO_BUF=m
+CONFIG_IIO_TRIGGER=y
+CONFIG_IIO_CONSUMERS_PER_TRIGGER=2
+CONFIG_KXSD9=m
+CONFIG_SCA3000=m
+CONFIG_IIO_PERIODIC_RTC_TRIGGER=m
+CONFIG_IIO_GPIO_TRIGGER=m
+CONFIG_IIO_SYSFS_TRIGGER=m
+CONFIG_IIO_SIMPLE_DUMMY=m
+CONFIG_FB_SM7XX=m
+CONFIG_USB_ENESTORAGE=m
+CONFIG_BCM_WIMAX=m
+CONFIG_FT1000=m
+CONFIG_FT1000_USB=m
+CONFIG_SPEAKUP=m
+CONFIG_SPEAKUP_SYNTH_SPKOUT=m
+CONFIG_SPEAKUP_SYNTH_TXPRT=m
+CONFIG_SPEAKUP_SYNTH_DUMMY=m
+CONFIG_STAGING_MEDIA=y
+CONFIG_DVB_AS102=m
+CONFIG_EASYCAP=m
+CONFIG_LIRC_STAGING=y
+CONFIG_LIRC_IGORPLUGUSB=m
+CONFIG_LIRC_IMON=m
+CONFIG_LIRC_PARALLEL=m
+CONFIG_LIRC_SASEM=m
+CONFIG_LIRC_SERIAL=m
+CONFIG_LIRC_SIR=m
+CONFIG_LIRC_TTUSBIR=m
+CONFIG_LIRC_ZILOG=m
+CONFIG_PHONE=m
+CONFIG_USB_WPAN_HCD=m
+CONFIG_CLKDEV_LOOKUP=y
+CONFIG_IOMMU_SUPPORT=y
+CONFIG_VIRT_DRIVERS=y
+CONFIG_PM_DEVFREQ=y
+CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y
+CONFIG_DEVFREQ_GOV_PERFORMANCE=y
+CONFIG_DEVFREQ_GOV_POWERSAVE=y
+CONFIG_DEVFREQ_GOV_USERSPACE=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT3_DEFAULTS_TO_ORDERED=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_EXT4_FS_XATTR=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_JBD=y
+CONFIG_JBD2=y
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=m
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_SECURITY=y
+CONFIG_JFS_STATISTICS=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_QUOTA=y
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_XFS_RT=y
+CONFIG_GFS2_FS=m
+CONFIG_GFS2_FS_LOCKING_DLM=y
+CONFIG_OCFS2_FS=m
+CONFIG_OCFS2_FS_O2CB=m
+CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m
+CONFIG_OCFS2_FS_STATS=y
+CONFIG_OCFS2_DEBUG_MASKLOG=y
+CONFIG_NILFS2_FS=m
+CONFIG_FS_POSIX_ACL=y
+CONFIG_EXPORTFS=y
+CONFIG_FILE_LOCKING=y
+CONFIG_FSNOTIFY=y
+CONFIG_DNOTIFY=y
+CONFIG_INOTIFY_USER=y
+CONFIG_FANOTIFY=y
+CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+CONFIG_PRINT_QUOTA_WARNING=y
+CONFIG_QFMT_V1=m
+CONFIG_QUOTACTL=y
+CONFIG_AUTOFS4_FS=m
+CONFIG_FUSE_FS=y
+CONFIG_CUSE=m
+CONFIG_GENERIC_ACL=y
+CONFIG_FSCACHE=m
+CONFIG_FSCACHE_STATS=y
+CONFIG_FSCACHE_HISTOGRAM=y
+CONFIG_CACHEFILES=m
+CONFIG_ISO9660_FS=m
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_UDF_FS=m
+CONFIG_UDF_NLS=y
+CONFIG_FAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+CONFIG_NTFS_FS=m
+CONFIG_PROC_FS=y
+CONFIG_PROC_VMCORE=y
+CONFIG_PROC_SYSCTL=y
+CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_SYSFS=y
 CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_TMPFS_XATTR=y
+CONFIG_CONFIGFS_FS=m
+CONFIG_MISC_FILESYSTEMS=y
+CONFIG_ADFS_FS=m
+CONFIG_AFFS_FS=m
+CONFIG_HFS_FS=m
+CONFIG_HFSPLUS_FS=m
+CONFIG_BEFS_FS=m
+CONFIG_BFS_FS=m
+CONFIG_EFS_FS=m
+CONFIG_JFFS2_FS_DEBUG=0
+CONFIG_JFFS2_FS_WRITEBUFFER=y
+CONFIG_JFFS2_ZLIB=y
+CONFIG_JFFS2_RTIME=y
+CONFIG_JFFS2_CMODE_FAVOURLZO=y
+CONFIG_SQUASHFS=m
+CONFIG_SQUASHFS_XATTR=y
+CONFIG_SQUASHFS_ZLIB=y
+CONFIG_SQUASHFS_LZO=y
+CONFIG_SQUASHFS_XZ=y
+CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3
+CONFIG_VXFS_FS=m
+CONFIG_MINIX_FS=m
+CONFIG_OMFS_FS=m
+CONFIG_HPFS_FS=m
+CONFIG_QNX4FS_FS=m
+CONFIG_QNX6FS_FS=m
+CONFIG_ROMFS_FS=m
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+CONFIG_ROMFS_ON_BLOCK=y
+CONFIG_PSTORE=y
+CONFIG_SYSV_FS=m
+CONFIG_UFS_FS=m
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_FSCACHE=y
+CONFIG_NFS_USE_KERNEL_DNS=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V2_ACL=y
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_V3_ACL=y
+CONFIG_NFSD_V4=y
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+CONFIG_NFS_ACL_SUPPORT=m
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=m
+CONFIG_SUNRPC_GSS=m
+CONFIG_RPCSEC_GSS_KRB5=m
+CONFIG_CEPH_FS=m
+CONFIG_CIFS=m
+CONFIG_CIFS_WEAK_PW_HASH=y
+CONFIG_CIFS_UPCALL=y
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+CONFIG_CIFS_DFS_UPCALL=y
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+CONFIG_CODA_FS=m
+CONFIG_AFS_FS=m
+CONFIG_9P_FS=m
+CONFIG_9P_FS_POSIX_ACL=y
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ASCII=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+CONFIG_DLM=m
+CONFIG_DEFAULT_MESSAGE_LOGLEVEL=4
+CONFIG_FRAME_WARN=1024
+CONFIG_UNUSED_SYMBOLS=y
+CONFIG_DEBUG_FS=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_LOCKUP_DETECTOR=y
+CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
+CONFIG_SCHED_DEBUG=y
+CONFIG_STACKTRACE=y
+CONFIG_DEBUG_BUGVERBOSE=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_BOOT_PRINTK_DELAY=y
+CONFIG_NOP_TRACER=y
+CONFIG_RING_BUFFER=y
+CONFIG_EVENT_TRACING=y
+CONFIG_EVENT_POWER_TRACING_DEPRECATED=y
+CONFIG_CONTEXT_SWITCH_TRACER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
+CONFIG_FTRACE=y
+CONFIG_BRANCH_PROFILE_NONE=y
+CONFIG_KPROBE_EVENT=y
+CONFIG_ASYNC_RAID6_TEST=m
+CONFIG_KGDB=y
+CONFIG_KGDB_SERIAL_CONSOLE=y
+CONFIG_KGDB_KDB=y
+CONFIG_KDB_KEYBOARD=y
+CONFIG_TEST_KSTRTOX=m
 CONFIG_STRICT_DEVMEM=y
+CONFIG_DEBUG_USER=y
+CONFIG_DEBUG_LL=y
+CONFIG_DEBUG_LL_UART_NONE=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_ENCRYPTED_KEYS=y
 CONFIG_SECURITY=y
+CONFIG_SECURITYFS=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_PATH=y
 CONFIG_LSM_MMAP_MIN_ADDR=0
 CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0
+CONFIG_SECURITY_SELINUX_DISABLE=y
+CONFIG_SECURITY_SELINUX_DEVELOP=y
+CONFIG_SECURITY_SELINUX_AVC_STATS=y
+CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
 CONFIG_SECURITY_SMACK=y
+CONFIG_SECURITY_TOMOYO=y
+CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048
+CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024
+CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init"
+CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init"
 CONFIG_SECURITY_APPARMOR=y
+CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1
+CONFIG_SECURITY_YAMA=y
+CONFIG_INTEGRITY=y
+CONFIG_INTEGRITY_SIGNATURE=y
+CONFIG_EVM=y
 CONFIG_DEFAULT_SECURITY_APPARMOR=y
+CONFIG_DEFAULT_SECURITY="apparmor"
+CONFIG_XOR_BLOCKS=m
+CONFIG_ASYNC_CORE=m
+CONFIG_ASYNC_MEMCPY=m
+CONFIG_ASYNC_XOR=m
+CONFIG_ASYNC_PQ=m
+CONFIG_ASYNC_RAID6_RECOV=m
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ALGAPI2=y
+CONFIG_CRYPTO_AEAD=m
+CONFIG_CRYPTO_AEAD2=y
+CONFIG_CRYPTO_BLKCIPHER=y
+CONFIG_CRYPTO_BLKCIPHER2=y
+CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_HASH2=y
+CONFIG_CRYPTO_RNG=y
+CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=m
+CONFIG_CRYPTO_PCOMP2=y
+CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_MANAGER2=y
+CONFIG_CRYPTO_USER=m
+CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y
+CONFIG_CRYPTO_GF128MUL=m
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_WORKQUEUE=y
+CONFIG_CRYPTO_CRYPTD=m
+CONFIG_CRYPTO_AUTHENC=m
+CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_CCM=m
+CONFIG_CRYPTO_GCM=m
+CONFIG_CRYPTO_SEQIV=m
+CONFIG_CRYPTO_CBC=y
+CONFIG_CRYPTO_CTR=m
+CONFIG_CRYPTO_CTS=m
+CONFIG_CRYPTO_ECB=y
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_XTS=m
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_XCBC=m
+CONFIG_CRYPTO_VMAC=m
+CONFIG_CRYPTO_CRC32C=y
+CONFIG_CRYPTO_GHASH=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_RMD128=m
+CONFIG_CRYPTO_RMD160=m
+CONFIG_CRYPTO_RMD256=m
+CONFIG_CRYPTO_RMD320=m
+CONFIG_CRYPTO_SHA1=y
+CONFIG_CRYPTO_SHA256=y
+CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_AES=y
+CONFIG_CRYPTO_ANUBIS=m
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_BLOWFISH_COMMON=m
+CONFIG_CRYPTO_CAMELLIA=m
+CONFIG_CRYPTO_CAST5=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
+CONFIG_CRYPTO_FCRYPT=m
+CONFIG_CRYPTO_KHAZAD=m
+CONFIG_CRYPTO_SALSA20=m
+CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_TEA=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_TWOFISH_COMMON=m
+CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_ZLIB=m
+CONFIG_CRYPTO_LZO=m
+CONFIG_CRYPTO_ANSI_CPRNG=m
+CONFIG_CRYPTO_USER_API=m
+CONFIG_CRYPTO_USER_API_HASH=m
+CONFIG_CRYPTO_USER_API_SKCIPHER=m
+CONFIG_CRYPTO_HW=y
+CONFIG_BINARY_PRINTF=y
+CONFIG_RAID6_PQ=m
+CONFIG_BITREVERSE=y
+CONFIG_GENERIC_IO=y
+CONFIG_CRC16=y
+CONFIG_CRC32=y
+CONFIG_CRC32_SLICEBY8=y
+CONFIG_CRC8=m
+CONFIG_AUDIT_GENERIC=y
+CONFIG_ZLIB_INFLATE=y
+CONFIG_LZO_COMPRESS=y
+CONFIG_LZO_DECOMPRESS=y
+CONFIG_XZ_DEC=y
+CONFIG_XZ_DEC_X86=y
+CONFIG_XZ_DEC_POWERPC=y
+CONFIG_XZ_DEC_IA64=y
+CONFIG_XZ_DEC_ARM=y
+CONFIG_XZ_DEC_ARMTHUMB=y
+CONFIG_XZ_DEC_SPARC=y
+CONFIG_XZ_DEC_BCJ=y
+CONFIG_XZ_DEC_TEST=m
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
+CONFIG_DECOMPRESS_XZ=y
+CONFIG_DECOMPRESS_LZO=y
+CONFIG_REED_SOLOMON=y
+CONFIG_REED_SOLOMON_ENC8=y
+CONFIG_REED_SOLOMON_DEC8=y
+CONFIG_REED_SOLOMON_DEC16=y
+CONFIG_BCH=y
+CONFIG_BCH_CONST_PARAMS=y
+CONFIG_TEXTSEARCH=y
+CONFIG_TEXTSEARCH_KMP=m
+CONFIG_TEXTSEARCH_BM=m
+CONFIG_TEXTSEARCH_FSM=m
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
+CONFIG_DQL=y
+CONFIG_NLATTR=y
+CONFIG_LRU_CACHE=m
+CONFIG_AVERAGE=y
+CONFIG_CLZ_TAB=y
+CONFIG_CORDIC=m
+CONFIG_MPILIB=y
+CONFIG_SIGNATURE=y
-- 
cgit v1.2.3


From 9df91136a54e350b748fcdbb9c04dba408b07b52 Mon Sep 17 00:00:00 2001
From: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
Date: Mon, 25 Jun 2012 15:41:35 -0300
Subject: configs: ubuntu: be compatible with the enforce script

Signed-off-by: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
---
 linaro/configs/ubuntu.conf | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/linaro/configs/ubuntu.conf b/linaro/configs/ubuntu.conf
index 03a01419e10e..98af7523fc20 100644
--- a/linaro/configs/ubuntu.conf
+++ b/linaro/configs/ubuntu.conf
@@ -1,3 +1,8 @@
+# CONFIG_COMPAT_BRK is not set
+# CONFIG_DEVKMEM is not set
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_INIT_PASS_ALL_PARAMS=y
+CONFIG_DEBUG_RODATA=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 CONFIG_KERNEL_GZIP=y
 CONFIG_SWAP=y
@@ -13,7 +18,7 @@ CONFIG_AUDIT=y
 CONFIG_AUDITSYSCALL=y
 CONFIG_AUDIT_WATCH=y
 CONFIG_AUDIT_TREE=y
-CONFIG_AUDIT_LOGINUID_IMMUTABLE=y
+# CONFIG_AUDIT_LOGINUID_IMMUTABLE is not set
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_GENERIC_IRQ_CHIP=y
 CONFIG_IRQ_DOMAIN=y
@@ -964,7 +969,7 @@ CONFIG_ATH6KL_SDIO=m
 CONFIG_ATH6KL_USB=m
 CONFIG_B43=m
 CONFIG_B43_BCMA=y
-CONFIG_B43_BCMA_EXTRA=y
+# CONFIG_B43_BCMA_EXTRA is not set
 CONFIG_B43_SSB=y
 CONFIG_B43_BCMA_PIO=y
 CONFIG_B43_PIO=y
-- 
cgit v1.2.3


From 9ed24c6d0e3def975aeb74e1780b43ad6c0f2dbf Mon Sep 17 00:00:00 2001
From: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
Date: Mon, 25 Jun 2012 16:06:19 -0300
Subject: configs: ubuntu: disabling CODA_FS, seems to be broken atm

Signed-off-by: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
---
 linaro/configs/ubuntu.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linaro/configs/ubuntu.conf b/linaro/configs/ubuntu.conf
index 98af7523fc20..55aff9c42987 100644
--- a/linaro/configs/ubuntu.conf
+++ b/linaro/configs/ubuntu.conf
@@ -1926,7 +1926,7 @@ CONFIG_NCPFS_NFS_NS=y
 CONFIG_NCPFS_OS2_NS=y
 CONFIG_NCPFS_NLS=y
 CONFIG_NCPFS_EXTRAS=y
-CONFIG_CODA_FS=m
+# CONFIG_CODA_FS is not set
 CONFIG_AFS_FS=m
 CONFIG_9P_FS=m
 CONFIG_9P_FS_POSIX_ACL=y
-- 
cgit v1.2.3


From 072bd6cb29aa25a3fe0f6be08af48f5a58eae849 Mon Sep 17 00:00:00 2001
From: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
Date: Mon, 25 Jun 2012 18:06:06 -0300
Subject: configs: ubuntu: disabling CGROUPS as default

Not yet stable enough to be used as default for all boards we currently
support at Linaro.

Signed-off-by: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
---
 linaro/configs/ubuntu.conf | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/linaro/configs/ubuntu.conf b/linaro/configs/ubuntu.conf
index 55aff9c42987..04ef57a73588 100644
--- a/linaro/configs/ubuntu.conf
+++ b/linaro/configs/ubuntu.conf
@@ -22,20 +22,8 @@ CONFIG_AUDIT_TREE=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_GENERIC_IRQ_CHIP=y
 CONFIG_IRQ_DOMAIN=y
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_DEVICE=y
-CONFIG_CPUSETS=y
-CONFIG_PROC_PID_CPUSET=y
-CONFIG_CGROUP_CPUACCT=y
+# CONFIG_CGROUPS is not set
 CONFIG_RESOURCE_COUNTERS=y
-CONFIG_CGROUP_MEM_RES_CTLR=y
-CONFIG_CGROUP_PERF=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_FAIR_GROUP_SCHED=y
-CONFIG_CFS_BANDWIDTH=y
-CONFIG_RT_GROUP_SCHED=y
-CONFIG_BLK_CGROUP=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_NAMESPACES=y
 CONFIG_UTS_NS=y
@@ -43,7 +31,7 @@ CONFIG_IPC_NS=y
 CONFIG_USER_NS=y
 CONFIG_PID_NS=y
 CONFIG_NET_NS=y
-CONFIG_SCHED_AUTOGROUP=y
+# CONFIG_SCHED_AUTOGROUP is not set
 CONFIG_MM_OWNER=y
 CONFIG_RELAY=y
 CONFIG_INITRAMFS_SOURCE=""
@@ -100,7 +88,6 @@ CONFIG_EFI_PARTITION=y
 CONFIG_IOSCHED_NOOP=y
 CONFIG_IOSCHED_DEADLINE=y
 CONFIG_IOSCHED_CFQ=y
-CONFIG_CFQ_GROUP_IOSCHED=y
 CONFIG_DEFAULT_CFQ=y
 CONFIG_DEFAULT_IOSCHED="cfq"
 CONFIG_FREEZER=y
@@ -509,7 +496,6 @@ CONFIG_CLS_U32_MARK=y
 CONFIG_NET_CLS_RSVP=m
 CONFIG_NET_CLS_RSVP6=m
 CONFIG_NET_CLS_FLOW=m
-CONFIG_NET_CLS_CGROUP=m
 CONFIG_NET_EMATCH=y
 CONFIG_NET_EMATCH_STACK=32
 CONFIG_NET_EMATCH_CMP=m
@@ -533,7 +519,6 @@ CONFIG_DCB=y
 CONFIG_DNS_RESOLVER=y
 CONFIG_BATMAN_ADV=m
 CONFIG_OPENVSWITCH=m
-CONFIG_NETPRIO_CGROUP=m
 CONFIG_BQL=y
 CONFIG_BPF_JIT=y
 CONFIG_NET_PKTGEN=m
-- 
cgit v1.2.3


From a1ba2370f6d62cf421fda959138f96eea7dff8bf Mon Sep 17 00:00:00 2001
From: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
Date: Mon, 25 Jun 2012 20:09:28 -0300
Subject: configs: ubuntu: updating configs for 3.5

Signed-off-by: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
---
 linaro/configs/ubuntu.conf | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/linaro/configs/ubuntu.conf b/linaro/configs/ubuntu.conf
index 04ef57a73588..202a0a2b68fc 100644
--- a/linaro/configs/ubuntu.conf
+++ b/linaro/configs/ubuntu.conf
@@ -23,7 +23,6 @@ CONFIG_GENERIC_HARDIRQS=y
 CONFIG_GENERIC_IRQ_CHIP=y
 CONFIG_IRQ_DOMAIN=y
 # CONFIG_CGROUPS is not set
-CONFIG_RESOURCE_COUNTERS=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_NAMESPACES=y
 CONFIG_UTS_NS=y
@@ -32,7 +31,6 @@ CONFIG_USER_NS=y
 CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 # CONFIG_SCHED_AUTOGROUP is not set
-CONFIG_MM_OWNER=y
 CONFIG_RELAY=y
 CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
@@ -77,7 +75,6 @@ CONFIG_LBDAF=y
 CONFIG_BLK_DEV_BSG=y
 CONFIG_BLK_DEV_BSGLIB=y
 CONFIG_BLK_DEV_INTEGRITY=y
-CONFIG_BLK_DEV_THROTTLING=y
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_MSDOS_PARTITION=y
 CONFIG_BSD_DISKLABEL=y
@@ -98,8 +95,6 @@ CONFIG_PREEMPT_VOLUNTARY=y
 CONFIG_HZ=128
 CONFIG_AEABI=y
 CONFIG_SELECT_MEMORY_MODEL=y
-CONFIG_FLATMEM_MANUAL=y
-CONFIG_FLATMEM=y
 CONFIG_FLAT_NODE_MEM_MAP=y
 CONFIG_PAGEFLAGS_EXTENDED=y
 CONFIG_ZONE_DMA_FLAG=0
@@ -458,9 +453,6 @@ CONFIG_IPDDP_ENCAP=y
 CONFIG_IPDDP_DECAP=y
 CONFIG_X25=m
 CONFIG_LAPB=m
-CONFIG_ECONET=m
-CONFIG_ECONET_AUNUDP=y
-CONFIG_ECONET_NATIVE=y
 CONFIG_WAN_ROUTER=m
 CONFIG_PHONET=m
 CONFIG_IEEE802154=m
@@ -623,6 +615,7 @@ CONFIG_WIRELESS_EXT_SYSFS=y
 CONFIG_LIB80211_CRYPT_WEP=m
 CONFIG_LIB80211_CRYPT_CCMP=m
 CONFIG_LIB80211_CRYPT_TKIP=m
+CONFIG_MAC80211=m
 CONFIG_MAC80211_MESH=y
 CONFIG_MAC80211_LEDS=y
 CONFIG_MAC80211_DEBUGFS=y
@@ -777,7 +770,6 @@ CONFIG_SENSORS_BH1770=m
 CONFIG_SENSORS_APDS990X=m
 CONFIG_HMC6352=m
 CONFIG_DS1682=m
-CONFIG_BMP085=m
 CONFIG_USB_SWITCH_FSA9480=m
 CONFIG_C2PORT=m
 CONFIG_EEPROM_AT24=m
@@ -934,7 +926,6 @@ CONFIG_USB_ZD1201=m
 CONFIG_USB_NET_RNDIS_WLAN=m
 CONFIG_RTL8187=m
 CONFIG_RTL8187_LEDS=y
-CONFIG_MAC80211_HWSIM=m
 CONFIG_ATH_COMMON=m
 CONFIG_ATH9K_HW=m
 CONFIG_ATH9K_COMMON=m
-- 
cgit v1.2.3


From 79986376c9b456855cc7a25d32d1f4f2b61b07d8 Mon Sep 17 00:00:00 2001
From: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
Date: Mon, 25 Jun 2012 20:54:36 -0300
Subject: configs: ubuntu: disabling CONFIG_MTD_NAND_NANDSIM=m, breaking build

Signed-off-by: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
---
 linaro/configs/ubuntu.conf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/linaro/configs/ubuntu.conf b/linaro/configs/ubuntu.conf
index 202a0a2b68fc..5b9d8ad2b235 100644
--- a/linaro/configs/ubuntu.conf
+++ b/linaro/configs/ubuntu.conf
@@ -713,7 +713,6 @@ CONFIG_MTD_NAND_IDS=y
 CONFIG_MTD_NAND_DISKONCHIP=m
 CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0
 CONFIG_MTD_NAND_DOCG4=m
-CONFIG_MTD_NAND_NANDSIM=m
 CONFIG_MTD_NAND_PLATFORM=m
 CONFIG_MTD_ALAUDA=m
 CONFIG_MTD_ONENAND_GENERIC=m
-- 
cgit v1.2.3


From f1a83aae64363f6576065e3a51e804efe248af12 Mon Sep 17 00:00:00 2001
From: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
Date: Mon, 25 Jun 2012 23:22:55 -0300
Subject: config: ubuntu: ATH6KL should be platform dependent

Signed-off-by: Ricardo Salveti de Araujo <ricardo.salveti@linaro.org>
---
 linaro/configs/ubuntu.conf | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/linaro/configs/ubuntu.conf b/linaro/configs/ubuntu.conf
index 5b9d8ad2b235..5d0a37296716 100644
--- a/linaro/configs/ubuntu.conf
+++ b/linaro/configs/ubuntu.conf
@@ -939,9 +939,6 @@ CONFIG_CARL9170=m
 CONFIG_CARL9170_LEDS=y
 CONFIG_CARL9170_WPC=y
 CONFIG_CARL9170_HWRNG=y
-CONFIG_ATH6KL=m
-CONFIG_ATH6KL_SDIO=m
-CONFIG_ATH6KL_USB=m
 CONFIG_B43=m
 CONFIG_B43_BCMA=y
 # CONFIG_B43_BCMA_EXTRA is not set
-- 
cgit v1.2.3


From 00fd9de8c2a3c7220b72b7d462203a83d7fde00c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 10 Jul 2012 14:47:10 +0100
Subject: configs: Add config fragments for big LITTLE MP

This patch adds config fragments used to enable most of the features used by
big LITTLE MP.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 linaro/configs/big-LITTLE-MP.conf | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 linaro/configs/big-LITTLE-MP.conf

diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
new file mode 100644
index 000000000000..257684574064
--- /dev/null
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -0,0 +1,4 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_NO_HZ=y
-- 
cgit v1.2.3


From cd5b11b15a4c9e142944668ce6efc44e261e9665 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 11 Jul 2012 09:55:22 +0100
Subject: linaro/configs: Update big LITTLE MP fragment for task placement work

CONFIG_HMP_FAST_CPU_MASK and CONFIG_HMP_SLOW_CPU_MASK must be set correctly for
each user platform. For now they are set to "0-1" and "2-3" respectively.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 linaro/configs/big-LITTLE-MP.conf | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/linaro/configs/big-LITTLE-MP.conf b/linaro/configs/big-LITTLE-MP.conf
index 257684574064..df35474eff10 100644
--- a/linaro/configs/big-LITTLE-MP.conf
+++ b/linaro/configs/big-LITTLE-MP.conf
@@ -2,3 +2,8 @@ CONFIG_CGROUPS=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
 CONFIG_NO_HZ=y
+CONFIG_SCHED_MC=y
+CONFIG_DISABLE_CPU_SCHED_DOMAIN_BALANCE=y
+CONFIG_SCHED_HMP=y
+CONFIG_HMP_FAST_CPU_MASK="0-1"
+CONFIG_HMP_SLOW_CPU_MASK="2-3"
-- 
cgit v1.2.3


From 75a9cda78398c0ca6356a27496f3382de508b687 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 12 Jul 2012 14:46:37 -0700
Subject: workqueue: don't use WQ_HIGHPRI for unbound workqueues

Unbound wqs aren't concurrency-managed and try to execute work items
as soon as possible.  This is currently achieved by implicitly setting
%WQ_HIGHPRI on all unbound workqueues; however, WQ_HIGHPRI
implementation is about to be restructured and this usage won't be
valid anymore.

Add an explicit chain-wakeup path for unbound workqueues in
process_one_work() instead of piggybacking on %WQ_HIGHPRI.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9a3128dc67df..27637c284cb9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -580,6 +580,10 @@ static bool __need_more_worker(struct global_cwq *gcwq)
 /*
  * Need to wake up a worker?  Called from anything but currently
  * running workers.
+ *
+ * Note that, because unbound workers never contribute to nr_running, this
+ * function will always return %true for unbound gcwq as long as the
+ * worklist isn't empty.
  */
 static bool need_more_worker(struct global_cwq *gcwq)
 {
@@ -1867,6 +1871,13 @@ __acquires(&gcwq->lock)
 	if (unlikely(cpu_intensive))
 		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
 
+	/*
+	 * Unbound gcwq isn't concurrency managed and work items should be
+	 * executed ASAP.  Wake up another worker if necessary.
+	 */
+	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(gcwq))
+		wake_up_worker(gcwq);
+
 	spin_unlock_irq(&gcwq->lock);
 
 	work_clear_pending(work);
@@ -2984,13 +2995,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 	if (flags & WQ_MEM_RECLAIM)
 		flags |= WQ_RESCUER;
 
-	/*
-	 * Unbound workqueues aren't concurrency managed and should be
-	 * dispatched to workers immediately.
-	 */
-	if (flags & WQ_UNBOUND)
-		flags |= WQ_HIGHPRI;
-
 	max_active = max_active ?: WQ_DFL_ACTIVE;
 	max_active = wq_clamp_max_active(max_active, flags, wq->name);
 
-- 
cgit v1.2.3


From 68ec0ac91ca344bd4d29d867e3155b8d647413f7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 12 Jul 2012 14:46:37 -0700
Subject: workqueue: factor out worker_pool from global_cwq

Move worklist and all worker management fields from global_cwq into
the new struct worker_pool.  worker_pool points back to the containing
gcwq.  worker and cpu_workqueue_struct are updated to point to
worker_pool instead of gcwq too.

This change is mechanical and doesn't introduce any functional
difference other than the rearrangement of fields and an added level of
indirection in some places.  This is to prepare for multiple pools per
gcwq.

v2: Comment typo fixes as suggested by Namhyung.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
---
 include/trace/events/workqueue.h |   2 +-
 kernel/workqueue.c               | 216 +++++++++++++++++++++------------------
 2 files changed, 118 insertions(+), 100 deletions(-)

diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h
index 4018f5058f27..f28d1b65f178 100644
--- a/include/trace/events/workqueue.h
+++ b/include/trace/events/workqueue.h
@@ -54,7 +54,7 @@ TRACE_EVENT(workqueue_queue_work,
 		__entry->function	= work->func;
 		__entry->workqueue	= cwq->wq;
 		__entry->req_cpu	= req_cpu;
-		__entry->cpu		= cwq->gcwq->cpu;
+		__entry->cpu		= cwq->pool->gcwq->cpu;
 	),
 
 	TP_printk("work struct=%p function=%pf workqueue=%p req_cpu=%u cpu=%u",
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 27637c284cb9..61f154467026 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -115,6 +115,7 @@ enum {
  */
 
 struct global_cwq;
+struct worker_pool;
 
 /*
  * The poor guys doing the actual heavy lifting.  All on-duty workers
@@ -131,7 +132,7 @@ struct worker {
 	struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
 	struct list_head	scheduled;	/* L: scheduled works */
 	struct task_struct	*task;		/* I: worker task */
-	struct global_cwq	*gcwq;		/* I: the associated gcwq */
+	struct worker_pool	*pool;		/* I: the associated pool */
 	/* 64 bytes boundary on 64bit, 32 on 32bit */
 	unsigned long		last_active;	/* L: last active timestamp */
 	unsigned int		flags;		/* X: flags */
@@ -139,6 +140,21 @@ struct worker {
 	struct work_struct	rebind_work;	/* L: rebind worker to cpu */
 };
 
+struct worker_pool {
+	struct global_cwq	*gcwq;		/* I: the owning gcwq */
+
+	struct list_head	worklist;	/* L: list of pending works */
+	int			nr_workers;	/* L: total number of workers */
+	int			nr_idle;	/* L: currently idle ones */
+
+	struct list_head	idle_list;	/* X: list of idle workers */
+	struct timer_list	idle_timer;	/* L: worker idle timeout */
+	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
+
+	struct ida		worker_ida;	/* L: for worker IDs */
+	struct worker		*first_idle;	/* L: first idle worker */
+};
+
 /*
  * Global per-cpu workqueue.  There's one and only one for each cpu
  * and all works are queued and processed here regardless of their
@@ -146,27 +162,18 @@ struct worker {
  */
 struct global_cwq {
 	spinlock_t		lock;		/* the gcwq lock */
-	struct list_head	worklist;	/* L: list of pending works */
 	unsigned int		cpu;		/* I: the associated cpu */
 	unsigned int		flags;		/* L: GCWQ_* flags */
 
-	int			nr_workers;	/* L: total number of workers */
-	int			nr_idle;	/* L: currently idle ones */
-
-	/* workers are chained either in the idle_list or busy_hash */
-	struct list_head	idle_list;	/* X: list of idle workers */
+	/* workers are chained either in busy_hash or pool idle_list */
 	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
 						/* L: hash of busy workers */
 
-	struct timer_list	idle_timer;	/* L: worker idle timeout */
-	struct timer_list	mayday_timer;	/* L: SOS timer for dworkers */
-
-	struct ida		worker_ida;	/* L: for worker IDs */
+	struct worker_pool	pool;		/* the worker pools */
 
 	struct task_struct	*trustee;	/* L: for gcwq shutdown */
 	unsigned int		trustee_state;	/* L: trustee state */
 	wait_queue_head_t	trustee_wait;	/* trustee wait */
-	struct worker		*first_idle;	/* L: first idle worker */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -175,7 +182,7 @@ struct global_cwq {
  * aligned at two's power of the number of flag bits.
  */
 struct cpu_workqueue_struct {
-	struct global_cwq	*gcwq;		/* I: the associated gcwq */
+	struct worker_pool	*pool;		/* I: the associated pool */
 	struct workqueue_struct *wq;		/* I: the owning workqueue */
 	int			work_color;	/* L: current color */
 	int			flush_color;	/* L: flushing color */
@@ -555,7 +562,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 
 	if (data & WORK_STRUCT_CWQ)
 		return ((struct cpu_workqueue_struct *)
-			(data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
+			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
 
 	cpu = data >> WORK_STRUCT_FLAG_BITS;
 	if (cpu == WORK_CPU_NONE)
@@ -587,13 +594,13 @@ static bool __need_more_worker(struct global_cwq *gcwq)
  */
 static bool need_more_worker(struct global_cwq *gcwq)
 {
-	return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
+	return !list_empty(&gcwq->pool.worklist) && __need_more_worker(gcwq);
 }
 
 /* Can I start working?  Called from busy but !running workers. */
 static bool may_start_working(struct global_cwq *gcwq)
 {
-	return gcwq->nr_idle;
+	return gcwq->pool.nr_idle;
 }
 
 /* Do I need to keep working?  Called from currently running workers. */
@@ -601,7 +608,7 @@ static bool keep_working(struct global_cwq *gcwq)
 {
 	atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
 
-	return !list_empty(&gcwq->worklist) &&
+	return !list_empty(&gcwq->pool.worklist) &&
 		(atomic_read(nr_running) <= 1 ||
 		 gcwq->flags & GCWQ_HIGHPRI_PENDING);
 }
@@ -622,8 +629,8 @@ static bool need_to_manage_workers(struct global_cwq *gcwq)
 static bool too_many_workers(struct global_cwq *gcwq)
 {
 	bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
-	int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
-	int nr_busy = gcwq->nr_workers - nr_idle;
+	int nr_idle = gcwq->pool.nr_idle + managing; /* manager is considered idle */
+	int nr_busy = gcwq->pool.nr_workers - nr_idle;
 
 	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 }
@@ -635,10 +642,10 @@ static bool too_many_workers(struct global_cwq *gcwq)
 /* Return the first worker.  Safe with preemption disabled */
 static struct worker *first_worker(struct global_cwq *gcwq)
 {
-	if (unlikely(list_empty(&gcwq->idle_list)))
+	if (unlikely(list_empty(&gcwq->pool.idle_list)))
 		return NULL;
 
-	return list_first_entry(&gcwq->idle_list, struct worker, entry);
+	return list_first_entry(&gcwq->pool.idle_list, struct worker, entry);
 }
 
 /**
@@ -696,7 +703,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 				       unsigned int cpu)
 {
 	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
-	struct global_cwq *gcwq = get_gcwq(cpu);
+	struct worker_pool *pool = worker->pool;
+	struct global_cwq *gcwq = pool->gcwq;
 	atomic_t *nr_running = get_gcwq_nr_running(cpu);
 
 	if (worker->flags & WORKER_NOT_RUNNING)
@@ -716,7 +724,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 	 * could be manipulating idle_list, so dereferencing idle_list
 	 * without gcwq lock is safe.
 	 */
-	if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
+	if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
 		to_wakeup = first_worker(gcwq);
 	return to_wakeup ? to_wakeup->task : NULL;
 }
@@ -737,7 +745,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 				    bool wakeup)
 {
-	struct global_cwq *gcwq = worker->gcwq;
+	struct worker_pool *pool = worker->pool;
+	struct global_cwq *gcwq = pool->gcwq;
 
 	WARN_ON_ONCE(worker->task != current);
 
@@ -752,7 +761,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 
 		if (wakeup) {
 			if (atomic_dec_and_test(nr_running) &&
-			    !list_empty(&gcwq->worklist))
+			    !list_empty(&pool->worklist))
 				wake_up_worker(gcwq);
 		} else
 			atomic_dec(nr_running);
@@ -773,7 +782,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
  */
 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 {
-	struct global_cwq *gcwq = worker->gcwq;
+	struct global_cwq *gcwq = worker->pool->gcwq;
 	unsigned int oflags = worker->flags;
 
 	WARN_ON_ONCE(worker->task != current);
@@ -894,9 +903,9 @@ static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
 	struct work_struct *twork;
 
 	if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
-		return &gcwq->worklist;
+		return &gcwq->pool.worklist;
 
-	list_for_each_entry(twork, &gcwq->worklist, entry) {
+	list_for_each_entry(twork, &gcwq->pool.worklist, entry) {
 		struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
 
 		if (!(tcwq->wq->flags & WQ_HIGHPRI))
@@ -924,7 +933,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 			struct work_struct *work, struct list_head *head,
 			unsigned int extra_flags)
 {
-	struct global_cwq *gcwq = cwq->gcwq;
+	struct global_cwq *gcwq = cwq->pool->gcwq;
 
 	/* we own @work, set data and link */
 	set_work_cwq(work, cwq, extra_flags);
@@ -1196,7 +1205,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
  */
 static void worker_enter_idle(struct worker *worker)
 {
-	struct global_cwq *gcwq = worker->gcwq;
+	struct worker_pool *pool = worker->pool;
+	struct global_cwq *gcwq = pool->gcwq;
 
 	BUG_ON(worker->flags & WORKER_IDLE);
 	BUG_ON(!list_empty(&worker->entry) &&
@@ -1204,15 +1214,15 @@ static void worker_enter_idle(struct worker *worker)
 
 	/* can't use worker_set_flags(), also called from start_worker() */
 	worker->flags |= WORKER_IDLE;
-	gcwq->nr_idle++;
+	pool->nr_idle++;
 	worker->last_active = jiffies;
 
 	/* idle_list is LIFO */
-	list_add(&worker->entry, &gcwq->idle_list);
+	list_add(&worker->entry, &pool->idle_list);
 
 	if (likely(!(worker->flags & WORKER_ROGUE))) {
-		if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
-			mod_timer(&gcwq->idle_timer,
+		if (too_many_workers(gcwq) && !timer_pending(&pool->idle_timer))
+			mod_timer(&pool->idle_timer,
 				  jiffies + IDLE_WORKER_TIMEOUT);
 	} else
 		wake_up_all(&gcwq->trustee_wait);
@@ -1223,7 +1233,7 @@ static void worker_enter_idle(struct worker *worker)
 	 * warning may trigger spuriously.  Check iff trustee is idle.
 	 */
 	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
-		     gcwq->nr_workers == gcwq->nr_idle &&
+		     pool->nr_workers == pool->nr_idle &&
 		     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
 }
 
@@ -1238,11 +1248,11 @@ static void worker_enter_idle(struct worker *worker)
  */
 static void worker_leave_idle(struct worker *worker)
 {
-	struct global_cwq *gcwq = worker->gcwq;
+	struct worker_pool *pool = worker->pool;
 
 	BUG_ON(!(worker->flags & WORKER_IDLE));
 	worker_clr_flags(worker, WORKER_IDLE);
-	gcwq->nr_idle--;
+	pool->nr_idle--;
 	list_del_init(&worker->entry);
 }
 
@@ -1279,7 +1289,7 @@ static void worker_leave_idle(struct worker *worker)
 static bool worker_maybe_bind_and_lock(struct worker *worker)
 __acquires(&gcwq->lock)
 {
-	struct global_cwq *gcwq = worker->gcwq;
+	struct global_cwq *gcwq = worker->pool->gcwq;
 	struct task_struct *task = worker->task;
 
 	while (true) {
@@ -1321,7 +1331,7 @@ __acquires(&gcwq->lock)
 static void worker_rebind_fn(struct work_struct *work)
 {
 	struct worker *worker = container_of(work, struct worker, rebind_work);
-	struct global_cwq *gcwq = worker->gcwq;
+	struct global_cwq *gcwq = worker->pool->gcwq;
 
 	if (worker_maybe_bind_and_lock(worker))
 		worker_clr_flags(worker, WORKER_REBIND);
@@ -1362,13 +1372,14 @@ static struct worker *alloc_worker(void)
 static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 {
 	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
+	struct worker_pool *pool = &gcwq->pool;
 	struct worker *worker = NULL;
 	int id = -1;
 
 	spin_lock_irq(&gcwq->lock);
-	while (ida_get_new(&gcwq->worker_ida, &id)) {
+	while (ida_get_new(&pool->worker_ida, &id)) {
 		spin_unlock_irq(&gcwq->lock);
-		if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
+		if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
 			goto fail;
 		spin_lock_irq(&gcwq->lock);
 	}
@@ -1378,7 +1389,7 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 	if (!worker)
 		goto fail;
 
-	worker->gcwq = gcwq;
+	worker->pool = pool;
 	worker->id = id;
 
 	if (!on_unbound_cpu)
@@ -1409,7 +1420,7 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 fail:
 	if (id >= 0) {
 		spin_lock_irq(&gcwq->lock);
-		ida_remove(&gcwq->worker_ida, id);
+		ida_remove(&pool->worker_ida, id);
 		spin_unlock_irq(&gcwq->lock);
 	}
 	kfree(worker);
@@ -1428,7 +1439,7 @@ fail:
 static void start_worker(struct worker *worker)
 {
 	worker->flags |= WORKER_STARTED;
-	worker->gcwq->nr_workers++;
+	worker->pool->nr_workers++;
 	worker_enter_idle(worker);
 	wake_up_process(worker->task);
 }
@@ -1444,7 +1455,8 @@ static void start_worker(struct worker *worker)
  */
 static void destroy_worker(struct worker *worker)
 {
-	struct global_cwq *gcwq = worker->gcwq;
+	struct worker_pool *pool = worker->pool;
+	struct global_cwq *gcwq = pool->gcwq;
 	int id = worker->id;
 
 	/* sanity check frenzy */
@@ -1452,9 +1464,9 @@ static void destroy_worker(struct worker *worker)
 	BUG_ON(!list_empty(&worker->scheduled));
 
 	if (worker->flags & WORKER_STARTED)
-		gcwq->nr_workers--;
+		pool->nr_workers--;
 	if (worker->flags & WORKER_IDLE)
-		gcwq->nr_idle--;
+		pool->nr_idle--;
 
 	list_del_init(&worker->entry);
 	worker->flags |= WORKER_DIE;
@@ -1465,7 +1477,7 @@ static void destroy_worker(struct worker *worker)
 	kfree(worker);
 
 	spin_lock_irq(&gcwq->lock);
-	ida_remove(&gcwq->worker_ida, id);
+	ida_remove(&pool->worker_ida, id);
 }
 
 static void idle_worker_timeout(unsigned long __gcwq)
@@ -1479,11 +1491,12 @@ static void idle_worker_timeout(unsigned long __gcwq)
 		unsigned long expires;
 
 		/* idle_list is kept in LIFO order, check the last one */
-		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+		worker = list_entry(gcwq->pool.idle_list.prev, struct worker,
+				    entry);
 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
 		if (time_before(jiffies, expires))
-			mod_timer(&gcwq->idle_timer, expires);
+			mod_timer(&gcwq->pool.idle_timer, expires);
 		else {
 			/* it's been idle for too long, wake up manager */
 			gcwq->flags |= GCWQ_MANAGE_WORKERS;
@@ -1504,7 +1517,7 @@ static bool send_mayday(struct work_struct *work)
 		return false;
 
 	/* mayday mayday mayday */
-	cpu = cwq->gcwq->cpu;
+	cpu = cwq->pool->gcwq->cpu;
 	/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
 	if (cpu == WORK_CPU_UNBOUND)
 		cpu = 0;
@@ -1527,13 +1540,13 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
 		 * allocation deadlock.  Send distress signals to
 		 * rescuers.
 		 */
-		list_for_each_entry(work, &gcwq->worklist, entry)
+		list_for_each_entry(work, &gcwq->pool.worklist, entry)
 			send_mayday(work);
 	}
 
 	spin_unlock_irq(&gcwq->lock);
 
-	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
+	mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
 
 /**
@@ -1568,14 +1581,14 @@ restart:
 	spin_unlock_irq(&gcwq->lock);
 
 	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
-	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+	mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
 
 	while (true) {
 		struct worker *worker;
 
 		worker = create_worker(gcwq, true);
 		if (worker) {
-			del_timer_sync(&gcwq->mayday_timer);
+			del_timer_sync(&gcwq->pool.mayday_timer);
 			spin_lock_irq(&gcwq->lock);
 			start_worker(worker);
 			BUG_ON(need_to_create_worker(gcwq));
@@ -1592,7 +1605,7 @@ restart:
 			break;
 	}
 
-	del_timer_sync(&gcwq->mayday_timer);
+	del_timer_sync(&gcwq->pool.mayday_timer);
 	spin_lock_irq(&gcwq->lock);
 	if (need_to_create_worker(gcwq))
 		goto restart;
@@ -1622,11 +1635,12 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
 		struct worker *worker;
 		unsigned long expires;
 
-		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+		worker = list_entry(gcwq->pool.idle_list.prev, struct worker,
+				    entry);
 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
 		if (time_before(jiffies, expires)) {
-			mod_timer(&gcwq->idle_timer, expires);
+			mod_timer(&gcwq->pool.idle_timer, expires);
 			break;
 		}
 
@@ -1659,7 +1673,7 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
  */
 static bool manage_workers(struct worker *worker)
 {
-	struct global_cwq *gcwq = worker->gcwq;
+	struct global_cwq *gcwq = worker->pool->gcwq;
 	bool ret = false;
 
 	if (gcwq->flags & GCWQ_MANAGING_WORKERS)
@@ -1732,7 +1746,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
 {
 	struct work_struct *work = list_first_entry(&cwq->delayed_works,
 						    struct work_struct, entry);
-	struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
+	struct list_head *pos = gcwq_determine_ins_pos(cwq->pool->gcwq, cwq);
 
 	trace_workqueue_activate_work(work);
 	move_linked_works(work, pos, NULL);
@@ -1808,7 +1822,8 @@ __releases(&gcwq->lock)
 __acquires(&gcwq->lock)
 {
 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
-	struct global_cwq *gcwq = cwq->gcwq;
+	struct worker_pool *pool = worker->pool;
+	struct global_cwq *gcwq = pool->gcwq;
 	struct hlist_head *bwh = busy_worker_head(gcwq, work);
 	bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
 	work_func_t f = work->func;
@@ -1854,10 +1869,10 @@ __acquires(&gcwq->lock)
 	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
 	 */
 	if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
-		struct work_struct *nwork = list_first_entry(&gcwq->worklist,
-						struct work_struct, entry);
+		struct work_struct *nwork = list_first_entry(&pool->worklist,
+					 struct work_struct, entry);
 
-		if (!list_empty(&gcwq->worklist) &&
+		if (!list_empty(&pool->worklist) &&
 		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
 			wake_up_worker(gcwq);
 		else
@@ -1950,7 +1965,8 @@ static void process_scheduled_works(struct worker *worker)
 static int worker_thread(void *__worker)
 {
 	struct worker *worker = __worker;
-	struct global_cwq *gcwq = worker->gcwq;
+	struct worker_pool *pool = worker->pool;
+	struct global_cwq *gcwq = pool->gcwq;
 
 	/* tell the scheduler that this is a workqueue worker */
 	worker->task->flags |= PF_WQ_WORKER;
@@ -1990,7 +2006,7 @@ recheck:
 
 	do {
 		struct work_struct *work =
-			list_first_entry(&gcwq->worklist,
+			list_first_entry(&pool->worklist,
 					 struct work_struct, entry);
 
 		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -2064,14 +2080,15 @@ repeat:
 	for_each_mayday_cpu(cpu, wq->mayday_mask) {
 		unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
 		struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
-		struct global_cwq *gcwq = cwq->gcwq;
+		struct worker_pool *pool = cwq->pool;
+		struct global_cwq *gcwq = pool->gcwq;
 		struct work_struct *work, *n;
 
 		__set_current_state(TASK_RUNNING);
 		mayday_clear_cpu(cpu, wq->mayday_mask);
 
 		/* migrate to the target cpu if possible */
-		rescuer->gcwq = gcwq;
+		rescuer->pool = pool;
 		worker_maybe_bind_and_lock(rescuer);
 
 		/*
@@ -2079,7 +2096,7 @@ repeat:
 		 * process'em.
 		 */
 		BUG_ON(!list_empty(&rescuer->scheduled));
-		list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
+		list_for_each_entry_safe(work, n, &pool->worklist, entry)
 			if (get_work_cwq(work) == cwq)
 				move_linked_works(work, scheduled, &n);
 
@@ -2216,7 +2233,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
 
 	for_each_cwq_cpu(cpu, wq) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
-		struct global_cwq *gcwq = cwq->gcwq;
+		struct global_cwq *gcwq = cwq->pool->gcwq;
 
 		spin_lock_irq(&gcwq->lock);
 
@@ -2432,9 +2449,9 @@ reflush:
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		bool drained;
 
-		spin_lock_irq(&cwq->gcwq->lock);
+		spin_lock_irq(&cwq->pool->gcwq->lock);
 		drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
-		spin_unlock_irq(&cwq->gcwq->lock);
+		spin_unlock_irq(&cwq->pool->gcwq->lock);
 
 		if (drained)
 			continue;
@@ -2474,7 +2491,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 		 */
 		smp_rmb();
 		cwq = get_work_cwq(work);
-		if (unlikely(!cwq || gcwq != cwq->gcwq))
+		if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
 			goto already_gone;
 	} else if (wait_executing) {
 		worker = find_worker_executing_work(gcwq, work);
@@ -3017,7 +3034,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 		struct global_cwq *gcwq = get_gcwq(cpu);
 
 		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
-		cwq->gcwq = gcwq;
+		cwq->pool = &gcwq->pool;
 		cwq->wq = wq;
 		cwq->flush_color = -1;
 		cwq->max_active = max_active;
@@ -3344,7 +3361,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
 	gcwq->flags |= GCWQ_MANAGING_WORKERS;
 
-	list_for_each_entry(worker, &gcwq->idle_list, entry)
+	list_for_each_entry(worker, &gcwq->pool.idle_list, entry)
 		worker->flags |= WORKER_ROGUE;
 
 	for_each_busy_worker(worker, i, pos, gcwq)
@@ -3369,7 +3386,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
 
 	spin_unlock_irq(&gcwq->lock);
-	del_timer_sync(&gcwq->idle_timer);
+	del_timer_sync(&gcwq->pool.idle_timer);
 	spin_lock_irq(&gcwq->lock);
 
 	/*
@@ -3391,17 +3408,17 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * may be frozen works in freezable cwqs.  Don't declare
 	 * completion while frozen.
 	 */
-	while (gcwq->nr_workers != gcwq->nr_idle ||
+	while (gcwq->pool.nr_workers != gcwq->pool.nr_idle ||
 	       gcwq->flags & GCWQ_FREEZING ||
 	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
 		int nr_works = 0;
 
-		list_for_each_entry(work, &gcwq->worklist, entry) {
+		list_for_each_entry(work, &gcwq->pool.worklist, entry) {
 			send_mayday(work);
 			nr_works++;
 		}
 
-		list_for_each_entry(worker, &gcwq->idle_list, entry) {
+		list_for_each_entry(worker, &gcwq->pool.idle_list, entry) {
 			if (!nr_works--)
 				break;
 			wake_up_process(worker->task);
@@ -3428,11 +3445,11 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * all workers till we're canceled.
 	 */
 	do {
-		rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
-		while (!list_empty(&gcwq->idle_list))
-			destroy_worker(list_first_entry(&gcwq->idle_list,
+		rc = trustee_wait_event(!list_empty(&gcwq->pool.idle_list));
+		while (!list_empty(&gcwq->pool.idle_list))
+			destroy_worker(list_first_entry(&gcwq->pool.idle_list,
 							struct worker, entry));
-	} while (gcwq->nr_workers && rc >= 0);
+	} while (gcwq->pool.nr_workers && rc >= 0);
 
 	/*
 	 * At this point, either draining has completed and no worker
@@ -3441,7 +3458,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * Tell the remaining busy ones to rebind once it finishes the
 	 * currently scheduled works by scheduling the rebind_work.
 	 */
-	WARN_ON(!list_empty(&gcwq->idle_list));
+	WARN_ON(!list_empty(&gcwq->pool.idle_list));
 
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		struct work_struct *rebind_work = &worker->rebind_work;
@@ -3522,7 +3539,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		kthread_bind(new_trustee, cpu);
 		/* fall through */
 	case CPU_UP_PREPARE:
-		BUG_ON(gcwq->first_idle);
+		BUG_ON(gcwq->pool.first_idle);
 		new_worker = create_worker(gcwq, false);
 		if (!new_worker) {
 			if (new_trustee)
@@ -3544,8 +3561,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
 		/* fall through */
 	case CPU_UP_PREPARE:
-		BUG_ON(gcwq->first_idle);
-		gcwq->first_idle = new_worker;
+		BUG_ON(gcwq->pool.first_idle);
+		gcwq->pool.first_idle = new_worker;
 		break;
 
 	case CPU_DYING:
@@ -3562,8 +3579,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		gcwq->trustee_state = TRUSTEE_BUTCHER;
 		/* fall through */
 	case CPU_UP_CANCELED:
-		destroy_worker(gcwq->first_idle);
-		gcwq->first_idle = NULL;
+		destroy_worker(gcwq->pool.first_idle);
+		gcwq->pool.first_idle = NULL;
 		break;
 
 	case CPU_DOWN_FAILED:
@@ -3581,11 +3598,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		 * take a look.
 		 */
 		spin_unlock_irq(&gcwq->lock);
-		kthread_bind(gcwq->first_idle->task, cpu);
+		kthread_bind(gcwq->pool.first_idle->task, cpu);
 		spin_lock_irq(&gcwq->lock);
 		gcwq->flags |= GCWQ_MANAGE_WORKERS;
-		start_worker(gcwq->first_idle);
-		gcwq->first_idle = NULL;
+		start_worker(gcwq->pool.first_idle);
+		gcwq->pool.first_idle = NULL;
 		break;
 	}
 
@@ -3794,22 +3811,23 @@ static int __init init_workqueues(void)
 		struct global_cwq *gcwq = get_gcwq(cpu);
 
 		spin_lock_init(&gcwq->lock);
-		INIT_LIST_HEAD(&gcwq->worklist);
+		gcwq->pool.gcwq = gcwq;
+		INIT_LIST_HEAD(&gcwq->pool.worklist);
 		gcwq->cpu = cpu;
 		gcwq->flags |= GCWQ_DISASSOCIATED;
 
-		INIT_LIST_HEAD(&gcwq->idle_list);
+		INIT_LIST_HEAD(&gcwq->pool.idle_list);
 		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
 			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
 
-		init_timer_deferrable(&gcwq->idle_timer);
-		gcwq->idle_timer.function = idle_worker_timeout;
-		gcwq->idle_timer.data = (unsigned long)gcwq;
+		init_timer_deferrable(&gcwq->pool.idle_timer);
+		gcwq->pool.idle_timer.function = idle_worker_timeout;
+		gcwq->pool.idle_timer.data = (unsigned long)gcwq;
 
-		setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
+		setup_timer(&gcwq->pool.mayday_timer, gcwq_mayday_timeout,
 			    (unsigned long)gcwq);
 
-		ida_init(&gcwq->worker_ida);
+		ida_init(&gcwq->pool.worker_ida);
 
 		gcwq->trustee_state = TRUSTEE_DONE;
 		init_waitqueue_head(&gcwq->trustee_wait);
-- 
cgit v1.2.3


From ea23ed8bb3c2303b92e2d22c9b251b8a8958ceaa Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 12 Jul 2012 14:46:37 -0700
Subject: workqueue: use @pool instead of @gcwq or @cpu where applicable

Modify all functions which deal with per-pool properties to pass
around @pool instead of @gcwq or @cpu.

The changes in this patch are mechanical and don't cause any
functional difference.  This is to prepare for multiple pools per
gcwq.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 218 +++++++++++++++++++++++++++--------------------------
 1 file changed, 111 insertions(+), 107 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 61f154467026..2d82f7b193a0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -471,8 +471,10 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
 		return &unbound_global_cwq;
 }
 
-static atomic_t *get_gcwq_nr_running(unsigned int cpu)
+static atomic_t *get_pool_nr_running(struct worker_pool *pool)
 {
+	int cpu = pool->gcwq->cpu;
+
 	if (cpu != WORK_CPU_UNBOUND)
 		return &per_cpu(gcwq_nr_running, cpu);
 	else
@@ -578,10 +580,10 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
  * assume that they're being called with gcwq->lock held.
  */
 
-static bool __need_more_worker(struct global_cwq *gcwq)
+static bool __need_more_worker(struct worker_pool *pool)
 {
-	return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
-		gcwq->flags & GCWQ_HIGHPRI_PENDING;
+	return !atomic_read(get_pool_nr_running(pool)) ||
+		pool->gcwq->flags & GCWQ_HIGHPRI_PENDING;
 }
 
 /*
@@ -592,45 +594,46 @@ static bool __need_more_worker(struct global_cwq *gcwq)
  * function will always return %true for unbound gcwq as long as the
  * worklist isn't empty.
  */
-static bool need_more_worker(struct global_cwq *gcwq)
+static bool need_more_worker(struct worker_pool *pool)
 {
-	return !list_empty(&gcwq->pool.worklist) && __need_more_worker(gcwq);
+	return !list_empty(&pool->worklist) && __need_more_worker(pool);
 }
 
 /* Can I start working?  Called from busy but !running workers. */
-static bool may_start_working(struct global_cwq *gcwq)
+static bool may_start_working(struct worker_pool *pool)
 {
-	return gcwq->pool.nr_idle;
+	return pool->nr_idle;
 }
 
 /* Do I need to keep working?  Called from currently running workers. */
-static bool keep_working(struct global_cwq *gcwq)
+static bool keep_working(struct worker_pool *pool)
 {
-	atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+	atomic_t *nr_running = get_pool_nr_running(pool);
 
-	return !list_empty(&gcwq->pool.worklist) &&
+	return !list_empty(&pool->worklist) &&
 		(atomic_read(nr_running) <= 1 ||
-		 gcwq->flags & GCWQ_HIGHPRI_PENDING);
+		 pool->gcwq->flags & GCWQ_HIGHPRI_PENDING);
 }
 
 /* Do we need a new worker?  Called from manager. */
-static bool need_to_create_worker(struct global_cwq *gcwq)
+static bool need_to_create_worker(struct worker_pool *pool)
 {
-	return need_more_worker(gcwq) && !may_start_working(gcwq);
+	return need_more_worker(pool) && !may_start_working(pool);
 }
 
 /* Do I need to be the manager? */
-static bool need_to_manage_workers(struct global_cwq *gcwq)
+static bool need_to_manage_workers(struct worker_pool *pool)
 {
-	return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
+	return need_to_create_worker(pool) ||
+		pool->gcwq->flags & GCWQ_MANAGE_WORKERS;
 }
 
 /* Do we have too many workers and should some go away? */
-static bool too_many_workers(struct global_cwq *gcwq)
+static bool too_many_workers(struct worker_pool *pool)
 {
-	bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
-	int nr_idle = gcwq->pool.nr_idle + managing; /* manager is considered idle */
-	int nr_busy = gcwq->pool.nr_workers - nr_idle;
+	bool managing = pool->gcwq->flags & GCWQ_MANAGING_WORKERS;
+	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
+	int nr_busy = pool->nr_workers - nr_idle;
 
 	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 }
@@ -640,26 +643,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
  */
 
 /* Return the first worker.  Safe with preemption disabled */
-static struct worker *first_worker(struct global_cwq *gcwq)
+static struct worker *first_worker(struct worker_pool *pool)
 {
-	if (unlikely(list_empty(&gcwq->pool.idle_list)))
+	if (unlikely(list_empty(&pool->idle_list)))
 		return NULL;
 
-	return list_first_entry(&gcwq->pool.idle_list, struct worker, entry);
+	return list_first_entry(&pool->idle_list, struct worker, entry);
 }
 
 /**
  * wake_up_worker - wake up an idle worker
- * @gcwq: gcwq to wake worker for
+ * @pool: worker pool to wake worker from
  *
- * Wake up the first idle worker of @gcwq.
+ * Wake up the first idle worker of @pool.
  *
  * CONTEXT:
  * spin_lock_irq(gcwq->lock).
  */
-static void wake_up_worker(struct global_cwq *gcwq)
+static void wake_up_worker(struct worker_pool *pool)
 {
-	struct worker *worker = first_worker(gcwq);
+	struct worker *worker = first_worker(pool);
 
 	if (likely(worker))
 		wake_up_process(worker->task);
@@ -681,7 +684,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
 	struct worker *worker = kthread_data(task);
 
 	if (!(worker->flags & WORKER_NOT_RUNNING))
-		atomic_inc(get_gcwq_nr_running(cpu));
+		atomic_inc(get_pool_nr_running(worker->pool));
 }
 
 /**
@@ -704,8 +707,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 {
 	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
 	struct worker_pool *pool = worker->pool;
-	struct global_cwq *gcwq = pool->gcwq;
-	atomic_t *nr_running = get_gcwq_nr_running(cpu);
+	atomic_t *nr_running = get_pool_nr_running(pool);
 
 	if (worker->flags & WORKER_NOT_RUNNING)
 		return NULL;
@@ -725,7 +727,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 	 * without gcwq lock is safe.
 	 */
 	if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
-		to_wakeup = first_worker(gcwq);
+		to_wakeup = first_worker(pool);
 	return to_wakeup ? to_wakeup->task : NULL;
 }
 
@@ -746,7 +748,6 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 				    bool wakeup)
 {
 	struct worker_pool *pool = worker->pool;
-	struct global_cwq *gcwq = pool->gcwq;
 
 	WARN_ON_ONCE(worker->task != current);
 
@@ -757,12 +758,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 	 */
 	if ((flags & WORKER_NOT_RUNNING) &&
 	    !(worker->flags & WORKER_NOT_RUNNING)) {
-		atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+		atomic_t *nr_running = get_pool_nr_running(pool);
 
 		if (wakeup) {
 			if (atomic_dec_and_test(nr_running) &&
 			    !list_empty(&pool->worklist))
-				wake_up_worker(gcwq);
+				wake_up_worker(pool);
 		} else
 			atomic_dec(nr_running);
 	}
@@ -782,7 +783,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
  */
 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 {
-	struct global_cwq *gcwq = worker->pool->gcwq;
+	struct worker_pool *pool = worker->pool;
 	unsigned int oflags = worker->flags;
 
 	WARN_ON_ONCE(worker->task != current);
@@ -796,7 +797,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 	 */
 	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
 		if (!(worker->flags & WORKER_NOT_RUNNING))
-			atomic_inc(get_gcwq_nr_running(gcwq->cpu));
+			atomic_inc(get_pool_nr_running(pool));
 }
 
 /**
@@ -880,15 +881,15 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 }
 
 /**
- * gcwq_determine_ins_pos - find insertion position
- * @gcwq: gcwq of interest
+ * pool_determine_ins_pos - find insertion position
+ * @pool: pool of interest
  * @cwq: cwq a work is being queued for
  *
- * A work for @cwq is about to be queued on @gcwq, determine insertion
+ * A work for @cwq is about to be queued on @pool, determine insertion
  * position for the work.  If @cwq is for HIGHPRI wq, the work is
  * queued at the head of the queue but in FIFO order with respect to
  * other HIGHPRI works; otherwise, at the end of the queue.  This
- * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
+ * function also sets GCWQ_HIGHPRI_PENDING flag to hint @pool that
  * there are HIGHPRI works pending.
  *
  * CONTEXT:
@@ -897,22 +898,22 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
  * RETURNS:
  * Pointer to inserstion position.
  */
-static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
+static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool,
 					       struct cpu_workqueue_struct *cwq)
 {
 	struct work_struct *twork;
 
 	if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
-		return &gcwq->pool.worklist;
+		return &pool->worklist;
 
-	list_for_each_entry(twork, &gcwq->pool.worklist, entry) {
+	list_for_each_entry(twork, &pool->worklist, entry) {
 		struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
 
 		if (!(tcwq->wq->flags & WQ_HIGHPRI))
 			break;
 	}
 
-	gcwq->flags |= GCWQ_HIGHPRI_PENDING;
+	pool->gcwq->flags |= GCWQ_HIGHPRI_PENDING;
 	return &twork->entry;
 }
 
@@ -933,7 +934,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 			struct work_struct *work, struct list_head *head,
 			unsigned int extra_flags)
 {
-	struct global_cwq *gcwq = cwq->pool->gcwq;
+	struct worker_pool *pool = cwq->pool;
 
 	/* we own @work, set data and link */
 	set_work_cwq(work, cwq, extra_flags);
@@ -953,8 +954,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 	 */
 	smp_mb();
 
-	if (__need_more_worker(gcwq))
-		wake_up_worker(gcwq);
+	if (__need_more_worker(pool))
+		wake_up_worker(pool);
 }
 
 /*
@@ -1056,7 +1057,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	if (likely(cwq->nr_active < cwq->max_active)) {
 		trace_workqueue_activate_work(work);
 		cwq->nr_active++;
-		worklist = gcwq_determine_ins_pos(gcwq, cwq);
+		worklist = pool_determine_ins_pos(cwq->pool, cwq);
 	} else {
 		work_flags |= WORK_STRUCT_DELAYED;
 		worklist = &cwq->delayed_works;
@@ -1221,7 +1222,7 @@ static void worker_enter_idle(struct worker *worker)
 	list_add(&worker->entry, &pool->idle_list);
 
 	if (likely(!(worker->flags & WORKER_ROGUE))) {
-		if (too_many_workers(gcwq) && !timer_pending(&pool->idle_timer))
+		if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
 			mod_timer(&pool->idle_timer,
 				  jiffies + IDLE_WORKER_TIMEOUT);
 	} else
@@ -1234,7 +1235,7 @@ static void worker_enter_idle(struct worker *worker)
 	 */
 	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
 		     pool->nr_workers == pool->nr_idle &&
-		     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
+		     atomic_read(get_pool_nr_running(pool)));
 }
 
 /**
@@ -1356,10 +1357,10 @@ static struct worker *alloc_worker(void)
 
 /**
  * create_worker - create a new workqueue worker
- * @gcwq: gcwq the new worker will belong to
+ * @pool: pool the new worker will belong to
  * @bind: whether to set affinity to @cpu or not
  *
- * Create a new worker which is bound to @gcwq.  The returned worker
+ * Create a new worker which is bound to @pool.  The returned worker
  * can be started by calling start_worker() or destroyed using
  * destroy_worker().
  *
@@ -1369,10 +1370,10 @@ static struct worker *alloc_worker(void)
  * RETURNS:
  * Pointer to the newly created worker.
  */
-static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
+static struct worker *create_worker(struct worker_pool *pool, bool bind)
 {
+	struct global_cwq *gcwq = pool->gcwq;
 	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
-	struct worker_pool *pool = &gcwq->pool;
 	struct worker *worker = NULL;
 	int id = -1;
 
@@ -1480,27 +1481,27 @@ static void destroy_worker(struct worker *worker)
 	ida_remove(&pool->worker_ida, id);
 }
 
-static void idle_worker_timeout(unsigned long __gcwq)
+static void idle_worker_timeout(unsigned long __pool)
 {
-	struct global_cwq *gcwq = (void *)__gcwq;
+	struct worker_pool *pool = (void *)__pool;
+	struct global_cwq *gcwq = pool->gcwq;
 
 	spin_lock_irq(&gcwq->lock);
 
-	if (too_many_workers(gcwq)) {
+	if (too_many_workers(pool)) {
 		struct worker *worker;
 		unsigned long expires;
 
 		/* idle_list is kept in LIFO order, check the last one */
-		worker = list_entry(gcwq->pool.idle_list.prev, struct worker,
-				    entry);
+		worker = list_entry(pool->idle_list.prev, struct worker, entry);
 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
 		if (time_before(jiffies, expires))
-			mod_timer(&gcwq->pool.idle_timer, expires);
+			mod_timer(&pool->idle_timer, expires);
 		else {
 			/* it's been idle for too long, wake up manager */
 			gcwq->flags |= GCWQ_MANAGE_WORKERS;
-			wake_up_worker(gcwq);
+			wake_up_worker(pool);
 		}
 	}
 
@@ -1526,37 +1527,38 @@ static bool send_mayday(struct work_struct *work)
 	return true;
 }
 
-static void gcwq_mayday_timeout(unsigned long __gcwq)
+static void gcwq_mayday_timeout(unsigned long __pool)
 {
-	struct global_cwq *gcwq = (void *)__gcwq;
+	struct worker_pool *pool = (void *)__pool;
+	struct global_cwq *gcwq = pool->gcwq;
 	struct work_struct *work;
 
 	spin_lock_irq(&gcwq->lock);
 
-	if (need_to_create_worker(gcwq)) {
+	if (need_to_create_worker(pool)) {
 		/*
 		 * We've been trying to create a new worker but
 		 * haven't been successful.  We might be hitting an
 		 * allocation deadlock.  Send distress signals to
 		 * rescuers.
 		 */
-		list_for_each_entry(work, &gcwq->pool.worklist, entry)
+		list_for_each_entry(work, &pool->worklist, entry)
 			send_mayday(work);
 	}
 
 	spin_unlock_irq(&gcwq->lock);
 
-	mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INTERVAL);
+	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
 
 /**
  * maybe_create_worker - create a new worker if necessary
- * @gcwq: gcwq to create a new worker for
+ * @pool: pool to create a new worker for
  *
- * Create a new worker for @gcwq if necessary.  @gcwq is guaranteed to
+ * Create a new worker for @pool if necessary.  @pool is guaranteed to
  * have at least one idle worker on return from this function.  If
  * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
- * sent to all rescuers with works scheduled on @gcwq to resolve
+ * sent to all rescuers with works scheduled on @pool to resolve
  * possible allocation deadlock.
  *
  * On return, need_to_create_worker() is guaranteed to be false and
@@ -1571,52 +1573,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
  * false if no action was taken and gcwq->lock stayed locked, true
  * otherwise.
  */
-static bool maybe_create_worker(struct global_cwq *gcwq)
+static bool maybe_create_worker(struct worker_pool *pool)
 __releases(&gcwq->lock)
 __acquires(&gcwq->lock)
 {
-	if (!need_to_create_worker(gcwq))
+	struct global_cwq *gcwq = pool->gcwq;
+
+	if (!need_to_create_worker(pool))
 		return false;
 restart:
 	spin_unlock_irq(&gcwq->lock);
 
 	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
-	mod_timer(&gcwq->pool.mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
 
 	while (true) {
 		struct worker *worker;
 
-		worker = create_worker(gcwq, true);
+		worker = create_worker(pool, true);
 		if (worker) {
-			del_timer_sync(&gcwq->pool.mayday_timer);
+			del_timer_sync(&pool->mayday_timer);
 			spin_lock_irq(&gcwq->lock);
 			start_worker(worker);
-			BUG_ON(need_to_create_worker(gcwq));
+			BUG_ON(need_to_create_worker(pool));
 			return true;
 		}
 
-		if (!need_to_create_worker(gcwq))
+		if (!need_to_create_worker(pool))
 			break;
 
 		__set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(CREATE_COOLDOWN);
 
-		if (!need_to_create_worker(gcwq))
+		if (!need_to_create_worker(pool))
 			break;
 	}
 
-	del_timer_sync(&gcwq->pool.mayday_timer);
+	del_timer_sync(&pool->mayday_timer);
 	spin_lock_irq(&gcwq->lock);
-	if (need_to_create_worker(gcwq))
+	if (need_to_create_worker(pool))
 		goto restart;
 	return true;
 }
 
 /**
  * maybe_destroy_worker - destroy workers which have been idle for a while
- * @gcwq: gcwq to destroy workers for
+ * @pool: pool to destroy workers for
  *
- * Destroy @gcwq workers which have been idle for longer than
+ * Destroy @pool workers which have been idle for longer than
  * IDLE_WORKER_TIMEOUT.
  *
  * LOCKING:
@@ -1627,20 +1631,19 @@ restart:
  * false if no action was taken and gcwq->lock stayed locked, true
  * otherwise.
  */
-static bool maybe_destroy_workers(struct global_cwq *gcwq)
+static bool maybe_destroy_workers(struct worker_pool *pool)
 {
 	bool ret = false;
 
-	while (too_many_workers(gcwq)) {
+	while (too_many_workers(pool)) {
 		struct worker *worker;
 		unsigned long expires;
 
-		worker = list_entry(gcwq->pool.idle_list.prev, struct worker,
-				    entry);
+		worker = list_entry(pool->idle_list.prev, struct worker, entry);
 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
 		if (time_before(jiffies, expires)) {
-			mod_timer(&gcwq->pool.idle_timer, expires);
+			mod_timer(&pool->idle_timer, expires);
 			break;
 		}
 
@@ -1673,7 +1676,8 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
  */
 static bool manage_workers(struct worker *worker)
 {
-	struct global_cwq *gcwq = worker->pool->gcwq;
+	struct worker_pool *pool = worker->pool;
+	struct global_cwq *gcwq = pool->gcwq;
 	bool ret = false;
 
 	if (gcwq->flags & GCWQ_MANAGING_WORKERS)
@@ -1686,8 +1690,8 @@ static bool manage_workers(struct worker *worker)
 	 * Destroy and then create so that may_start_working() is true
 	 * on return.
 	 */
-	ret |= maybe_destroy_workers(gcwq);
-	ret |= maybe_create_worker(gcwq);
+	ret |= maybe_destroy_workers(pool);
+	ret |= maybe_create_worker(pool);
 
 	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
 
@@ -1746,7 +1750,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
 {
 	struct work_struct *work = list_first_entry(&cwq->delayed_works,
 						    struct work_struct, entry);
-	struct list_head *pos = gcwq_determine_ins_pos(cwq->pool->gcwq, cwq);
+	struct list_head *pos = pool_determine_ins_pos(cwq->pool, cwq);
 
 	trace_workqueue_activate_work(work);
 	move_linked_works(work, pos, NULL);
@@ -1874,7 +1878,7 @@ __acquires(&gcwq->lock)
 
 		if (!list_empty(&pool->worklist) &&
 		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
-			wake_up_worker(gcwq);
+			wake_up_worker(pool);
 		else
 			gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
 	}
@@ -1890,8 +1894,8 @@ __acquires(&gcwq->lock)
 	 * Unbound gcwq isn't concurrency managed and work items should be
 	 * executed ASAP.  Wake up another worker if necessary.
 	 */
-	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(gcwq))
-		wake_up_worker(gcwq);
+	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+		wake_up_worker(pool);
 
 	spin_unlock_irq(&gcwq->lock);
 
@@ -1983,11 +1987,11 @@ woke_up:
 	worker_leave_idle(worker);
 recheck:
 	/* no more worker necessary? */
-	if (!need_more_worker(gcwq))
+	if (!need_more_worker(pool))
 		goto sleep;
 
 	/* do we need to manage? */
-	if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
+	if (unlikely(!may_start_working(pool)) && manage_workers(worker))
 		goto recheck;
 
 	/*
@@ -2018,11 +2022,11 @@ recheck:
 			move_linked_works(work, &worker->scheduled, NULL);
 			process_scheduled_works(worker);
 		}
-	} while (keep_working(gcwq));
+	} while (keep_working(pool));
 
 	worker_set_flags(worker, WORKER_PREP, false);
 sleep:
-	if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
+	if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
 		goto recheck;
 
 	/*
@@ -2107,8 +2111,8 @@ repeat:
 		 * regular worker; otherwise, we end up with 0 concurrency
 		 * and stalling the execution.
 		 */
-		if (keep_working(gcwq))
-			wake_up_worker(gcwq);
+		if (keep_working(pool))
+			wake_up_worker(pool);
 
 		spin_unlock_irq(&gcwq->lock);
 	}
@@ -3383,7 +3387,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * keep_working() are always true as long as the worklist is
 	 * not empty.
 	 */
-	atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
+	atomic_set(get_pool_nr_running(&gcwq->pool), 0);
 
 	spin_unlock_irq(&gcwq->lock);
 	del_timer_sync(&gcwq->pool.idle_timer);
@@ -3424,9 +3428,9 @@ static int __cpuinit trustee_thread(void *__gcwq)
 			wake_up_process(worker->task);
 		}
 
-		if (need_to_create_worker(gcwq)) {
+		if (need_to_create_worker(&gcwq->pool)) {
 			spin_unlock_irq(&gcwq->lock);
-			worker = create_worker(gcwq, false);
+			worker = create_worker(&gcwq->pool, false);
 			spin_lock_irq(&gcwq->lock);
 			if (worker) {
 				worker->flags |= WORKER_ROGUE;
@@ -3540,7 +3544,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		/* fall through */
 	case CPU_UP_PREPARE:
 		BUG_ON(gcwq->pool.first_idle);
-		new_worker = create_worker(gcwq, false);
+		new_worker = create_worker(&gcwq->pool, false);
 		if (!new_worker) {
 			if (new_trustee)
 				kthread_stop(new_trustee);
@@ -3788,7 +3792,7 @@ void thaw_workqueues(void)
 				cwq_activate_first_delayed(cwq);
 		}
 
-		wake_up_worker(gcwq);
+		wake_up_worker(&gcwq->pool);
 
 		spin_unlock_irq(&gcwq->lock);
 	}
@@ -3822,10 +3826,10 @@ static int __init init_workqueues(void)
 
 		init_timer_deferrable(&gcwq->pool.idle_timer);
 		gcwq->pool.idle_timer.function = idle_worker_timeout;
-		gcwq->pool.idle_timer.data = (unsigned long)gcwq;
+		gcwq->pool.idle_timer.data = (unsigned long)&gcwq->pool;
 
 		setup_timer(&gcwq->pool.mayday_timer, gcwq_mayday_timeout,
-			    (unsigned long)gcwq);
+			    (unsigned long)&gcwq->pool);
 
 		ida_init(&gcwq->pool.worker_ida);
 
@@ -3840,7 +3844,7 @@ static int __init init_workqueues(void)
 
 		if (cpu != WORK_CPU_UNBOUND)
 			gcwq->flags &= ~GCWQ_DISASSOCIATED;
-		worker = create_worker(gcwq, true);
+		worker = create_worker(&gcwq->pool, true);
 		BUG_ON(!worker);
 		spin_lock_irq(&gcwq->lock);
 		start_worker(worker);
-- 
cgit v1.2.3


From 1cb7c8f4c82f6f8d428b3bad2757e6ded2a23667 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 12 Jul 2012 14:46:37 -0700
Subject: workqueue: separate out worker_pool flags

GCWQ_MANAGE_WORKERS, GCWQ_MANAGING_WORKERS and GCWQ_HIGHPRI_PENDING
are per-pool properties.  Add worker_pool->flags and make the above
three flags per-pool flags.

The changes in this patch are mechanical and don't cause any
functional difference.  This is to prepare for multiple pools per
gcwq.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 47 +++++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2d82f7b193a0..7a98bae635fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -46,11 +46,13 @@
 
 enum {
 	/* global_cwq flags */
-	GCWQ_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
-	GCWQ_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
-	GCWQ_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
-	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */
-	GCWQ_HIGHPRI_PENDING	= 1 << 4,	/* highpri works on queue */
+	GCWQ_DISASSOCIATED	= 1 << 0,	/* cpu can't serve workers */
+	GCWQ_FREEZING		= 1 << 1,	/* freeze in progress */
+
+	/* pool flags */
+	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
+	POOL_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
+	POOL_HIGHPRI_PENDING	= 1 << 2,	/* highpri works on queue */
 
 	/* worker flags */
 	WORKER_STARTED		= 1 << 0,	/* started */
@@ -142,6 +144,7 @@ struct worker {
 
 struct worker_pool {
 	struct global_cwq	*gcwq;		/* I: the owning gcwq */
+	unsigned int		flags;		/* X: flags */
 
 	struct list_head	worklist;	/* L: list of pending works */
 	int			nr_workers;	/* L: total number of workers */
@@ -583,7 +586,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 static bool __need_more_worker(struct worker_pool *pool)
 {
 	return !atomic_read(get_pool_nr_running(pool)) ||
-		pool->gcwq->flags & GCWQ_HIGHPRI_PENDING;
+		(pool->flags & POOL_HIGHPRI_PENDING);
 }
 
 /*
@@ -612,7 +615,7 @@ static bool keep_working(struct worker_pool *pool)
 
 	return !list_empty(&pool->worklist) &&
 		(atomic_read(nr_running) <= 1 ||
-		 pool->gcwq->flags & GCWQ_HIGHPRI_PENDING);
+		 (pool->flags & POOL_HIGHPRI_PENDING));
 }
 
 /* Do we need a new worker?  Called from manager. */
@@ -625,13 +628,13 @@ static bool need_to_create_worker(struct worker_pool *pool)
 static bool need_to_manage_workers(struct worker_pool *pool)
 {
 	return need_to_create_worker(pool) ||
-		pool->gcwq->flags & GCWQ_MANAGE_WORKERS;
+		(pool->flags & POOL_MANAGE_WORKERS);
 }
 
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-	bool managing = pool->gcwq->flags & GCWQ_MANAGING_WORKERS;
+	bool managing = pool->flags & POOL_MANAGING_WORKERS;
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
@@ -889,7 +892,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
  * position for the work.  If @cwq is for HIGHPRI wq, the work is
  * queued at the head of the queue but in FIFO order with respect to
  * other HIGHPRI works; otherwise, at the end of the queue.  This
- * function also sets GCWQ_HIGHPRI_PENDING flag to hint @pool that
+ * function also sets POOL_HIGHPRI_PENDING flag to hint @pool that
  * there are HIGHPRI works pending.
  *
  * CONTEXT:
@@ -913,7 +916,7 @@ static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool,
 			break;
 	}
 
-	pool->gcwq->flags |= GCWQ_HIGHPRI_PENDING;
+	pool->flags |= POOL_HIGHPRI_PENDING;
 	return &twork->entry;
 }
 
@@ -1500,7 +1503,7 @@ static void idle_worker_timeout(unsigned long __pool)
 			mod_timer(&pool->idle_timer, expires);
 		else {
 			/* it's been idle for too long, wake up manager */
-			gcwq->flags |= GCWQ_MANAGE_WORKERS;
+			pool->flags |= POOL_MANAGE_WORKERS;
 			wake_up_worker(pool);
 		}
 	}
@@ -1680,11 +1683,11 @@ static bool manage_workers(struct worker *worker)
 	struct global_cwq *gcwq = pool->gcwq;
 	bool ret = false;
 
-	if (gcwq->flags & GCWQ_MANAGING_WORKERS)
+	if (pool->flags & POOL_MANAGING_WORKERS)
 		return ret;
 
-	gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
-	gcwq->flags |= GCWQ_MANAGING_WORKERS;
+	pool->flags &= ~POOL_MANAGE_WORKERS;
+	pool->flags |= POOL_MANAGING_WORKERS;
 
 	/*
 	 * Destroy and then create so that may_start_working() is true
@@ -1693,7 +1696,7 @@ static bool manage_workers(struct worker *worker)
 	ret |= maybe_destroy_workers(pool);
 	ret |= maybe_create_worker(pool);
 
-	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+	pool->flags &= ~POOL_MANAGING_WORKERS;
 
 	/*
 	 * The trustee might be waiting to take over the manager
@@ -1872,7 +1875,7 @@ __acquires(&gcwq->lock)
 	 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
 	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
 	 */
-	if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
+	if (unlikely(pool->flags & POOL_HIGHPRI_PENDING)) {
 		struct work_struct *nwork = list_first_entry(&pool->worklist,
 					 struct work_struct, entry);
 
@@ -1880,7 +1883,7 @@ __acquires(&gcwq->lock)
 		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
 			wake_up_worker(pool);
 		else
-			gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
+			pool->flags &= ~POOL_HIGHPRI_PENDING;
 	}
 
 	/*
@@ -3360,10 +3363,10 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * cancelled.
 	 */
 	BUG_ON(gcwq->cpu != smp_processor_id());
-	rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
+	rc = trustee_wait_event(!(gcwq->pool.flags & POOL_MANAGING_WORKERS));
 	BUG_ON(rc < 0);
 
-	gcwq->flags |= GCWQ_MANAGING_WORKERS;
+	gcwq->pool.flags |= POOL_MANAGING_WORKERS;
 
 	list_for_each_entry(worker, &gcwq->pool.idle_list, entry)
 		worker->flags |= WORKER_ROGUE;
@@ -3487,7 +3490,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	}
 
 	/* relinquish manager role */
-	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+	gcwq->pool.flags &= ~POOL_MANAGING_WORKERS;
 
 	/* notify completion */
 	gcwq->trustee = NULL;
@@ -3604,7 +3607,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		spin_unlock_irq(&gcwq->lock);
 		kthread_bind(gcwq->pool.first_idle->task, cpu);
 		spin_lock_irq(&gcwq->lock);
-		gcwq->flags |= GCWQ_MANAGE_WORKERS;
+		gcwq->pool.flags |= POOL_MANAGE_WORKERS;
 		start_worker(gcwq->pool.first_idle);
 		gcwq->pool.first_idle = NULL;
 		break;
-- 
cgit v1.2.3


From 0e38c5a31363c77ecaa7447bde3df94de9267e2f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Jul 2012 22:16:44 -0700
Subject: workqueue: introduce NR_WORKER_POOLS and for_each_worker_pool()

Introduce NR_WORKER_POOLS and for_each_worker_pool() and convert code
paths which need to manipulate all pools in a gcwq to use them.
NR_WORKER_POOLS is currently one and for_each_worker_pool() iterates
over only @gcwq->pool.

Note that nr_running is a per-pool property and is converted to an array
with NR_WORKER_POOLS elements and renamed to pool_nr_running.  Note
that get_pool_nr_running() currently assumes 0 index.  The next patch
will make use of non-zero index.

The changes in this patch are mechanical and don't cause any
functional difference.  This is to prepare for multiple pools per
gcwq.

v2: nr_running indexing bug in get_pool_nr_running() fixed.

v3: Pointer to array is stupid.  Don't use it in get_pool_nr_running()
    as suggested by Linus.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 223 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 153 insertions(+), 70 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7a98bae635fa..b0daaea44eaa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -74,6 +74,8 @@ enum {
 	TRUSTEE_RELEASE		= 3,		/* release workers */
 	TRUSTEE_DONE		= 4,		/* trustee is done */
 
+	NR_WORKER_POOLS		= 1,		/* # worker pools per gcwq */
+
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
 	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
 	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,
@@ -274,6 +276,9 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
 
+#define for_each_worker_pool(pool, gcwq)				\
+	for ((pool) = &(gcwq)->pool; (pool); (pool) = NULL)
+
 #define for_each_busy_worker(worker, i, pos, gcwq)			\
 	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
 		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -454,7 +459,7 @@ static bool workqueue_freezing;		/* W: have wqs started freezing? */
  * try_to_wake_up().  Put it in a separate cacheline.
  */
 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
-static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
+static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
 
 /*
  * Global cpu workqueue and nr_running counter for unbound gcwq.  The
@@ -462,7 +467,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
  * workers have WORKER_UNBOUND set.
  */
 static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);	/* always 0 */
+static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
+	[0 ... NR_WORKER_POOLS - 1]	= ATOMIC_INIT(0),	/* always 0 */
+};
 
 static int worker_thread(void *__worker);
 
@@ -477,11 +484,12 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
 static atomic_t *get_pool_nr_running(struct worker_pool *pool)
 {
 	int cpu = pool->gcwq->cpu;
+	int idx = 0;
 
 	if (cpu != WORK_CPU_UNBOUND)
-		return &per_cpu(gcwq_nr_running, cpu);
+		return &per_cpu(pool_nr_running, cpu)[idx];
 	else
-		return &unbound_gcwq_nr_running;
+		return &unbound_pool_nr_running[idx];
 }
 
 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
@@ -3345,9 +3353,30 @@ EXPORT_SYMBOL_GPL(work_busy);
 	__ret1 < 0 ? -1 : 0;						\
 })
 
+static bool gcwq_is_managing_workers(struct global_cwq *gcwq)
+{
+	struct worker_pool *pool;
+
+	for_each_worker_pool(pool, gcwq)
+		if (pool->flags & POOL_MANAGING_WORKERS)
+			return true;
+	return false;
+}
+
+static bool gcwq_has_idle_workers(struct global_cwq *gcwq)
+{
+	struct worker_pool *pool;
+
+	for_each_worker_pool(pool, gcwq)
+		if (!list_empty(&pool->idle_list))
+			return true;
+	return false;
+}
+
 static int __cpuinit trustee_thread(void *__gcwq)
 {
 	struct global_cwq *gcwq = __gcwq;
+	struct worker_pool *pool;
 	struct worker *worker;
 	struct work_struct *work;
 	struct hlist_node *pos;
@@ -3363,13 +3392,15 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * cancelled.
 	 */
 	BUG_ON(gcwq->cpu != smp_processor_id());
-	rc = trustee_wait_event(!(gcwq->pool.flags & POOL_MANAGING_WORKERS));
+	rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq));
 	BUG_ON(rc < 0);
 
-	gcwq->pool.flags |= POOL_MANAGING_WORKERS;
+	for_each_worker_pool(pool, gcwq) {
+		pool->flags |= POOL_MANAGING_WORKERS;
 
-	list_for_each_entry(worker, &gcwq->pool.idle_list, entry)
-		worker->flags |= WORKER_ROGUE;
+		list_for_each_entry(worker, &pool->idle_list, entry)
+			worker->flags |= WORKER_ROGUE;
+	}
 
 	for_each_busy_worker(worker, i, pos, gcwq)
 		worker->flags |= WORKER_ROGUE;
@@ -3390,10 +3421,12 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * keep_working() are always true as long as the worklist is
 	 * not empty.
 	 */
-	atomic_set(get_pool_nr_running(&gcwq->pool), 0);
+	for_each_worker_pool(pool, gcwq)
+		atomic_set(get_pool_nr_running(pool), 0);
 
 	spin_unlock_irq(&gcwq->lock);
-	del_timer_sync(&gcwq->pool.idle_timer);
+	for_each_worker_pool(pool, gcwq)
+		del_timer_sync(&pool->idle_timer);
 	spin_lock_irq(&gcwq->lock);
 
 	/*
@@ -3415,29 +3448,38 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * may be frozen works in freezable cwqs.  Don't declare
 	 * completion while frozen.
 	 */
-	while (gcwq->pool.nr_workers != gcwq->pool.nr_idle ||
-	       gcwq->flags & GCWQ_FREEZING ||
-	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
-		int nr_works = 0;
+	while (true) {
+		bool busy = false;
 
-		list_for_each_entry(work, &gcwq->pool.worklist, entry) {
-			send_mayday(work);
-			nr_works++;
-		}
+		for_each_worker_pool(pool, gcwq)
+			busy |= pool->nr_workers != pool->nr_idle;
 
-		list_for_each_entry(worker, &gcwq->pool.idle_list, entry) {
-			if (!nr_works--)
-				break;
-			wake_up_process(worker->task);
-		}
+		if (!busy && !(gcwq->flags & GCWQ_FREEZING) &&
+		    gcwq->trustee_state != TRUSTEE_IN_CHARGE)
+			break;
 
-		if (need_to_create_worker(&gcwq->pool)) {
-			spin_unlock_irq(&gcwq->lock);
-			worker = create_worker(&gcwq->pool, false);
-			spin_lock_irq(&gcwq->lock);
-			if (worker) {
-				worker->flags |= WORKER_ROGUE;
-				start_worker(worker);
+		for_each_worker_pool(pool, gcwq) {
+			int nr_works = 0;
+
+			list_for_each_entry(work, &pool->worklist, entry) {
+				send_mayday(work);
+				nr_works++;
+			}
+
+			list_for_each_entry(worker, &pool->idle_list, entry) {
+				if (!nr_works--)
+					break;
+				wake_up_process(worker->task);
+			}
+
+			if (need_to_create_worker(pool)) {
+				spin_unlock_irq(&gcwq->lock);
+				worker = create_worker(pool, false);
+				spin_lock_irq(&gcwq->lock);
+				if (worker) {
+					worker->flags |= WORKER_ROGUE;
+					start_worker(worker);
+				}
 			}
 		}
 
@@ -3452,11 +3494,18 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * all workers till we're canceled.
 	 */
 	do {
-		rc = trustee_wait_event(!list_empty(&gcwq->pool.idle_list));
-		while (!list_empty(&gcwq->pool.idle_list))
-			destroy_worker(list_first_entry(&gcwq->pool.idle_list,
-							struct worker, entry));
-	} while (gcwq->pool.nr_workers && rc >= 0);
+		rc = trustee_wait_event(gcwq_has_idle_workers(gcwq));
+
+		i = 0;
+		for_each_worker_pool(pool, gcwq) {
+			while (!list_empty(&pool->idle_list)) {
+				worker = list_first_entry(&pool->idle_list,
+							  struct worker, entry);
+				destroy_worker(worker);
+			}
+			i |= pool->nr_workers;
+		}
+	} while (i && rc >= 0);
 
 	/*
 	 * At this point, either draining has completed and no worker
@@ -3465,7 +3514,8 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * Tell the remaining busy ones to rebind once it finishes the
 	 * currently scheduled works by scheduling the rebind_work.
 	 */
-	WARN_ON(!list_empty(&gcwq->pool.idle_list));
+	for_each_worker_pool(pool, gcwq)
+		WARN_ON(!list_empty(&pool->idle_list));
 
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		struct work_struct *rebind_work = &worker->rebind_work;
@@ -3490,7 +3540,8 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	}
 
 	/* relinquish manager role */
-	gcwq->pool.flags &= ~POOL_MANAGING_WORKERS;
+	for_each_worker_pool(pool, gcwq)
+		pool->flags &= ~POOL_MANAGING_WORKERS;
 
 	/* notify completion */
 	gcwq->trustee = NULL;
@@ -3532,8 +3583,10 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
 	struct task_struct *new_trustee = NULL;
-	struct worker *uninitialized_var(new_worker);
+	struct worker *new_workers[NR_WORKER_POOLS] = { };
+	struct worker_pool *pool;
 	unsigned long flags;
+	int i;
 
 	action &= ~CPU_TASKS_FROZEN;
 
@@ -3546,12 +3599,12 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		kthread_bind(new_trustee, cpu);
 		/* fall through */
 	case CPU_UP_PREPARE:
-		BUG_ON(gcwq->pool.first_idle);
-		new_worker = create_worker(&gcwq->pool, false);
-		if (!new_worker) {
-			if (new_trustee)
-				kthread_stop(new_trustee);
-			return NOTIFY_BAD;
+		i = 0;
+		for_each_worker_pool(pool, gcwq) {
+			BUG_ON(pool->first_idle);
+			new_workers[i] = create_worker(pool, false);
+			if (!new_workers[i++])
+				goto err_destroy;
 		}
 	}
 
@@ -3568,8 +3621,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
 		/* fall through */
 	case CPU_UP_PREPARE:
-		BUG_ON(gcwq->pool.first_idle);
-		gcwq->pool.first_idle = new_worker;
+		i = 0;
+		for_each_worker_pool(pool, gcwq) {
+			BUG_ON(pool->first_idle);
+			pool->first_idle = new_workers[i++];
+		}
 		break;
 
 	case CPU_DYING:
@@ -3586,8 +3642,10 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		gcwq->trustee_state = TRUSTEE_BUTCHER;
 		/* fall through */
 	case CPU_UP_CANCELED:
-		destroy_worker(gcwq->pool.first_idle);
-		gcwq->pool.first_idle = NULL;
+		for_each_worker_pool(pool, gcwq) {
+			destroy_worker(pool->first_idle);
+			pool->first_idle = NULL;
+		}
 		break;
 
 	case CPU_DOWN_FAILED:
@@ -3604,18 +3662,32 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		 * Put the first_idle in and request a real manager to
 		 * take a look.
 		 */
-		spin_unlock_irq(&gcwq->lock);
-		kthread_bind(gcwq->pool.first_idle->task, cpu);
-		spin_lock_irq(&gcwq->lock);
-		gcwq->pool.flags |= POOL_MANAGE_WORKERS;
-		start_worker(gcwq->pool.first_idle);
-		gcwq->pool.first_idle = NULL;
+		for_each_worker_pool(pool, gcwq) {
+			spin_unlock_irq(&gcwq->lock);
+			kthread_bind(pool->first_idle->task, cpu);
+			spin_lock_irq(&gcwq->lock);
+			pool->flags |= POOL_MANAGE_WORKERS;
+			start_worker(pool->first_idle);
+			pool->first_idle = NULL;
+		}
 		break;
 	}
 
 	spin_unlock_irqrestore(&gcwq->lock, flags);
 
 	return notifier_from_errno(0);
+
+err_destroy:
+	if (new_trustee)
+		kthread_stop(new_trustee);
+
+	spin_lock_irqsave(&gcwq->lock, flags);
+	for (i = 0; i < NR_WORKER_POOLS; i++)
+		if (new_workers[i])
+			destroy_worker(new_workers[i]);
+	spin_unlock_irqrestore(&gcwq->lock, flags);
+
+	return NOTIFY_BAD;
 }
 
 #ifdef CONFIG_SMP
@@ -3774,6 +3846,7 @@ void thaw_workqueues(void)
 
 	for_each_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
+		struct worker_pool *pool;
 		struct workqueue_struct *wq;
 
 		spin_lock_irq(&gcwq->lock);
@@ -3795,7 +3868,8 @@ void thaw_workqueues(void)
 				cwq_activate_first_delayed(cwq);
 		}
 
-		wake_up_worker(&gcwq->pool);
+		for_each_worker_pool(pool, gcwq)
+			wake_up_worker(pool);
 
 		spin_unlock_irq(&gcwq->lock);
 	}
@@ -3816,25 +3890,29 @@ static int __init init_workqueues(void)
 	/* initialize gcwqs */
 	for_each_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
+		struct worker_pool *pool;
 
 		spin_lock_init(&gcwq->lock);
-		gcwq->pool.gcwq = gcwq;
-		INIT_LIST_HEAD(&gcwq->pool.worklist);
 		gcwq->cpu = cpu;
 		gcwq->flags |= GCWQ_DISASSOCIATED;
 
-		INIT_LIST_HEAD(&gcwq->pool.idle_list);
 		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
 			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
 
-		init_timer_deferrable(&gcwq->pool.idle_timer);
-		gcwq->pool.idle_timer.function = idle_worker_timeout;
-		gcwq->pool.idle_timer.data = (unsigned long)&gcwq->pool;
+		for_each_worker_pool(pool, gcwq) {
+			pool->gcwq = gcwq;
+			INIT_LIST_HEAD(&pool->worklist);
+			INIT_LIST_HEAD(&pool->idle_list);
 
-		setup_timer(&gcwq->pool.mayday_timer, gcwq_mayday_timeout,
-			    (unsigned long)&gcwq->pool);
+			init_timer_deferrable(&pool->idle_timer);
+			pool->idle_timer.function = idle_worker_timeout;
+			pool->idle_timer.data = (unsigned long)pool;
 
-		ida_init(&gcwq->pool.worker_ida);
+			setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
+				    (unsigned long)pool);
+
+			ida_init(&pool->worker_ida);
+		}
 
 		gcwq->trustee_state = TRUSTEE_DONE;
 		init_waitqueue_head(&gcwq->trustee_wait);
@@ -3843,15 +3921,20 @@ static int __init init_workqueues(void)
 	/* create the initial worker */
 	for_each_online_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
-		struct worker *worker;
+		struct worker_pool *pool;
 
 		if (cpu != WORK_CPU_UNBOUND)
 			gcwq->flags &= ~GCWQ_DISASSOCIATED;
-		worker = create_worker(&gcwq->pool, true);
-		BUG_ON(!worker);
-		spin_lock_irq(&gcwq->lock);
-		start_worker(worker);
-		spin_unlock_irq(&gcwq->lock);
+
+		for_each_worker_pool(pool, gcwq) {
+			struct worker *worker;
+
+			worker = create_worker(pool, true);
+			BUG_ON(!worker);
+			spin_lock_irq(&gcwq->lock);
+			start_worker(worker);
+			spin_unlock_irq(&gcwq->lock);
+		}
 	}
 
 	system_wq = alloc_workqueue("events", 0, 0);
-- 
cgit v1.2.3


From 7f2532958fb57c4ef952d13b121d0e49e00dab94 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Jul 2012 22:16:45 -0700
Subject: workqueue: reimplement WQ_HIGHPRI using a separate worker_pool

WQ_HIGHPRI was implemented by queueing highpri work items at the head
of the global worklist.  Other than queueing at the head, they weren't
handled differently; unfortunately, this could lead to execution
latency of a few seconds on heavily loaded systems.

Now that workqueue code has been updated to deal with multiple
worker_pools per global_cwq, this patch reimplements WQ_HIGHPRI using
a separate worker_pool.  NR_WORKER_POOLS is bumped to two and
gcwq->pools[0] is used for normal pri work items and ->pools[1] for
highpri.  Highpri workers get -20 nice level and have 'H' suffix in
their names.  Note that this change increases the number of kworkers
per cpu.

POOL_HIGHPRI_PENDING, pool_determine_ins_pos() and highpri chain
wakeup code in process_one_work() are no longer used and removed.

This allows proper prioritization of highpri work items and removes
high execution latency of highpri work items.

v2: nr_running indexing bug in get_pool_nr_running() fixed.

v3: Refreshed for the get_pool_nr_running() update in the previous
    patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Josh Hunt <joshhunt00@gmail.com>
LKML-Reference: <CAKA=qzaHqwZ8eqpLNFjxnO2fX-tgAOjmpvxgBFjv6dJeQaOW1w@mail.gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
---
 Documentation/workqueue.txt | 103 ++++++++++++++++----------------------------
 kernel/workqueue.c          | 100 ++++++++++++------------------------------
 2 files changed, 65 insertions(+), 138 deletions(-)

diff --git a/Documentation/workqueue.txt b/Documentation/workqueue.txt
index a0b577de918f..a6ab4b62d926 100644
--- a/Documentation/workqueue.txt
+++ b/Documentation/workqueue.txt
@@ -89,25 +89,28 @@ called thread-pools.
 
 The cmwq design differentiates between the user-facing workqueues that
 subsystems and drivers queue work items on and the backend mechanism
-which manages thread-pool and processes the queued work items.
+which manages thread-pools and processes the queued work items.
 
 The backend is called gcwq.  There is one gcwq for each possible CPU
-and one gcwq to serve work items queued on unbound workqueues.
+and one gcwq to serve work items queued on unbound workqueues.  Each
+gcwq has two thread-pools - one for normal work items and the other
+for high priority ones.
 
 Subsystems and drivers can create and queue work items through special
 workqueue API functions as they see fit. They can influence some
 aspects of the way the work items are executed by setting flags on the
 workqueue they are putting the work item on. These flags include
-things like CPU locality, reentrancy, concurrency limits and more. To
-get a detailed overview refer to the API description of
+things like CPU locality, reentrancy, concurrency limits, priority and
+more.  To get a detailed overview refer to the API description of
 alloc_workqueue() below.
 
-When a work item is queued to a workqueue, the target gcwq is
-determined according to the queue parameters and workqueue attributes
-and appended on the shared worklist of the gcwq.  For example, unless
-specifically overridden, a work item of a bound workqueue will be
-queued on the worklist of exactly that gcwq that is associated to the
-CPU the issuer is running on.
+When a work item is queued to a workqueue, the target gcwq and
+thread-pool is determined according to the queue parameters and
+workqueue attributes and appended on the shared worklist of the
+thread-pool.  For example, unless specifically overridden, a work item
+of a bound workqueue will be queued on the worklist of either normal
+or highpri thread-pool of the gcwq that is associated to the CPU the
+issuer is running on.
 
 For any worker pool implementation, managing the concurrency level
 (how many execution contexts are active) is an important issue.  cmwq
@@ -115,26 +118,26 @@ tries to keep the concurrency at a minimal but sufficient level.
 Minimal to save resources and sufficient in that the system is used at
 its full capacity.
 
-Each gcwq bound to an actual CPU implements concurrency management by
-hooking into the scheduler.  The gcwq is notified whenever an active
-worker wakes up or sleeps and keeps track of the number of the
-currently runnable workers.  Generally, work items are not expected to
-hog a CPU and consume many cycles.  That means maintaining just enough
-concurrency to prevent work processing from stalling should be
-optimal.  As long as there are one or more runnable workers on the
-CPU, the gcwq doesn't start execution of a new work, but, when the
-last running worker goes to sleep, it immediately schedules a new
-worker so that the CPU doesn't sit idle while there are pending work
-items.  This allows using a minimal number of workers without losing
-execution bandwidth.
+Each thread-pool bound to an actual CPU implements concurrency
+management by hooking into the scheduler.  The thread-pool is notified
+whenever an active worker wakes up or sleeps and keeps track of the
+number of the currently runnable workers.  Generally, work items are
+not expected to hog a CPU and consume many cycles.  That means
+maintaining just enough concurrency to prevent work processing from
+stalling should be optimal.  As long as there are one or more runnable
+workers on the CPU, the thread-pool doesn't start execution of a new
+work, but, when the last running worker goes to sleep, it immediately
+schedules a new worker so that the CPU doesn't sit idle while there
+are pending work items.  This allows using a minimal number of workers
+without losing execution bandwidth.
 
 Keeping idle workers around doesn't cost other than the memory space
 for kthreads, so cmwq holds onto idle ones for a while before killing
 them.
 
 For an unbound wq, the above concurrency management doesn't apply and
-the gcwq for the pseudo unbound CPU tries to start executing all work
-items as soon as possible.  The responsibility of regulating
+the thread-pools for the pseudo unbound CPU try to start executing all
+work items as soon as possible.  The responsibility of regulating
 concurrency level is on the users.  There is also a flag to mark a
 bound wq to ignore the concurrency management.  Please refer to the
 API section for details.
@@ -205,31 +208,22 @@ resources, scheduled and executed.
 
   WQ_HIGHPRI
 
-	Work items of a highpri wq are queued at the head of the
-	worklist of the target gcwq and start execution regardless of
-	the current concurrency level.  In other words, highpri work
-	items will always start execution as soon as execution
-	resource is available.
+	Work items of a highpri wq are queued to the highpri
+	thread-pool of the target gcwq.  Highpri thread-pools are
+	served by worker threads with elevated nice level.
 
-	Ordering among highpri work items is preserved - a highpri
-	work item queued after another highpri work item will start
-	execution after the earlier highpri work item starts.
-
-	Although highpri work items are not held back by other
-	runnable work items, they still contribute to the concurrency
-	level.  Highpri work items in runnable state will prevent
-	non-highpri work items from starting execution.
-
-	This flag is meaningless for unbound wq.
+	Note that normal and highpri thread-pools don't interact with
+	each other.  Each maintain its separate pool of workers and
+	implements concurrency management among its workers.
 
   WQ_CPU_INTENSIVE
 
 	Work items of a CPU intensive wq do not contribute to the
 	concurrency level.  In other words, runnable CPU intensive
-	work items will not prevent other work items from starting
-	execution.  This is useful for bound work items which are
-	expected to hog CPU cycles so that their execution is
-	regulated by the system scheduler.
+	work items will not prevent other work items in the same
+	thread-pool from starting execution.  This is useful for bound
+	work items which are expected to hog CPU cycles so that their
+	execution is regulated by the system scheduler.
 
 	Although CPU intensive work items don't contribute to the
 	concurrency level, start of their executions is still
@@ -239,14 +233,6 @@ resources, scheduled and executed.
 
 	This flag is meaningless for unbound wq.
 
-  WQ_HIGHPRI | WQ_CPU_INTENSIVE
-
-	This combination makes the wq avoid interaction with
-	concurrency management completely and behave as a simple
-	per-CPU execution context provider.  Work items queued on a
-	highpri CPU-intensive wq start execution as soon as resources
-	are available and don't affect execution of other work items.
-
 @max_active:
 
 @max_active determines the maximum number of execution contexts per
@@ -328,20 +314,7 @@ If @max_active == 2,
  35		w2 wakes up and finishes
 
 Now, let's assume w1 and w2 are queued to a different wq q1 which has
-WQ_HIGHPRI set,
-
- TIME IN MSECS	EVENT
- 0		w1 and w2 start and burn CPU
- 5		w1 sleeps
- 10		w2 sleeps
- 10		w0 starts and burns CPU
- 15		w0 sleeps
- 15		w1 wakes up and finishes
- 20		w2 wakes up and finishes
- 25		w0 wakes up and burns CPU
- 30		w0 finishes
-
-If q1 has WQ_CPU_INTENSIVE set,
+WQ_CPU_INTENSIVE set,
 
  TIME IN MSECS	EVENT
  0		w0 starts and burns CPU
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b0daaea44eaa..4fa9e3552f1e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -52,7 +52,6 @@ enum {
 	/* pool flags */
 	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
 	POOL_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
-	POOL_HIGHPRI_PENDING	= 1 << 2,	/* highpri works on queue */
 
 	/* worker flags */
 	WORKER_STARTED		= 1 << 0,	/* started */
@@ -74,7 +73,7 @@ enum {
 	TRUSTEE_RELEASE		= 3,		/* release workers */
 	TRUSTEE_DONE		= 4,		/* trustee is done */
 
-	NR_WORKER_POOLS		= 1,		/* # worker pools per gcwq */
+	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */
 
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
 	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
@@ -95,6 +94,7 @@ enum {
 	 * all cpus.  Give -20.
 	 */
 	RESCUER_NICE_LEVEL	= -20,
+	HIGHPRI_NICE_LEVEL	= -20,
 };
 
 /*
@@ -174,7 +174,7 @@ struct global_cwq {
 	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
 						/* L: hash of busy workers */
 
-	struct worker_pool	pool;		/* the worker pools */
+	struct worker_pool	pools[2];	/* normal and highpri pools */
 
 	struct task_struct	*trustee;	/* L: for gcwq shutdown */
 	unsigned int		trustee_state;	/* L: trustee state */
@@ -277,7 +277,8 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 #include <trace/events/workqueue.h>
 
 #define for_each_worker_pool(pool, gcwq)				\
-	for ((pool) = &(gcwq)->pool; (pool); (pool) = NULL)
+	for ((pool) = &(gcwq)->pools[0];				\
+	     (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
 
 #define for_each_busy_worker(worker, i, pos, gcwq)			\
 	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
@@ -473,6 +474,11 @@ static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
 
 static int worker_thread(void *__worker);
 
+static int worker_pool_pri(struct worker_pool *pool)
+{
+	return pool - pool->gcwq->pools;
+}
+
 static struct global_cwq *get_gcwq(unsigned int cpu)
 {
 	if (cpu != WORK_CPU_UNBOUND)
@@ -484,7 +490,7 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
 static atomic_t *get_pool_nr_running(struct worker_pool *pool)
 {
 	int cpu = pool->gcwq->cpu;
-	int idx = 0;
+	int idx = worker_pool_pri(pool);
 
 	if (cpu != WORK_CPU_UNBOUND)
 		return &per_cpu(pool_nr_running, cpu)[idx];
@@ -586,15 +592,14 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 }
 
 /*
- * Policy functions.  These define the policies on how the global
- * worker pool is managed.  Unless noted otherwise, these functions
- * assume that they're being called with gcwq->lock held.
+ * Policy functions.  These define the policies on how the global worker
+ * pools are managed.  Unless noted otherwise, these functions assume that
+ * they're being called with gcwq->lock held.
  */
 
 static bool __need_more_worker(struct worker_pool *pool)
 {
-	return !atomic_read(get_pool_nr_running(pool)) ||
-		(pool->flags & POOL_HIGHPRI_PENDING);
+	return !atomic_read(get_pool_nr_running(pool));
 }
 
 /*
@@ -621,9 +626,7 @@ static bool keep_working(struct worker_pool *pool)
 {
 	atomic_t *nr_running = get_pool_nr_running(pool);
 
-	return !list_empty(&pool->worklist) &&
-		(atomic_read(nr_running) <= 1 ||
-		 (pool->flags & POOL_HIGHPRI_PENDING));
+	return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
 }
 
 /* Do we need a new worker?  Called from manager. */
@@ -891,43 +894,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 					    work);
 }
 
-/**
- * pool_determine_ins_pos - find insertion position
- * @pool: pool of interest
- * @cwq: cwq a work is being queued for
- *
- * A work for @cwq is about to be queued on @pool, determine insertion
- * position for the work.  If @cwq is for HIGHPRI wq, the work is
- * queued at the head of the queue but in FIFO order with respect to
- * other HIGHPRI works; otherwise, at the end of the queue.  This
- * function also sets POOL_HIGHPRI_PENDING flag to hint @pool that
- * there are HIGHPRI works pending.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to inserstion position.
- */
-static inline struct list_head *pool_determine_ins_pos(struct worker_pool *pool,
-					       struct cpu_workqueue_struct *cwq)
-{
-	struct work_struct *twork;
-
-	if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
-		return &pool->worklist;
-
-	list_for_each_entry(twork, &pool->worklist, entry) {
-		struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
-
-		if (!(tcwq->wq->flags & WQ_HIGHPRI))
-			break;
-	}
-
-	pool->flags |= POOL_HIGHPRI_PENDING;
-	return &twork->entry;
-}
-
 /**
  * insert_work - insert a work into gcwq
  * @cwq: cwq @work belongs to
@@ -1068,7 +1034,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	if (likely(cwq->nr_active < cwq->max_active)) {
 		trace_workqueue_activate_work(work);
 		cwq->nr_active++;
-		worklist = pool_determine_ins_pos(cwq->pool, cwq);
+		worklist = &cwq->pool->worklist;
 	} else {
 		work_flags |= WORK_STRUCT_DELAYED;
 		worklist = &cwq->delayed_works;
@@ -1385,6 +1351,7 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind)
 {
 	struct global_cwq *gcwq = pool->gcwq;
 	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
+	const char *pri = worker_pool_pri(pool) ? "H" : "";
 	struct worker *worker = NULL;
 	int id = -1;
 
@@ -1406,15 +1373,17 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind)
 
 	if (!on_unbound_cpu)
 		worker->task = kthread_create_on_node(worker_thread,
-						      worker,
-						      cpu_to_node(gcwq->cpu),
-						      "kworker/%u:%d", gcwq->cpu, id);
+					worker, cpu_to_node(gcwq->cpu),
+					"kworker/%u:%d%s", gcwq->cpu, id, pri);
 	else
 		worker->task = kthread_create(worker_thread, worker,
-					      "kworker/u:%d", id);
+					      "kworker/u:%d%s", id, pri);
 	if (IS_ERR(worker->task))
 		goto fail;
 
+	if (worker_pool_pri(pool))
+		set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
+
 	/*
 	 * A rogue worker will become a regular one if CPU comes
 	 * online later on.  Make sure every worker has
@@ -1761,10 +1730,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
 {
 	struct work_struct *work = list_first_entry(&cwq->delayed_works,
 						    struct work_struct, entry);
-	struct list_head *pos = pool_determine_ins_pos(cwq->pool, cwq);
 
 	trace_workqueue_activate_work(work);
-	move_linked_works(work, pos, NULL);
+	move_linked_works(work, &cwq->pool->worklist, NULL);
 	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
 	cwq->nr_active++;
 }
@@ -1879,21 +1847,6 @@ __acquires(&gcwq->lock)
 	set_work_cpu(work, gcwq->cpu);
 	list_del_init(&work->entry);
 
-	/*
-	 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
-	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
-	 */
-	if (unlikely(pool->flags & POOL_HIGHPRI_PENDING)) {
-		struct work_struct *nwork = list_first_entry(&pool->worklist,
-					 struct work_struct, entry);
-
-		if (!list_empty(&pool->worklist) &&
-		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
-			wake_up_worker(pool);
-		else
-			pool->flags &= ~POOL_HIGHPRI_PENDING;
-	}
-
 	/*
 	 * CPU intensive works don't participate in concurrency
 	 * management.  They're the scheduler's responsibility.
@@ -3047,9 +3000,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 	for_each_cwq_cpu(cpu, wq) {
 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 		struct global_cwq *gcwq = get_gcwq(cpu);
+		int pool_idx = (bool)(flags & WQ_HIGHPRI);
 
 		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
-		cwq->pool = &gcwq->pool;
+		cwq->pool = &gcwq->pools[pool_idx];
 		cwq->wq = wq;
 		cwq->flush_color = -1;
 		cwq->max_active = max_active;
-- 
cgit v1.2.3


From fd193a6056990edf8160aa23d7ecf442d293dd97 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Jul 2012 12:39:26 -0700
Subject: workqueue: perform cpu down operations from low priority
 cpu_notifier()

Currently, all workqueue cpu hotplug operations run off
CPU_PRI_WORKQUEUE which is higher than normal notifiers.  This is to
ensure that workqueue is up and running while bringing up a CPU before
other notifiers try to use workqueue on the CPU.

Per-cpu workqueues are supposed to remain working and bound to the CPU
for normal CPU_DOWN_PREPARE notifiers.  This holds mostly true even
with workqueue offlining running with higher priority because
workqueue CPU_DOWN_PREPARE only creates a bound trustee thread which
runs the per-cpu workqueue without concurrency management and without
explicitly detaching the existing workers.

However, if the trustee needs to create new workers, it creates
unbound workers which may wander off to other CPUs while
CPU_DOWN_PREPARE notifiers are in progress.  Furthermore, if the CPU
down is cancelled, the per-CPU workqueue may end up with workers which
aren't bound to the CPU.

While reliably reproducible with a convoluted artificial test-case
involving scheduling and flushing CPU burning work items from CPU down
notifiers, this isn't very likely to happen in the wild, and, even
when it happens, the effects are likely to be hidden by the following
successful CPU down.

Fix it by using different priorities for up and down notifiers - high
priority for up operations and low priority for down operations.

Workqueue cpu hotplug operations will soon go through further cleanup.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 include/linux/cpu.h |  5 +++--
 kernel/workqueue.c  | 38 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2e9b9ebbeb78..ce7a074f2519 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -73,8 +73,9 @@ enum {
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,
-	/* prepare workqueues for other notifiers */
-	CPU_PRI_WORKQUEUE	= 5,
+	/* bring up workqueues before normal notifiers and down after */
+	CPU_PRI_WORKQUEUE_UP	= 5,
+	CPU_PRI_WORKQUEUE_DOWN	= -5,
 };
 
 #define CPU_ONLINE		0x0002 /* CPU (unsigned)v is up */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4fa9e3552f1e..f59b7fd26e26 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3644,6 +3644,41 @@ err_destroy:
 	return NOTIFY_BAD;
 }
 
+/*
+ * Workqueues should be brought up before normal priority CPU notifiers.
+ * This will be registered high priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+					       unsigned long action,
+					       void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_CANCELED:
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE:
+		return workqueue_cpu_callback(nfb, action, hcpu);
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as low priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+						 unsigned long action,
+						 void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
+	case CPU_POST_DEAD:
+		return workqueue_cpu_callback(nfb, action, hcpu);
+	}
+	return NOTIFY_OK;
+}
+
 #ifdef CONFIG_SMP
 
 struct work_for_cpu {
@@ -3839,7 +3874,8 @@ static int __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
-	cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
+	cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
 
 	/* initialize gcwqs */
 	for_each_gcwq_cpu(cpu) {
-- 
cgit v1.2.3


From 24e023a4ca785639b9849bc736b2ffe7a7c1d4a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Jul 2012 12:39:26 -0700
Subject: workqueue: drop CPU_DYING notifier operation

Workqueue used CPU_DYING notification to mark GCWQ_DISASSOCIATED.
This was necessary because workqueue's CPU_DOWN_PREPARE happened
before other DOWN_PREPARE notifiers and workqueue needed to stay
associated across the rest of DOWN_PREPARE.

After the previous patch, workqueue's DOWN_PREPARE happens after
others and can set GCWQ_DISASSOCIATED directly.  Drop CPU_DYING and
let the trustee set GCWQ_DISASSOCIATED after disabling concurrency
management.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 kernel/workqueue.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f59b7fd26e26..1405fb98c0b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1250,11 +1250,11 @@ static void worker_leave_idle(struct worker *worker)
  * verbatim as it's best effort and blocking and gcwq may be
  * [dis]associated in the meantime.
  *
- * This function tries set_cpus_allowed() and locks gcwq and verifies
- * the binding against GCWQ_DISASSOCIATED which is set during
- * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
- * idle state or fetches works without dropping lock, it can guarantee
- * the scheduling requirement described in the first paragraph.
+ * This function tries set_cpus_allowed() and locks gcwq and verifies the
+ * binding against %GCWQ_DISASSOCIATED which is set during
+ * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
+ * enters idle state or fetches works without dropping lock, it can
+ * guarantee the scheduling requirement described in the first paragraph.
  *
  * CONTEXT:
  * Might sleep.  Called without any lock but returns with gcwq->lock
@@ -3349,6 +3349,12 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq));
 	BUG_ON(rc < 0);
 
+	/*
+	 * We've claimed all manager positions.  Make all workers unbound
+	 * and set DISASSOCIATED.  Before this, all workers except for the
+	 * ones which are still executing works from before the last CPU
+	 * down must be on the cpu.  After this, they may become diasporas.
+	 */
 	for_each_worker_pool(pool, gcwq) {
 		pool->flags |= POOL_MANAGING_WORKERS;
 
@@ -3359,6 +3365,8 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	for_each_busy_worker(worker, i, pos, gcwq)
 		worker->flags |= WORKER_ROGUE;
 
+	gcwq->flags |= GCWQ_DISASSOCIATED;
+
 	/*
 	 * Call schedule() so that we cross rq->lock and thus can
 	 * guarantee sched callbacks see the rogue flag.  This is
@@ -3582,16 +3590,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		}
 		break;
 
-	case CPU_DYING:
-		/*
-		 * Before this, the trustee and all workers except for
-		 * the ones which are still executing works from
-		 * before the last CPU down must be on the cpu.  After
-		 * this, they'll all be diasporas.
-		 */
-		gcwq->flags |= GCWQ_DISASSOCIATED;
-		break;
-
 	case CPU_POST_DEAD:
 		gcwq->trustee_state = TRUSTEE_BUTCHER;
 		/* fall through */
@@ -3672,7 +3670,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
-	case CPU_DYING:
 	case CPU_POST_DEAD:
 		return workqueue_cpu_callback(nfb, action, hcpu);
 	}
-- 
cgit v1.2.3


From e9a4a180064edba5451bf067fc16744b0ca6b977 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Jul 2012 12:39:27 -0700
Subject: workqueue: ROGUE workers are UNBOUND workers

Currently, WORKER_UNBOUND is used to mark workers for the unbound
global_cwq and WORKER_ROGUE is used to mark workers for disassociated
per-cpu global_cwqs.  Both are used to make the marked worker skip
concurrency management and the only place they make any difference is
in worker_enter_idle() where WORKER_ROGUE is used to skip scheduling
idle timer, which can easily be replaced with trustee state testing.

This patch replaces WORKER_ROGUE with WORKER_UNBOUND and drops
WORKER_ROGUE.  This is to prepare for removing trustee and handling
disassociated global_cwqs as unbound.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 kernel/workqueue.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1405fb98c0b1..af512927c607 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -58,13 +58,12 @@ enum {
 	WORKER_DIE		= 1 << 1,	/* die die die */
 	WORKER_IDLE		= 1 << 2,	/* is idle */
 	WORKER_PREP		= 1 << 3,	/* preparing to run works */
-	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */
 	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */
 	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
 	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
 
-	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
-				  WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
+	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
+				  WORKER_CPU_INTENSIVE,
 
 	/* gcwq->trustee_state */
 	TRUSTEE_START		= 0,		/* start */
@@ -1198,7 +1197,7 @@ static void worker_enter_idle(struct worker *worker)
 	/* idle_list is LIFO */
 	list_add(&worker->entry, &pool->idle_list);
 
-	if (likely(!(worker->flags & WORKER_ROGUE))) {
+	if (likely(gcwq->trustee_state != TRUSTEE_DONE)) {
 		if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
 			mod_timer(&pool->idle_timer,
 				  jiffies + IDLE_WORKER_TIMEOUT);
@@ -1207,7 +1206,7 @@ static void worker_enter_idle(struct worker *worker)
 
 	/*
 	 * Sanity check nr_running.  Because trustee releases gcwq->lock
-	 * between setting %WORKER_ROGUE and zapping nr_running, the
+	 * between setting %WORKER_UNBOUND and zapping nr_running, the
 	 * warning may trigger spuriously.  Check iff trustee is idle.
 	 */
 	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
@@ -1301,10 +1300,10 @@ __acquires(&gcwq->lock)
 }
 
 /*
- * Function for worker->rebind_work used to rebind rogue busy workers
- * to the associated cpu which is coming back online.  This is
- * scheduled by cpu up but can race with other cpu hotplug operations
- * and may be executed twice without intervening cpu down.
+ * Function for worker->rebind_work used to rebind unbound busy workers to
+ * the associated cpu which is coming back online.  This is scheduled by
+ * cpu up but can race with other cpu hotplug operations and may be
+ * executed twice without intervening cpu down.
  */
 static void worker_rebind_fn(struct work_struct *work)
 {
@@ -1385,9 +1384,8 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind)
 		set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
 
 	/*
-	 * A rogue worker will become a regular one if CPU comes
-	 * online later on.  Make sure every worker has
-	 * PF_THREAD_BOUND set.
+	 * An unbound worker will become a regular one if CPU comes online
+	 * later on.  Make sure every worker has PF_THREAD_BOUND set.
 	 */
 	if (bind && !on_unbound_cpu)
 		kthread_bind(worker->task, gcwq->cpu);
@@ -3215,11 +3213,10 @@ EXPORT_SYMBOL_GPL(work_busy);
  * gcwqs serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
- * This is solved by allowing a gcwq to be detached from CPU, running
- * it with unbound (rogue) workers and allowing it to be reattached
- * later if the cpu comes back online.  A separate thread is created
- * to govern a gcwq in such state and is called the trustee of the
- * gcwq.
+ * This is solved by allowing a gcwq to be detached from CPU, running it
+ * with unbound workers and allowing it to be reattached later if the cpu
+ * comes back online.  A separate thread is created to govern a gcwq in
+ * such state and is called the trustee of the gcwq.
  *
  * Trustee states and their descriptions.
  *
@@ -3359,19 +3356,18 @@ static int __cpuinit trustee_thread(void *__gcwq)
 		pool->flags |= POOL_MANAGING_WORKERS;
 
 		list_for_each_entry(worker, &pool->idle_list, entry)
-			worker->flags |= WORKER_ROGUE;
+			worker->flags |= WORKER_UNBOUND;
 	}
 
 	for_each_busy_worker(worker, i, pos, gcwq)
-		worker->flags |= WORKER_ROGUE;
+		worker->flags |= WORKER_UNBOUND;
 
 	gcwq->flags |= GCWQ_DISASSOCIATED;
 
 	/*
-	 * Call schedule() so that we cross rq->lock and thus can
-	 * guarantee sched callbacks see the rogue flag.  This is
-	 * necessary as scheduler callbacks may be invoked from other
-	 * cpus.
+	 * Call schedule() so that we cross rq->lock and thus can guarantee
+	 * sched callbacks see the unbound flag.  This is necessary as
+	 * scheduler callbacks may be invoked from other cpus.
 	 */
 	spin_unlock_irq(&gcwq->lock);
 	schedule();
@@ -3439,7 +3435,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 				worker = create_worker(pool, false);
 				spin_lock_irq(&gcwq->lock);
 				if (worker) {
-					worker->flags |= WORKER_ROGUE;
+					worker->flags |= WORKER_UNBOUND;
 					start_worker(worker);
 				}
 			}
@@ -3488,7 +3484,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 		 * rebinding is scheduled.
 		 */
 		worker->flags |= WORKER_REBIND;
-		worker->flags &= ~WORKER_ROGUE;
+		worker->flags &= ~WORKER_UNBOUND;
 
 		/* queue rebind_work, wq doesn't matter, use the default one */
 		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
-- 
cgit v1.2.3


From 4f2c8c302881be070c3d1056a24fbfb0021479d2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Jul 2012 12:39:27 -0700
Subject: workqueue: use mutex for global_cwq manager exclusion

POOL_MANAGING_WORKERS is used to ensure that at most one worker takes
the manager role at any given time on a given global_cwq.  The trustee
later hitched onto it to assume the manager role, adding a blocking wait
for the bit.  As the trustee already needed a custom wait mechanism,
waiting for MANAGING_WORKERS was rolled into the same mechanism.

Trustee is scheduled to be removed.  This patch separates out
MANAGING_WORKERS wait into per-pool mutex.  Workers use
mutex_trylock() to test for manager role and trustee uses mutex_lock()
to claim manager roles.

gcwq_claim/release_management() helpers are added to grab and release
manager roles of all pools on a global_cwq.  gcwq_claim_management()
always grabs pool manager mutexes in ascending pool index order and
uses pool index as lockdep subclass.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 kernel/workqueue.c | 65 ++++++++++++++++++++++--------------------------------
 1 file changed, 26 insertions(+), 39 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index af512927c607..f7a00697d150 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -51,7 +51,6 @@ enum {
 
 	/* pool flags */
 	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
-	POOL_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
 
 	/* worker flags */
 	WORKER_STARTED		= 1 << 0,	/* started */
@@ -155,6 +154,7 @@ struct worker_pool {
 	struct timer_list	idle_timer;	/* L: worker idle timeout */
 	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
 
+	struct mutex		manager_mutex;	/* mutex manager should hold */
 	struct ida		worker_ida;	/* L: for worker IDs */
 	struct worker		*first_idle;	/* L: first idle worker */
 };
@@ -644,7 +644,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-	bool managing = pool->flags & POOL_MANAGING_WORKERS;
+	bool managing = mutex_is_locked(&pool->manager_mutex);
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
@@ -1655,14 +1655,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
 static bool manage_workers(struct worker *worker)
 {
 	struct worker_pool *pool = worker->pool;
-	struct global_cwq *gcwq = pool->gcwq;
 	bool ret = false;
 
-	if (pool->flags & POOL_MANAGING_WORKERS)
+	if (!mutex_trylock(&pool->manager_mutex))
 		return ret;
 
 	pool->flags &= ~POOL_MANAGE_WORKERS;
-	pool->flags |= POOL_MANAGING_WORKERS;
 
 	/*
 	 * Destroy and then create so that may_start_working() is true
@@ -1671,15 +1669,7 @@ static bool manage_workers(struct worker *worker)
 	ret |= maybe_destroy_workers(pool);
 	ret |= maybe_create_worker(pool);
 
-	pool->flags &= ~POOL_MANAGING_WORKERS;
-
-	/*
-	 * The trustee might be waiting to take over the manager
-	 * position, tell it we're done.
-	 */
-	if (unlikely(gcwq->trustee))
-		wake_up_all(&gcwq->trustee_wait);
-
+	mutex_unlock(&pool->manager_mutex);
 	return ret;
 }
 
@@ -3255,6 +3245,24 @@ EXPORT_SYMBOL_GPL(work_busy);
  *                         ----------------> RELEASE --------------
  */
 
+/* claim manager positions of all pools */
+static void gcwq_claim_management(struct global_cwq *gcwq)
+{
+	struct worker_pool *pool;
+
+	for_each_worker_pool(pool, gcwq)
+		mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
+}
+
+/* release manager positions */
+static void gcwq_release_management(struct global_cwq *gcwq)
+{
+	struct worker_pool *pool;
+
+	for_each_worker_pool(pool, gcwq)
+		mutex_unlock(&pool->manager_mutex);
+}
+
 /**
  * trustee_wait_event_timeout - timed event wait for trustee
  * @cond: condition to wait for
@@ -3304,16 +3312,6 @@ EXPORT_SYMBOL_GPL(work_busy);
 	__ret1 < 0 ? -1 : 0;						\
 })
 
-static bool gcwq_is_managing_workers(struct global_cwq *gcwq)
-{
-	struct worker_pool *pool;
-
-	for_each_worker_pool(pool, gcwq)
-		if (pool->flags & POOL_MANAGING_WORKERS)
-			return true;
-	return false;
-}
-
 static bool gcwq_has_idle_workers(struct global_cwq *gcwq)
 {
 	struct worker_pool *pool;
@@ -3336,15 +3334,8 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
 	BUG_ON(gcwq->cpu != smp_processor_id());
 
+	gcwq_claim_management(gcwq);
 	spin_lock_irq(&gcwq->lock);
-	/*
-	 * Claim the manager position and make all workers rogue.
-	 * Trustee must be bound to the target cpu and can't be
-	 * cancelled.
-	 */
-	BUG_ON(gcwq->cpu != smp_processor_id());
-	rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq));
-	BUG_ON(rc < 0);
 
 	/*
 	 * We've claimed all manager positions.  Make all workers unbound
@@ -3352,12 +3343,9 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * ones which are still executing works from before the last CPU
 	 * down must be on the cpu.  After this, they may become diasporas.
 	 */
-	for_each_worker_pool(pool, gcwq) {
-		pool->flags |= POOL_MANAGING_WORKERS;
-
+	for_each_worker_pool(pool, gcwq)
 		list_for_each_entry(worker, &pool->idle_list, entry)
 			worker->flags |= WORKER_UNBOUND;
-	}
 
 	for_each_busy_worker(worker, i, pos, gcwq)
 		worker->flags |= WORKER_UNBOUND;
@@ -3497,9 +3485,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 			    work_color_to_flags(WORK_NO_COLOR));
 	}
 
-	/* relinquish manager role */
-	for_each_worker_pool(pool, gcwq)
-		pool->flags &= ~POOL_MANAGING_WORKERS;
+	gcwq_release_management(gcwq);
 
 	/* notify completion */
 	gcwq->trustee = NULL;
@@ -3894,6 +3880,7 @@ static int __init init_workqueues(void)
 			setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
 				    (unsigned long)pool);
 
+			mutex_init(&pool->manager_mutex);
 			ida_init(&pool->worker_ida);
 		}
 
-- 
cgit v1.2.3


From b73e4b170c7681b18ca85b483d43644fc49ed8aa Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Jul 2012 12:39:27 -0700
Subject: workqueue: drop @bind from create_worker()

Currently, create_worker()'s callers are responsible for deciding
whether the newly created worker should be bound to the associated CPU
and create_worker() sets WORKER_UNBOUND only for the workers for the
unbound global_cwq.  Creation during normal operation is always via
maybe_create_worker() and @bind is true.  For workers created during
hotplug, @bind is false.

Normal operation path is planned to be used even while the CPU is
going through hotplug operations or offline and this static decision
won't work.

Drop @bind from create_worker() and decide whether to bind by looking
at GCWQ_DISASSOCIATED.  create_worker() will also set WORKER_UNBOUND
automatically if disassociated.  To avoid flipping GCWQ_DISASSOCIATED
while create_worker() is in progress, the flag is now allowed to be
changed only while holding all manager_mutexes on the global_cwq.

This requires that GCWQ_DISASSOCIATED is not cleared behind trustee's
back.  CPU_ONLINE no longer clears DISASSOCIATED before flushing
trustee, which clears DISASSOCIATED before rebinding remaining workers
if asked to release.  For cases where trustee isn't around, CPU_ONLINE
clears DISASSOCIATED after flushing trustee.  Also, now, first_idle
has UNBOUND set on creation which is explicitly cleared by CPU_ONLINE
while binding it.  These convolutions will soon be removed by further
simplification of CPU hotplug path.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 kernel/workqueue.c | 64 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f7a00697d150..e1d05e51a80a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,7 +45,22 @@
 #include "workqueue_sched.h"
 
 enum {
-	/* global_cwq flags */
+	/*
+	 * global_cwq flags
+	 *
+	 * A bound gcwq is either associated or disassociated with its CPU.
+	 * While associated (!DISASSOCIATED), all workers are bound to the
+	 * CPU and none has %WORKER_UNBOUND set and concurrency management
+	 * is in effect.
+	 *
+	 * While DISASSOCIATED, the cpu may be offline and all workers have
+	 * %WORKER_UNBOUND set and concurrency management disabled, and may
+	 * be executing on any CPU.  The gcwq behaves as an unbound one.
+	 *
+	 * Note that DISASSOCIATED can be flipped only while holding
+	 * managership of all pools on the gcwq to avoid changing binding
+	 * state while create_worker() is in progress.
+	 */
 	GCWQ_DISASSOCIATED	= 1 << 0,	/* cpu can't serve workers */
 	GCWQ_FREEZING		= 1 << 1,	/* freeze in progress */
 
@@ -1334,7 +1349,6 @@ static struct worker *alloc_worker(void)
 /**
  * create_worker - create a new workqueue worker
  * @pool: pool the new worker will belong to
- * @bind: whether to set affinity to @cpu or not
  *
  * Create a new worker which is bound to @pool.  The returned worker
  * can be started by calling start_worker() or destroyed using
@@ -1346,10 +1360,9 @@ static struct worker *alloc_worker(void)
  * RETURNS:
  * Pointer to the newly created worker.
  */
-static struct worker *create_worker(struct worker_pool *pool, bool bind)
+static struct worker *create_worker(struct worker_pool *pool)
 {
 	struct global_cwq *gcwq = pool->gcwq;
-	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
 	const char *pri = worker_pool_pri(pool) ? "H" : "";
 	struct worker *worker = NULL;
 	int id = -1;
@@ -1370,7 +1383,7 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind)
 	worker->pool = pool;
 	worker->id = id;
 
-	if (!on_unbound_cpu)
+	if (gcwq->cpu != WORK_CPU_UNBOUND)
 		worker->task = kthread_create_on_node(worker_thread,
 					worker, cpu_to_node(gcwq->cpu),
 					"kworker/%u:%d%s", gcwq->cpu, id, pri);
@@ -1384,15 +1397,19 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind)
 		set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
 
 	/*
-	 * An unbound worker will become a regular one if CPU comes online
-	 * later on.  Make sure every worker has PF_THREAD_BOUND set.
+	 * Determine CPU binding of the new worker depending on
+	 * %GCWQ_DISASSOCIATED.  The caller is responsible for ensuring the
+	 * flag remains stable across this function.  See the comments
+	 * above the flag definition for details.
+	 *
+	 * As an unbound worker may later become a regular one if CPU comes
+	 * online, make sure every worker has %PF_THREAD_BOUND set.
 	 */
-	if (bind && !on_unbound_cpu)
+	if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
 		kthread_bind(worker->task, gcwq->cpu);
-	else {
+	} else {
 		worker->task->flags |= PF_THREAD_BOUND;
-		if (on_unbound_cpu)
-			worker->flags |= WORKER_UNBOUND;
+		worker->flags |= WORKER_UNBOUND;
 	}
 
 	return worker;
@@ -1568,7 +1585,7 @@ restart:
 	while (true) {
 		struct worker *worker;
 
-		worker = create_worker(pool, true);
+		worker = create_worker(pool);
 		if (worker) {
 			del_timer_sync(&pool->mayday_timer);
 			spin_lock_irq(&gcwq->lock);
@@ -3420,12 +3437,10 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
 			if (need_to_create_worker(pool)) {
 				spin_unlock_irq(&gcwq->lock);
-				worker = create_worker(pool, false);
+				worker = create_worker(pool);
 				spin_lock_irq(&gcwq->lock);
-				if (worker) {
-					worker->flags |= WORKER_UNBOUND;
+				if (worker)
 					start_worker(worker);
-				}
 			}
 		}
 
@@ -3463,6 +3478,10 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	for_each_worker_pool(pool, gcwq)
 		WARN_ON(!list_empty(&pool->idle_list));
 
+	/* if we're reassociating, clear DISASSOCIATED */
+	if (gcwq->trustee_state == TRUSTEE_RELEASE)
+		gcwq->flags &= ~GCWQ_DISASSOCIATED;
+
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		struct work_struct *rebind_work = &worker->rebind_work;
 
@@ -3546,7 +3565,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		i = 0;
 		for_each_worker_pool(pool, gcwq) {
 			BUG_ON(pool->first_idle);
-			new_workers[i] = create_worker(pool, false);
+			new_workers[i] = create_worker(pool);
 			if (!new_workers[i++])
 				goto err_destroy;
 		}
@@ -3584,13 +3603,19 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		gcwq->flags &= ~GCWQ_DISASSOCIATED;
 		if (gcwq->trustee_state != TRUSTEE_DONE) {
 			gcwq->trustee_state = TRUSTEE_RELEASE;
 			wake_up_process(gcwq->trustee);
 			wait_trustee_state(gcwq, TRUSTEE_DONE);
 		}
 
+		/*
+		 * Either DISASSOCIATED is already cleared or no worker is
+		 * left on the gcwq.  Safe to clear DISASSOCIATED without
+		 * claiming managers.
+		 */
+		gcwq->flags &= ~GCWQ_DISASSOCIATED;
+
 		/*
 		 * Trustee is done and there might be no worker left.
 		 * Put the first_idle in and request a real manager to
@@ -3601,6 +3626,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 			kthread_bind(pool->first_idle->task, cpu);
 			spin_lock_irq(&gcwq->lock);
 			pool->flags |= POOL_MANAGE_WORKERS;
+			pool->first_idle->flags &= ~WORKER_UNBOUND;
 			start_worker(pool->first_idle);
 			pool->first_idle = NULL;
 		}
@@ -3899,7 +3925,7 @@ static int __init init_workqueues(void)
 		for_each_worker_pool(pool, gcwq) {
 			struct worker *worker;
 
-			worker = create_worker(pool, true);
+			worker = create_worker(pool);
 			BUG_ON(!worker);
 			spin_lock_irq(&gcwq->lock);
 			start_worker(worker);
-- 
cgit v1.2.3


From 10fdc72e960a87151c1ee0bb01377345c05827f8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Jul 2012 12:39:27 -0700
Subject: workqueue: reimplement CPU online rebinding to handle idle workers

Currently, if there are workers left when a CPU is being brought back
online, the trustee kills all idle workers and schedules rebind_work
so that they re-bind to the CPU after the currently executing work is
finished.  This works for busy workers because concurrency management
doesn't try to wake them up from scheduler callbacks, which require
the target task to be on the local run queue.  The busy worker bumps
concurrency counter appropriately as it clears WORKER_UNBOUND from the
rebind work item and it's bound to the CPU before returning to the
idle state.

To reduce CPU on/offlining overhead (as many embedded systems use it
for powersaving) and simplify the code path, workqueue is planned to
be modified to retain idle workers across CPU on/offlining.  This
patch reimplements CPU online rebinding such that it can also handle
idle workers.

As noted earlier, due to the local wakeup requirement, rebinding idle
workers is tricky.  All idle workers must be re-bound before scheduler
callbacks are enabled.  This is achieved by interlocking idle
re-binding.  Idle workers are requested to re-bind and then hold until
all idle re-binding is complete so that no bound worker starts
executing work item.  Only after all idle workers are re-bound and
parked, CPU_ONLINE proceeds to release them and queue rebind work item
to busy workers thus guaranteeing scheduler callbacks aren't invoked
until all idle workers are ready.

worker_rebind_fn() is renamed to busy_worker_rebind_fn() and
idle_worker_rebind() for idle workers is added.  Rebinding logic is
moved to rebind_workers() and now called from CPU_ONLINE after
flushing trustee.  While at it, add CPU sanity check in
worker_thread().

Note that now a worker may become idle or the manager between trustee
release and rebinding during CPU_ONLINE.  As the previous patch
updated create_worker() so that it can be used by regular manager
while unbound and this patch implements idle re-binding, this is safe.

This prepares for removal of trustee and keeping idle workers across
CPU hotplugs.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 kernel/workqueue.c | 215 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 166 insertions(+), 49 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e1d05e51a80a..6927fecae412 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -133,6 +133,7 @@ enum {
 
 struct global_cwq;
 struct worker_pool;
+struct idle_rebind;
 
 /*
  * The poor guys doing the actual heavy lifting.  All on-duty workers
@@ -154,7 +155,10 @@ struct worker {
 	unsigned long		last_active;	/* L: last active timestamp */
 	unsigned int		flags;		/* X: flags */
 	int			id;		/* I: worker id */
-	struct work_struct	rebind_work;	/* L: rebind worker to cpu */
+
+	/* for rebinding worker to CPU */
+	struct idle_rebind	*idle_rebind;	/* L: for idle worker */
+	struct work_struct	rebind_work;	/* L: for busy worker */
 };
 
 struct worker_pool {
@@ -190,6 +194,8 @@ struct global_cwq {
 
 	struct worker_pool	pools[2];	/* normal and highpri pools */
 
+	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
+
 	struct task_struct	*trustee;	/* L: for gcwq shutdown */
 	unsigned int		trustee_state;	/* L: trustee state */
 	wait_queue_head_t	trustee_wait;	/* trustee wait */
@@ -1314,13 +1320,37 @@ __acquires(&gcwq->lock)
 	}
 }
 
+struct idle_rebind {
+	int			cnt;		/* # workers to be rebound */
+	struct completion	done;		/* all workers rebound */
+};
+
+/*
+ * Rebind an idle @worker to its CPU.  During CPU onlining, this has to
+ * happen synchronously for idle workers.  worker_thread() will test
+ * %WORKER_REBIND before leaving idle and call this function.
+ */
+static void idle_worker_rebind(struct worker *worker)
+{
+	struct global_cwq *gcwq = worker->pool->gcwq;
+
+	/* CPU must be online at this point */
+	WARN_ON(!worker_maybe_bind_and_lock(worker));
+	if (!--worker->idle_rebind->cnt)
+		complete(&worker->idle_rebind->done);
+	spin_unlock_irq(&worker->pool->gcwq->lock);
+
+	/* we did our part, wait for rebind_workers() to finish up */
+	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+}
+
 /*
- * Function for worker->rebind_work used to rebind unbound busy workers to
+ * Function for @worker->rebind.work used to rebind unbound busy workers to
  * the associated cpu which is coming back online.  This is scheduled by
  * cpu up but can race with other cpu hotplug operations and may be
  * executed twice without intervening cpu down.
  */
-static void worker_rebind_fn(struct work_struct *work)
+static void busy_worker_rebind_fn(struct work_struct *work)
 {
 	struct worker *worker = container_of(work, struct worker, rebind_work);
 	struct global_cwq *gcwq = worker->pool->gcwq;
@@ -1331,6 +1361,112 @@ static void worker_rebind_fn(struct work_struct *work)
 	spin_unlock_irq(&gcwq->lock);
 }
 
+/**
+ * rebind_workers - rebind all workers of a gcwq to the associated CPU
+ * @gcwq: gcwq of interest
+ *
+ * @gcwq->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
+ * is different for idle and busy ones.
+ *
+ * The idle ones should be rebound synchronously and idle rebinding should
+ * be complete before any worker starts executing work items with
+ * concurrency management enabled; otherwise, scheduler may oops trying to
+ * wake up non-local idle worker from wq_worker_sleeping().
+ *
+ * This is achieved by repeatedly requesting rebinding until all idle
+ * workers are known to have been rebound under @gcwq->lock and holding all
+ * idle workers from becoming busy until idle rebinding is complete.
+ *
+ * Once idle workers are rebound, busy workers can be rebound as they
+ * finish executing their current work items.  Queueing the rebind work at
+ * the head of their scheduled lists is enough.  Note that nr_running will
+ * be properbly bumped as busy workers rebind.
+ *
+ * On return, all workers are guaranteed to either be bound or have rebind
+ * work item scheduled.
+ */
+static void rebind_workers(struct global_cwq *gcwq)
+	__releases(&gcwq->lock) __acquires(&gcwq->lock)
+{
+	struct idle_rebind idle_rebind;
+	struct worker_pool *pool;
+	struct worker *worker;
+	struct hlist_node *pos;
+	int i;
+
+	lockdep_assert_held(&gcwq->lock);
+
+	for_each_worker_pool(pool, gcwq)
+		lockdep_assert_held(&pool->manager_mutex);
+
+	/*
+	 * Rebind idle workers.  Interlocked both ways.  We wait for
+	 * workers to rebind via @idle_rebind.done.  Workers will wait for
+	 * us to finish up by watching %WORKER_REBIND.
+	 */
+	init_completion(&idle_rebind.done);
+retry:
+	idle_rebind.cnt = 1;
+	INIT_COMPLETION(idle_rebind.done);
+
+	/* set REBIND and kick idle ones, we'll wait for these later */
+	for_each_worker_pool(pool, gcwq) {
+		list_for_each_entry(worker, &pool->idle_list, entry) {
+			if (worker->flags & WORKER_REBIND)
+				continue;
+
+			/* morph UNBOUND to REBIND */
+			worker->flags &= ~WORKER_UNBOUND;
+			worker->flags |= WORKER_REBIND;
+
+			idle_rebind.cnt++;
+			worker->idle_rebind = &idle_rebind;
+
+			/* worker_thread() will call idle_worker_rebind() */
+			wake_up_process(worker->task);
+		}
+	}
+
+	if (--idle_rebind.cnt) {
+		spin_unlock_irq(&gcwq->lock);
+		wait_for_completion(&idle_rebind.done);
+		spin_lock_irq(&gcwq->lock);
+		/* busy ones might have become idle while waiting, retry */
+		goto retry;
+	}
+
+	/*
+	 * All idle workers are rebound and waiting for %WORKER_REBIND to
+	 * be cleared inside idle_worker_rebind().  Clear and release.
+	 * Clearing %WORKER_REBIND from this foreign context is safe
+	 * because these workers are still guaranteed to be idle.
+	 */
+	for_each_worker_pool(pool, gcwq)
+		list_for_each_entry(worker, &pool->idle_list, entry)
+			worker->flags &= ~WORKER_REBIND;
+
+	wake_up_all(&gcwq->rebind_hold);
+
+	/* rebind busy workers */
+	for_each_busy_worker(worker, i, pos, gcwq) {
+		struct work_struct *rebind_work = &worker->rebind_work;
+
+		/* morph UNBOUND to REBIND */
+		worker->flags &= ~WORKER_UNBOUND;
+		worker->flags |= WORKER_REBIND;
+
+		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+				     work_data_bits(rebind_work)))
+			continue;
+
+		/* wq doesn't matter, use the default one */
+		debug_work_activate(rebind_work);
+		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
+			    worker->scheduled.next,
+			    work_color_to_flags(WORK_NO_COLOR));
+	}
+}
+
 static struct worker *alloc_worker(void)
 {
 	struct worker *worker;
@@ -1339,7 +1475,7 @@ static struct worker *alloc_worker(void)
 	if (worker) {
 		INIT_LIST_HEAD(&worker->entry);
 		INIT_LIST_HEAD(&worker->scheduled);
-		INIT_WORK(&worker->rebind_work, worker_rebind_fn);
+		INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
 		/* on creation a worker is in !idle && prep state */
 		worker->flags = WORKER_PREP;
 	}
@@ -1829,6 +1965,9 @@ __acquires(&gcwq->lock)
 
 	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
 #endif
+	WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
+		     raw_smp_processor_id() != gcwq->cpu);
+
 	/*
 	 * A single work shouldn't be executed concurrently by
 	 * multiple workers on a single cpu.  Check whether anyone is
@@ -1946,11 +2085,20 @@ static int worker_thread(void *__worker)
 woke_up:
 	spin_lock_irq(&gcwq->lock);
 
-	/* DIE can be set only while we're idle, checking here is enough */
-	if (worker->flags & WORKER_DIE) {
+	/*
+	 * DIE can be set only while idle and REBIND set while busy has
+	 * @worker->rebind_work scheduled.  Checking here is enough.
+	 */
+	if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
 		spin_unlock_irq(&gcwq->lock);
-		worker->task->flags &= ~PF_WQ_WORKER;
-		return 0;
+
+		if (worker->flags & WORKER_DIE) {
+			worker->task->flags &= ~PF_WQ_WORKER;
+			return 0;
+		}
+
+		idle_worker_rebind(worker);
+		goto woke_up;
 	}
 
 	worker_leave_idle(worker);
@@ -3468,42 +3616,6 @@ static int __cpuinit trustee_thread(void *__gcwq)
 		}
 	} while (i && rc >= 0);
 
-	/*
-	 * At this point, either draining has completed and no worker
-	 * is left, or cpu down has been canceled or the cpu is being
-	 * brought back up.  There shouldn't be any idle one left.
-	 * Tell the remaining busy ones to rebind once it finishes the
-	 * currently scheduled works by scheduling the rebind_work.
-	 */
-	for_each_worker_pool(pool, gcwq)
-		WARN_ON(!list_empty(&pool->idle_list));
-
-	/* if we're reassociating, clear DISASSOCIATED */
-	if (gcwq->trustee_state == TRUSTEE_RELEASE)
-		gcwq->flags &= ~GCWQ_DISASSOCIATED;
-
-	for_each_busy_worker(worker, i, pos, gcwq) {
-		struct work_struct *rebind_work = &worker->rebind_work;
-
-		/*
-		 * Rebind_work may race with future cpu hotplug
-		 * operations.  Use a separate flag to mark that
-		 * rebinding is scheduled.
-		 */
-		worker->flags |= WORKER_REBIND;
-		worker->flags &= ~WORKER_UNBOUND;
-
-		/* queue rebind_work, wq doesn't matter, use the default one */
-		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
-				     work_data_bits(rebind_work)))
-			continue;
-
-		debug_work_activate(rebind_work);
-		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
-			    worker->scheduled.next,
-			    work_color_to_flags(WORK_NO_COLOR));
-	}
-
 	gcwq_release_management(gcwq);
 
 	/* notify completion */
@@ -3609,13 +3721,16 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 			wait_trustee_state(gcwq, TRUSTEE_DONE);
 		}
 
-		/*
-		 * Either DISASSOCIATED is already cleared or no worker is
-		 * left on the gcwq.  Safe to clear DISASSOCIATED without
-		 * claiming managers.
-		 */
+		spin_unlock_irq(&gcwq->lock);
+		gcwq_claim_management(gcwq);
+		spin_lock_irq(&gcwq->lock);
+
 		gcwq->flags &= ~GCWQ_DISASSOCIATED;
 
+		rebind_workers(gcwq);
+
+		gcwq_release_management(gcwq);
+
 		/*
 		 * Trustee is done and there might be no worker left.
 		 * Put the first_idle in and request a real manager to
@@ -3910,6 +4025,8 @@ static int __init init_workqueues(void)
 			ida_init(&pool->worker_ida);
 		}
 
+		init_waitqueue_head(&gcwq->rebind_hold);
+
 		gcwq->trustee_state = TRUSTEE_DONE;
 		init_waitqueue_head(&gcwq->trustee_wait);
 	}
-- 
cgit v1.2.3


From 8388831f42902992072858526e2e007c2defe0a6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Jul 2012 12:39:27 -0700
Subject: workqueue: don't butcher idle workers on an offline CPU

Currently, during CPU offlining, after all pending work items are
drained, the trustee butchers all workers.  Also, on CPU onlining
failure, workqueue_cpu_callback() ensures that the first idle worker
is destroyed.  Combined, these guarantee that an offline CPU doesn't
have any worker for it once all the lingering work items are finished.

This guarantee isn't really necessary and makes CPU on/offlining more
expensive than it needs to be, especially for platforms which use CPU
hotplug for powersaving.

This patch removes idle worker butchering from the trustee and lets a
CPU which failed onlining keep the created first worker.  The first
worker is created during CPU_UP_PREPARE if the CPU doesn't have any
and is started right away.  If onlining succeeds, the
rebind_workers() call in CPU_ONLINE will rebind it like any other
worker.  If onlining fails, the worker is left alone till the next
try.

This makes CPU hotplugs cheaper by allowing global_cwqs to keep
workers across them and simplifies code.

Note that trustee doesn't re-arm idle timer when it's done and thus
the disassociated global_cwq will keep all workers until it comes back
online.  This will be improved by further patches.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 kernel/workqueue.c | 94 ++++++++----------------------------------------------
 1 file changed, 14 insertions(+), 80 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6927fecae412..acfabb22e2c4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -175,7 +175,6 @@ struct worker_pool {
 
 	struct mutex		manager_mutex;	/* mutex manager should hold */
 	struct ida		worker_ida;	/* L: for worker IDs */
-	struct worker		*first_idle;	/* L: first idle worker */
 };
 
 /*
@@ -3477,16 +3476,6 @@ static void gcwq_release_management(struct global_cwq *gcwq)
 	__ret1 < 0 ? -1 : 0;						\
 })
 
-static bool gcwq_has_idle_workers(struct global_cwq *gcwq)
-{
-	struct worker_pool *pool;
-
-	for_each_worker_pool(pool, gcwq)
-		if (!list_empty(&pool->idle_list))
-			return true;
-	return false;
-}
-
 static int __cpuinit trustee_thread(void *__gcwq)
 {
 	struct global_cwq *gcwq = __gcwq;
@@ -3494,7 +3483,6 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	struct worker *worker;
 	struct work_struct *work;
 	struct hlist_node *pos;
-	long rc;
 	int i;
 
 	BUG_ON(gcwq->cpu != smp_processor_id());
@@ -3597,25 +3585,6 @@ static int __cpuinit trustee_thread(void *__gcwq)
 			break;
 	}
 
-	/*
-	 * Either all works have been scheduled and cpu is down, or
-	 * cpu down has already been canceled.  Wait for and butcher
-	 * all workers till we're canceled.
-	 */
-	do {
-		rc = trustee_wait_event(gcwq_has_idle_workers(gcwq));
-
-		i = 0;
-		for_each_worker_pool(pool, gcwq) {
-			while (!list_empty(&pool->idle_list)) {
-				worker = list_first_entry(&pool->idle_list,
-							  struct worker, entry);
-				destroy_worker(worker);
-			}
-			i |= pool->nr_workers;
-		}
-	} while (i && rc >= 0);
-
 	gcwq_release_management(gcwq);
 
 	/* notify completion */
@@ -3658,10 +3627,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
 	struct task_struct *new_trustee = NULL;
-	struct worker *new_workers[NR_WORKER_POOLS] = { };
 	struct worker_pool *pool;
 	unsigned long flags;
-	int i;
 
 	action &= ~CPU_TASKS_FROZEN;
 
@@ -3672,14 +3639,22 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		if (IS_ERR(new_trustee))
 			return notifier_from_errno(PTR_ERR(new_trustee));
 		kthread_bind(new_trustee, cpu);
-		/* fall through */
+		break;
+
 	case CPU_UP_PREPARE:
-		i = 0;
 		for_each_worker_pool(pool, gcwq) {
-			BUG_ON(pool->first_idle);
-			new_workers[i] = create_worker(pool);
-			if (!new_workers[i++])
-				goto err_destroy;
+			struct worker *worker;
+
+			if (pool->nr_workers)
+				continue;
+
+			worker = create_worker(pool);
+			if (!worker)
+				return NOTIFY_BAD;
+
+			spin_lock_irq(&gcwq->lock);
+			start_worker(worker);
+			spin_unlock_irq(&gcwq->lock);
 		}
 	}
 
@@ -3694,23 +3669,10 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		gcwq->trustee_state = TRUSTEE_START;
 		wake_up_process(gcwq->trustee);
 		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
-		/* fall through */
-	case CPU_UP_PREPARE:
-		i = 0;
-		for_each_worker_pool(pool, gcwq) {
-			BUG_ON(pool->first_idle);
-			pool->first_idle = new_workers[i++];
-		}
 		break;
 
 	case CPU_POST_DEAD:
 		gcwq->trustee_state = TRUSTEE_BUTCHER;
-		/* fall through */
-	case CPU_UP_CANCELED:
-		for_each_worker_pool(pool, gcwq) {
-			destroy_worker(pool->first_idle);
-			pool->first_idle = NULL;
-		}
 		break;
 
 	case CPU_DOWN_FAILED:
@@ -3730,39 +3692,12 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		rebind_workers(gcwq);
 
 		gcwq_release_management(gcwq);
-
-		/*
-		 * Trustee is done and there might be no worker left.
-		 * Put the first_idle in and request a real manager to
-		 * take a look.
-		 */
-		for_each_worker_pool(pool, gcwq) {
-			spin_unlock_irq(&gcwq->lock);
-			kthread_bind(pool->first_idle->task, cpu);
-			spin_lock_irq(&gcwq->lock);
-			pool->flags |= POOL_MANAGE_WORKERS;
-			pool->first_idle->flags &= ~WORKER_UNBOUND;
-			start_worker(pool->first_idle);
-			pool->first_idle = NULL;
-		}
 		break;
 	}
 
 	spin_unlock_irqrestore(&gcwq->lock, flags);
 
 	return notifier_from_errno(0);
-
-err_destroy:
-	if (new_trustee)
-		kthread_stop(new_trustee);
-
-	spin_lock_irqsave(&gcwq->lock, flags);
-	for (i = 0; i < NR_WORKER_POOLS; i++)
-		if (new_workers[i])
-			destroy_worker(new_workers[i]);
-	spin_unlock_irqrestore(&gcwq->lock, flags);
-
-	return NOTIFY_BAD;
 }
 
 /*
@@ -3775,7 +3710,6 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
-	case CPU_UP_CANCELED:
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
 		return workqueue_cpu_callback(nfb, action, hcpu);
-- 
cgit v1.2.3


From ce90fd860eef1317cc4b1efec2487703c5b7701b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Jul 2012 12:39:27 -0700
Subject: workqueue: remove CPU offline trustee

With the previous changes, a disassociated global_cwq now can run as
an unbound one on its own - it can create workers as necessary to
drain remaining works after the CPU has been brought down and manage
the number of workers using the usual idle timer mechanism making
trustee completely redundant except for the actual unbinding
operation.

This patch removes the trustee and lets a disassociated global_cwq
manage itself.  Unbinding is moved to a work item (for CPU affinity)
which is scheduled and flushed from CPU_DOWN_PREPARE.

This patch moves nr_running clearing outside the gcwq and manager
locks to simplify the code.  As nr_running is unused at that point,
this is safe.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 kernel/workqueue.c | 288 +++++++----------------------------------------------
 1 file changed, 36 insertions(+), 252 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index acfabb22e2c4..d1545daa74ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,13 +79,6 @@ enum {
 	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
 				  WORKER_CPU_INTENSIVE,
 
-	/* gcwq->trustee_state */
-	TRUSTEE_START		= 0,		/* start */
-	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
-	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
-	TRUSTEE_RELEASE		= 3,		/* release workers */
-	TRUSTEE_DONE		= 4,		/* trustee is done */
-
 	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */
 
 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
@@ -100,7 +93,6 @@ enum {
 						   (min two ticks) */
 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
 	CREATE_COOLDOWN		= HZ,		/* time to breath after fail */
-	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
 
 	/*
 	 * Rescue workers are used only on emergencies and shared by
@@ -194,10 +186,6 @@ struct global_cwq {
 	struct worker_pool	pools[2];	/* normal and highpri pools */
 
 	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
-
-	struct task_struct	*trustee;	/* L: for gcwq shutdown */
-	unsigned int		trustee_state;	/* L: trustee state */
-	wait_queue_head_t	trustee_wait;	/* trustee wait */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -753,11 +741,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
 	 *
-	 * NOT_RUNNING is clear.  This means that trustee is not in
-	 * charge and we're running on the local cpu w/ rq lock held
-	 * and preemption disabled, which in turn means that none else
-	 * could be manipulating idle_list, so dereferencing idle_list
-	 * without gcwq lock is safe.
+	 * NOT_RUNNING is clear.  This means that we're bound to and
+	 * running on the local cpu w/ rq lock held and preemption
+	 * disabled, which in turn means that none else could be
+	 * manipulating idle_list, so dereferencing idle_list without gcwq
+	 * lock is safe.
 	 */
 	if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
 		to_wakeup = first_worker(pool);
@@ -1217,19 +1205,16 @@ static void worker_enter_idle(struct worker *worker)
 	/* idle_list is LIFO */
 	list_add(&worker->entry, &pool->idle_list);
 
-	if (likely(gcwq->trustee_state != TRUSTEE_DONE)) {
-		if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
-			mod_timer(&pool->idle_timer,
-				  jiffies + IDLE_WORKER_TIMEOUT);
-	} else
-		wake_up_all(&gcwq->trustee_wait);
+	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
 	/*
-	 * Sanity check nr_running.  Because trustee releases gcwq->lock
-	 * between setting %WORKER_UNBOUND and zapping nr_running, the
-	 * warning may trigger spuriously.  Check iff trustee is idle.
+	 * Sanity check nr_running.  Because gcwq_unbind_fn() releases
+	 * gcwq->lock between setting %WORKER_UNBOUND and zapping
+	 * nr_running, the warning may trigger spuriously.  Check iff
+	 * unbind is not in progress.
 	 */
-	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+	WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
 		     pool->nr_workers == pool->nr_idle &&
 		     atomic_read(get_pool_nr_running(pool)));
 }
@@ -3367,46 +3352,9 @@ EXPORT_SYMBOL_GPL(work_busy);
  * gcwqs serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
- * This is solved by allowing a gcwq to be detached from CPU, running it
- * with unbound workers and allowing it to be reattached later if the cpu
- * comes back online.  A separate thread is created to govern a gcwq in
- * such state and is called the trustee of the gcwq.
- *
- * Trustee states and their descriptions.
- *
- * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
- *		new trustee is started with this state.
- *
- * IN_CHARGE	Once started, trustee will enter this state after
- *		assuming the manager role and making all existing
- *		workers rogue.  DOWN_PREPARE waits for trustee to
- *		enter this state.  After reaching IN_CHARGE, trustee
- *		tries to execute the pending worklist until it's empty
- *		and the state is set to BUTCHER, or the state is set
- *		to RELEASE.
- *
- * BUTCHER	Command state which is set by the cpu callback after
- *		the cpu has went down.  Once this state is set trustee
- *		knows that there will be no new works on the worklist
- *		and once the worklist is empty it can proceed to
- *		killing idle workers.
- *
- * RELEASE	Command state which is set by the cpu callback if the
- *		cpu down has been canceled or it has come online
- *		again.  After recognizing this state, trustee stops
- *		trying to drain or butcher and clears ROGUE, rebinds
- *		all remaining workers back to the cpu and releases
- *		manager role.
- *
- * DONE		Trustee will enter this state after BUTCHER or RELEASE
- *		is complete.
- *
- *          trustee                 CPU                draining
- *         took over                down               complete
- * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
- *                        |                     |                  ^
- *                        | CPU is back online  v   return workers |
- *                         ----------------> RELEASE --------------
+ * This is solved by allowing a gcwq to be disassociated from the CPU
+ * running as an unbound one and allowing it to be reattached later if the
+ * cpu comes back online.
  */
 
 /* claim manager positions of all pools */
@@ -3427,61 +3375,11 @@ static void gcwq_release_management(struct global_cwq *gcwq)
 		mutex_unlock(&pool->manager_mutex);
 }
 
-/**
- * trustee_wait_event_timeout - timed event wait for trustee
- * @cond: condition to wait for
- * @timeout: timeout in jiffies
- *
- * wait_event_timeout() for trustee to use.  Handles locking and
- * checks for RELEASE request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * Positive indicating left time if @cond is satisfied, 0 if timed
- * out, -1 if canceled.
- */
-#define trustee_wait_event_timeout(cond, timeout) ({			\
-	long __ret = (timeout);						\
-	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
-	       __ret) {							\
-		spin_unlock_irq(&gcwq->lock);				\
-		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
-			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
-			__ret);						\
-		spin_lock_irq(&gcwq->lock);				\
-	}								\
-	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
-})
-
-/**
- * trustee_wait_event - event wait for trustee
- * @cond: condition to wait for
- *
- * wait_event() for trustee to use.  Automatically handles locking and
- * checks for CANCEL request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * 0 if @cond is satisfied, -1 if canceled.
- */
-#define trustee_wait_event(cond) ({					\
-	long __ret1;							\
-	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
-	__ret1 < 0 ? -1 : 0;						\
-})
-
-static int __cpuinit trustee_thread(void *__gcwq)
+static void gcwq_unbind_fn(struct work_struct *work)
 {
-	struct global_cwq *gcwq = __gcwq;
+	struct global_cwq *gcwq = get_gcwq(smp_processor_id());
 	struct worker_pool *pool;
 	struct worker *worker;
-	struct work_struct *work;
 	struct hlist_node *pos;
 	int i;
 
@@ -3505,119 +3403,29 @@ static int __cpuinit trustee_thread(void *__gcwq)
 
 	gcwq->flags |= GCWQ_DISASSOCIATED;
 
+	spin_unlock_irq(&gcwq->lock);
+	gcwq_release_management(gcwq);
+
 	/*
 	 * Call schedule() so that we cross rq->lock and thus can guarantee
-	 * sched callbacks see the unbound flag.  This is necessary as
-	 * scheduler callbacks may be invoked from other cpus.
+	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
+	 * as scheduler callbacks may be invoked from other cpus.
 	 */
-	spin_unlock_irq(&gcwq->lock);
 	schedule();
-	spin_lock_irq(&gcwq->lock);
 
 	/*
-	 * Sched callbacks are disabled now.  Zap nr_running.  After
-	 * this, nr_running stays zero and need_more_worker() and
-	 * keep_working() are always true as long as the worklist is
-	 * not empty.
+	 * Sched callbacks are disabled now.  Zap nr_running.  After this,
+	 * nr_running stays zero and need_more_worker() and keep_working()
+	 * are always true as long as the worklist is not empty.  @gcwq now
+	 * behaves as unbound (in terms of concurrency management) gcwq
+	 * which is served by workers tied to the CPU.
+	 *
+	 * On return from this function, the current worker would trigger
+	 * unbound chain execution of pending work items if other workers
+	 * didn't already.
 	 */
 	for_each_worker_pool(pool, gcwq)
 		atomic_set(get_pool_nr_running(pool), 0);
-
-	spin_unlock_irq(&gcwq->lock);
-	for_each_worker_pool(pool, gcwq)
-		del_timer_sync(&pool->idle_timer);
-	spin_lock_irq(&gcwq->lock);
-
-	/*
-	 * We're now in charge.  Notify and proceed to drain.  We need
-	 * to keep the gcwq running during the whole CPU down
-	 * procedure as other cpu hotunplug callbacks may need to
-	 * flush currently running tasks.
-	 */
-	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
-	wake_up_all(&gcwq->trustee_wait);
-
-	/*
-	 * The original cpu is in the process of dying and may go away
-	 * anytime now.  When that happens, we and all workers would
-	 * be migrated to other cpus.  Try draining any left work.  We
-	 * want to get it over with ASAP - spam rescuers, wake up as
-	 * many idlers as necessary and create new ones till the
-	 * worklist is empty.  Note that if the gcwq is frozen, there
-	 * may be frozen works in freezable cwqs.  Don't declare
-	 * completion while frozen.
-	 */
-	while (true) {
-		bool busy = false;
-
-		for_each_worker_pool(pool, gcwq)
-			busy |= pool->nr_workers != pool->nr_idle;
-
-		if (!busy && !(gcwq->flags & GCWQ_FREEZING) &&
-		    gcwq->trustee_state != TRUSTEE_IN_CHARGE)
-			break;
-
-		for_each_worker_pool(pool, gcwq) {
-			int nr_works = 0;
-
-			list_for_each_entry(work, &pool->worklist, entry) {
-				send_mayday(work);
-				nr_works++;
-			}
-
-			list_for_each_entry(worker, &pool->idle_list, entry) {
-				if (!nr_works--)
-					break;
-				wake_up_process(worker->task);
-			}
-
-			if (need_to_create_worker(pool)) {
-				spin_unlock_irq(&gcwq->lock);
-				worker = create_worker(pool);
-				spin_lock_irq(&gcwq->lock);
-				if (worker)
-					start_worker(worker);
-			}
-		}
-
-		/* give a breather */
-		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
-			break;
-	}
-
-	gcwq_release_management(gcwq);
-
-	/* notify completion */
-	gcwq->trustee = NULL;
-	gcwq->trustee_state = TRUSTEE_DONE;
-	wake_up_all(&gcwq->trustee_wait);
-	spin_unlock_irq(&gcwq->lock);
-	return 0;
-}
-
-/**
- * wait_trustee_state - wait for trustee to enter the specified state
- * @gcwq: gcwq the trustee of interest belongs to
- * @state: target state to wait for
- *
- * Wait for the trustee to reach @state.  DONE is already matched.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by cpu_callback.
- */
-static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
-{
-	if (!(gcwq->trustee_state == state ||
-	      gcwq->trustee_state == TRUSTEE_DONE)) {
-		spin_unlock_irq(&gcwq->lock);
-		__wait_event(gcwq->trustee_wait,
-			     gcwq->trustee_state == state ||
-			     gcwq->trustee_state == TRUSTEE_DONE);
-		spin_lock_irq(&gcwq->lock);
-	}
 }
 
 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
@@ -3626,19 +3434,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
-	struct task_struct *new_trustee = NULL;
 	struct worker_pool *pool;
+	struct work_struct unbind_work;
 	unsigned long flags;
 
 	action &= ~CPU_TASKS_FROZEN;
 
 	switch (action) {
 	case CPU_DOWN_PREPARE:
-		new_trustee = kthread_create(trustee_thread, gcwq,
-					     "workqueue_trustee/%d\n", cpu);
-		if (IS_ERR(new_trustee))
-			return notifier_from_errno(PTR_ERR(new_trustee));
-		kthread_bind(new_trustee, cpu);
+		/* unbinding should happen on the local CPU */
+		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+		schedule_work_on(cpu, &unbind_work);
+		flush_work(&unbind_work);
 		break;
 
 	case CPU_UP_PREPARE:
@@ -3662,27 +3469,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	spin_lock_irqsave(&gcwq->lock, flags);
 
 	switch (action) {
-	case CPU_DOWN_PREPARE:
-		/* initialize trustee and tell it to acquire the gcwq */
-		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
-		gcwq->trustee = new_trustee;
-		gcwq->trustee_state = TRUSTEE_START;
-		wake_up_process(gcwq->trustee);
-		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
-		break;
-
-	case CPU_POST_DEAD:
-		gcwq->trustee_state = TRUSTEE_BUTCHER;
-		break;
-
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		if (gcwq->trustee_state != TRUSTEE_DONE) {
-			gcwq->trustee_state = TRUSTEE_RELEASE;
-			wake_up_process(gcwq->trustee);
-			wait_trustee_state(gcwq, TRUSTEE_DONE);
-		}
-
 		spin_unlock_irq(&gcwq->lock);
 		gcwq_claim_management(gcwq);
 		spin_lock_irq(&gcwq->lock);
@@ -3727,7 +3515,6 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
-	case CPU_POST_DEAD:
 		return workqueue_cpu_callback(nfb, action, hcpu);
 	}
 	return NOTIFY_OK;
@@ -3960,9 +3747,6 @@ static int __init init_workqueues(void)
 		}
 
 		init_waitqueue_head(&gcwq->rebind_hold);
-
-		gcwq->trustee_state = TRUSTEE_DONE;
-		init_waitqueue_head(&gcwq->trustee_wait);
 	}
 
 	/* create the initial worker */
-- 
cgit v1.2.3


From 341191789ec00a6509a77a7b428c93082e26f815 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Jul 2012 12:39:28 -0700
Subject: workqueue: simplify CPU hotplug code

With trustee gone, CPU hotplug code can be simplified.

* gcwq_claim_management() and gcwq_release_management() now also grab
  and release gcwq->lock, respectively, and gained the _and_lock and
  _and_unlock suffixes.

* All CPU hotplug logic was implemented in workqueue_cpu_callback()
  which was called by workqueue_cpu_up/down_callback() for the correct
  priority.  This was because up and down paths shared a lot of logic,
  which is no longer true.  Remove workqueue_cpu_callback() and move
  all hotplug logic into the two actual callbacks.

This patch doesn't make any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 kernel/workqueue.c | 79 +++++++++++++++++-------------------------------------
 1 file changed, 25 insertions(+), 54 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d1545daa74ad..471996a81633 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3358,19 +3358,21 @@ EXPORT_SYMBOL_GPL(work_busy);
  */
 
 /* claim manager positions of all pools */
-static void gcwq_claim_management(struct global_cwq *gcwq)
+static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
 {
 	struct worker_pool *pool;
 
 	for_each_worker_pool(pool, gcwq)
 		mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
+	spin_lock_irq(&gcwq->lock);
 }
 
 /* release manager positions */
-static void gcwq_release_management(struct global_cwq *gcwq)
+static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
 {
 	struct worker_pool *pool;
 
+	spin_unlock_irq(&gcwq->lock);
 	for_each_worker_pool(pool, gcwq)
 		mutex_unlock(&pool->manager_mutex);
 }
@@ -3385,8 +3387,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
 	BUG_ON(gcwq->cpu != smp_processor_id());
 
-	gcwq_claim_management(gcwq);
-	spin_lock_irq(&gcwq->lock);
+	gcwq_claim_management_and_lock(gcwq);
 
 	/*
 	 * We've claimed all manager positions.  Make all workers unbound
@@ -3403,8 +3404,7 @@ static void gcwq_unbind_fn(struct work_struct *work)
 
 	gcwq->flags |= GCWQ_DISASSOCIATED;
 
-	spin_unlock_irq(&gcwq->lock);
-	gcwq_release_management(gcwq);
+	gcwq_release_management_and_unlock(gcwq);
 
 	/*
 	 * Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3428,26 +3428,19 @@ static void gcwq_unbind_fn(struct work_struct *work)
 		atomic_set(get_pool_nr_running(pool), 0);
 }
 
-static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
-						unsigned long action,
-						void *hcpu)
+/*
+ * Workqueues should be brought up before normal priority CPU notifiers.
+ * This will be registered high priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+					       unsigned long action,
+					       void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
 	struct worker_pool *pool;
-	struct work_struct unbind_work;
-	unsigned long flags;
-
-	action &= ~CPU_TASKS_FROZEN;
-
-	switch (action) {
-	case CPU_DOWN_PREPARE:
-		/* unbinding should happen on the local CPU */
-		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
-		schedule_work_on(cpu, &unbind_work);
-		flush_work(&unbind_work);
-		break;
 
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
 		for_each_worker_pool(pool, gcwq) {
 			struct worker *worker;
@@ -3463,45 +3456,16 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 			start_worker(worker);
 			spin_unlock_irq(&gcwq->lock);
 		}
-	}
-
-	/* some are called w/ irq disabled, don't disturb irq status */
-	spin_lock_irqsave(&gcwq->lock, flags);
+		break;
 
-	switch (action) {
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		spin_unlock_irq(&gcwq->lock);
-		gcwq_claim_management(gcwq);
-		spin_lock_irq(&gcwq->lock);
-
+		gcwq_claim_management_and_lock(gcwq);
 		gcwq->flags &= ~GCWQ_DISASSOCIATED;
-
 		rebind_workers(gcwq);
-
-		gcwq_release_management(gcwq);
+		gcwq_release_management_and_unlock(gcwq);
 		break;
 	}
-
-	spin_unlock_irqrestore(&gcwq->lock, flags);
-
-	return notifier_from_errno(0);
-}
-
-/*
- * Workqueues should be brought up before normal priority CPU notifiers.
- * This will be registered high priority CPU notifier.
- */
-static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
-					       unsigned long action,
-					       void *hcpu)
-{
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_UP_PREPARE:
-	case CPU_DOWN_FAILED:
-	case CPU_ONLINE:
-		return workqueue_cpu_callback(nfb, action, hcpu);
-	}
 	return NOTIFY_OK;
 }
 
@@ -3513,9 +3477,16 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 						 unsigned long action,
 						 void *hcpu)
 {
+	unsigned int cpu = (unsigned long)hcpu;
+	struct work_struct unbind_work;
+
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
-		return workqueue_cpu_callback(nfb, action, hcpu);
+		/* unbinding should happen on the local CPU */
+		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+		schedule_work_on(cpu, &unbind_work);
+		flush_work(&unbind_work);
+		break;
 	}
 	return NOTIFY_OK;
 }
-- 
cgit v1.2.3


From 1fd7bc678f96a680ee5d0727957df41cd96cf016 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 19 Jul 2012 13:52:53 -0700
Subject: kthread_worker: reorganize to prepare for flush_kthread_work()
 reimplementation

Make the following two non-functional changes.

* Separate out insert_kthread_work() from queue_kthread_work().

* Relocate struct kthread_flush_work and kthread_flush_work_fn()
  definitions above flush_kthread_work().

v2: Added lockdep_assert_held() in insert_kthread_work() as suggested
    by Andy Walls.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Andy Walls <awalls@md.metrocast.net>
---
 kernel/kthread.c | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..4bfbff36d447 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -378,6 +378,19 @@ repeat:
 }
 EXPORT_SYMBOL_GPL(kthread_worker_fn);
 
+/* insert @work before @pos in @worker */
+static void insert_kthread_work(struct kthread_worker *worker,
+			       struct kthread_work *work,
+			       struct list_head *pos)
+{
+	lockdep_assert_held(&worker->lock);
+
+	list_add_tail(&work->node, pos);
+	work->queue_seq++;
+	if (likely(worker->task))
+		wake_up_process(worker->task);
+}
+
 /**
  * queue_kthread_work - queue a kthread_work
  * @worker: target kthread_worker
@@ -395,10 +408,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
 
 	spin_lock_irqsave(&worker->lock, flags);
 	if (list_empty(&work->node)) {
-		list_add_tail(&work->node, &worker->work_list);
-		work->queue_seq++;
-		if (likely(worker->task))
-			wake_up_process(worker->task);
+		insert_kthread_work(worker, work, &worker->work_list);
 		ret = true;
 	}
 	spin_unlock_irqrestore(&worker->lock, flags);
@@ -406,6 +416,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
 }
 EXPORT_SYMBOL_GPL(queue_kthread_work);
 
+struct kthread_flush_work {
+	struct kthread_work	work;
+	struct completion	done;
+};
+
+static void kthread_flush_work_fn(struct kthread_work *work)
+{
+	struct kthread_flush_work *fwork =
+		container_of(work, struct kthread_flush_work, work);
+	complete(&fwork->done);
+}
+
 /**
  * flush_kthread_work - flush a kthread_work
  * @work: work to flush
@@ -436,18 +458,6 @@ void flush_kthread_work(struct kthread_work *work)
 }
 EXPORT_SYMBOL_GPL(flush_kthread_work);
 
-struct kthread_flush_work {
-	struct kthread_work	work;
-	struct completion	done;
-};
-
-static void kthread_flush_work_fn(struct kthread_work *work)
-{
-	struct kthread_flush_work *fwork =
-		container_of(work, struct kthread_flush_work, work);
-	complete(&fwork->done);
-}
-
 /**
  * flush_kthread_worker - flush all current works on a kthread_worker
  * @worker: worker to flush
-- 
cgit v1.2.3


From 700cabaed8f7cf3847e58e1a6ae72bc1dc79f78d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 19 Jul 2012 13:52:53 -0700
Subject: kthread_worker: reimplement flush_kthread_work() to allow freeing the
 work item being executed

kthread_worker provides minimalistic workqueue-like interface for
users which need a dedicated worker thread (e.g. for realtime
priority).  It has basic queue, flush_work, flush_worker operations
which mostly match the workqueue counterparts; however, due to the way
flush_work() is implemented, it has a noticeable difference of not
allowing work items to be freed while being executed.

While the current users of kthread_worker are okay with the current
behavior, the restriction does impede some valid use cases.  Also,
removing this difference isn't difficult and actually makes the code
easier to understand.

This patch reimplements flush_kthread_work() such that it uses a
flush_work item instead of queue/done sequence numbers.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/kthread.h |  8 ++------
 kernel/kthread.c        | 48 +++++++++++++++++++++++++++---------------------
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 0714b24c0e45..22ccf9dee177 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -49,8 +49,6 @@ extern int tsk_fork_get_node(struct task_struct *tsk);
  * can be queued and flushed using queue/flush_kthread_work()
  * respectively.  Queued kthread_works are processed by a kthread
  * running kthread_worker_fn().
- *
- * A kthread_work can't be freed while it is executing.
  */
 struct kthread_work;
 typedef void (*kthread_work_func_t)(struct kthread_work *work);
@@ -59,15 +57,14 @@ struct kthread_worker {
 	spinlock_t		lock;
 	struct list_head	work_list;
 	struct task_struct	*task;
+	struct kthread_work	*current_work;
 };
 
 struct kthread_work {
 	struct list_head	node;
 	kthread_work_func_t	func;
 	wait_queue_head_t	done;
-	atomic_t		flushing;
-	int			queue_seq;
-	int			done_seq;
+	struct kthread_worker	*worker;
 };
 
 #define KTHREAD_WORKER_INIT(worker)	{				\
@@ -79,7 +76,6 @@ struct kthread_work {
 	.node = LIST_HEAD_INIT((work).node),				\
 	.func = (fn),							\
 	.done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done),		\
-	.flushing = ATOMIC_INIT(0),					\
 	}
 
 #define DEFINE_KTHREAD_WORKER(worker)					\
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4bfbff36d447..b579af57ea10 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -360,16 +360,12 @@ repeat:
 					struct kthread_work, node);
 		list_del_init(&work->node);
 	}
+	worker->current_work = work;
 	spin_unlock_irq(&worker->lock);
 
 	if (work) {
 		__set_current_state(TASK_RUNNING);
 		work->func(work);
-		smp_wmb();	/* wmb worker-b0 paired with flush-b1 */
-		work->done_seq = work->queue_seq;
-		smp_mb();	/* mb worker-b1 paired with flush-b0 */
-		if (atomic_read(&work->flushing))
-			wake_up_all(&work->done);
 	} else if (!freezing(current))
 		schedule();
 
@@ -386,7 +382,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
 	lockdep_assert_held(&worker->lock);
 
 	list_add_tail(&work->node, pos);
-	work->queue_seq++;
+	work->worker = worker;
 	if (likely(worker->task))
 		wake_up_process(worker->task);
 }
@@ -436,25 +432,35 @@ static void kthread_flush_work_fn(struct kthread_work *work)
  */
 void flush_kthread_work(struct kthread_work *work)
 {
-	int seq = work->queue_seq;
+	struct kthread_flush_work fwork = {
+		KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
+		COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+	};
+	struct kthread_worker *worker;
+	bool noop = false;
+
+retry:
+	worker = work->worker;
+	if (!worker)
+		return;
 
-	atomic_inc(&work->flushing);
+	spin_lock_irq(&worker->lock);
+	if (work->worker != worker) {
+		spin_unlock_irq(&worker->lock);
+		goto retry;
+	}
 
-	/*
-	 * mb flush-b0 paired with worker-b1, to make sure either
-	 * worker sees the above increment or we see done_seq update.
-	 */
-	smp_mb__after_atomic_inc();
+	if (!list_empty(&work->node))
+		insert_kthread_work(worker, &fwork.work, work->node.next);
+	else if (worker->current_work == work)
+		insert_kthread_work(worker, &fwork.work, worker->work_list.next);
+	else
+		noop = true;
 
-	/* A - B <= 0 tests whether B is in front of A regardless of overflow */
-	wait_event(work->done, seq - work->done_seq <= 0);
-	atomic_dec(&work->flushing);
+	spin_unlock_irq(&worker->lock);
 
-	/*
-	 * rmb flush-b1 paired with worker-b0, to make sure our caller
-	 * sees every change made by work->func().
-	 */
-	smp_mb__after_atomic_dec();
+	if (!noop)
+		wait_for_completion(&fwork.done);
 }
 EXPORT_SYMBOL_GPL(flush_kthread_work);
 
-- 
cgit v1.2.3


From 5556f0d930354f7172411b5c70a5b5fcfaa7f6bc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 22 Jul 2012 10:16:34 -0700
Subject: workqueue: fix spurious CPU locality WARN from process_one_work()

25511a4776 "workqueue: reimplement CPU online rebinding to handle idle
workers" added CPU locality sanity check in process_one_work().  It
triggers if a worker is executing on a different CPU without UNBOUND
or REBIND set.

This works for all normal workers but rescuers can trigger this
spuriously when they're serving the unbound or a disassociated
global_cwq - rescuers don't have either flag set and thus its
gcwq->cpu can be a different value including %WORK_CPU_UNBOUND.

Fix it by additionally testing %GCWQ_DISASSOCIATED.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
LKML-Reference: <20120721213656.GA7783@linux.vnet.ibm.com>
---
 kernel/workqueue.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 471996a81633..692d97628a10 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1949,7 +1949,13 @@ __acquires(&gcwq->lock)
 
 	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
 #endif
+	/*
+	 * Ensure we're on the correct CPU.  DISASSOCIATED test is
+	 * necessary to avoid spurious warnings from rescuers servicing the
+	 * unbound or a disassociated gcwq.
+	 */
 	WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
+		     !(gcwq->flags & GCWQ_DISASSOCIATED) &&
 		     raw_smp_processor_id() != gcwq->cpu);
 
 	/*
-- 
cgit v1.2.3


From 5506fcd779609e3c03c4ad7fff36c5dcd0cdc686 Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Wed, 14 Mar 2012 11:29:50 -0700
Subject: cpuidle: refactor out cpuidle_enter_state

Split the code to enter a state and update the stats into a helper
function, cpuidle_enter_state, and export it.  This function will
be called by the coupled state code to handle entering the safe
state and the final coupled state.

Reviewed-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Tested-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Tested-by: Kevin Hilman <khilman@ti.com>
Reviewed-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/cpuidle/cpuidle.c | 42 +++++++++++++++++++++++++++++-------------
 drivers/cpuidle/cpuidle.h |  2 ++
 2 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index d90519cec880..cd266101c80e 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -91,6 +91,34 @@ int cpuidle_play_dead(void)
 	return -ENODEV;
 }
 
+/**
+ * cpuidle_enter_state - enter the state and update stats
+ * @dev: cpuidle device for this cpu
+ * @drv: cpuidle driver for this cpu
+ * @next_state: index into drv->states of the state to enter
+ */
+int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
+		int next_state)
+{
+	int entered_state;
+
+	entered_state = cpuidle_enter_ops(dev, drv, next_state);
+
+	if (entered_state >= 0) {
+		/* Update cpuidle counters */
+		/* This can be moved to within driver enter routine
+		 * but that results in multiple copies of same code.
+		 */
+		dev->states_usage[entered_state].time +=
+				(unsigned long long)dev->last_residency;
+		dev->states_usage[entered_state].usage++;
+	} else {
+		dev->last_residency = 0;
+	}
+
+	return entered_state;
+}
+
 /**
  * cpuidle_idle_call - the main idle loop
  *
@@ -132,23 +160,11 @@ int cpuidle_idle_call(void)
 	trace_power_start_rcuidle(POWER_CSTATE, next_state, dev->cpu);
 	trace_cpu_idle_rcuidle(next_state, dev->cpu);
 
-	entered_state = cpuidle_enter_ops(dev, drv, next_state);
+	entered_state = cpuidle_enter_state(dev, drv, next_state);
 
 	trace_power_end_rcuidle(dev->cpu);
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
-	if (entered_state >= 0) {
-		/* Update cpuidle counters */
-		/* This can be moved to within driver enter routine
-		 * but that results in multiple copies of same code.
-		 */
-		dev->states_usage[entered_state].time +=
-				(unsigned long long)dev->last_residency;
-		dev->states_usage[entered_state].usage++;
-	} else {
-		dev->last_residency = 0;
-	}
-
 	/* give the governor an opportunity to reflect on the outcome */
 	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev, entered_state);
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
index 7db186685c27..d8a3ccce8281 100644
--- a/drivers/cpuidle/cpuidle.h
+++ b/drivers/cpuidle/cpuidle.h
@@ -14,6 +14,8 @@ extern struct list_head cpuidle_detected_devices;
 extern struct mutex cpuidle_lock;
 extern spinlock_t cpuidle_driver_lock;
 extern int cpuidle_disabled(void);
+extern int cpuidle_enter_state(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int next_state);
 
 /* idle loop */
 extern void cpuidle_install_idle_handler(void);
-- 
cgit v1.2.3


From e9545f85421c9c1f43905fa85fa489ab63371f3c Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Wed, 14 Mar 2012 11:29:51 -0700
Subject: cpuidle: fix error handling in __cpuidle_register_device

Fix the error handling in __cpuidle_register_device to include
the missing list_del.  Move it to a label, which will simplify
the error handling when coupled states are added.

Reviewed-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Tested-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Tested-by: Kevin Hilman <khilman@ti.com>
Reviewed-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/cpuidle/cpuidle.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index cd266101c80e..3c37ced0e0c0 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -392,13 +392,18 @@ static int __cpuidle_register_device(struct cpuidle_device *dev)
 
 	per_cpu(cpuidle_devices, dev->cpu) = dev;
 	list_add(&dev->device_list, &cpuidle_detected_devices);
-	if ((ret = cpuidle_add_sysfs(cpu_dev))) {
-		module_put(cpuidle_driver->owner);
-		return ret;
-	}
+	ret = cpuidle_add_sysfs(cpu_dev);
+	if (ret)
+		goto err_sysfs;
 
 	dev->registered = 1;
 	return 0;
+
+err_sysfs:
+	list_del(&dev->device_list);
+	per_cpu(cpuidle_devices, dev->cpu) = NULL;
+	module_put(cpuidle_driver->owner);
+	return ret;
 }
 
 /**
-- 
cgit v1.2.3


From d678be3dbea5a310f57ead82d076b301b40dc315 Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Wed, 14 Mar 2012 11:29:52 -0700
Subject: cpuidle: add support for states that affect multiple cpus

On some ARM SMP SoCs (OMAP4460, Tegra 2, and probably more), the
cpus cannot be independently powered down, either due to
sequencing restrictions (on Tegra 2, cpu 0 must be the last to
power down), or due to HW bugs (on OMAP4460, a cpu powering up
will corrupt the gic state unless the other cpu runs a work
around).  Each cpu has a power state that it can enter without
coordinating with the other cpu (usually Wait For Interrupt, or
WFI), and one or more "coupled" power states that affect blocks
shared between the cpus (L2 cache, interrupt controller, and
sometimes the whole SoC).  Entering a coupled power state must
be tightly controlled on both cpus.

The easiest solution to implementing coupled cpu power states is
to hotplug all but one cpu whenever possible, usually using a
cpufreq governor that looks at cpu load to determine when to
enable the secondary cpus.  This causes problems, as hotplug is an
expensive operation, so the number of hotplug transitions must be
minimized, leading to very slow response to loads, often on the
order of seconds.

This file implements an alternative solution, where each cpu will
wait in the WFI state until all cpus are ready to enter a coupled
state, at which point the coupled state function will be called
on all cpus at approximately the same time.

Once all cpus are ready to enter idle, they are woken by an smp
cross call.  At this point, there is a chance that one of the
cpus will find work to do, and choose not to enter idle.  A
final pass is needed to guarantee that all cpus will call the
power state enter function at the same time.  During this pass,
each cpu will increment the ready counter, and continue once the
ready counter matches the number of online coupled cpus.  If any
cpu exits idle, the other cpus will decrement their counter and
retry.

To use coupled cpuidle states, a cpuidle driver must:

   Set struct cpuidle_device.coupled_cpus to the mask of all
   coupled cpus, usually the same as cpu_possible_mask if all cpus
   are part of the same cluster.  The coupled_cpus mask must be
   set in the struct cpuidle_device for each cpu.

   Set struct cpuidle_device.safe_state to a state that is not a
   coupled state.  This is usually WFI.

   Set CPUIDLE_FLAG_COUPLED in struct cpuidle_state.flags for each
   state that affects multiple cpus.

   Provide a struct cpuidle_state.enter function for each state
   that affects multiple cpus.  This function is guaranteed to be
   called on all cpus at approximately the same time.  The driver
   should ensure that the cpus all abort together if any cpu tries
   to abort once the function is called.

Cc: Len Brown <len.brown@intel.com>
Cc: Amit Kucheria <amit.kucheria@linaro.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Trinabh Gupta <g.trinabh@gmail.com>
Cc: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Reviewed-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Tested-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Tested-by: Kevin Hilman <khilman@ti.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/cpuidle/Kconfig   |   3 +
 drivers/cpuidle/Makefile  |   1 +
 drivers/cpuidle/coupled.c | 678 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/cpuidle/cpuidle.c |  15 +-
 drivers/cpuidle/cpuidle.h |  30 ++
 include/linux/cpuidle.h   |   7 +
 6 files changed, 733 insertions(+), 1 deletion(-)
 create mode 100644 drivers/cpuidle/coupled.c

diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 78a666d1e5f5..a76b689e553b 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -18,3 +18,6 @@ config CPU_IDLE_GOV_MENU
 	bool
 	depends on CPU_IDLE && NO_HZ
 	default y
+
+config ARCH_NEEDS_CPU_IDLE_COUPLED
+	def_bool n
diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index 5634f88379df..38c8f69f30cf 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-y += cpuidle.o driver.o governor.o sysfs.o governors/
+obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
new file mode 100644
index 000000000000..4a7dcc41360d
--- /dev/null
+++ b/drivers/cpuidle/coupled.c
@@ -0,0 +1,678 @@
+/*
+ * coupled.c - helper functions to enter the same idle state on multiple cpus
+ *
+ * Copyright (c) 2011 Google, Inc.
+ *
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "cpuidle.h"
+
+/**
+ * DOC: Coupled cpuidle states
+ *
+ * On some ARM SMP SoCs (OMAP4460, Tegra 2, and probably more), the
+ * cpus cannot be independently powered down, either due to
+ * sequencing restrictions (on Tegra 2, cpu 0 must be the last to
+ * power down), or due to HW bugs (on OMAP4460, a cpu powering up
+ * will corrupt the gic state unless the other cpu runs a work
+ * around).  Each cpu has a power state that it can enter without
+ * coordinating with the other cpu (usually Wait For Interrupt, or
+ * WFI), and one or more "coupled" power states that affect blocks
+ * shared between the cpus (L2 cache, interrupt controller, and
+ * sometimes the whole SoC).  Entering a coupled power state must
+ * be tightly controlled on both cpus.
+ *
+ * This file implements a solution, where each cpu will wait in the
+ * WFI state until all cpus are ready to enter a coupled state, at
+ * which point the coupled state function will be called on all
+ * cpus at approximately the same time.
+ *
+ * Once all cpus are ready to enter idle, they are woken by an smp
+ * cross call.  At this point, there is a chance that one of the
+ * cpus will find work to do, and choose not to enter idle.  A
+ * final pass is needed to guarantee that all cpus will call the
+ * power state enter function at the same time.  During this pass,
+ * each cpu will increment the ready counter, and continue once the
+ * ready counter matches the number of online coupled cpus.  If any
+ * cpu exits idle, the other cpus will decrement their counter and
+ * retry.
+ *
+ * requested_state stores the deepest coupled idle state each cpu
+ * is ready for.  It is assumed that the states are indexed from
+ * shallowest (highest power, lowest exit latency) to deepest
+ * (lowest power, highest exit latency).  The requested_state
+ * variable is not locked.  It is only written from the cpu that
+ * it stores (or by the on/offlining cpu if that cpu is offline),
+ * and only read after all the cpus are ready for the coupled idle
+ * state and are no longer updating it.
+ *
+ * Three atomic counters are used.  alive_count tracks the number
+ * of cpus in the coupled set that are currently or soon will be
+ * online.  waiting_count tracks the number of cpus that are in
+ * the waiting loop, in the ready loop, or in the coupled idle state.
+ * ready_count tracks the number of cpus that are in the ready loop
+ * or in the coupled idle state.
+ *
+ * To use coupled cpuidle states, a cpuidle driver must:
+ *
+ *    Set struct cpuidle_device.coupled_cpus to the mask of all
+ *    coupled cpus, usually the same as cpu_possible_mask if all cpus
+ *    are part of the same cluster.  The coupled_cpus mask must be
+ *    set in the struct cpuidle_device for each cpu.
+ *
+ *    Set struct cpuidle_device.safe_state to a state that is not a
+ *    coupled state.  This is usually WFI.
+ *
+ *    Set CPUIDLE_FLAG_COUPLED in struct cpuidle_state.flags for each
+ *    state that affects multiple cpus.
+ *
+ *    Provide a struct cpuidle_state.enter function for each state
+ *    that affects multiple cpus.  This function is guaranteed to be
+ *    called on all cpus at approximately the same time.  The driver
+ *    should ensure that the cpus all abort together if any cpu tries
+ *    to abort once the function is called.  The function should return
+ *    with interrupts still disabled.
+ */
+
+/**
+ * struct cpuidle_coupled - data for set of cpus that share a coupled idle state
+ * @coupled_cpus: mask of cpus that are part of the coupled set
+ * @requested_state: array of requested states for cpus in the coupled set
+ * @ready_waiting_counts: combined count of cpus  in ready or waiting loops
+ * @online_count: count of cpus that are online
+ * @refcnt: reference count of cpuidle devices that are using this struct
+ * @prevent: flag to prevent coupled idle while a cpu is hotplugging
+ */
+struct cpuidle_coupled {
+	cpumask_t coupled_cpus;
+	int requested_state[NR_CPUS];
+	atomic_t ready_waiting_counts;
+	int online_count;
+	int refcnt;
+	int prevent;
+};
+
+#define WAITING_BITS 16
+#define MAX_WAITING_CPUS (1 << WAITING_BITS)
+#define WAITING_MASK (MAX_WAITING_CPUS - 1)
+#define READY_MASK (~WAITING_MASK)
+
+#define CPUIDLE_COUPLED_NOT_IDLE	(-1)
+
+static DEFINE_MUTEX(cpuidle_coupled_lock);
+static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
+
+/*
+ * The cpuidle_coupled_poked_mask mask is used to avoid calling
+ * __smp_call_function_single with the per cpu call_single_data struct already
+ * in use.  This prevents a deadlock where two cpus are waiting for each others
+ * call_single_data struct to be available
+ */
+static cpumask_t cpuidle_coupled_poked_mask;
+
+/**
+ * cpuidle_state_is_coupled - check if a state is part of a coupled set
+ * @dev: struct cpuidle_device for the current cpu
+ * @drv: struct cpuidle_driver for the platform
+ * @state: index of the target state in drv->states
+ *
+ * Returns true if the target state is coupled with cpus besides this one
+ */
+bool cpuidle_state_is_coupled(struct cpuidle_device *dev,
+	struct cpuidle_driver *drv, int state)
+{
+	return drv->states[state].flags & CPUIDLE_FLAG_COUPLED;
+}
+
+/**
+ * cpuidle_coupled_set_ready - mark a cpu as ready
+ * @coupled: the struct coupled that contains the current cpu
+ */
+static inline void cpuidle_coupled_set_ready(struct cpuidle_coupled *coupled)
+{
+	atomic_add(MAX_WAITING_CPUS, &coupled->ready_waiting_counts);
+}
+
+/**
+ * cpuidle_coupled_set_not_ready - mark a cpu as not ready
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Decrements the ready counter, unless the ready (and thus the waiting) counter
+ * is equal to the number of online cpus.  Prevents a race where one cpu
+ * decrements the waiting counter and then re-increments it just before another
+ * cpu has decremented its ready counter, leading to the ready counter going
+ * down from the number of online cpus without going through the coupled idle
+ * state.
+ *
+ * Returns 0 if the counter was decremented successfully, -EINVAL if the ready
+ * counter was equal to the number of online cpus.
+ */
+static
+inline int cpuidle_coupled_set_not_ready(struct cpuidle_coupled *coupled)
+{
+	int all, ret;
+
+	/* "all ready and waiting": online_count in both the high and low bits */
+	all = coupled->online_count | (coupled->online_count << WAITING_BITS);
+	ret = atomic_add_unless(&coupled->ready_waiting_counts,
+		-MAX_WAITING_CPUS, all);
+
+	return ret ? 0 : -EINVAL;
+}
+
+/**
+ * cpuidle_coupled_no_cpus_ready - check if no cpus in a coupled set are ready
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all of the cpus in a coupled set are out of the ready loop.
+ */
+static inline int cpuidle_coupled_no_cpus_ready(struct cpuidle_coupled *coupled)
+{
+	int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS;
+	return r == 0;
+}
+
+/**
+ * cpuidle_coupled_cpus_ready - check if all cpus in a coupled set are ready
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all cpus coupled to this target state are in the ready loop
+ */
+static inline bool cpuidle_coupled_cpus_ready(struct cpuidle_coupled *coupled)
+{
+	int r = atomic_read(&coupled->ready_waiting_counts) >> WAITING_BITS;
+	return r == coupled->online_count;
+}
+
+/**
+ * cpuidle_coupled_cpus_waiting - check if all cpus in a coupled set are waiting
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all cpus coupled to this target state are in the wait loop
+ */
+static inline bool cpuidle_coupled_cpus_waiting(struct cpuidle_coupled *coupled)
+{
+	int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK;
+	return w == coupled->online_count;
+}
+
+/**
+ * cpuidle_coupled_no_cpus_waiting - check if no cpus in coupled set are waiting
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns true if all of the cpus in a coupled set are out of the waiting loop.
+ */
+static inline int cpuidle_coupled_no_cpus_waiting(struct cpuidle_coupled *coupled)
+{
+	int w = atomic_read(&coupled->ready_waiting_counts) & WAITING_MASK;
+	return w == 0;
+}
+
+/**
+ * cpuidle_coupled_get_state - determine the deepest idle state
+ * @dev: struct cpuidle_device for this cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Returns the deepest idle state that all coupled cpus can enter
+ */
+static inline int cpuidle_coupled_get_state(struct cpuidle_device *dev,
+		struct cpuidle_coupled *coupled)
+{
+	int i;
+	int state = INT_MAX;
+
+	/*
+	 * Read barrier ensures that read of requested_state is ordered after
+	 * reads of ready_count.  Matches the write barrier in
+	 * cpuidle_coupled_set_waiting.
+	 */
+	smp_rmb();
+
+	for_each_cpu_mask(i, coupled->coupled_cpus)
+		if (cpu_online(i) && coupled->requested_state[i] < state)
+			state = coupled->requested_state[i];
+
+	return state;
+}
+
+static void cpuidle_coupled_poked(void *info)
+{
+	int cpu = (unsigned long)info;
+	cpumask_clear_cpu(cpu, &cpuidle_coupled_poked_mask);
+}
+
+/**
+ * cpuidle_coupled_poke - wake up a cpu that may be waiting
+ * @cpu: target cpu
+ *
+ * Ensures that the target cpu exits its waiting idle state (if it is in it)
+ * and will see updates to waiting_count before it re-enters its waiting idle
+ * state.
+ *
+ * If cpuidle_coupled_poked_mask is already set for the target cpu, that cpu
+ * either has or will soon have a pending IPI that will wake it out of idle,
+ * or it is currently processing the IPI and is not in idle.
+ */
+static void cpuidle_coupled_poke(int cpu)
+{
+	struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
+
+	if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poked_mask))
+		__smp_call_function_single(cpu, csd, 0);
+}
+
+/**
+ * cpuidle_coupled_poke_others - wake up all other cpus that may be waiting
+ * @dev: struct cpuidle_device for this cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Calls cpuidle_coupled_poke on all other online cpus.
+ */
+static void cpuidle_coupled_poke_others(int this_cpu,
+		struct cpuidle_coupled *coupled)
+{
+	int cpu;
+
+	for_each_cpu_mask(cpu, coupled->coupled_cpus)
+		if (cpu != this_cpu && cpu_online(cpu))
+			cpuidle_coupled_poke(cpu);
+}
+
+/**
+ * cpuidle_coupled_set_waiting - mark this cpu as in the wait loop
+ * @cpu: the current cpu
+ * @coupled: the struct coupled that contains the current cpu
+ * @next_state: the index in drv->states of the requested state for this cpu
+ *
+ * Updates the requested idle state for the specified cpuidle device,
+ * poking all coupled cpus out of idle if necessary to let them see the new
+ * state.
+ */
+static void cpuidle_coupled_set_waiting(int cpu,
+		struct cpuidle_coupled *coupled, int next_state)
+{
+	int w;
+
+	coupled->requested_state[cpu] = next_state;
+
+	/*
+	 * If this is the last cpu to enter the waiting state, poke
+	 * all the other cpus out of their waiting state so they can
+	 * enter a deeper state.  This can race with one of the cpus
+	 * exiting the waiting state due to an interrupt and
+	 * decrementing waiting_count, see comment below.
+	 *
+	 * The atomic_inc_return provides a write barrier to order the write
+	 * to requested_state with the later write that increments ready_count.
+	 */
+	w = atomic_inc_return(&coupled->ready_waiting_counts) & WAITING_MASK;
+	if (w == coupled->online_count)
+		cpuidle_coupled_poke_others(cpu, coupled);
+}
+
+/**
+ * cpuidle_coupled_set_not_waiting - mark this cpu as leaving the wait loop
+ * @cpu: the cpu leaving the wait loop
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Removes the requested idle state for the specified cpu.
+ */
+static void cpuidle_coupled_set_not_waiting(int cpu,
+		struct cpuidle_coupled *coupled)
+{
+	/*
+	 * Decrementing waiting count can race with incrementing it in
+	 * cpuidle_coupled_set_waiting, but that's OK.  Worst case, some
+	 * cpus will increment ready_count and then spin until they
+	 * notice that this cpu has cleared its requested_state.
+	 */
+	atomic_dec(&coupled->ready_waiting_counts);
+
+	coupled->requested_state[cpu] = CPUIDLE_COUPLED_NOT_IDLE;
+}
+
+/**
+ * cpuidle_coupled_set_done - mark this cpu as leaving the ready loop
+ * @cpu: the current cpu
+ * @coupled: the struct coupled that contains the current cpu
+ *
+ * Marks this cpu as no longer in the ready and waiting loops.  The waiting
+ * count is dropped first (via cpuidle_coupled_set_not_waiting) so another cpu
+ * looping back in cannot see this cpu as waiting just before it exits idle.
+ */
+static void cpuidle_coupled_set_done(int cpu, struct cpuidle_coupled *coupled)
+{
+	cpuidle_coupled_set_not_waiting(cpu, coupled);
+	atomic_sub(MAX_WAITING_CPUS, &coupled->ready_waiting_counts);
+}
+
+/**
+ * cpuidle_coupled_clear_pokes - spin until the poke interrupt is processed
+ * @cpu: this cpu
+ *
+ * Turns on interrupts and spins until any outstanding poke interrupts have
+ * been processed and the poke bit has been cleared.
+ *
+ * Other interrupts may also be processed while interrupts are enabled, so
+ * need_resched() must be tested after turning interrupts off again to make sure
+ * the interrupt didn't schedule work that should take the cpu out of idle.
+ *
+ * Returns 0 if need_resched was false, -EINTR if need_resched was true.
+ */
+static int cpuidle_coupled_clear_pokes(int cpu)
+{
+	local_irq_enable();
+	while (cpumask_test_cpu(cpu, &cpuidle_coupled_poked_mask))
+		cpu_relax();
+	local_irq_disable();
+
+	return need_resched() ? -EINTR : 0;
+}
+
+/**
+ * cpuidle_enter_state_coupled - attempt to enter a state with coupled cpus
+ * @dev: struct cpuidle_device for the current cpu
+ * @drv: struct cpuidle_driver for the platform
+ * @next_state: index of the requested state in drv->states
+ *
+ * Coordinate with coupled cpus to enter the target state in two stages.
+ * First, the cpus operate independently and may call in at completely
+ * different times; to save power, early callers wait in the device's safe
+ * state until every coupled cpu has called in.  Second, each cpu spins until
+ * all cpus have guaranteed that they will enter the target state.
+ *
+ * Must be called with interrupts disabled.  May enable interrupts while
+ * preparing for idle and returns with them enabled, except on the -EINVAL
+ * path, which leaves the irq state untouched.
+ *
+ * Returns -EINVAL if @dev has no coupled data, otherwise the result of the
+ * last cpuidle_enter_state() call, or -1 if no state was entered.
+ */
+int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int next_state)
+{
+	int entered_state = -1;
+	struct cpuidle_coupled *coupled = dev->coupled;
+
+	if (!coupled)
+		return -EINVAL;
+
+	while (coupled->prevent) {
+		if (cpuidle_coupled_clear_pokes(dev->cpu)) {
+			local_irq_enable();
+			return entered_state;
+		}
+		entered_state = cpuidle_enter_state(dev, drv,
+			dev->safe_state_index);
+	}
+
+	/* Read barrier ensures online_count is read after prevent is cleared */
+	smp_rmb();
+
+	cpuidle_coupled_set_waiting(dev->cpu, coupled, next_state);
+
+retry:
+	/*
+	 * Wait for all coupled cpus to be idle, using the deepest state
+	 * allowed for a single cpu.
+	 */
+	while (!cpuidle_coupled_cpus_waiting(coupled)) {
+		if (cpuidle_coupled_clear_pokes(dev->cpu)) {
+			cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
+			goto out;
+		}
+
+		if (coupled->prevent) {
+			cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
+			goto out;
+		}
+
+		entered_state = cpuidle_enter_state(dev, drv,
+			dev->safe_state_index);
+	}
+
+	if (cpuidle_coupled_clear_pokes(dev->cpu)) {
+		cpuidle_coupled_set_not_waiting(dev->cpu, coupled);
+		goto out;
+	}
+
+	/*
+	 * All coupled cpus are probably idle.  There is a small chance that
+	 * one of the other cpus just became active.  Increment the ready count,
+	 * and spin until all coupled cpus have incremented the counter. Once a
+	 * cpu has incremented the ready counter, it cannot abort idle and must
+	 * spin until either all cpus have incremented the ready counter, or
+	 * another cpu leaves idle and decrements the waiting counter.
+	 */
+
+	cpuidle_coupled_set_ready(coupled);
+	while (!cpuidle_coupled_cpus_ready(coupled)) {
+		/* Check if any other cpus bailed out of idle. */
+		if (!cpuidle_coupled_cpus_waiting(coupled))
+			if (!cpuidle_coupled_set_not_ready(coupled))
+				goto retry;
+
+		cpu_relax();
+	}
+
+	/* all cpus have acked the coupled state */
+	next_state = cpuidle_coupled_get_state(dev, coupled);
+
+	entered_state = cpuidle_enter_state(dev, drv, next_state);
+
+	cpuidle_coupled_set_done(dev->cpu, coupled);
+
+out:
+	/*
+	 * Normal cpuidle states are expected to return with irqs enabled.
+	 * That leads to an inefficiency where a cpu receiving an interrupt
+	 * that brings it out of idle will process that interrupt before
+	 * exiting the idle enter function and decrementing ready_count.  All
+	 * other cpus will need to spin waiting for the cpu that is processing
+	 * the interrupt.  If the driver returns with interrupts disabled,
+	 * all other cpus will loop back into the safe idle state instead of
+	 * spinning, saving power.
+	 *
+	 * Calling local_irq_enable here allows coupled states to return with
+	 * interrupts disabled, but won't cause problems for drivers that
+	 * exit with interrupts enabled.
+	 */
+	local_irq_enable();
+
+	/*
+	 * Wait until all coupled cpus have exited idle.  There is no risk that
+	 * a cpu exits and re-enters the ready state because this cpu has
+	 * already decremented its waiting_count.
+	 */
+	while (!cpuidle_coupled_no_cpus_ready(coupled))
+		cpu_relax();
+
+	return entered_state;
+}
+
+static void cpuidle_coupled_update_online_cpus(struct cpuidle_coupled *coupled)
+{
+	cpumask_t cpus;	/* online members of the coupled set */
+	cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus);
+	coupled->online_count = cpumask_weight(&cpus);
+}
+
+/**
+ * cpuidle_coupled_register_device - register a coupled cpuidle device
+ * @dev: struct cpuidle_device for the current cpu
+ *
+ * Called from cpuidle_register_device to handle coupled idle init.  Reuses the
+ * cpuidle_coupled struct of an already registered coupled cpu, or allocates
+ * one.  Returns 0 on success (or if @dev is not coupled), -ENOMEM otherwise.
+ */
+int cpuidle_coupled_register_device(struct cpuidle_device *dev)
+{
+	int cpu;
+	struct cpuidle_device *other_dev;
+	struct call_single_data *csd;
+	struct cpuidle_coupled *coupled;
+
+	if (cpumask_empty(&dev->coupled_cpus))
+		return 0;
+
+	for_each_cpu_mask(cpu, dev->coupled_cpus) {
+		other_dev = per_cpu(cpuidle_devices, cpu);
+		if (other_dev && other_dev->coupled) {
+			coupled = other_dev->coupled;
+			goto have_coupled;
+		}
+	}
+
+	/* No existing coupled info found, create a new one */
+	coupled = kzalloc(sizeof(struct cpuidle_coupled), GFP_KERNEL);
+	if (!coupled)
+		return -ENOMEM;
+
+	coupled->coupled_cpus = dev->coupled_cpus;
+
+have_coupled:
+	dev->coupled = coupled;
+	if (WARN_ON(!cpumask_equal(&dev->coupled_cpus, &coupled->coupled_cpus)))
+		coupled->prevent++;
+
+	cpuidle_coupled_update_online_cpus(coupled);
+
+	coupled->refcnt++;
+
+	csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu);
+	csd->func = cpuidle_coupled_poked;
+	csd->info = (void *)(unsigned long)dev->cpu;
+
+	return 0;
+}
+
+/**
+ * cpuidle_coupled_unregister_device - unregister a coupled cpuidle device
+ * @dev: struct cpuidle_device for the current cpu
+ *
+ * Called from cpuidle_unregister_device to tear down coupled idle.  Removes the
+ * cpu from the coupled idle set, and frees the cpuidle_coupled struct only
+ * when the last reference is dropped (refcnt reaches zero).
+ */
+void cpuidle_coupled_unregister_device(struct cpuidle_device *dev)
+{
+	struct cpuidle_coupled *coupled = dev->coupled;
+
+	if (cpumask_empty(&dev->coupled_cpus))
+		return;
+
+	if (!--coupled->refcnt)
+		kfree(coupled);
+	dev->coupled = NULL;
+}
+
+/**
+ * cpuidle_coupled_prevent_idle - prevent cpus from entering a coupled state
+ * @coupled: the struct coupled that contains the cpu that is changing state
+ *
+ * Disables coupled cpuidle on a coupled set of cpus.  Used to ensure that
+ * cpu_online_mask doesn't change while cpus are coordinating coupled idle.
+ */
+static void cpuidle_coupled_prevent_idle(struct cpuidle_coupled *coupled)
+{
+	int cpu = get_cpu();
+
+	/* Block new entries, then poke cpus out of the waiting loop. */
+	coupled->prevent++;
+	cpuidle_coupled_poke_others(cpu, coupled);
+	put_cpu();
+	while (!cpuidle_coupled_no_cpus_waiting(coupled))
+		cpu_relax();
+}
+
+/**
+ * cpuidle_coupled_allow_idle - allows cpus to enter a coupled state
+ * @coupled: the struct coupled that contains the cpu that is changing state
+ *
+ * Enables coupled cpuidle on a coupled set of cpus.  Used to ensure that
+ * cpu_online_mask doesn't change while cpus are coordinating coupled idle.
+ */
+static void cpuidle_coupled_allow_idle(struct cpuidle_coupled *coupled)
+{
+	int cpu = get_cpu();
+
+	/*
+	 * Write barrier ensures readers see the new online_count when they
+	 * see prevent drop to zero.
+	 */
+	smp_wmb();
+	coupled->prevent--;
+	/* Force cpus out of the prevent loop. */
+	cpuidle_coupled_poke_others(cpu, coupled);
+	put_cpu();
+}
+
+/**
+ * cpuidle_coupled_cpu_notify - notifier called during hotplug transitions
+ * @nb: notifier block
+ * @action: hotplug transition
+ * @hcpu: target cpu number
+ *
+ * Called when a cpu is brought on or offline using hotplug.  Updates the
+ * coupled cpu set appropriately; cpus without a coupled device are ignored.
+ */
+static int cpuidle_coupled_cpu_notify(struct notifier_block *nb,
+		unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct cpuidle_device *dev;
+
+	mutex_lock(&cpuidle_lock);
+
+	dev = per_cpu(cpuidle_devices, cpu);
+	if (!dev || !dev->coupled)
+		goto out;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+	case CPU_DOWN_PREPARE:
+		cpuidle_coupled_prevent_idle(dev->coupled);
+		break;
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		cpuidle_coupled_update_online_cpus(dev->coupled);
+		/* Fall through */
+	case CPU_UP_CANCELED:
+	case CPU_DOWN_FAILED:
+		cpuidle_coupled_allow_idle(dev->coupled);
+		break;
+	}
+
+out:
+	mutex_unlock(&cpuidle_lock);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpuidle_coupled_cpu_notifier = {
+	.notifier_call = cpuidle_coupled_cpu_notify,	/* cpu hotplug events */
+};
+
+static int __init cpuidle_coupled_init(void)
+{
+	return register_cpu_notifier(&cpuidle_coupled_cpu_notifier);
+}
+core_initcall(cpuidle_coupled_init);
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 3c37ced0e0c0..ed1dd859299b 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -160,7 +160,11 @@ int cpuidle_idle_call(void)
 	trace_power_start_rcuidle(POWER_CSTATE, next_state, dev->cpu);
 	trace_cpu_idle_rcuidle(next_state, dev->cpu);
 
-	entered_state = cpuidle_enter_state(dev, drv, next_state);
+	if (cpuidle_state_is_coupled(dev, drv, next_state))
+		entered_state = cpuidle_enter_state_coupled(dev, drv,
+							    next_state);
+	else
+		entered_state = cpuidle_enter_state(dev, drv, next_state);
 
 	trace_power_end_rcuidle(dev->cpu);
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
@@ -396,9 +400,16 @@ static int __cpuidle_register_device(struct cpuidle_device *dev)
 	if (ret)
 		goto err_sysfs;
 
+	ret = cpuidle_coupled_register_device(dev);
+	if (ret)
+		goto err_coupled;
+
 	dev->registered = 1;
 	return 0;
 
+err_coupled:
+	cpuidle_remove_sysfs(cpu_dev);
+	wait_for_completion(&dev->kobj_unregister);
 err_sysfs:
 	list_del(&dev->device_list);
 	per_cpu(cpuidle_devices, dev->cpu) = NULL;
@@ -453,6 +464,8 @@ void cpuidle_unregister_device(struct cpuidle_device *dev)
 	wait_for_completion(&dev->kobj_unregister);
 	per_cpu(cpuidle_devices, dev->cpu) = NULL;
 
+	cpuidle_coupled_unregister_device(dev);
+
 	cpuidle_resume_and_unlock();
 
 	module_put(cpuidle_driver->owner);
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
index d8a3ccce8281..76e7f696ad8c 100644
--- a/drivers/cpuidle/cpuidle.h
+++ b/drivers/cpuidle/cpuidle.h
@@ -32,4 +32,34 @@ extern void cpuidle_remove_state_sysfs(struct cpuidle_device *device);
 extern int cpuidle_add_sysfs(struct device *dev);
 extern void cpuidle_remove_sysfs(struct device *dev);
 
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+bool cpuidle_state_is_coupled(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int state);
+int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int next_state);
+int cpuidle_coupled_register_device(struct cpuidle_device *dev);
+void cpuidle_coupled_unregister_device(struct cpuidle_device *dev);
+#else /* !CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED: no-op stubs */
+static inline bool cpuidle_state_is_coupled(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int state)
+{
+	return false;
+}
+
+static inline int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int next_state)
+{
+	return -1;
+}
+
+static inline int cpuidle_coupled_register_device(struct cpuidle_device *dev)
+{
+	return 0;
+}
+
+static inline void cpuidle_coupled_unregister_device(struct cpuidle_device *dev)
+{
+}
+#endif /* CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED */
+
 #endif /* __DRIVER_CPUIDLE_H */
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 6c26a3da0e03..603844835bc7 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -57,6 +57,7 @@ struct cpuidle_state {
 
 /* Idle State Flags */
 #define CPUIDLE_FLAG_TIME_VALID	(0x01) /* is residency time measurable? */
+#define CPUIDLE_FLAG_COUPLED	(0x02) /* state applies to multiple cpus */
 
 #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000)
 
@@ -100,6 +101,12 @@ struct cpuidle_device {
 	struct list_head 	device_list;
 	struct kobject		kobj;
 	struct completion	kobj_unregister;
+
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+	int			safe_state_index;
+	cpumask_t		coupled_cpus;
+	struct cpuidle_coupled	*coupled;
+#endif
 };
 
 DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
-- 
cgit v1.2.3


From 4f51412c71edab8b3033ea97f84bc0b94470287b Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Wed, 14 Mar 2012 11:29:53 -0700
Subject: cpuidle: coupled: add parallel barrier function

Adds cpuidle_coupled_parallel_barrier, which can be used by coupled
cpuidle state enter functions to handle resynchronization after
determining if any cpu needs to abort.  The normal use case will
be:

static bool abort_flag;
static atomic_t abort_barrier;

int arch_cpuidle_enter(struct cpuidle_device *dev, ...)
{
	if (arch_turn_off_irq_controller()) {
	   	/* returns an error if an irq is pending and would be lost
		   if idle continued and turned off power */
		abort_flag = true;
	}

	cpuidle_coupled_parallel_barrier(dev, &abort_barrier);

	if (abort_flag) {
	   	/* One of the cpus didn't turn off its irq controller */
	   	arch_turn_on_irq_controller();
		return -EINTR;
	}

	/* continue with idle */
	...
}

This will cause all cpus to abort idle together if one of them needs
to abort.

Reviewed-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Tested-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Tested-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/cpuidle/coupled.c | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/cpuidle.h   |  4 ++++
 2 files changed, 41 insertions(+)

diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 4a7dcc41360d..fc427fa1fefb 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -129,6 +129,43 @@ static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
  */
 static cpumask_t cpuidle_coupled_poked_mask;
 
+/**
+ * cpuidle_coupled_parallel_barrier - synchronize all online coupled cpus
+ * @dev: cpuidle_device of the calling cpu
+ * @a:   atomic variable to hold the barrier
+ *
+ * No caller to this function will return from this function until all online
+ * cpus in the same coupled group have called this function.  Once any caller
+ * has returned from this function, the barrier is immediately available for
+ * reuse.
+ *
+ * The atomic variable @a must be initialized to 0 before any cpu calls this
+ * function, and will be reset to 0 before any cpu returns from it.
+ *
+ * Must only be called from within a coupled idle state handler
+ * (state.enter when state.flags has CPUIDLE_FLAG_COUPLED set).
+ *
+ * Provides full smp barrier semantics before and after calling.
+ */
+void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a)
+{
+	int n = dev->coupled->online_count;
+
+	smp_mb__before_atomic_inc();
+	atomic_inc(a);
+
+	while (atomic_read(a) < n)
+		cpu_relax();
+
+	if (atomic_inc_return(a) == n * 2) {
+		atomic_set(a, 0);
+		return;
+	}
+
+	while (atomic_read(a) > n)
+		cpu_relax();
+}
+
 /**
  * cpuidle_state_is_coupled - check if a state is part of a coupled set
  * @dev: struct cpuidle_device for the current cpu
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 603844835bc7..5ab7183313ce 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -183,6 +183,10 @@ static inline int cpuidle_play_dead(void) {return -ENODEV; }
 
 #endif
 
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a);
+#endif
+
 /******************************
  * CPUIDLE GOVERNOR INTERFACE *
  ******************************/
-- 
cgit v1.2.3


From 9bdbcb4f8fb35f35bf0be9d2db569563179ce5c1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Jul 2012 10:42:35 +0000
Subject: rcu: Yield simpler

The rcu_yield() code is amazing. It's there to avoid starvation of the
system when lots of (boosting) work is to be done.

Now looking at the code its functionality is:

 Make the thread SCHED_OTHER and very nice, i.e. get it out of the way
 Arm a timer with 2 ticks
 schedule()

Now if the system goes idle the rcu task returns, regains SCHED_FIFO
and plugs on. If the system stays busy the timer fires and wakes a
per node kthread which in turn makes the per cpu thread SCHED_FIFO and
brings it back on the cpu. For the boosting thread the "make it FIFO"
bit is missing and it just runs some magic boost checks. Now this is a
lot of code with extra threads and complexity.

It's way simpler to let the tasks when they detect overload schedule
away for 2 ticks and defer the normal wakeup as long as they are in
yielded state and the cpu is not idle.

That solves the same problem and the only difference is that when the
cpu goes idle it's not guaranteed that the thread returns right away,
but it won't be longer out than two ticks, so no harm is done. If
that's an issue then it is way simpler just to wake the task from
idle as RCU has callbacks there anyway.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rcutree.c        |   8 +-
 kernel/rcutree.h        |   7 +-
 kernel/rcutree_plugin.h | 210 +++++++++---------------------------------------
 3 files changed, 41 insertions(+), 184 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4b97bba7396e..cba883468548 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -131,7 +131,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
 
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
@@ -1462,7 +1462,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 
 	/* Adjust any no-longer-needed kthreads. */
 	rcu_stop_cpu_kthread(cpu);
-	rcu_node_kthread_setaffinity(rnp, -1);
+	rcu_boost_kthread_setaffinity(rnp, -1);
 
 	/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
 
@@ -2516,11 +2516,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 		break;
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		rcu_node_kthread_setaffinity(rnp, -1);
+		rcu_boost_kthread_setaffinity(rnp, -1);
 		rcu_cpu_kthread_setrt(cpu, 1);
 		break;
 	case CPU_DOWN_PREPARE:
-		rcu_node_kthread_setaffinity(rnp, cpu);
+		rcu_boost_kthread_setaffinity(rnp, cpu);
 		rcu_cpu_kthread_setrt(cpu, 0);
 		break;
 	case CPU_DYING:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 19b61ac1079f..816ed78858b1 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -479,13 +479,8 @@ static void invoke_rcu_callbacks_kthread(void);
 static bool rcu_is_callbacks_kthread(void);
 #ifdef CONFIG_RCU_BOOST
 static void rcu_preempt_do_callbacks(void);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
-					  cpumask_var_t cm);
 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
-						 struct rcu_node *rnp,
-						 int rnp_index);
-static void invoke_rcu_node_kthread(struct rcu_node *rnp);
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
+						 struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
 static void __cpuinit rcu_prepare_kthreads(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3e4899459f3d..95fd5634e541 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1225,6 +1225,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
 
 #endif /* #else #ifdef CONFIG_RCU_TRACE */
 
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+	/*
+	 * A yielding thread is only woken when this is invoked from the
+	 * idle task; in any other state it is woken unconditionally.
+	 */
+	if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
+		wake_up_process(t);
+}
+
 /*
  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1296,17 +1306,6 @@ static int rcu_boost(struct rcu_node *rnp)
 	       ACCESS_ONCE(rnp->boost_tasks) != NULL;
 }
 
-/*
- * Timer handler to initiate waking up of boost kthreads that
- * have yielded the CPU due to excessive numbers of tasks to
- * boost.  We wake up the per-rcu_node kthread, which in turn
- * will wake up the booster kthread.
- */
-static void rcu_boost_kthread_timer(unsigned long arg)
-{
-	invoke_rcu_node_kthread((struct rcu_node *)arg);
-}
-
 /*
  * Priority-boosting kthread.  One per leaf rcu_node and one for the
  * root rcu_node.
@@ -1330,8 +1329,9 @@ static int rcu_boost_kthread(void *arg)
 		else
 			spincnt = 0;
 		if (spincnt > 10) {
+			rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
 			trace_rcu_utilization("End boost kthread@rcu_yield");
-			rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+			schedule_timeout_interruptible(2);
 			trace_rcu_utilization("Start boost kthread@rcu_yield");
 			spincnt = 0;
 		}
@@ -1369,8 +1369,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 			rnp->boost_tasks = rnp->gp_tasks;
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		t = rnp->boost_kthread_task;
-		if (t != NULL)
-			wake_up_process(t);
+		if (t)
+			rcu_wake_cond(t, rnp->boost_kthread_status);
 	} else {
 		rcu_initiate_boost_trace(rnp);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1387,8 +1387,10 @@ static void invoke_rcu_callbacks_kthread(void)
 	local_irq_save(flags);
 	__this_cpu_write(rcu_cpu_has_work, 1);
 	if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
-	    current != __this_cpu_read(rcu_cpu_kthread_task))
-		wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+	    current != __this_cpu_read(rcu_cpu_kthread_task)) {
+		rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
+			      __this_cpu_read(rcu_cpu_kthread_status));
+	}
 	local_irq_restore(flags);
 }
 
@@ -1401,21 +1403,6 @@ static bool rcu_is_callbacks_kthread(void)
 	return __get_cpu_var(rcu_cpu_kthread_task) == current;
 }
 
-/*
- * Set the affinity of the boost kthread.  The CPU-hotplug locks are
- * held, so no one should be messing with the existence of the boost
- * kthread.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
-					  cpumask_var_t cm)
-{
-	struct task_struct *t;
-
-	t = rnp->boost_kthread_task;
-	if (t != NULL)
-		set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
-}
-
 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
 
 /*
@@ -1432,15 +1419,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
  * Returns zero if all is well, a negated errno otherwise.
  */
 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
-						 struct rcu_node *rnp,
-						 int rnp_index)
+						 struct rcu_node *rnp)
 {
+	int rnp_index = rnp - &rsp->node[0];
 	unsigned long flags;
 	struct sched_param sp;
 	struct task_struct *t;
 
 	if (&rcu_preempt_state != rsp)
 		return 0;
+
+	if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+		return 0;
+
 	rsp->boost = 1;
 	if (rnp->boost_kthread_task != NULL)
 		return 0;
@@ -1483,20 +1474,6 @@ static void rcu_kthread_do_work(void)
 	rcu_preempt_do_callbacks();
 }
 
-/*
- * Wake up the specified per-rcu_node-structure kthread.
- * Because the per-rcu_node kthreads are immortal, we don't need
- * to do anything to keep them alive.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
-{
-	struct task_struct *t;
-
-	t = rnp->node_kthread_task;
-	if (t != NULL)
-		wake_up_process(t);
-}
-
 /*
  * Set the specified CPU's kthread to run RT or not, as specified by
  * the to_rt argument.  The CPU-hotplug locks are held, so the task
@@ -1521,45 +1498,6 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
 	sched_setscheduler_nocheck(t, policy, &sp);
 }
 
-/*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
-{
-	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
-	struct rcu_node *rnp = rdp->mynode;
-
-	atomic_or(rdp->grpmask, &rnp->wakemask);
-	invoke_rcu_node_kthread(rnp);
-}
-
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted.  Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
-{
-	struct sched_param sp;
-	struct timer_list yield_timer;
-	int prio = current->rt_priority;
-
-	setup_timer_on_stack(&yield_timer, f, arg);
-	mod_timer(&yield_timer, jiffies + 2);
-	sp.sched_priority = 0;
-	sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
-	set_user_nice(current, 19);
-	schedule();
-	set_user_nice(current, 0);
-	sp.sched_priority = prio;
-	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-	del_timer(&yield_timer);
-}
-
 /*
  * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
  * This can happen while the corresponding CPU is either coming online
@@ -1632,7 +1570,7 @@ static int rcu_cpu_kthread(void *arg)
 		if (spincnt > 10) {
 			*statusp = RCU_KTHREAD_YIELDING;
 			trace_rcu_utilization("End CPU kthread@rcu_yield");
-			rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+			schedule_timeout_interruptible(2);
 			trace_rcu_utilization("Start CPU kthread@rcu_yield");
 			spincnt = 0;
 		}
@@ -1688,48 +1626,6 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
 	return 0;
 }
 
-/*
- * Per-rcu_node kthread, which is in charge of waking up the per-CPU
- * kthreads when needed.  We ignore requests to wake up kthreads
- * for offline CPUs, which is OK because force_quiescent_state()
- * takes care of this case.
- */
-static int rcu_node_kthread(void *arg)
-{
-	int cpu;
-	unsigned long flags;
-	unsigned long mask;
-	struct rcu_node *rnp = (struct rcu_node *)arg;
-	struct sched_param sp;
-	struct task_struct *t;
-
-	for (;;) {
-		rnp->node_kthread_status = RCU_KTHREAD_WAITING;
-		rcu_wait(atomic_read(&rnp->wakemask) != 0);
-		rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		mask = atomic_xchg(&rnp->wakemask, 0);
-		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
-		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
-			if ((mask & 0x1) == 0)
-				continue;
-			preempt_disable();
-			t = per_cpu(rcu_cpu_kthread_task, cpu);
-			if (!cpu_online(cpu) || t == NULL) {
-				preempt_enable();
-				continue;
-			}
-			per_cpu(rcu_cpu_has_work, cpu) = 1;
-			sp.sched_priority = RCU_KTHREAD_PRIO;
-			sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-			preempt_enable();
-		}
-	}
-	/* NOTREACHED */
-	rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
-	return 0;
-}
-
 /*
  * Set the per-rcu_node kthread's affinity to cover all CPUs that are
  * served by the rcu_node in question.  The CPU hotplug lock is still
@@ -1739,17 +1635,17 @@ static int rcu_node_kthread(void *arg)
  * no outgoing CPU.  If there are no CPUs left in the affinity set,
  * this function allows the kthread to execute on any CPU.
  */
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 {
+	struct task_struct *t = rnp->boost_kthread_task;
+	unsigned long mask = rnp->qsmaskinit;
 	cpumask_var_t cm;
 	int cpu;
-	unsigned long mask = rnp->qsmaskinit;
 
-	if (rnp->node_kthread_task == NULL)
+	if (!t)
 		return;
-	if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
 		return;
-	cpumask_clear(cm);
 	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
 		if ((mask & 0x1) && cpu != outgoingcpu)
 			cpumask_set_cpu(cpu, cm);
@@ -1759,50 +1655,17 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 			cpumask_clear_cpu(cpu, cm);
 		WARN_ON_ONCE(cpumask_weight(cm) == 0);
 	}
-	set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
-	rcu_boost_kthread_setaffinity(rnp, cm);
+	set_cpus_allowed_ptr(t, cm);
 	free_cpumask_var(cm);
 }
 
-/*
- * Spawn a per-rcu_node kthread, setting priority and affinity.
- * Called during boot before online/offline can happen, or, if
- * during runtime, with the main CPU-hotplug locks held.  So only
- * one of these can be executing at a time.
- */
-static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
-						struct rcu_node *rnp)
-{
-	unsigned long flags;
-	int rnp_index = rnp - &rsp->node[0];
-	struct sched_param sp;
-	struct task_struct *t;
-
-	if (!rcu_scheduler_fully_active ||
-	    rnp->qsmaskinit == 0)
-		return 0;
-	if (rnp->node_kthread_task == NULL) {
-		t = kthread_create(rcu_node_kthread, (void *)rnp,
-				   "rcun/%d", rnp_index);
-		if (IS_ERR(t))
-			return PTR_ERR(t);
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		rnp->node_kthread_task = t;
-		raw_spin_unlock_irqrestore(&rnp->lock, flags);
-		sp.sched_priority = 99;
-		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-		wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
-	}
-	return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
-}
-
 /*
  * Spawn all kthreads -- called as soon as the scheduler is running.
  */
 static int __init rcu_spawn_kthreads(void)
 {
-	int cpu;
 	struct rcu_node *rnp;
+	int cpu;
 
 	rcu_scheduler_fully_active = 1;
 	for_each_possible_cpu(cpu) {
@@ -1811,10 +1674,10 @@ static int __init rcu_spawn_kthreads(void)
 			(void)rcu_spawn_one_cpu_kthread(cpu);
 	}
 	rnp = rcu_get_root(rcu_state);
-	(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+	(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
 	if (NUM_RCU_NODES > 1) {
 		rcu_for_each_leaf_node(rcu_state, rnp)
-			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+			(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
 	}
 	return 0;
 }
@@ -1828,8 +1691,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
 	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
 	if (rcu_scheduler_fully_active) {
 		(void)rcu_spawn_one_cpu_kthread(cpu);
-		if (rnp->node_kthread_task == NULL)
-			(void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+		(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
 	}
 }
 
@@ -1862,7 +1724,7 @@ static void rcu_stop_cpu_kthread(int cpu)
 
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 {
 }
 
-- 
cgit v1.2.3


From c19731de7edef5c378574340e414a514ba79504a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Jul 2012 10:42:36 +0000
Subject: kthread: Implement park/unpark facility

To avoid the full teardown/setup of per cpu kthreads in the case of
cpu hot(un)plug, provide a facility which allows putting the kthread
into a park position and unparking it when the cpu comes online again.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
---
 include/linux/kthread.h |  11 ++-
 kernel/kthread.c        | 185 +++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 176 insertions(+), 20 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 0714b24c0e45..5365347a0dc9 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -14,6 +14,11 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
 
 
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+					  void *data,
+					  unsigned int cpu,
+					  const char *namefmt);
+
 /**
  * kthread_run - create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
@@ -34,9 +39,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 int kthread_stop(struct task_struct *k);
-int kthread_should_stop(void);
+bool kthread_should_stop(void);
+bool kthread_should_park(void);
 bool kthread_freezable_should_stop(bool *was_frozen);
 void *kthread_data(struct task_struct *k);
+int kthread_park(struct task_struct *k);
+void kthread_unpark(struct task_struct *k);
+void kthread_parkme(void);
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..571fa798323b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -37,11 +37,20 @@ struct kthread_create_info
 };
 
 struct kthread {
-	int should_stop;
+	unsigned long flags;
+	unsigned int cpu;
 	void *data;
+	struct completion parked;
 	struct completion exited;
 };
 
+enum KTHREAD_BITS {
+	KTHREAD_IS_PER_CPU = 0,
+	KTHREAD_SHOULD_STOP,
+	KTHREAD_SHOULD_PARK,
+	KTHREAD_IS_PARKED,
+};
+
 #define to_kthread(tsk)	\
 	container_of((tsk)->vfork_done, struct kthread, exited)
 
@@ -52,12 +61,28 @@ struct kthread {
  * and this will return true.  You should then return, and your return
  * value will be passed through to kthread_stop().
  */
-int kthread_should_stop(void)
+bool kthread_should_stop(void)
 {
-	return to_kthread(current)->should_stop;
+	return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
 }
 EXPORT_SYMBOL(kthread_should_stop);
 
+/**
+ * kthread_should_park - should this kthread park now?
+ *
+ * When someone calls kthread_park() on your kthread, it will be woken
+ * and this will return true.  You should then do the necessary
+ * cleanup and call kthread_parkme()
+ *
+ * Similar to kthread_should_stop(), but this keeps the thread alive
+ * and in a park position. kthread_unpark() "restarts" the thread and
+ * calls the thread function again.
+ */
+bool kthread_should_park(void)
+{
+	return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+}
+
 /**
  * kthread_freezable_should_stop - should this freezable kthread return now?
  * @was_frozen: optional out parameter, indicates whether %current was frozen
@@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task)
 	return to_kthread(task)->data;
 }
 
+static void __kthread_parkme(struct kthread *self)
+{
+	__set_current_state(TASK_INTERRUPTIBLE);
+	while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
+		if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
+			complete(&self->parked);
+		schedule();
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+	clear_bit(KTHREAD_IS_PARKED, &self->flags);
+	__set_current_state(TASK_RUNNING);
+}
+
+void kthread_parkme(void)
+{
+	__kthread_parkme(to_kthread(current));
+}
+
 static int kthread(void *_create)
 {
 	/* Copy data: it's on kthread's stack */
@@ -105,9 +148,10 @@ static int kthread(void *_create)
 	struct kthread self;
 	int ret;
 
-	self.should_stop = 0;
+	self.flags = 0;
 	self.data = data;
 	init_completion(&self.exited);
+	init_completion(&self.parked);
 	current->vfork_done = &self.exited;
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
@@ -117,9 +161,11 @@ static int kthread(void *_create)
 	schedule();
 
 	ret = -EINTR;
-	if (!self.should_stop)
-		ret = threadfn(data);
 
+	if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
+		__kthread_parkme(&self);
+		ret = threadfn(data);
+	}
 	/* we can't just return, we must preserve "self" on stack */
 	do_exit(ret);
 }
@@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create)
  * Returns a task_struct or ERR_PTR(-ENOMEM).
  */
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
-					   void *data,
-					   int node,
+					   void *data, int node,
 					   const char namefmt[],
 					   ...)
 {
@@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
+static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+	/* It's safe because the task is inactive. */
+	do_set_cpus_allowed(p, cpumask_of(cpu));
+	p->flags |= PF_THREAD_BOUND;
+}
+
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
  * @p: thread created by kthread_create().
@@ -226,13 +278,111 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
 		WARN_ON(1);
 		return;
 	}
-
-	/* It's safe because the task is inactive. */
-	do_set_cpus_allowed(p, cpumask_of(cpu));
-	p->flags |= PF_THREAD_BOUND;
+	__kthread_bind(p, cpu);
 }
 EXPORT_SYMBOL(kthread_bind);
 
+/**
+ * kthread_create_on_cpu - Create a cpu bound kthread
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @cpu: The cpu on which the thread should be bound,
+ * @namefmt: printf-style name for the thread. Format is restricted
+ *	     to "name.*%u". Code fills in cpu number.
+ *
+ * Description: This helper function creates and names a kernel thread.
+ * The thread will be woken and put into park mode.
+ */
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+					  void *data, unsigned int cpu,
+					  const char *namefmt)
+{
+	struct task_struct *p;
+
+	p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
+				   cpu);
+	if (IS_ERR(p))
+		return p;
+	set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+	to_kthread(p)->cpu = cpu;
+	/* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
+	kthread_park(p);
+	return p;
+}
+
+static struct kthread *task_get_live_kthread(struct task_struct *k)
+{
+	struct kthread *kthread;
+
+	get_task_struct(k);
+	kthread = to_kthread(k);
+	/* It might have exited */
+	barrier();
+	if (k->vfork_done != NULL)
+		return kthread;
+	return NULL;
+}
+
+/**
+ * kthread_unpark - unpark a thread created by kthread_create().
+ * @k:		thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return false and wakes it, without
+ * waiting for it to leave the park code. If the thread is marked percpu
+ * then it is bound to the cpu again.
+ */
+void kthread_unpark(struct task_struct *k)
+{
+	struct kthread *kthread = task_get_live_kthread(k);
+
+	if (kthread) {
+		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+		/*
+		 * We clear the IS_PARKED bit here as we don't wait
+		 * until the task has left the park code. So if we'd
+		 * park before that happens we'd see the IS_PARKED bit
+		 * which might be about to be cleared.
+		 */
+		if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+			if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+				__kthread_bind(k, kthread->cpu);
+			wake_up_process(k);
+		}
+	}
+	put_task_struct(k);
+}
+
+/**
+ * kthread_park - park a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return true, wakes it, and
+ * waits for it to park. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will park without
+ * calling threadfn().
+ *
+ * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
+ * If called by the kthread itself just the park bit is set.
+ */
+int kthread_park(struct task_struct *k)
+{
+	struct kthread *kthread = task_get_live_kthread(k);
+	int ret = -ENOSYS;
+
+	if (kthread) {
+		if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+			set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+			if (k != current) {
+				wake_up_process(k);
+				wait_for_completion(&kthread->parked);
+			}
+		}
+		ret = 0;
+	}
+	put_task_struct(k);
+	return ret;
+}
+
 /**
  * kthread_stop - stop a thread created by kthread_create().
  * @k: thread created by kthread_create().
@@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind);
  */
 int kthread_stop(struct task_struct *k)
 {
-	struct kthread *kthread;
+	struct kthread *kthread = task_get_live_kthread(k);
 	int ret;
 
 	trace_sched_kthread_stop(k);
-	get_task_struct(k);
-
-	kthread = to_kthread(k);
-	barrier(); /* it might have exited */
-	if (k->vfork_done != NULL) {
-		kthread->should_stop = 1;
+	if (kthread) {
+		set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
 		wake_up_process(k);
 		wait_for_completion(&kthread->exited);
 	}
-- 
cgit v1.2.3


From c4c7fc8864a71329dad7c1bd68d033c3cdb84911 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: track the runnable average on a per-task entity basis

Instead of tracking the average load parented by a cfs_rq, we can track
entity load directly, with the load for a given cfs_rq then being the sum of
its children.

To do this we represent the historical contribution to runnable average within each
trailing 1024us of execution as the coefficients of a geometric series.

We can express this for a given task t as:
  runnable_sum(t) = \Sum u_i * y^i ,
  load(t) = weight_t * runnable_sum(t) / (\Sum 1024 * y^i)

Where: u_i is the usage in the last i`th 1024us period (approximately 1ms)
and y is chosen such that y^k = 1/2.  We currently choose k to be 32 which
roughly translates to about a sched period.

Signed-off-by: Paul Turner <pjt@google.com>
---
 include/linux/sched.h |   8 ++++
 kernel/sched/debug.c  |   4 ++
 kernel/sched/fair.c   | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 140 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a1f493e0fef..a8640c411f46 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1134,6 +1134,11 @@ struct load_weight {
 	unsigned long weight, inv_weight;
 };
 
+struct sched_avg {
+	u32 runnable_avg_sum, runnable_avg_period;
+	u64 last_runnable_update;
+};
+
 #ifdef CONFIG_SCHEDSTATS
 struct sched_statistics {
 	u64			wait_start;
@@ -1194,6 +1199,9 @@ struct sched_entity {
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
 #endif
+#ifdef CONFIG_SMP
+	struct sched_avg	avg;
+#endif
 };
 
 struct sched_rt_entity {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea9..61f70979153a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	P(se->statistics.wait_count);
 #endif
 	P(se->load.weight);
+#ifdef CONFIG_SMP
+	P(se->avg.runnable_avg_sum);
+	P(se->avg.runnable_avg_period);
+#endif
 #undef PN
 #undef P
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..4704785d54f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -971,6 +971,125 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_SMP
+/*
+ * Approximate:
+ *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, int n)
+{
+	for (; n && val; n--) {
+		val *= 4008;
+		val >>= 12;
+	}
+
+	return val;
+}
+
+/* We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series.  To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ *      p0            p1           p2
+ *     (now)       (~1ms ago)  (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ *   y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+							struct sched_avg *sa,
+							int runnable)
+{
+	u64 delta;
+	int delta_w, decayed = 0;
+
+	delta = now - sa->last_runnable_update;
+	/*
+	 * This should only happen when time goes backwards, which it
+	 * unfortunately does during sched clock init when we swap over to TSC.
+	 */
+	if ((s64)delta < 0) {
+		sa->last_runnable_update = now;
+		return 0;
+	}
+
+	/*
+	 * Use 1024ns as the unit of measurement since it's a reasonable
+	 * approximation of 1us and fast to compute.
+	 */
+	delta >>= 10;
+	if (!delta)
+		return 0;
+	sa->last_runnable_update = now;
+
+	/* delta_w is the amount already accumulated against our next period */
+	delta_w = sa->runnable_avg_period % 1024;
+	if (delta + delta_w >= 1024) {
+		/* period roll-over */
+		decayed = 1;
+
+		/*
+		 * Now that we know we're crossing a period boundary, figure
+		 * out how much from delta we need to complete the current
+		 * period and accrue it.
+		 */
+		delta_w = 1024 - delta_w;
+		BUG_ON(delta_w > delta);
+		do {
+			if (runnable)
+				sa->runnable_avg_sum += delta_w;
+			sa->runnable_avg_period += delta_w;
+
+			/*
+			 * Remainder of delta initiates a new period, roll over
+			 * the previous.
+			 */
+			sa->runnable_avg_sum =
+				decay_load(sa->runnable_avg_sum, 1);
+			sa->runnable_avg_period =
+				decay_load(sa->runnable_avg_period, 1);
+
+			delta -= delta_w;
+			/* New period is empty */
+			delta_w = 1024;
+		} while (delta >= 1024);
+	}
+
+	/* Remainder of delta accrued against u_0` */
+	if (runnable)
+		sa->runnable_avg_sum += delta;
+	sa->runnable_avg_period += delta;
+
+	return decayed;
+}
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se)
+{
+	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
+				     se->on_rq);
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se) {}
+#endif
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -1097,6 +1216,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
+	update_entity_load_avg(se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1171,6 +1291,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_entity_load_avg(se);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -1340,6 +1461,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
+		/* in !on_rq case, update occurred at dequeue */
+		update_entity_load_avg(prev);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -1352,6 +1475,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_curr(cfs_rq);
 
+	/*
+	 * Ensure that runnable average is periodically updated.
+	 */
+	update_entity_load_avg(curr);
+
 	/*
 	 * Update share accounting for long-running entities.
 	 */
-- 
cgit v1.2.3


From 073b20041be06539b1d6818db2c62029d3090970 Mon Sep 17 00:00:00 2001
From: Ben Segall <bsegall@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: maintain per-rq runnable averages

Since runqueues do not have a corresponding sched_entity we instead embed a
sched_avg structure directly.

Signed-off-by: Ben Segall <bsegall@google.com>
Signed-off-by: Paul Turner <pjt@google.com>
---
 kernel/sched/debug.c | 10 ++++++++--
 kernel/sched/fair.c  | 18 ++++++++++++++++--
 kernel/sched/sched.h |  2 ++
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 61f70979153a..4240abce4116 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
 static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
 {
 	struct sched_entity *se = tg->se[cpu];
-	if (!se)
-		return;
 
 #define P(F) \
 	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
 #define PN(F) \
 	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
 
+	if (!se) {
+		struct sched_avg *avg = &cpu_rq(cpu)->avg;
+		P(avg->runnable_avg_sum);
+		P(avg->runnable_avg_period);
+		return;
+	}
+
+
 	PN(se->exec_start);
 	PN(se->vruntime);
 	PN(se->sum_exec_runtime);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4704785d54f9..f04ceeccad48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1086,8 +1086,14 @@ static inline void update_entity_load_avg(struct sched_entity *se)
 	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
 				     se->on_rq);
 }
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+}
 #else
 static inline void update_entity_load_avg(struct sched_entity *se) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2339,8 +2345,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_cfs_shares(cfs_rq);
 	}
 
-	if (!se)
+	if (!se) {
+		update_rq_runnable_avg(rq, rq->nr_running);
 		inc_nr_running(rq);
+	}
 	hrtick_update(rq);
 }
 
@@ -2398,8 +2406,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_cfs_shares(cfs_rq);
 	}
 
-	if (!se)
+	if (!se) {
 		dec_nr_running(rq);
+		update_rq_runnable_avg(rq, 1);
+	}
 	hrtick_update(rq);
 }
 
@@ -4537,6 +4547,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
 		return;
 
+	update_rq_runnable_avg(this_rq, 1);
+
 	/*
 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
 	 */
@@ -5035,6 +5047,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	update_rq_runnable_avg(rq, 1);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..246ccc70bf17 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -463,6 +463,8 @@ struct rq {
 #ifdef CONFIG_SMP
 	struct llist_head wake_list;
 #endif
+
+	struct sched_avg avg;
 };
 
 static inline int cpu_of(struct rq *rq)
-- 
cgit v1.2.3


From 2ac17c05e1ef68f73ec7553623ca1c052e8cf65c Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: aggregate load contributed by task entities on parenting
 cfs_rq

For a given task t, we can compute its contribution to load as:
  task_load(t) = runnable_avg(t) * weight(t)

On a parenting cfs_rq we can then aggregate
  runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t

Maintain this bottom up, with task entities adding their contributed load to
the parenting cfs_rq sum.  When a task entity's load changes we add the same
delta to the maintained sum.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 include/linux/sched.h |  1 +
 kernel/sched/debug.c  |  3 +++
 kernel/sched/fair.c   | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h  |  9 +++++++++
 4 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a8640c411f46..0575e1f31a67 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1137,6 +1137,7 @@ struct load_weight {
 struct sched_avg {
 	u32 runnable_avg_sum, runnable_avg_period;
 	u64 last_runnable_update;
+	unsigned long load_avg_contrib;
 };
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4240abce4116..c953a89f94aa 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #ifdef CONFIG_SMP
 	P(se->avg.runnable_avg_sum);
 	P(se->avg.runnable_avg_period);
+	P(se->avg.load_avg_contrib);
 #endif
 #undef PN
 #undef P
@@ -224,6 +225,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->load_contribution);
 	SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
 			atomic_read(&cfs_rq->tg->load_weight));
+	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
+			cfs_rq->runnable_load_avg);
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f04ceeccad48..6437398002ba 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1080,20 +1080,63 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 	return decayed;
 }
 
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+	long old_contrib = se->avg.load_avg_contrib;
+
+	if (!entity_is_task(se))
+		return 0;
+
+	se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum *
+					     se->load.weight,
+					     se->avg.runnable_avg_period + 1);
+
+	return se->avg.load_avg_contrib - old_contrib;
+}
+
 /* Update a sched_entity's runnable average */
 static inline void update_entity_load_avg(struct sched_entity *se)
 {
-	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
-				     se->on_rq);
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	long contrib_delta;
+
+	if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg,
+					  se->on_rq))
+		return;
+
+	contrib_delta = __update_entity_load_avg_contrib(se);
+	if (se->on_rq)
+		cfs_rq->runnable_load_avg += contrib_delta;
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
 }
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+						  struct sched_entity *se)
+{
+	update_entity_load_avg(se);
+	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+}
+
+/* Remove se's load from this cfs_rq child load-average */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+						  struct sched_entity *se)
+{
+	update_entity_load_avg(se);
+	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+}
 #else
 static inline void update_entity_load_avg(struct sched_entity *se) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+						  struct sched_entity *se) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+						  struct sched_entity *se) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1222,7 +1265,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	update_entity_load_avg(se);
+	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1297,7 +1340,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	update_entity_load_avg(se);
+	dequeue_entity_load_avg(cfs_rq, se);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 246ccc70bf17..8c3fd2cece4d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -222,6 +222,15 @@ struct cfs_rq {
 	unsigned int nr_spread_over;
 #endif
 
+#ifdef CONFIG_SMP
+	/*
+	 * CFS Load tracking
+	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
+	 * This allows for the description of both thread and group usage (in
+	 * the FAIR_GROUP_SCHED case).
+	 */
+	u64 runnable_load_avg;
+#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
 
-- 
cgit v1.2.3


From ee80663075d4e905a452273d4b143b8bea9ad0b5 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: maintain the load contribution of blocked entities

We are currently maintaining:
  runnable_load(cfs_rq) = \Sum task_load(t)

For all running children t of cfs_rq.  While this can be naturally updated for
tasks in a runnable state (as they are scheduled); this does not account for
the load contributed by blocked task entities.

This can be solved by introducing a separate accounting for blocked load:
  blocked_load(cfs_rq) = \Sum runnable(b) * weight(b)

Obviously we do not want to iterate over all blocked entities to account for
their decay, we instead observe that:
  runnable_load(t) = \Sum p_i*y^i

and that to account for an additional idle period we only need to compute:
  y*runnable_load(t).

This means that we can compute all blocked entities at once by evaluating:
  blocked_load(cfs_rq)` = y * blocked_load(cfs_rq)

Finally we maintain a decay counter so that when a sleeping entity re-awakens
we can determine how much of its load should be removed from the blocked sum.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 include/linux/sched.h |   1 +
 kernel/sched/core.c   |   3 ++
 kernel/sched/debug.c  |   3 ++
 kernel/sched/fair.c   | 130 ++++++++++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h  |   4 +-
 5 files changed, 126 insertions(+), 15 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0575e1f31a67..e86cb147ca7f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1137,6 +1137,7 @@ struct load_weight {
 struct sched_avg {
 	u32 runnable_avg_sum, runnable_avg_period;
 	u64 last_runnable_update;
+	s64 decay_count;
 	unsigned long load_avg_contrib;
 };
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..70bda6293944 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1713,6 +1713,9 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+       p->se.avg.decay_count = 0;
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c953a89f94aa..2d2e2b3c1bef 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	P(se->avg.runnable_avg_sum);
 	P(se->avg.runnable_avg_period);
 	P(se->avg.load_avg_contrib);
+	P(se->avg.decay_count);
 #endif
 #undef PN
 #undef P
@@ -227,6 +228,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			atomic_read(&cfs_rq->tg->load_weight));
 	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
 			cfs_rq->runnable_load_avg);
+	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
+			cfs_rq->blocked_load_avg);
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6437398002ba..029c31af1aef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1080,6 +1080,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 	return decayed;
 }
 
+/* Synchronize an entity's decay with its parentin cfs_rq.*/
+static inline void __synchronize_entity_decay(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+       decays -= se->avg.decay_count;
+       if (!decays)
+               return;
+
+       se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+       se->avg.decay_count += decays;
+}
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
@@ -1095,8 +1109,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	return se->avg.load_avg_contrib - old_contrib;
 }
 
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+                                                long load_contrib)
+{
+       if (likely(load_contrib < cfs_rq->blocked_load_avg))
+               cfs_rq->blocked_load_avg -= load_contrib;
+       else
+               cfs_rq->blocked_load_avg = 0;
+}
+
 /* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se)
+static inline void update_entity_load_avg(struct sched_entity *se,
+                                         int update_cfs_rq)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	long contrib_delta;
@@ -1106,8 +1130,34 @@ static inline void update_entity_load_avg(struct sched_entity *se)
 		return;
 
 	contrib_delta = __update_entity_load_avg_contrib(se);
+
+	if (!update_cfs_rq)
+		return;
+
 	if (se->on_rq)
 		cfs_rq->runnable_load_avg += contrib_delta;
+	else
+		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * they their contribution may appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)
+{
+	u64 now = rq_of(cfs_rq)->clock_task >> 20;
+	u64 decays;
+
+	decays = now - cfs_rq->last_decay;
+	if (!decays)
+		return;
+
+	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+					      decays);
+	atomic64_add(decays, &cfs_rq->decay_counter);
+
+	cfs_rq->last_decay = now;
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1117,26 +1167,56 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 
 /* Add the load generated by se into cfs_rq's child load-average */
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se)
-{
-	update_entity_load_avg(se);
+						  struct sched_entity *se,
+						  int wakeup)
+{
+	/* we track migrations using entity decay_count == 0 */
+	if (unlikely(!se->avg.decay_count)) {
+		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+		wakeup = 0;
+	} else {
+		__synchronize_entity_decay(se);
+	}
+
+	if (wakeup)
+		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+
+	update_entity_load_avg(se, 0);
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+	update_cfs_rq_blocked_load(cfs_rq);
 }
 
-/* Remove se's load from this cfs_rq child load-average */
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se)
+						  struct sched_entity *se,
+						  int sleep)
 {
-	update_entity_load_avg(se);
+	update_entity_load_avg(se, 1);
+
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+	if (sleep) {
+		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+	} else {
+		se->avg.decay_count = 0;
+	}
 }
 #else
-static inline void update_entity_load_avg(struct sched_entity *se) {}
+static inline void update_entity_load_avg(struct sched_entity *se,
+					  int update_cfs_rq) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se) {}
+					  struct sched_entity *se,
+					  int wakeup) {}
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se) {}
+					  struct sched_entity *se,
+					  int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1265,7 +1345,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	enqueue_entity_load_avg(cfs_rq, se);
+	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1340,7 +1420,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	dequeue_entity_load_avg(cfs_rq, se);
+	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -1511,7 +1591,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
-		update_entity_load_avg(prev);
+		update_entity_load_avg(prev, 1);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -1527,7 +1607,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Ensure that runnable average is periodically updated.
 	 */
-	update_entity_load_avg(curr);
+	update_entity_load_avg(curr, 1);
+	update_cfs_rq_blocked_load(cfs_rq);
 
 	/*
 	 * Update share accounting for long-running entities.
@@ -2386,6 +2467,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
+		update_entity_load_avg(se, 1);
 	}
 
 	if (!se) {
@@ -2447,6 +2529,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
+		update_entity_load_avg(se, 1);
 	}
 
 	if (!se) {
@@ -3497,6 +3580,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 
 	update_rq_clock(rq);
 	update_cfs_load(cfs_rq, 1);
+	update_cfs_rq_blocked_load(cfs_rq);
 
 	/*
 	 * We need to update shares after updating tg->load_weight in
@@ -5184,6 +5268,21 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 		place_entity(cfs_rq, se, 0);
 		se->vruntime -= cfs_rq->min_vruntime;
 	}
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	/*
+	 * Remove our load from contribution when we leave sched_fair
+	 * and ensure we don't carry in an old decay_count if we
+	 * switch back.
+	 */
+	if (p->se.avg.decay_count) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+		__synchronize_entity_decay(&p->se);
+		subtract_blocked_load_contrib(cfs_rq,
+				p->se.avg.load_avg_contrib);
+		p->se.avg.decay_count = 0;
+	}
+#endif
 }
 
 /*
@@ -5230,6 +5329,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+       atomic64_set(&cfs_rq->decay_counter, 1);
+#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8c3fd2cece4d..eb85e6502214 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -229,7 +229,9 @@ struct cfs_rq {
 	 * This allows for the description of both thread and group usage (in
 	 * the FAIR_GROUP_SCHED case).
 	 */
-	u64 runnable_load_avg;
+	u64 runnable_load_avg, blocked_load_avg;
+	atomic64_t decay_counter;
+	u64 last_decay;
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
-- 
cgit v1.2.3


From ea9f7f32734a22df2bab3d4410590d34b30e7515 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: add an rq migration call-back to sched_class

Since we are now doing bottom up load accumulation we need explicit
notification when a task has been re-parented so that the old hierarchy can be
updated.

Adds migrate_task_rq(struct task_struct *p, int next_cpu);

(The alternative is to do this out of __set_task_cpu, but it was suggested that
this would be a cleaner encapsulation.)

Signed-off-by: Paul Turner <pjt@google.com>
---
 include/linux/sched.h |  1 +
 kernel/sched/core.c   |  2 ++
 kernel/sched/fair.c   | 12 ++++++++++++
 3 files changed, 15 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e86cb147ca7f..4a2aa19e9d9a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1100,6 +1100,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 70bda6293944..9df64ea993fa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1109,6 +1109,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
+		if (p->sched_class->migrate_task_rq)
+			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 029c31af1aef..12f50bb42630 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3073,6 +3073,17 @@ unlock:
 
 	return new_cpu;
 }
+
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including rq->lock state, should be made.
+ * Caller guarantees p->pi_lock held, but nothing else.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu) {
+}
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -5559,6 +5570,7 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
+	.migrate_task_rq	= migrate_task_rq_fair,
 
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
-- 
cgit v1.2.3


From 8f313b242a85abbcdb89bc574f5d4fcf57ca929a Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: account for blocked load waking back up

When a running entity blocks we migrate its tracked load to
cfs_rq->blocked_load_avg.  In the sleep case this occurs while holding
rq->lock and so is a natural transition.  Wake-ups however, are potentially
asynchronous in the presence of migration and so special care must be taken.

We use an atomic counter to track such migrated load, taking care to match this
with the previously introduced decay counters so that we don't migrate too much
load.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/fair.c  | 96 +++++++++++++++++++++++++++++++++++++++++-----------
 kernel/sched/sched.h |  2 +-
 2 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 12f50bb42630..2ef413f6e56b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1081,17 +1081,19 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 }
 
 /* Synchronize an entity's decay with its parentin cfs_rq.*/
-static inline void __synchronize_entity_decay(struct sched_entity *se)
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 decays = atomic64_read(&cfs_rq->decay_counter);
 
        decays -= se->avg.decay_count;
        if (!decays)
-               return;
+               return 0;
 
        se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
        se->avg.decay_count += decays;
+
+       return decays;
 }
 
 /* Compute the current contribution to load_avg by se, return any delta */
@@ -1144,20 +1146,26 @@ static inline void update_entity_load_avg(struct sched_entity *se,
  * Decay the load contributed by all blocked children and account this so that
  * they their contribution may appropriately discounted when they wake up.
  */
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 {
 	u64 now = rq_of(cfs_rq)->clock_task >> 20;
 	u64 decays;
 
 	decays = now - cfs_rq->last_decay;
-	if (!decays)
+	if (!decays && !force_update)
 		return;
 
-	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
-					      decays);
-	atomic64_add(decays, &cfs_rq->decay_counter);
+	if (atomic64_read(&cfs_rq->removed_load)) {
+		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+		subtract_blocked_load_contrib(cfs_rq, removed_load);
+	}
 
-	cfs_rq->last_decay = now;
+	if (decays) {
+		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+						      decays);
+		atomic64_add(decays, &cfs_rq->decay_counter);
+		cfs_rq->last_decay = now;
+	}
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1170,21 +1178,41 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 						  struct sched_entity *se,
 						  int wakeup)
 {
-	/* we track migrations using entity decay_count == 0 */
-	if (unlikely(!se->avg.decay_count)) {
+	/*
+	 * We track migrations using entity decay_count <= 0, on a wake-up
+	 * migration we use a negative decay count to track the remote decays
+	 * accumulated while sleeping.
+	 */
+	if (unlikely(se->avg.decay_count <= 0)) {
 		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+		if (se->avg.decay_count) {
+			/*
+			 * In a wake-up migration we have to approximate the
+			 * time sleeping.  This is because we can't synchronize
+			 * clock_task between the two cpus, and it is not
+			 * guaranteed to be read-safe.  Instead, we can
+			 * approximate this using our carried decays, which are
+			 * explicitly atomically readable.
+			 */
+			se->avg.last_runnable_update -= (-se->avg.decay_count)
+							<< 20;
+			update_entity_load_avg(se, 0);
+		}
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 		wakeup = 0;
 	} else {
 		__synchronize_entity_decay(se);
 	}
 
-	if (wakeup)
+	/* migrated tasks did not contribute to our blocked load */
+	if (wakeup) {
 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+		update_entity_load_avg(se, 0);
+	}
 
-	update_entity_load_avg(se, 0);
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
-	update_cfs_rq_blocked_load(cfs_rq);
+	/* we force update consideration on load-balancer moves */
+	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
 
 /*
@@ -1197,6 +1225,8 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 						  int sleep)
 {
 	update_entity_load_avg(se, 1);
+	/* we force update consideration on load-balancer moves */
+	update_cfs_rq_blocked_load(cfs_rq, !sleep);
 
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
 	if (sleep) {
@@ -1216,7 +1246,8 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 					  struct sched_entity *se,
 					  int sleep) {}
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+					     int force_update) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1608,7 +1639,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_entity_load_avg(curr, 1);
-	update_cfs_rq_blocked_load(cfs_rq);
+	update_cfs_rq_blocked_load(cfs_rq, 1);
 
 	/*
 	 * Update share accounting for long-running entities.
@@ -3083,7 +3114,21 @@ unlock:
  */
 static void
 migrate_task_rq_fair(struct task_struct *p, int next_cpu) {
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/*
+	 * Load tracking: accumulate removed load so that it can be processed
+	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
+	 * to blocked load iff they have a non-zero decay-count.
+	 */
+	if (se->avg.decay_count) {
+		se->avg.decay_count = -__synchronize_entity_decay(se);
+		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+	}
 }
+
+
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -3591,7 +3636,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 
 	update_rq_clock(rq);
 	update_cfs_load(cfs_rq, 1);
-	update_cfs_rq_blocked_load(cfs_rq);
+	update_cfs_rq_blocked_load(cfs_rq, 1);
 
 	/*
 	 * We need to update shares after updating tg->load_weight in
@@ -5341,13 +5386,15 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
-       atomic64_set(&cfs_rq->decay_counter, 1);
+	atomic64_set(&cfs_rq->decay_counter, 1);
+	atomic64_set(&cfs_rq->removed_load, 0);
 #endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct cfs_rq *cfs_rq;
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5379,8 +5426,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	if (!on_rq)
 		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
-	if (!on_rq)
-		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+	if (!on_rq) {
+		cfs_rq = cfs_rq_of(&p->se);
+		p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+		/*
+		 * set_task_rq will() have removed our previous contribution,
+		 * but we must synchronize explicitly against further decay
+		 * here.
+		 */
+		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+	}
 }
 
 void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eb85e6502214..28a76f6bb55c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -230,7 +230,7 @@ struct cfs_rq {
 	 * the FAIR_GROUP_SCHED case).
 	 */
 	u64 runnable_load_avg, blocked_load_avg;
-	atomic64_t decay_counter;
+	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
cgit v1.2.3


From 3036a88654ddac784125859756bb5bd1713020ac Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: aggregate total task_group load

Maintain a global running sum of the average load seen on each cfs_rq belonging
to each task group so that it may be used in calculating an appropriate
shares:weight distribution.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/debug.c |  4 ++++
 kernel/sched/fair.c  | 22 ++++++++++++++++++++++
 kernel/sched/sched.h |  4 ++++
 3 files changed, 30 insertions(+)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2d2e2b3c1bef..290892361a09 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -230,6 +230,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->runnable_load_avg);
 	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
 			cfs_rq->blocked_load_avg);
+	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg",
+			atomic64_read(&cfs_rq->tg->load_avg));
+	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
+			cfs_rq->tg_load_contrib);
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ef413f6e56b..48993da8eae3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1096,6 +1096,26 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
        return decays;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+						 int force_update)
+{
+	struct task_group *tg = cfs_rq->tg;
+	s64 tg_contrib;
+
+	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+	tg_contrib -= cfs_rq->tg_load_contrib;
+
+	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+		atomic64_add(tg_contrib, &tg->load_avg);
+		cfs_rq->tg_load_contrib += tg_contrib;
+	}
+}
+#else
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+						 int force_update) {}
+#endif
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
@@ -1166,6 +1186,8 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 		atomic64_add(decays, &cfs_rq->decay_counter);
 		cfs_rq->last_decay = now;
 	}
+
+	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 28a76f6bb55c..de8a0d7ee62f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,7 @@ struct task_group {
 	unsigned long shares;
 
 	atomic_t load_weight;
+	atomic64_t load_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -232,6 +233,9 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	u64 tg_load_contrib;
+#endif
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
-- 
cgit v1.2.3


From d935256af8f9944d342a34648d73c823c8627657 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: compute load contribution by a group entity

Unlike task entities, which have a fixed weight, group entities instead own a
fraction of their parenting task_group's shares as their contributed weight.

Compute this fraction so that we can correctly account hierarchies and shared
entity nodes.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/fair.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 48993da8eae3..f4ca00e88fe2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1111,22 +1111,43 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 		cfs_rq->tg_load_contrib += tg_contrib;
 	}
 }
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
+	struct task_group *tg = cfs_rq->tg;
+	u64 contrib;
+
+	contrib = cfs_rq->tg_load_contrib * tg->shares;
+	se->avg.load_avg_contrib = div64_u64(contrib,
+					     atomic64_read(&tg->load_avg) + 1);
+}
 #else
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 #endif
 
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+	u32 contrib;
+
+	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+	contrib /= (se->avg.runnable_avg_period + 1);
+	se->avg.load_avg_contrib = scale_load(contrib);
+}
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
 	long old_contrib = se->avg.load_avg_contrib;
 
-	if (!entity_is_task(se))
-		return 0;
-
-	se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum *
-					     se->load.weight,
-					     se->avg.runnable_avg_period + 1);
+	if (entity_is_task(se)) {
+		__update_task_entity_contrib(se);
+	} else {
+		__update_group_entity_contrib(se);
+	}
 
 	return se->avg.load_avg_contrib - old_contrib;
 }
-- 
cgit v1.2.3


From 6e2d534c005ad79d758e7dbaa362f7f91e3d3ee5 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: normalize tg load contributions against runnable time

Entities of equal weight should receive equitable distribution of cpu time.
This is challenging in the case of a task_group's shares as execution may be
occurring on multiple cpus simultaneously.

To handle this we divide up the shares into weights proportionate with the load
on each cfs_rq.  This does not, however, account for the fact that the sum of
the parts may be less than one cpu and so we need to normalize:
  load(tg) = min(runnable_avg(tg), 1) * tg->shares
Where runnable_avg is the aggregate time in which the task_group had runnable
children.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/debug.c |  4 ++++
 kernel/sched/fair.c  | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |  2 ++
 3 files changed, 45 insertions(+)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 290892361a09..71b0ea325e93 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -234,6 +234,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			atomic64_read(&cfs_rq->tg->load_avg));
 	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
 			cfs_rq->tg_load_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",
+			cfs_rq->tg_runnable_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
+			atomic_read(&cfs_rq->tg->runnable_avg));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f4ca00e88fe2..8a519a1bba72 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1112,19 +1112,56 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 	}
 }
 
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						  struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long contrib;
+
+	contrib = div_u64(sa->runnable_avg_sum << 12,
+			  sa->runnable_avg_period + 1);
+	contrib -= cfs_rq->tg_runnable_contrib;
+
+	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+		atomic_add(contrib, &tg->runnable_avg);
+		cfs_rq->tg_runnable_contrib += contrib;
+	}
+}
+
 static inline void __update_group_entity_contrib(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 	struct task_group *tg = cfs_rq->tg;
+	int runnable_avg;
+
 	u64 contrib;
 
 	contrib = cfs_rq->tg_load_contrib * tg->shares;
 	se->avg.load_avg_contrib = div64_u64(contrib,
 					     atomic64_read(&tg->load_avg) + 1);
+
+	/*
+	 * Unlike a task-entity, a group entity may be using >=1 cpu globally.
+	 * However, in the case that it's using <1 cpu we need to form a
+	 * correction term so that we contribute the same load as a task of
+	 * equal weight. (Global runnable time is taken as a fraction over
+	 * 2^12.)
+	 */
+	runnable_avg = atomic_read(&tg->runnable_avg);
+	if (runnable_avg < (1<<12)) {
+		se->avg.load_avg_contrib *= runnable_avg;
+		se->avg.load_avg_contrib /= (1<<12);
+	}
 }
 #else
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						  struct cfs_rq *cfs_rq) {}
 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 #endif
 
@@ -1146,6 +1183,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	if (entity_is_task(se)) {
 		__update_task_entity_contrib(se);
 	} else {
+		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
 		__update_group_entity_contrib(se);
 	}
 
@@ -1214,6 +1252,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 
 /* Add the load generated by se into cfs_rq's child load-average */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de8a0d7ee62f..088f8f65e1ed 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -113,6 +113,7 @@ struct task_group {
 
 	atomic_t load_weight;
 	atomic64_t load_avg;
+	atomic_t runnable_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -234,6 +235,7 @@ struct cfs_rq {
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
 #endif
 #endif
-- 
cgit v1.2.3


From 2784c223de5ab75c9d32033badce32e7bb58e17b Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: maintain runnable averages across throttled periods

With bandwidth control, tracked entities may cease execution according to
user-specified bandwidth limits.  Charging this time as either throttled or
blocked, however, is incorrect and would falsely skew in either direction.

What we actually want is for any throttled periods to be "invisible" to
load-tracking as they are removed from the system for that interval and
contribute normally otherwise.

Do this by moderating the progression of time to omit any periods in which the
entity belonged to a throttled hierarchy.

Signed-off-by: Paul Turner <pjt@google.com>
---
 kernel/sched/fair.c  | 50 ++++++++++++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h |  3 ++-
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a519a1bba72..f2649391a59c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1199,15 +1199,26 @@ static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
                cfs_rq->blocked_load_avg = 0;
 }
 
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
 /* Update a sched_entity's runnable average */
 static inline void update_entity_load_avg(struct sched_entity *se,
                                          int update_cfs_rq)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	long contrib_delta;
+	u64 now;
 
-	if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg,
-					  se->on_rq))
+	/*
+	 * For a group entity we need to use their owned cfs_rq_clock_task() in
+	 * case they are the parent of a throttled hierarchy.
+	 */
+	if (entity_is_task(se))
+		now = cfs_rq_clock_task(cfs_rq);
+	else
+		now = cfs_rq_clock_task(group_cfs_rq(se));
+
+	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
 		return;
 
 	contrib_delta = __update_entity_load_avg_contrib(se);
@@ -1227,7 +1238,7 @@ static inline void update_entity_load_avg(struct sched_entity *se,
  */
 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 {
-	u64 now = rq_of(cfs_rq)->clock_task >> 20;
+	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
 	u64 decays;
 
 	decays = now - cfs_rq->last_decay;
@@ -1819,6 +1830,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 	return &tg->cfs_bandwidth;
 }
 
+/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+	if (unlikely(cfs_rq->throttle_count))
+		return cfs_rq->throttled_clock_task;
+
+	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+}
+
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -1969,6 +1989,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 		cfs_rq->load_stamp += delta;
 		cfs_rq->load_last += delta;
 
+		/* adjust cfs_rq_clock_task() */
+		cfs_rq->throttled_clock_task_time += rq->clock_task -
+					     cfs_rq->throttled_clock_task;
+
 		/* update entity weight now that we are on_rq again */
 		update_cfs_shares(cfs_rq);
 	}
@@ -1983,8 +2007,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 	/* group is entering throttled state, record last load */
-	if (!cfs_rq->throttle_count)
+	if (!cfs_rq->throttle_count) {
 		update_cfs_load(cfs_rq, 0);
+		cfs_rq->throttled_clock_task = rq->clock_task;
+	}
 	cfs_rq->throttle_count++;
 
 	return 0;
@@ -1999,7 +2025,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
-	/* account load preceding throttle */
+	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();
@@ -2023,7 +2049,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		rq->nr_running -= task_delta;
 
 	cfs_rq->throttled = 1;
-	cfs_rq->throttled_timestamp = rq->clock;
+	cfs_rq->throttled_clock = rq->clock;
 	raw_spin_lock(&cfs_b->lock);
 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 	raw_spin_unlock(&cfs_b->lock);
@@ -2041,10 +2067,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	cfs_rq->throttled = 0;
 	raw_spin_lock(&cfs_b->lock);
-	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
 	list_del_rcu(&cfs_rq->throttled_list);
 	raw_spin_unlock(&cfs_b->lock);
-	cfs_rq->throttled_timestamp = 0;
 
 	update_rq_clock(rq);
 	/* update hierarchical throttle state */
@@ -2444,8 +2469,13 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
-static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+	return rq_of(cfs_rq)->clock_task;
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 088f8f65e1ed..7561a0f81f7e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -281,7 +281,8 @@ struct cfs_rq {
 	u64 runtime_expires;
 	s64 runtime_remaining;
 
-	u64 throttled_timestamp;
+	u64 throttled_clock, throttled_clock_task;
+	u64 throttled_clock_task_time;
 	int throttled, throttle_count;
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
-- 
cgit v1.2.3


From 7c1b2f86a3839ad82caee1e493986b7dc1a0aa40 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: replace update_shares weight distribution with per-entity
 computation

Now that the machinery is in place to compute contributed load in a bottom-up
fashion, replace the shares distribution code within update_shares()
accordingly.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/debug.c |   8 ---
 kernel/sched/fair.c  | 153 +++++++--------------------------------------------
 kernel/sched/sched.h |  36 ++++--------
 3 files changed, 32 insertions(+), 165 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 71b0ea325e93..2cd3c1b4e582 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -218,14 +218,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg",
-			SPLIT_NS(cfs_rq->load_avg));
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period",
-			SPLIT_NS(cfs_rq->load_period));
-	SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
-			cfs_rq->load_contribution);
-	SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
-			atomic_read(&cfs_rq->tg->load_weight));
 	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
 			cfs_rq->runnable_load_avg);
 	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f2649391a59c..4ad9e2b2b368 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -653,9 +653,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -675,10 +672,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-	cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -801,72 +794,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 # ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
-					    int global_update)
-{
-	struct task_group *tg = cfs_rq->tg;
-	long load_avg;
-
-	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
-	load_avg -= cfs_rq->load_contribution;
-
-	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
-		atomic_add(load_avg, &tg->load_weight);
-		cfs_rq->load_contribution += load_avg;
-	}
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-	u64 period = sysctl_sched_shares_window;
-	u64 now, delta;
-	unsigned long load = cfs_rq->load.weight;
-
-	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
-		return;
-
-	now = rq_of(cfs_rq)->clock_task;
-	delta = now - cfs_rq->load_stamp;
-
-	/* truncate load history at 4 idle periods */
-	if (cfs_rq->load_stamp > cfs_rq->load_last &&
-	    now - cfs_rq->load_last > 4 * period) {
-		cfs_rq->load_period = 0;
-		cfs_rq->load_avg = 0;
-		delta = period - 1;
-	}
-
-	cfs_rq->load_stamp = now;
-	cfs_rq->load_unacc_exec_time = 0;
-	cfs_rq->load_period += delta;
-	if (load) {
-		cfs_rq->load_last = now;
-		cfs_rq->load_avg += delta * load;
-	}
-
-	/* consider updating load contribution on each fold or truncate */
-	if (global_update || cfs_rq->load_period > period
-	    || !cfs_rq->load_period)
-		update_cfs_rq_load_contribution(cfs_rq, global_update);
-
-	while (cfs_rq->load_period > period) {
-		/*
-		 * Inline assembly required to prevent the compiler
-		 * optimising this loop into a divmod call.
-		 * See __iter_div_u64_rem() for another example of this.
-		 */
-		asm("" : "+rm" (cfs_rq->load_period));
-		cfs_rq->load_period /= 2;
-		cfs_rq->load_avg /= 2;
-	}
-
-	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
-		list_del_leaf_cfs_rq(cfs_rq);
-}
-
 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 {
 	long tg_weight;
@@ -876,8 +804,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 	 * to gain a more accurate current total weight. See
 	 * update_cfs_rq_load_contribution().
 	 */
-	tg_weight = atomic_read(&tg->load_weight);
-	tg_weight -= cfs_rq->load_contribution;
+	tg_weight = atomic64_read(&tg->load_avg);
+	tg_weight -= cfs_rq->tg_load_contrib;
 	tg_weight += cfs_rq->load.weight;
 
 	return tg_weight;
@@ -901,27 +829,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 
 	return shares;
 }
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
-		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq);
-	}
-}
 # else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	return tg->shares;
 }
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
 # endif /* CONFIG_SMP */
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
@@ -939,6 +851,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		account_entity_enqueue(cfs_rq, se);
 }
 
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
 static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg;
@@ -958,17 +872,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
@@ -1468,7 +1374,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	update_cfs_load(cfs_rq, 0);
 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
@@ -1565,7 +1470,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
-	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 
 	/*
@@ -1734,11 +1638,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	update_entity_load_avg(curr, 1);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
 
-	/*
-	 * Update share accounting for long-running entities.
-	 */
-	update_entity_shares_tick(cfs_rq);
-
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
@@ -1983,18 +1882,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	cfs_rq->throttle_count--;
 #ifdef CONFIG_SMP
 	if (!cfs_rq->throttle_count) {
-		u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
-		/* leaving throttled state, advance shares averaging windows */
-		cfs_rq->load_stamp += delta;
-		cfs_rq->load_last += delta;
-
 		/* adjust cfs_rq_clock_task() */
 		cfs_rq->throttled_clock_task_time += rq->clock_task -
 					     cfs_rq->throttled_clock_task;
-
-		/* update entity weight now that we are on_rq again */
-		update_cfs_shares(cfs_rq);
 	}
 #endif
 
@@ -2006,11 +1896,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	/* group is entering throttled state, record last load */
-	if (!cfs_rq->throttle_count) {
-		update_cfs_load(cfs_rq, 0);
+	/* group is entering throttled state, stop time */
+	if (!cfs_rq->throttle_count)
 		cfs_rq->throttled_clock_task = rq->clock_task;
-	}
 	cfs_rq->throttle_count++;
 
 	return 0;
@@ -2608,7 +2496,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -2670,7 +2557,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -3734,27 +3620,32 @@ next:
  */
 static int update_shares_cpu(struct task_group *tg, int cpu)
 {
+	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
 	unsigned long flags;
 	struct rq *rq;
 
-	if (!tg->se[cpu])
-		return 0;
-
 	rq = cpu_rq(cpu);
+	se = tg->se[cpu];
 	cfs_rq = tg->cfs_rq[cpu];
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
 
 	update_rq_clock(rq);
-	update_cfs_load(cfs_rq, 1);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
+	update_entity_load_avg(tg->se[cpu], 1);
 
-	/*
-	 * We need to update shares after updating tg->load_weight in
-	 * order to adjust the weight of groups with long running tasks.
-	 */
-	update_cfs_shares(cfs_rq);
+	if (se) {
+		/*
+		 * We can pivot on the runnable average decaying to zero for
+		 * list removal since the parent average will always be >=
+		 * child.
+		 */
+		if (se->avg.runnable_avg_sum)
+			update_cfs_shares(cfs_rq);
+		else
+			list_del_leaf_cfs_rq(cfs_rq);
+	}
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -5635,10 +5526,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 
 	cfs_rq->tg = tg;
 	cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
-	/* allow initial update_cfs_load() to truncate */
-	cfs_rq->load_stamp = 1;
-#endif
 	init_cfs_rq_runtime(cfs_rq);
 
 	tg->cfs_rq[cpu] = cfs_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7561a0f81f7e..9e9c63832246 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -234,11 +234,21 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
 
@@ -254,28 +264,6 @@ struct cfs_rq {
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
-#ifdef CONFIG_SMP
-	/*
-	 *   h_load = weight * f(tg)
-	 *
-	 * Where f(tg) is the recursive weight fraction assigned to
-	 * this group.
-	 */
-	unsigned long h_load;
-
-	/*
-	 * Maintaining per-cpu shares distribution for group scheduling
-	 *
-	 * load_stamp is the last time we updated the load average
-	 * load_last is the last time we updated the load average and saw load
-	 * load_unacc_exec_time is currently unaccounted execution time
-	 */
-	u64 load_avg;
-	u64 load_period;
-	u64 load_stamp, load_last, load_unacc_exec_time;
-
-	unsigned long load_contribution;
-#endif /* CONFIG_SMP */
 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
 	u64 runtime_expires;
-- 
cgit v1.2.3


From 3debcdc79dc76cd65b00cbb5e36f1e6f248bda68 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: refactor update_shares_cpu() -> update_blocked_avgs()

Now that running entities maintain their own load-averages the work we must do
in update_shares() is largely restricted to the periodic decay of blocked
entities.  This allows us to be a little less pessimistic regarding our
occupancy on rq->lock and the associated rq->clock updates required.

Signed-off-by: Paul Turner <pjt@google.com>
---
 kernel/sched/fair.c | 58 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ad9e2b2b368..ea02c288fcf4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3618,22 +3618,20 @@ next:
 /*
  * update tg->load_weight by folding this cpu's load_avg
  */
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
 {
-	struct sched_entity *se;
-	struct cfs_rq *cfs_rq;
-	unsigned long flags;
-	struct rq *rq;
-
-	rq = cpu_rq(cpu);
-	se = tg->se[cpu];
-	cfs_rq = tg->cfs_rq[cpu];
+	struct sched_entity *se = tg->se[cpu];
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	/* throttled entities do not contribute to load */
+	if (throttled_hierarchy(cfs_rq))
+		return;
 
-	update_rq_clock(rq);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
-	update_entity_load_avg(tg->se[cpu], 1);
+	if (se)
+		update_entity_load_avg(se, 1);
+	else
+		update_rq_runnable_avg(rq_of(cfs_rq), 1);
 
 	if (se) {
 		/*
@@ -3646,29 +3644,39 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 		else
 			list_del_leaf_cfs_rq(cfs_rq);
 	}
-
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-	return 0;
 }
 
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
 {
-	struct cfs_rq *cfs_rq;
 	struct rq *rq = cpu_rq(cpu);
+	struct cfs_rq *cfs_rq;
+
+	unsigned long flags;
+	int num_updates = 0;
 
 	rcu_read_lock();
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	update_rq_clock(rq);
 	/*
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
 	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		/* throttled entities do not contribute to load */
-		if (throttled_hierarchy(cfs_rq))
-			continue;
+		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
 
-		update_shares_cpu(cfs_rq->tg, cpu);
+		/*
+		 * Periodically release the lock so that a cfs_rq with many
+		 * children cannot hold it for an arbitrary period of time.
+		 */
+		if (num_updates++ % 20 == 0) {
+			raw_spin_unlock_irqrestore(&rq->lock, flags);
+			cpu_relax();
+			raw_spin_lock_irqsave(&rq->lock, flags);
+			update_rq_clock(rq);
+		}
 	}
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 	rcu_read_unlock();
 }
 
@@ -3713,7 +3721,7 @@ static unsigned long task_h_load(struct task_struct *p)
 	return load;
 }
 #else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
 {
 }
 
@@ -4740,7 +4748,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	 */
 	raw_spin_unlock(&this_rq->lock);
 
-	update_shares(this_cpu);
+	update_blocked_averages(this_cpu);
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
@@ -5000,7 +5008,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
-	update_shares(cpu);
+	update_blocked_averages(cpu);
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
-- 
cgit v1.2.3


From 7887bf893c44e2c1ba49d4399b491ebda3ed8e45 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: update_cfs_shares at period edge

Now that our measurement intervals are small (~1ms) we can amortize the posting
of shares updates (update_cfs_shares()) to roughly once per period overflow.
This is a large cost saving for frequently switching tasks.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/fair.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea02c288fcf4..f50667c6c64c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1164,6 +1164,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 	}
 
 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+	update_cfs_shares(cfs_rq);
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1374,9 +1375,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
-	update_cfs_shares(cfs_rq);
+	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -1449,7 +1449,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -1469,8 +1468,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
-	se->on_rq = 0;
 	account_entity_dequeue(cfs_rq, se);
+	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1484,7 +1483,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	return_cfs_rq_runtime(cfs_rq);
 
 	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq);
+	se->on_rq = 0;
 }
 
 /*
@@ -2496,8 +2495,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
+		update_cfs_rq_blocked_load(cfs_rq, 0);
 	}
 
 	if (!se) {
@@ -2557,8 +2556,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
+		update_cfs_rq_blocked_load(cfs_rq, 0);
 	}
 
 	if (!se) {
@@ -5580,8 +5579,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		se = tg->se[i];
 		/* Propagate contribution to hierarchy */
 		raw_spin_lock_irqsave(&rq->lock, flags);
-		for_each_sched_entity(se)
+		for_each_sched_entity(se) {
 			update_cfs_shares(group_cfs_rq(se));
+			/* update contribution to parent */
+			update_entity_load_avg(se, 1);
+		}
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
-- 
cgit v1.2.3


From abd09a4f2572406970033d0d128a450e0cd04e1f Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: make __update_entity_runnable_avg() fast

__update_entity_runnable_avg forms the core of maintaining an entity's runnable
load average.  In this function we charge the accumulated run-time since last
update and handle appropriate decay.  In some cases, e.g. a waking task, this
time interval may be much larger than our period unit.

Fortunately we can exploit some properties of our series to perform decay for a
blocked update in constant time and account the contribution for a running
update in essentially-constant* time.

[*]: For any running entity they should be performing updates at the tick which
gives us a soft limit of 1 jiffy between updates, and we can compute up to a
32 jiffy update in a single pass.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/fair.c | 122 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 97 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f50667c6c64c..a7085f3faf1d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -878,18 +878,88 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 46742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 516 /* number of full periods it takes to produce max */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+	0xffffffff, 0xfa83b2db, 0xf5257d15, 0xefe4b99b, 0xeac0c6e7, 0xe5b906e7,
+	0xe0ccdeec, 0xdbfbb797, 0xd744fcca, 0xd2a81d91, 0xce248c15, 0xc9b9bd86,
+	0xc5672a11, 0xc12c4cca, 0xbd08a39f, 0xb8fbaf47, 0xb504f333, 0xb123f581,
+	0xad583eea, 0xa9a15ab4, 0xa5fed6a9, 0xa2704303, 0x9ef53260, 0x9b8d39b9,
+	0x9837f051, 0x94f4efa8, 0x91c3d373, 0x8ea4398b, 0x8b95c1e3, 0x88980e80,
+	0x85aac367, 0x82cd8698,
+};
+
+/* Precomputed \Sum y^k { 1<=k<=n } */
+static const u32 runnable_avg_yN_sum[] = {
+	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, int n)
+static __always_inline u64 decay_load(u64 val, u64 n)
 {
-	for (; n && val; n--) {
-		val *= 4008;
-		val >>= 12;
+	int local_n;
+	if (!n)
+		return val;
+	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+		return 0;
+
+	/* n <= LOAD_AVG_PERIOD * 63 here, so narrowing to int is lossless */
+	local_n = n;
+
+	/*
+	 * As y^PERIOD = 1/2, we can combine
+	 *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+	 * With a look-up table which covers k^n (n<PERIOD)
+	 *
+	 * To achieve constant time decay_load.
+	 */
+	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+		val >>= local_n / LOAD_AVG_PERIOD;
+		local_n %= LOAD_AVG_PERIOD;
 	}
 
-	return val;
+	val *= runnable_avg_yN_inv[local_n];
+	return SRR(val, 32);
+}
+
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(int n)
+{
+	u32 contrib = 0;
+
+	if (likely(n <= LOAD_AVG_PERIOD))
+		return runnable_avg_yN_sum[n];
+	else if (unlikely(n >= LOAD_AVG_MAX_N))
+		return LOAD_AVG_MAX;
+
+	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+	do {
+		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+		n -= LOAD_AVG_PERIOD;
+	} while (n > LOAD_AVG_PERIOD);
+
+	contrib = decay_load(contrib, n);
+	return contrib + runnable_avg_yN_sum[n];
 }
 
 /* We can represent the historical contribution to runnable average as the
@@ -923,7 +993,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 							struct sched_avg *sa,
 							int runnable)
 {
-	u64 delta;
+	u64 delta, periods;
+	u32 runnable_contrib;
 	int delta_w, decayed = 0;
 
 	delta = now - sa->last_runnable_update;
@@ -957,25 +1028,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 		 * period and accrue it.
 		 */
 		delta_w = 1024 - delta_w;
-		BUG_ON(delta_w > delta);
-		do {
-			if (runnable)
-				sa->runnable_avg_sum += delta_w;
-			sa->runnable_avg_period += delta_w;
-
-			/*
-			 * Remainder of delta initiates a new period, roll over
-			 * the previous.
-			 */
-			sa->runnable_avg_sum =
-				decay_load(sa->runnable_avg_sum, 1);
-			sa->runnable_avg_period =
-				decay_load(sa->runnable_avg_period, 1);
-
-			delta -= delta_w;
-			/* New period is empty */
-			delta_w = 1024;
-		} while (delta >= 1024);
+		if (runnable)
+			sa->runnable_avg_sum += delta_w;
+		sa->runnable_avg_period += delta_w;
+
+		delta -= delta_w;
+
+		/* Figure out how many additional periods this update spans */
+		periods = delta / 1024;
+		delta %= 1024;
+
+		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+						  periods + 1);
+		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+						     periods + 1);
+
+		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
+		runnable_contrib = __compute_runnable_contrib(periods);
+		if (runnable)
+			sa->runnable_avg_sum += runnable_contrib;
+		sa->runnable_avg_period += runnable_contrib;
 	}
 
 	/* Remainder of delta accrued against u_0` */
-- 
cgit v1.2.3


From 1a26ee47d08f9cc330e45536a1631fe901a06d60 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: implement usage tracking

With the framework for runnable tracking now fully in place, per-entity usage
tracking is a simple and low-overhead addition.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 include/linux/sched.h |  1 +
 kernel/sched/debug.c  |  3 +++
 kernel/sched/fair.c   | 33 ++++++++++++++++++++++++++++-----
 kernel/sched/sched.h  |  4 ++--
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a2aa19e9d9a..7b636c961cbe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1140,6 +1140,7 @@ struct sched_avg {
 	u64 last_runnable_update;
 	s64 decay_count;
 	unsigned long load_avg_contrib;
+	u32 usage_avg_sum;
 };
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2cd3c1b4e582..b9d54d0d7bb0 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #ifdef CONFIG_SMP
 	P(se->avg.runnable_avg_sum);
 	P(se->avg.runnable_avg_period);
+	P(se->avg.usage_avg_sum);
 	P(se->avg.load_avg_contrib);
 	P(se->avg.decay_count);
 #endif
@@ -230,6 +231,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->tg_runnable_contrib);
 	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
 			atomic_read(&cfs_rq->tg->runnable_avg));
+	SEQ_printf(m, "  .%-30s: %d\n", "tg->usage_avg",
+			atomic_read(&cfs_rq->tg->usage_avg));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a7085f3faf1d..302439299839 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -991,7 +991,8 @@ static u32 __compute_runnable_contrib(int n)
  */
 static __always_inline int __update_entity_runnable_avg(u64 now,
 							struct sched_avg *sa,
-							int runnable)
+							int runnable,
+							int running)
 {
 	u64 delta, periods;
 	u32 runnable_contrib;
@@ -1030,6 +1031,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 		delta_w = 1024 - delta_w;
 		if (runnable)
 			sa->runnable_avg_sum += delta_w;
+		if (running)
+			sa->usage_avg_sum += delta_w;
 		sa->runnable_avg_period += delta_w;
 
 		delta -= delta_w;
@@ -1042,17 +1045,22 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 						  periods + 1);
 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
 						     periods + 1);
+		sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
 
 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
 		runnable_contrib = __compute_runnable_contrib(periods);
 		if (runnable)
 			sa->runnable_avg_sum += runnable_contrib;
+		if (running)
+			sa->usage_avg_sum += runnable_contrib;
 		sa->runnable_avg_period += runnable_contrib;
 	}
 
 	/* Remainder of delta accrued against u_0` */
 	if (runnable)
 		sa->runnable_avg_sum += delta;
+	if (running)
+		sa->usage_avg_sum += delta;
 	sa->runnable_avg_period += delta;
 
 	return decayed;
@@ -1098,15 +1106,27 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 						  struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
-	long contrib;
+	long contrib, usage_contrib;
 
 	contrib = div_u64(sa->runnable_avg_sum << 12,
 			  sa->runnable_avg_period + 1);
 	contrib -= cfs_rq->tg_runnable_contrib;
 
-	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+	usage_contrib = div_u64(sa->usage_avg_sum << 12,
+			        sa->runnable_avg_period + 1);
+	usage_contrib -= cfs_rq->tg_usage_contrib;
+
+	/*
+	 * contrib/usage at this point represent deltas, only update if they
+	 * are substantive.
+	 */
+	if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
+	    (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
 		atomic_add(contrib, &tg->runnable_avg);
 		cfs_rq->tg_runnable_contrib += contrib;
+
+		atomic_add(usage_contrib, &tg->usage_avg);
+		cfs_rq->tg_usage_contrib += usage_contrib;
 	}
 }
 
@@ -1196,7 +1216,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 	else
 		now = cfs_rq_clock_task(group_cfs_rq(se));
 
-	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+					  cfs_rq->curr == se))
 		return;
 
 	contrib_delta = __update_entity_load_avg_contrib(se);
@@ -1241,7 +1262,8 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
-	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
+				     runnable);
 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 
@@ -1610,6 +1632,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
+		update_entity_load_avg(se, 1);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9e9c63832246..a01110038f71 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -113,7 +113,7 @@ struct task_group {
 
 	atomic_t load_weight;
 	atomic64_t load_avg;
-	atomic_t runnable_avg;
+	atomic_t runnable_avg, usage_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -236,7 +236,7 @@ struct cfs_rq {
 	u64 last_decay;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	u32 tg_runnable_contrib;
+	u32 tg_runnable_contrib, tg_usage_contrib;
 	u64 tg_load_contrib;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-- 
cgit v1.2.3


From fbaa0db1a12cd5f76b52abb48c764bc0441f0419 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 28 Jun 2012 03:24:00 +0100
Subject: sched: introduce temporary FAIR_GROUP_SCHED dependency for
 load-tracking

While per-entity load-tracking is generally useful beyond computing shares
distribution (e.g. runnable-based load-balance (in progress), governors,
power-management, etc.), these facilities are not yet consumers of this data.

This may be trivially reverted when the information is required; until then,
avoid paying the overhead for calculations we will not use.

Signed-off-by: Paul Turner <pjt@google.com>
---
 include/linux/sched.h |  8 +++++++-
 kernel/sched/fair.c   | 14 +++++++++++---
 kernel/sched/sched.h  |  9 ++++++++-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7b636c961cbe..2b22d81c6b1d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1203,7 +1203,13 @@ struct sched_entity {
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
 #endif
-#ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+	/* Per-entity load-tracking */
 	struct sched_avg	avg;
 #endif
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 302439299839..7aaeb56d83a7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -877,7 +877,8 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-#ifdef CONFIG_SMP
+/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
@@ -3197,6 +3198,12 @@ unlock:
 	return new_cpu;
 }
 
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -3219,7 +3226,7 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) {
 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
 	}
 }
-
+#endif
 
 #endif /* CONFIG_SMP */
 
@@ -5732,8 +5739,9 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	.migrate_task_rq	= migrate_task_rq_fair,
-
+#endif
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a01110038f71..3a34fd637106 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -225,6 +225,12 @@ struct cfs_rq {
 #endif
 
 #ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 * CFS Load tracking
 	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
@@ -234,7 +240,8 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
-
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+/* These always depend on CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u32 tg_runnable_contrib, tg_usage_contrib;
 	u64 tg_load_contrib;
-- 
cgit v1.2.3


From b6afcf140b944908f7debec8a9fe186d214c07c7 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <Morten.Rasmussen@arm.com>
Date: Mon, 25 Jun 2012 13:52:31 +0100
Subject: sched: Add ftrace events for entity load-tracking

Adds ftrace events for key variables related to the entity
load-tracking to help debugging scheduler behaviour. Allows tracing
of load contribution and runqueue residency ratio for both entities
and runqueues as well as entity CPU usage ratio.

Signed-off-by: Morten Rasmussen <Morten.Rasmussen@arm.com>
---
 include/trace/events/sched.h | 125 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |  12 +++++
 2 files changed, 137 insertions(+)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ea7a2035456d..002317926c2c 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -426,6 +426,131 @@ TRACE_EVENT(sched_pi_setprio,
 			__entry->oldprio, __entry->newprio)
 );
 
+/*
+ * Tracepoint for showing tracked load contribution.
+ */
+TRACE_EVENT(sched_task_load_contrib,
+
+	TP_PROTO(struct task_struct *tsk, unsigned long load_contrib),
+
+	TP_ARGS(tsk, load_contrib),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(unsigned long, load_contrib)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid            = tsk->pid;
+		__entry->load_contrib   = load_contrib;
+	),
+
+	TP_printk("comm=%s pid=%d load_contrib=%lu",
+			__entry->comm, __entry->pid,
+			__entry->load_contrib)
+);
+
+/*
+ * Tracepoint for showing tracked task runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_runnable_ratio,
+
+	TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+	TP_ARGS(tsk, ratio),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(unsigned long, ratio)
+	),
+
+	TP_fast_assign(
+	memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid   = tsk->pid;
+		__entry->ratio = ratio;
+	),
+
+	TP_printk("comm=%s pid=%d ratio=%lu",
+			__entry->comm, __entry->pid,
+			__entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable ratio [0..1023].
+ */
+TRACE_EVENT(sched_rq_runnable_ratio,
+
+	TP_PROTO(int cpu, unsigned long ratio),
+
+	TP_ARGS(cpu, ratio),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(unsigned long, ratio)
+	),
+
+	TP_fast_assign(
+		__entry->cpu   = cpu;
+		__entry->ratio = ratio;
+	),
+
+	TP_printk("cpu=%d ratio=%lu",
+			__entry->cpu,
+			__entry->ratio)
+);
+
+/*
+ * Tracepoint for showing tracked rq runnable load.
+ */
+TRACE_EVENT(sched_rq_runnable_load,
+
+	TP_PROTO(int cpu, u64 load),
+
+	TP_ARGS(cpu, load),
+
+	TP_STRUCT__entry(
+		__field(int, cpu)
+		__field(u64, load)
+	),
+
+	TP_fast_assign(
+		__entry->cpu  = cpu;
+		__entry->load = load;
+	),
+
+	TP_printk("cpu=%d load=%llu",
+			__entry->cpu,
+			__entry->load)
+);
+
+/*
+ * Tracepoint for showing tracked task cpu usage ratio [0..1023].
+ */
+TRACE_EVENT(sched_task_usage_ratio,
+
+	TP_PROTO(struct task_struct *tsk, unsigned long ratio),
+
+	TP_ARGS(tsk, ratio),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(unsigned long, ratio)
+	),
+
+	TP_fast_assign(
+	memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid   = tsk->pid;
+		__entry->ratio = ratio;
+	),
+
+	TP_printk("comm=%s pid=%d ratio=%lu",
+			__entry->comm, __entry->pid,
+			__entry->ratio)
+);
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7aaeb56d83a7..989540007748 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1172,6 +1172,13 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
 	contrib /= (se->avg.runnable_avg_period + 1);
 	se->avg.load_avg_contrib = scale_load(contrib);
+	trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
+	contrib = se->avg.runnable_avg_sum * scale_load_down(1024);
+	contrib /= (se->avg.runnable_avg_period + 1);
+	trace_sched_task_runnable_ratio(task_of(se), scale_load(contrib));
+	contrib = se->avg.usage_avg_sum * scale_load_down(1024);
+	contrib /= (se->avg.runnable_avg_period + 1);
+	trace_sched_task_usage_ratio(task_of(se), scale_load(contrib));
 }
 
 /* Compute the current contribution to load_avg by se, return any delta */
@@ -1263,9 +1270,14 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
+	u32 contrib;
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
 				     runnable);
 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
+	contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
+	contrib /= (rq->avg.runnable_avg_period + 1);
+	trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib));
+	trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
 }
 
 /* Add the load generated by se into cfs_rq's child load-average */
-- 
cgit v1.2.3


From b32ad183db527a60b39e989b5b7e1576d5b80a66 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <Morten.Rasmussen@arm.com>
Date: Fri, 29 Jun 2012 16:46:34 +0100
Subject: sched: entity load-tracking load_avg_ratio

load_avg_contrib includes the task's load.weight and is therefore not the
pure tracked load of the task. This patch adds load_avg_ratio, which
does not include the task's load.weight.

Signed-off-by: Morten Rasmussen <Morten.Rasmussen@arm.com>
---
 include/linux/sched.h | 1 +
 kernel/sched/fair.c   | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b22d81c6b1d..c30a799227d9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1140,6 +1140,7 @@ struct sched_avg {
 	u64 last_runnable_update;
 	s64 decay_count;
 	unsigned long load_avg_contrib;
+	unsigned long load_avg_ratio;
 	u32 usage_avg_sum;
 };
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 989540007748..97760898376c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1175,7 +1175,8 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
 	trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
 	contrib = se->avg.runnable_avg_sum * scale_load_down(1024);
 	contrib /= (se->avg.runnable_avg_period + 1);
-	trace_sched_task_runnable_ratio(task_of(se), scale_load(contrib));
+	se->avg.load_avg_ratio = scale_load(contrib);
+	trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
 	contrib = se->avg.usage_avg_sum * scale_load_down(1024);
 	contrib /= (se->avg.runnable_avg_period + 1);
 	trace_sched_task_usage_ratio(task_of(se), scale_load(contrib));
-- 
cgit v1.2.3


From e3355d72293e865acad2ca56e4b836c5717c3d3a Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <Morten.Rasmussen@arm.com>
Date: Fri, 29 Jun 2012 16:49:59 +0100
Subject: sched: load-tracking driven wakeup migration for HMP platforms

Attempts to migrate tasks to an appropriate cpu on heterogeneous
platforms based on the task's individual tracked load at wakeup.
The migration decision is based on task load thresholds and task
priority.

Currently only two types of cpus are supported:
fast (high-performance) and slow (power-efficient).

The HMP setup (fast/slow cpuids) is currently hardcoded in the
scheduler. Obviously, this hack needs to be replaced by a generic
way to expose this information to the scheduler. Ideally
this could be done using device tree and a not yet implemented
scheduler interface.

Signed-off-by: Morten Rasmussen <Morten.Rasmussen@arm.com>
---
 arch/arm/Kconfig    |  29 +++++++++++++++
 kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a91009c61870..adddd220bd9a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1512,6 +1512,35 @@ config SCHED_SMT
 	  MultiThreading at a cost of slightly increased overhead in some
 	  places. If unsure say N here.
 
+config DISABLE_CPU_SCHED_DOMAIN_BALANCE
+	bool "(EXPERIMENTAL) Disable CPU level scheduler load-balancing"
+	help
+	  Disables scheduler load-balancing at CPU sched domain level.
+
+config SCHED_HMP
+	bool "(EXPERIMENTAL) Heterogenous multiprocessor scheduling"
+	depends on DISABLE_CPU_SCHED_DOMAIN_BALANCE && SCHED_MC && FAIR_GROUP_SCHED && !SCHED_AUTOGROUP
+	help
+	  Experimental scheduler optimizations for heterogeneous platforms.
+	  Attempts introspectively select task affinity to optimize power
+	  and performance. Currently support two types of CPUs: fast
+	  (high-performance) and slow (power-efficient). There is currently
+	  no support for migration of task groups, hence !SCHED_AUTOGROUP.
+
+config HMP_FAST_CPU_MASK
+	string "HMP scheduler fast CPU mask"
+	depends on SCHED_HMP
+	help
+	  Specifies the cpuids of the fast CPUs in the system as a list
+	  string, e.g. cpuid 0+1 should be specified as 0-1.
+
+config HMP_SLOW_CPU_MASK
+	string "HMP scheduler slow CPU mask"
+	depends on SCHED_HMP
+	help
+	  Specifies the cpuids of the slow CPUs in the system as a list
+	  string, e.g. cpuid 0+1 should be specified as 0-1.
+
 config HAVE_ARM_SCU
 	bool
 	help
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 97760898376c..92d806d2a667 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3082,6 +3082,73 @@ done:
 	return target;
 }
 
+#ifdef CONFIG_SCHED_HMP
+/* Heterogenous multiprocessor (HMP) optimizations
+ * We need to know which cpus that are fast and slow. Ideally, this
+ * information would be provided by the platform in some way. For now it is
+ * set in the kernel config. */
+static struct cpumask hmp_fast_cpu_mask;
+static struct cpumask hmp_slow_cpu_mask;
+
+/* Setup fast and slow cpumasks.
+ * This should be setup based on device tree somehow. */
+static int __init hmp_cpu_mask_setup(void)
+{
+	char buf[64];
+
+	cpumask_clear(&hmp_fast_cpu_mask);
+	cpumask_clear(&hmp_slow_cpu_mask);
+
+	if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, &hmp_fast_cpu_mask))
+		WARN(1, "Failed to parse HMP fast cpu mask!\n");
+	if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, &hmp_slow_cpu_mask))
+		WARN(1, "Failed to parse HMP slow cpu mask!\n");
+
+	printk(KERN_DEBUG "Initializing HMP scheduler:\n");
+	cpulist_scnprintf(buf, 64, &hmp_fast_cpu_mask);
+	printk(KERN_DEBUG "  fast cpus: %s\n", buf);
+	cpulist_scnprintf(buf, 64, &hmp_slow_cpu_mask);
+	printk(KERN_DEBUG "  slow cpus: %s\n", buf);
+
+	return 1;
+}
+early_initcall(hmp_cpu_mask_setup);
+
+/* Migration thresholds should be in the range [0..1023]
+ * hmp_up_threshold: min. load required for migrating tasks to a fast cpu
+ * hmp_down_threshold: max. load allowed for tasks migrating to a slow cpu
+ * hmp_up_prio: min. task prio for tasks migrating to faster cpus */
+unsigned int hmp_up_threshold = 512;
+unsigned int hmp_down_threshold = 256;
+unsigned int hmp_up_prio = 125;
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se);
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
+
+static unsigned int hmp_cpu_is_fast(int cpu)
+{
+	return cpumask_test_cpu(cpu, &hmp_fast_cpu_mask);
+}
+
+static unsigned int hmp_cpu_is_slow(int cpu)
+{
+	return cpumask_test_cpu(cpu, &hmp_slow_cpu_mask);
+}
+
+/* Select target cpu for HMP migration to fast cpu
+ * returns target >= nr_cpu_ids if no fast cpus in affinity mask */
+static inline unsigned int hmp_select_fast_cpu(struct task_struct *tsk)
+{
+	return cpumask_any_and(&hmp_fast_cpu_mask, tsk_cpus_allowed(tsk));
+}
+
+/* Select target cpu for HMP migration to slow cpu
+ * returns target >= nr_cpu_ids if no slow cpus in affinity mask */
+static inline unsigned int hmp_select_slow_cpu(struct task_struct *tsk)
+{
+	return cpumask_any_and(&hmp_slow_cpu_mask, tsk_cpus_allowed(tsk));
+}
+#endif /* CONFIG_SCHED_HMP */
+
 /*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -3208,6 +3275,19 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 unlock:
 	rcu_read_unlock();
 
+#ifdef CONFIG_SCHED_HMP
+	if (hmp_up_migration(new_cpu, &p->se)) {
+		cpu = hmp_select_fast_cpu(p);
+		if (cpu < nr_cpu_ids)
+			return cpu;
+	}
+	if (hmp_down_migration(new_cpu, &p->se)) {
+		cpu = hmp_select_slow_cpu(p);
+		if (cpu < nr_cpu_ids)
+			return cpu;
+	}
+#endif
+
 	return new_cpu;
 }
 
@@ -5290,6 +5370,31 @@ need_kick:
 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
 #endif
 
+#ifdef CONFIG_SCHED_HMP
+/* Check if task should migrate to a faster core */
+static unsigned int hmp_up_migration(int cpu, struct sched_entity *se)
+{
+	struct task_struct *p = task_of(se);
+	if (p->prio < hmp_up_prio && p->prio > 100
+		&& hmp_cpu_is_slow(cpu)
+		&& se->avg.load_avg_ratio > hmp_up_threshold) {
+		return 1;
+	}
+	return 0;
+}
+
+/* Check if task should migrate to a slower core */
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
+{
+	struct task_struct *p = task_of(se);
+	if (p->prio >= hmp_up_prio || (hmp_cpu_is_fast(cpu)
+		&& se->avg.load_avg_ratio < hmp_down_threshold)) {
+		return 1;
+	}
+	return 0;
+}
+#endif /* CONFIG_SCHED_HMP */
+
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
-- 
cgit v1.2.3


From ff8a9da13f02c8d3a39be61f1cf3542a864b878b Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <Morten.Rasmussen@arm.com>
Date: Mon, 25 Jun 2012 16:07:55 +0100
Subject: sched: Forced migration of high load task on HMP platforms

This patch introduces a periodic check to look for high load tasks on
runqueues of low-performance cpus on heterogeneous platforms. These will
be migrated immediately rather than waiting until the next time they go to
sleep and go through the wakeup migration.

The patch is proof-of-concept code and therefore attempts to have minimal
impact on existing scheduler code paths. Most of the functions can
potentially be merged with existing functions, which would reduce the size
of this patch considerably.

Signed-off-by: Morten Rasmussen <Morten.Rasmussen@arm.com>
---
 kernel/sched/fair.c  | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |   1 +
 2 files changed, 194 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 92d806d2a667..5033c208ab92 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5393,6 +5393,197 @@ static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
 	}
 	return 0;
 }
+
+/*
+ * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ * Ideally this function should be merged with can_migrate_task() to avoid
+ * redundant code.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+	int tsk_cache_hot = 0;
+	/*
+	 * We do not migrate tasks that are:
+	 * 1) running (obviously), or
+	 * 2) cannot be migrated to this CPU due to cpus_allowed
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+		return 0;
+	}
+	env->flags &= ~LBF_ALL_PINNED;
+
+	if (task_running(env->src_rq, p)) {
+		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+		return 0;
+	}
+
+	/*
+	 * Aggressive migration if:
+	 * 1) task is cache cold, or
+	 * 2) too many balance attempts have failed.
+	 */
+
+	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+	if (!tsk_cache_hot ||
+		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+		if (tsk_cache_hot) {
+			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+			schedstat_inc(p, se.statistics.nr_forced_migrations);
+		}
+#endif
+		return 1;
+	}
+
+	return 1;
+}
+
+/*
+ * move_specific_task tries to move a specific task.
+ * Returns 1 if successful and 0 otherwise.
+ * Called with both runqueues locked.
+ */
+static int move_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+	struct task_struct *p, *n;
+
+	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+	if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+				env->dst_cpu))
+		continue;
+
+		if (!hmp_can_migrate_task(p, env))
+			continue;
+		/* Check if we found the right task */
+		if (p != pm)
+			continue;
+
+		move_task(p, env);
+		/*
+		 * Right now, this is only the third place move_task()
+		 * is called, so we can safely collect move_task()
+		 * stats here rather than inside move_task().
+		 */
+		schedstat_inc(env->sd, lb_gained[env->idle]);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_force_up_migration uses this to push a currently running task
+ * off a runqueue.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_active_task_migration_cpu_stop(void *data)
+{
+	struct rq *busiest_rq = data;
+	struct task_struct *p = busiest_rq->migrate_task;
+	int busiest_cpu = cpu_of(busiest_rq);
+	int target_cpu = busiest_rq->push_cpu;
+	struct rq *target_rq = cpu_rq(target_cpu);
+	struct sched_domain *sd;
+
+	raw_spin_lock_irq(&busiest_rq->lock);
+	/* make sure the requested cpu hasn't gone down in the meantime */
+	if (unlikely(busiest_cpu != smp_processor_id() ||
+		!busiest_rq->active_balance)) {
+		goto out_unlock;
+	}
+	/* Is there any task to move? */
+	if (busiest_rq->nr_running <= 1)
+		goto out_unlock;
+	/* Task has migrated meanwhile, abort forced migration */
+	if (task_rq(p) != busiest_rq)
+		goto out_unlock;
+	/*
+	 * This condition is "impossible", if it occurs
+	 * we need to fix it. Originally reported by
+	 * Bjorn Helgaas on a 128-cpu setup.
+	 */
+	BUG_ON(busiest_rq == target_rq);
+
+	/* move a task from busiest_rq to target_rq */
+	double_lock_balance(busiest_rq, target_rq);
+
+	/* Search for an sd spanning us and the target CPU. */
+	rcu_read_lock();
+	for_each_domain(target_cpu, sd) {
+		if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+			break;
+	}
+
+	if (likely(sd)) {
+		struct lb_env env = {
+			.sd		= sd,
+			.dst_cpu	= target_cpu,
+			.dst_rq		= target_rq,
+			.src_cpu	= busiest_rq->cpu,
+			.src_rq		= busiest_rq,
+			.idle		= CPU_IDLE,
+		};
+
+		schedstat_inc(sd, alb_count);
+
+		if (move_specific_task(&env, p))
+			schedstat_inc(sd, alb_pushed);
+		else
+			schedstat_inc(sd, alb_failed);
+	}
+	rcu_read_unlock();
+	double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+	busiest_rq->active_balance = 0;
+	raw_spin_unlock_irq(&busiest_rq->lock);
+	return 0;
+}
+
+static DEFINE_SPINLOCK(hmp_force_migration);
+
+/* hmp_force_up_migration checks runqueues for tasks that need to
+ * be actively migrated to a faster cpu. */
+static void hmp_force_up_migration(int this_cpu)
+{
+	int i;
+	struct sched_entity *curr;
+	struct rq *target;
+	unsigned long flags;
+	unsigned int force;
+	struct task_struct *p;
+
+	if (!spin_trylock(&hmp_force_migration))
+		return;
+	for_each_cpu(i, &hmp_slow_cpu_mask) {
+		force = 0;
+		target = cpu_rq(i);
+		raw_spin_lock_irqsave(&target->lock, flags);
+		curr = target->cfs.curr;
+		if (!curr || !entity_is_task(curr)) {
+			raw_spin_unlock_irqrestore(&target->lock, flags);
+			continue;
+		}
+		p = task_of(curr);
+		if (hmp_up_migration(i, curr)) {
+			if (!target->active_balance) {
+				target->active_balance = 1;
+				target->push_cpu = hmp_select_fast_cpu(p);
+				target->migrate_task = p;
+				force = 1;
+			}
+		}
+		raw_spin_unlock_irqrestore(&target->lock, flags);
+		if (force)
+			stop_one_cpu_nowait(cpu_of(target),
+				hmp_active_task_migration_cpu_stop,
+				target, &target->active_balance_work);
+	}
+	spin_unlock(&hmp_force_migration);
+}
+#else
+static void hmp_force_up_migration(int this_cpu) { }
 #endif /* CONFIG_SCHED_HMP */
 
 /*
@@ -5406,6 +5597,8 @@ static void run_rebalance_domains(struct softirq_action *h)
 	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
+	hmp_force_up_migration(this_cpu);
+
 	rebalance_domains(this_cpu, idle);
 
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3a34fd637106..0dc9bd108f57 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -421,6 +421,7 @@ struct rq {
 	int active_balance;
 	int push_cpu;
 	struct cpu_stop_work active_balance_work;
+	struct task_struct *migrate_task;
 	/* cpu of this runqueue: */
 	int cpu;
 	int online;
-- 
cgit v1.2.3


From e97ad684522fa231bbb9da3d0f0c89bcda345e6e Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <Morten.Rasmussen@arm.com>
Date: Mon, 25 Jun 2012 16:11:44 +0100
Subject: sched: Add HMP forced task migration ftrace event

Adds ftrace event for tracing forced task migrations using HMP
optimized scheduling.

Signed-off-by: Morten Rasmussen <Morten.Rasmussen@arm.com>
---
 include/trace/events/sched.h | 26 ++++++++++++++++++++++++++
 kernel/sched/fair.c          |  1 +
 2 files changed, 27 insertions(+)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 002317926c2c..2c50a06fbedd 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -551,6 +551,32 @@ TRACE_EVENT(sched_task_usage_ratio,
 			__entry->comm, __entry->pid,
 			__entry->ratio)
 );
+
+/*
+ * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations.
+ */
+TRACE_EVENT(sched_hmp_migrate,
+
+	TP_PROTO(struct task_struct *tsk, int val),
+
+	TP_ARGS(tsk, val),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(int,  val)
+	),
+
+	TP_fast_assign(
+	memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid = tsk->pid;
+		__entry->val = val;
+	),
+
+	TP_printk("comm=%s pid=%d val=%d",
+			__entry->comm, __entry->pid,
+			__entry->val)
+);
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5033c208ab92..f705a87ac7b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5572,6 +5572,7 @@ static void hmp_force_up_migration(int this_cpu)
 				target->push_cpu = hmp_select_fast_cpu(p);
 				target->migrate_task = p;
 				force = 1;
+				trace_sched_hmp_migrate(p, 1);
 			}
 		}
 		raw_spin_unlock_irqrestore(&target->lock, flags);
-- 
cgit v1.2.3


From 83ce6a1760deba496a4dbb1ae4ce608e914044c3 Mon Sep 17 00:00:00 2001
From: "Jon Medhurst (Tixy)" <tixy@linaro.org>
Date: Sat, 14 Jul 2012 18:56:08 +0100
Subject: sched: Use device-tree to provide fast/slow CPU list for HMP

We can't rely on Kconfig options to set the fast and slow CPU lists for
HMP scheduling if we want a single kernel binary to support multiple
devices with different CPU topology. E.g. ARM's TC2, Fast Models, or
even non big.LITTLE devices.

This patch adds the function arch_get_fast_and_slow_cpus() to generate
the lists at run-time by parsing the CPU nodes in device-tree; it
assumes slow cores are A7s and everything else is fast. The function
still supports the old Kconfig options as this is useful for testing the
HMP scheduler on devices without big.LITTLE.

Signed-off-by: Jon Medhurst <tixy@linaro.org>
---
 arch/arm/kernel/topology.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c        | 18 ++++--------
 2 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 8200deaa14f6..28d2be00ed89 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -126,6 +126,75 @@ void store_cpu_topology(unsigned int cpuid)
 		cpu_topology[cpuid].socket_id, mpidr);
 }
 
+
+#ifdef CONFIG_SCHED_HMP
+
+static const char * const little_cores[] = {
+	"arm,cortex-a7",
+	NULL,
+};
+
+static bool is_little_cpu(struct device_node *cn)
+{
+	const char * const *lc;
+	for (lc = little_cores; *lc; lc++)
+		if (of_device_is_compatible(cn, *lc))
+			return true;
+	return false;
+}
+
+void __init arch_get_fast_and_slow_cpus(struct cpumask *fast,
+					struct cpumask *slow)
+{
+	struct device_node *cn = NULL;
+	int cpu = 0;
+
+	cpumask_clear(fast);
+	cpumask_clear(slow);
+
+	/*
+	 * Use the config options if they are given. This helps testing
+	 * HMP scheduling on systems without a big.LITTLE architecture.
+	 */
+	if (strlen(CONFIG_HMP_FAST_CPU_MASK) && strlen(CONFIG_HMP_SLOW_CPU_MASK)) {
+		if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, fast))
+			WARN(1, "Failed to parse HMP fast cpu mask!\n");
+		if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, slow))
+			WARN(1, "Failed to parse HMP slow cpu mask!\n");
+		return;
+	}
+
+	/*
+	 * Else, parse device tree for little cores.
+	 */
+	while ((cn = of_find_node_by_type(cn, "cpu"))) {
+
+		if (cpu >= num_possible_cpus())
+			break;
+
+		if (is_little_cpu(cn))
+			cpumask_set_cpu(cpu, slow);
+		else
+			cpumask_set_cpu(cpu, fast);
+
+		cpu++;
+	}
+
+	if (!cpumask_empty(fast) && !cpumask_empty(slow))
+		return;
+
+	/*
+	 * We didn't find both big and little cores so let's call all cores
+	 * fast as this will keep the system running, with all cores being
+	 * treated equal.
+	 */
+	cpumask_setall(fast);
+	cpumask_clear(slow);
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
+
 /*
  * init_cpu_topology is called at boot when only one cpu is running
  * which prevent simultaneous write access to cpu_topology array
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f705a87ac7b5..e05636144c62 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3084,25 +3084,19 @@ done:
 
 #ifdef CONFIG_SCHED_HMP
 /* Heterogenous multiprocessor (HMP) optimizations
- * We need to know which cpus that are fast and slow. Ideally, this
- * information would be provided by the platform in some way. For now it is
- * set in the kernel config. */
+ * We need to know which cpus are fast and which are slow. */
 static struct cpumask hmp_fast_cpu_mask;
 static struct cpumask hmp_slow_cpu_mask;
 
-/* Setup fast and slow cpumasks.
- * This should be setup based on device tree somehow. */
+extern void __init arch_get_fast_and_slow_cpus(struct cpumask *fast,
+					       struct cpumask *slow);
+
+/* Setup fast and slow cpumasks. */
 static int __init hmp_cpu_mask_setup(void)
 {
 	char buf[64];
 
-	cpumask_clear(&hmp_fast_cpu_mask);
-	cpumask_clear(&hmp_slow_cpu_mask);
-
-	if (cpulist_parse(CONFIG_HMP_FAST_CPU_MASK, &hmp_fast_cpu_mask))
-		WARN(1, "Failed to parse HMP fast cpu mask!\n");
-	if (cpulist_parse(CONFIG_HMP_SLOW_CPU_MASK, &hmp_slow_cpu_mask))
-		WARN(1, "Failed to parse HMP slow cpu mask!\n");
+	arch_get_fast_and_slow_cpus(&hmp_fast_cpu_mask, &hmp_slow_cpu_mask);
 
 	printk(KERN_DEBUG "Initializing HMP scheduler:\n");
 	cpulist_scnprintf(buf, 64, &hmp_fast_cpu_mask);
-- 
cgit v1.2.3


From cff7c1f0846a97e48cd3560264e922ccbdbb8fa8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Jul 2012 10:42:36 +0000
Subject: smpboot: Provide infrastructure for percpu hotplug threads

Provide a generic interface for setting up and tearing down percpu
threads.

On registration the threads for already online cpus are created and
started. On deregistration (modules) the threads are stopped.

During hotplug operations the threads are created, started, parked and
unparked. The datastructure for registration provides a pointer to
percpu storage space and optional setup, cleanup, park, unpark
functions. These functions are called when the thread state changes.

Each implementation has to provide a function which is queried and
returns whether the thread should run and the thread function itself.

The core code handles all state transitions and avoids duplicated code
in the call sites.

[ paulmck: Updated to fix preempt_disable() misnesting. ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/smpboot.h |  43 +++++++++
 kernel/cpu.c            |  10 ++-
 kernel/smpboot.c        | 229 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/smpboot.h        |   4 +
 4 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/smpboot.h

diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
new file mode 100644
index 000000000000..e0106d8581d3
--- /dev/null
+++ b/include/linux/smpboot.h
@@ -0,0 +1,43 @@
+#ifndef _LINUX_SMPBOOT_H
+#define _LINUX_SMPBOOT_H
+
+#include <linux/types.h>
+
+struct task_struct;
+/* Cookie handed to the thread_fn */
+struct smpboot_thread_data;
+
+/**
+ * struct smp_hotplug_thread - CPU hotplug related thread descriptor
+ * @store:		Pointer to per cpu storage for the task pointers
+ * @list:		List head for core management
+ * @thread_should_run:	Check whether the thread should run or not. Called with
+ *			preemption disabled.
+ * @thread_fn:		The associated thread function
+ * @setup:		Optional setup function, called when the thread gets
+ *			operational the first time
+ * @cleanup:		Optional cleanup function, called when the thread
+ *			should stop (module exit)
+ * @park:		Optional park function, called when the thread is
+ *			parked (cpu offline)
+ * @unpark:		Optional unpark function, called when the thread is
+ *			unparked (cpu online)
+ * @thread_comm:	The base name of the thread
+ */
+struct smp_hotplug_thread {
+	struct task_struct __percpu	**store;
+	struct list_head		list;
+	int				(*thread_should_run)(unsigned int cpu);
+	void				(*thread_fn)(unsigned int cpu);
+	void				(*setup)(unsigned int cpu);
+	void				(*cleanup)(unsigned int cpu, bool online);
+	void				(*park)(unsigned int cpu);
+	void				(*unpark)(unsigned int cpu);
+	const char			*thread_comm;
+};
+
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_thread_schedule(void);
+
+#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..8ab33aeea0e5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 				__func__, cpu);
 		goto out_release;
 	}
+	smpboot_park_threads(cpu);
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
+		smpboot_unpark_threads(cpu);
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-
 		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
@@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out;
 	}
 
+	ret = smpboot_create_threads(cpu);
+	if (ret)
+		goto out;
+
 	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (ret) {
 		nr_calls--;
@@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
+	/* Wake the per cpu threads */
+	smpboot_unpark_threads(cpu);
+
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
 
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 98f60c5caa1b..b2545c843a99 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,11 +1,17 @@
 /*
  * Common SMP CPU bringup/teardown functions
  */
+#include <linux/cpu.h>
 #include <linux/err.h>
 #include <linux/smp.h>
 #include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/export.h>
 #include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/smpboot.h>
 
 #include "smpboot.h"
 
@@ -65,3 +71,226 @@ void __init idle_threads_init(void)
 	}
 }
 #endif
+
+static LIST_HEAD(hotplug_threads);
+static DEFINE_MUTEX(smpboot_threads_lock);
+
+struct smpboot_thread_data {
+	unsigned int			cpu;
+	unsigned int			status;
+	struct smp_hotplug_thread	*ht;
+};
+
+enum {
+	HP_THREAD_NONE = 0,
+	HP_THREAD_ACTIVE,
+	HP_THREAD_PARKED,
+};
+
+/**
+ * smpboot_thread_fn - percpu hotplug thread loop function
+ * @data:	thread data pointer
+ *
+ * Checks for thread stop and park conditions. Calls the necessary
+ * setup, cleanup, park and unpark functions for the registered
+ * thread.
+ *
+ * Returns 0 once kthread_should_stop() is true; otherwise it loops forever.
+ */
+static int smpboot_thread_fn(void *data)
+{
+	struct smpboot_thread_data *td = data;
+	struct smp_hotplug_thread *ht = td->ht;
+
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		preempt_disable();
+		if (kthread_should_stop()) {
+			set_current_state(TASK_RUNNING);
+			preempt_enable();
+			if (ht->cleanup)
+				ht->cleanup(td->cpu, cpu_online(td->cpu));
+			kfree(td);
+			return 0;
+		}
+
+		if (kthread_should_park()) {
+			__set_current_state(TASK_RUNNING);
+			preempt_enable();
+			if (ht->park && td->status == HP_THREAD_ACTIVE) {
+				BUG_ON(td->cpu != smp_processor_id());
+				ht->park(td->cpu);
+				td->status = HP_THREAD_PARKED;
+			}
+			kthread_parkme();
+			/* We might have been woken for stop */
+			continue;
+		}
+
+		BUG_ON(td->cpu != smp_processor_id());
+
+		/* Check for state change setup */
+		switch (td->status) {
+		case HP_THREAD_NONE:
+			preempt_enable();
+			if (ht->setup)
+				ht->setup(td->cpu);
+			td->status = HP_THREAD_ACTIVE;
+			preempt_disable();
+			break;
+		case HP_THREAD_PARKED:
+			preempt_enable();
+			if (ht->unpark)
+				ht->unpark(td->cpu);
+			td->status = HP_THREAD_ACTIVE;
+			preempt_disable();
+			break;
+		}
+
+		if (!ht->thread_should_run(td->cpu)) {
+			preempt_enable();
+			schedule();
+		} else {
+			set_current_state(TASK_RUNNING);
+			preempt_enable();
+			ht->thread_fn(td->cpu);
+		}
+	}
+}
+
+static int
+__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+	struct smpboot_thread_data *td;
+
+	if (tsk)
+		return 0;
+
+	td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
+	if (!td)
+		return -ENOMEM;
+	td->cpu = cpu;
+	td->ht = ht;
+
+	tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
+				    ht->thread_comm);
+	if (IS_ERR(tsk)) {
+		kfree(td);
+		return PTR_ERR(tsk);
+	}
+
+	get_task_struct(tsk);
+	*per_cpu_ptr(ht->store, cpu) = tsk;
+	return 0;
+}
+
+int smpboot_create_threads(unsigned int cpu)
+{
+	struct smp_hotplug_thread *cur;
+	int ret = 0;
+
+	mutex_lock(&smpboot_threads_lock);
+	list_for_each_entry(cur, &hotplug_threads, list) {
+		ret = __smpboot_create_thread(cur, cpu);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&smpboot_threads_lock);
+	return ret;
+}
+
+static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+	kthread_unpark(tsk);
+}
+
+void smpboot_unpark_threads(unsigned int cpu)
+{
+	struct smp_hotplug_thread *cur;
+
+	mutex_lock(&smpboot_threads_lock);
+	list_for_each_entry(cur, &hotplug_threads, list)
+		smpboot_unpark_thread(cur, cpu);
+	mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+	struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+	if (tsk)
+		kthread_park(tsk);
+}
+
+void smpboot_park_threads(unsigned int cpu)
+{
+	struct smp_hotplug_thread *cur;
+
+	mutex_lock(&smpboot_threads_lock);
+	list_for_each_entry_reverse(cur, &hotplug_threads, list)
+		smpboot_park_thread(cur, cpu);
+	mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
+{
+	unsigned int cpu;
+
+	/* We also need to destroy the parked threads of offline cpus */
+	for_each_possible_cpu(cpu) {
+		struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+		if (tsk) {
+			kthread_stop(tsk);
+			put_task_struct(tsk);
+			*per_cpu_ptr(ht->store, cpu) = NULL;
+		}
+	}
+}
+
+/**
+ * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * @plug_thread:	Hotplug thread descriptor
+ *
+ * Creates and starts the threads on all online cpus.
+ */
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+	unsigned int cpu;
+	int ret = 0;
+
+	mutex_lock(&smpboot_threads_lock);
+	for_each_online_cpu(cpu) {
+		ret = __smpboot_create_thread(plug_thread, cpu);
+		if (ret) {
+			smpboot_destroy_threads(plug_thread);
+			goto out;
+		}
+		smpboot_unpark_thread(plug_thread, cpu);
+	}
+	list_add(&plug_thread->list, &hotplug_threads);
+out:
+	mutex_unlock(&smpboot_threads_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+
+/**
+ * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
+ * @plug_thread:	Hotplug thread descriptor
+ *
+ * Stops all threads on all possible cpus.
+ */
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+	get_online_cpus();
+	mutex_lock(&smpboot_threads_lock);
+	list_del(&plug_thread->list);
+	smpboot_destroy_threads(plug_thread);
+	mutex_unlock(&smpboot_threads_lock);
+	put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 80c0acfb8472..9e26228edf66 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -15,4 +15,8 @@ static inline void idle_thread_set_boot_cpu(void) { }
 static inline void idle_threads_init(void) { }
 #endif
 
+int smpboot_create_threads(unsigned int cpu);
+void smpboot_park_threads(unsigned int cpu);
+void smpboot_unpark_threads(unsigned int cpu);
+
 #endif
-- 
cgit v1.2.3


From 6b66733501991419961b9171c50359658e35586a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Jul 2012 10:42:37 +0000
Subject: softirq: Use hotplug thread infrastructure

[ paulmck: Updated to avoid invoking rcu_note_context_switch() with
  preemption enabled. ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/softirq.c | 108 ++++++++++++++-----------------------------------------
 1 file changed, 27 insertions(+), 81 deletions(-)

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..090e1b920538 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,6 +23,7 @@
 #include <linux/rcupdate.h>
 #include <linux/ftrace.h>
 #include <linux/smp.h>
+#include <linux/smpboot.h>
 #include <linux/tick.h>
 
 #define CREATE_TRACE_POINTS
@@ -733,49 +734,22 @@ void __init softirq_init(void)
 	open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
 
-static int run_ksoftirqd(void * __bind_cpu)
+static int ksoftirqd_should_run(unsigned int cpu)
 {
-	set_current_state(TASK_INTERRUPTIBLE);
-
-	while (!kthread_should_stop()) {
-		preempt_disable();
-		if (!local_softirq_pending()) {
-			schedule_preempt_disabled();
-		}
-
-		__set_current_state(TASK_RUNNING);
-
-		while (local_softirq_pending()) {
-			/* Preempt disable stops cpu going offline.
-			   If already offline, we'll be on wrong CPU:
-			   don't process */
-			if (cpu_is_offline((long)__bind_cpu))
-				goto wait_to_die;
-			local_irq_disable();
-			if (local_softirq_pending())
-				__do_softirq();
-			local_irq_enable();
-			sched_preempt_enable_no_resched();
-			cond_resched();
-			preempt_disable();
-			rcu_note_context_switch((long)__bind_cpu);
-		}
-		preempt_enable();
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-	__set_current_state(TASK_RUNNING);
-	return 0;
+	return local_softirq_pending();
+}
 
-wait_to_die:
-	preempt_enable();
-	/* Wait for kthread_stop */
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_INTERRUPTIBLE);
+static void run_ksoftirqd(unsigned int cpu)
+{
+	local_irq_disable();
+	if (local_softirq_pending()) {
+		__do_softirq();
+		rcu_note_context_switch(cpu);
+		local_irq_enable();
+		cond_resched();
+		return;
 	}
-	__set_current_state(TASK_RUNNING);
-	return 0;
+	local_irq_enable();
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -841,50 +815,17 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
 {
-	int hotcpu = (unsigned long)hcpu;
-	struct task_struct *p;
-
 	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		p = kthread_create_on_node(run_ksoftirqd,
-					   hcpu,
-					   cpu_to_node(hotcpu),
-					   "ksoftirqd/%d", hotcpu);
-		if (IS_ERR(p)) {
-			printk("ksoftirqd for %i failed\n", hotcpu);
-			return notifier_from_errno(PTR_ERR(p));
-		}
-		kthread_bind(p, hotcpu);
-  		per_cpu(ksoftirqd, hotcpu) = p;
- 		break;
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		wake_up_process(per_cpu(ksoftirqd, hotcpu));
-		break;
 #ifdef CONFIG_HOTPLUG_CPU
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		if (!per_cpu(ksoftirqd, hotcpu))
-			break;
-		/* Unbind so it can run.  Fall thru. */
-		kthread_bind(per_cpu(ksoftirqd, hotcpu),
-			     cpumask_any(cpu_online_mask));
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN: {
-		static const struct sched_param param = {
-			.sched_priority = MAX_RT_PRIO-1
-		};
-
-		p = per_cpu(ksoftirqd, hotcpu);
-		per_cpu(ksoftirqd, hotcpu) = NULL;
-		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
-		kthread_stop(p);
+		int hotcpu = (unsigned long)hcpu;
+
 		takeover_tasklets(hotcpu);
 		break;
 	}
 #endif /* CONFIG_HOTPLUG_CPU */
- 	}
+	}
 	return NOTIFY_OK;
 }
 
@@ -892,14 +833,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
+static struct smp_hotplug_thread softirq_threads = {
+	.store			= &ksoftirqd,
+	.thread_should_run	= ksoftirqd_should_run,
+	.thread_fn		= run_ksoftirqd,
+	.thread_comm		= "ksoftirqd/%u",
+};
+
 static __init int spawn_ksoftirqd(void)
 {
-	void *cpu = (void *)(long)smp_processor_id();
-	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-
-	BUG_ON(err != NOTIFY_OK);
-	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
+
+	BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
+
 	return 0;
 }
 early_initcall(spawn_ksoftirqd);
-- 
cgit v1.2.3


From 7b389b042a82df878a78aee1d551291f906e3a9c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Jul 2012 10:42:38 +0000
Subject: watchdog: Use hotplug thread infrastructure

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/watchdog.c | 263 ++++++++++++++++++------------------------------------
 1 file changed, 89 insertions(+), 174 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..9d4c8d5a1f53 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -22,6 +22,7 @@
 #include <linux/notifier.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
+#include <linux/smpboot.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
@@ -29,16 +30,18 @@
 
 int watchdog_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_disabled;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
@@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	__this_cpu_write(hard_watchdog_warn, false);
 	return;
 }
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
 static void watchdog_interrupt_count(void)
 {
 	__this_cpu_inc(hrtimer_interrupts);
 }
-#else
-static inline void watchdog_interrupt_count(void) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+static int watchdog_nmi_enable(unsigned int cpu);
+static void watchdog_nmi_disable(unsigned int cpu);
 
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	return HRTIMER_RESTART;
 }
 
+static void watchdog_set_prio(unsigned int policy, unsigned int prio)
+{
+	struct sched_param param = { .sched_priority = prio };
 
-/*
- * The watchdog thread - touches the timestamp.
- */
-static int watchdog(void *unused)
+	sched_setscheduler(current, policy, &param);
+}
+
+static void watchdog_enable(unsigned int cpu)
 {
-	struct sched_param param = { .sched_priority = 0 };
 	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
-	/* initialize timestamp */
-	__touch_watchdog();
+	if (!watchdog_enabled) {
+		kthread_park(current);
+		return;
+	}
+
+	/* Enable the perf event */
+	watchdog_nmi_enable(cpu);
 
 	/* kick off the timer for the hardlockup detector */
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer->function = watchdog_timer_fn;
+
 	/* done here because hrtimer_start can only pin to smp_processor_id() */
 	hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
 		      HRTIMER_MODE_REL_PINNED);
 
-	set_current_state(TASK_INTERRUPTIBLE);
-	/*
-	 * Run briefly (kicked by the hrtimer callback function) once every
-	 * get_sample_period() seconds (4 seconds by default) to reset the
-	 * softlockup timestamp. If this gets delayed for more than
-	 * 2*watchdog_thresh seconds then the debug-printout triggers in
-	 * watchdog_timer_fn().
-	 */
-	while (!kthread_should_stop()) {
-		__touch_watchdog();
-		schedule();
+	/* initialize timestamp */
+	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+	__touch_watchdog();
+}
 
-		if (kthread_should_stop())
-			break;
+static void watchdog_disable(unsigned int cpu)
+{
+	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-	/*
-	 * Drop the policy/priority elevation during thread exit to avoid a
-	 * scheduling latency spike.
-	 */
-	__set_current_state(TASK_RUNNING);
-	sched_setscheduler(current, SCHED_NORMAL, &param);
-	return 0;
+	watchdog_set_prio(SCHED_NORMAL, 0);
+	hrtimer_cancel(hrtimer);
+	/* disable the perf event */
+	watchdog_nmi_disable(cpu);
 }
 
+static int watchdog_should_run(unsigned int cpu)
+{
+	return __this_cpu_read(hrtimer_interrupts) !=
+		__this_cpu_read(soft_lockup_hrtimer_cnt);
+}
+
+/*
+ * The watchdog thread function - touches the timestamp.
+ *
+ * It only runs once every get_sample_period() seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ */
+static void watchdog(unsigned int cpu)
+{
+	__this_cpu_write(soft_lockup_hrtimer_cnt,
+			 __this_cpu_read(hrtimer_interrupts));
+	__touch_watchdog();
+}
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 /*
@@ -379,7 +403,7 @@ static int watchdog(void *unused)
  */
 static unsigned long cpu0_err;
 
-static int watchdog_nmi_enable(int cpu)
+static int watchdog_nmi_enable(unsigned int cpu)
 {
 	struct perf_event_attr *wd_attr;
 	struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -433,7 +457,7 @@ out:
 	return 0;
 }
 
-static void watchdog_nmi_disable(int cpu)
+static void watchdog_nmi_disable(unsigned int cpu)
 {
 	struct perf_event *event = per_cpu(watchdog_ev, cpu);
 
@@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu)
 	return;
 }
 #else
-static int watchdog_nmi_enable(int cpu) { return 0; }
-static void watchdog_nmi_disable(int cpu) { return; }
+static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
+static void watchdog_nmi_disable(unsigned int cpu) { return; }
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 /* prepare/enable/disable routines */
-static void watchdog_prepare_cpu(int cpu)
-{
-	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
-	WARN_ON(per_cpu(softlockup_watchdog, cpu));
-	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hrtimer->function = watchdog_timer_fn;
-}
-
-static int watchdog_enable(int cpu)
-{
-	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
-	int err = 0;
-
-	/* enable the perf event */
-	err = watchdog_nmi_enable(cpu);
-
-	/* Regardless of err above, fall through and start softlockup */
-
-	/* create the watchdog thread */
-	if (!p) {
-		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-		p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
-		if (IS_ERR(p)) {
-			pr_err("softlockup watchdog for %i failed\n", cpu);
-			if (!err) {
-				/* if hardlockup hasn't already set this */
-				err = PTR_ERR(p);
-				/* and disable the perf event */
-				watchdog_nmi_disable(cpu);
-			}
-			goto out;
-		}
-		sched_setscheduler(p, SCHED_FIFO, &param);
-		kthread_bind(p, cpu);
-		per_cpu(watchdog_touch_ts, cpu) = 0;
-		per_cpu(softlockup_watchdog, cpu) = p;
-		wake_up_process(p);
-	}
-
-out:
-	return err;
-}
-
-static void watchdog_disable(int cpu)
-{
-	struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
-	struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
-	/*
-	 * cancel the timer first to stop incrementing the stats
-	 * and waking up the kthread
-	 */
-	hrtimer_cancel(hrtimer);
-
-	/* disable the perf event */
-	watchdog_nmi_disable(cpu);
-
-	/* stop the watchdog thread */
-	if (p) {
-		per_cpu(softlockup_watchdog, cpu) = NULL;
-		kthread_stop(p);
-	}
-}
-
 /* sysctl functions */
 #ifdef CONFIG_SYSCTL
 static void watchdog_enable_all_cpus(void)
 {
-	int cpu;
-
-	watchdog_enabled = 0;
-
-	for_each_online_cpu(cpu)
-		if (!watchdog_enable(cpu))
-			/* if any cpu succeeds, watchdog is considered
-			   enabled for the system */
-			watchdog_enabled = 1;
-
-	if (!watchdog_enabled)
-		pr_err("failed to be enabled on some cpus\n");
+	unsigned int cpu;
 
+	if (watchdog_disabled) {
+		watchdog_disabled = 0;
+		for_each_online_cpu(cpu)
+			kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+	}
 }
 
 static void watchdog_disable_all_cpus(void)
 {
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		watchdog_disable(cpu);
+	unsigned int cpu;
 
-	/* if all watchdogs are disabled, then they are disabled for the system */
-	watchdog_enabled = 0;
+	if (!watchdog_disabled) {
+		watchdog_disabled = 1;
+		for_each_online_cpu(cpu)
+			kthread_park(per_cpu(softlockup_watchdog, cpu));
+	}
 }
 
-
 /*
  * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
  */
@@ -557,73 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 {
 	int ret;
 
+	if (watchdog_disabled < 0)
+		return -ENODEV;
+
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret || !write)
-		goto out;
+		return ret;
 
 	if (watchdog_enabled && watchdog_thresh)
 		watchdog_enable_all_cpus();
 	else
 		watchdog_disable_all_cpus();
 
-out:
 	return ret;
 }
 #endif /* CONFIG_SYSCTL */
 
-
-/*
- * Create/destroy watchdog threads as CPUs come and go:
- */
-static int __cpuinit
-cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-	int hotcpu = (unsigned long)hcpu;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		watchdog_prepare_cpu(hotcpu);
-		break;
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		if (watchdog_enabled)
-			watchdog_enable(hotcpu);
-		break;
-#ifdef CONFIG_HOTPLUG_CPU
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		watchdog_disable(hotcpu);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		watchdog_disable(hotcpu);
-		break;
-#endif /* CONFIG_HOTPLUG_CPU */
-	}
-
-	/*
-	 * hardlockup and softlockup are not important enough
-	 * to block cpu bring up.  Just always succeed and
-	 * rely on printk output to flag problems.
-	 */
-	return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata cpu_nfb = {
-	.notifier_call = cpu_callback
+static struct smp_hotplug_thread watchdog_threads = {
+	.store			= &softlockup_watchdog,
+	.thread_should_run	= watchdog_should_run,
+	.thread_fn		= watchdog,
+	.thread_comm		= "watchdog/%u",
+	.setup			= watchdog_enable,
+	.park			= watchdog_disable,
+	.unpark			= watchdog_enable,
 };
 
 void __init lockup_detector_init(void)
 {
-	void *cpu = (void *)(long)smp_processor_id();
-	int err;
-
-	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-	WARN_ON(notifier_to_errno(err));
-
-	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
-	register_cpu_notifier(&cpu_nfb);
-
-	return;
+	if (smpboot_register_percpu_thread(&watchdog_threads)) {
+		pr_err("Failed to create watchdog threads, disabled\n");
+		watchdog_disabled = -ENODEV;
+	}
 }
-- 
cgit v1.2.3


From 760014b2aeea10260ca080ffdd08d2f9dff10167 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 16 Jul 2012 10:42:38 +0000
Subject: rcu: Use smp_hotplug_thread facility for RCUs per-CPU kthread

Bring RCU into the new-age CPU-hotplug fold by modifying RCU's per-CPU
kthread code to use the new smp_hotplug_thread facility.

[ tglx: Adapted it to use callbacks and to the simplified rcu yield ]

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rcutree.c        |   4 -
 kernel/rcutree.h        |   8 --
 kernel/rcutree_plugin.h | 203 ++++++++++--------------------------------------
 kernel/rcutree_trace.c  |   3 +-
 4 files changed, 41 insertions(+), 177 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index cba883468548..a8ba38369efa 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -125,7 +125,6 @@ static int rcu_scheduler_fully_active __read_mostly;
  */
 static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
 DEFINE_PER_CPU(char, rcu_cpu_has_work);
 
@@ -1461,7 +1460,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
 	/* Adjust any no-longer-needed kthreads. */
-	rcu_stop_cpu_kthread(cpu);
 	rcu_boost_kthread_setaffinity(rnp, -1);
 
 	/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
@@ -2517,11 +2515,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
 		rcu_boost_kthread_setaffinity(rnp, -1);
-		rcu_cpu_kthread_setrt(cpu, 1);
 		break;
 	case CPU_DOWN_PREPARE:
 		rcu_boost_kthread_setaffinity(rnp, cpu);
-		rcu_cpu_kthread_setrt(cpu, 0);
 		break;
 	case CPU_DYING:
 	case CPU_DYING_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 816ed78858b1..1989438c68fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -192,12 +192,6 @@ struct rcu_node {
 				/* Refused to boost: not sure why, though. */
 				/*  This can happen due to race conditions. */
 #endif /* #ifdef CONFIG_RCU_BOOST */
-	struct task_struct *node_kthread_task;
-				/* kthread that takes care of this rcu_node */
-				/*  structure, for example, awakening the */
-				/*  per-CPU kthreads as needed. */
-	unsigned int node_kthread_status;
-				/* State of node_kthread_task for tracing. */
 } ____cacheline_internodealigned_in_smp;
 
 /*
@@ -449,7 +443,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
 				      unsigned long flags);
-static void rcu_stop_cpu_kthread(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -482,7 +475,6 @@ static void rcu_preempt_do_callbacks(void);
 static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 						 struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
 static void __cpuinit rcu_prepare_kthreads(int cpu);
 static void rcu_prepare_for_idle_init(int cpu);
 static void rcu_cleanup_after_idle(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 95fd5634e541..a0871b3cb3de 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
  */
 
 #include <linux/delay.h>
+#include <linux/smpboot.h>
 
 #define RCU_KTHREAD_PRIO 1
 
@@ -1448,25 +1449,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	return 0;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Stop the RCU's per-CPU kthread when its CPU goes offline,.
- */
-static void rcu_stop_cpu_kthread(int cpu)
-{
-	struct task_struct *t;
-
-	/* Stop the CPU's kthread. */
-	t = per_cpu(rcu_cpu_kthread_task, cpu);
-	if (t != NULL) {
-		per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
-		kthread_stop(t);
-	}
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
 static void rcu_kthread_do_work(void)
 {
 	rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1474,59 +1456,22 @@ static void rcu_kthread_do_work(void)
 	rcu_preempt_do_callbacks();
 }
 
-/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument.  The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+static void rcu_cpu_kthread_setup(unsigned int cpu)
 {
-	int policy;
 	struct sched_param sp;
-	struct task_struct *t;
 
-	t = per_cpu(rcu_cpu_kthread_task, cpu);
-	if (t == NULL)
-		return;
-	if (to_rt) {
-		policy = SCHED_FIFO;
-		sp.sched_priority = RCU_KTHREAD_PRIO;
-	} else {
-		policy = SCHED_NORMAL;
-		sp.sched_priority = 0;
-	}
-	sched_setscheduler_nocheck(t, policy, &sp);
+	sp.sched_priority = RCU_KTHREAD_PRIO;
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
 }
 
-/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline.  We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods.  So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh.  This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
+static void rcu_cpu_kthread_park(unsigned int cpu)
 {
-	while (cpu_is_offline(cpu) ||
-	       !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
-	       smp_processor_id() != cpu) {
-		if (kthread_should_stop())
-			return 1;
-		per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-		per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
-		local_bh_enable();
-		schedule_timeout_uninterruptible(1);
-		if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
-			set_cpus_allowed_ptr(current, cpumask_of(cpu));
-		local_bh_disable();
-	}
-	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
-	return 0;
+	per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+}
+
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
+{
+	return __get_cpu_var(rcu_cpu_has_work);
 }
 
 /*
@@ -1534,96 +1479,35 @@ static int rcu_cpu_kthread_should_stop(int cpu)
  * RCU softirq used in flavors and configurations of RCU that do not
  * support RCU priority boosting.
  */
-static int rcu_cpu_kthread(void *arg)
+static void rcu_cpu_kthread(unsigned int cpu)
 {
-	int cpu = (int)(long)arg;
-	unsigned long flags;
-	int spincnt = 0;
-	unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
-	char work;
-	char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+	unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
+	char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
+	int spincnt;
 
-	trace_rcu_utilization("Start CPU kthread@init");
-	for (;;) {
-		*statusp = RCU_KTHREAD_WAITING;
-		trace_rcu_utilization("End CPU kthread@rcu_wait");
-		rcu_wait(*workp != 0 || kthread_should_stop());
+	for (spincnt = 0; spincnt < 10; spincnt++) {
 		trace_rcu_utilization("Start CPU kthread@rcu_wait");
 		local_bh_disable();
-		if (rcu_cpu_kthread_should_stop(cpu)) {
-			local_bh_enable();
-			break;
-		}
 		*statusp = RCU_KTHREAD_RUNNING;
-		per_cpu(rcu_cpu_kthread_loops, cpu)++;
-		local_irq_save(flags);
+		this_cpu_inc(rcu_cpu_kthread_loops);
+		local_irq_disable();
 		work = *workp;
 		*workp = 0;
-		local_irq_restore(flags);
+		local_irq_enable();
 		if (work)
 			rcu_kthread_do_work();
 		local_bh_enable();
-		if (*workp != 0)
-			spincnt++;
-		else
-			spincnt = 0;
-		if (spincnt > 10) {
-			*statusp = RCU_KTHREAD_YIELDING;
-			trace_rcu_utilization("End CPU kthread@rcu_yield");
-			schedule_timeout_interruptible(2);
-			trace_rcu_utilization("Start CPU kthread@rcu_yield");
-			spincnt = 0;
+		if (*workp == 0) {
+			trace_rcu_utilization("End CPU kthread@rcu_wait");
+			*statusp = RCU_KTHREAD_WAITING;
+			return;
 		}
 	}
-	*statusp = RCU_KTHREAD_STOPPED;
-	trace_rcu_utilization("End CPU kthread@term");
-	return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task.  There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- *
- * Please note that we cannot simply refuse to wake up the per-CPU
- * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
- * which can result in softlockup complaints if the task ends up being
- * idle for more than a couple of minutes.
- *
- * However, please note also that we cannot bind the per-CPU kthread to its
- * CPU until that CPU is fully online.  We also cannot wait until the
- * CPU is fully online before we create its per-CPU kthread, as this would
- * deadlock the system when CPU notifiers tried waiting for grace
- * periods.  So we bind the per-CPU kthread to its CPU only if the CPU
- * is online.  If its CPU is not yet fully online, then the code in
- * rcu_cpu_kthread() will wait until it is fully online, and then do
- * the binding.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
-	struct sched_param sp;
-	struct task_struct *t;
-
-	if (!rcu_scheduler_fully_active ||
-	    per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
-		return 0;
-	t = kthread_create_on_node(rcu_cpu_kthread,
-				   (void *)(long)cpu,
-				   cpu_to_node(cpu),
-				   "rcuc/%d", cpu);
-	if (IS_ERR(t))
-		return PTR_ERR(t);
-	if (cpu_online(cpu))
-		kthread_bind(t, cpu);
-	per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
-	WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
-	sp.sched_priority = RCU_KTHREAD_PRIO;
-	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-	per_cpu(rcu_cpu_kthread_task, cpu) = t;
-	wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
-	return 0;
+	*statusp = RCU_KTHREAD_YIELDING;
+	trace_rcu_utilization("Start CPU kthread@rcu_yield");
+	schedule_timeout_interruptible(2);
+	trace_rcu_utilization("End CPU kthread@rcu_yield");
+	*statusp = RCU_KTHREAD_WAITING;
 }
 
 /*
@@ -1659,6 +1543,15 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 	free_cpumask_var(cm);
 }
 
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+	.store			= &rcu_cpu_kthread_task,
+	.thread_should_run	= rcu_cpu_kthread_should_run,
+	.thread_fn		= rcu_cpu_kthread,
+	.thread_comm		= "rcuc/%u",
+	.setup			= rcu_cpu_kthread_setup,
+	.park			= rcu_cpu_kthread_park,
+};
+
 /*
  * Spawn all kthreads -- called as soon as the scheduler is running.
  */
@@ -1668,11 +1561,9 @@ static int __init rcu_spawn_kthreads(void)
 	int cpu;
 
 	rcu_scheduler_fully_active = 1;
-	for_each_possible_cpu(cpu) {
+	for_each_possible_cpu(cpu)
 		per_cpu(rcu_cpu_has_work, cpu) = 0;
-		if (cpu_online(cpu))
-			(void)rcu_spawn_one_cpu_kthread(cpu);
-	}
+	BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
 	rnp = rcu_get_root(rcu_state);
 	(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
 	if (NUM_RCU_NODES > 1) {
@@ -1689,10 +1580,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
 	struct rcu_node *rnp = rdp->mynode;
 
 	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
-	if (rcu_scheduler_fully_active) {
-		(void)rcu_spawn_one_cpu_kthread(cpu);
+	if (rcu_scheduler_fully_active)
 		(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
-	}
 }
 
 #else /* #ifdef CONFIG_RCU_BOOST */
@@ -1716,22 +1605,10 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
 {
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void rcu_stop_cpu_kthread(int cpu)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 {
 }
 
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
-{
-}
-
 static int __init rcu_scheduler_really_started(void)
 {
 	rcu_scheduler_fully_active = 1;
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16ddd1d4..6b4c76ba3529 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -83,11 +83,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 			rdp->nxttail[RCU_WAIT_TAIL]],
 		   ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
 #ifdef CONFIG_RCU_BOOST
-	seq_printf(m, " kt=%d/%c/%d ktl=%x",
+	seq_printf(m, " kt=%d/%c ktl=%x",
 		   per_cpu(rcu_cpu_has_work, rdp->cpu),
 		   convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
 					  rdp->cpu)),
-		   per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
 		   per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 	seq_printf(m, " b=%ld", rdp->blimit);
-- 
cgit v1.2.3


From 495bcbed25d49dfaebfe4a8937f9f7667a0c0506 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Jul 2012 10:42:39 +0000
Subject: infiniband: ehca: Use hotplug thread infrastructure

Get rid of the hotplug notifiers and use the generic hotplug thread
infrastructure.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/infiniband/hw/ehca/ehca_irq.c | 253 +++++++++++++---------------------
 drivers/infiniband/hw/ehca/ehca_irq.h |   6 +-
 2 files changed, 95 insertions(+), 164 deletions(-)

diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index 53589000fd07..a4c9f9d45528 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -42,6 +42,7 @@
  */
 
 #include <linux/slab.h>
+#include <linux/smpboot.h>
 
 #include "ehca_classes.h"
 #include "ehca_irq.h"
@@ -652,7 +653,7 @@ void ehca_tasklet_eq(unsigned long data)
 	ehca_process_eq((struct ehca_shca*)data, 1);
 }
 
-static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
+static int find_next_online_cpu(struct ehca_comp_pool *pool)
 {
 	int cpu;
 	unsigned long flags;
@@ -662,17 +663,23 @@ static inline int find_next_online_cpu(struct ehca_comp_pool *pool)
 		ehca_dmp(cpu_online_mask, cpumask_size(), "");
 
 	spin_lock_irqsave(&pool->last_cpu_lock, flags);
-	cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
-	if (cpu >= nr_cpu_ids)
-		cpu = cpumask_first(cpu_online_mask);
-	pool->last_cpu = cpu;
+	while (1) {
+		cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
+		if (cpu >= nr_cpu_ids)
+			cpu = cpumask_first(cpu_online_mask);
+		pool->last_cpu = cpu;
+		/* Skip CPUs whose comp task is not active (might be on the way out). NOTE(review): the visible patch only ever clears ->active; confirm it is set somewhere, else this loop cannot exit. */
+		if (per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active)
+			break;
+	}
 	spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
 
 	return cpu;
 }
 
 static void __queue_comp_task(struct ehca_cq *__cq,
-			      struct ehca_cpu_comp_task *cct)
+			      struct ehca_cpu_comp_task *cct,
+			      struct task_struct *thread)
 {
 	unsigned long flags;
 
@@ -683,7 +690,7 @@ static void __queue_comp_task(struct ehca_cq *__cq,
 		__cq->nr_callbacks++;
 		list_add_tail(&__cq->entry, &cct->cq_list);
 		cct->cq_jobs++;
-		wake_up(&cct->wait_queue);
+		wake_up_process(thread);
 	} else
 		__cq->nr_callbacks++;
 
@@ -695,6 +702,7 @@ static void queue_comp_task(struct ehca_cq *__cq)
 {
 	int cpu_id;
 	struct ehca_cpu_comp_task *cct;
+	struct task_struct *thread;
 	int cq_jobs;
 	unsigned long flags;
 
@@ -702,7 +710,8 @@ static void queue_comp_task(struct ehca_cq *__cq)
 	BUG_ON(!cpu_online(cpu_id));
 
 	cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-	BUG_ON(!cct);
+	thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+	BUG_ON(!cct || !thread);
 
 	spin_lock_irqsave(&cct->task_lock, flags);
 	cq_jobs = cct->cq_jobs;
@@ -710,28 +719,25 @@ static void queue_comp_task(struct ehca_cq *__cq)
 	if (cq_jobs > 0) {
 		cpu_id = find_next_online_cpu(pool);
 		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-		BUG_ON(!cct);
+		thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+		BUG_ON(!cct || !thread);
 	}
-
-	__queue_comp_task(__cq, cct);
+	__queue_comp_task(__cq, cct, thread);
 }
 
 static void run_comp_task(struct ehca_cpu_comp_task *cct)
 {
 	struct ehca_cq *cq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cct->task_lock, flags);
 
 	while (!list_empty(&cct->cq_list)) {
 		cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-		spin_unlock_irqrestore(&cct->task_lock, flags);
+		spin_unlock_irq(&cct->task_lock);
 
 		comp_event_callback(cq);
 		if (atomic_dec_and_test(&cq->nr_events))
 			wake_up(&cq->wait_completion);
 
-		spin_lock_irqsave(&cct->task_lock, flags);
+		spin_lock_irq(&cct->task_lock);
 		spin_lock(&cq->task_lock);
 		cq->nr_callbacks--;
 		if (!cq->nr_callbacks) {
@@ -740,159 +746,76 @@ static void run_comp_task(struct ehca_cpu_comp_task *cct)
 		}
 		spin_unlock(&cq->task_lock);
 	}
-
-	spin_unlock_irqrestore(&cct->task_lock, flags);
 }
 
-static int comp_task(void *__cct)
+static void comp_task_park(unsigned int cpu)
 {
-	struct ehca_cpu_comp_task *cct = __cct;
-	int cql_empty;
-	DECLARE_WAITQUEUE(wait, current);
-
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		add_wait_queue(&cct->wait_queue, &wait);
-
-		spin_lock_irq(&cct->task_lock);
-		cql_empty = list_empty(&cct->cq_list);
-		spin_unlock_irq(&cct->task_lock);
-		if (cql_empty)
-			schedule();
-		else
-			__set_current_state(TASK_RUNNING);
-
-		remove_wait_queue(&cct->wait_queue, &wait);
+	struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+	struct ehca_cpu_comp_task *target;
+	struct task_struct *thread;
+	struct ehca_cq *cq, *tmp;
+	LIST_HEAD(list);
 
-		spin_lock_irq(&cct->task_lock);
-		cql_empty = list_empty(&cct->cq_list);
-		spin_unlock_irq(&cct->task_lock);
-		if (!cql_empty)
-			run_comp_task(__cct);
+	spin_lock_irq(&cct->task_lock);
+	cct->cq_jobs = 0;
+	cct->active = 0;
+	list_splice_init(&cct->cq_list, &list);
+	spin_unlock_irq(&cct->task_lock);
 
-		set_current_state(TASK_INTERRUPTIBLE);
+	cpu = find_next_online_cpu(pool);
+	target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+	thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
+	spin_lock_irq(&target->task_lock);
+	list_for_each_entry_safe(cq, tmp, &list, entry) {
+		list_del(&cq->entry);
+		__queue_comp_task(cq, target, thread);
 	}
-	__set_current_state(TASK_RUNNING);
-
-	return 0;
-}
-
-static struct task_struct *create_comp_task(struct ehca_comp_pool *pool,
-					    int cpu)
-{
-	struct ehca_cpu_comp_task *cct;
-
-	cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-	spin_lock_init(&cct->task_lock);
-	INIT_LIST_HEAD(&cct->cq_list);
-	init_waitqueue_head(&cct->wait_queue);
-	cct->task = kthread_create_on_node(comp_task, cct, cpu_to_node(cpu),
-					   "ehca_comp/%d", cpu);
-
-	return cct->task;
+	spin_unlock_irq(&target->task_lock);
 }
 
-static void destroy_comp_task(struct ehca_comp_pool *pool,
-			      int cpu)
+static void comp_task_stop(unsigned int cpu, bool online)
 {
-	struct ehca_cpu_comp_task *cct;
-	struct task_struct *task;
-	unsigned long flags_cct;
-
-	cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
-	spin_lock_irqsave(&cct->task_lock, flags_cct);
+	struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
 
-	task = cct->task;
-	cct->task = NULL;
+	spin_lock_irq(&cct->task_lock);
 	cct->cq_jobs = 0;
-
-	spin_unlock_irqrestore(&cct->task_lock, flags_cct);
-
-	if (task)
-		kthread_stop(task);
+	cct->active = 0;
+	WARN_ON(!list_empty(&cct->cq_list));
+	spin_unlock_irq(&cct->task_lock);
 }
 
-static void __cpuinit take_over_work(struct ehca_comp_pool *pool, int cpu)
+static int comp_task_should_run(unsigned int cpu)
 {
 	struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-	LIST_HEAD(list);
-	struct ehca_cq *cq;
-	unsigned long flags_cct;
-
-	spin_lock_irqsave(&cct->task_lock, flags_cct);
-
-	list_splice_init(&cct->cq_list, &list);
-
-	while (!list_empty(&list)) {
-		cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-
-		list_del(&cq->entry);
-		__queue_comp_task(cq, this_cpu_ptr(pool->cpu_comp_tasks));
-	}
-
-	spin_unlock_irqrestore(&cct->task_lock, flags_cct);
 
+	return cct->cq_jobs;
 }
 
-static int __cpuinit comp_pool_callback(struct notifier_block *nfb,
-					unsigned long action,
-					void *hcpu)
+static void comp_task(unsigned int cpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
-	struct ehca_cpu_comp_task *cct;
+	struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
+	int cql_empty;
 
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu);
-		if (!create_comp_task(pool, cpu)) {
-			ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
-			return notifier_from_errno(-ENOMEM);
-		}
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
-		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-		kthread_bind(cct->task, cpumask_any(cpu_online_mask));
-		destroy_comp_task(pool, cpu);
-		break;
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
-		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-		kthread_bind(cct->task, cpu);
-		wake_up_process(cct->task);
-		break;
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
-		break;
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
-		destroy_comp_task(pool, cpu);
-		take_over_work(pool, cpu);
-		break;
+	spin_lock_irq(&cct->task_lock);
+	cql_empty = list_empty(&cct->cq_list);
+	if (!cql_empty) {
+		__set_current_state(TASK_RUNNING);
+		run_comp_task(cct);
 	}
-
-	return NOTIFY_OK;
+	spin_unlock_irq(&cct->task_lock);
 }
 
-static struct notifier_block comp_pool_callback_nb __cpuinitdata = {
-	.notifier_call	= comp_pool_callback,
-	.priority	= 0,
+static struct smp_hotplug_thread comp_pool_threads = {
+	.thread_should_run	= comp_task_should_run,
+	.thread_fn		= comp_task,
+	.thread_comm		= "ehca_comp/%u",
+	.cleanup		= comp_task_stop,
+	.park			= comp_task_park,
 };
 
 int ehca_create_comp_pool(void)
 {
-	int cpu;
-	struct task_struct *task;
+	int cpu, ret = -ENOMEM;
 
 	if (!ehca_scaling_code)
 		return 0;
@@ -905,38 +828,46 @@ int ehca_create_comp_pool(void)
 	pool->last_cpu = cpumask_any(cpu_online_mask);
 
 	pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
-	if (pool->cpu_comp_tasks == NULL) {
-		kfree(pool);
-		return -EINVAL;
-	}
+	if (!pool->cpu_comp_tasks)
+		goto out_pool;
 
-	for_each_online_cpu(cpu) {
-		task = create_comp_task(pool, cpu);
-		if (task) {
-			kthread_bind(task, cpu);
-			wake_up_process(task);
-		}
+	pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
+	if (!pool->cpu_comp_threads)
+		goto out_tasks;
+
+	for_each_present_cpu(cpu) {
+		struct ehca_cpu_comp_task *cct;
+
+		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+		spin_lock_init(&cct->task_lock);
+		INIT_LIST_HEAD(&cct->cq_list);
 	}
 
-	register_hotcpu_notifier(&comp_pool_callback_nb);
+	comp_pool_threads.store = pool->cpu_comp_threads;
+	ret = smpboot_register_percpu_thread(&comp_pool_threads);
+	if (ret)
+		goto out_threads;
 
-	printk(KERN_INFO "eHCA scaling code enabled\n");
+	pr_info("eHCA scaling code enabled\n");
+	return ret;
 
-	return 0;
+out_threads:
+	free_percpu(pool->cpu_comp_threads);
+out_tasks:
+	free_percpu(pool->cpu_comp_tasks);
+out_pool:
+	kfree(pool);
+	return ret;
 }
 
 void ehca_destroy_comp_pool(void)
 {
-	int i;
-
 	if (!ehca_scaling_code)
 		return;
 
-	unregister_hotcpu_notifier(&comp_pool_callback_nb);
-
-	for_each_online_cpu(i)
-		destroy_comp_task(pool, i);
+	smpboot_unregister_percpu_thread(&comp_pool_threads);
 
+	free_percpu(pool->cpu_comp_threads);
 	free_percpu(pool->cpu_comp_tasks);
 	kfree(pool);
 }
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h
index 3346cb06cea6..5370199f08c7 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.h
+++ b/drivers/infiniband/hw/ehca/ehca_irq.h
@@ -58,15 +58,15 @@ void ehca_tasklet_eq(unsigned long data);
 void ehca_process_eq(struct ehca_shca *shca, int is_irq);
 
 struct ehca_cpu_comp_task {
-	wait_queue_head_t wait_queue;
 	struct list_head cq_list;
-	struct task_struct *task;
 	spinlock_t task_lock;
 	int cq_jobs;
+	int active;
 };
 
 struct ehca_comp_pool {
-	struct ehca_cpu_comp_task *cpu_comp_tasks;
+	struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
+	struct task_struct * __percpu *cpu_comp_threads;
 	int last_cpu;
 	spinlock_t last_cpu_lock;
 };
-- 
cgit v1.2.3


From 0ce23d9b87b72b1c7601daa5226a9b6f011e7939 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Thu, 12 Jul 2012 01:55:54 -0700
Subject: hotplug: Fix UP bug in smpboot hotplug code

Because kernel subsystems need their per-CPU kthreads on UP systems
as well as on SMP systems, the smpboot hotplug kthread functions
must be provided in UP builds as well as in SMP builds.  This commit
therefore adds smpboot.c to UP builds and excludes irrelevant code
via #ifdef.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/Makefile  | 3 +--
 kernel/smpboot.c | 4 ++++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index c0cc67ad764c..e5602d32acb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o cred.o \
-	    async.o range.o groups.o lglock.o
+	    async.o range.o groups.o lglock.o smpboot.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
-obj-$(CONFIG_SMP) += smpboot.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index b2545c843a99..0a49ee70054e 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -15,6 +15,8 @@
 
 #include "smpboot.h"
 
+#ifdef CONFIG_SMP
+
 #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
 /*
  * For the hotplug case we keep the task structs around and reuse
@@ -72,6 +74,8 @@ void __init idle_threads_init(void)
 }
 #endif
 
+#endif /* #ifdef CONFIG_SMP */
+
 static LIST_HEAD(hotplug_threads);
 static DEFINE_MUTEX(smpboot_threads_lock);
 
-- 
cgit v1.2.3